In [1]:
import pandas as pd
import numpy as np
import csv

from sqlalchemy import create_engine


In [2]:
import os
import json
from config import password

import pandas as pd
import numpy as np

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

from flask import Flask, jsonify, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql+psycopg2://postgres:{password}@localhost:5432/crime_db"
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(db.engine, reflect=True)

# prepare to load the entire chicago table into a dataframe
Chicago_Metadata = Base.classes.chicago
stmt = db.session.query(Chicago_Metadata).statement
df = pd.read_sql_query(stmt, db.session.bind)
print("Loaded dataframe successfully...")

# Filter dataframe by certain crime types
crime_types = ['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'ASSAULT', 
    'MOTOR VEHICLE THEFT', 'ROBBERY', 'WEAPONS VIOLATION', 'CONCEALED CARRY LICENSE VIOLATION', 'HOMICIDE', 'ARSON']
filtered_df = df[df.Primary_Type.isin(crime_types)]

Loaded dataframe successfully...


In [3]:
len(filtered_df)

2199538

In [4]:
# Check date range
least_recent_date = filtered_df['Date'].min()
recent_date = filtered_df['Date'].max()
print(f'Start date: {least_recent_date} and Recent date: {recent_date}')

Start date: 2010-01-01 00:01:00 and Recent date: 2019-09-10 23:55:00


In [5]:
#start date 2013
start_date = '2013-01-01 00:01:00'
end_date = '2013-12-31 23:59:00'
mask = (filtered_df['Date'] > start_date) & (filtered_df['Date'] <= end_date)
filtered_crime_df_2013 = filtered_df.loc[mask]
filtered_crime_df_2013.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts
1505999,1524371,9448361,2013-12-31 23:55:00,BATTERY,SIMPLE,APARTMENT,True,15,2013,41.879424,-87.751596,11,22216,25
1506000,1524372,9446913,2013-12-31 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,6,2013,41.734898,-87.631916,18,21554,20
1506001,1524373,9446862,2013-12-31 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,False,2,2013,41.791527,-87.588817,32,22260,24
1506002,1524374,9446989,2013-12-31 23:50:00,BATTERY,AGGRAVATED:KNIFE/CUTTING INSTR,APARTMENT,False,12,2013,41.877601,-87.681989,48,21184,15
1506003,1524375,9450852,2013-12-31 23:45:00,BATTERY,SIMPLE,RESTAURANT,False,18,2013,41.891316,-87.631118,22,4446,14


In [6]:
len(filtered_crime_df_2013)

236792

In [7]:
filtered_crime_df_2013.shape 

(236792, 14)

In [8]:
crime_type = ['HOMICIDE']
homicide_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]

In [9]:
len(homicide_df_2013)

429

In [10]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
coords = homicide_df_2013.as_matrix(columns=['Latitude', 'Longitude'])

  """


In [11]:
kms_per_radian = 6371.0088
epsilon = 1.5 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 27


In [12]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)
centermost_points = clusters.map(get_centermost_point)


In [13]:
lats, lons = zip(*centermost_points)
homicide_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
homicide_rep_points_2013['Crime_type'] = 'HOMICIDE'
homicide_rep_points_2013['Year'] = '2013'

In [14]:
homicide_rep_points_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.634373,41.753582,HOMICIDE,2013
1,-87.723179,41.879329,HOMICIDE,2013
2,-87.619114,41.816599,HOMICIDE,2013
3,-87.565262,41.713516,HOMICIDE,2013
4,-87.66036,41.993071,HOMICIDE,2013
5,-87.781944,41.96928,HOMICIDE,2013
6,-87.681666,42.015439,HOMICIDE,2013
7,-87.704876,41.970868,HOMICIDE,2013
8,-87.681606,41.884218,HOMICIDE,2013
9,-87.734745,41.740779,HOMICIDE,2013


In [15]:
crime_type = ['ARSON']
arson_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]
len(arson_df_2013)

364

In [16]:
coords = arson_df_2013.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 34


  """Entry point for launching an IPython kernel.


In [17]:
centermost_points = clusters.map(get_centermost_point)

In [18]:
lats, lons = zip(*centermost_points)
arson_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
arson_rep_points_2013['Crime_type'] = 'ARSON'
arson_rep_points_2013['Year'] = '2013'

In [19]:
arson_rep_points_2013


Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.726267,41.901488,ARSON,2013
1,-87.652986,41.78145,ARSON,2013
2,-87.538902,41.710924,ARSON,2013
3,-87.626539,41.695278,ARSON,2013
4,-87.565733,41.704095,ARSON,2013
5,-87.664091,42.011595,ARSON,2013
6,-87.764848,41.796125,ARSON,2013
7,-87.556362,41.741818,ARSON,2013
8,-87.72098,41.756398,ARSON,2013
9,-87.663921,41.900673,ARSON,2013


In [20]:
crime_type = ['NARCOTICS']
narcotics_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]
len(narcotics_df_2013)

34106

In [21]:
coords = narcotics_df_2013.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [22]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
narcotics_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
narcotics_rep_points_2013['Crime_type'] = 'NARCOTICS'
narcotics_rep_points_2013['Year'] = '2013'
narcotics_rep_points_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.685266,41.840729,NARCOTICS,2013
1,-87.907473,41.9539,NARCOTICS,2013
2,-87.901514,41.9766,NARCOTICS,2013
3,-91.686566,36.619446,NARCOTICS,2013
4,-87.738687,41.683646,NARCOTICS,2013


In [23]:
crime_type = ['THEFT']
theft_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]
len(theft_df_2013)

71454

In [24]:
coords = theft_df_2013.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 6


In [25]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
theft_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
theft_rep_points_2013['Crime_type'] = 'THEFT'
theft_rep_points_2013['Year'] = '2013'
theft_rep_points_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.667003,41.85504,THEFT,2013
1,-87.903639,41.978466,THEFT,2013
2,-87.925509,41.989622,THEFT,2013
3,-87.88717,41.959441,THEFT,2013
4,-87.934273,42.008162,THEFT,2013
5,-91.686566,36.619446,THEFT,2013


In [26]:
crime_type = ['BATTERY']
battery_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]
len(battery_df_2013)

53988

In [27]:
coords = battery_df_2013.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [28]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
battery_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
battery_rep_points_2013['Crime_type'] = 'BATTERY'
battery_rep_points_2013['Year'] = '2013'
battery_rep_points_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.669208,41.831724,BATTERY,2013
1,-87.900984,41.976763,BATTERY,2013
2,-87.909079,41.960023,BATTERY,2013
3,-87.905815,42.007135,BATTERY,2013
4,-91.686566,36.619446,BATTERY,2013


In [29]:
crime_type = ['ROBBERY']
robbery_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]
len(robbery_df_2013)

11818

In [30]:
coords = robbery_df_2013.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 8


In [31]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
robbery_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
robbery_rep_points_2013['Crime_type'] = 'ROBBERY'
robbery_rep_points_2013['Year'] = '2013'
robbery_rep_points_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.665556,41.830045,ROBBERY,2013
1,-87.547153,41.652886,ROBBERY,2013
2,-87.837701,41.977465,ROBBERY,2013
3,-87.707816,41.706103,ROBBERY,2013
4,-87.763879,42.004647,ROBBERY,2013
5,-87.807069,41.973901,ROBBERY,2013
6,-87.801435,41.789675,ROBBERY,2013
7,-87.80664,42.011223,ROBBERY,2013


In [32]:
crime_type = ['ASSAULT']
assault_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]
len(assault_df_2013)

17968

In [33]:
coords = assault_df_2013.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [34]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
assault_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
assault_rep_points_2013['Crime_type'] = 'ASSAULT'
assault_rep_points_2013['Year'] = '2013'
assault_rep_points_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.667898,41.828173,ASSAULT,2013
1,-87.896576,41.977021,ASSAULT,2013
2,-87.575382,41.674814,ASSAULT,2013
3,-87.917645,41.962448,ASSAULT,2013
4,-87.907073,42.008849,ASSAULT,2013


In [35]:
crime_type = ['MOTOR VEHICLE THEFT']
mvt_df_2013 = filtered_crime_df_2013[filtered_crime_df_2013.Primary_Type.isin(crime_type)]
len(mvt_df_2013)

12574

In [36]:
coords = mvt_df_2013.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 6


In [37]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
mvt_rep_points_2013 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
mvt_rep_points_2013['Crime_type'] = 'MOTOR VEHICLE THEFT'
mvt_rep_points_2013['Year'] = '2013'
mvt_rep_points_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.677475,41.847877,MOTOR VEHICLE THEFT,2013
1,-87.548368,41.658298,MOTOR VEHICLE THEFT,2013
2,-87.885098,41.981296,MOTOR VEHICLE THEFT,2013
3,-87.899927,42.005708,MOTOR VEHICLE THEFT,2013
4,-91.686566,36.619446,MOTOR VEHICLE THEFT,2013
5,-87.892262,41.955789,MOTOR VEHICLE THEFT,2013


In [38]:
total_cluster_df_2013 = pd.concat([homicide_rep_points_2013, arson_rep_points_2013, narcotics_rep_points_2013, theft_rep_points_2013, battery_rep_points_2013,
                     robbery_rep_points_2013, assault_rep_points_2013, mvt_rep_points_2013], ignore_index=True)

In [39]:
total_cluster_df_2013

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.634373,41.753582,HOMICIDE,2013
1,-87.723179,41.879329,HOMICIDE,2013
2,-87.619114,41.816599,HOMICIDE,2013
3,-87.565262,41.713516,HOMICIDE,2013
4,-87.660360,41.993071,HOMICIDE,2013
5,-87.781944,41.969280,HOMICIDE,2013
6,-87.681666,42.015439,HOMICIDE,2013
7,-87.704876,41.970868,HOMICIDE,2013
8,-87.681606,41.884218,HOMICIDE,2013
9,-87.734745,41.740779,HOMICIDE,2013


In [40]:
#save CSV
total_cluster_df_2013.to_csv("assets/data/clusters2013.csv", encoding="utf-8")