In [1]:
import pandas as pd
import numpy as np
import csv

from sqlalchemy import create_engine


In [2]:
import os
import json
from config import password

import pandas as pd
import numpy as np

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

from flask import Flask, jsonify, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql+psycopg2://postgres:{password}@localhost:5432/crime_db"
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(db.engine, reflect=True)

# prepare to load the entire chicago table into a dataframe
Chicago_Metadata = Base.classes.chicago
stmt = db.session.query(Chicago_Metadata).statement
df = pd.read_sql_query(stmt, db.session.bind)
print("Loaded dataframe successfully...")

# Filter dataframe by certain crime types
crime_types = ['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'ASSAULT', 
    'MOTOR VEHICLE THEFT', 'ROBBERY', 'WEAPONS VIOLATION', 'CONCEALED CARRY LICENSE VIOLATION', 'HOMICIDE', 'ARSON']
filtered_df = df[df.Primary_Type.isin(crime_types)]

Loaded dataframe successfully...


In [3]:
len(filtered_df)

2199538

In [4]:
# Check date range
least_recent_date = filtered_df['Date'].min()
recent_date = filtered_df['Date'].max()
print(f'Start date: {least_recent_date} and Recent date: {recent_date}')

Start date: 2010-01-01 00:01:00 and Recent date: 2019-09-10 23:55:00


In [5]:
#start date 2019
start_date = '2019-01-01 00:01:00'
end_date = '2019-12-31 23:59:00'
mask = (filtered_df['Date'] > start_date) & (filtered_df['Date'] <= end_date)
filtered_crime_df_2019 = filtered_df.loc[mask]
filtered_crime_df_2019.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts
0,0,11824091,2019-09-10 23:55:00,WEAPONS VIOLATION,UNLAWFUL USE HANDGUN,RESIDENCE PORCH/HALLWAY,False,7,2019,41.775402,-87.653178,17,21559,17
2,2,11824121,2019-09-10 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,15,2019,41.880829,-87.752634,11,22216,25
3,3,11824152,2019-09-10 23:47:00,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,6,2019,41.751657,-87.650131,17,21554,20
4,5,11824113,2019-09-10 23:42:00,ASSAULT,SIMPLE,SIDEWALK,False,15,2019,41.89988,-87.748366,4,4299,25
7,8,11824122,2019-09-10 23:33:00,BATTERY,DOMESTIC BATTERY SIMPLE,ALLEY,True,25,2019,41.919319,-87.758462,4,22615,6


In [6]:
len(filtered_crime_df_2019)

137125

In [7]:
filtered_crime_df_2019.shape 

(137125, 14)

In [9]:
crime_type = ['HOMICIDE']
homicide_df_2019 = filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]

In [10]:
len(homicide_df_2019)

356

In [11]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
coords = homicide_df_2019.as_matrix(columns=['Latitude', 'Longitude'])

  """


In [12]:
kms_per_radian = 6371.0088
epsilon = 1.5 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 21


In [13]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)
centermost_points = clusters.map(get_centermost_point)


In [14]:
lats, lons = zip(*centermost_points)
homicide_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
homicide_rep_points_2019['Crime_type'] = 'HOMICIDE'
homicide_rep_points_2019['Year'] = '2019'

In [15]:
homicide_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.655833,41.864102,HOMICIDE,2019
1,-87.743143,41.807168,HOMICIDE,2019
2,-87.536489,41.711484,HOMICIDE,2019
3,-87.626358,41.757639,HOMICIDE,2019
4,-87.729324,41.878823,HOMICIDE,2019
5,-87.691362,41.920311,HOMICIDE,2019
6,-87.669085,42.018967,HOMICIDE,2019
7,-87.644258,41.935855,HOMICIDE,2019
8,-87.633564,41.911161,HOMICIDE,2019
9,-87.708548,41.971042,HOMICIDE,2019


In [16]:
crime_type = ['ARSON']
arson_df_2019= filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]
len(arson_df_2019)

255

In [17]:
coords = arson_df_2019.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 43


  """Entry point for launching an IPython kernel.


In [18]:
centermost_points = clusters.map(get_centermost_point)

In [19]:
lats, lons = zip(*centermost_points)
arson_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
arson_rep_points_2019['Crime_type'] = 'ARSON'
arson_rep_points_2019['Year'] = '2019'

In [20]:
arson_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.572315,41.751198,ARSON,2019
1,-87.718157,41.965221,ARSON,2019
2,-87.658316,41.768635,ARSON,2019
3,-87.734733,41.896039,ARSON,2019
4,-87.531663,41.693044,ARSON,2019
5,-87.698809,41.79823,ARSON,2019
6,-87.74064,41.974209,ARSON,2019
7,-87.634594,41.893182,ARSON,2019
8,-87.681385,41.939535,ARSON,2019
9,-87.792272,41.944192,ARSON,2019


In [21]:
crime_type = ['NARCOTICS']
narcotics_df_2019 = filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]
len(narcotics_df_2019)

9769

In [22]:
coords = narcotics_df_2019.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 7


In [23]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
narcotics_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
narcotics_rep_points_2019['Crime_type'] = 'NARCOTICS'
narcotics_rep_points_2019['Year'] = '2019'
narcotics_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.692809,41.845354,NARCOTICS,2019
1,-87.806976,41.980243,NARCOTICS,2019
2,-87.539883,41.652856,NARCOTICS,2019
3,-87.900984,41.976763,NARCOTICS,2019
4,-87.907473,41.9539,NARCOTICS,2019
5,-87.788736,41.997342,NARCOTICS,2019
6,-87.748322,41.989045,NARCOTICS,2019


In [24]:
crime_type = ['THEFT']
theft_df_2019 = filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]
len(theft_df_2019)

42050

In [25]:
coords = theft_df_2019.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [26]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
theft_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
theft_rep_points_2019['Crime_type'] = 'THEFT'
theft_rep_points_2019['Year'] = '2019'
theft_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.661461,41.863029,THEFT,2019
1,-87.901514,41.9766,THEFT,2019
2,-87.91487,41.994301,THEFT,2019
3,-87.917645,41.962448,THEFT,2019
4,-87.559086,41.683054,THEFT,2019


In [27]:
crime_type = ['BATTERY']
battery_df_2019 = filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]
len(battery_df_2019)

35103

In [28]:
coords = battery_df_2019.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 8


In [29]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
battery_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
battery_rep_points_2019['Crime_type'] = 'BATTERY'
battery_rep_points_2019['Year'] = '2019'
battery_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.669198,41.834122,BATTERY,2019
1,-87.900984,41.976763,BATTERY,2019
2,-87.544723,41.654766,BATTERY,2019
3,-87.881938,41.994914,BATTERY,2019
4,-87.88717,41.959441,BATTERY,2019
5,-91.686566,36.619446,BATTERY,2019
6,-87.907473,41.9539,BATTERY,2019
7,-87.899009,42.005441,BATTERY,2019


In [30]:
crime_type = ['ROBBERY']
robbery_df_2019 = filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]
len(robbery_df_2019)

5456

In [31]:
coords = robbery_df_2019.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 9


In [32]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
robbery_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
robbery_rep_points_2019['Crime_type'] = 'ROBBERY'
robbery_rep_points_2019['Year'] = '2019'
robbery_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.665743,41.837829,ROBBERY,2019
1,-87.547152,41.652961,ROBBERY,2019
2,-87.836618,41.976181,ROBBERY,2019
3,-87.816866,41.980315,ROBBERY,2019
4,-87.786738,42.011481,ROBBERY,2019
5,-87.807138,41.997178,ROBBERY,2019
6,-87.5401,41.680482,ROBBERY,2019
7,-87.904123,41.978108,ROBBERY,2019
8,-87.702547,41.705718,ROBBERY,2019


In [33]:
crime_type = ['ASSAULT']
assault_df_2019 = filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]
len(assault_df_2019)

14588

In [34]:
coords = assault_df_2019.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 6


In [35]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
assault_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
assault_rep_points_2019['Crime_type'] = 'ASSAULT'
assault_rep_points_2019['Year'] = '2019'
assault_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.66586,41.830497,ASSAULT,2019
1,-87.900984,41.976763,ASSAULT,2019
2,-87.881938,41.994914,ASSAULT,2019
3,-87.914046,41.995434,ASSAULT,2019
4,-87.927365,42.006074,ASSAULT,2019
5,-87.915105,41.953783,ASSAULT,2019


In [36]:
crime_type = ['MOTOR VEHICLE THEFT']
mvt_df_2019 = filtered_crime_df_2019[filtered_crime_df_2019.Primary_Type.isin(crime_type)]
len(mvt_df_2019)

6303

In [37]:
coords = mvt_df_2019.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [38]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
mvt_rep_points_2019 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
mvt_rep_points_2019['Crime_type'] = 'MOTOR VEHICLE THEFT'
mvt_rep_points_2019['Year'] = '2019'
mvt_rep_points_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.675636,41.844288,MOTOR VEHICLE THEFT,2019
1,-87.548392,41.651498,MOTOR VEHICLE THEFT,2019
2,-87.881455,41.987154,MOTOR VEHICLE THEFT,2019
3,-87.905227,41.97629,MOTOR VEHICLE THEFT,2019
4,-87.576315,41.673646,MOTOR VEHICLE THEFT,2019


In [39]:
total_cluster_df_2019 = pd.concat([homicide_rep_points_2019, arson_rep_points_2019, narcotics_rep_points_2019, theft_rep_points_2019, battery_rep_points_2019,
                     robbery_rep_points_2019, assault_rep_points_2019, mvt_rep_points_2019], ignore_index=True)

In [40]:
total_cluster_df_2019

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.655833,41.864102,HOMICIDE,2019
1,-87.743143,41.807168,HOMICIDE,2019
2,-87.536489,41.711484,HOMICIDE,2019
3,-87.626358,41.757639,HOMICIDE,2019
4,-87.729324,41.878823,HOMICIDE,2019
5,-87.691362,41.920311,HOMICIDE,2019
6,-87.669085,42.018967,HOMICIDE,2019
7,-87.644258,41.935855,HOMICIDE,2019
8,-87.633564,41.911161,HOMICIDE,2019
9,-87.708548,41.971042,HOMICIDE,2019


In [41]:
#save CSV
total_cluster_df_2019.to_csv("assets/data/clusters2019.csv", encoding="utf-8")