In [1]:
import pandas as pd
import numpy as np
import csv

from sqlalchemy import create_engine


In [2]:
import os
import json
from config import password

import pandas as pd
import numpy as np

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

from flask import Flask, jsonify, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql+psycopg2://postgres:{password}@localhost:5432/crime_db"
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(db.engine, reflect=True)

# prepare to load the entire chicago table into a dataframe
Chicago_Metadata = Base.classes.chicago
stmt = db.session.query(Chicago_Metadata).statement
df = pd.read_sql_query(stmt, db.session.bind)
print("Loaded dataframe successfully...")

# Filter dataframe by certain crime types
crime_types = ['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'ASSAULT', 
    'MOTOR VEHICLE THEFT', 'ROBBERY', 'WEAPONS VIOLATION', 'CONCEALED CARRY LICENSE VIOLATION', 'HOMICIDE', 'ARSON']
filtered_df = df[df.Primary_Type.isin(crime_types)]

Loaded dataframe successfully...


In [3]:
len(filtered_df)

2199538

In [4]:
# Check date range
least_recent_date = filtered_df['Date'].min()
recent_date = filtered_df['Date'].max()
print(f'Start date: {least_recent_date} and Recent date: {recent_date}')

Start date: 2010-01-01 00:01:00 and Recent date: 2019-09-10 23:55:00


In [5]:
#start date 2018
start_date = '2018-01-01 00:01:00'
end_date = '2018-12-31 23:59:00'
mask = (filtered_df['Date'] > start_date) & (filtered_df['Date'] <= end_date)
filtered_crime_df_2018 = filtered_df.loc[mask]
filtered_crime_df_2018.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts
178819,179891,11556487,2018-12-31 23:59:00,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,22,2018,41.689079,-87.696064,33,4447,9
178820,179892,11552699,2018-12-31 23:57:00,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,6,2018,41.740521,-87.647391,18,21554,20
178821,179893,11552724,2018-12-31 23:56:00,BATTERY,AGG: HANDS/FIST/FEET NO/MINOR INJURY,OTHER,True,12,2018,41.857068,-87.657625,8,14920,15
178822,179894,11552731,2018-12-31 23:55:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,6,2018,41.751914,-87.647717,17,21554,20
178823,179895,11552715,2018-12-31 23:49:00,BATTERY,AGGRAVATED: HANDGUN,STREET,False,15,2018,41.875684,-87.760479,52,22216,25


In [6]:
len(filtered_crime_df_2018)

200726

In [7]:
filtered_crime_df_2018.shape 

(200726, 14)

In [8]:
crime_type = ['HOMICIDE']
homicide_df_2018 = filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]

In [9]:
len(homicide_df_2018)

589

In [10]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
coords = homicide_df_2018.as_matrix(columns=['Latitude', 'Longitude'])

  """


In [11]:
kms_per_radian = 6371.0088
epsilon = 1.5 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 27


In [12]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)
centermost_points = clusters.map(get_centermost_point)


In [13]:
lats, lons = zip(*centermost_points)
homicide_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
homicide_rep_points_2018['Crime_type'] = 'HOMICIDE'
homicide_rep_points_2018['Year'] = '2018'

In [14]:
homicide_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.63593,41.758054,HOMICIDE,2018
1,-87.765095,41.775264,HOMICIDE,2018
2,-87.731877,41.886927,HOMICIDE,2018
3,-87.673828,42.01265,HOMICIDE,2018
4,-87.698341,41.821812,HOMICIDE,2018
5,-87.658865,41.965997,HOMICIDE,2018
6,-87.560899,41.70906,HOMICIDE,2018
7,-87.703032,41.939363,HOMICIDE,2018
8,-87.667909,41.8997,HOMICIDE,2018
9,-87.710824,41.698002,HOMICIDE,2018


In [15]:
crime_type = ['ARSON']
arson_df_2018= filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]
len(arson_df_2018)

373

In [16]:
coords = arson_df_2018.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 25


  """Entry point for launching an IPython kernel.


In [17]:
centermost_points = clusters.map(get_centermost_point)

In [18]:
lats, lons = zip(*centermost_points)
arson_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
arson_rep_points_2018['Crime_type'] = 'ARSON'
arson_rep_points_2018['Year'] = '2018'

In [19]:
arson_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.739807,41.899098,ARSON,2018
1,-87.607133,41.832498,ARSON,2018
2,-87.648087,41.766382,ARSON,2018
3,-87.679869,41.992188,ARSON,2018
4,-87.817026,41.951353,ARSON,2018
5,-87.670123,41.85865,ARSON,2018
6,-87.740064,41.740902,ARSON,2018
7,-87.604323,41.807211,ARSON,2018
8,-87.623946,41.858413,ARSON,2018
9,-87.624551,41.887385,ARSON,2018


In [20]:
crime_type = ['NARCOTICS']
narcotics_df_2018 = filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]
len(narcotics_df_2018)

12797

In [21]:
coords = narcotics_df_2018.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 9


In [22]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
narcotics_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
narcotics_rep_points_2018['Crime_type'] = 'NARCOTICS'
narcotics_rep_points_2018['Year'] = '2018'
narcotics_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.690896,41.844748,NARCOTICS,2018
1,-87.900984,41.976763,NARCOTICS,2018
2,-87.542304,41.653215,NARCOTICS,2018
3,-87.702547,41.705718,NARCOTICS,2018
4,-87.907473,41.9539,NARCOTICS,2018
5,-87.806584,42.016179,NARCOTICS,2018
6,-87.927365,42.006074,NARCOTICS,2018
7,-87.836685,41.965721,NARCOTICS,2018
8,-87.761899,41.999368,NARCOTICS,2018


In [23]:
crime_type = ['THEFT']
theft_df_2018 = filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]
len(theft_df_2018)

64017

In [24]:
coords = theft_df_2018.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [25]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
theft_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
theft_rep_points_2018['Crime_type'] = 'THEFT'
theft_rep_points_2018['Year'] = '2018'
theft_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.663879,41.862102,THEFT,2018
1,-87.903496,41.977791,THEFT,2018
2,-87.559364,41.683456,THEFT,2018
3,-87.919274,41.996998,THEFT,2018
4,-87.934273,42.008162,THEFT,2018


In [26]:
crime_type = ['BATTERY']
battery_df_2018 = filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]
len(battery_df_2018)

49709

In [28]:
coords = battery_df_2018.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 7


In [29]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
battery_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
battery_rep_points_2018['Crime_type'] = 'BATTERY'
battery_rep_points_2018['Year'] = '2018'
battery_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.669198,41.834122,BATTERY,2018
1,-87.89923,41.9773,BATTERY,2018
2,-87.544724,41.654673,BATTERY,2018
3,-87.915105,41.953783,BATTERY,2018
4,-87.88118,41.962076,BATTERY,2018
5,-87.576315,41.673646,BATTERY,2018
6,-87.91487,41.994301,BATTERY,2018


In [30]:
crime_type = ['ROBBERY']
robbery_df_2018 = filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]
len(robbery_df_2018)

9678

In [31]:
coords = robbery_df_2018.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 7


In [32]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
robbery_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
robbery_rep_points_2018['Crime_type'] = 'ROBBERY'
robbery_rep_points_2018['Year'] = '2018'
robbery_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.676635,41.848196,ROBBERY,2018
1,-87.545912,41.65473,ROBBERY,2018
2,-87.814409,41.978512,ROBBERY,2018
3,-87.744379,41.994598,ROBBERY,2018
4,-87.795371,41.797098,ROBBERY,2018
5,-87.903899,41.980783,ROBBERY,2018
6,-87.818426,42.001664,ROBBERY,2018


In [35]:
crime_type = ['ASSAULT']
assault_df_2018 = filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]
len(assault_df_2018)

20340

In [36]:
coords = assault_df_2018.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 6


In [37]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
assault_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
assault_rep_points_2018['Crime_type'] = 'ASSAULT'
assault_rep_points_2018['Year'] = '2018'
assault_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.667978,41.831459,ASSAULT,2018
1,-87.545321,41.653675,ASSAULT,2018
2,-87.901783,41.977139,ASSAULT,2018
3,-87.881938,41.994914,ASSAULT,2018
4,-87.914585,41.96079,ASSAULT,2018
5,-87.573877,41.673676,ASSAULT,2018


In [38]:
crime_type = ['MOTOR VEHICLE THEFT']
mvt_df_2018 = filtered_crime_df_2018[filtered_crime_df_2018.Primary_Type.isin(crime_type)]
len(mvt_df_2018)

9939

In [39]:
coords = mvt_df_2018.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 3


In [40]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
mvt_rep_points_2018 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
mvt_rep_points_2018['Crime_type'] = 'MOTOR VEHICLE THEFT'
mvt_rep_points_2018['Year'] = '2018'
mvt_rep_points_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.675675,41.845861,MOTOR VEHICLE THEFT,2018
1,-87.548366,41.658167,MOTOR VEHICLE THEFT,2018
2,-87.925509,41.989622,MOTOR VEHICLE THEFT,2018


In [42]:
total_cluster_df_2018 = pd.concat([homicide_rep_points_2018, arson_rep_points_2018, narcotics_rep_points_2018, theft_rep_points_2018, battery_rep_points_2018,
                     robbery_rep_points_2018, assault_rep_points_2018, mvt_rep_points_2018], ignore_index=True)

In [43]:
total_cluster_df_2018

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.635930,41.758054,HOMICIDE,2018
1,-87.765095,41.775264,HOMICIDE,2018
2,-87.731877,41.886927,HOMICIDE,2018
3,-87.673828,42.012650,HOMICIDE,2018
4,-87.698341,41.821812,HOMICIDE,2018
5,-87.658865,41.965997,HOMICIDE,2018
6,-87.560899,41.709060,HOMICIDE,2018
7,-87.703032,41.939363,HOMICIDE,2018
8,-87.667909,41.899700,HOMICIDE,2018
9,-87.710824,41.698002,HOMICIDE,2018


In [44]:
#save CSV
total_cluster_df_2018.to_csv("assets/data/clusters2018.csv", encoding="utf-8")