In [1]:
import pandas as pd
import numpy as np
import csv

from sqlalchemy import create_engine


In [2]:
import os
import json
from config import password

import pandas as pd
import numpy as np

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

from flask import Flask, jsonify, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql+psycopg2://postgres:{password}@localhost:5432/crime_db"
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(db.engine, reflect=True)

# prepare to load the entire chicago table into a dataframe
Chicago_Metadata = Base.classes.chicago
stmt = db.session.query(Chicago_Metadata).statement
df = pd.read_sql_query(stmt, db.session.bind)
print("Loaded dataframe successfully...")

# Filter dataframe by certain crime types
crime_types = ['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'ASSAULT', 
    'MOTOR VEHICLE THEFT', 'ROBBERY', 'WEAPONS VIOLATION', 'CONCEALED CARRY LICENSE VIOLATION', 'HOMICIDE', 'ARSON']
filtered_df = df[df.Primary_Type.isin(crime_types)]

Loaded dataframe successfully...


In [3]:
len(filtered_df)

2199538

In [4]:
# Check date range
least_recent_date = filtered_df['Date'].min()
recent_date = filtered_df['Date'].max()
print(f'Start date: {least_recent_date} and Recent date: {recent_date}')

Start date: 2010-01-01 00:01:00 and Recent date: 2019-09-10 23:55:00


In [7]:
#start date 2014
start_date = '2014-01-01 00:01:00'
end_date = '2014-12-31 23:59:00'
mask = (filtered_df['Date'] > start_date) & (filtered_df['Date'] <= end_date)
filtered_crime_df_2014 = filtered_df.loc[mask]
filtered_crime_df_2014.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts
1232282,1248993,9911557,2014-12-31 23:58:00,BATTERY,SIMPLE,RESIDENCE PORCH/HALLWAY,False,24,2014,41.993559,-87.683593,46,4450,11
1232284,1248995,9911147,2014-12-31 23:50:00,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,ALLEY,True,6,2014,41.750468,-87.655615,18,21554,20
1232285,1248996,9911122,2014-12-31 23:50:00,WEAPONS VIOLATION,UNLAWFUL USE HANDGUN,RESIDENCE PORCH/HALLWAY,True,11,2014,41.892408,-87.721101,41,21572,16
1232288,1248999,9911077,2014-12-31 23:34:00,BATTERY,SIMPLE,BAR OR TAVERN,True,19,2014,41.926035,-87.649589,51,21190,5
1232290,1249002,9911776,2014-12-31 23:30:00,THEFT,OVER $500,OTHER,False,18,2014,41.893542,-87.629702,22,4446,14


In [8]:
len(filtered_crime_df_2014)

208051

In [9]:
filtered_crime_df_2014.shape 

(208051, 14)

In [10]:
crime_type = ['HOMICIDE']
homicide_df_2014 = filtered_crime_df_2014[filtered_crime_df_2014.Primary_Type.isin(crime_type)]

In [11]:
len(homicide_df_2014)

426

In [12]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
coords = homicide_df_2014.as_matrix(columns=['Latitude', 'Longitude'])

  """


In [13]:
kms_per_radian = 6371.0088
epsilon = 1.5 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 26


In [14]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)
centermost_points = clusters.map(get_centermost_point)


In [15]:
lats, lons = zip(*centermost_points)
homicide_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
homicide_rep_points_2014['Crime_type'] = 'HOMICIDE'
homicide_rep_points_2014['Year'] = '2014'

In [16]:
homicide_rep_points_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.639429,41.755473,HOMICIDE,2014
1,-87.727576,41.884657,HOMICIDE,2014
2,-87.691953,41.985637,HOMICIDE,2014
3,-87.605892,41.654286,HOMICIDE,2014
4,-87.760363,41.777183,HOMICIDE,2014
5,-87.720667,41.967383,HOMICIDE,2014
6,-87.672883,42.004694,HOMICIDE,2014
7,-87.658339,41.965387,HOMICIDE,2014
8,-87.538377,41.700934,HOMICIDE,2014
9,-87.637241,41.898412,HOMICIDE,2014


In [17]:
crime_type = ['ARSON']
arson_df_2014 = filtered_crime_df_2014[filtered_crime_df_2014.Primary_Type.isin(crime_type)]
len(arson_df_2014)

396

In [18]:
coords = arson_df_2014.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 29


  """Entry point for launching an IPython kernel.


In [19]:
centermost_points = clusters.map(get_centermost_point)

In [20]:
lats, lons = zip(*centermost_points)
arson_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
arson_rep_points_2014['Crime_type'] = 'ARSON'
arson_rep_points_2014['Year'] = '2014'

In [21]:
arson_rep_points_2014


Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.653436,41.832158,ARSON,2014
1,-87.680395,41.931017,ARSON,2014
2,-87.610398,41.816187,ARSON,2014
3,-87.631043,41.677124,ARSON,2014
4,-87.639235,41.868542,ARSON,2014
5,-87.681244,41.82772,ARSON,2014
6,-87.724939,41.743573,ARSON,2014
7,-87.562086,41.707454,ARSON,2014
8,-87.806941,41.981433,ARSON,2014
9,-87.765728,41.786574,ARSON,2014


In [22]:
crime_type = ['NARCOTICS']
narcotics_df_2014 = filtered_crime_df_2014[filtered_crime_df_2014.Primary_Type.isin(crime_type)]
len(narcotics_df_2014)

28906

In [23]:
coords = narcotics_df_2014.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [24]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
narcotics_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
narcotics_rep_points_2014['Crime_type'] = 'NARCOTICS'
narcotics_rep_points_2014['Year'] = '2014'
narcotics_rep_points_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.684583,41.845154,NARCOTICS,2014
1,-87.907473,41.9539,NARCOTICS,2014
2,-87.905227,41.97629,NARCOTICS,2014
3,-87.574483,41.645076,NARCOTICS,2014
4,-91.686566,36.619446,NARCOTICS,2014


In [25]:
crime_type = ['THEFT']
theft_df_2014 = filtered_crime_df_2014[filtered_crime_df_2014.Primary_Type.isin(crime_type)]
len(theft_df_2014)

61440

In [26]:
coords = theft_df_2014.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 4


In [27]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
theft_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
theft_rep_points_2014['Crime_type'] = 'THEFT'
theft_rep_points_2014['Year'] = '2014'
theft_rep_points_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.666042,41.853885,THEFT,2014
1,-87.901783,41.977139,THEFT,2014
2,-87.904804,42.00712,THEFT,2014
3,-91.686566,36.619446,THEFT,2014


In [28]:
crime_type = ['BATTERY']
battery_df_2014 = filtered_crime_df_2014[filtered_crime_df_2014.Primary_Type.isin(crime_type)]
len(battery_df_2014)

49414

In [29]:
coords = battery_df_2014.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [30]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
battery_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
battery_rep_points_2014['Crime_type'] = 'BATTERY'
battery_rep_points_2014['Year'] = '2014'
battery_rep_points_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.670438,41.832049,BATTERY,2014
1,-87.900984,41.976763,BATTERY,2014
2,-87.884524,41.957746,BATTERY,2014
3,-87.907473,41.9539,BATTERY,2014
4,-87.575472,41.68032,BATTERY,2014


In [31]:
crime_type = ['ROBBERY']
robbery_df_2014 = filtered_crime_df_2014[filtered_crime_df_2014.Primary_Type.isin(crime_type)]
len(robbery_df_2014)

9795

In [32]:
coords = robbery_df_2014.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 6


In [33]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
robbery_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
robbery_rep_points_2014['Crime_type'] = 'ROBBERY'
robbery_rep_points_2014['Year'] = '2014'
robbery_rep_points_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.66826,41.83047,ROBBERY,2014
1,-87.547153,41.652886,ROBBERY,2014
2,-87.836636,41.950677,ROBBERY,2014
3,-87.836582,41.984622,ROBBERY,2014
4,-87.814943,41.999697,ROBBERY,2014
5,-87.903639,41.978466,ROBBERY,2014


In [34]:
crime_type = ['ASSAULT']
assault_df_2014 = filtered_crime_df_2014[filtered_crime_df_2014.Primary_Type.isin(crime_type)]
len(assault_df_2014)

16889

In [35]:
coords = assault_df_2014.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 5


In [36]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
assault_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
assault_rep_points_2014['Crime_type'] = 'ASSAULT'
assault_rep_points_2014['Year'] = '2014'
assault_rep_points_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.667898,41.828173,ASSAULT,2014
1,-87.900984,41.976763,ASSAULT,2014
2,-87.890339,42.002993,ASSAULT,2014
3,-87.914585,41.96079,ASSAULT,2014
4,-91.686566,36.619446,ASSAULT,2014


In [37]:
crime_type = ['MOTOR VEHICLE THEFT']
mvt_df_2014 = filtered_crime_df_2013[filtered_crime_df_2014.Primary_Type.isin(crime_type)]
len(mvt_df_2014)

9893

In [38]:
coords = mvt_df_2014.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 4


In [39]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
mvt_rep_points_2014 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
mvt_rep_points_2014['Crime_type'] = 'MOTOR VEHICLE THEFT'
mvt_rep_points_2014['Year'] = '2014'
mvt_rep_points_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.675636,41.844288,MOTOR VEHICLE THEFT,2014
1,-87.88692,41.980436,MOTOR VEHICLE THEFT,2014
2,-87.545911,41.654527,MOTOR VEHICLE THEFT,2014
3,-91.686566,36.619446,MOTOR VEHICLE THEFT,2014


In [40]:
total_cluster_df_2014 = pd.concat([homicide_rep_points_2014, arson_rep_points_2014, narcotics_rep_points_2014, theft_rep_points_2014, battery_rep_points_2014,
                     robbery_rep_points_2014, assault_rep_points_2014, mvt_rep_points_2014], ignore_index=True)

In [41]:
total_cluster_df_2014

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.639429,41.755473,HOMICIDE,2014
1,-87.727576,41.884657,HOMICIDE,2014
2,-87.691953,41.985637,HOMICIDE,2014
3,-87.605892,41.654286,HOMICIDE,2014
4,-87.760363,41.777183,HOMICIDE,2014
5,-87.720667,41.967383,HOMICIDE,2014
6,-87.672883,42.004694,HOMICIDE,2014
7,-87.658339,41.965387,HOMICIDE,2014
8,-87.538377,41.700934,HOMICIDE,2014
9,-87.637241,41.898412,HOMICIDE,2014


In [42]:
#save CSV
total_cluster_df_2014.to_csv("assets/data/clusters2014.csv", encoding="utf-8")