In [1]:
import pandas as pd
import numpy as np
import csv

from sqlalchemy import create_engine


In [2]:
import os
import json
from config import password

import pandas as pd
import numpy as np

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

from flask import Flask, jsonify, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql+psycopg2://postgres:{password}@localhost:5432/crime_db"
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(db.engine, reflect=True)

# prepare to load the entire chicago table into a dataframe
Chicago_Metadata = Base.classes.chicago
stmt = db.session.query(Chicago_Metadata).statement
df = pd.read_sql_query(stmt, db.session.bind)
print("Loaded dataframe successfully...")

# Filter dataframe by certain crime types
crime_types = ['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'ASSAULT', 
    'MOTOR VEHICLE THEFT', 'ROBBERY', 'WEAPONS VIOLATION', 'CONCEALED CARRY LICENSE VIOLATION', 'HOMICIDE', 'ARSON']
filtered_df = df[df.Primary_Type.isin(crime_types)]

Loaded dataframe successfully...


In [3]:
len(filtered_df)

2199538

In [4]:
# Check date range
least_recent_date = filtered_df['Date'].min()
recent_date = filtered_df['Date'].max()
print(f'Start date: {least_recent_date} and Recent date: {recent_date}')

Start date: 2010-01-01 00:01:00 and Recent date: 2019-09-10 23:55:00


In [5]:
#start date 2011
start_date = '2011-01-01 00:01:00'
end_date = '2011-12-31 23:59:00'
mask = (filtered_df['Date'] > start_date) & (filtered_df['Date'] <= end_date)
filtered_crime_df_2011 = filtered_df.loc[mask]
filtered_crime_df_2011.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts
2147689,2167812,8421333,2011-12-31 18:20:00,THEFT,OVER $500,APARTMENT,False,4,2011,41.752464,-87.569903,43,22538,19
2147790,2167574,8427459,2011-12-31 23:59:00,THEFT,$500 AND UNDER,APARTMENT,False,24,2011,42.017942,-87.682288,3,22528,11
2147791,2167575,8424391,2011-12-31 23:55:00,THEFT,$500 AND UNDER,STREET,False,7,2011,41.776589,-87.630571,53,21559,17
2147793,2167577,8422464,2011-12-31 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,9,2011,41.802483,-87.646246,12,14924,23
2147794,2167578,8421387,2011-12-31 23:50:00,BATTERY,SIMPLE,PARK PROPERTY,True,18,2011,41.89199,-87.611462,22,21182,14


In [6]:
len(filtered_crime_df_2011)

269947

In [7]:
filtered_crime_df_2011.shape 

(269947, 14)

In [8]:
crime_type = ['HOMICIDE']
homicide_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]

In [9]:
len(homicide_df_2011)

438

In [10]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
coords = homicide_df_2011.as_matrix(columns=['Latitude', 'Longitude'])

  """


In [11]:
kms_per_radian = 6371.0088
epsilon = 1.5 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 24


In [12]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)
centermost_points = clusters.map(get_centermost_point)


In [13]:
lats, lons = zip(*centermost_points)
homicide_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
homicide_rep_points_2011['Crime_type'] = 'HOMICIDE'
homicide_rep_points_2011['Year'] = '2011'

In [14]:
homicide_rep_points_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.638712,41.772023,HOMICIDE,2011
1,-87.7291,41.878828,HOMICIDE,2011
2,-87.627964,41.694968,HOMICIDE,2011
3,-87.641875,41.895495,HOMICIDE,2011
4,-87.713396,41.953851,HOMICIDE,2011
5,-87.672458,42.015235,HOMICIDE,2011
6,-87.661003,41.9645,HOMICIDE,2011
7,-87.665642,41.895581,HOMICIDE,2011
8,-87.609073,41.659136,HOMICIDE,2011
9,-87.647258,41.837566,HOMICIDE,2011


In [15]:
crime_type = ['ARSON']
arson_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]
len(arson_df_2011)

504

In [16]:
coords = arson_df_2011.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 21


  """Entry point for launching an IPython kernel.


In [17]:
centermost_points = clusters.map(get_centermost_point)

In [18]:
lats, lons = zip(*centermost_points)
arson_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
arson_rep_points_2011['Crime_type'] = 'ARSON'
arson_rep_points_2011['Year'] = '2011'

In [19]:
arson_rep_points_2011


Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.645751,41.769922,ARSON,2011
1,-87.541345,41.700463,ARSON,2011
2,-87.725055,41.901874,ARSON,2011
3,-87.543518,41.649157,ARSON,2011
4,-87.713433,41.740485,ARSON,2011
5,-87.639586,41.878931,ARSON,2011
6,-87.562183,41.714651,ARSON,2011
7,-87.784436,41.776727,ARSON,2011
8,-87.688718,41.999697,ARSON,2011
9,-87.771631,41.982926,ARSON,2011


In [21]:
crime_type = ['NARCOTICS']
narcotics_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]
len(narcotics_df_2011)

38579

In [22]:
coords = narcotics_df_2011.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 7


In [23]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
narcotics_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
narcotics_rep_points_2011['Crime_type'] = 'NARCOTICS'
narcotics_rep_points_2011['Year'] = '2011'
narcotics_rep_points_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.680298,41.834803,NARCOTICS,2011
1,-87.907473,41.9539,NARCOTICS,2011
2,-87.899335,41.977437,NARCOTICS,2011
3,-87.545913,41.654656,NARCOTICS,2011
4,-91.686566,36.619446,NARCOTICS,2011
5,-87.842281,41.979202,NARCOTICS,2011
6,-87.785597,42.012,NARCOTICS,2011


In [24]:
crime_type = ['THEFT']
theft_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]
len(theft_df_2011)

75027

In [25]:
coords = theft_df_2011.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 3


In [26]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
theft_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
theft_rep_points_2011['Crime_type'] = 'THEFT'
theft_rep_points_2011['Year'] = '2011'
theft_rep_points_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.666086,41.855682,THEFT,2011
1,-87.901783,41.977139,THEFT,2011
2,-91.686566,36.619446,THEFT,2011


In [27]:
crime_type = ['BATTERY']
battery_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]
len(battery_df_2011)

60446

In [28]:
coords = battery_df_2011.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 4


In [29]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
battery_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
battery_rep_points_2011['Crime_type'] = 'BATTERY'
battery_rep_points_2011['Year'] = '2011'
battery_rep_points_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.667983,41.831645,BATTERY,2011
1,-87.898013,41.977213,BATTERY,2011
2,-91.686566,36.619446,BATTERY,2011
3,-87.907073,42.008849,BATTERY,2011


In [30]:
crime_type = ['ROBBERY']
robbery_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]
len(robbery_df_2011)

13977

In [31]:
coords = robbery_df_2011.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 7


In [32]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
robbery_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
robbery_rep_points_2011['Crime_type'] = 'ROBBERY'
robbery_rep_points_2011['Year'] = '2011'
robbery_rep_points_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.667936,41.82985,ROBBERY,2011
1,-87.824299,41.977052,ROBBERY,2011
2,-87.547148,41.654547,ROBBERY,2011
3,-87.714178,41.706006,ROBBERY,2011
4,-87.908269,41.951856,ROBBERY,2011
5,-87.906473,42.008885,ROBBERY,2011
6,-91.686566,36.619446,ROBBERY,2011


In [36]:
crime_type = ['ASSAULT']
assault_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]
len(assault_df_2011)

20405

In [37]:
coords = assault_df_2011.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 4


In [38]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
assault_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
assault_rep_points_2011['Crime_type'] = 'ASSAULT'
assault_rep_points_2011['Year'] = '2011'
assault_rep_points_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.667893,41.827968,ASSAULT,2011
1,-87.896576,41.977021,ASSAULT,2011
2,-87.927882,41.99247,ASSAULT,2011
3,-87.893399,41.950804,ASSAULT,2011


In [39]:
crime_type = ['MOTOR VEHICLE THEFT']
mvt_df_2011 = filtered_crime_df_2011[filtered_crime_df_2011.Primary_Type.isin(crime_type)]
len(mvt_df_2011)

19382

In [40]:
coords = mvt_df_2011.as_matrix(columns=['Latitude', 'Longitude'])
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

  """Entry point for launching an IPython kernel.


Number of clusters: 8


In [41]:
centermost_points = clusters.map(get_centermost_point)
lats, lons = zip(*centermost_points)
mvt_rep_points_2011 = pd.DataFrame({'Longitude':lons, 'Latitude':lats})
mvt_rep_points_2011['Crime_type'] = 'MOTOR VEHICLE THEFT'
mvt_rep_points_2011['Year'] = '2011'
mvt_rep_points_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.67646,41.836656,MOTOR VEHICLE THEFT,2011
1,-87.545911,41.654807,MOTOR VEHICLE THEFT,2011
2,-87.776185,42.010714,MOTOR VEHICLE THEFT,2011
3,-87.883611,41.980826,MOTOR VEHICLE THEFT,2011
4,-87.909079,41.960023,MOTOR VEHICLE THEFT,2011
5,-87.906463,41.979006,MOTOR VEHICLE THEFT,2011
6,-87.88566,41.995516,MOTOR VEHICLE THEFT,2011
7,-91.686566,36.619446,MOTOR VEHICLE THEFT,2011


In [43]:
total_cluster_df_2011 = pd.concat([homicide_rep_points_2011, arson_rep_points_2011, narcotics_rep_points_2011, theft_rep_points_2011, battery_rep_points_2011,
                     robbery_rep_points_2011, assault_rep_points_2011, mvt_rep_points_2011], ignore_index=True)

In [44]:
total_cluster_df_2011

Unnamed: 0,Longitude,Latitude,Crime_type,Year
0,-87.638712,41.772023,HOMICIDE,2011
1,-87.729100,41.878828,HOMICIDE,2011
2,-87.627964,41.694968,HOMICIDE,2011
3,-87.641875,41.895495,HOMICIDE,2011
4,-87.713396,41.953851,HOMICIDE,2011
5,-87.672458,42.015235,HOMICIDE,2011
6,-87.661003,41.964500,HOMICIDE,2011
7,-87.665642,41.895581,HOMICIDE,2011
8,-87.609073,41.659136,HOMICIDE,2011
9,-87.647258,41.837566,HOMICIDE,2011


In [45]:
#save CSV
total_cluster_df_2011.to_csv("assets/data/clusters2011.csv", encoding="utf-8")