In [21]:
import matplotlib.pyplot as plt
import pandas as pd
import datetime

In [22]:
# Import Chicago crime dataset
# url = 'https://raw.githubusercontent.com/jackrapp/team_sundress/natBranch/holdingout.csv'
file = 'data/cleaned_crimes.csv'
df = pd.read_csv(file).drop(['Unnamed: 0'],axis=1)

In [23]:
df.keys()


Index(['Primary Type', 'Description', 'Location Description', 'Arrest',
       'Domestic', 'Ward', 'X Coordinate', 'Y Coordinate', 'Year', 'Latitude',
       'Longitude', 'Month', 'Day', 'Weekday', 'HourOfDay', 'Weapon'],
      dtype='object')

In [24]:
# Check for null values
df.count()

Primary Type            1325803
Description             1325803
Location Description    1325803
Arrest                  1325803
Domestic                1325803
Ward                    1325803
X Coordinate            1325803
Y Coordinate            1325803
Year                    1325803
Latitude                1325803
Longitude               1325803
Month                   1325803
Day                     1325803
Weekday                 1325803
HourOfDay               1325803
Weapon                  1325803
dtype: int64

In [25]:
# Pearsons Correlation
df.corr(method='pearson')

Unnamed: 0,Arrest,Domestic,Ward,X Coordinate,Y Coordinate,Year,Latitude,Longitude,Month,Day,Weekday,HourOfDay
Arrest,1.0,-0.042038,-0.018806,-0.032251,-0.034502,-0.082909,-0.034296,-0.033065,-0.033303,-0.001198,0.002881,0.06976
Domestic,-0.042038,1.0,-0.076961,0.017535,-0.10411,0.014125,-0.103944,0.016062,-0.006167,-0.001635,0.029653,-0.041736
Ward,-0.018806,-0.076961,1.0,-0.440303,0.653341,0.017501,0.653601,-0.434491,0.004603,-0.001317,0.012944,-0.001109
X Coordinate,-0.032251,0.017535,-0.440303,1.0,-0.526403,0.015353,-0.528932,0.999895,0.006909,0.00347,-0.002883,-0.005368
Y Coordinate,-0.034502,-0.10411,0.653341,-0.526403,1.0,0.015782,0.999994,-0.517357,0.005632,-8.6e-05,0.009871,-0.001347
Year,-0.082909,0.014125,0.017501,0.015353,0.015782,1.0,0.015692,0.015726,-0.001783,0.001121,-0.00236,-0.010516
Latitude,-0.034296,-0.103944,0.653601,-0.528932,0.999994,0.015692,1.0,-0.519917,0.005595,-9.8e-05,0.009861,-0.001322
Longitude,-0.033065,0.016062,-0.434491,0.999895,-0.517357,0.015726,-0.519917,1.0,0.007054,0.003494,-0.002742,-0.005422
Month,-0.033303,-0.006167,0.004603,0.006909,0.005632,-0.001783,0.005595,0.007054,1.0,-0.009696,0.001839,0.001724
Day,-0.001198,-0.001635,-0.001317,0.00347,-8.6e-05,0.001121,-9.8e-05,0.003494,-0.009696,1.0,-0.001012,0.013508


In [26]:
# Drop lat & lon - almost 100% correlated with X & Y coord
pearson_df = df.drop(['Latitude','Longitude'], axis=1)
pearson_df.head()

Unnamed: 0,Primary Type,Description,Location Description,Arrest,Domestic,Ward,X Coordinate,Y Coordinate,Year,Month,Day,Weekday,HourOfDay,Weapon
0,THEFT,OVER $500,BAR OR TAVERN,False,False,43.0,1169965.0,1916711.0,2014,1,1,2,0,NONE
1,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,12.0,1156472.0,1885670.0,2014,1,1,2,0,NONE
2,OTHER OFFENSE,HARASSMENT BY TELEPHONE,STREET,False,False,16.0,1170753.0,1862063.0,2014,1,1,2,0,NONE
3,CRIM SEXUAL ABUSE,AGGRAVATED,RESIDENCE,False,True,3.0,1178506.0,1869664.0,2014,1,1,2,0,NONE
4,THEFT,FROM BUILDING,APARTMENT,False,False,35.0,1154565.0,1916914.0,2014,1,1,2,0,NONE


In [27]:
# Convert all text cells to numeric using pandas get dummies
dummy_df = pd.get_dummies(pearson_df)
dummy_df.head()

Unnamed: 0,Arrest,Domestic,Ward,X Coordinate,Y Coordinate,Year,Month,Day,Weekday,HourOfDay,...,Weapon_AIR RIFLE,Weapon_FIREARM,Weapon_HANDGUN,Weapon_HANDS/FIST/FEET NO/MINOR INJURY,Weapon_HANDS/FIST/FEET SERIOUS INJURY,Weapon_KNIFE/CUTTING INSTRUMENT,Weapon_NONE,Weapon_OTHER,Weapon_OTHER DANGEROUS WEAPON,Weapon_OTHER FIREARM
0,False,False,43.0,1169965.0,1916711.0,2014,1,1,2,0,...,0,0,0,0,0,0,1,0,0,0
1,False,False,12.0,1156472.0,1885670.0,2014,1,1,2,0,...,0,0,0,0,0,0,1,0,0,0
2,False,False,16.0,1170753.0,1862063.0,2014,1,1,2,0,...,0,0,0,0,0,0,1,0,0,0
3,False,True,3.0,1178506.0,1869664.0,2014,1,1,2,0,...,0,0,0,0,0,0,1,0,0,0
4,False,False,35.0,1154565.0,1916914.0,2014,1,1,2,0,...,0,0,0,0,0,0,1,0,0,0


In [28]:
# Keep num features below 1000 - total cells less than 1 billion
rows = dummy_df['Arrest'].count()
columns = len(dummy_df.columns)
print(f"{rows} rows, {columns} columns = {rows*columns} cells")

1325803 rows, 359 columns = 475963277 cells


In [30]:
# Test data 2018
test_df = dummy_df[dummy_df['Year']==2018]

In [31]:
test_df.shape

(263105, 359)

# Start Random Forest Tests

In [52]:
# Random Forest dependencies
from sklearn.ensemble import RandomForestClassifier

### Arrest Associations

In [53]:
# What can we predict with the data
y = test_df["Arrest"]

In [54]:
X = test_df.drop("Arrest", axis=1)
feature_names = X.columns
X.shape, y.shape

((263105, 358), (263105,))

In [55]:
# Split the data using train_test_split
from sklearn.model_selection import train_test_split

# Stratify ensures equal distriubution of male/female values in the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1, stratify=y)

In [56]:
# Random Forest using days
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8979126442373474

In [57]:
# Confusion matrix gives chart of t/f values with true false in upper left and true positive in lower right, 
# then the other corners are false positive/bottom left, false negative/top right
from sklearn.metrics import confusion_matrix

predictions = rf.predict(X_test)

confusion_matrix(y_test, predictions)

array([[51633,  1021],
       [ 5694,  7429]])

Arrests - most important features
* As expected a lot of association with narcotics posession, interference with police officer and narcotics dealing
* Avoid the sidewalk and grocery food store
* Association with day/hour of day can be seen in graphs that show crimes increase near the weekend and are very dependent on time of day
* The location/ward dependency most likely are because of the extreme differences in crime across Chicago

In [58]:
 # Find feature importances
importances = rf.feature_importances_

In [38]:
 # Rank features by importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.08771737268791782, 'Y Coordinate'),
 (0.0868147417187789, 'X Coordinate'),
 (0.07275710232956527, 'Primary Type_NARCOTICS POSSESION'),
 (0.06729339602306623, 'Day'),
 (0.0653720498391774, 'HourOfDay'),
 (0.057680994546778275, 'Ward'),
 (0.05328953176985181, 'Month'),
 (0.041737760608743006, 'Weekday'),
 (0.02999415099475373, 'Primary Type_NARCOTICS MANU/DEL'),
 (0.020791256968321403, 'Description_UNLAWFUL POSSESION'),
 (0.017825015399829156, 'Primary Type_NARCOTICS'),
 (0.016496802524556844, 'Description_HEROIN(WHITE)'),
 (0.01579906865520813, 'Primary Type_WEAPONS VIOLATION'),
 (0.014641977617283319, 'Description_RETAIL THEFT'),
 (0.014182907072757212, 'Description_TO LAND'),
 (0.011955916911912385, 'Description_CRACK'),
 (0.010769685999350173, 'Primary Type_CRIMINAL TRESPASS'),
 (0.010479286640591907, 'Primary Type_INTERFERENCE WITH PUBLIC OFFICER'),
 (0.009089811205698852, 'Primary Type_THEFT'),
 (0.008663321406075383, 'Description_AGGRAVATED PO'),
 (0.007981033027366795, 'Descr

### Domestic Associations

In [46]:
# What can we predict with the data
y = test_df["Domestic"]
X = test_df.drop("Domestic", axis=1)
feature_names = X.columns
X.shape, y.shape

((263105, 358), (263105,))

In [47]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=5, stratify=y)

In [48]:
# Random Forest using domestic
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9171746963224228

In [51]:
# Confusion matrix gives chart of t/f values with true false in upper left and true positive in lower right, 
# then the other corners are false positive/bottom left, false negative/top right
from sklearn.metrics import confusion_matrix

predictions = rf.predict(X_test)

confusion_matrix(y_test, predictions)

array([[53144,  1746],
       [ 3702,  7185]])

Domestic - Most important features
* As expected domestic has a strong association with domestic battery, battery in general, apartments and residences
* Each feature has strong location connections due to geographic distributions of poverty in Chicago

In [49]:
 # Find feature importances
importances = rf.feature_importances_

In [50]:
# Rank features by importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.19207102737931042, 'Description_DOMESTIC BATTERY SIMPLE'),
 (0.08291401600889894, 'Y Coordinate'),
 (0.07951645392727187, 'X Coordinate'),
 (0.07743512789366823, 'Primary Type_BATTERY'),
 (0.06061690832347568, 'Day'),
 (0.058792834141975156, 'HourOfDay'),
 (0.05336427039525532, 'Ward'),
 (0.048155546728453874, 'Month'),
 (0.0376872016238805, 'Location Description_APARTMENT'),
 (0.03692723690103175, 'Weekday'),
 (0.032335759025505874, 'Description_SIMPLE'),
 (0.030066735324188812, 'Location Description_RESIDENCE'),
 (0.01746781858194966, 'Description_AGGRAVATED DOMESTIC BATTERY'),
 (0.01582956368823809, 'Description_VIOLATE ORDER OF PROTECTION'),
 (0.012882940858005374, 'Primary Type_THEFT'),
 (0.01012171598691728, 'Primary Type_OTHER OFFENSE'),
 (0.009477008176854937, 'Primary Type_ASSAULT'),
 (0.00793253692836865, 'Arrest'),
 (0.007520026636955998, 'Description_AGGRAVATED'),
 (0.005994403804686204, 'Location Description_STREET'),
 (0.0058218338888953915, 'Primary Type_BURGLARY'),


### Ward Associations

In [65]:
# What can we predict with the data
y = test_df["Ward"]
X = test_df.drop(["Ward",'X Coordinate','Y Coordinate'], axis=1)
feature_names = X.columns
X.shape, y.shape

((263105, 356), (263105,))

In [66]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=5, stratify=y)

In [67]:
# Random Forest using domestic
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.09105006309196223

In [68]:
# Confusion matrix gives chart of t/f values with true false in upper left and true positive in lower right, 
# then the other corners are false positive/bottom left, false negative/top right
from sklearn.metrics import confusion_matrix

predictions = rf.predict(X_test)

confusion_matrix(y_test, predictions)

array([[ 34,  54,  41, ...,  14,  15,   7],
       [ 37, 246,  69, ...,  23,  32,  12],
       [ 19,  70,  98, ...,  22,   8,  11],
       ...,
       [ 17,  30,  23, ...,  35,  13,   6],
       [  8,  41,  34, ...,  17,  62,  10],
       [ 11,  32,  19, ...,   6,  14,  14]])

#### Ward - Most Important Features
* Features are day/time related and location description located, so where and when crime occurs varies a lot by ward
* Often crimes occur in the restuarant/business districts of town so the street/restaurant description may make sense

In [69]:
 # Find feature importances
importances = rf.feature_importances_

# Rank features by importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.28827675203305053, 'HourOfDay'),
 (0.24250823742411914, 'Day'),
 (0.1727345880837036, 'Month'),
 (0.1338612764948961, 'Weekday'),
 (0.009790865972109066, 'Arrest'),
 (0.005896246082484167, 'Domestic'),
 (0.0037624246861882767, 'Location Description_STREET'),
 (0.003236376301324562, 'Location Description_RESTAURANT'),
 (0.0030885118283712066, 'Location Description_OTHER'),
 (0.0030149437705715206, 'Location Description_RESIDENCE'),
 (0.0028674874450158707, 'Location Description_SIDEWALK'),
 (0.00266117437086595, 'Location Description_APARTMENT'),
 (0.002499086073185414, 'Location Description_ALLEY'),
 (0.0024978484010711537, 'Primary Type_BATTERY'),
 (0.0023368413870128914, 'Primary Type_ASSAULT'),
 (0.002296333882378718, 'Location Description_PARKING LOT/GARAGE(NON.RESID.)'),
 (0.002162128694939928, 'Location Description_RESIDENCE PORCH/HALLWAY'),
 (0.0021223604867432995, 'Location Description_VEHICLE NON-COMMERCIAL'),
 (0.002099292962936098, 'Location Description_RESIDENTIAL YARD 