In [687]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.inspection import permutation_importance
from catboost import *
import shap
from time import time

In [688]:
# incidents = read_data()
# Explicit dtypes for the columns whose type should not be inferred from the file.
column_dtypes = {
    'YEAR OCCURRED': np.int_,
    'MONTH OCCURRED': np.int_,
    'GEO CODE': np.str_,
    'CRIME CATEGORY DESCRIPTION': np.str_,
}
# sep=None together with the python engine lets pandas detect the delimiter itself.
incidents_initial = pd.read_csv(
    'data/cleaned_data.csv',
    sep=None,
    engine='python',
    dtype=column_dtypes,
)

In [689]:
# Preview the first 20 rows of the freshly loaded frame.
incidents_initial.head(20)

Unnamed: 0.1,Unnamed: 0,CRIME CATEGORY DESCRIPTION,RESPONSE TIME,MINS OF DAY,DAY OF WEEK,DAY OF MONTH,DAY OF YEAR,MONTH OCCURRED,YEAR OCCURRED,GEO CODE,LATITUDE,LONGITUDE
0,1,Traffic Offenses,14.0,1138.0,1.0,18.0,169.0,6,2019,546-15,,
1,2,Accident,7.0,945.0,4.0,28.0,241.0,8,2020,546-11,40.116511,-88.210833
2,3,Weapons Offenses,0.0,390.0,0.0,15.0,74.0,3,1993,547-11,40.112553,-88.203804
3,4,Assist Other Agency/Business,0.0,569.0,5.0,1.0,214.0,8,2020,548-10,,
4,5,Traffic Offenses,0.0,664.0,2.0,10.0,254.0,9,2008,575-11,40.111277,-88.207467
5,6,Traffic Offenses,0.0,109.0,5.0,11.0,162.0,6,2005,575-02,,
6,7,Disorderly Conduct,9.0,665.0,4.0,8.0,312.0,11,2013,576-11,40.101881,-88.200156
7,8,Assist Other Agency/Business,0.0,1292.0,3.0,7.0,219.0,8,1997,517-03,,
8,9,Traffic Offenses,6.0,1176.0,0.0,23.0,328.0,11,2020,605-08,,
9,10,Traffic Offenses,0.0,1360.0,1.0,25.0,238.0,8,2020,575-06,,


In [None]:
# Number of incidents per crime category; value_counts() orders the result from
# most to least frequent and leaves missing categories out.
counts = incidents_initial['CRIME CATEGORY DESCRIPTION'].value_counts()

# Filter categories: everything after the first entry is a less frequent
# category, so only the single most frequent category is kept.
low_cats = counts.index[1:]

# One vectorised membership test instead of re-filtering the whole frame once
# per category. Rows whose category is missing are never in low_cats, so they
# are retained exactly as the per-category `!=` comparison retained them.
is_low_category = incidents_initial['CRIME CATEGORY DESCRIPTION'].isin(low_cats)
incidents_filtered = incidents_initial[~is_low_category]

In [None]:
# Only want the more urgent crimes: keep strictly positive response times up to
# this ceiling (presumably minutes, going by the name -- TODO confirm).
max_mins = 10

# To work on every category instead, start the pipeline below from
# incidents_initial; to exclude traffic offenses, start from
# incidents_initial[incidents_initial['CRIME CATEGORY DESCRIPTION'] != 'Traffic Offenses'].
response_time = incidents_filtered['RESPONSE TIME']
is_urgent = (response_time > 0) & (response_time <= max_mins)

# The coordinate columns are removed before dropna() so that rows lacking only
# a latitude/longitude are not discarded.
incidents = (
    incidents_filtered
    .drop(columns=['LATITUDE', 'LONGITUDE'])
    .loc[is_urgent]
    .dropna()
)

# Target is the response time; features are every remaining column except the
# target and the leftover 'Unnamed*' columns.
y = incidents['RESPONSE TIME']
features = incidents.drop(columns=['RESPONSE TIME'])
is_unnamed = features.columns.str.contains('^Unnamed')
X = features.loc[:, ~is_unnamed]

In [None]:
# 80/20 train/validation split. The seed is fixed so that the split -- and with
# it every metric and SHAP plot computed from X_validation below -- is the same
# on each Restart & Run All.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [None]:
# Columns CatBoost must treat as categorical. The positions are looked up by
# name rather than hard-coded, so a change in the column layout raises a
# KeyError instead of silently marking the wrong columns as categorical.
categorical_feature_names = ['CRIME CATEGORY DESCRIPTION', 'GEO CODE']
categorical_features_indices = [X.columns.get_loc(name) for name in categorical_feature_names]

In [None]:
# Hyper-parameters for the response-time regressor, kept in one place.
catboost_params = dict(
    iterations=50,
    depth=10,
    learning_rate=0.1,
    loss_function='RMSE',
    od_type='Iter',
)
model = CatBoostRegressor(**catboost_params)

# NOTE(review): the validation split is handed to fit() as eval_set, so the
# scores computed on it afterwards may not be a fully independent estimate --
# confirm whether a separate test split is wanted.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    plot=False,
    verbose=False,
)

In [None]:
from sklearn.metrics import r2_score

# Score the fitted model on the validation split.
pred = model.predict(X_validation)

# RMSE is the square root of the mean squared error between actual and predicted.
mse = mean_squared_error(y_validation, pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_validation, pred)

In [None]:
# Display the validation RMSE computed in the previous cell.
rmse


In [None]:
# Predictions and ground truth side by side, one row per validation sample.
error = pd.DataFrame({'Predicted': pred, 'Actual': y_validation.values})

length = len(error)
# x positions 0..length-1 for the scatter plot.
ind = list(range(length))
# error['index'] = ind

# Order the rows by actual value so the 'Actual' series plots as a rising
# curve with the matching predictions scattered around it.
error = error.sort_values(by=['Actual'])

fig, ax1 = plt.subplots()
for series_name, colour in (('Predicted', 'b'), ('Actual', 'r')):
    ax1.scatter(ind, error[series_name], s=1, c=colour, marker="s", label=series_name)
plt.legend(loc='upper left');
plt.show()

In [None]:
# Histogram of the predicted values, using max_mins bins.
error.hist(column='Predicted', bins=max_mins)

In [None]:
# Histogram of the actual values, using max_mins bins (same binning as the
# 'Predicted' histogram so the two can be compared).
error.hist(column='Actual', bins=max_mins)

In [None]:
# Show the parameters the model was configured with.
model.get_params()

In [None]:

# Wrap the validation split in a Pool so CatBoost knows which columns are categorical.
validation_pool = Pool(
    X_validation,
    label=y_validation,
    cat_features=categorical_features_indices,
)
shap_with_base = model.get_feature_importance(validation_pool, type="ShapValues")

# The last column is taken as the expected (base) value; the remaining columns
# are the per-feature contributions for each validation row.
expected_value = shap_with_base[0, -1]
shap_values = shap_with_base[:, :-1]

shap.initjs()
# Force plot for one validation row (position 3).
row = 3
shap.force_plot(expected_value, shap_values[row, :], X_validation.iloc[row, :])

In [None]:
shap.initjs()
# Police response time by time of day: SHAP dependence plot for 'MINS OF DAY'
# over the validation rows; x_jitter=1 is passed to jitter points along x.
shap.dependence_plot("MINS OF DAY", shap_values, X_validation, x_jitter=1)


In [None]:
# SHAP dependence plot for 'DAY OF YEAR' with 'MONTH OCCURRED' chosen as the
# interaction feature; show=False leaves the figure open instead of showing it.
shap.dependence_plot("DAY OF YEAR", shap_values, X_validation, interaction_index="MONTH OCCURRED",show=False)


In [None]:
# Mirror of the previous plot: 'MONTH OCCURRED' on the x axis with 'DAY OF YEAR'
# chosen as the interaction feature; show=False leaves the figure open.
shap.dependence_plot("MONTH OCCURRED", shap_values, X_validation, interaction_index="DAY OF YEAR",show=False)


In [None]:
shap.initjs()

# Dependence plot for the crime category, with time of day as the interaction
# feature. show=False keeps the figure open so it can be written to disk below.
x = shap.dependence_plot(
    "CRIME CATEGORY DESCRIPTION",
    shap_values,
    X_validation,
    interaction_index="MINS OF DAY",
    show=False,
)

# Grab the figure the plot was drawn on and save it.
fig = plt.gcf()
# fig.set_size_inches(18.5, 6)
plt.savefig('shap-chart-crime-cat')

In [None]:
# Show every dependence graph: one plot per feature column, addressed by
# importance rank ("rank(0)" is the first rank). The loop bound is the actual
# number of feature columns, so no feature is left out if that count is not 7.
for i in range(X_validation.shape[1]):
    shap.dependence_plot("rank("+str(i)+")", shap_values, X_validation)

In [None]:
# Build a SHAP explainer for the fitted model and evaluate it on the full
# feature frame X (not only the validation split).
# NOTE(review): this rebinds `shap_values`, which earlier cells use for the
# array returned by model.get_feature_importance(...); re-running those cells
# after this one will see the new object instead -- consider a distinct name.
explainer = shap.Explainer(model)
shap_values = explainer(X)
shap.plots.bar(shap_values)

In [None]:
# Waterfall plot for the first entry of shap_values.
shap.plots.waterfall(shap_values[0])

In [None]:
# Beeswarm plot over all of shap_values.
shap.plots.beeswarm(shap_values)

In [None]:
# Summary plot of the same shap_values object used by the plots above.
shap.summary_plot(shap_values)

In [None]:
# Distinct crime categories in the unfiltered data, in order of first appearance.
cats = incidents_initial['CRIME CATEGORY DESCRIPTION'].unique()
# NOTE(review): `dfs` is not used anywhere in the visible cells -- confirm
# whether it is still needed.
dfs = []

In [None]:
# Response-time histogram for each crime category in the unfiltered data.
# Only the first 99 categories are visited, which caps the number of figures.
for c in cats[:99]:
    a = incidents_initial[incidents_initial['CRIME CATEGORY DESCRIPTION'] == c]
    print(c)
    a.hist(column='RESPONSE TIME', bins=60)