In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import datetime as dt
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.ensemble import StackingClassifier
from sklearn.decomposition import PCA

In [2]:
file = 'use_of_force_model_data.csv'
data = pd.read_csv(file, index_col = 0)
data

Unnamed: 0,datetime,Incident Location: Street/Highway,Incident Location: Public Transport,Incident Location: Retail Premises,"Incident Location: Open ground (e.g. park, car park, field)",Incident Location: Licensed Premises,Incident Location: Sports or Event Stadia,Incident Location: Hospital/A&E (non-mental-health setting),Incident Location: Mental Health Setting,Incident Location: Police vehicle with prisoner handling cage,...,Newham,Out of force,Redbridge,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,is force required?
0,2020-04-01 00:12:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-04-01 01:30:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2020-04-01 03:30:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2020-04-01 04:25:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2020-04-01 04:12:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147890,2021-01-31 18:00:00,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
147891,2021-01-31 15:20:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
147892,2021-01-31 22:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
147893,2021-01-31 19:48:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
data.drop(columns = 'datetime', inplace = True)

In [4]:
data.head()

Unnamed: 0,Incident Location: Street/Highway,Incident Location: Public Transport,Incident Location: Retail Premises,"Incident Location: Open ground (e.g. park, car park, field)",Incident Location: Licensed Premises,Incident Location: Sports or Event Stadia,Incident Location: Hospital/A&E (non-mental-health setting),Incident Location: Mental Health Setting,Incident Location: Police vehicle with prisoner handling cage,Incident Location: Police vehicle without prisoner handling cage,...,Newham,Out of force,Redbridge,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,is force required?
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
data_small = data.sample(20000, random_state = 7)

In [8]:
for i in data_small.columns:
    print(data_small[i].value_counts())

1    11564
0     8436
Name: Incident Location: Street/Highway, dtype: int64
0    19801
1      199
Name: Incident Location: Public Transport, dtype: int64
0    19611
1      389
Name: Incident Location: Retail Premises, dtype: int64
0    19208
1      792
Name: Incident Location: Open ground (e.g. park, car park, field), dtype: int64
0    19948
1       52
Name: Incident Location: Licensed Premises, dtype: int64
0    19994
1        6
Name: Incident Location: Sports or Event Stadia, dtype: int64
0    19625
1      375
Name: Incident Location: Hospital/A&E (non-mental-health setting), dtype: int64
0    19915
1       85
Name: Incident Location: Mental Health Setting, dtype: int64
0    19959
1       41
Name: Incident Location: Police vehicle with prisoner handling cage, dtype: int64
0    19966
1       34
Name: Incident Location: Police vehicle without prisoner handling cage, dtype: int64
0    15832
1     4168
Name: Incident Location: Dwelling, dtype: int64
0    19619
1      381
Name: Incident L

# Set Baseline Target Result

In [10]:
data_small['is force required?'].value_counts()

0    10777
1     9223
Name: is force required?, dtype: int64

Our baseline target accuracy to beat is 46%.

# Assign Input and Output Variables

In [11]:
X = data_small.iloc[:, :-1]
y = data_small.iloc[:, -1]

In [15]:
X.shape

(20000, 77)

# Dimensionality Reduction (PCA)

In [14]:
pca = PCA(n_components = 20)
pca.fit(X)

PCA(n_components=20)

In [16]:
X_pca = pca.transform(X)

In [17]:
X_pca.shape

(20000, 20)

# Split Data

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state = 7, test_size = 0.2)

# Create Baseline Models

In [23]:
# Evaluation Model
baseline_score = {}
def evaluate_model(model, X_test, y_test, model_name):
    preds = model.predict(X_test)
    plt.figure()
    plot_learning_curves(X_train, y_train, X_test, y_test, model, print_model = False, style = 'ggplot')
    plt.title(f'{model_name} Learning Curve')
    plt.show()
    print(f'Model: {model_name}' )
    print('Model score:', round(model.score(X_test, y_test),2))
    print('----------------------------------------------------------')
    print('Classification Report:\n')
    print(metrics.classification_report(y_test, preds))
    baseline_score[model_name] = round(model.score(X_test, y_test),3)
    print('----------------------------------------------------------')
    print('Baseline Scores List')
    print(baseline_score)

# Evaluation bagging
bagging_score = {}
def evaluate_bagger(model, X_test, y_test, model_name):
    preds = model.predict(X_test)
    plt.figure()
    plot_learning_curves(X_train, y_train, X_test, y_test, model, print_model = False, style = 'ggplot')
    plt.title(f'{model_name} Learning Curve')
    plt.show()
    print(f'Model: {model_name}' )
    print('Model score:', round(model.score(X_test, y_test),2))
    print('----------------------------------------------------------')
    print('Classification Report:\n')
    print(metrics.classification_report(y_test, preds))
    bagging_score[model_name] = round(model.score(X_test, y_test),3)
    print('----------------------------------------------------------')
    print('Bagging Scores List')
    print(bagging_score)

In [20]:
# Parameter Searching

knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv = 5)
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_
print(knn_best)

KNeighborsClassifier(n_neighbors=20)


In [22]:
dt = DecisionTreeClassifier()
params_dt = {'max_leaf_nodes': np.arange(2, 100)}
dt_gs = GridSearchCV(dt, params_dt, cv = 5)
dt_gs.fit(X_train, y_train)
dt_best = dt_gs.best_estimator_
print(dt_best)

DecisionTreeClassifier(max_leaf_nodes=33)


In [24]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

knn = KNeighborsClassifier(n_neighbors = 20)
knn.fit(X_train, y_train)

svc = SVC(probability = True)
svc.fit(X_train, y_train)

dt = DecisionTreeClasifier(max_leaf_nodes = 33)
dt.fit(X_train, y_train)

nb = BernoulliNB()
nb.fit(X_train, y_train)

random_forest = RandomForestClassifier(criterion='entropy', n_estimators= 100, max_features='auto', random_state=7)
random_forest.fit(X_train, y_train)

classifiers = [lr, knn, svc, dt, nb, random_forest]
classifier_names = ['Logistic Regression', 'KNN', 'SVC', 'Decision Tree', 'Bernoulli Naive Bayes', 'Random Forest']

NameError: name 'DecisionTreeClasifier' is not defined

In [None]:
for model, name in zip(classifiers, classifier_names):
    evaluate_model(model, X_test, y_test, name)