In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import datetime as dt

# Read Data

In [44]:
file = 'use_of_force_model_data.csv'
data = pd.read_csv(file, index_col = 0)
data

Unnamed: 0,datetime,Incident Location: Street/Highway,Incident Location: Public Transport,Incident Location: Retail Premises,"Incident Location: Open ground (e.g. park, car park, field)",Incident Location: Licensed Premises,Incident Location: Sports or Event Stadia,Incident Location: Hospital/A&E (non-mental-health setting),Incident Location: Mental Health Setting,Incident Location: Police vehicle with prisoner handling cage,...,Newham,Out of force,Redbridge,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,is force required?
0,2020-04-01 00:12:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-04-01 01:30:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2020-04-01 03:30:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2020-04-01 04:25:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2020-04-01 04:12:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147890,2021-01-31 18:00:00,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
147891,2021-01-31 15:20:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
147892,2021-01-31 22:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
147893,2021-01-31 19:48:00,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Convert datetime to datetime ordinal
data['datetime'] = pd.to_datetime(data['datetime'])
data['datetime'] = data['datetime'].map(dt.datetime.toordinal)
data


Unnamed: 0,datetime,Incident Location: Street/Highway,Incident Location: Public Transport,Incident Location: Retail Premises,"Incident Location: Open ground (e.g. park, car park, field)",Incident Location: Licensed Premises,Incident Location: Sports or Event Stadia,Incident Location: Hospital/A&E (non-mental-health setting),Incident Location: Mental Health Setting,Incident Location: Police vehicle with prisoner handling cage,...,Newham,Out of force,Redbridge,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,is force required?
0,737516,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,737516,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,737516,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,737516,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,737516,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147890,737821,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
147891,737821,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
147892,737821,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
147893,737821,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
for i in data.columns:
    print(i, data[i].isnull().sum())

datetime 0
Incident Location: Street/Highway 0
Incident Location: Public Transport 0
Incident Location: Retail Premises 0
Incident Location: Open ground (e.g. park, car park, field) 0
Incident Location: Licensed Premises 0
Incident Location: Sports or Event Stadia 0
Incident Location: Hospital/A&E (non-mental-health setting) 0
Incident Location: Mental Health Setting 0
Incident Location: Police vehicle with prisoner handling cage 0
Incident Location: Police vehicle without prisoner handling cage 0
Incident Location: Dwelling 0
Incident Location: Police station (excluding custody block) 0
Incident Location: Custody Block 0
Incident Location: Ambulance 0
Incident Location: Other 0
MainDuty: AFO 0
MainDuty: ARV 0
MainDuty: CID 0
MainDuty: CTSFO 0
MainDuty: Custody 0
MainDuty: Dog handler duty 0
MainDuty: Foot patrol 0
MainDuty: Mobile patrol 0
MainDuty: Mounted section duties 0
MainDuty: Off duty 0
MainDuty: Other 0
MainDuty: PSU/Public order 0
MainDuty: Roads policing 0
MainDuty: Surveil

# Assign Input Output variables

In [51]:
y = data['is force required?']
y.shape

(147895,)

In [52]:
X = data.iloc[:, :-1]
X.shape

(147895, 78)

# Split Data

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 7, test_size = 0.2)

# Create Baseline Models

In [68]:
# Evaluation Model
def evaluate_model(model, X_test, y_test, model_name):
    preds = model.predict(X_test)
    print(f'Model: {model_name}' )
    print('Model score:', round(model.score(X_test, y_test),2))
    print('----------------------------------------------------------')
    print('Classification Report:\n')
    print(metrics.classification_report(y_test, preds))

In [71]:
#Baseline result
y.value_counts()

0    79945
1    67950
Name: is force required?, dtype: int64

Our target to beat is 45.9% Accuracy

# logistic regression

In [55]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [69]:
evaluate_model(lr, X_test, y_test, 'LogisticRegression')

Model: LogisticRegression
Model score: 0.54
----------------------------------------------------------
Classification Report:

              precision    recall  f1-score   support

           0       0.54      1.00      0.70     15998
           1       0.00      0.00      0.00     13581

    accuracy                           0.54     29579
   macro avg       0.27      0.50      0.35     29579
weighted avg       0.29      0.54      0.38     29579



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Knn

In [None]:
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv = 5)
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_
print(knn_best)