In [1]:
import json
import joblib
import pickle
import pandas as pd
import numpy as np
import seaborn as sns

import requests

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import KFold


from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import log_loss


from sklearn.metrics import roc_curve, auc

from category_encoders import LeaveOneOutEncoder

import xgboost

# from sklearn.utils.extmath import cartesian

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

%matplotlib inline

In [2]:
# Training observations as exported to CSV.
df_original = pd.read_csv("../capstone_train/train.csv")

# Cluster assignments; the first CSV column is used as the row index.
df_cluster = pd.read_csv(
    "../capstone_train/clusters/df_clusters_for_model.csv",
    index_col=0,
)

# Attach the 'cluster' column to the training frame (join aligns on the index).
df_ori_cluster = df_original.join(df_cluster.loc[:, 'cluster'])

In [4]:
# Outcomes that count as a "positive" search result when building the target.
_POSITIVE_OUTCOMES = (
    'Local resolution',
    'Community resolution',
    'Offender given drugs possession warning',
    'Khat or Cannabis warning',
    'Caution (simple or conditional)',
    'Offender given penalty notice',
    'Arrest',
    'Penalty Notice for Disorder',
    'Suspected psychoactive substances seized - No further action',
    'Summons / charged by post',
    'Article found - Detailed outcome unavailable',
    'Offender cautioned',
    'Suspect arrested',
    'Suspect summonsed to court',
)

# Columns converted to the pandas 'category' dtype at the end of pre_process.
_CATEGORICAL_COLUMNS = (
    'cluster',
    'station',
    'Type',
    'Object of search',
    'Legislation',
    'Gender',
    'Officer-defined ethnicity',
    'Age range',
    'day_of_week',
)


def _drop_small_groups(df, group_cols, min_size):
    """Return a copy of df without rows whose group_cols combination has fewer than min_size rows."""
    group_cols = list(group_cols)
    # Rows with a missing key never match an equality filter, so they are
    # never counted as part of a small group and are always kept.
    complete = df[group_cols].notna().all(axis=1).to_numpy()
    too_small = np.zeros(len(df), dtype=bool)
    if complete.any():
        sizes = (
            df.loc[complete, group_cols]
            .groupby(group_cols, sort=False, observed=True)[group_cols[0]]
            .transform('size')
        )
        # transform keeps the input row order, so positions line up with `complete`.
        too_small[complete] = (sizes < min_size).to_numpy()
    return df.loc[~too_small].copy()


def pre_process(df, min_group_size=30):
    """Clean the raw observations and derive time features and the binary target.

    Steps, in order:
      1. drop rows whose 'station' is 'metropolitan';
      2. drop every station / Gender / Officer-defined ethnicity combination
         with fewer than ``min_group_size`` rows (rows with a missing key are kept);
      3. parse 'Date' and add 'hour', 'month' and 'day_of_week';
      4. fill missing flags (see inline comments);
      5. cast the categorical columns to 'category';
      6. add 'target' = 1 when the outcome is one of the positive outcomes AND
         'Outcome linked to object of search' is True, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns referenced above, plus 'cluster'.
        It is not modified.
    min_group_size : int, default 30
        Minimum number of rows a station/gender/ethnicity group needs to be kept.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with the added 'hour', 'month', 'day_of_week'
        and 'target' columns.
    """
    removal_col = 'Removal of more than just outer clothing'
    operation_col = 'Part of a policing operation'
    linked_col = 'Outcome linked to object of search'

    # remove data from Metropolitan station (boolean indexing returns a new frame)
    df = df[df['station'].ne('metropolitan')]

    # remove station/gender/ethnicity groups that are too small; returns a copy,
    # so the assignments below never touch the caller's frame
    df = _drop_small_groups(
        df, ['station', 'Gender', 'Officer-defined ethnicity'], min_group_size
    )

    # ex: '2019-12-01T00:36:39.650000+' -> keep only 'YYYY-MM-DDTHH:MM:SS'
    df['Date'] = pd.to_datetime(df['Date'].str[:19], format='%Y-%m-%dT%H:%M:%S')
    df['hour'] = df['Date'].dt.hour
    df['month'] = df['Date'].dt.month
    df['day_of_week'] = df['Date'].dt.day_name()

    # fill NaN's:
    # 1) removal flag is blanked for 'Vehicle search' rows and defaults to
    #    False for person searches where it is missing
    # 2) 'Part of a policing operation' defaults to False
    # np.nan (not np.NaN): the capitalised alias was removed in NumPy 2.0.
    df.loc[df['Type'].isin(['Vehicle search']), removal_col] = np.nan
    is_person_search = df['Type'].isin(['Person and Vehicle search', 'Person search'])
    df.loc[df[removal_col].isna() & is_person_search, removal_col] = False
    df.loc[df[operation_col].isna(), operation_col] = False

    # column types
    df[operation_col] = df[operation_col].astype('bool')
    for column in _CATEGORICAL_COLUMNS:
        df[column] = df[column].astype('category')

    # build target: positive outcome + outcome linked to object of search.
    # Vectorized; missing values in either column yield 0.
    is_hit = df['Outcome'].isin(_POSITIVE_OUTCOMES) & df[linked_col].eq(True)
    df['target'] = is_hit.astype('int64')

    return df
# Run the cleaning step on the cluster-annotated frame and preview the result.
df_clean = pre_process(df=df_ori_cluster)
df_clean.head()

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,...,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station,cluster,hour,month,day_of_week,target
0,34d76816-cfc2-4bdd-b3a2-bf0c40b12689,Person search,2019-12-01 00:00:00,True,,,Male,18-24,Asian/Asian British - Any other Asian background,Asian,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
1,83c87ec1-e6d4-4bbb-8e0c-159516706359,Person search,2019-12-01 00:09:00,True,,,Male,18-24,,White,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
2,8c2eae4b-035a-492f-92d6-cd27e6c900a1,Person search,2019-12-01 00:10:00,True,,,Female,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
3,e63a036a-a728-4efe-8e08-7198f56d6d0a,Person search,2019-12-01 00:10:00,False,,,Male,18-24,,Asian,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
4,40c70aca-1f7f-4efe-8c5b-ade0f2e9937d,Person search,2019-12-01 00:12:00,True,50.368247,-4.126646,Male,18-24,,White,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,devon-and-cornwall1,0,12,Sunday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660606,a23dc59d-29fe-48e5-b012-81feb6acfddf,Person search,2018-08-29 02:45:00,False,,,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,wiltshire,,2,8,Wednesday,0
660607,425fde18-f7d5-4ebe-baf4-714cadd5a445,Person and Vehicle search,2018-08-29 21:00:00,False,51.540219,-1.764708,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,Penalty Notice for Disorder,True,False,wiltshire,wiltshire8,21,8,Wednesday,1
660608,7c249ab7-d4bf-47ef-86bd-aad51d7d2aab,Person search,2018-08-29 21:10:00,False,51.540219,-1.764708,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,wiltshire,wiltshire8,21,8,Wednesday,0
660609,4fa54513-20c4-4a53-a5db-8a4bb85b1e32,Person search,2018-08-29 21:15:00,False,51.540219,-1.764708,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,wiltshire,wiltshire8,21,8,Wednesday,0


### Save pre-processed data

In [5]:
# Write the cleaned frame to CSV, keeping the index as the first column.
df_clean_path = '../capstone_train/df_clean.csv'
df_clean.to_csv(path_or_buf=df_clean_path, index=True)

# Pickle the column dtypes next to the notebook.
# NOTE(review): CSV does not store dtypes, so this pickle is presumably meant
# to restore the category/datetime dtypes after reloading — confirm downstream.
with open('dtypes_pre_process.pickle', mode='wb') as dtypes_file:
    pickle.dump(df_clean.dtypes, dtypes_file)

# Reload from disk; from here on df_clean refers to the CSV round-tripped frame.
df_clean = pd.read_csv(filepath_or_buffer=df_clean_path, index_col=0)
df_clean.head()

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,...,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station,cluster,hour,month,day_of_week,target
0,34d76816-cfc2-4bdd-b3a2-bf0c40b12689,Person search,2019-12-01 00:00:00,True,,,Male,18-24,Asian/Asian British - Any other Asian background,Asian,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
1,83c87ec1-e6d4-4bbb-8e0c-159516706359,Person search,2019-12-01 00:09:00,True,,,Male,18-24,,White,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
2,8c2eae4b-035a-492f-92d6-cd27e6c900a1,Person search,2019-12-01 00:10:00,True,,,Female,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
3,e63a036a-a728-4efe-8e08-7198f56d6d0a,Person search,2019-12-01 00:10:00,False,,,Male,18-24,,Asian,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,,0,12,Sunday,0
4,40c70aca-1f7f-4efe-8c5b-ade0f2e9937d,Person search,2019-12-01 00:12:00,True,50.368247,-4.126646,Male,18-24,,White,...,Controlled drugs,A no further action disposal,,False,devon-and-cornwall,devon-and-cornwall1,0,12,Sunday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660606,a23dc59d-29fe-48e5-b012-81feb6acfddf,Person search,2018-08-29 02:45:00,False,,,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,wiltshire,,2,8,Wednesday,0
660607,425fde18-f7d5-4ebe-baf4-714cadd5a445,Person and Vehicle search,2018-08-29 21:00:00,False,51.540219,-1.764708,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,Penalty Notice for Disorder,True,False,wiltshire,wiltshire8,21,8,Wednesday,1
660608,7c249ab7-d4bf-47ef-86bd-aad51d7d2aab,Person search,2018-08-29 21:10:00,False,51.540219,-1.764708,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,wiltshire,wiltshire8,21,8,Wednesday,0
660609,4fa54513-20c4-4a53-a5db-8a4bb85b1e32,Person search,2018-08-29 21:15:00,False,51.540219,-1.764708,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,...,Controlled drugs,A no further action disposal,,False,wiltshire,wiltshire8,21,8,Wednesday,0
