# Step 2: Baseline Model

In [23]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, make_scorer, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler, OrdinalEncoder, FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

pd.set_option('display.max_columns', 100)

# needed to use matplotlib inside jupyter notebook
%matplotlib inline 

# Data Cleaning

In [2]:
# Read the raw training data from disk, then preview its first ten rows.
df_ = pd.read_csv(filepath_or_buffer="data/train.csv")
df_.head(n=10)

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,Legislation,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station
0,2e4d0094-c30b-471b-a211-72a9790feca2,Person search,2020-12-01T01:10:00+00:00,,50.798824,-1.089471,Male,25-34,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,Community resolution,False,False,hampshire
1,4779fbe8-6e05-4534-85fd-db32952ee309,Person search,2020-12-01T02:00:00+00:00,,50.785099,-1.09154,Male,over 34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire
2,cb5c685d-acac-42e2-914d-75e6ff73b0a8,Person search,2020-12-01T09:15:00+00:00,,50.952006,-1.403341,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,True,hampshire
3,f486e116-5b1e-45db-9931-a7f070c5c478,Person search,2020-12-01T10:20:00+00:00,,50.806383,-1.079844,Male,10-17,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,A no further action disposal,False,False,hampshire
4,78f4020e-12cc-4889-bf1a-2f2c29b2f662,Person search,2020-12-01T10:24:00+00:00,,50.80667,-1.081982,Male,10-17,Other ethnic group - Not stated,Asian,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,A no further action disposal,False,False,hampshire
5,769ac39b-cfe7-4587-855f-b574cdd46933,Person search,2020-12-01T10:30:00+00:00,,50.808359,-1.079465,Male,18-24,Mixed/Multiple ethnic groups - White and Black...,Black,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,Arrest,False,False,hampshire
6,2f3c1b3b-b41a-4b12-b1ab-3c34f93cd20a,Person search,2020-12-01T10:50:00+00:00,,50.914047,-1.401123,Male,over 34,Black/African/Caribbean/Black British - Any ot...,Black,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,True,hampshire
7,b3f957b8-b942-4789-b47c-aa1a6dfcabe7,Person search,2020-12-01T11:00:00+00:00,,50.915304,-1.398959,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire
8,79807773-d027-4cdf-ad41-531b3bb2cbab,Person search,2020-12-01T11:10:00+00:00,,50.830251,-1.077629,Male,18-24,Black/African/Caribbean/Black British - Caribbean,Black,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Community resolution,False,False,hampshire
9,fa5f250a-08f0-411d-85ff-eeaab9659cb5,Person search,2020-12-01T11:10:00+00:00,,50.830251,-1.077629,Male,10-17,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire


In [16]:
# Build the cleaned frame from the raw data `df_`.
# The previous version filtered an undefined `df_cat` frame (leftover kernel
# state), which fails on a fresh kernel and threw away the column drop below.

# dropping observation_id column and metropolitan entries

df_clean = df_.drop(columns='observation_id')

df_clean = df_clean.loc[df_clean["station"] != 'metropolitan', :].copy()

# convert date to datetime
# (`infer_datetime_format` is deprecated in pandas 2.x, so it is not passed)

df_clean["Date"] = pd.to_datetime(df_clean["Date"], dayfirst=False)

# deal with unordered categorical columns

cat_columns = ['Type', 'Gender', 'Self-defined ethnicity', 'Officer-defined ethnicity', 'Legislation',
       'Object of search', 'Outcome', 'station']

for col in cat_columns:
    df_clean[col] = df_clean[col].astype('category').cat.as_unordered()

# deal with ordered categorical columns

cat_columns = ['Age range']

# reorder_categories raises if the labels present differ from this list,
# so an unexpected age band is reported instead of silently becoming NaN.
df_clean['Age range'] = (
    df_clean['Age range']
    .astype('category')
    .cat.reorder_categories(['under 10', '10-17', '18-24', '25-34', 'over 34'], ordered=True)
)

# deal with boolean columns (nullable 'boolean' dtype keeps missing values as <NA>)

bool_columns = ['Part of a policing operation','Outcome linked to object of search','Removal of more than just outer clothing']

for col in bool_columns:
    df_clean[col] = df_clean[col].astype('boolean')


# generate target columns
# Note on target: 'A no further action disposal' -> 0 ; not 'A no further action disposal' and 'Outcome linked to object of search' -> 1
# A missing 'Outcome linked to object of search' is treated explicitly as
# "not linked", so the mask never contains <NA>.
outcome_is_action = df_clean["Outcome"] != 'A no further action disposal'
outcome_is_linked = df_clean['Outcome linked to object of search'].fillna(False).astype(bool)
df_clean['target'] = (outcome_is_action & outcome_is_linked).astype('int64')


df_clean

Unnamed: 0,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,Legislation,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station,target
0,Person search,2020-12-01 01:10:00+00:00,,50.798824,-1.089471,Male,25-34,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,Community resolution,False,False,hampshire,0
1,Person search,2020-12-01 02:00:00+00:00,,50.785099,-1.091540,Male,over 34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire,0
2,Person search,2020-12-01 09:15:00+00:00,,50.952006,-1.403341,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,True,hampshire,0
3,Person search,2020-12-01 10:20:00+00:00,,50.806383,-1.079844,Male,10-17,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,A no further action disposal,False,False,hampshire,0
4,Person search,2020-12-01 10:24:00+00:00,,50.806670,-1.081982,Male,10-17,Other ethnic group - Not stated,Asian,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,A no further action disposal,False,False,hampshire,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856605,Person and Vehicle search,2020-04-30 15:10:00+00:00,,54.965502,-1.604609,Male,18-24,White - Any other White background,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,False,northumbria,0
856606,Person and Vehicle search,2020-04-30 15:10:00+00:00,,54.965502,-1.604609,Male,25-34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,True,northumbria,0
856607,Person and Vehicle search,2020-04-30 17:00:00+00:00,,54.966266,-1.453704,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Khat or Cannabis warning,True,False,northumbria,1
856608,Person search,2020-04-30 17:35:00+00:00,,54.971596,-1.636589,Male,25-34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Arrest,True,False,northumbria,1


# Train-test Split

## Drop Ex-Post columns

In [26]:
# Columns removed before modelling. 'Outcome' and 'Outcome linked to object of
# search' were used to build `target`, so keeping them would leak the label.
drop_cols = ['Self-defined ethnicity', 'Outcome', 'Outcome linked to object of search']

# errors='ignore' makes this cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df_clean = df_clean.drop(columns=drop_cols, errors='ignore')

In [27]:
# Hold out 30% of the cleaned rows as a test set; the fixed seed keeps the
# split reproducible across runs.
df_train, df_test = train_test_split(
    df_clean,
    random_state=42,
    test_size=0.3,
)

In [28]:
# Display the training split produced by train_test_split.
df_train

Unnamed: 0,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Officer-defined ethnicity,Legislation,Object of search,Removal of more than just outer clothing,station,target
699639,Person and Vehicle search,2020-05-10 14:15:00+00:00,False,51.620921,-0.391129,Male,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,False,hertfordshire,0
367343,Person search,2020-11-09 19:45:00+00:00,,52.764399,-0.880878,Male,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,False,leicestershire,0
735521,Person search,2021-09-21 04:38:00+00:00,False,,,Female,10-17,White,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,,south-yorkshire,0
462496,Person search,2021-07-07 14:52:43+00:00,,53.400170,-2.975487,Male,10-17,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,False,merseyside,0
596120,Person search,2021-05-03 21:00:00+00:00,False,,,Male,over 34,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,,south-yorkshire,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515313,Person search,2021-10-31 22:15:00+00:00,False,51.015584,-0.148957,Male,10-17,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,False,sussex,0
738818,Person and Vehicle search,2020-02-22 20:01:00+00:00,False,51.796203,1.150793,Female,25-34,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,False,essex,0
286379,Person search,2020-06-14 22:18:00+00:00,False,,,Male,over 34,White,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,,south-yorkshire,0
301314,Person search,2021-12-12 02:01:00+00:00,,53.234997,-1.423111,Male,25-34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,False,derbyshire,0


# Pipeline Definition

In [29]:
# Show each column's dtype in the cleaned frame.
df_clean.dtypes

Type                                                   category
Date                                        datetime64[ns, UTC]
Part of a policing operation                            boolean
Latitude                                                float64
Longitude                                               float64
Gender                                                 category
Age range                                              category
Officer-defined ethnicity                              category
Legislation                                            category
Object of search                                       category
Removal of more than just outer clothing                boolean
station                                                category
target                                                    int64
dtype: object

In [None]:
def _to_plain_dtypes(X):
    """Return a copy of ``X`` with pandas extension dtypes cast to NumPy dtypes.

    Categorical columns become ``object`` and boolean columns (including the
    nullable ``boolean`` dtype) become ``float64``, so every missing value is a
    plain ``np.nan`` that ``SimpleImputer`` recognises.
    """
    X = X.copy()
    for col in X.columns:
        dtype = X[col].dtype
        # Check categoricals first: a categorical with boolean categories
        # would otherwise be picked up by the boolean branch.
        if isinstance(dtype, pd.CategoricalDtype):
            X[col] = X[col].astype(object)
        elif pd.api.types.is_bool_dtype(dtype):
            X[col] = X[col].astype('float64')
    return X


def _oversample_minority(X, y, random_state=0):
    """Randomly duplicate rows of the smaller classes until all classes are balanced.

    Every original row is kept; each class with fewer rows than the largest
    class gets extra rows drawn with replacement from that same class.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, indexed positionally in step with ``y``.
    y : array-like of shape (n_samples,)
        Class labels.
    random_state : int
        Seed passed to ``sklearn.utils.resample``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The resampled features and labels.
    """
    y_arr = np.ravel(y)
    labels, counts = np.unique(y_arr, return_counts=True)
    target_size = counts.max()

    positions = []
    for label, count in zip(labels, counts):
        label_pos = np.flatnonzero(y_arr == label)
        if count < target_size:
            extra = resample(label_pos, replace=True,
                             n_samples=int(target_size - count),
                             random_state=random_state)
            label_pos = np.concatenate([label_pos, extra])
        positions.append(label_pos)

    positions = np.concatenate(positions)
    return X.iloc[positions], y_arr[positions]


def create_pipeline(df, model):
    """Build and fit a preprocessing + model pipeline on a cleaned frame.

    The previous version referenced columns from a different dataset and
    several names that were never imported or defined (``category_encoders``,
    ``RandomOverSampler``, ``y_bin``, ``lb``), so it could not run. This
    version uses the columns of the cleaned stop-and-search frame and only
    libraries the notebook already imports.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``target`` column plus the feature columns listed
        below. Columns not listed (e.g. ``Date``) are dropped by the
        ``ColumnTransformer`` (``remainder='drop'``).
    model : estimator
        Unfitted scikit-learn compatible classifier used as the final step.

    Returns
    -------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline fitted on a class-balanced (randomly oversampled) copy of the data.
    X : pandas.DataFrame
        The original, un-resampled features (``df`` without ``target``).
    y : pandas.Series
        The original, un-resampled target column.
    lb : sklearn.preprocessing.LabelEncoder
        Encoder fitted on ``y``; maps the target labels to integer codes.
    """
    y = df["target"].copy()
    X = df.drop(columns=["target"]).copy()

    # Encode Target data
    lb = LabelEncoder()
    y_bin = lb.fit_transform(y)

    categorical_cols = ['Type', 'Gender', 'Age range', 'Officer-defined ethnicity',
       'Legislation', 'Object of search', 'station']
    boolean_cols = ['Part of a policing operation',
       'Removal of more than just outer clothing']
    numerical_cols = ['Latitude', 'Longitude']

    # Define categorical pipeline: missing values become their own level and
    # levels unseen during fit are ignored instead of raising at predict time.
    cat_pipe = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

    # Define boolean pipeline (columns arrive here as 0.0 / 1.0 / NaN)
    bool_pipe = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ])

    # Define numerical pipeline
    numeric_pipe = Pipeline([
       ('imputer', SimpleImputer()),
       ('scaler', StandardScaler())
    ])

    # Combine categorical, boolean and numerical pipelines
    preprocessor = ColumnTransformer([('cat', cat_pipe, categorical_cols),
                                      ('bool', bool_pipe, boolean_cols),
                                      ('num', numeric_pipe, numerical_cols)],
                                       remainder='drop')

    # The dtype cast lives inside the pipeline so the fitted pipeline accepts
    # raw cleaned frames (category / nullable boolean dtypes) at predict time.
    pipeline = Pipeline([
            ('to_plain_dtypes', FunctionTransformer(_to_plain_dtypes)),
            ('preprocessor', preprocessor),
            ('model', model)
        ])

    # Balance the classes for the fit only; the returned X and y stay untouched.
    X_resampled, y_resampled = _oversample_minority(X, y_bin, random_state=0)

    pipeline.fit(X_resampled, np.ravel(y_resampled))

    return pipeline, X, y, lb


def see_cross_val(pipeline, X, y_bin):
    """Print the 5-fold cross-validated F1 scores of ``pipeline``.

    Parameters
    ----------
    pipeline : estimator
        Estimator or pipeline handed to ``cross_val_score``.
    X : array-like or DataFrame
        Feature data.
    y_bin : array-like
        Target labels; flattened to 1-D before scoring.

    Note: the metric is F1 (``make_scorer(f1_score)``), not ROC AUC.
    Nothing is returned; the per-fold scores are only printed.
    """
    f1_scorer = make_scorer(f1_score)
    labels = np.ravel(y_bin)

    fold_scores = cross_val_score(pipeline, X, labels, scoring=f1_scorer, cv=5)
    print(fold_scores)