# Step 15: Create Model Files for Redeployment

In this step of the project we will train the model on the whole dataset and create the pickle files that will be deployed on the API.

In [5]:
import aux_functions
from app_functions import attempt_predict
from transformers import TimeTransformer2, BoolTransformer, lat_lon_imputer, Group_Age_Range, Group_Ethnicity

import json
import joblib
import pickle
import requests
from time import sleep
import random

import pandas as pd
pd.set_option('display.max_columns', 100)
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import numpy as np
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
# from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, RobustScaler, OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score,  accuracy_score, roc_auc_score, make_scorer, confusion_matrix, roc_curve, precision_recall_curve

# needed to use matplotlib inside jupyter notebook
%matplotlib inline 

# Get the data

In [11]:
df_ = pd.read_csv("data/train.csv")

drop_cols = ['Self-defined ethnicity', 'Outcome', 'Outcome linked to object of search', 'Removal of more than just outer clothing']

# statios that have low number of outcomes
drop_stations = ['humberside', 'lancashire', 'metropolitan', 'west-midlands', 'leicestershire']

df_clean = aux_functions.clean_data(df_, drop_cols, drop_stations)
df_clean["is_new"] = False

df_new = pd.read_csv("data/requests1_final.csv")

df_new_clean = aux_functions.clean_new_data(df_new)
df_new_clean["is_new"] = True

df_combined = pd.concat([df_clean,df_new_clean], axis=0)

df_combined["year"] = pd.to_datetime(df_combined["Date"], infer_datetime_format=True, dayfirst=False).dt.year

df_final = df_combined[df_combined["year"] >= 2021]

display(df_final["year"].value_counts())

df_train, df_test = train_test_split(df_final.drop(columns=["year", "is_new"]), test_size=0.3, random_state=42)

2021    185059
2022      4000
Name: year, dtype: int64

# Pipeline Definition

In [12]:
def create_pipeline(df, model):

    y = df["target"].copy()
    X = df.drop(columns=["target"]).copy()


    time_cols = ['Date']

    categorical_cols = ['Type', 'Object of search']
    
    bool_cols = ['Part of a policing operation']

    numerical_cols = ['Latitude', 'Longitude', 'station'] # station is needed for NaN imputing

    # Define timeseries pipeline
    time_pipe = Pipeline([
            ('time_transformer', TimeTransformer2()),
            ('scaler', MinMaxScaler())
        ])
    
    
    # Define categorical pipeline
    cat_pipe = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', TargetEncoder(handle_unknown='value', min_samples_leaf=20 , smoothing=10))
        ])
    
    # Define boolean pipeline
    bool_pipe = Pipeline([
            ('bool_transformer', BoolTransformer())
        ])

    # Define numerical pipeline
    numeric_pipe = Pipeline([
            ('imputer', lat_lon_imputer()),
            ('scaler', MinMaxScaler()),
            ('extra_imputer', SimpleImputer(strategy='mean')),    # in case station was not present in training set
        ])

    
    # Combine categorical and numerical pipelines
    preprocessor = ColumnTransformer([
        ('time_transformer', time_pipe, time_cols),
        ('cat', cat_pipe, categorical_cols),
        ('bool', bool_pipe, bool_cols),
        ('num', numeric_pipe, numerical_cols)
        ],
    remainder='drop')


    # Fit a pipeline with transformers and an estimator to the training data
    pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X, y)

    pipeline.fit(X_resampled, np.ravel(y_resampled))
    # pipeline.fit(X, np.ravel(y))


    return pipeline, X_resampled, y_resampled
    # return pipeline, X, y


def see_cross_val(pipeline, X, y):

    prec_scorer = make_scorer(precision_score)

    prec_scores = cross_val_score(pipeline, X, np.ravel(y), cv=5, scoring=prec_scorer)
    print(prec_scores)

# Train the Model with all the Data

In [13]:
# Check transformed data

model = GaussianNB()

pipeline, X, y = create_pipeline(df_train, model)

columns = ['quarter', 'hour', 'day of the week', 'Type', 'Object of search', 'Part of a policing operation', 'Latitude', 'Longitude']

# columns = ['quarter', 'year', 'hour', 'day of the week', 'Type', 'Object of search', 'Part of a policing operation']

pd.DataFrame(data=pipeline.named_steps["preprocessor"].transform(X), columns=columns)


Unnamed: 0,quarter,hour,day of the week,Type,Object of search,Part of a policing operation,Latitude,Longitude
0,1.0,0.782609,0.666667,0.488212,0.552382,0.0,0.515787,0.519093
1,1.0,0.26087,0.666667,0.488212,0.552382,0.0,0.490178,0.51933
2,0.666667,0.73913,1.0,0.488212,0.506325,0.0,0.538316,0.664155
3,0.666667,0.347826,0.833333,0.488212,0.304678,0.0,0.223698,0.81245
4,0.333333,0.956522,0.666667,0.488212,0.552382,0.0,0.534688,0.653133
...,...,...,...,...,...,...,...,...
48155,0.0,1.0,0.0,0.488212,0.552382,0.0,0.484817,0.51823
48156,1.0,0.826087,0.5,0.537253,0.552382,1.0,0.433096,0.671368
48157,0.666667,0.652174,0.0,0.488212,0.552382,0.0,0.491603,0.51609
48158,0.666667,0.0,0.833333,0.488212,0.552382,0.0,0.271676,0.593182


observation_id                   cd15041d-1ac2-44e5-994e-10e2a60ac574
Type                                                    Person search
Date                                        2021-10-15T18:43:21+00:00
Part of a policing operation                                     <NA>
Latitude                                                    53.632482
Longitude                                                   -2.969119
Gender                                                           Male
Age range                                                       10-17
Officer-defined ethnicity                                       White
Legislation                     Misuse of Drugs Act 1971 (section 23)
Object of search                                     Controlled drugs
station                                                    merseyside
Name: 0, dtype: object

# Extract Model Files

In [14]:
with open("columns.json", 'w') as fh:
    json.dump(X.columns.tolist(), fh)


with open("dtypes.pickle", 'wb') as fh:
    pickle.dump(X.dtypes, fh)

joblib.dump(pipeline, "pipeline.pickle") 

['pipeline.pickle']

# Test the App

In [27]:
with open('data/trial_moment_1.json') as moment_1:
  parsed_json = json.load(moment_1)

print(parsed_json[1]["data"])

for n in range(len(parsed_json)):
  print(parsed_json[n]["data"])

{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'Type': 'Person search', 'Date': '2022-04-18T22:24:57+00:00', 'Part of a policing operation': nan, 'Latitude': nan, 'Longitude': nan, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'Other', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Articles for use in criminal damage', 'station': 'city-of-london'}
{'observation_id': '1558a55e-3df2-4665-8beb-0f0c5eaa0408', 'Type': 'Person search', 'Date': '2022-04-06T18:25:00+00:00', 'Part of a policing operation': True, 'Latitude': nan, 'Longitude': nan, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'nottinghamshire'}
{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'Type': 'Person search', 'Date': '2022-04-18T22:24:57+00:00', 'Part of a policing operation': nan, 'Latitude': nan, 

In [28]:
APP_NAME = 'ldsacapstone-production.up.railway.app'

In [29]:
# testing the predict endpoing

url = f"https://{APP_NAME}/should_search/"

for n in range(len(parsed_json)):

    payload = parsed_json[n]["data"]

    r = requests.post(url, data=json.dumps(payload), headers={"Content-Type": "application/json"})
    display(r)
    display(r.content)




<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "True"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "True"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "True"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "True"\n}\n'

<Response [200]>

b'{\n  "outcome": "True"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

<Response [200]>

b'{\n  "outcome": "True"\n}\n'

<Response [200]>

b'{\n  "outcome": "True"\n}\n'

<Response [200]>

b'{\n  "outcome": "False"\n}\n'

In [30]:
with open('data/trial_moment_2.json') as moment_1:
  parsed_json = json.load(moment_1)

print(parsed_json[1]["data"])

for n in range(len(parsed_json)):
  print(parsed_json[n]["data"])

{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'outcome': True}
{'observation_id': '1558a55e-3df2-4665-8beb-0f0c5eaa0408', 'outcome': False}
{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'outcome': True}
{'observation_id': '898d6606-c55b-4a54-9480-f967beaff1cf', 'outcome': False}
{'observation_id': '3ff08b3c-c1fc-4c9f-97fe-470cf3a61cef', 'outcome': False}
{'observation_id': '73d7c589-7605-42ab-9c5c-d0fbb897adb0', 'outcome': False}
{'observation_id': '1d276941-2dd7-4cc0-b14d-764ee8bcd966', 'outcome': False}
{'observation_id': 'fd0a32a8-1672-45da-bf93-b93524441b24', 'outcome': True}
{'observation_id': '15af9195-3fa0-40aa-9083-da7767f0c801', 'outcome': True}
{'observation_id': '9ac66922-d13b-4877-8bab-41ef925d3bb8', 'outcome': True}
{'observation_id': '27dc412f-64e2-4245-9a25-db0514b2e980', 'outcome': True}
{'observation_id': 'aeed813b-d4e9-489d-81b3-9b8ec8f62d52', 'outcome': False}
{'observation_id': 'caabe84e-97e0-4305-b540-eda6d56e51f3', 'outcome': True}
{'obse

In [31]:
# Testing the /update endpoint

url = f"https://{APP_NAME}/search_result/"
for n in range(len(parsed_json)):

    payload = parsed_json[n]['data']

    r = requests.post(url, data=json.dumps(payload), headers={"Content-Type": "application/json"})
    display(r)
    display(r.content)


<Response [200]>

b'{\n  "observation_id": "1558a55e-3df2-4665-8beb-0f0c5eaa0408", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "eeb891e3-3913-4590-82a9-dc23c212dceb", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "898d6606-c55b-4a54-9480-f967beaff1cf", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "3ff08b3c-c1fc-4c9f-97fe-470cf3a61cef", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "73d7c589-7605-42ab-9c5c-d0fbb897adb0", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "1d276941-2dd7-4cc0-b14d-764ee8bcd966", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "fd0a32a8-1672-45da-bf93-b93524441b24", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "15af9195-3fa0-40aa-9083-da7767f0c801", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "9ac66922-d13b-4877-8bab-41ef925d3bb8", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "27dc412f-64e2-4245-9a25-db0514b2e980", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "aeed813b-d4e9-489d-81b3-9b8ec8f62d52", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "caabe84e-97e0-4305-b540-eda6d56e51f3", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "29d64e93-a47d-45ba-b02f-258f0c459091", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "1037c940-3a60-47eb-bef4-2596218725f0", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "923a1db5-1a27-4eb0-92ea-819c7ffdf85e", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "9249984b-5b19-4730-95d2-201197fff581", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "e477ebd7-a324-4d63-9def-51b5f6a81a54", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "d46707aa-cf72-4fb4-aa64-fe6eff29e8f7", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "06e5e365-15a8-465a-8092-eef6762a78d7", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "89f0b741-dd82-4600-8b98-4c3230a0540d", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

# Check data in the database

In [32]:
#server is down, so no worries (this credentials will fail, use yours)
username = 'postgres'
password = 'e4a0hDUQGnH26EDm82PK'
host_name = 'containers-us-west-118.railway.app'
port = 7357
db_name = 'railway'

conn_str = 'postgresql://{}:{}@{}:{}/{}'.format(username, password, host_name, port, db_name)
engine = sqlalchemy.create_engine(conn_str)
query = 'SELECT * FROM prediction;'
data_server = pd.DataFrame(engine.connect().execute(sqlalchemy.text(query)))
data_server

NameError: name 'sqlalchemy' is not defined

In [3]:
data_server.to_csv('data/requests1_final.csv')

In [6]:
data_server[["outcome","true_outcome"]].value_counts(dropna=False)

outcome  true_outcome
True     False           1941
False    False           1185
True     True             638
False    True             236
dtype: int64

In [4]:
data_server["true_outcome"].value_counts(dropna=False)

False    3126
True      874
Name: true_outcome, dtype: int64

In [22]:
df_

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,Legislation,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing,station
0,2e4d0094-c30b-471b-a211-72a9790feca2,Person search,2020-12-01T01:10:00+00:00,,50.798824,-1.089471,Male,25-34,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,Community resolution,False,False,hampshire
1,4779fbe8-6e05-4534-85fd-db32952ee309,Person search,2020-12-01T02:00:00+00:00,,50.785099,-1.091540,Male,over 34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,False,hampshire
2,cb5c685d-acac-42e2-914d-75e6ff73b0a8,Person search,2020-12-01T09:15:00+00:00,,50.952006,-1.403341,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,False,True,hampshire
3,f486e116-5b1e-45db-9931-a7f070c5c478,Person search,2020-12-01T10:20:00+00:00,,50.806383,-1.079844,Male,10-17,Other ethnic group - Not stated,White,Police and Criminal Evidence Act 1984 (section 1),Stolen goods,A no further action disposal,False,False,hampshire
4,78f4020e-12cc-4889-bf1a-2f2c29b2f662,Person search,2020-12-01T10:24:00+00:00,,50.806670,-1.081982,Male,10-17,Other ethnic group - Not stated,Asian,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,A no further action disposal,False,False,hampshire
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856605,ee337b9a-12ad-45fd-8c60-49091a0f4ab8,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,18-24,White - Any other White background,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,False,northumbria
856606,5973a004-e579-4dd2-bc26-71ab5717f87a,Person and Vehicle search,2020-04-30T15:10:00+00:00,,54.965502,-1.604609,Male,25-34,White - Any other White background,Other,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,True,True,northumbria
856607,ad053a34-364e-4d24-8f5c-9734ab5fdbe0,Person and Vehicle search,2020-04-30T17:00:00+00:00,,54.966266,-1.453704,Male,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Khat or Cannabis warning,True,False,northumbria
856608,8736e5ec-7ca2-420b-ad56-4dd88d27fe6e,Person search,2020-04-30T17:35:00+00:00,,54.971596,-1.636589,Male,25-34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,Arrest,True,False,northumbria


In [27]:
df_.loc[(df_['Outcome linked to object of search'] == False) & (df_['Object of search'] == 'Controlled drugs'), ['Outcome linked to object of search', 'Legislation', 'Outcome']].value_counts()

Outcome linked to object of search  Legislation                                              Outcome                        
False                               Misuse of Drugs Act 1971 (section 23)                    A no further action disposal       63508
                                                                                             Arrest                              7928
                                                                                             Community resolution                2852
                                                                                             Summons / charged by post           1623
                                                                                             Penalty Notice for Disorder          297
                                                                                             Caution (simple or conditional)      290
                                    Police and Criminal Evidence Act 19

In [7]:
a = ['a' , 'b']
b = ['c', 'd']

z = [(x,y) for x in a for y in b]

z[0][1]

'c'