In [1]:
import aux_functions
from app_functions import attempt_predict
from transformers import TimeTransformer2, BoolTransformer, lat_lon_imputer, Group_Age_Range, Group_Ethnicity

import json
import joblib
import pickle
import requests
from time import sleep
import random
import sqlalchemy
import psycopg2

import pandas as pd
pd.set_option('display.max_columns', 100)
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import numpy as np
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
# from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, RobustScaler, OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, CategoricalNB

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score,  accuracy_score, roc_auc_score, make_scorer, confusion_matrix, roc_curve

# needed to use matplotlib inside jupyter notebook
%matplotlib inline 

# Get the data

In [2]:
# Load the raw training set from the local data directory.
df_ = pd.read_csv("data/train.csv")

# Column names handed to aux_functions.clean_data below.
# NOTE(review): presumably these columns are removed from the frame — confirm in aux_functions.
drop_cols = ['Self-defined ethnicity', 'Outcome', 'Outcome linked to object of search', 'Removal of more than just outer clothing']

# clean_data lives in the project-local aux_functions module; its exact behaviour is not visible here.
# create_pipeline reads a "target" column from df_clean, so it is presumably derived there — TODO confirm.
df_clean = aux_functions.clean_data(df_, drop_cols)

# Pipeline Definition

In [3]:
def create_pipeline(df, model, undersample=True, random_state=42):
    """Assemble the preprocessing + estimator pipeline and fit it on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``"target"`` column plus the feature columns
        referenced below (``Date``, ``Type``, ``Object of search``,
        ``Part of a policing operation``, ``Latitude``, ``Longitude``,
        ``station``). Every other column is discarded by the
        ColumnTransformer (``remainder='drop'``).
    model : estimator
        Unfitted scikit-learn compatible classifier used as the final step.
    undersample : bool, default True
        When True (the original behaviour) the data is passed through
        ``RandomUnderSampler`` before fitting. When False the pipeline is
        fitted on the data exactly as given.
    random_state : int, default 42
        Seed forwarded to ``RandomUnderSampler``; ignored when
        ``undersample`` is False.

    Returns
    -------
    tuple
        ``(pipeline, X_fit, y_fit)``: the fitted pipeline together with the
        features and target it was actually fitted on (the resampled data
        when ``undersample`` is True, otherwise the untouched split of ``df``).

    Raises
    ------
    KeyError
        If ``df`` has no ``"target"`` column.
    """
    if "target" not in df.columns:
        raise KeyError("create_pipeline expects a 'target' column in df")

    y = df["target"].copy()
    X = df.drop(columns=["target"]).copy()

    time_cols = ['Date']
    categorical_cols = ['Type', 'Object of search']
    bool_cols = ['Part of a policing operation']
    numerical_cols = ['Latitude', 'Longitude', 'station']  # station is needed for NaN imputing

    # Date features: TimeTransformer2 is project-local (behaviour not visible
    # here); its output is rescaled with MinMaxScaler.
    time_pipe = Pipeline([
        ('time_transformer', TimeTransformer2()),
        ('scaler', MinMaxScaler()),
    ])

    # Categorical features: fill gaps with the most frequent value, then
    # target-encode.
    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', TargetEncoder(handle_unknown='value', min_samples_leaf=20, smoothing=10)),
    ])

    # Boolean features: handled by the project-local BoolTransformer.
    bool_pipe = Pipeline([
        ('bool_transformer', BoolTransformer()),
    ])

    # Coordinates: project-local lat_lon_imputer runs first, a mean imputer
    # follows as a second imputation step, then values are rescaled.
    numeric_pipe = Pipeline([
        ('imputer', lat_lon_imputer()),
        ('extra_imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ])

    # Route each column group to its sub-pipeline; unlisted columns are dropped.
    preprocessor = ColumnTransformer(
        [
            ('time_transformer', time_pipe, time_cols),
            ('cat', cat_pipe, categorical_cols),
            ('bool', bool_pipe, bool_cols),
            ('num', numeric_pipe, numerical_cols),
        ],
        remainder='drop',
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # Undersampling used to be switched on and off by commenting code in and
    # out; it is now an explicit, seeded option.
    if undersample:
        rus = RandomUnderSampler(random_state=random_state)
        X_fit, y_fit = rus.fit_resample(X, y)
    else:
        X_fit, y_fit = X, y

    pipeline.fit(X_fit, np.ravel(y_fit))

    return pipeline, X_fit, y_fit


def see_cross_val(pipeline, X, y, cv=5):
    """Print the per-fold precision of ``pipeline`` under cross-validation.

    Parameters
    ----------
    pipeline : estimator
        scikit-learn compatible estimator; ``cross_val_score`` refits a clone
        of it on every fold.
    X : pandas.DataFrame
        Feature data.
    y : array-like
        Target values; flattened with ``np.ravel`` before scoring.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``
        (previously hard-coded to 5).

    Returns
    -------
    None
        The array of fold scores is printed, not returned.
    """
    prec_scorer = make_scorer(precision_score)

    prec_scores = cross_val_score(pipeline, X, np.ravel(y), cv=cv, scoring=prec_scorer)
    print(prec_scores)

# Train the Model with all the Data

In [4]:
# Fit the pipeline with a Gaussian Naive Bayes classifier and print its 5-fold cross-validated precision.
# create_pipeline returns the randomly undersampled X / y it was fitted on, so the scores
# below are computed on that resampled data rather than on the full training set.

model = GaussianNB()

pipeline, X, y = create_pipeline(df_clean, model)

see_cross_val(pipeline, X, y)

[0.52007437 0.54811129 0.51345917 0.5432847  0.53321139]


# Extract Model Files

In [8]:
# Columns of the feature frame returned by create_pipeline; this list is what gets written to columns.json.
X.columns

Index(['observation_id', 'Type', 'Date', 'Part of a policing operation',
       'Latitude', 'Longitude', 'Gender', 'Age range',
       'Officer-defined ethnicity', 'Legislation', 'Object of search',
       'station'],
      dtype='object')

In [5]:
# Persist the artefacts produced by this notebook: feature column order, column dtypes and the fitted pipeline.
# NOTE(review): presumably the deployed app loads these three files to rebuild its input frame — confirm against the app code.
with open("columns.json", 'w') as fh:
    json.dump(X.columns.tolist(), fh)


# X.dtypes is a pandas Series, so it is pickled rather than written as JSON.
with open("dtypes.pickle", 'wb') as fh:
    pickle.dump(X.dtypes, fh)

# Despite the .pickle extension this file is written by joblib, so it should be read back with joblib.load.
joblib.dump(pipeline, "pipeline.pickle") 

['pipeline.pickle']

In [11]:
# Smoke-test the fitted pipeline on one training row, selected by observation_id and
# restricted to the feature columns the pipeline was fitted with.
# NOTE(review): this indexes the raw frame df_ (not df_clean) with X.columns, which includes 'station' — confirm df_ carries every one of those columns.
pipeline.predict(df_.loc[df_['observation_id'] == 'ba138584-f313-467c-a6c5-74ef31ea5e0e', X.columns])

array([0])

# Test the App

In [3]:
# Read the first batch of trial requests from disk.
with open('data/trial_moment_1.json') as trial_file:
    parsed_json = json.load(trial_file)

# Show the second record on its own, then every record in file order.
print(parsed_json[1]["data"])

for record in parsed_json:
    print(record["data"])

{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'Type': 'Person search', 'Date': '2022-04-18T22:24:57+00:00', 'Part of a policing operation': nan, 'Latitude': nan, 'Longitude': nan, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'Other', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Articles for use in criminal damage', 'station': 'city-of-london'}
{'observation_id': '1558a55e-3df2-4665-8beb-0f0c5eaa0408', 'Type': 'Person search', 'Date': '2022-04-06T18:25:00+00:00', 'Part of a policing operation': True, 'Latitude': nan, 'Longitude': nan, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'nottinghamshire'}
{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'Type': 'Person search', 'Date': '2022-04-18T22:24:57+00:00', 'Part of a policing operation': nan, 'Latitude': nan, 

In [2]:
# Host name of the deployed app; the endpoint URLs used in the request cells are built from it.
APP_NAME = 'ldsacapstone-production.up.railway.app'

In [11]:
# Testing the predict endpoint (/should_search/): post every trial record and show the raw response.

url = f"https://{APP_NAME}/should_search/"

for record in parsed_json:

    payload = record["data"]

    # The body is serialised with json.dumps on purpose: the trial payloads contain NaN
    # values, which json.dumps emits as NaN, whereas recent versions of requests refuse
    # NaN when the `json=` shortcut is used.
    # A timeout is set so an unresponsive server cannot hang the notebook indefinitely.
    r = requests.post(
        url,
        data=json.dumps(payload),
        headers={"Content-Type": "application/json"},
        timeout=30,
    )
    display(r)
    display(r.content)




<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[False]"\n}\n'

<Response [200]>

b'{\n  "outcome": "[ True]"\n}\n'

In [14]:
# Read the second batch of trial data (observation ids with their true outcomes) from disk.
with open('data/trial_moment_2.json') as trial_file:
    parsed_json = json.load(trial_file)

# Show the second record on its own, then every record in file order.
print(parsed_json[1]["data"])

for record in parsed_json:
    print(record["data"])

{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'outcome': True}
{'observation_id': '1558a55e-3df2-4665-8beb-0f0c5eaa0408', 'outcome': False}
{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'outcome': True}
{'observation_id': '898d6606-c55b-4a54-9480-f967beaff1cf', 'outcome': False}
{'observation_id': '3ff08b3c-c1fc-4c9f-97fe-470cf3a61cef', 'outcome': False}
{'observation_id': '73d7c589-7605-42ab-9c5c-d0fbb897adb0', 'outcome': False}
{'observation_id': '1d276941-2dd7-4cc0-b14d-764ee8bcd966', 'outcome': False}
{'observation_id': 'fd0a32a8-1672-45da-bf93-b93524441b24', 'outcome': True}
{'observation_id': '15af9195-3fa0-40aa-9083-da7767f0c801', 'outcome': True}
{'observation_id': '9ac66922-d13b-4877-8bab-41ef925d3bb8', 'outcome': True}
{'observation_id': '27dc412f-64e2-4245-9a25-db0514b2e980', 'outcome': True}
{'observation_id': 'aeed813b-d4e9-489d-81b3-9b8ec8f62d52', 'outcome': False}
{'observation_id': 'caabe84e-97e0-4305-b540-eda6d56e51f3', 'outcome': True}
{'obse

In [17]:
# Testing the /search_result/ endpoint: post every true-outcome record and show the raw response.

url = f"https://{APP_NAME}/search_result/"
for record in parsed_json:

    payload = record['data']

    # A timeout is set so an unresponsive server cannot hang the notebook indefinitely.
    r = requests.post(
        url,
        data=json.dumps(payload),
        headers={"Content-Type": "application/json"},
        timeout=30,
    )
    display(r)
    display(r.content)


<Response [200]>

b'{\n  "observation_id": "1558a55e-3df2-4665-8beb-0f0c5eaa0408", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "eeb891e3-3913-4590-82a9-dc23c212dceb", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "898d6606-c55b-4a54-9480-f967beaff1cf", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "3ff08b3c-c1fc-4c9f-97fe-470cf3a61cef", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "73d7c589-7605-42ab-9c5c-d0fbb897adb0", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "1d276941-2dd7-4cc0-b14d-764ee8bcd966", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "fd0a32a8-1672-45da-bf93-b93524441b24", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "15af9195-3fa0-40aa-9083-da7767f0c801", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "9ac66922-d13b-4877-8bab-41ef925d3bb8", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "27dc412f-64e2-4245-9a25-db0514b2e980", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "aeed813b-d4e9-489d-81b3-9b8ec8f62d52", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "caabe84e-97e0-4305-b540-eda6d56e51f3", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "29d64e93-a47d-45ba-b02f-258f0c459091", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "1037c940-3a60-47eb-bef4-2596218725f0", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "923a1db5-1a27-4eb0-92ea-819c7ffdf85e", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "9249984b-5b19-4730-95d2-201197fff581", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "e477ebd7-a324-4d63-9def-51b5f6a81a54", \n  "outcome": "True", \n  "predicted_outcome": "True"\n}\n'

<Response [200]>

b'{\n  "observation_id": "d46707aa-cf72-4fb4-aa64-fe6eff29e8f7", \n  "outcome": "True", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "06e5e365-15a8-465a-8092-eef6762a78d7", \n  "outcome": "False", \n  "predicted_outcome": "False"\n}\n'

<Response [200]>

b'{\n  "observation_id": "89f0b741-dd82-4600-8b98-4c3230a0540d", \n  "outcome": "False", \n  "predicted_outcome": "True"\n}\n'

In [18]:
# Inspect the rows the deployed app stored in its `prediction` table.
#
# Connection settings are read from the environment so that no password lives in the
# notebook. DB_PASSWORD is mandatory; the other settings fall back to the values this
# notebook used before.
# NOTE(review): the password that was previously committed in this cell should be treated
# as leaked and rotated.
username = os.environ.get("DB_USERNAME", "postgres")
password = os.environ["DB_PASSWORD"]  # raises KeyError when the variable is not set
host_name = os.environ.get("DB_HOST", "containers-us-west-118.railway.app")
port = int(os.environ.get("DB_PORT", "7357"))
db_name = os.environ.get("DB_NAME", "railway")

# URL.create escapes special characters in the credentials, which plain string
# formatting of the connection string does not.
db_url = sqlalchemy.engine.URL.create(
    "postgresql",
    username=username,
    password=password,
    host=host_name,
    port=port,
    database=db_name,
)
engine = sqlalchemy.create_engine(db_url)
query = 'SELECT * FROM prediction;'

# The context manager returns the connection to the pool once the rows are fetched;
# the previous version opened a connection and never closed it.
with engine.connect() as connection:
    result = connection.execute(sqlalchemy.text(query))
    # Column names are passed explicitly so an empty table still yields a labelled frame.
    data_server = pd.DataFrame(result.fetchall(), columns=list(result.keys()))

data_server

Unnamed: 0,id,observation_id,type,date,part_of_a_policing_operation,latitude,longitude,gender,age_range,officer_defined_ethnicity,legislation,object_of_search,station,proba,outcome,true_outcome
0,42,1558a55e-3df2-4665-8beb-0f0c5eaa0408,Person search,2022-04-06T18:25:00+00:00,True,,,Male,10-17,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,nottinghamshire,0.850019,True,False
1,43,eeb891e3-3913-4590-82a9-dc23c212dceb,Person search,2022-04-18T22:24:57+00:00,False,,,Male,10-17,Other,Police and Criminal Evidence Act 1984 (section 1),Articles for use in criminal damage,city-of-london,0.108532,False,True
2,44,898d6606-c55b-4a54-9480-f967beaff1cf,Person and Vehicle search,2022-01-07T20:20:00+00:00,True,,,Male,25-34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,nottinghamshire,0.894015,True,False
3,45,3ff08b3c-c1fc-4c9f-97fe-470cf3a61cef,Person search,2022-05-08T20:21:00+00:00,True,,,Male,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,nottinghamshire,0.847,True,False
4,46,73d7c589-7605-42ab-9c5c-d0fbb897adb0,Person search,2022-02-12T01:43:00+00:00,False,54.525684,-1.556382,Male,over 34,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,durham,0.058079,False,False
5,47,1d276941-2dd7-4cc0-b14d-764ee8bcd966,Person search,2022-05-13T18:23:00+00:00,True,,,Male,25-34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,nottinghamshire,0.849693,True,False
6,48,fd0a32a8-1672-45da-bf93-b93524441b24,Person search,2022-01-26T01:48:18+00:00,False,52.558052,-0.272402,Female,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,cambridgeshire,0.47005,False,True
7,49,15af9195-3fa0-40aa-9083-da7767f0c801,Person search,2022-02-16T05:01:31+00:00,False,51.513462,-0.082913,Male,18-24,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,city-of-london,0.532643,True,True
8,50,9ac66922-d13b-4877-8bab-41ef925d3bb8,Person search,2022-03-04T23:48:16+00:00,False,52.339134,-0.163148,Male,10-17,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,cambridgeshire,0.536486,True,True
9,51,27dc412f-64e2-4245-9a25-db0514b2e980,Person search,2022-03-17T20:20:00+00:00,True,,,Male,18-24,White,Police and Criminal Evidence Act 1984 (section 1),Articles for use in criminal damage,nottinghamshire,0.353656,False,True
