In [1]:
import aux_functions
from app_functions import attempt_predict
from transformers import TimeTransformer, BoolTransformer

import json
import joblib
import pickle
import requests
from time import sleep
import random

import pandas as pd
pd.set_option('display.max_columns', 100)
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import numpy as np
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
# from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score,  accuracy_score, roc_auc_score, make_scorer, confusion_matrix, roc_curve

# needed to use matplotlib inside jupyter notebook
%matplotlib inline 

# Get the data

In [2]:
df_ = pd.read_csv("data/train.csv")

drop_cols = ['Self-defined ethnicity', 'Outcome', 'Outcome linked to object of search', 'Removal of more than just outer clothing']


df_clean = aux_functions.clean_data(df_, drop_cols)

df_train, df_test = train_test_split(df_clean, test_size=0.3, random_state=42)

df_test.head()

Unnamed: 0,observation_id,Type,Date,Part of a policing operation,Latitude,Longitude,Gender,Age range,Officer-defined ethnicity,Legislation,Object of search,station,target
438050,b7287b3c-fb25-42a5-afa3-e8320817eb6f,Person search,2021-07-30T16:00:00+00:00,False,53.797198,-1.789092,Male,25-34,Asian,Misuse of Drugs Act 1971 (section 23),Controlled drugs,west-yorkshire,0
8521,4f9c9bbb-8806-4a9f-a83d-a7d1c9957527,Person search,2020-12-20T01:26:43+00:00,,51.508963,-0.073894,Male,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,city-of-london,0
601361,1e57860f-9cb6-4afe-aabf-00b11c5012d2,Person search,2021-05-12T13:05:27+00:00,,53.397291,-3.034496,Male,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,merseyside,0
282181,8d166c18-7351-41e0-8be6-3f9fb574d8d9,Person search,2020-06-15T11:20:00+00:00,,53.041893,-2.97916,Male,over 34,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,north-wales,0
797640,3a2bdbf8-83f2-448b-bcf7-a2502632d5a8,Person search,2020-08-13T23:00:00+00:00,False,51.022386,-0.353517,Male,10-17,White,Police and Criminal Evidence Act 1984 (section 1),Offensive weapons,sussex,0


# Recover Model

In [3]:
with open(os.path.join("columns.json"), 'r') as fh:
    columns = json.load(fh)

with open(os.path.join("dtypes.pickle"), 'rb') as fh:
    dtypes = pickle.load(fh)

pipeline_recovered = joblib.load(os.path.join("pipeline.pickle"))

# Develop APP Protection Function

## Objective

* Validate that observation includes a valid id
* Validate that observation has no unknown fields
* Validate that fields have the correct data types
* Validate that categorical columns have correct values


In [4]:
df_test.columns

Index(['observation_id', 'Type', 'Date', 'Part of a policing operation',
       'Latitude', 'Longitude', 'Gender', 'Age range',
       'Officer-defined ethnicity', 'Legislation', 'Object of search',
       'station', 'target'],
      dtype='object')

# Generate Observation

In [5]:
y_test = df_test["target"].copy()
X_test = df_test.drop(columns=["target"]).copy()

In [6]:
obs_dict = X_test.iloc[0, :].to_dict()

obs_dict

{'observation_id': 'b7287b3c-fb25-42a5-afa3-e8320817eb6f',
 'Type': 'Person search',
 'Date': '2021-07-30T16:00:00+00:00',
 'Part of a policing operation': False,
 'Latitude': 53.797198,
 'Longitude': -1.789092,
 'Gender': 'Male',
 'Age range': '25-34',
 'Officer-defined ethnicity': 'Asian',
 'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
 'Object of search': 'Controlled drugs',
 'station': 'west-yorkshire'}

In [7]:
# example with extra columns

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': '2021-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Male',
    'extra column 1': 1,
    'extra column 2': 3,
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Asian',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)

{'error': "Unrecognized columns provided: {'extra column 2', 'extra column 1'}"}

In [8]:
# example with missing columns

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': '2021-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Asian',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)

{'error': "Missing columns: {'Gender', 'Longitude'}"}

In [9]:
# Example with weird Type category

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Alien Invaders',
    'Date': '2021-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Male',
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Asian',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)


{'error': "Invalid value provided for Type: Alien Invaders. Allowed values are: 'Person search','Person and Vehicle search','Vehicle search'"}

In [10]:
# Example with weird Gender category

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': '2021-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Whatever',
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Asian',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)


{'error': "Invalid value provided for Gender: Whatever. Allowed values are: 'Female','Male','Other'"}

In [11]:
# Example with weird Age range category

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': '2021-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Male',
    'Age range': '10000000000000>',
    'Officer-defined ethnicity': 'Asian',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)

{'error': "Invalid value provided for Age range: 10000000000000>. Allowed values are: 'under 10','10-17','18-24','25-34','over 34'"}

In [12]:
# Example with weird Officer-defined ethnicity category

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': '2021-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Male',
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Extra-terrestrial',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)


{'error': "Invalid value provided for Officer-defined ethnicity: Extra-terrestrial. Allowed values are: 'Asian','Black','Mixed','White','Other'"}

In [13]:
# Example with weird date

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': 'yes',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Male',
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Black',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)

{'Date': 'yes', 'error': 'Date format is incorrect'}

In [14]:
# Example with date before 2020 ethnicity category

obs_dict = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': '2019-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Male',
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Black',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}

response, check = attempt_predict(obs_dict)

if check:
    print("passed")
    display(response)
else: 
    display(response)

{'Date': '2019-07-30T16:00:00+00:00', 'error': 'Provided date is before 2020'}

# Check if App is LIVE

In [None]:
APP_NAME = 'ldsacapstone-production.up.railway.app'

# check predict module

url = f"https://{APP_NAME}/should_search"
payload = {  
        'observation_id': 'teste1-teste2-teste3-teste4-teste5',
        'Type': 'Person search',
        'Date': '2020-12-01 01:10:00+0000',
        'Part of a policing operation': False,
        'Latitude': 50.798824,
        'Longitude': -1.089471,
        'Gender': 'Male',
        'Age range': '25-34',
        'Officer-defined ethnicity': 'White',
        'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)',
        'Object of search': 'Article for use in theft',
        'station': 'hampshire'
    }

r = requests.post(url, json=payload)
display(r)
display(r.content)

# check update module

url = f"https://{APP_NAME}/search_result"
payload = {
    "observation_id": 'teste1-teste2-teste3-teste4-teste5',
    "outcome": str(False)
}

r = requests.post(url, json=payload)
display(r)
display(r.content)

# Run through test data set

In [15]:
# reshuffle available test set
r = list(range(len(X_test)))
random.shuffle(r)

for i in r:
    
    obs_dict = X_test.iloc[i, :].to_dict()

    response, check = attempt_predict(obs_dict)

    if check:
        print("passed")
        display(response)
    else: 
        display(response)
    
    sleep(0.5)

{'Part of a policing operation': <NA>,
 'error': 'Provided "Part of a policing operation" field is not of the correct data type'}

passed


{'observation_id': '83db8b1c-abd3-454d-aef9-b0400f05f8ce',
 'Type': 'Person search',
 'Date': '2020-04-08T19:58:00+00:00',
 'Part of a policing operation': False,
 'Latitude': nan,
 'Longitude': nan,
 'Gender': 'Male',
 'Age range': '18-24',
 'Officer-defined ethnicity': 'White',
 'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
 'Object of search': 'Controlled drugs',
 'station': 'south-yorkshire'}

passed


{'observation_id': '6d0fbe35-c537-40be-9db2-e44060dbe6c8',
 'Type': 'Person search',
 'Date': '2021-03-28T07:00:00+00:00',
 'Part of a policing operation': False,
 'Latitude': nan,
 'Longitude': nan,
 'Gender': 'Male',
 'Age range': '18-24',
 'Officer-defined ethnicity': 'Asian',
 'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
 'Object of search': 'Controlled drugs',
 'station': 'south-yorkshire'}

{'Part of a policing operation': <NA>,
 'error': 'Provided "Part of a policing operation" field is not of the correct data type'}

passed


{'observation_id': '0bb0a01d-8f0f-4413-bb7e-84a29d634c57',
 'Type': 'Person and Vehicle search',
 'Date': '2020-10-08T19:40:00+00:00',
 'Part of a policing operation': False,
 'Latitude': 51.706051,
 'Longitude': -0.360699,
 'Gender': 'Female',
 'Age range': '10-17',
 'Officer-defined ethnicity': 'White',
 'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
 'Object of search': 'Controlled drugs',
 'station': 'hertfordshire'}

passed


{'observation_id': 'e11fc10d-fb50-4107-a03b-9efbba9cea20',
 'Type': 'Person search',
 'Date': '2020-05-10T18:45:00+00:00',
 'Part of a policing operation': False,
 'Latitude': nan,
 'Longitude': nan,
 'Gender': 'Male',
 'Age range': '10-17',
 'Officer-defined ethnicity': 'White',
 'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
 'Object of search': 'Controlled drugs',
 'station': 'south-yorkshire'}

{'Part of a policing operation': <NA>,
 'error': 'Provided "Part of a policing operation" field is not of the correct data type'}

KeyboardInterrupt: 

In [None]:
r = list(range(len(X_test)))
test_list = random.shuffle(r)

url = f"http://{APP_NAME}/should_search"
payload = {  
        'observation_id': '2e4d0094-c30b-471b-a211-72a9790feca2',
        'Type': 'Person search',
        'Date': '2020-12-01 01:10:00+0000',
        'Part of a policing operation': False,
        'Latitude': 50.798824,
        'Longitude': -1.089471,
        'Gender': 'Male',
        'Age range': '25-34',
        'Officer-defined ethnicity': 'White',
        'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)',
        'Object of search': 'Article for use in theft',
        'station': 'hampshire'
    }

r = requests.post(url, json=payload)
display(r)
print(r.content)

In [16]:
# TEST LOCALLY
#APP_NAME = 'localhost:5000' 

# TEST REMOTE
APP_NAME = 'ldsacapstone-production.up.railway.app' 


In [20]:
url = f"https://{APP_NAME}/should_search"
payload = {'observation_id': "b7287b3c-fb25-42a5-afa3-e8320817eb6f",
    'Type': 'Person search',
    'Date': '2021-07-30T16:00:00+00:00',
    'Part of a policing operation': False,
    'Latitude': 53.797198,
    'Longitude': -1.789092,
    'Gender': 'Male',
    'Age range': '25-34',
    'Officer-defined ethnicity': 'Extra-terrestrial',
    'Legislation': 'Misuse of Drugs Act 1971 (section 23)',
    'Object of search': 'Controlled drugs',
    'station': 'west-yorkshire'}


r = requests.post(url, json=payload)
display(r)
print(r.content)

<Response [200]>

b'{\n  "error": "Invalid value provided for Officer-defined ethnicity: Extra-terrestrial. Allowed values are: \'Asian\',\'Black\',\'Mixed\',\'White\',\'Other\'"\n}\n'
