In [28]:
import aux_functions
from app_functions import attempt_predict
from transformers import TimeTransformer2, BoolTransformer, lat_lon_imputer, Group_Age_Range, Group_Ethnicity

import json
import joblib
import pickle
import requests
from time import sleep
import random

import pandas as pd
pd.set_option('display.max_columns', 100)
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import numpy as np
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
# from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, RobustScaler, OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, CategoricalNB

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score,  accuracy_score, roc_auc_score, make_scorer, confusion_matrix, roc_curve

# needed to use matplotlib inside jupyter notebook
%matplotlib inline 

# Get the data

In [29]:
# Raw training data, read from a path relative to the notebook.
df_ = pd.read_csv("data/train.csv")

# Column names handed to clean_data as its second argument
# (presumably the columns it drops -- confirm in aux_functions).
drop_cols = [
    'Self-defined ethnicity',
    'Outcome',
    'Outcome linked to object of search',
    'Removal of more than just outer clothing',
]

df_clean = aux_functions.clean_data(df_, drop_cols)

# Pipeline Definition

In [30]:
def create_pipeline(df, model):
    """Assemble the preprocessing + estimator pipeline and fit it on under-sampled data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "target" column together with the feature columns
        selected below ('Date', 'Type', 'Object of search',
        'Part of a policing operation', 'Latitude', 'Longitude', 'station').
        Any other column is dropped by the ColumnTransformer.
    model : estimator
        Object used as the final 'model' step of the pipeline.

    Returns
    -------
    tuple
        ``(pipeline, X_resampled, y_resampled)``: the fitted pipeline and the
        randomly under-sampled features/labels it was fitted on.
    """
    labels = df["target"].copy()
    features = df.drop(columns=["target"]).copy()

    # One column group per preprocessing branch.
    date_columns = ['Date']
    category_columns = ['Type', 'Object of search']
    flag_columns = ['Part of a policing operation']
    # 'station' travels with the coordinates because it is needed for NaN imputing.
    coordinate_columns = ['Latitude', 'Longitude', 'station']

    # Time features: custom transformer followed by min-max scaling.
    date_branch = Pipeline(steps=[
        ('time_transformer', TimeTransformer2()),
        ('scaler', MinMaxScaler()),
    ])

    # Categoricals: fill gaps with the most frequent value, then target-encode.
    category_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', TargetEncoder(handle_unknown='value', min_samples_leaf=20, smoothing=10)),
    ])

    # Booleans: custom transformer only.
    flag_branch = Pipeline(steps=[
        ('bool_transformer', BoolTransformer()),
    ])

    # Coordinates: custom lat/lon imputer followed by min-max scaling.
    coordinate_branch = Pipeline(steps=[
        ('imputer', lat_lon_imputer()),
        ('scaler', MinMaxScaler()),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('time_transformer', date_branch, date_columns),
            ('cat', category_branch, category_columns),
            ('bool', flag_branch, flag_columns),
            ('num', coordinate_branch, coordinate_columns),
        ],
        remainder='drop',
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # Under-sample with a fixed seed before fitting; the resampled data is
    # returned so callers can evaluate on exactly what the pipeline saw.
    under_sampler = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = under_sampler.fit_resample(features, labels)

    pipeline.fit(X_resampled, np.ravel(y_resampled))

    return pipeline, X_resampled, y_resampled


def see_cross_val(pipeline, X, y):
    """Print the per-fold precision of `pipeline` from 5-fold cross-validation.

    `y` is flattened with np.ravel before scoring. Nothing is returned.
    """
    fold_precisions = cross_val_score(
        pipeline,
        X,
        np.ravel(y),
        scoring=make_scorer(precision_score),
        cv=5,
    )
    print(fold_precisions)

# Train the Model with all the Data

In [31]:
# Fit the pipeline with a Gaussian Naive Bayes estimator, then print the
# cross-validated precision on the data returned by create_pipeline.

model = GaussianNB()

pipeline, X, y = create_pipeline(df=df_clean, model=model)

see_cross_val(pipeline=pipeline, X=X, y=y)

[0.52007437 0.54811129 0.51345917 0.5432847  0.53321139]


# Extract Model Files

In [32]:
# Persist what the serving app needs: feature column order, feature dtypes,
# and the fitted pipeline.
#
# `X` is the feature frame returned by create_pipeline (the data the pipeline
# was fitted on). The earlier `X_train` is not defined in any visible cell, so
# the cell raised NameError on a fresh kernel (Restart & Run All).
with open("columns.json", "w") as fh:
    json.dump(X.columns.tolist(), fh)

with open("dtypes.pickle", "wb") as fh:
    pickle.dump(X.dtypes, fh)

joblib.dump(pipeline, "pipeline.pickle")

['pipeline.pickle']

# Test the App

In [37]:
# Load the trial requests; json.load turns bare NaN tokens into float nan.
with open('data/trial_moment_1.json') as moment_1:
    parsed_json = json.load(moment_1)

# Show the second record first, then the "data" payload of every record.
print(parsed_json[1]["data"])

for entry in parsed_json:
    print(entry["data"])

{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'Type': 'Person search', 'Date': '2022-04-18T22:24:57+00:00', 'Part of a policing operation': nan, 'Latitude': nan, 'Longitude': nan, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'Other', 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)', 'Object of search': 'Articles for use in criminal damage', 'station': 'city-of-london'}
{'observation_id': '1558a55e-3df2-4665-8beb-0f0c5eaa0408', 'Type': 'Person search', 'Date': '2022-04-06T18:25:00+00:00', 'Part of a policing operation': True, 'Latitude': nan, 'Longitude': nan, 'Gender': 'Male', 'Age range': '10-17', 'Officer-defined ethnicity': 'White', 'Legislation': 'Misuse of Drugs Act 1971 (section 23)', 'Object of search': 'Controlled drugs', 'station': 'nottinghamshire'}
{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb', 'Type': 'Person search', 'Date': '2022-04-18T22:24:57+00:00', 'Part of a policing operation': nan, 'Latitude': nan, 

In [40]:
# Prints True when the first record's Latitude is a real number, False when it is NaN.
print(np.logical_not(np.isnan(parsed_json[0]["data"]["Latitude"])))

False


In [44]:
# Host name used to build the request URLs for the deployed app.
APP_NAME = "ldsacapstone-production.up.railway.app"

In [53]:
# Inspect the second trial record's "data" dict (rich display of the last expression).
parsed_json[1]["data"]

{'observation_id': 'eeb891e3-3913-4590-82a9-dc23c212dceb',
 'Type': 'Person search',
 'Date': '2022-04-18T22:24:57+00:00',
 'Part of a policing operation': nan,
 'Latitude': nan,
 'Longitude': nan,
 'Gender': 'Male',
 'Age range': '10-17',
 'Officer-defined ethnicity': 'Other',
 'Legislation': 'Police and Criminal Evidence Act 1984 (section 1)',
 'Object of search': 'Articles for use in criminal damage',
 'station': 'city-of-london'}

In [63]:
# String form of NaN. Uses `np.nan` because the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
str(np.nan)

'nan'

In [65]:
# Build a float NaN from its string spelling.
float('nan')

nan

In [64]:
# Send one trial record to the /should_search/ endpoint and show the reply.
url = f"https://{APP_NAME}/should_search/"
payload = parsed_json[1]["data"]
# NOTE: json.dumps serialises float NaN as the bare token `NaN`, which is not
# valid JSON; the recorded response shows the service rejecting such a field.
# The timeout keeps the cell from hanging indefinitely if the service does not
# answer (requests applies no timeout by default).
r = requests.post(
    url,
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
    timeout=30,
)
display(r)
display(r.content)




<Response [200]>

b'{\n  "Part of a policing operation": NaN, \n  "error": "Provided \\"Part of a policing operation\\" field is not of the correct data type"\n}\n'

In [None]:
# Testing the /search_result/ endpoint

url = f"https://{APP_NAME}/search_result/"
payload = {
    "observation_id": 'teste1-teste2-teste3-teste4-teste5',
    # NOTE(review): this sends the string "False", not a JSON boolean --
    # confirm which one the endpoint expects.
    "outcome": str(False)
}

# The timeout keeps the cell from hanging indefinitely if the service does not
# answer (requests applies no timeout by default).
r = requests.post(url, json=payload, timeout=30)
display(r)
display(r.content)