### To Do's:  
- remove text cleaner
- replace age binarizer by threshold
- imputation of missing values
- upload new model to heroku
- See if [FunctionTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html#sklearn.preprocessing.FunctionTransformer) can be useful to avoid custom transforms.
- maybe try to upload with custom transforms.

# Import statements

In [1]:
# Operating System
import os

import pickle
import json

#Pandas and Numpy
import pandas as pd
import numpy as np

#ramdom generator
import random

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer, Binarizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, accuracy_score

# Custom transformers
from custom_transformers.transformer import Selector, TextSelector, NumberSelector, BoolSelector, TextCleanerTransformer


#CURL requests
import requests 

#Plots
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
from scipy.stats import binom_test

# Strings
import re

# Load original data

In [2]:
file_path = os.path.join('C:\\Users\\jnpicao\\Documents\\GitHub\\batch3-workspace\\Capstone\\data', 'train.csv')

# Option for reading a sample of the file
# sample 20% of the rows
p = 1

random.seed(178) # this is to get always the same sample. can be removed if we want the sample to change
try:
    df_original = pd.read_csv(file_path, 
                             skiprows = lambda row_num: random.random() > p and row_num > 0, 
                             #nrows = 10000, 
                             header=0,
                             warn_bad_lines=True)
except:
    print('Ooops!!! We got an error!')
else:
    # Drop observations correspoding to stops that didn't lead to a search
    df = df_original[df_original.VehicleSearchedIndicator==True].reset_index(drop=True).drop(columns='VehicleSearchedIndicator')


# Train-Test split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='ContrabandIndicator'), 
                                                    df['ContrabandIndicator'], 
                                                    test_size=0.6, 
                                                    random_state=42)

# Define classes for pipeline

In [4]:
# # Custom transformer to implement sentence cleaning
# class TextCleanerTransformer(TransformerMixin):
#     def __init__(self, 
#                  regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")], 
#                  lower=True, 
#                  remove_punct=True):
#         self.regex_list = regex_list
#         self.lower = lower
#         self.remove_punct = remove_punct
        
#     def transform(self, X, *_):
#         #X = list(map(self._clean_sentence, X.values))
#         X_copy = X.copy()
#         X_copy = X_copy.applymap(self._clean_sentence)
#         return X_copy
    
#     def _clean_sentence(self, sentence):
        
#         # Make sure it is a string!
#         sentence = str(sentence)
        
#         # Replace given regexes
#         for regex in self.regex_list:
#             sentence = re.sub(regex[0], regex[1], sentence)
            
#         # lowercase
#         if self.lower:
#             sentence = sentence.lower()
        
#         # Trim
#         sentence = sentence.strip()
        
#         return sentence
    
#     def fit(self, *_):
#         return self

In [5]:
# class Selector(BaseEstimator, TransformerMixin):
#     """
#     Transformer to select a column from the dataframe to perform additional transformations on
#     """ 
#     def __init__(self, key):
#         self.key = key
        
#     def fit(self, X, y=None):
#         return self
    

# class TextSelector(Selector):
#     """
#     Transformer to select a single column from the data frame to perform additional transformations on
#     Use on text columns in the data
#     """
#     def transform(self, X):
#         return X[self.key]
    
    
# class NumberSelector(Selector):
#     """
#     Transformer to select a single column from the data frame to perform additional transformations on
#     Use on numeric columns in the data
#     """
#     def transform(self, X):
#         return X[self.key]

    
# class BoolSelector(Selector):
#     """
#     Transformer to select a single column from the data frame to perform additional transformations on
#     Use on numeric columns in the data
#     """
#     def transform(self, X):
#         return X[self.key]

# Pipeline workflow

In [6]:
text_pipe = Pipeline([
                ('selector', TextSelector(['Department Name', 'SearchAuthorizationCode', 'StatuteReason'])),
                ('ordinalencoder', ce.ordinal.OrdinalEncoder())
            ])

numerical_pipe =  Pipeline([
                ('selector', NumberSelector(['SubjectAge'])),
                ('binarizer', Binarizer(threshold=27))
            ])

# bool_pipe =  Pipeline([
#                 ('selector', BoolSelector(['ResidentIndicator']))                
#             ])

# Feature Union allow use to use multiple distinct features in our classifier
feats = FeatureUnion([('text', text_pipe), 
                      ('numerical', numerical_pipe)])

pipe_clf = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

In [7]:
pipe_clf.fit(X_train, y_train)

y_prob_pipe = pipe_clf.predict_proba(X_test)[:,1]

decision_thr = 0.5
y_pred_pipe = y_prob_pipe > decision_thr



# Metrics

In [8]:
y_pred = y_pred_pipe
y_prob = y_prob_pipe

In [9]:
precision_score(y_test, y_pred)

0.5998483699772555

In [10]:
recall_score(y_test, y_pred)

0.514367442465219

In [11]:
accuracy_score(y_test, y_pred)

0.7231464187985928

In [12]:
# pipeline = make_pipeline(TextCleanerTransformer(),
#                          ce.ordinal.OrdinalEncoder(),
#                          RandomForestClassifier(random_state = 42)
#                          )
# pipeline.fit(X_train[cols_to_use], y_train)

In [13]:
#y_pred = pipeline.predict(X_test[cols_to_use])

# Serialize the model

Columns to use

In [14]:
cols_to_use = ['Department Name', 'SearchAuthorizationCode', 'StatuteReason', 'SubjectAge']

In [15]:
with open('columns.json', 'w') as fh:
    json.dump(X_train[cols_to_use].columns.tolist(), fh)

Columns data types

In [16]:
with open('dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train[cols_to_use].dtypes, fh)

Fitted model

In [17]:
from sklearn.externals import joblib
joblib.dump(pipe_clf, 'pipeline.pickle') 

['pipeline.pickle']

Example observation

In [18]:
X_train.iloc[-1,:].to_json('observation.json')

# Deserialize and use the model

Columns to use

In [19]:
with open('columns.json', 'r') as fh:
    cols = json.load(fh)
cols

['Department Name', 'SearchAuthorizationCode', 'StatuteReason', 'SubjectAge']

Columns data types

In [20]:
with open('dtypes.pickle', 'rb') as fh:
    dtypes = pickle.load(fh)
dtypes

Department Name             object
SearchAuthorizationCode     object
StatuteReason               object
SubjectAge                 float64
dtype: object

Fitted model

In [21]:
with open('pipeline.pickle', 'rb') as fh:
    pipe_deserialized = joblib.load(fh)

pipe_deserialized


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key=['Department Name', 'SearchAuthorizationCode', 'StatuteReason'])), ('ordinalencoder', OrdinalEncoder(cols=['Department Name', 'SearchAuthorizationCode', 'St...mators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

Sample observation

In [22]:
with open('observation.json') as fh:
    new_obs = json.load(fh)

new_obs_str = json.dumps(new_obs)

In [23]:
new_obs_str

'{"Department Name": "Darien", "InterventionDateTime": "08/28/2014 06:19:00 PM", "InterventionLocationName": "DARIEN", "InterventionReasonCode": "E", "ReportingOfficerIdentificationID": "PSC21368", "ResidentIndicator": true, "SearchAuthorizationCode": "I", "StatuteReason": "Defective Lights", "SubjectAge": 27.0, "SubjectEthnicityCode": "N", "SubjectRaceCode": "W", "SubjectSexCode": "M", "TownResidentIndicator": false}'

In [24]:
new_obs_dict = json.loads(new_obs_str)

In [25]:
obs = pd.DataFrame([new_obs_dict], columns=cols)
obs = obs.astype(dtypes)
obs

Unnamed: 0,Department Name,SearchAuthorizationCode,StatuteReason,SubjectAge
0,Darien,I,Defective Lights,27.0


### Use deserialized model to predict class of sample observation

In [26]:
pipe_deserialized.predict(obs)

array([False])

Compare result with original model

In [27]:
pipe_deserialized.predict_proba(obs)[0,1]

0.0

In [28]:
pipe_clf.predict_proba(obs)

array([[1., 0.]])

# Generate observation samples for Flask

In [29]:
try:
    del idx
except:
    print("idx doesn't exist!")

idx doesn't exist!


In [30]:
try:
    idx = idx + 1
except:
    idx = 1

print("i = {}".format(idx))

i = 1


In [31]:
random.seed()
row = random.choice(X_test.index.values)
row

68157

In [32]:
#X_test.loc[row,:].to_json().replace('"', '\\"')

In [33]:
#X_test.loc[row,:]

In [34]:
new_request = '{\\"id\\": ' + str(idx) + ', \\"observation\\": ' + X_test.loc[row,:].to_json().replace('"', '\\"') + '}'
print(new_request)

{\"id\": 1, \"observation\": {\"Department Name\":\"CSP Troop D\",\"InterventionDateTime\":\"02\/17\/2018 06:25:00 PM\",\"InterventionLocationName\":\"KILLINGLY\",\"InterventionReasonCode\":\"V\",\"ReportingOfficerIdentificationID\":\"1000001948\",\"ResidentIndicator\":false,\"SearchAuthorizationCode\":\"I\",\"StatuteReason\":\"Other\",\"SubjectAge\":65.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"W\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":false}}


In [35]:
pipe_deserialized.predict(X_test.loc[[row], cols_to_use])

array([False])

In [36]:
pipe_deserialized.predict_proba(X_test.loc[[row], cols_to_use])

array([[0.75209235, 0.24790765]])

### confirm result

In [37]:
pipe_clf.predict_proba(X_test.loc[[row], cols_to_use])

array([[0.75209235, 0.24790765]])

In [38]:
str(idx)

'1'

In [39]:
"{\"id\": 37, \"observation\": {\"Department Name\":\"State Police\",\"InterventionDateTime\":\"03\/20\/2015 12:49:00 PM\",\"InterventionLocationName\":\"WATERTOWN           \",\"InterventionReasonCode\":\"E\",\"ReportingOfficerIdentificationID\":\"1000002596\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"O\",\"StatuteReason\":\"Other\/Error\",\"SubjectAge\":20.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"W\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":false}}"

'{"id": 37, "observation": {"Department Name":"State Police","InterventionDateTime":"03\\/20\\/2015 12:49:00 PM","InterventionLocationName":"WATERTOWN           ","InterventionReasonCode":"E","ReportingOfficerIdentificationID":"1000002596","ResidentIndicator":true,"SearchAuthorizationCode":"O","StatuteReason":"Other\\/Error","SubjectAge":20.0,"SubjectEthnicityCode":"N","SubjectRaceCode":"W","SubjectSexCode":"M","TownResidentIndicator":false}}'

In [40]:
#X_test.loc[row,:].to_json()

In [41]:
y = requests.post('https://heroku-app-model-deploy.herokuapp.com/predict', json = {"id": 87, "observation": {"Department Name":"State Police","InterventionDateTime":"03\\/20\\/2015 12:49:00 PM","InterventionLocationName":"WATERTOWN           ","InterventionReasonCode":"E","ReportingOfficerIdentificationID":"1000002596","ResidentIndicator":true,"SearchAuthorizationCode":"O","StatuteReason":"Other\\/Error","SubjectAge":20.0,"SubjectEthnicityCode":"N","SubjectRaceCode":"W","SubjectSexCode":"M","TownResidentIndicator":false}})
print(y.json())

NameError: name 'true' is not defined

In [None]:
y = requests.post('https://heroku-app-model-deploy.herokuapp.com/update', json = {"id": 3, "true_class": 1})
print(y.json())

In [None]:
new_request = '{\\"id\\": 16, \\"observation\\": ' + X_test.loc[row,:].to_json().replace('"', '\\"') + '}'
print(new_request)

In [None]:
y_test.loc[row]

In [None]:
X_test.iloc[-1,:].to_json().replace('"', '\\"')

In [None]:
new_request = '{\
\\"id\\": 0, \
\\"observation\\": {\\"Department Name\\":\\"Bloomfield\\",\\"InterventionDateTime\\":\\"01\\/15\\/2018 05:01:00 PM\\",\\"InterventionLocationName\\":\\"Bloomfield\\",\\"InterventionReasonCode\\":\\"V\\",\\"ReportingOfficerIdentificationID\\":\\"2103\\",\\"ResidentIndicator\\":true,\\"SearchAuthorizationCode\\":\\"C\\",\\"StatuteReason\\":\\"Traffic Control Signal\\",\\"SubjectAge\\":31.0,\\"SubjectEthnicityCode\\":\\"N\\",\\"SubjectRaceCode\\":\\"B\\",\\"SubjectSexCode\\":\\"M\\",\\"TownResidentIndicator\\":true}\
}'
print(new_request)

In [None]:
json.load("{"Department Name":"Bloomfield","InterventionDateTime":"01\/15\/2018 05:01:00 PM","InterventionLocationName":"Bloomfield","InterventionReasonCode":"V","ReportingOfficerIdentificationID":"2103","ResidentIndicator":true,"SearchAuthorizationCode":"C","StatuteReason":"Traffic Control Signal","SubjectAge":31.0,"SubjectEthnicityCode":"N","SubjectRaceCode":"B","SubjectSexCode":"M","TownResidentIndicator":true}")

In [None]:
json.load("{\"id\": 0, \"observation\": {\"Department Name\":\"Bloomfield\",\"InterventionDateTime\":\"01\/15\/2018 05:01:00 PM\",\"InterventionLocationName\":\"Bloomfield\",\"InterventionReasonCode\":\"V\",\"ReportingOfficerIdentificationID\":\"2103\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"C\",\"StatuteReason\":\"Traffic Control Signal\",\"SubjectAge\":31.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"B\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":true}}")
