# Import statements

In [96]:
# Operating System
import os

import pickle
import json

#Pandas and Numpy
import pandas as pd
import numpy as np

#ramdom generator
import random

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, accuracy_score

#CURL requests
import requests 

#Plots
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
from scipy.stats import binom_test

# Strings
import re

# Load original data

In [4]:
file_path = os.path.join('C:\\Users\\jnpicao\\Documents\\GitHub\\batch3-workspace\\Capstone\\data', 'train.csv')

# Option for reading a sample of the file
# sample 20% of the rows
p = 1

random.seed(178) # this is to get always the same sample. can be removed if we want the sample to change
try:
    df_original = pd.read_csv(file_path, 
                             skiprows = lambda row_num: random.random() > p and row_num > 0, 
                             #nrows = 10000, 
                             header=0,
                             warn_bad_lines=True)
except:
    print('Ooops!!! We got an error!')
else:
    # Drop observations correspoding to stops that didn't lead to a search
    df = df_original[df_original.VehicleSearchedIndicator==True].reset_index(drop=True).drop(columns='VehicleSearchedIndicator')


# Train-Test split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='ContrabandIndicator'), 
                                                    df['ContrabandIndicator'], 
                                                    test_size=0.6, 
                                                    random_state=42)

# Define classes for pipeline

In [55]:
# # Custom transformer to implement sentence cleaning
# class TextCleanerTransformer(TransformerMixin):
#     def __init__(self, 
#                  regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")], 
#                  lower=True, 
#                  remove_punct=True):
#         self.regex_list = regex_list
#         self.lower = lower
#         self.remove_punct = remove_punct
        
#     def transform(self, X, *_):
#         #X = list(map(self._clean_sentence, X.values))
#         X_copy = X.copy()
#         X_copy = X_copy.applymap(self._clean_sentence)
#         return X_copy
    
#     def _clean_sentence(self, sentence):
        
#         # Make sure it is a string!
#         sentence = str(sentence)
        
#         # Replace given regexes
#         for regex in self.regex_list:
#             sentence = re.sub(regex[0], regex[1], sentence)
            
#         # lowercase
#         if self.lower:
#             sentence = sentence.lower()
        
#         # Trim
#         sentence = sentence.strip()
        
#         return sentence
    
#     def fit(self, *_):
#         return self

In [56]:
# # Custom transformer to implement sentence cleaning
# class TextPreprocessor(TransformerMixin):
#     def __init__(self, 
#                  regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")], 
#                  lower=True, 
#                  remove_punct=True):
#         self.regex_list = regex_list
#         self.lower = lower
#         self.remove_punct = remove_punct
        
#     def transform(self, X, *_):
#         X_copy = X.copy()
#         X_copy = list(map(self._clean_sentence, X_copy))
#         return X_copy
    
#     def _clean_sentence(self, sentence):
        
#         # Make sure it is a string!
#         sentence = str(sentence)
        
#         # Replace given regexes
#         for regex in self.regex_list:
#             sentence = re.sub(regex[0], regex[1], sentence)
            
#         # lowercase
#         if self.lower:
#             sentence = sentence.lower()
        
#         # Trim
#         sentence = sentence.strip()
        
#         return sentence
    
#     def fit(self, *_):
#         return self

In [57]:
# class TextPreprocessor(BaseEstimator, TransformerMixin):
#     def __init__(self,
#                  variety="BrE",
#                  user_abbrevs={},
#                  n_jobs=1):
#         """
#         Text preprocessing transformer includes steps:
#             1. Text normalization
#             2. Punctuation removal
#             3. Stop words removal
#             4. Lemmatization
        
#         variety - format of date (AmE - american type, BrE - british format) 
#         user_abbrevs - dict of user abbreviations mappings (from normalise package)
#         n_jobs - parallel jobs to run
#         """
#         self.variety = variety
#         self.user_abbrevs = user_abbrevs
#         self.n_jobs = n_jobs

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, *_):
#         X_copy = X.copy()

#         partitions = 1
#         cores = mp.cpu_count()
#         if self.n_jobs <= -1:
#             partitions = cores
#         elif self.n_jobs <= 0:
#             return X_copy.apply(self._preprocess_text)
#         else:
#             partitions = min(self.n_jobs, cores)

#         data_split = np.array_split(X_copy, partitions)
#         pool = mp.Pool(cores)
#         data = pd.concat(pool.map(self._preprocess_part, data_split))
#         pool.close()
#         pool.join()

#         return data

#     def _preprocess_part(self, part):
#         return part.apply(self._preprocess_text)

#     def _preprocess_text(self, text):
#         normalized_text = self._normalize(text)
#         doc = nlp(normalized_text)
#         removed_punct = self._remove_punct(doc)
#         removed_stop_words = self._remove_stop_words(removed_punct)
#         return self._lemmatize(removed_stop_words)

#     def _normalize(self, text):
#         # some issues in normalise package
#         try:
#             return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
#         except:
#             return text

#     def _remove_punct(self, doc):
#         return [t for t in doc if t.text not in string.punctuation]

#     def _remove_stop_words(self, doc):
#         return [t for t in doc if not t.is_stop]

#     def _lemmatize(self, doc):
#         return ' '.join([t.lemma_ for t in doc])

In [58]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, 
                 regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")], 
                 lower=True, 
                 remove_punct=True):
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        #X = list(map(self._clean_sentence, X.values))
        X_copy = X.copy()
        X_copy = X_copy.applymap(self._clean_sentence)
        return X_copy
    
    def _clean_sentence(self, sentence):
        
        # Make sure it is a string!
        sentence = str(sentence)
        
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()
        
        # Trim
        sentence = sentence.strip()
        
        return sentence
    
    def fit(self, *_):
        return self

In [59]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[self.key]

    
class BoolSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[self.key]

# Pipeline workflow

In [60]:
# text_pipe = Pipeline([
#                 ('selector', TextSelector(['Department Name', 'SearchAuthorizationCode', 'StatuteReason'])),
#                 ('cleaner', TextCleanerTransformer()),
#                 ('ordinalencoder', ce.ordinal.OrdinalEncoder())
#             ])

text_pipe = Pipeline([
                ('selector', TextSelector(['Department Name', 'SearchAuthorizationCode', 'StatuteReason'])),
                ('ordinalencoder', ce.ordinal.OrdinalEncoder())
            ])


numerical_pipe =  Pipeline([
                ('selector', NumberSelector(['SubjectAge'])),
                ('binarizer', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'))
            ])

# bool_pipe =  Pipeline([
#                 ('selector', BoolSelector(['ResidentIndicator']))                
#             ])

# Feature Union allow use to use multiple distinct features in our classifier
feats = FeatureUnion([('text', text_pipe), 
                      ('numerical', numerical_pipe)])

#feats = FeatureUnion([('text', text_pipe)])

# feats = FeatureUnion([('text', text_pipe), 
#                       ('numerical', numerical_pipe),
#                       ('bool', bool_pipe)])

pipe_clf = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

In [61]:
pipe_clf.fit(X_train, y_train)

y_prob_pipe = pipe_clf.predict_proba(X_test)[:,1]

decision_thr = 0.5
y_pred_pipe = y_prob_pipe > decision_thr



# Metrics

In [62]:
y_pred = y_pred_pipe
y_prob = y_prob_pipe

In [63]:
precision_score(y_test, y_pred)

0.5822126458132105

In [64]:
recall_score(y_test, y_pred)

0.5094266025224288

In [65]:
accuracy_score(y_test, y_pred)

0.7140033879164314

In [66]:
# pipeline = make_pipeline(TextCleanerTransformer(),
#                          ce.ordinal.OrdinalEncoder(),
#                          RandomForestClassifier(random_state = 42)
#                          )
# pipeline.fit(X_train[cols_to_use], y_train)

In [67]:
#y_pred = pipeline.predict(X_test[cols_to_use])

# Serialize the model

Columns to use

In [68]:
cols_to_use = ['Department Name', 'SearchAuthorizationCode', 'StatuteReason', 'SubjectAge']

In [69]:
with open('columns.json', 'w') as fh:
    json.dump(X_train[cols_to_use].columns.tolist(), fh)

Columns data types

In [70]:
with open('dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train[cols_to_use].dtypes, fh)

Fitted model

In [71]:
from sklearn.externals import joblib
joblib.dump(pipe_clf, 'pipeline.pickle') 

['pipeline.pickle']

Example observation

In [72]:
X_train.iloc[-1,:].to_json('observation.json')

# Deserialize and use the model

Columns to use

In [73]:
with open('columns.json', 'r') as fh:
    cols = json.load(fh)
cols

['Department Name', 'SearchAuthorizationCode', 'StatuteReason', 'SubjectAge']

Columns data types

In [74]:
with open('dtypes.pickle', 'rb') as fh:
    dtypes = pickle.load(fh)
dtypes

Department Name             object
SearchAuthorizationCode     object
StatuteReason               object
SubjectAge                 float64
dtype: object

Fitted model

In [75]:
with open('pipeline.pickle', 'rb') as fh:
    pipe_deserialized = joblib.load(fh)

pipe_deserialized


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key=['Department Name', 'SearchAuthorizationCode', 'StatuteReason'])), ('ordinalencoder', OrdinalEncoder(cols=['Department Name', 'SearchAuthorizationCode', 'St...mators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

Sample observation

In [76]:
with open('observation.json') as fh:
    new_obs = json.load(fh)

new_obs_str = json.dumps(new_obs)

In [77]:
new_obs_str

'{"Department Name": "Darien", "InterventionDateTime": "08/28/2014 06:19:00 PM", "InterventionLocationName": "DARIEN", "InterventionReasonCode": "E", "ReportingOfficerIdentificationID": "PSC21368", "ResidentIndicator": true, "SearchAuthorizationCode": "I", "StatuteReason": "Defective Lights", "SubjectAge": 27.0, "SubjectEthnicityCode": "N", "SubjectRaceCode": "W", "SubjectSexCode": "M", "TownResidentIndicator": false}'

In [78]:
new_obs_dict = json.loads(new_obs_str)

In [79]:
obs = pd.DataFrame([new_obs_dict], columns=cols)
obs = obs.astype(dtypes)
obs

Unnamed: 0,Department Name,SearchAuthorizationCode,StatuteReason,SubjectAge
0,Darien,I,Defective Lights,27.0


### Use deserialized model to predict class of sample observation

In [92]:
pipe_deserialized.predict(obs)

array([False])

Compare result with original model

In [91]:
pipe_deserialized.predict_proba(obs)[0,1]

0.1

In [82]:
pipe_clf.predict_proba(obs)

array([[0.9, 0.1]])

# Generate observation samples for Flask

In [83]:
del idx

In [101]:
try:
    idx = idx + 1
except:
    idx = 1

print("i = {}".format(idx))

i = 2


In [102]:
random.seed()
row = random.choice(X_test.index.values)
row

48186

In [103]:
X_test.loc[row,:].values

array(['New Haven', '02/05/2017 10:47:00 AM', 'NEW HAVEN', 'V', '6511',
       True, 'O', 'Window Tint', 26.0, 'N', 'B', 'M', True], dtype=object)

In [104]:
pipe_deserialized.predict(X_test.loc[[row], cols_to_use])

array([False])

In [105]:
pipe_deserialized.predict_proba(X_test.loc[[row], cols_to_use])

array([[0.89544387, 0.10455613]])

### confirm result

In [106]:
pipe_clf.predict_proba(X_test.loc[[row], cols_to_use])

array([[0.89544387, 0.10455613]])

In [108]:
X_test.loc[row,:].to_json().replace('"', '\\"')

'{\\"Department Name\\":\\"New Haven\\",\\"InterventionDateTime\\":\\"02\\/05\\/2017 10:47:00 AM\\",\\"InterventionLocationName\\":\\"NEW HAVEN\\",\\"InterventionReasonCode\\":\\"V\\",\\"ReportingOfficerIdentificationID\\":\\"6511\\",\\"ResidentIndicator\\":true,\\"SearchAuthorizationCode\\":\\"O\\",\\"StatuteReason\\":\\"Window Tint\\",\\"SubjectAge\\":26.0,\\"SubjectEthnicityCode\\":\\"N\\",\\"SubjectRaceCode\\":\\"B\\",\\"SubjectSexCode\\":\\"M\\",\\"TownResidentIndicator\\":true}'

In [107]:
new_request = '{\\"id\\": ' + str(idx) + ', \\"observation\\": ' +  + '}'
print(new_request)

{\"id\": 2, \"observation\": {\"Department Name\":\"New Haven\",\"InterventionDateTime\":\"02\/05\/2017 10:47:00 AM\",\"InterventionLocationName\":\"NEW HAVEN\",\"InterventionReasonCode\":\"V\",\"ReportingOfficerIdentificationID\":\"6511\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"O\",\"StatuteReason\":\"Window Tint\",\"SubjectAge\":26.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"B\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":true}}


In [None]:
"{\"id\": 37, \"observation\": {\"Department Name\":\"State Police\",\"InterventionDateTime\":\"03\/20\/2015 12:49:00 PM\",\"InterventionLocationName\":\"WATERTOWN           \",\"InterventionReasonCode\":\"E\",\"ReportingOfficerIdentificationID\":\"1000002596\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"O\",\"StatuteReason\":\"Other\/Error\",\"SubjectAge\":20.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"W\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":false}}"

In [94]:
X_test.loc[row,:].to_json()

'{"Department Name":"State Police","InterventionDateTime":"03\\/20\\/2015 12:49:00 PM","InterventionLocationName":"WATERTOWN           ","InterventionReasonCode":"E","ReportingOfficerIdentificationID":"1000002596","ResidentIndicator":true,"SearchAuthorizationCode":"O","StatuteReason":"Other\\/Error","SubjectAge":20.0,"SubjectEthnicityCode":"N","SubjectRaceCode":"W","SubjectSexCode":"M","TownResidentIndicator":false}'

In [98]:
y = requests.post('https://heroku-app-model-deploy.herokuapp.com/predict', json = {"id": 87, "observation": {"Department Name":"State Police","InterventionDateTime":"03\\/20\\/2015 12:49:00 PM","InterventionLocationName":"WATERTOWN           ","InterventionReasonCode":"E","ReportingOfficerIdentificationID":"1000002596","ResidentIndicator":true,"SearchAuthorizationCode":"O","StatuteReason":"Other\\/Error","SubjectAge":20.0,"SubjectEthnicityCode":"N","SubjectRaceCode":"W","SubjectSexCode":"M","TownResidentIndicator":false}})
print(y.json())

NameError: name 'true' is not defined

In [109]:
y = requests.post('https://heroku-app-model-deploy.herokuapp.com/update', json = {"id": 3, "true_class": 1})
print(y.json())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [42]:
new_request = '{\\"id\\": 16, \\"observation\\": ' + X_test.loc[row,:].to_json().replace('"', '\\"') + '}'
print(new_request)

{\"id\": 16, \"observation\": {\"Department Name\":\"Vernon\",\"InterventionDateTime\":\"03\/25\/2018 11:16:00 PM\",\"InterventionLocationName\":\"VERNON\",\"InterventionReasonCode\":\"V\",\"ReportingOfficerIdentificationID\":\"625\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"O\",\"StatuteReason\":\"Speed Related\",\"SubjectAge\":20.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"W\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":false}}


In [43]:
y_test.loc[row]

True

In [None]:
X_test.iloc[-1,:].to_json().replace('"', '\\"')

In [None]:
new_request = '{\
\\"id\\": 0, \
\\"observation\\": {\\"Department Name\\":\\"Bloomfield\\",\\"InterventionDateTime\\":\\"01\\/15\\/2018 05:01:00 PM\\",\\"InterventionLocationName\\":\\"Bloomfield\\",\\"InterventionReasonCode\\":\\"V\\",\\"ReportingOfficerIdentificationID\\":\\"2103\\",\\"ResidentIndicator\\":true,\\"SearchAuthorizationCode\\":\\"C\\",\\"StatuteReason\\":\\"Traffic Control Signal\\",\\"SubjectAge\\":31.0,\\"SubjectEthnicityCode\\":\\"N\\",\\"SubjectRaceCode\\":\\"B\\",\\"SubjectSexCode\\":\\"M\\",\\"TownResidentIndicator\\":true}\
}'
print(new_request)

In [None]:
json.load("{"Department Name":"Bloomfield","InterventionDateTime":"01\/15\/2018 05:01:00 PM","InterventionLocationName":"Bloomfield","InterventionReasonCode":"V","ReportingOfficerIdentificationID":"2103","ResidentIndicator":true,"SearchAuthorizationCode":"C","StatuteReason":"Traffic Control Signal","SubjectAge":31.0,"SubjectEthnicityCode":"N","SubjectRaceCode":"B","SubjectSexCode":"M","TownResidentIndicator":true}")

In [None]:
json.load("{\"id\": 0, \"observation\": {\"Department Name\":\"Bloomfield\",\"InterventionDateTime\":\"01\/15\/2018 05:01:00 PM\",\"InterventionLocationName\":\"Bloomfield\",\"InterventionReasonCode\":\"V\",\"ReportingOfficerIdentificationID\":\"2103\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"C\",\"StatuteReason\":\"Traffic Control Signal\",\"SubjectAge\":31.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"B\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":true}}")
