# Import statements

In [1]:
# Operating System
import os

import pickle
import json

#Pandas and Numpy
import pandas as pd
import numpy as np

#ramdom generator
import random

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, accuracy_score



#Plots
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
from scipy.stats import binom_test

# Strings
import re

# Define utility functions

In [2]:
def simple_clean(doc, regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")] ):
    
    # Make sure it is a string!
    doc = str(doc)
    
    # remove or replace characters
    for regex in regex_list:
        doc = re.sub(regex[0], regex[1], doc)
    # lowercase
    doc = doc.lower()
    # Trim
    doc = doc.strip()
    # tokenize
    #words = tokenizer.tokenize(doc)
    # remove punctuation
    #words = list(filter(lambda x: x not in string.punctuation, words))
    # stem
    #stems = list(map(stemmer.stem, words))
    #new_doc = " ".join(stems)
    new_doc = doc
    return new_doc

# Define class for pipeline

In [40]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, 
                 regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")], 
                 lower=True, 
                 remove_punct=True):
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        #X = list(map(self._clean_sentence, X.values))
        X_copy = X.copy()
        X_copy = X_copy.applymap(self._clean_sentence)
        return X_copy
    
    def _clean_sentence(self, sentence):
        
        # Make sure it is a string!
        sentence = str(sentence)
        
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()
        
        # Trim
        sentence = sentence.strip()
        
        return sentence
    
    def fit(self, *_):
        return self

In [4]:
# Custom transformer to implement sentence cleaning
class TextPreprocessor(TransformerMixin):
    def __init__(self, 
                 regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")], 
                 lower=True, 
                 remove_punct=True):
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        X_copy = X.copy()
        X_copy = list(map(self._clean_sentence, X_copy))
        return X_copy
    
    def _clean_sentence(self, sentence):
        
        # Make sure it is a string!
        sentence = str(sentence)
        
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()
        
        # Trim
        sentence = sentence.strip()
        
        return sentence
    
    def fit(self, *_):
        return self

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization
        
        variety - format of date (AmE - american type, BrE - british format) 
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run
        """
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except:
            return text

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])

# Load original data

In [5]:
#file_path = ".\data\train.csv"
#file_path = os.path.join('C:\\Users\\jnpicao\\Documents\\GitHub\\batch3-workspace\\Capstone', 'data', 'train.csv')
file_path = os.path.join('C:\\Users\\jnpicao\\Documents\\GitHub', 'train.csv')

# Option for reading a sample of the file
# sample 20% of the rows
p = 0.5

random.seed(178) # this is to get always the same sample. can be removed if we want the sample to change
try:
    df_original = pd.read_csv(file_path, 
                             skiprows = lambda row_num: random.random() > p and row_num > 0, 
                             #nrows = 10000, 
                             header=0,
                             warn_bad_lines=True)
except:
    print('Ooops!!! We got an error!')
else:
    # Drop observations correspoding to stops that didn't lead to a search
    df = df_original[df_original.VehicleSearchedIndicator==True].reset_index(drop=True).drop(columns='VehicleSearchedIndicator')


# Train-Test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='ContrabandIndicator'), 
                                                    df['ContrabandIndicator'], 
                                                    test_size=0.5, 
                                                    random_state=42)

In [7]:
X_train.head()

Unnamed: 0,Department Name,InterventionDateTime,InterventionLocationName,InterventionReasonCode,ReportingOfficerIdentificationID,ResidentIndicator,SearchAuthorizationCode,StatuteReason,SubjectAge,SubjectEthnicityCode,SubjectRaceCode,SubjectSexCode,TownResidentIndicator
26546,Windsor,05/05/2017 05:33:00 AM,WINDSOR,V,205,True,O,Other,57.0,N,W,M,False
18116,Farmington,05/09/2016 11:19:08 AM,Farmington,V,390,True,O,Speed Related,24.0,H,W,M,True
14762,Greenwich,12/28/2015 10:56:11 PM,GREENWICH,I,120000146,False,C,Stop Sign,32.0,H,W,M,False
6930,New Britain,07/21/2014 05:15:00 PM,New Britain,I,DCM0330,True,O,Suspended License,41.0,H,W,M,True
21620,Norwich,10/25/2016 12:15:00 AM,NORWICH,V,1063,True,I,Moving Violation,32.0,N,W,F,False


# Train the model

In [8]:
cols_to_use = ['Department Name','StatuteReason']
X_train[cols_to_use].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19196 entries, 26546 to 15795
Data columns (total 2 columns):
Department Name    19196 non-null object
StatuteReason      19195 non-null object
dtypes: object(2)
memory usage: 449.9+ KB


In [37]:
pd.DataFrame(list(map(simple_clean, X_train[cols_to_use])))

Unnamed: 0,0
0,department name
1,statutereason


In [39]:
X_train['Department Name'].applymap(simple_clean)

AttributeError: 'Series' object has no attribute 'applymap'

In [42]:
pipeline = make_pipeline(TextCleanerTransformer(),
                         ce.ordinal.OrdinalEncoder(),
                         RandomForestClassifier(random_state = 42)
                         )
pipeline.fit(X_train[cols_to_use], y_train)



Pipeline(memory=None,
     steps=[('textcleanertransformer', <__main__.TextCleanerTransformer object at 0x000000768FCC60F0>), ('ordinalencoder', OrdinalEncoder(cols=['Department Name', 'StatuteReason'],
        drop_invariant=False, handle_missing='value',
        handle_unknown='value',
        mapping=[{'col': 'Department N...mators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [44]:
y_pred = pipeline.predict(X_test[cols_to_use])

### Assess results

In [47]:
y_pred.shape

(19196,)

In [48]:
X_test.shape

(19196, 13)

In [49]:
precision_score(y_test, y_pred)

0.4989133059647428

In [50]:
recall_score(y_test, y_pred)

0.3322077504421933

In [51]:
accuracy_score(y_test, y_pred)

0.6755574077932903

# Serialize the model

Columns to use

In [17]:
with open('columns.json', 'w') as fh:
    json.dump(X_train[cols_to_use].columns.tolist(), fh)

Columns data types

In [18]:
with open('dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train[cols_to_use].dtypes, fh)

Fitted model

In [19]:
from sklearn.externals import joblib
joblib.dump(pipeline, 'pipeline.pickle') 

['pipeline.pickle']

Example observation

In [20]:
X_train.iloc[-1,:].to_json('observation.json')

# Deserialize and use the model

Columns to use

In [21]:
with open('columns.json', 'r') as fh:
    cols = json.load(fh)
cols

['Department Name', 'StatuteReason']

Columns data types

In [22]:
with open('dtypes.pickle', 'rb') as fh:
    dtypes = pickle.load(fh)
dtypes

Department Name    object
StatuteReason      object
dtype: object

Fitted model

In [23]:
with open('pipeline.pickle', 'rb') as fh:
    pipe_deserialized = joblib.load(fh)

pipe_deserialized


Pipeline(memory=None,
     steps=[('ordinalencoder', OrdinalEncoder(cols=[0, 1], drop_invariant=False, handle_missing='value',
        handle_unknown='value',
        mapping=[{'col': 0, 'mapping': Windsor             1
Farmington          2
Greenwich           3
New Britain         4
Norwich             5
New Haven          ...mators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

Sample observation

In [24]:
with open('observation.json') as fh:
    new_obs = json.load(fh)

new_obs_str = json.dumps(new_obs)

In [25]:
new_obs_str

'{"Department Name": "West Hartford", "InterventionDateTime": "02/08/2016 04:47:57 PM", "InterventionLocationName": "West Hartford", "InterventionReasonCode": "V", "ReportingOfficerIdentificationID": "1000002168", "ResidentIndicator": true, "SearchAuthorizationCode": "O", "StatuteReason": "Moving Violation", "SubjectAge": 50.0, "SubjectEthnicityCode": "N", "SubjectRaceCode": "W", "SubjectSexCode": "M", "TownResidentIndicator": false}'

In [26]:
new_obs_dict = json.loads(new_obs_str)

In [27]:
obs = pd.DataFrame([new_obs_dict], columns=cols)
obs = obs.astype(dtypes)
obs

Unnamed: 0,Department Name,StatuteReason
0,West Hartford,Moving Violation


### Use deserialized model to predict class of sample observation

In [28]:
pipe_deserialized.predict(obs.values)

array([ True])

Compare result with original model

In [29]:
pipe_deserialized.predict_proba(obs.values)

array([[0.27238847, 0.72761153]])

In [30]:
pipeline.predict_proba(obs.values)

array([[0.27238847, 0.72761153]])

# Generate observation samples for Flask

In [31]:
def generate_new_sample():
    return

In [38]:
random.seed()
row = random.choice(X_test.index.values)
row

35919

In [39]:
X_test.loc[row,:].values

array(['Vernon', '03/25/2018 11:16:00 PM', 'VERNON', 'V', '625', True,
       'O', 'Speed Related', 20.0, 'N', 'W', 'M', False], dtype=object)

In [40]:
pipeline.predict(X_test.loc[[row],cols_to_use].values)

array([ True])

In [41]:
pipeline.predict_proba(X_test.loc[[row],cols_to_use].values)

array([[0.30813559, 0.69186441]])

In [42]:
new_request = '{\\"id\\": 16, \\"observation\\": ' + X_test.loc[row,:].to_json().replace('"', '\\"') + '}'
print(new_request)

{\"id\": 16, \"observation\": {\"Department Name\":\"Vernon\",\"InterventionDateTime\":\"03\/25\/2018 11:16:00 PM\",\"InterventionLocationName\":\"VERNON\",\"InterventionReasonCode\":\"V\",\"ReportingOfficerIdentificationID\":\"625\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"O\",\"StatuteReason\":\"Speed Related\",\"SubjectAge\":20.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"W\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":false}}


In [43]:
y_test.loc[row]

True

In [None]:
X_test.iloc[-1,:].to_json().replace('"', '\\"')

In [None]:
new_request = '{\
\\"id\\": 0, \
\\"observation\\": {\\"Department Name\\":\\"Bloomfield\\",\\"InterventionDateTime\\":\\"01\\/15\\/2018 05:01:00 PM\\",\\"InterventionLocationName\\":\\"Bloomfield\\",\\"InterventionReasonCode\\":\\"V\\",\\"ReportingOfficerIdentificationID\\":\\"2103\\",\\"ResidentIndicator\\":true,\\"SearchAuthorizationCode\\":\\"C\\",\\"StatuteReason\\":\\"Traffic Control Signal\\",\\"SubjectAge\\":31.0,\\"SubjectEthnicityCode\\":\\"N\\",\\"SubjectRaceCode\\":\\"B\\",\\"SubjectSexCode\\":\\"M\\",\\"TownResidentIndicator\\":true}\
}'
print(new_request)

In [None]:
json.load("{"Department Name":"Bloomfield","InterventionDateTime":"01\/15\/2018 05:01:00 PM","InterventionLocationName":"Bloomfield","InterventionReasonCode":"V","ReportingOfficerIdentificationID":"2103","ResidentIndicator":true,"SearchAuthorizationCode":"C","StatuteReason":"Traffic Control Signal","SubjectAge":31.0,"SubjectEthnicityCode":"N","SubjectRaceCode":"B","SubjectSexCode":"M","TownResidentIndicator":true}")

In [None]:
json.load("{\"id\": 0, \"observation\": {\"Department Name\":\"Bloomfield\",\"InterventionDateTime\":\"01\/15\/2018 05:01:00 PM\",\"InterventionLocationName\":\"Bloomfield\",\"InterventionReasonCode\":\"V\",\"ReportingOfficerIdentificationID\":\"2103\",\"ResidentIndicator\":true,\"SearchAuthorizationCode\":\"C\",\"StatuteReason\":\"Traffic Control Signal\",\"SubjectAge\":31.0,\"SubjectEthnicityCode\":\"N\",\"SubjectRaceCode\":\"B\",\"SubjectSexCode\":\"M\",\"TownResidentIndicator\":true}}")
