# Import statements

In [1]:
# Operating System
import os

#Pandas and Numpy
import pandas as pd
import numpy as np

#ramdom generator
import random

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, accuracy_score



#Plots
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
from scipy.stats import binom_test

# Strings
import re

# Define utility functions

In [2]:
def simple_clean(doc, regex_list=[("[\.\?\(\)\|:;_!@/*\-]", " "), (" +", " ")] ):
    
    # Make sure it is a string!
    doc = str(doc)
    
    # remove or replace characters
    for regex in regex_list:
        doc = re.sub(regex[0], regex[1], doc)
    # lowercase
    doc = doc.lower()
    # Trim
    doc = doc.strip()
    # tokenize
    #words = tokenizer.tokenize(doc)
    # remove punctuation
    #words = list(filter(lambda x: x not in string.punctuation, words))
    # stem
    #stems = list(map(stemmer.stem, words))
    #new_doc = " ".join(stems)
    new_doc = doc
    return new_doc

# Load original data

In [141]:
#file_path = ".\data\train.csv"
file_path = os.path.join('data', 'train.csv')

# Option for reading a sample of the file
# sample 20% of the rows
p = 0.5

random.seed(178) # this is to get always the same sample. can be removed if we want the sample to change
try:
    df_original = pd.read_csv(file_path, 
                             skiprows = lambda row_num: random.random() > p and row_num > 0, 
                             #nrows = 10000, 
                             header=0,
                             warn_bad_lines=True)
except:
    print('Ooops!!! We got an error!')
else:
    # Drop observations correspoding to stops that didn't lead to a search
    df = df_original[df_original.VehicleSearchedIndicator==True].reset_index(drop=True).drop(columns='VehicleSearchedIndicator')


# Train-Test split

In [142]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='ContrabandIndicator'), 
                                                    df['ContrabandIndicator'], 
                                                    test_size=0.5, 
                                                    random_state=42)

# Pre-process 

Both the train set and the test set will enter this pre-processing to prepare for fits ans transforms:
* Select columns to keep;
* Clean text features;

In [175]:
def pre_process_data(df: pd.DataFrame) -> pd.DataFrame:
    
    """Transforms the original DataFrame to make it suited to enter the workflow.
    
    Args:
        df (pd.DataFrame):  Original DataFrame, it will be first X_train and then X_test.
        
    Returns:
        df_new (pd.DataFrame): Transformed DataFrame, ready to enter the workflow.
        
    TODO:
        * to do list
        
    """
    
    columns_to_keep = ['Department Name', 'SubjectAge']
#     columns_to_keep = ['Department Name', 
#                        'InterventionReasonCode',
#                        'ResidentIndicator', 
#                        'SearchAuthorizationCode', 
#                        'StatuteReason', 
#                        'SubjectAge', 
#                        'SubjectEthnicityCode', 
#                        'SubjectRaceCode', 
#                        'SubjectSexCode'
#                        ]
    
    # copy the argument
    df_new = df.copy()
    
    df_new = df_new[columns_to_keep]
    
    df_new['Department Name'] = df_new['Department Name'].apply(simple_clean)    
    
    return df_new

In [176]:
# Initializations

ordinalencoder = ce.ordinal.OrdinalEncoder()
binarizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
clf = RandomForestClassifier(random_state = 42)


X_train_cln = pre_process_data(X_train)
X_train_cln = ordinalencoder.fit_transform(X_train_cln)
X_train_cln['SubjectAge'] = binarizer.fit_transform(X_train_cln[['SubjectAge']])
clf.fit(X_train_cln, y_train)


X_test_cln = pre_process_data(X_test)
X_test_cln = ordinalencoder.fit_transform(X_test_cln)
X_test_cln['SubjectAge'] = binarizer.fit_transform(X_test_cln[['SubjectAge']])
y_pred = clf.predict(X_test_cln)




In [177]:
clf.decision_path(X_train_cln)

(<19196x10762 sparse matrix of type '<class 'numpy.int64'>'
 	with 2375350 stored elements in Compressed Sparse Row format>,
 array([    0,  1079,  2142,  3225,  4296,  5353,  6440,  7519,  8594,
         9681, 10762], dtype=int32))

In [178]:
X_train_cln.shape

(19196, 2)

In [179]:
y_pred

array([ True, False, False, ..., False, False, False])

In [180]:
precision_score(y_test, y_pred)

0.5383079610681307

In [181]:
recall_score(y_test, y_pred)

0.346840328027014

In [182]:
accuracy_score(y_test, y_pred)

0.6920191706605543

In [187]:
(y_pred==y_test).sum()/y_test.sum()

2.1360347322720696

In [205]:
((y_pred==True) & (y_test==True)).sum() / y_test.sum()

0.346840328027014

### Predict all `True`

In [151]:
y_pred_true = np.ones(y_test.shape).astype(bool)

In [152]:
precision_score(y_test, y_pred_true)

0.32397374453011046

In [153]:
recall_score(y_test, y_pred_true)

1.0

In [154]:
accuracy_score(y_test, y_pred_true)

0.32397374453011046

### Predict all `False`

In [155]:
y_pred_false = np.zeros(y_test.shape).astype(bool)

In [156]:
precision_score(y_test, y_pred_false)

  'precision', 'predicted', average, warn_for)


0.0

In [157]:
recall_score(y_test, y_pred_false)

0.0

In [158]:
accuracy_score(y_test, y_pred_false)

0.6760262554698896

### Predict randomly with positive\negative ratio `r`

In [159]:
np.random.seed(425)
ratio = 0.90

In [160]:
# y_pred_rand = np.random.randint(2, size=y_test.shape).astype(bool)
y_pred_rand = np.random.binomial(n=1, p=ratio, size=y_test.shape).astype(bool)

In [161]:
y_pred_rand.sum()/y_pred_rand.shape[0]

0.8999791623254845

In [162]:
y_pred_rand.shape

(19196,)

In [163]:
precision_score(y_test, y_pred_rand)

0.3224704792776106

In [164]:
recall_score(y_test, y_pred_rand)

0.8958031837916064

In [165]:
accuracy_score(y_test, y_pred_rand)

0.35648051677432796

In [15]:
type(y_pred)

numpy.ndarray