In [69]:
import pickle
import time

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 1. Grab Data

In [4]:
# Read the CSV stored at the relative path 'data/cleaned_data' into the working DataFrame.
df = pd.read_csv('data/cleaned_data')

In [7]:
# Predictor columns (officer and subject attributes) used to model `frisk`.
frisk_columns = [
    'officer_age',
    'officer_race',
    'officer_gender',
    'subject_race',
    'subject_gender',
]

# Target vector, then the feature matrix restricted to the predictor columns.
y = df['frisk']
X = df.loc[:, frisk_columns]

In [10]:
# Hold out 20% of the rows for testing. `random_state` pins the shuffle so the
# split is identical after a kernel restart (same seed as the split used in the
# preprocessing section).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### a. Scale, Normalize

In [26]:
# Feature matrix and target vector
X, y = df[frisk_columns], df['frisk']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Names of the continuous columns, i.e. those stored as float64 or int64
cont_features = [
    name for name, dtype in X.dtypes.items()
    if dtype in (np.float64, np.int64)
]

# Continuous-only views of the training and test features
X_train_cont = X_train[cont_features]
X_test_cont = X_test[cont_features]

In [29]:
# Fill missing continuous values with the per-column median. The imputer is
# fitted on the training split only and then applied to the test split, so no
# test-set statistics leak into preprocessing.
impute = SimpleImputer(strategy='median')
X_train_imputed = impute.fit_transform(X_train_cont)
X_test_imputed = impute.transform(X_test_cont)

# Standardize the imputed features (StandardScaler: zero mean, unit variance),
# again fitting on the training split only.
ss = StandardScaler()
X_train_imputed_scaled = ss.fit_transform(X_train_imputed)
X_test_imputed_scaled = ss.transform(X_test_imputed)

In [30]:
# Names of the categorical (object-dtype) columns of X.
# The builtin `object` is compared against instead of `np.object`: that alias
# was deprecated in NumPy 1.20 and removed in 1.24, while `object` works on
# every NumPy version.
features_cat = [col for col in X.columns if X[col].dtype == object]

# Categorical-only train/test frames with missing values replaced by the
# string 'missing'. `fillna` returns a new frame here rather than mutating a
# slice of X_train / X_test in place (which can trigger SettingWithCopyWarning
# and makes the cell non-idempotent).
X_train_cat = X_train.loc[:, features_cat].fillna(value='missing')
X_test_cat = X_test.loc[:, features_cat].fillna(value='missing')

In [31]:
# One-hot encode the categorical variables. Categories that appear only in the
# test split are encoded as all-zero rows (handle_unknown='ignore').
ohe = OneHotEncoder(handle_unknown='ignore')

# Fit on the training split only, then apply the same encoding to the test split
X_train_ohe = ohe.fit_transform(X_train_cat)
X_test_ohe = ohe.transform(X_test_cat)

# Column labels for the encoded features. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out` and
# fall back to the old method only on versions that predate it.
if hasattr(ohe, 'get_feature_names_out'):
    columns = ohe.get_feature_names_out(input_features=X_train_cat.columns)
else:
    columns = ohe.get_feature_names(input_features=X_train_cat.columns)

# Densify with `toarray()` (plain ndarray) rather than `todense()` (np.matrix)
cat_train_df = pd.DataFrame(X_train_ohe.toarray(), columns=columns)
cat_test_df = pd.DataFrame(X_test_ohe.toarray(), columns=columns)

In [39]:
# Combine the scaled continuous features with the one-hot encoded categorical
# features, column-wise. Both sides carry a default RangeIndex, so rows align
# by position.
#
# The continuous block is labelled with its original column names: leaving the
# default integer labels would mix int and str column names in one frame, which
# scikit-learn >= 1.2 rejects when an estimator is fitted on a DataFrame.
# NOTE(review): assumes the imputer kept every column in `cont_features`
# (SimpleImputer drops columns that are entirely NaN in the training split).
X_train_all = pd.concat(
    [pd.DataFrame(X_train_imputed_scaled, columns=cont_features), cat_train_df],
    axis=1,
)
X_test_all = pd.concat(
    [pd.DataFrame(X_test_imputed_scaled, columns=cont_features), cat_test_df],
    axis=1,
)

# 2. Modeling

In [75]:
def evaluate(model, name):
    """Fit `model` on the training data and score it on both splits.

    Reads the notebook-level `X_train_all`, `y_train`, `X_test_all` and
    `y_test`; they are not passed in as arguments.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
    name : label stored under the 'model' key of the metrics dict

    Returns
    -------
    (dict, DataFrame)
        The dict holds precision, recall, accuracy and F1 for the train and
        test splits, plus 'train_time' (seconds spent in `fit`) and
        'test_time' (seconds spent predicting the test split). The DataFrame
        is the cross-tabulation of `y_test` against the test predictions.
    """
    # Time only the fit call
    fit_began = time.time()
    model.fit(X_train_all, y_train)
    fit_seconds = time.time() - fit_began

    # Predictions on the training data (not timed)
    train_hat = model.predict(X_train_all)

    # Time only the test-set prediction
    predict_began = time.time()
    test_hat = model.predict(X_test_all)
    predict_seconds = time.time() - predict_began

    # Key order matters: it becomes the column order of the results table
    report = {
        'model': name,
        'train_precision': precision_score(y_train, train_hat),
        'train_recall': recall_score(y_train, train_hat),
        'train_accuracy': accuracy_score(y_train, train_hat),
        'train_f1': f1_score(y_train, train_hat),
        'train_time': fit_seconds,
        'test_precision': precision_score(y_test, test_hat),
        'test_recall': recall_score(y_test, test_hat),
        'test_accuracy': accuracy_score(y_test, test_hat),
        'test_f1': f1_score(y_test, test_hat),
        'test_time': predict_seconds,
    }

    # Confusion matrix for the test set
    return report, pd.crosstab(y_test, test_hat)

### a. KNN

In [41]:
# find optimal k 
def find_best_k(X_train_all, y_train, X_test_all, y_test, min_k=1, max_k=25):
    """Pick the KNN neighbour count with the highest F1 score.

    Tries every second value of k from `min_k` up to and including `max_k`
    (odd values only when `min_k` is odd), fitting a KNeighborsClassifier on
    the training data passed in and scoring its predictions on the evaluation
    data passed in.

    Parameters
    ----------
    X_train_all, y_train : training features and labels
    X_test_all, y_test : features and labels used to score each candidate k
    min_k, max_k : inclusive bounds of the search range

    Returns
    -------
    (best_k, best_score)
        The winning k and its F1 score. Both stay at 0 / 0.0 when the range is
        empty or no candidate scores above zero. The same two values are also
        printed.

    NOTE(review): when the evaluation data is the held-out test set, the
    chosen k is tuned on the same data later used to report performance.
    """
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k + 1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        # Use the data handed to the function, not the notebook globals
        # X_train / X_test (the raw, un-encoded split).
        knn.fit(X_train_all, y_train)
        preds = knn.predict(X_test_all)
        f1 = f1_score(y_test, preds)
        # Strict '>' keeps the smallest k on ties
        if f1 > best_score:
            best_k, best_score = k, f1

    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    return best_k, best_score

In [42]:
# Run the k search, passing the preprocessed feature matrices and their labels
# (default range: min_k=1, max_k=25).
find_best_k(X_train_all, y_train, X_test_all, y_test)

Best Value for k: 1
F1-Score: 0.27936507936507937


In [44]:
# Fit and score KNN with k=1, the value reported by the find_best_k search.
knn = KNeighborsClassifier(n_neighbors=1)
knn_results = evaluate(knn, 'knn')
# evaluate returns (metrics dict, confusion matrix); show the metrics.
knn_results[0]

{'model': 'knn',
 'train_precision': 0.31040157998683343,
 'train_recall': 0.33483252455911944,
 'train_accuracy': 0.6775984401234902,
 'train_f1': 0.3221545294084154,
 'train_time': 0.8813753128051758,
 'test_precision': 0.2674772036474164,
 'test_recall': 0.292358803986711,
 'test_accuracy': 0.6557625649913345,
 'test_f1': 0.27936507936507937,
 'test_time': 2.7236382961273193}

In [45]:
# Test-set confusion matrix: rows are the actual `frisk` values, columns the predictions.
knn_results[1]

col_0,0,1
frisk,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5438,1687
1,1491,616


### b. Trees

#### b.1. Decision Tree

In [46]:
# Decision tree with default hyperparameters. `random_state` is fixed (same
# seed as the train/test split) so the fitted tree, and therefore the reported
# metrics, are reproducible across kernel restarts.
dt = DecisionTreeClassifier(random_state=0)
dt_results = evaluate(dt, 'decision_tree')
# evaluate returns (metrics dict, confusion matrix); show the metrics.
dt_results[0]

{'model': 'decision_tree',
 'train_precision': 0.7854251012145749,
 'train_recall': 0.06888389158480293,
 'train_accuracy': 0.7826463738287386,
 'train_f1': 0.1266594124047878,
 'train_time': 0.0999751091003418,
 'test_precision': 0.375,
 'test_recall': 0.03132415757000474,
 'test_accuracy': 0.7670060658578857,
 'test_f1': 0.05781865965834428,
 'test_time': 0.0039980411529541016}

In [47]:
# Test-set confusion matrix: rows are the actual `frisk` values, columns the predictions.
dt_results[1]

col_0,0,1
frisk,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7015,110
1,2041,66


#### b.2. Bagged Tree

In [48]:
# Bagging ensemble with default hyperparameters. Bagging draws random
# bootstrap samples, so `random_state` is fixed (same seed as the train/test
# split) to make the reported metrics reproducible.
bt = BaggingClassifier(random_state=0)
bt_results = evaluate(bt, 'bagged_trees')
# evaluate returns (metrics dict, confusion matrix); show the metrics.
bt_results[0]

{'model': 'bagged_trees',
 'train_precision': 0.6881616939364774,
 'train_recall': 0.08462539945555687,
 'train_accuracy': 0.7817797757677517,
 'train_f1': 0.150716694772344,
 'train_time': 0.4922654628753662,
 'test_precision': 0.3521739130434783,
 'test_recall': 0.03844328429046037,
 'test_accuracy': 0.7644064124783362,
 'test_f1': 0.06931964056482671,
 'test_time': 0.03397989273071289}

In [49]:
# Test-set confusion matrix: rows are the actual `frisk` values, columns the predictions.
bt_results[1]

col_0,0,1
frisk,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6976,149
1,2026,81


#### b.3. Random Forest

In [51]:
# Random forest with default hyperparameters. The forest samples rows and
# features at random, so `random_state` is fixed (same seed as the train/test
# split) to make the reported metrics reproducible.
rf = RandomForestClassifier(random_state=0)
rf_results = evaluate(rf, 'random_forest')
# evaluate returns (metrics dict, confusion matrix); show the metrics.
rf_results[0]

{'model': 'random_forest',
 'train_precision': 0.7337016574585635,
 'train_recall': 0.07858918215173394,
 'train_accuracy': 0.7826463738287386,
 'train_f1': 0.14197134915544155,
 'train_time': 2.229724884033203,
 'test_precision': 0.3768844221105528,
 'test_recall': 0.03559563360227812,
 'test_accuracy': 0.7664644714038128,
 'test_f1': 0.0650477016478751,
 'test_time': 0.19988679885864258}

In [52]:
# Test-set confusion matrix: rows are the actual `frisk` values, columns the predictions.
rf_results[1]

col_0,0,1
frisk,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7001,124
1,2032,75


### c. Boosting

#### c.1. AdaBoost

In [55]:
# AdaBoost with default hyperparameters. `random_state` is fixed (same seed as
# the train/test split) so the reported metrics are reproducible across kernel
# restarts.
ada = AdaBoostClassifier(random_state=0)
ada_results = evaluate(ada, 'adaboost')
# evaluate returns (metrics dict, confusion matrix); show the metrics.
ada_results[0]

{'model': 'adaboost',
 'train_precision': 1.0,
 'train_recall': 0.00023671440407148776,
 'train_accuracy': 0.7712451930888805,
 'train_f1': 0.00047331676724647967,
 'train_time': 1.0121383666992188,
 'test_precision': 1.0,
 'test_recall': 0.00047460844803037496,
 'test_accuracy': 0.7718804159445407,
 'test_f1': 0.0009487666034155599,
 'test_time': 0.07499432563781738}

In [56]:
# Test-set confusion matrix: rows are the actual `frisk` values, columns the predictions.
ada_results[1]

col_0,0,1
frisk,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7125,0
1,2106,1


# 3. Results

In [77]:
# Each entry is the (metrics dict, confusion matrix) pair returned by evaluate
result_dicts = [
    knn_results,
    dt_results,
    bt_results,
    rf_results,
    ada_results,
]

# One row per model, built from the metrics dicts only
results = pd.DataFrame([metrics for metrics, _confusion in result_dicts])

In [79]:
# Side-by-side train/test metrics and timings, one row per model.
results

Unnamed: 0,model,train_precision,train_recall,train_accuracy,train_f1,train_time,test_precision,test_recall,test_accuracy,test_f1,test_time
0,knn,0.310402,0.334833,0.677598,0.322155,0.881375,0.267477,0.292359,0.655763,0.279365,2.723638
1,decision_tree,0.785425,0.068884,0.782646,0.126659,0.099975,0.375,0.031324,0.767006,0.057819,0.003998
2,bagged_trees,0.688162,0.084625,0.78178,0.150717,0.492265,0.352174,0.038443,0.764406,0.06932,0.03398
3,random_forest,0.733702,0.078589,0.782646,0.141971,2.229725,0.376884,0.035596,0.766464,0.065048,0.199887
4,adaboost,1.0,0.000237,0.771245,0.000473,1.012138,1.0,0.000475,0.77188,0.000949,0.074994
