In [None]:
%load_ext autoreload
%autoreload 2

## Package Imports & Setup 

In [None]:
# setting project path
import os
import sys

gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import make_scorer, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_sm_pipeline

from boruta import BorutaPy

from src import classes as c
from src import functions as f
import matplotlib.pyplot as plt
import seaborn as sns

# setting style
# 'talk' is passed positionally, i.e. as seaborn's plotting `context`
sns.set_theme('talk')
# applied after set_theme, so the style sheet's rcParams win wherever both
# set the same key
plt.style.use('fivethirtyeight')
sns.set_palette(palette='Blues_r')

## Importing the Data

In [None]:
# CSV location, relative to the project root (`gparent`) defined in the setup cell.
path = os.path.join(gparent, 'data/processed', 'modeling.csv')
# keep_default_na=False stops pandas from turning strings such as 'NA' into NaN;
# later cells filter on the literal string (e.g. df['Officer Race'] != 'NA').
df = pd.read_csv(path, keep_default_na=False)

## Preprocessing and Harness Objects

In [None]:
# Column selectors: object-dtype columns go to the encoder, numeric columns
# go to the scaler.
string_selector = make_column_selector(dtype_include='object')
number_selector = make_column_selector(dtype_include='number', dtype_exclude='object')

# One-hot encode the object columns (categories unseen during fit are ignored
# at transform time) and standardize the numeric columns.
preprocessing = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
    (StandardScaler(), number_selector),
)

# Seeded SMOTE oversampler reused by the imblearn pipelines in this notebook.
sm = SMOTE(random_state=2021)

In [None]:
# creating f1 scorer
f1 = f.f1

In [None]:
modeling = c.Harness(f1)

## Baseline Model KNN

Dropping engineered features to establish the baseline score.

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# excluded = ['Weapon Type', 'Officer ID', 'Initial Call Type', 'Final Call Type',
#              'Call Type', 'Officer Squad', 'Frisk Flag', 'Weapon Flag',
#              'Reported Year', 'Reported Month','Day of Month',
#              'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
excluded = ['Officer ID','Frisk Flag','Weapon Flag',
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
baseline_df = f.framer(df, [], excluded)

In [None]:
baseline_df.head(2)

In [None]:
baseline_df.info()

In [None]:
X, y = f.Xy(baseline_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
KNN = KNeighborsClassifier()

In [None]:
# baseline = make_pipeline(preprocessing, KNN)

In [None]:
# baseline.fit(X_train, y_train)

In [None]:
# f1_score(baseline.predict(X_train), y_train)

In [None]:
## checking cross validation scores
# modeling.report(baseline, X_train, y_train, 'Baseline', 'KNN - Baseline Features')

## KNN Baseline With SMOTE

In [None]:
baseline_sm = make_sm_pipeline(preprocessing, sm, KNN)

In [None]:
# baseline_sm.fit(X_train, y_train)

In [None]:
# f1_score(baseline_sm.predict(X_train), y_train)

In [None]:
# # checking cross validation scores
# modeling.report(baseline_sm, X_train, y_train, 'KNN SMOTE',\
#                 'Baseline KNN w/ SMOTE')

## Logistic Regression

In [None]:
LR = LogisticRegression(max_iter=1000)

In [None]:
lr_smote = make_sm_pipeline(preprocessing, sm, LR)

In [None]:
lr_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(lr_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(lr_smote, X_train, y_train, 'LR SMOTE', 'Log Reg w/ SMOTE')

## Sub-Splitting the Training Data

In [None]:
Xs_train, Xs_test, ys_train, ys_test = f.splitter(X_train, y_train)

In [None]:
lr_smote.fit(Xs_train, ys_train)

In [None]:
f1_score(lr_smote.predict(Xs_train), ys_train)

In [None]:
f1_score(lr_smote.predict(Xs_test), ys_test)

In [None]:
f.confusion(lr_smote, Xs_train, ys_train)

In [None]:
f.confusion(lr_smote, Xs_test, ys_test)

## Investigating the Data More Closely

In [None]:
# Print the value distribution of every column in the baseline frame.
# The counts are read from baseline_df itself (the cell previously iterated
# baseline_df's columns but indexed the unframed `df`), so the output always
# describes the frame that is actually being modeled.
cols = baseline_df.columns
for col in cols:
    print(f'{baseline_df[col].value_counts()}\n')

In [None]:
baseline_df['Officer YOB'].describe()

## Creating Age Feature
Creating `Officer Age` feature, dropping invalid ages and `Officer YOB` feature.

In [None]:
# Officer age at the time of the report.
df['Officer Age'] = df['Reported Year'] - df['Officer YOB']
# Ages above 100 are treated as invalid: blank them out in one vectorized
# step (Series.mask replaces values where the condition is True with NaN)
# instead of a row-by-row apply.
df['Officer Age'] = df['Officer Age'].mask(df['Officer Age'] > 100)
# Drop the invalid rows. Done in place because later cells keep adding
# columns to this same `df` object.
df.dropna(subset=['Officer Age'], inplace=True)

In [None]:
df.drop('Officer YOB', axis=1, inplace=True)

## Testing Age Feature

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# Columns left out of the modeling frame for the age-feature test
# (single assignment; the duplicated `excluded = excluded =` was a paste artifact).
excluded = ['Officer ID', 'Frisk Flag', 'Weapon Flag',
            'Reported Year', 'Reported Month', 'Day of Month',
            'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
age_df = f.framer(df, [], excluded)

In [None]:
age_df.head(2)

In [None]:
age_df.info()

In [None]:
X, y = f.Xy(age_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
lr_smote2 = make_sm_pipeline(preprocessing, sm, LR)

In [None]:
lr_smote2.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(lr_smote2.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(lr_smote2, X_train, y_train, 'LR SMOTE', 'Added Officer Age')

In [None]:
modeling.history

## Creating Gender & Racial Affinity Features

In [None]:
# 1 when the subject's perceived race equals the officer's race, else 0.
# Element-wise comparison of the two columns replaces the per-row Python
# loop with chained df[col][row] lookups; the resulting flags are identical.
df['Racial Affinity'] = (df['Subject Perceived Race'] == df['Officer Race']).astype(int)

In [None]:
df['Officer Gender'].value_counts()

In [None]:
# Recode officer gender to the spelled-out labels so it can be compared with
# 'Subject Perceived Gender' in the affinity feature.
# Accepting 'Male' as well as 'M' makes the cell safe to re-run: previously a
# second run mapped every already-recoded 'Male' to 'Female'.
# NOTE(review): any value other than 'M'/'Male' (including unknown or
# unspecified codes, if the data has them) is labeled 'Female' — confirm
# against the value_counts shown in the previous cell.
df['Officer Gender'] = df['Officer Gender'].apply(lambda x: 'Male' if x in ('M', 'Male') else 'Female')

In [None]:
# 1 when the subject's perceived gender equals the officer's gender, else 0.
# Element-wise comparison of the two columns replaces the per-row Python
# loop with chained df[col][row] lookups; the resulting flags are identical.
df['Gender Affinity'] = (df['Subject Perceived Gender'] == df['Officer Gender']).astype(int)

In [None]:
df.head(3)

## Testing New Features

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# Columns left out of the modeling frame for the affinity-feature test
# (single assignment; the repeated `excluded = excluded = excluded =` was a paste artifact).
excluded = ['Officer ID', 'Frisk Flag', 'Weapon Flag',
            'Reported Year', 'Reported Month', 'Day of Month',
            'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
df_3 = f.framer(df, [], excluded)

In [None]:
df_3.head(2)

In [None]:
df_3.info()

In [None]:
X, y = f.Xy(df_3)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
lr_smote3 = make_sm_pipeline(preprocessing, sm, LR)

In [None]:
lr_smote3.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(lr_smote3.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(lr_smote3, X_train, y_train, 'LR SMOTE ', 'Log Reg w/ New Age, Race, Gender Features')

In [None]:
modeling.history

## Testing Dropping Nulls
Dropping rows where `Subject Perceived Race` is `NA`.

In [None]:
dropd = df[(df['Subject Perceived Race']!='NA')]

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# Columns left out of the modeling frame after dropping subject-race NAs
# (single assignment; the duplicated `excluded = excluded =` was a paste artifact).
excluded = ['Officer ID', 'Frisk Flag', 'Weapon Flag',
            'Reported Year', 'Reported Month', 'Day of Month',
            'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
df_4 = f.framer(dropd, [], excluded)

In [None]:
X, y = f.Xy(df_4)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
lr_smote4 = make_sm_pipeline(preprocessing, sm, LR)

In [None]:
lr_smote4.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(lr_smote4.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(lr_smote4, X_train, y_train, 'LR SMOTE New Age, Race, Gender Features', 'Dropped Subject Race NA')

In [None]:
modeling.history

## Dropping Officer Race Nulls

In [None]:
dropd2 = df[(df['Subject Perceived Race']!='NA') &\
           (df['Officer Race']!='NA')]

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# Columns left out of the modeling frame after dropping subject- and officer-race NAs
# (single assignment; the duplicated `excluded = excluded =` was a paste artifact).
excluded = ['Officer ID', 'Frisk Flag', 'Weapon Flag',
            'Reported Year', 'Reported Month', 'Day of Month',
            'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
df_5 = f.framer(dropd2, [], excluded)

In [None]:
df_5.head(2)

In [None]:
df_5.info()

In [None]:
X, y = f.Xy(df_5)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
lr_smote5 = make_sm_pipeline(preprocessing, sm, LR)

In [None]:
lr_smote5.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(lr_smote5.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(lr_smote5, X_train, y_train, 'LR SMOTE New Age, Race, Gender Features', 'Drpd S Race, O Race NA')

In [None]:
modeling.history

## Dropping Subject Age Nulls

In [None]:
dropd3 = df[(df['Subject Perceived Race']!='NA') &\
            (df['Officer Race']!='NA')&\
            (df['Subject Age Group']!='NA')]

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# Columns left out of the modeling frame after dropping race and subject-age NAs
# (single assignment; the duplicated `excluded = excluded =` was a paste artifact).
excluded = ['Officer ID', 'Frisk Flag', 'Weapon Flag',
            'Reported Year', 'Reported Month', 'Day of Month',
            'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
df_6 = f.framer(dropd3, [], excluded)

In [None]:
df_6.head(2)

In [None]:
df_6.info()

In [None]:
X, y = f.Xy(df_6)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
lr_smote6 = make_sm_pipeline(preprocessing, sm, LR)

In [None]:
lr_smote6.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(lr_smote6.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(lr_smote6, X_train, y_train,\
                'LR SMOTE New Age, Race, Gender Features',\
                'Dpd Nulls S Race, O Race NA, S Age')

In [None]:
modeling.history

## Testing on Sub-Splits

In [None]:
Xs_train, Xs_test, ys_train, ys_test = f.splitter(X_train, y_train)

In [None]:
lr_smote6.fit(Xs_train, ys_train)

In [None]:
f1_score(lr_smote6.predict(Xs_train), ys_train)

In [None]:
f1_score(lr_smote6.predict(Xs_test), ys_test)

In [None]:
f.confusion(lr_smote6, Xs_train, ys_train)

In [None]:
f.confusion(lr_smote6, Xs_test, ys_test)

## Balanced Class Weights

In [None]:
LR_bal = LogisticRegression(class_weight='balanced', max_iter=1000)

In [None]:
lr_smote7 = make_sm_pipeline(preprocessing, sm, LR_bal)

In [None]:
lr_smote7.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(lr_smote7.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(lr_smote7, X_train, y_train, 'LR_bal', 'Dpd Nulls S Race, O Race NA, S Age')

In [None]:
modeling.history

## Checking C Values

## Rough Estimate By Hand

In [None]:
# # preprocessing
# string_selector = make_column_selector(dtype_include='object')
# number_selector = make_column_selector(dtype_include='number', dtype_exclude='object')
# preprocessing = make_column_transformer((OneHotEncoder
#                                          (handle_unknown='ignore'),string_selector),
#                                           (StandardScaler(), number_selector))
# # C = [0.001,.009,0.01,.09,1,5,10,25]
# C = np.logspace(-3, 3, 7)
# for c in C:
#     LR_clf = LogisticRegression(class_weight='balanced', C=c, max_iter=1000)
#     # make_pipeline from imblearn
#     sm = SMOTE(random_state=2021) 
#     pipeline  = make_sm_pipeline(preprocessing, sm, LR_clf)
#     pipeline.fit(X_train, y_train)
#     modeling.report(pipeline, X_train, y_train, 'LR_bal', f'C={c}')
# f1_score(pipeline.predict(X_train), y_train)

In [None]:
modeling.history

## Grid Searching Parameters

In [None]:
# Grid search over regularization strength, penalty type and solver for the
# class-weighted logistic regression inside the SMOTE pipeline, scored on F1.
LR_clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=2021)

# The pipeline is not fitted here: GridSearchCV clones it, fits every
# candidate itself and refits the best one, so a separate fit beforehand only
# added run time without affecting the search.
pipeline = make_sm_pipeline(preprocessing, sm, LR_clf)

# Seven C values, one per decade from 1e-3 to 1e3.
C = np.logspace(-3, 3, 7)
penalty = ['l1', 'l2']
solver = ['liblinear', 'saga']

# Keys use the step name that make_pipeline derives from the estimator class.
grid_values = {'logisticregression__C': C,
               'logisticregression__penalty': penalty,
               'logisticregression__solver': solver}

grid_clf_f1 = GridSearchCV(pipeline, param_grid=grid_values, n_jobs=-1, scoring='f1')
grid_clf_f1.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters, read from the refit best pipeline.
best_params = grid_clf_f1.best_estimator_.get_params()
for label, key in (('C', 'logisticregression__C'),
                   ('penalty', 'logisticregression__penalty'),
                   ('solver', 'logisticregression__solver')):
    print(f'Best {label}:', best_params[key])

In [None]:
# Logistic regression with the hyperparameters chosen after the grid search.
LR_tuned = LogisticRegression(class_weight='balanced', C=.001,
                              solver='liblinear', penalty='l2',
                              max_iter=1000, random_state=2021)

pipeline = make_sm_pipeline(preprocessing, sm, LR_tuned)
pipeline.fit(X_train, y_train)
# The training score sits in the middle of the cell, where a bare expression
# is evaluated but never displayed, so it is printed explicitly.
print('Train F1:', f1_score(pipeline.predict(X_train), y_train))
# The note recorded in the modeling history now matches the estimator above;
# it previously read 'C=.01 ... penalty=l1' while the model used C=.001 and l2.
modeling.report(pipeline, X_train, y_train, 'LR_tuned', 'C=.001, solver=liblinear, penalty=l2')

In [None]:
modeling.history

## Testing Tuned Model on Sub-Splits

In [None]:
Xs_train, Xs_test, ys_train, ys_test = f.splitter(X_train, y_train)

In [None]:
pipeline  = make_sm_pipeline(preprocessing, sm, LR_tuned)
pipeline.fit(Xs_train, ys_train)

In [None]:
f1_score(pipeline.predict(Xs_train), ys_train)

In [None]:
f1_score(pipeline.predict(Xs_test), ys_test)

In [None]:
f.confusion(pipeline, Xs_train, ys_train)

In [None]:
f.confusion(pipeline, Xs_test, ys_test)

## Adding Polynomial Features

In [None]:
preprocessing2 = make_column_transformer((PolynomialFeatures(degree=5, interaction_only=True), number_selector),
                                         (OneHotEncoder(handle_unknown='ignore'),string_selector),
                                          (StandardScaler(), number_selector))

In [None]:
preprocessing2.fit_transform(X_train);

In [None]:
lr_poly = make_sm_pipeline(preprocessing2, sm, LR_tuned)

In [None]:
lr_poly.fit(X_train, y_train)

In [None]:
f1_score(lr_poly.predict(X_train), y_train)

In [None]:
modeling.report(lr_poly, X_train, y_train, 'LR_tuned Poly', 'LR Polys: d=5, interaction_only=True')

In [None]:
modeling.history

## Testing Polynomial Model on Sub-Splits

In [None]:
Xs_train, Xs_test, ys_train, ys_test = f.splitter(X_train, y_train)

In [None]:
pipeline  = make_sm_pipeline(preprocessing2, sm, LR_tuned)
pipeline.fit(Xs_train, ys_train)

In [None]:
f1_score(pipeline.predict(Xs_train), ys_train)

In [None]:
f1_score(pipeline.predict(Xs_test), ys_test)

In [None]:
f.confusion(pipeline, Xs_train, ys_train)

In [None]:
f.confusion(pipeline, Xs_test, ys_test)

## Tuning Polys Model without SMOTE

In [None]:
# LR_unbal = LogisticRegression(max_iter=1000, random_state=2021)

# pipeline  = make_pipeline(preprocessing2, LR_unbal)
# pipeline.fit(X_train, y_train)
# f1_score(pipeline.predict(X_train), y_train)
# modeling.report(pipeline, X_train, y_train, 'LR_unbal',
#                 'polys w/ no smote')

In [None]:
# C = np.logspace(-3, 3, 7)
# penalty = ['l1', 'l2']
# solver = ['liblinear', 'saga']
# weight = [None, 'balanced']

# grid_values = {'logisticregression__C':C, 
#                'logisticregression__penalty':penalty,
#                'logisticregression__solver':solver,
#               'logisticregression__class_weight':weight}

# grid_clf_f1 = GridSearchCV(pipeline, param_grid = grid_values, n_jobs=-1, verbose=3, scoring = 'f1')
# grid_clf_f1.fit(X_train, y_train)

In [None]:
# print('Best C:', grid_clf_f1.best_estimator_.get_params()['logisticregression__C'])
# print('Best penalty:', grid_clf_f1.best_estimator_.get_params()['logisticregression__penalty'])
# print('Best solver:', grid_clf_f1.best_estimator_.get_params()['logisticregression__solver'])
# print('Best class weight:', grid_clf_f1.best_estimator_.get_params()['logisticregression__class_weight'])

In [None]:
# LR_utuned = LogisticRegression(max_iter=1000, C=1, penalty='l1',
#                                solver='liblinear',
#                                class_weight='balanced',
#                                random_state=2021)

# pipeline  = make_pipeline(preprocessing2, LR_utuned)
# pipeline.fit(X_train, y_train)
# f1_score(pipeline.predict(X_train), y_train)
# modeling.report(pipeline, X_train, y_train, 'LR_utuned',
#                 'tuned w/ no smote')

In [None]:
# modeling.history

## Tuning no SMOTE, No Polys

In [None]:
# LR_unbal = LogisticRegression(max_iter=1000, random_state=2021)

# pipeline  = make_pipeline(preprocessing, LR_unbal)
# pipeline.fit(X_train, y_train)
# f1_score(pipeline.predict(X_train), y_train)
# modeling.report(pipeline, X_train, y_train, 'LR_unbal',
#                 'no polys, no smote')

In [None]:
# C = np.logspace(-3, 3, 7)
# penalty = ['l1', 'l2']
# solver = ['liblinear', 'saga']
# weight = [None, 'balanced']

# grid_values = {'logisticregression__C':C, 
#                'logisticregression__penalty':penalty,
#                'logisticregression__solver':solver,
#               'logisticregression__class_weight':weight}

# grid_clf_f1 = GridSearchCV(pipeline, param_grid = grid_values, scoring = 'f1')
# grid_clf_f1.fit(X_train, y_train)

In [None]:
# print('Best C:', grid_clf_f1.best_estimator_.get_params()['logisticregression__C'])
# print('Best penalty:', grid_clf_f1.best_estimator_.get_params()['logisticregression__penalty'])
# print('Best solver:', grid_clf_f1.best_estimator_.get_params()['logisticregression__solver'])
# print('Best class weight:', grid_clf_f1.best_estimator_.get_params()['logisticregression__class_weight'])

In [None]:
# LR_utuned = LogisticRegression(max_iter=1000, C=.1, penalty='l1',
#                                solver='liblinear',
#                                class_weight='balanced',
#                                random_state=2021)

# pipeline  = make_pipeline(preprocessing, LR_utuned)
# pipeline.fit(X_train, y_train)
# f1_score(pipeline.predict(X_train), y_train)
# modeling.report(pipeline, X_train, y_train, 'LR_utuned',
#                 'tuned no polys, no smote')

In [None]:
# modeling.history

In [None]:
# Xs_train, Xs_test, ys_train, ys_test = f.splitter(X_train, y_train)

In [None]:
# pipeline  = make_pipeline(preprocessing2, LR_tuned)
# pipeline.fit(Xs_train, ys_train)

In [None]:
# f1_score(pipeline.predict(Xs_train), ys_train)

In [None]:
# f1_score(pipeline.predict(Xs_test), ys_test)

In [None]:
# f.confusion(pipeline, Xs_train, ys_train)

In [None]:
# f.confusion(pipeline, Xs_test, ys_test)

## Feature Selection

In [None]:
dropd4 = df[(df['Subject Perceived Race']!='NA') &\
            (df['Officer Race']!='NA')&\
            (df['Subject Age Group']!='NA')]

In [None]:
# 
# Candidate columns handed to f.feature_test in the cell below.
feature_list = ['Weapon Type', 'Officer ID', 'Initial Call Type', 'Final Call Type',
                'Call Type', 'Officer Squad', 'Arrest Flag',
                'Frisk Flag',  'Sector', 'Beat', 'Weapon Flag',
                'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# Columns left out of the modeling frame
# (single assignment; the duplicated `excluded = excluded =` was a paste artifact).
excluded = ['Officer ID', 'Frisk Flag', 'Weapon Flag',
            'Reported Year', 'Reported Month', 'Day of Month',
            'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
f.feature_test(dropd4, LR_tuned, feature_list)

In [None]:
dropd4.columns

In [None]:
select_list = ['Target', 'Subject Age Group', 'Weapon Type', 'Officer ID',
               'Officer Gender', 'Officer Race', 'Subject Perceived Race',
               'Subject Perceived Gender','Initial Call Type', 'Precinct',
               'Weapon Flag', 'Beat Flag', 'Officer Age',
               'Racial Affinity', 'Gender Affinity']

In [None]:
df_7 = dropd4[select_list]

In [None]:
df_7.head(2)

In [None]:
df_7.info()

In [None]:
X, y = f.Xy(df_7)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
selector = make_pipeline(preprocessing2, LR_tuned)

In [None]:
selector.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(selector.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(selector, X_train, y_train, 'LR+polys, no smote', 'Top Features Selected')

In [None]:
modeling.history

## Re-Tuning on Selected Features

In [None]:
LR_Select = LogisticRegression(max_iter=500, random_state=2021)

pipeline  = make_pipeline(preprocessing2, LR_Select)
pipeline.fit(X_train, y_train)
f1_score(pipeline.predict(X_train), y_train)
modeling.report(pipeline, X_train, y_train, 'LR_Select',
                'polys, no smote')

In [None]:
# C = np.logspace(-3, 3, 7)
# penalty = ['l1', 'l2']
# solver = ['liblinear', 'saga']
# weight = [None, 'balanced']

# grid_values = {'logisticregression__C':C, 
#                'logisticregression__penalty':penalty,
#                'logisticregression__solver':solver,
#               'logisticregression__class_weight':weight}

# grid_clf_f1 = GridSearchCV(pipeline, param_grid = grid_values, scoring = 'f1')
# grid_clf_f1.fit(X_train, y_train)

In [None]:
# Disabled together with the grid search in the previous cell, which is
# commented out. While that search is off, `grid_clf_f1` still refers to the
# search fitted under "Grid Searching Parameters" (a SMOTE pipeline on a
# different feature set), so printing it here would report parameters that do
# not belong to the selected-feature model. Re-enable both cells together.
# print('Best C:', grid_clf_f1.best_estimator_.get_params()['logisticregression__C'])
# print('Best penalty:', grid_clf_f1.best_estimator_.get_params()['logisticregression__penalty'])
# print('Best solver:', grid_clf_f1.best_estimator_.get_params()['logisticregression__solver'])
# print('Best class weight:', grid_clf_f1.best_estimator_.get_params()['logisticregression__class_weight'])

In [None]:
LR_seltuned = LogisticRegression(max_iter=500, C=1, penalty='l1',
                               solver='liblinear',
                               class_weight='balanced',
                               random_state=2021)

pipeline  = make_pipeline(preprocessing2, LR_seltuned)
pipeline.fit(X_train, y_train)
f1_score(pipeline.predict(X_train), y_train)
modeling.report(pipeline, X_train, y_train, 'LR_seltuned',
                'polys, no smote, selected features')

In [None]:
modeling.history

## Testing on SubSplits

In [None]:
Xs_train, Xs_test, ys_train , ys_test = f.splitter(X_train, y_train)

In [None]:
preprocessing2.fit_transform(Xs_train)

In [None]:
sub_pipeline = make_pipeline(preprocessing2, LR_seltuned)

In [None]:
sub_pipeline.fit(Xs_train, ys_train)

In [None]:
f1_score(sub_pipeline.predict(Xs_train), ys_train)

In [None]:
f1_score(sub_pipeline.predict(Xs_test), ys_test)

In [None]:
modeling.report(sub_pipeline, Xs_train, ys_train, 'LR Polys', 'tuned select features  training')

In [None]:
f.confusion(sub_pipeline, Xs_train, ys_train)

In [None]:
modeling.report(sub_pipeline, Xs_test, ys_test, 'LR_Polys', 'tuned select Features testing')

In [None]:
f.confusion(sub_pipeline, Xs_test, ys_test)

In [None]:
modeling.history

## Boruta 

In [None]:
# NOTE(review): every other call in this notebook passes a DataFrame to f.Xy
# (e.g. f.Xy(df_7)); this one passes nothing. Unless f.Xy declares a default
# argument this raises TypeError — confirm which frame Boruta should run on.
X, y = f.Xy()

In [None]:
X.head()

In [None]:
num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))]

In [None]:
ss = StandardScaler()

X[num_cols] = ss.fit_transform(X[num_cols])

In [None]:
X.head(2)

In [None]:
X = pd.get_dummies(X)

In [None]:
# shallow, class-weighted random forest used by Boruta as its estimator
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=2021)

# find all relevant features among the scaled, dummy-encoded columns of X
feat_selector.fit(X.values, y.values)

# check selected features (per-column selection mask)
# note: a bare expression in the middle of a cell is evaluated but not displayed
feat_selector.support_

# check ranking of features
feat_selector.ranking_

In [None]:
# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X.values)

green_area = X.columns[feat_selector.support_].to_list()
blue_area = X.columns[feat_selector.support_weak_].to_list()
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

In [None]:
# Feature names ordered by Boruta rank, best (lowest rank) first.
# ranking_ holds one rank per column of X, so it is used as a sort key.
# The previous version used the rank values as positional indices into
# X.columns, which repeated a few early column names and then sorted them
# alphabetically. The stable sort keeps tied features in column order.
X.columns[np.argsort(feat_selector.ranking_, kind='stable')].to_list()

In [None]:
X_filtered = feat_selector.transform(X.values)

In [None]:
boruta = pd.DataFrame(X_filtered)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(boruta, y)

In [None]:
# LR_utuned is otherwise only assigned in commented-out cells, so on a fresh
# kernel this cell raised NameError. It is defined here with the parameters of
# the last commented-out definition in the notebook.
# NOTE(review): the earlier commented-out variant (the polynomial one) used
# C=1 instead of C=.1 — confirm which was intended for the Boruta run.
LR_utuned = LogisticRegression(max_iter=1000, C=.1, penalty='l1',
                               solver='liblinear',
                               class_weight='balanced',
                               random_state=2021)

# Polynomial preprocessing + tuned logistic regression, without SMOTE.
boruta_selected = make_pipeline(preprocessing2, LR_utuned)

In [None]:
boruta_selected.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(boruta_selected.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(boruta_selected, X_train, y_train, 'LR+polys, no smote', 'Boruta Selected')

In [None]:
modeling.history

## Decision Tree

In [None]:
# features engineered during eda
feature_list3 = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
excluded = ['Officer ID','Frisk Flag','Weapon Flag',
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour']

In [None]:
baseline_df2 = f.framer(df, [], excluded)

In [None]:
baseline_df2.head(2)

In [None]:
X, y = f.Xy(baseline_df2)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
preprocessing.fit_transform(X_train)

In [None]:
# Seeded like the other estimators in this notebook (2021) so the tree's
# cross-validation scores are reproducible across runs.
DT = DecisionTreeClassifier(random_state=2021)

In [None]:
# This pipeline has no sampler (it is reported as 'Tree w/o SMOTE'), so it is
# built with the plain scikit-learn builder, as the random-forest pipelines are.
# NOTE(review): this uses preprocessing2 (polynomial features) while the
# random-forest pipelines and the fit_transform check above use
# `preprocessing` — confirm which one the tree is meant to use.
dt_pipeline = make_pipeline(preprocessing2, DT)

In [None]:
modeling.report(dt_pipeline, X_train, y_train, 'Tree CLF', 'Tree w/o SMOTE')

## Random Forest

In [None]:
RF = RandomForestClassifier(random_state=2021)

In [None]:
rf_pipeline = make_pipeline(preprocessing, RF)

In [None]:
modeling.report(rf_pipeline, X_train, y_train, 'RandomForest', 'Forest w/o SMOTE')

In [None]:
rf_pipeline.get_params().keys()

In [None]:
# criterion = ['gini', 'entropy']
# max_depth = [2, 3, None]
# min_samples_split = [2, 5, 10]
# class_weight = ['balanced', 'balanced_subsample']

# grid_values = {'randomforestclassifier__criterion':criterion, 
#                'randomforestclassifier__max_depth':max_depth,
#                'randomforestclassifier__min_samples_split':min_samples_split,
#               'randomforestclassifier__class_weight':class_weight}

# grid_rf_f1 = GridSearchCV(rf_pipeline, param_grid = grid_values, scoring = 'f1')
# grid_rf_f1.fit(X_train, y_train)

In [None]:
# print('Best criterion:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__criterion'])
# print('Best max_depth:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__max_depth'])
# print('Best min_samples_split:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__min_samples_split'])
# print('Best class weight:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__class_weight'])

In [None]:
RF = RandomForestClassifier(criterion='gini',
                            max_depth=3, min_samples_split=2,
                            class_weight='balanced',
                            random_state=2021)

In [None]:
rf_pipeline = make_pipeline(preprocessing, RF)

In [None]:
modeling.report(rf_pipeline, X_train, y_train, 'RandomForest', 'Forest w/o SMOTE')

In [None]:
modeling.history

In [None]:
rf_pipeline.fit(X_train, y_train)

In [None]:
f.confusion(rf_pipeline, X_train, y_train)