In [None]:
# reload imported modules automatically before each cell runs, so edits to the
# project code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# setting project path
import os
import sys

# relative path two directory levels up; appended to sys.path ahead of the
# `from src import ...` lines below, and reused later to locate the data file
gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import make_scorer, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_sm_pipeline

from src import classes as c
from src import functions as f
import matplotlib.pyplot as plt
import seaborn as sns

# plotting defaults, applied in this order: seaborn 'talk' context, the
# matplotlib 'fivethirtyeight' style sheet, then the reversed Blues palette
sns.set_theme(context='talk')
plt.style.use('fivethirtyeight')
sns.set_palette('Blues_r')

## Importing the Data

In [None]:
# location of the processed modeling table, relative to gparent
path = os.path.join(gparent, 'data/processed', 'modeling.csv')

# keep_default_na=False: no strings are converted to NaN while parsing
df = pd.read_csv(
    path,
    keep_default_na=False,
)

## Preprocessing and Harness Objects

In [None]:
# column pickers used by the column transformers below:
# object-dtype columns vs. numeric columns
string_selector = make_column_selector(
    dtype_include='object',
)
number_selector = make_column_selector(
    dtype_include='number',
    dtype_exclude='object',
)

In [None]:
# one-hot encode the string columns (categories not seen during fit are
# ignored at transform time) and standardize the numeric columns
preprocessing = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
    (StandardScaler(), number_selector),
)

In [None]:
# SMOTE over-sampler; random_state is pinned so the resampling is repeatable
sm = SMOTE(random_state=2021)

In [None]:
# creating f1 scorer (the object comes from src.functions and is handed to the
# Harness constructor in the next cell)
f1 = f.f1

In [None]:
# Harness from src.classes, constructed with the f1 scorer; its report() method
# and history attribute are used throughout the rest of the notebook
modeling = c.Harness(f1)

## Baseline Model

Dropping engineered features to establish the baseline score.

In [None]:
# features engineered during eda
# NOTE(review): this name is not referenced again in the visible cells
feature_list = [
    'Weapon Flag',
    'Reported Year',
    'Reported Month',
    'Day of Month',
    'Day of Week',
    'Reported Hour',
    'Beat Flag',
]

In [None]:
# columns passed to f.framer as the exclusion list for the baseline frame:
# first the other dropped columns, then the seven features listed as
# engineered during eda
excluded = [
    'Weapon Type',
    'Officer ID',
    'Initial Call Type',
    'Final Call Type',
    'Call Type',
    'Officer Squad',
    'Arrest Flag',
    'Frisk Flag',
    'Sector',
    'Beat',
] + [
    'Weapon Flag',
    'Reported Year',
    'Reported Month',
    'Day of Month',
    'Day of Week',
    'Reported Hour',
    'Beat Flag',
]

In [None]:
# build the baseline frame with f.framer, passing an empty feature list and the
# exclusion list above
# NOTE(review): framer's exact semantics live in src.functions -- confirm there
baseline_df = f.framer(df, [], excluded)

In [None]:
# preview the first two rows
baseline_df.head(2)

In [None]:
# column dtypes and non-null counts of the baseline frame
baseline_df.info()

In [None]:
# f.Xy returns two objects, bound here to X and y
# (presumably feature matrix and target -- confirm in src.functions)
X, y = f.Xy(baseline_df)

In [None]:
# f.splitter returns four objects, bound here as train/test parts of X and y
# (split size and seed, if any, are set inside src.functions -- confirm)
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
# sanity check: fit the transformer on the training features and display the
# transformed result as the cell output (this leaves `preprocessing` fitted)
preprocessing.fit_transform(X_train)

In [None]:
# k-nearest-neighbors classifier with scikit-learn's default settings
KNN = KNeighborsClassifier()

In [None]:
# baseline pipeline: preprocessing followed by KNN, no resampling
# each step is an unfitted clone, so fitting this pipeline never alters the
# `preprocessing` / `KNN` objects that other cells reuse, and later fits of
# those objects never alter this pipeline
baseline = make_pipeline(clone(preprocessing), clone(KNN))

In [None]:
# record the baseline pipeline on the training data through the harness; the
# last two arguments are text labels (presumably entry name and notes -- confirm
# against c.Harness.report)
modeling.report(baseline, X_train, y_train, 'Baseline', 'KNN - Baseline Features')

In [None]:
# baseline with SMOTE inserted between preprocessing and the classifier
# (imblearn pipeline, so the sampler can be used as a step)
# each step is an unfitted clone: without cloning, this pipeline would hold the
# very same transformer / sampler / KNN objects as the other pipelines, and
# fitting any one of them would silently refit the steps of the others
baseline_sm = make_sm_pipeline(clone(preprocessing), clone(sm), clone(KNN))

In [None]:
# record the SMOTE baseline on the training data through the harness
modeling.report(
    baseline_sm,
    X_train,
    y_train,
    'Baseline_SM',
    'Baseline Model w/ SMOTE',
)

Scores with OID (`Officer ID`) included:
0.4344416, 0.44728326, 0.44414215, 0.42996076, 0.43894279

In [None]:
# f.confusion(baseline, X_train, y_train)

In [None]:
# f.confusion(baseline_sm, X_train, y_train)

In [None]:
# modeling.history

## KNN With Engineered Features
Engineered columns from original EDA:

'Weapon Flag', 'Reported Year', 'Reported Month', 'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag'

In [None]:
# 'Sector' and 'Beat' followed by the seven features listed as engineered
# during eda
feature_list1 = [
    'Sector', 'Beat',
    'Weapon Flag', 'Reported Year', 'Reported Month',
    'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag',
]

In [None]:
# exclusion list for the engineered frame; unlike the baseline exclusion list,
# 'Officer ID' is not listed here
excluded1 = [
    'Weapon Type', 'Initial Call Type', 'Final Call Type', 'Call Type',
    'Officer Squad', 'Arrest Flag', 'Frisk Flag',
]

In [None]:
# build the frame for the feature experiments with f.framer, passing the
# feature list and exclusion list defined above
# NOTE(review): framer's exact semantics live in src.functions -- confirm there
engineered = f.framer(df, feature_list1, excluded1)

In [None]:
# run the feature_test helper from src.functions on the engineered frame with
# the KNN estimator and the feature list
# NOTE(review): presumably evaluates the listed features one by one -- confirm
f.feature_test(engineered, KNN, feature_list1)

## Logistic Regression Baseline

In [None]:
# features engineered during eda
# NOTE(review): same seven names as feature_list; this name is not referenced
# again in the visible cells
feature_list2 = [
    'Weapon Flag',
    'Reported Year',
    'Reported Month',
    'Day of Month',
    'Day of Week',
    'Reported Hour',
    'Beat Flag',
]

In [None]:
# exclusion list for the logistic-regression baseline frame
# (same seventeen names as the KNN baseline exclusion list)
excluded2 = [
    'Weapon Type',
    'Officer ID',
    'Initial Call Type',
    'Final Call Type',
    'Call Type',
    'Officer Squad',
    'Arrest Flag',
    'Frisk Flag',
    'Sector',
    'Beat',
    'Weapon Flag',
    'Reported Year',
    'Reported Month',
    'Day of Month',
    'Day of Week',
    'Reported Hour',
    'Beat Flag',
]

In [None]:
# rebuild the baseline frame for the logistic-regression section (empty feature
# list, exclusion list above)
baseline_df1 = f.framer(df, [], excluded2)

In [None]:
# preview the first two rows
baseline_df1.head(2)

In [None]:
# rebinds X and y (overwriting the values from the KNN section) from the
# logistic-regression baseline frame
X, y = f.Xy(baseline_df1)

In [None]:
# rebinds the train/test names from the new X and y
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
# sanity check: fit the transformer on the training features and display the
# transformed result as the cell output
preprocessing.fit_transform(X_train)

In [None]:
# logistic regression with the solver's iteration cap raised to 1000
LR = LogisticRegression(max_iter=1000)

In [None]:
# logistic-regression baseline: preprocessing followed by LR, no resampling
# each step is an unfitted clone, so this pipeline does not share mutable
# fitted state with the other pipelines built from `preprocessing` and `LR`
baseline_LR = make_pipeline(clone(preprocessing), clone(LR))

In [None]:
# record the logistic-regression baseline on the training data
modeling.report(
    baseline_LR,
    X_train,
    y_train,
    'LR Baseline',
    'Log Reg Baseline',
)

In [None]:
# logistic regression with SMOTE between preprocessing and the classifier
# each step is an unfitted clone: without cloning, this pipeline would hold the
# same `preprocessing`, `sm` and `LR` objects as the other pipelines, so fitting
# one of them would silently refit the steps of the others
lr_smote = make_sm_pipeline(clone(preprocessing), clone(sm), clone(LR))

In [None]:
# record the SMOTE logistic-regression pipeline on the training data
modeling.report(lr_smote, X_train, y_train, 'LR SMOTE', 'Log Reg w/ SMOTE')

In [None]:
# same feature_test call as in the KNN section, now with the logistic
# regression estimator
f.feature_test(engineered, LR, feature_list1)

In [None]:
# confusion helper from src.functions applied to the SMOTE logistic pipeline on
# the training data (presumably draws a confusion matrix -- confirm)
f.confusion(lr_smote, X_train, y_train)

## Adding Polynomial Features

In [None]:
# degree-3 polynomial expansion of the numeric columns followed by scaling,
# plus one-hot encoding of the string columns
#   * the expansion and the scaler are chained, so the higher-order terms are
#     standardized too (previously they reached the model unscaled)
#   * the numeric columns enter once (previously both the polynomial branch and
#     a separate StandardScaler branch emitted them)
#   * include_bias=False drops the constant column; LogisticRegression fits its
#     own intercept
preprocessing2 = make_column_transformer(
    (make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                   StandardScaler()),
     number_selector),
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
)

In [None]:
# sanity check: fit the polynomial transformer on the training features and
# display the transformed result as the cell output
preprocessing2.fit_transform(X_train)

In [None]:
# logistic regression on polynomial features, with SMOTE
# each step is an unfitted clone, so fitting this pipeline (whose feature count
# differs from the plain pipelines) cannot refit the `LR` / `sm` objects that
# other pipelines were built from
lr_poly = make_sm_pipeline(clone(preprocessing2), clone(sm), clone(LR))

In [None]:
# record the polynomial logistic-regression pipeline on the training data
modeling.report(lr_poly, X_train, y_train, 'LR Poly', 'Log Reg Polys w/ SMOTE')

In [None]:
# display the harness's history attribute (the entries recorded so far)
modeling.history

## Decision Tree

In [None]:
# features engineered during eda
# NOTE(review): same seven names as feature_list; this name is not referenced
# again in the visible cells
feature_list3 = [
    'Weapon Flag', 'Reported Year', 'Reported Month', 'Day of Month',
    'Day of Week', 'Reported Hour', 'Beat Flag',
]

In [None]:
# exclusion list for the tree-model baseline frame: first the other dropped
# columns, then the seven features listed as engineered during eda
excluded3 = [
    'Weapon Type', 'Officer ID',
    'Initial Call Type', 'Final Call Type', 'Call Type',
    'Officer Squad', 'Arrest Flag', 'Frisk Flag',
    'Sector', 'Beat',
] + [
    'Weapon Flag', 'Reported Year', 'Reported Month', 'Day of Month',
    'Day of Week', 'Reported Hour', 'Beat Flag',
]

In [None]:
# rebuild the baseline frame for the tree-model sections (empty feature list,
# exclusion list above)
baseline_df2 = f.framer(df, [], excluded3)

In [None]:
# preview the first two rows
baseline_df2.head(2)

In [None]:
# rebinds X and y (overwriting the earlier values) from the tree-model
# baseline frame
X, y = f.Xy(baseline_df2)

In [None]:
# rebinds the train/test names from the new X and y
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
# sanity check: fit the transformer on the training features and display the
# transformed result as the cell output
preprocessing.fit_transform(X_train)

In [None]:
# decision tree with a pinned seed (the same 2021 used for SMOTE), so that
# re-running the notebook reproduces the same tree and the same scores;
# without random_state, tie-breaking between equally good splits is random
DT = DecisionTreeClassifier(random_state=2021)

In [None]:
# decision tree with SMOTE between preprocessing and the classifier
# each step is an unfitted clone, so this pipeline does not share mutable
# fitted state with the other pipelines built from `preprocessing` and `sm`
dt_pipeline = make_sm_pipeline(clone(preprocessing), clone(sm), clone(DT))

In [None]:
# record the decision-tree pipeline on the training data
modeling.report(dt_pipeline, X_train, y_train, 'Tree CLF', 'Tree w/ SMOTE')

## Random Forest

In [None]:
# random forest with a pinned seed (the same 2021 used for SMOTE); without
# random_state the bootstrap samples and feature subsets differ on every run,
# so the reported scores would not be reproducible
RF = RandomForestClassifier(random_state=2021)

In [None]:
# random forest with SMOTE between preprocessing and the classifier
# each step is an unfitted clone: `RF`, `preprocessing` and `sm` are reused for
# the sub-split pipeline further down, and without cloning a fit of either
# pipeline would silently refit the other
rf_pipeline = make_sm_pipeline(clone(preprocessing), clone(sm), clone(RF))

In [None]:
# record the random-forest pipeline on the training data
modeling.report(rf_pipeline, X_train, y_train, 'RandomForest', 'Forest w/ SMOTE')

In [None]:
# confusion helper from src.functions applied to the random-forest pipeline on
# the training data (presumably draws a confusion matrix -- confirm)
f.confusion(rf_pipeline, X_train, y_train)

In [None]:
# class balance of the training target, as proportions
y_train.value_counts(normalize=True)

In [None]:
# split the training data once more into a sub-train / sub-test pair
(Xs_train, Xs_test,
 ys_train, ys_test) = f.splitter(X_train, y_train)

In [None]:
# sanity check: fit the transformer on the sub-train features and display the
# transformed result as the cell output
preprocessing.fit_transform(Xs_train)

In [None]:
# random-forest + SMOTE pipeline for the sub-split experiment
# each step is an unfitted clone: rf_pipeline was built from the same
# `preprocessing`, `sm` and `RF` objects, so without cloning, fitting this
# pipeline on the sub-split would also refit rf_pipeline's steps
sub_pipeline = make_sm_pipeline(clone(preprocessing), clone(sm), clone(RF))

In [None]:
# record the sub-split pipeline on the sub-train data
# the labels are distinct from the full-training 'RandomForest' entry and from
# the sub-test entry, so each run can be identified in modeling.history
modeling.report(sub_pipeline, Xs_train, ys_train, 'RandomForest SubTrain',
                'Forest on SubSplit (train)')

In [None]:
# confusion helper applied to the sub-split pipeline on the sub-train data
f.confusion(sub_pipeline, Xs_train, ys_train)

In [None]:
# record the sub-split pipeline on the sub-test data
# the labels are distinct from the full-training 'RandomForest' entry and from
# the sub-train entry, so each run can be identified in modeling.history
# NOTE(review): this goes through the same report() call used for training
# data -- confirm whether report() re-fits/cross-validates on these rows rather
# than scoring the model already fitted on the sub-train data
modeling.report(sub_pipeline, Xs_test, ys_test, 'RandomForest SubTest',
                'Forest on SubSplit (test)')

In [None]:
# confusion helper applied to the sub-split pipeline on the sub-test data
# NOTE(review): confirm whether f.confusion re-fits the pipeline on the data it
# is given; if so this is not an evaluation of the sub-train fit
f.confusion(sub_pipeline, Xs_test, ys_test)