In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# setting project path
import os
import sys

gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import make_scorer, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_sm_pipeline

from src import classes as c
from src import functions as f
import matplotlib.pyplot as plt
import seaborn as sns

# setting style
sns.set_theme('talk')
plt.style.use('fivethirtyeight')
sns.set_palette(palette='Blues_r')

## Importing the Data

In [3]:
path = os.path.join(gparent, 'data/processed', 'modeling.csv')
df = pd.read_csv(path, keep_default_na=False)

## Preprocessing and Harness Objects

In [4]:
string_selector = make_column_selector(dtype_include='object')
number_selector = make_column_selector(dtype_include='number', dtype_exclude='object')

In [5]:
preprocessing = make_column_transformer((OneHotEncoder
                                         (handle_unknown='ignore'),string_selector),
                                          (StandardScaler(), number_selector))

In [6]:
sm = SMOTE(random_state=2021)

In [7]:
# creating f1 scorer
f1 = f.f1

In [8]:
modeling = c.Harness(f1)

## Baseline Model

Dropping engineered features to establish the baseline score.

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
excluded = ['Weapon Type', 'Initial Call Type', 'Final Call Type',
             'Call Type', 'Officer Squad', 'Arrest Flag',
             'Frisk Flag',  'Sector', 'Beat', 'Weapon Flag',
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
baseline_df = f.framer(df, [], excluded)

In [None]:
baseline_df.head(2)

In [None]:
X, y = f.Xy(baseline_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
preprocessing.fit_transform(X_train);

In [None]:
KNN = KNeighborsClassifier()

In [None]:
baseline = make_pipeline(preprocessing, KNN)

In [None]:
modeling.report(baseline, X_train, y_train, 'Baseline', 'KNN - Baseline Features')

In [None]:
baseline_sm = make_sm_pipeline(preprocessing, sm, KNN)

In [None]:
modeling.report(baseline_sm, X_train, y_train, 'Baseline_SM',\
                'Baseline Model w/ SMOTE')

In [None]:
# f.confusion(baseline, X_train, y_train)

In [None]:
# f.confusion(baseline_sm, X_train, y_train)

In [None]:
modeling.history

## KNN With Engineered Features
Engineered columns from original EDA:

'Weapon Flag', 'Reported Year', 'Reported Month', 'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag'

In [20]:
feature_list1 = ['Sector',
 'Beat',
 'Weapon Flag',
 'Reported Year',
 'Reported Month',
 'Day of Month',
 'Day of Week',
 'Reported Hour',
 'Beat Flag']

In [21]:
excluded1 = ['Weapon Type',
 'Initial Call Type',
 'Final Call Type',
 'Call Type',
 'Officer Squad',
 'Arrest Flag',
 'Frisk Flag',]

In [22]:
engineered = f.framer(df, feature_list1, excluded1)

In [23]:
f.feature_test(engineered, KNN, feature_list1)

NameError: name 'KNN' is not defined

## Logistic Regression

In [10]:
# features engineered during eda
feature_list2 = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [11]:
excluded2 = ['Weapon Type', 'Initial Call Type', 'Final Call Type',
             'Call Type', 'Officer Squad', 'Arrest Flag',
             'Frisk Flag',  'Sector', 'Beat', 'Weapon Flag',
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag']

In [12]:
baseline_df1 = f.framer(df, [], excluded2)

In [13]:
baseline_df1.head(2)

Unnamed: 0,Target,Subject Age Group,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Precinct
0,1,,7500,1984,M,Black,Asian,Male,South
1,0,,5670,1965,M,White,,,


In [14]:
X, y = f.Xy(baseline_df1)

In [15]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [16]:
preprocessing.fit_transform(X_train);

In [17]:
LR = LogisticRegression(max_iter=1000)

In [18]:
baseline_LR = make_pipeline(preprocessing, LR)

In [19]:
modeling.report(baseline_LR, X_train, y_train, 'LR Baseline',\
                'Log Reg Baseline')

LR Baseline has an average F1 of 0.22351468310274009


array([0.2244898 , 0.22162884, 0.21597096, 0.24177778, 0.21370604])

In [27]:
lr_smote = make_sm_pipeline(preprocessing, sm, LR)

In [28]:
modeling.report(lr_smote, X_train, y_train, 'LR SMOTE', 'Log Reg w/ SMOTE')

LR SMOTE has an average F1 of 0.4895653871836288


array([0.48577154, 0.47143142, 0.49570944, 0.49880287, 0.49611167])

In [26]:
f.feature_test(engineered, LR, feature_list1)

LogisticRegression(max_iter=1000) Sector Model has an average F1 of 0.49372091753402997
LogisticRegression(max_iter=1000) Beat Model has an average F1 of 0.490860176583577


KeyboardInterrupt: 