In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# (e.g. the `src` package imported below) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [122]:
# setting project path
import os
import sys

# Project root expressed relative to the notebook's working directory (two
# levels up). It is reused further down to build the path to the data file.
gparent = os.path.join(os.pardir, os.pardir)

# Make the project root importable so the `from src import ...` lines below can
# resolve (assumes `src/` sits two levels above this notebook).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if gparent not in sys.path:
    sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_sm_pipeline

from src import classes as c
from src import functions as f
import matplotlib.pyplot as plt
import seaborn as sns

# setting style
# NOTE(review): each of the three calls below adjusts global plotting defaults,
# so their relative order may matter — confirm before reordering them.
sns.set_theme('talk')  # 'talk' is passed positionally (presumably seaborn's plotting context — confirm)
plt.style.use('fivethirtyeight')
sns.set_palette(palette='Blues_r')

## Importing the Data

In [107]:
# Processed modeling table, located relative to the project root (`gparent`).
path = os.path.join(gparent, 'data/processed', 'modeling.csv')
# keep_default_na=False switches off pandas' default NA-string parsing, so blank
# fields should arrive as empty strings rather than NaN (the blanks are visible in
# the head() output further down).
df = pd.read_csv(path, keep_default_na=False)

## Preprocessing and Harness Objects

In [125]:
# Column selectors used by the column transformer below; they pick columns by
# dtype from whichever frame the transformer is fitted on.
# Object-dtype columns are routed to the one-hot encoder.
string_selector = make_column_selector(dtype_include='object')
# Numeric columns are routed to the scaler. dtype_exclude='object' looks redundant
# next to dtype_include='number' but is kept as written.
# NOTE(review): identifier-like columns such as 'Officer ID' would be scaled as
# ordinary numbers if they are read with a numeric dtype — confirm this is intended.
number_selector = make_column_selector(dtype_include='number', dtype_exclude='object')

In [126]:
# Shared preprocessing step: one (transformer, column selector) pair per dtype group.
preprocessing = make_column_transformer(
    # string columns -> one-hot encoding; handle_unknown='ignore' is passed to the encoder
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
    # numeric columns -> min-max scaling
    (MinMaxScaler(), number_selector),
)

In [127]:
# SMOTE oversampler with a fixed seed; it is placed between preprocessing and
# the classifier in the imblearn pipelines built further down.
sm = SMOTE(random_state=2021)

In [139]:
# Short aliases for objects defined in the project's src.functions module
# (presumably scorer callables — confirm in src/functions.py).
# NOTE(review): `precision` is not referenced again in the cells visible here.
precision = f.precision
f1 = f.f1

In [140]:
# Evaluation harness from src.classes, constructed with the `f1` object above.
# NOTE(review): the reports below print "average percision" even though f1 is
# passed here — the label appears to come from the harness itself; confirm which
# metric is actually being reported.
modeling = c.Harness(f1)

## Baseline Model

The engineered features are dropped here so that the baseline score is computed on the original cleaned columns only.

In [141]:
# Engineered feature names to leave out of the baseline frame
excluded = [
    'Weapon Flag', 'Reported Year', 'Reported Month', 'Day of Month',
    'Day of Week', 'Reported Hour', 'Beat Flag',
]
column_list = df.columns
# Keep every remaining column, in its original order
cols = [name for name in column_list if name not in excluded]

In [142]:
# Baseline frame: the cleaned data restricted to `cols`, i.e. without the
# engineered features listed in `excluded` (the target column is kept).
basedf = df[cols]

In [143]:
# Quick look at the first two rows to confirm which columns survived the filter.
basedf.head(2)

Unnamed: 0,Target,Subject Age Group,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,1,,,7500,1984,M,Black,Asian,Male,,,,SOUTH PCT 1ST W - ROBERT,0,0,South,O,O2
1,0,,,5670,1965,M,White,,,,,,,0,0,,,


In [144]:
# Target vector and feature matrix for the baseline model
y = basedf['Target']
X = basedf.drop(columns=['Target'])

In [145]:
# Hold out a test set; stratifying on y keeps the class balance the same in both
# parts, and the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=2021,
)

In [146]:
# Sanity check of the preprocessing step on the training split; the displayed
# sparse matrix shows the encoded shape (35220 x 1929 in the recorded run).
# NOTE(review): this fits the same `preprocessing` object that the pipelines
# below are built from — confirm the harness re-fits it inside each fold.
preprocessing.fit_transform(X_train)

<35220x1929 sparse matrix of type '<class 'numpy.float64'>'
	with 538724 stored elements in Compressed Sparse Row format>

In [147]:
# k-nearest-neighbours classifier with all hyperparameters left at their defaults.
clf = KNeighborsClassifier()

In [148]:
# Baseline pipeline: shared preprocessing followed by the default KNN classifier
# (no resampling step).
baseline = make_pipeline(preprocessing, clf)

In [149]:
# Score the baseline pipeline on the training split. report() prints the mean
# and the five individual scores (presumably one per CV fold) and also returns
# the score array, so the trailing `;` stops the notebook from echoing the same
# array a second time.
modeling.report(baseline, X_train, y_train, 'Baseline', 'KNN w/ all base cols');

Baseline has an average percision of 0.5560659437218776
[0.5630137  0.55649241 0.55013369 0.54348562 0.5672043 ]


array([0.5630137 , 0.55649241, 0.55013369, 0.54348562, 0.5672043 ])

## Feature Engineering

In [18]:
# List the columns currently in `df` before choosing which ones to drop.
df.columns

Index(['Target', 'Subject Age Group', 'Stop Resolution', 'Weapon Type',
       'Officer ID', 'Officer YOB', 'Officer Gender', 'Officer Race',
       'Subject Perceived Race', 'Subject Perceived Gender',
       'Initial Call Type', 'Final Call Type', 'Call Type', 'Officer Squad',
       'Arrest Flag', 'Frisk Flag', 'Precinct', 'Sector', 'Beat'],
      dtype='object')

In [4]:
# Columns left out of the reduced feature set
excluded = [
    'Stop Resolution', 'Weapon Type', 'Officer ID',
    'Initial Call Type', 'Final Call Type', 'Officer Squad',
    'Precinct', 'Sector', 'Call Type', 'Arrest Flag',
    'Frisk Flag', 'Beat',
]
column_list = df.columns
# Keep every remaining column, in its original order
cols = [name for name in column_list if name not in excluded]

In [5]:
# Overwrites `df` with the reduced column set.
# NOTE(review): the frame as originally loaded is no longer reachable after this
# line, so re-running the Baseline section afterwards would see different columns.
df = df[cols]

In [4]:
# Feature matrix and target rebuilt from the reduced frame.
X = df.drop('Target', axis=1)
y = df['Target']

# Re-split after rebuilding X and y. Without this, X_train / y_train would still
# be the split made from the baseline columns, and every report below would
# silently evaluate on the old feature set when the notebook is run top to bottom.
# Same seed and stratification as the baseline split.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=2021,
                                                    stratify=y
                                                   )

In [15]:
# Same two steps as the `baseline` pipeline (shared preprocessing + default KNN),
# built as a separate pipeline object for this section.
# NOTE(review): which columns it is scored on depends on the X_train in scope
# when the report cell runs.
pl1 = make_pipeline(preprocessing, clf)

In [16]:
# Score the KNN pipeline on the X_train / y_train currently in scope.
# report() prints the scores and also returns them; the trailing `;` keeps the
# notebook from echoing the returned array a second time.
modeling.report(pl1, X_train, y_train, 'baseline');

baseline has an average percision of 0.2857927777164726
[0.30049875 0.26337449 0.28936743 0.303233   0.27249022]


array([0.30049875, 0.26337449, 0.28936743, 0.303233  , 0.27249022])

In [17]:
# imblearn pipeline: preprocessing -> SMOTE oversampling -> KNN. The imblearn
# variant of make_pipeline is used because the pipeline contains a sampler step.
pl2 = make_sm_pipeline(preprocessing, sm, clf)

In [18]:
# Score the SMOTE + KNN pipeline. report() prints the scores and also returns
# them; the trailing `;` suppresses the duplicate echo of the returned array.
modeling.report(pl2, X_train, y_train, 'baseline2', 'SMOTE');

baseline2 has an average percision of 0.2807899868164585
[0.28070796 0.26982012 0.27421603 0.28720721 0.29199861]


array([0.28070796, 0.26982012, 0.27421603, 0.28720721, 0.29199861])

In [19]:
# Logistic regression with default settings. In the recorded run the solver hits
# its iteration limit (see the warnings printed by the report cell below).
clf2 = LogisticRegression()

In [20]:
# imblearn pipeline: preprocessing -> SMOTE oversampling -> default logistic regression.
pl3 = make_sm_pipeline(preprocessing, sm , clf2)

In [21]:
# Score the SMOTE + logistic regression pipeline. The default iteration limit is
# reached during fitting, so convergence warnings are expected in the output.
# report() prints the scores and also returns them; the trailing `;` suppresses
# the duplicate echo of the returned array.
modeling.report(pl3, X_train, y_train, 'log reg');

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

log reg has an average percision of 0.29090060868182843
[0.29624872 0.28493365 0.29127726 0.28791037 0.29413305]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.29624872, 0.28493365, 0.29127726, 0.28791037, 0.29413305])

In [22]:
# Logistic regression with a raised iteration limit; the recorded run with this
# setting shows no convergence warnings.
# NOTE(review): 275 is a hand-picked value — record how it was chosen.
clf3 = LogisticRegression(max_iter=275)

In [23]:
# imblearn pipeline: preprocessing -> SMOTE oversampling -> logistic regression
# with the raised iteration limit.
pl4 = make_sm_pipeline(preprocessing, sm, clf3)

In [24]:
# Score the logistic regression pipeline that uses the raised iteration limit.
# The note string now names the actual hyperparameter (`max_iter`; it was
# misspelled as `max_inter`), and the trailing `;` suppresses the duplicate echo
# of the array that report() has already printed.
modeling.report(pl4, X_train, y_train, 'log reg', 'max_iter=275');

log reg has an average percision of 0.2907362078890012
[0.29632485 0.28426528 0.29127726 0.28764982 0.29416383]


array([0.29632485, 0.28426528, 0.29127726, 0.28764982, 0.29416383])