In [None]:
%load_ext autoreload
%autoreload 2

## Package Imports & Setup 

In [None]:
# setting project path
import os
import sys

gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import make_scorer, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_sm_pipeline

from boruta import BorutaPy

from src import classes as c
from src import functions as f
import matplotlib.pyplot as plt
import seaborn as sns

# setting style
sns.set_theme('talk')
plt.style.use('fivethirtyeight')
sns.set_palette(palette='Blues_r')

## Importing the Data

In [None]:
# Location of the processed modeling dataset, relative to the project root.
path = os.path.join(gparent, 'data/processed', 'modeling.csv')
# keep_default_na=False stops pandas from converting its default missing-value
# markers to NaN, so the string 'NA' stays a literal value in the frame; later
# cells filter on it explicitly (e.g. df['Officer Race'] != 'NA').
df = pd.read_csv(path, keep_default_na=False)

## Preprocessing and Harness Objects

In [None]:
# Route columns by dtype: object columns are one-hot encoded,
# numeric columns are scaled.
string_selector = make_column_selector(dtype_include='object')
number_selector = make_column_selector(dtype_include='number', dtype_exclude='object')

# Two preprocessors that differ only in the scaler applied to numeric columns.
preprocessing_ss = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
    (StandardScaler(), number_selector),
)
preprocessing_mm = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), string_selector),
    (MinMaxScaler(), number_selector),
)

# Seeded oversampler reused by the SMOTE pipelines built in later cells.
sm = SMOTE(random_state=2021)

In [None]:
# creating f1 scorer
f1 = f.f1

In [None]:
modeling = c.Harness(f1)

## Baseline Dataframe

Dropping engineered features to establish the baseline score.

In [None]:
df.head()

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# excluding Officer ID, 'Final Call Type', Frisk Flag & time features
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
baseline_df = f.framer(df, [], excluded)

In [None]:
baseline_df.head(2)

In [None]:
X, y = f.Xy(baseline_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

## Baseline Logistic Regression

In [None]:
LR = LogisticRegression(max_iter=1000, random_state = 2021, verbose=2)

In [None]:
baseline = make_pipeline(preprocessing_mm, LR)

In [None]:
baseline.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline, X_train, y_train, 'LR Baseline', 'Log Reg, min/max')

## Sub-Splitting the Training Data

In [None]:
f.subsplit_test(X_train, y_train, baseline)

## Baseline Logistic Regression w/smote

In [None]:
baseline_smote = make_sm_pipeline(preprocessing_mm, sm, LR)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR Baseline SMOTE', 'Log Reg min/max')

In [None]:
modeling.history

## Sub-Splitting the Training Data

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Dropping Subject and Officer Race Nulls

In [None]:
# Keep only rows where both race columns are known. 'NA' is a literal string
# here because the CSV was read with keep_default_na=False.
race_known = (df['Subject Perceived Race'] != 'NA') & (df['Officer Race'] != 'NA')
# .copy() makes the filtered frame independent of the original, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
df = df[race_known].copy()

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
# excluding Officer ID, 'Final Call Type', Frisk Flag & time features
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag']

In [None]:
dropped_df = f.framer(df, [], excluded)

In [None]:
dropped_df.head(2)

In [None]:
X, y = f.Xy(dropped_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

## Baseline Logistic Regression w/SMOTE on Dropped Nulls Data

In [None]:
baseline_smote = make_sm_pipeline(preprocessing_mm, sm, LR)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR Baseline SMOTE', 'LR, mm, Dropped Nulls')

In [None]:
modeling.history

## Sub-Splitting the Training Data

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Investigating the Data More Closely

In [None]:
cols = baseline_df.columns
for col in cols:
    print(f'{df[col].value_counts()}\n')

In [None]:
baseline_df['Officer YOB'].describe()

## Creating & Testing Officer Age Feature
Creating `Officer Age` feature, dropping invalid ages and `Officer YOB` feature.

In [None]:
# Officer age at the time of the report.
officer_age = df['Reported Year'] - df['Officer YOB']
# Ages above 100 are treated as invalid: blank them out, then drop those rows.
df['Officer Age'] = officer_age.where(officer_age <= 100)
df = df.dropna(subset=['Officer Age'])

In [None]:
df.drop('Officer YOB', axis=1, inplace=True)

In [None]:
age_df = f.framer(df, [], excluded)

In [None]:
age_df.head(2)

In [None]:
age_df.info()

In [None]:
X, y = f.Xy(age_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE', 'Dropped Nulls, mm, officer age')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Creating Racial Affinity Feature

In [None]:
# 1 when the subject's perceived race equals the officer's race, otherwise 0.
# A single vectorised comparison replaces the per-row chained lookups
# (df[col][row]), which did two label lookups for every row.
df['Racial Affinity'] = (df['Subject Perceived Race'] == df['Officer Race']).astype(int)

In [None]:
df.head(3)

In [None]:
# excluding Officer ID, 'Final Call Type', and Frisk Flag
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age']

In [None]:
RA_df = f.framer(df, [], excluded)

In [None]:
RA_df.head(2)

In [None]:
X, y = f.Xy(RA_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE ', 'Dropped Nulls, mm, racial affinity')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Gender Affinity Feature

In [None]:
df['Officer Gender'].value_counts()

In [None]:
# Expand the single-letter officer codes to the 'Male' / 'Female' labels so the
# column can be compared with 'Subject Perceived Gender'.
# Only known codes are mapped. The previous `else 'Female'` branch relabelled
# every value other than 'M' as 'Female', and re-running the cell turned the
# already-converted 'Male' rows into 'Female' as well.
# NOTE(review): assumes 'F' is the code for female officers — confirm against
# the value_counts() output of the previous cell.
df['Officer Gender'] = df['Officer Gender'].replace({'M': 'Male', 'F': 'Female'})

In [None]:
# 1 when the subject's perceived gender equals the officer's gender, otherwise 0.
# A single vectorised comparison replaces the per-row chained lookups
# (df[col][row]), which did two label lookups for every row.
df['Gender Affinity'] = (df['Subject Perceived Gender'] == df['Officer Gender']).astype(int)

In [None]:
df.head(2)

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age',
            'Racial Affinity']

In [None]:
GA_df = f.framer(df, [], excluded)

In [None]:
GA_df.head(2)

In [None]:
X, y = f.Xy(GA_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE ', 'Dropped Nulls, mm, gender affinity')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

### Weapon Type

In [None]:
# checking weapon types
df['Weapon Type'].unique()

In [None]:
NONE= ['None', 'NA','None/Not Applicable']

GUN = ['Firearm Other', 'Handgun', 'Other Firearm',
        'Rifle', 'Firearm (unk type)', 'Firearm',
        'Shotgun', 'Automatic Handgun']

KNIFE = ['Lethal Cutting Instrument', 'Knife/Cutting/Stabbing Instrument']

OTHER = ['Club, Blackjack, Brass Knuckles',
         'Fire/Incendiary Device', 'Blunt Object/Striking Implement',
         'Mace/Pepper Spray', 'Club', 'Taser/Stun Gun',
         'Brass Knuckles','Blackjack',
         'Personal Weapons (hands, feet, etc.)']

In [None]:
df['Weapon Bins'] = df['Weapon Type'].copy()

In [None]:
df['Weapon Bins']

In [None]:
# Collapse the raw weapon descriptions into four coarse bins using one
# value -> bin lookup built from the lists defined above.
weapon_bins = {
    **dict.fromkeys(NONE, 'NONE'),
    **dict.fromkeys(GUN, 'GUN'),
    **dict.fromkeys(KNIFE, 'KNIFE'),
    **dict.fromkeys(OTHER, 'OTHER'),
}
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df['Weapon Bins'] is chained assignment and is not guaranteed to update df.
# This also removes the loop variable that shadowed the built-in `type`.
df['Weapon Bins'] = df['Weapon Bins'].replace(weapon_bins)

In [None]:
df['Weapon Bins'].value_counts(normalize=True)

## 94% of Subjects Are Unarmed and Only 1% Have a Gun.

In [None]:
df.head(2)

In [None]:
excluded = ['Weapon Type', 'Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age',
            'Racial Affinity', 'Gender Affinity']

In [None]:
WB_df = f.framer(df, [], excluded)

In [None]:
WB_df.head(2)

In [None]:
X, y = f.Xy(WB_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE ', 'Dropped Nulls, mm, Weapon Bins')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Testing Weapon Flag
The weapon flag (0 = no weapon) has been excluded from tests up to this point.

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Type', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age',
            'Racial Affinity', 'Gender Affinity', 'Weapon Bins']

In [None]:
WF_df = f.framer(df, [], excluded)

In [None]:
WF_df.head(2)

In [None]:
X, y = f.Xy(WF_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE ', 'Dropped Nulls, mm, weapon flag')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Binarizing Initial Call Type
0 = no call information provided.

In [None]:
df['Initial Call Bin'] = df['Initial Call Type'].copy()

In [None]:
df['Initial Call Bin'] = df['Initial Call Bin'].replace('NA', 0)
df['Initial Call Bin'] = df['Initial Call Bin'].map(lambda x: 1 if x!=0 else 0)

In [None]:
df['Initial Call Bin'].value_counts()

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age',
            'Racial Affinity', 'Gender Affinity', 'Weapon Bins', 'Initial Call Type']

In [None]:
ICB_df = f.framer(df, [], excluded)

In [None]:
ICB_df.head(2)

In [None]:
X, y = f.Xy(ICB_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE ', 'Dropped Nulls, mm, Initial Call Binary')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Binning Call Types

In [None]:
types = df['Call Type'].unique()

In [None]:
replace_list = [x for x in types if (x!='911') and (x!='ONVIEW')]

In [None]:
df['Call Type Bins'] = df['Call Type'].copy()

In [None]:
df['Call Type Bins'] = df['Call Type Bins'].replace(replace_list, 'OTHER')

In [None]:
df['Call Type Bins'].value_counts(normalize=True)

In [None]:
df.info()

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age',
            'Racial Affinity', 'Gender Affinity', 'Weapon Bins', 
            'Initial Call Bin','Call Type']

In [None]:
BCT_df = f.framer(df, [], excluded)

In [None]:
BCT_df.head(2)

In [None]:
X, y = f.Xy(BCT_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE ', 'Dropped Nulls, mm, call type bins')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Binning the Squad Data
The squads look like they can be binned into precinct groups and a training group.

In [None]:
df['Officer Squad Bins'] = df['Officer Squad'].copy()

In [None]:
# Bin each squad by the first character of its name.
# NOTE(review): x[0] keeps a single character only, so squads whose names begin
# with the same letter end up in the same bin — confirm this is the intended
# precinct grouping.
df['Officer Squad Bins'] = df['Officer Squad Bins'].map(lambda x: x[0])

In [None]:
df['Officer Squad Bins'].value_counts(normalize=True)

In [None]:
# Share of stops per squad bin; bins below 2% are folded into 'OTHER'.
proportions = df['Officer Squad Bins'].value_counts(normalize=True)
cutoff_idx = proportions.index[proportions < .02]
# One vectorised pass, assigned back to the column. The previous loop called
# replace(..., inplace=True) on df['Officer Squad Bins'], which is chained
# assignment and is not guaranteed to update df.
df['Officer Squad Bins'] = df['Officer Squad Bins'].where(
    ~df['Officer Squad Bins'].isin(cutoff_idx), 'OTHER')

In [None]:
df['Officer Squad Bins'].value_counts()

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age',
            'Racial Affinity', 'Gender Affinity', 'Weapon Bins', 
            'Initial Call Bin','Call Type Bins', 'Officer Squad']

In [None]:
BSQ_df = f.framer(df, [], excluded)

In [None]:
BSQ_df.head(2)

In [None]:
X, y = f.Xy(BSQ_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE', 'Dropped Nulls, mm, Squad Bins')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Binning Officer Race

In [None]:
df['Officer Race Bins'] = df['Officer Race'].copy()

In [None]:
# Mark white officers with 1. The result is assigned back because
# replace(..., inplace=True) on a selected column is chained assignment and is
# not guaranteed to write through to df.
df['Officer Race Bins'] = df['Officer Race Bins'].replace('White', 1)

In [None]:
df['Officer Race Bins'] = df['Officer Race Bins'].apply(lambda x: 0 if x!=1 else 1)

In [None]:
df['Officer Race Bins'].value_counts()

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag', 'Officer Age',
            'Racial Affinity', 'Gender Affinity', 'Weapon Bins', 
            'Initial Call Bin','Call Type Bins', 'Officer Squad Bins',
            'Officer Race']

In [None]:
ORB_df = f.framer(df, [], excluded)

In [None]:
ORB_df.head(2)

In [None]:
X, y = f.Xy(ORB_df)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train, 'LR SMOTE', 'Dropped Nulls, mm, Officer Race Bins')

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Testing on Top Features w/o Time

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag',
            'Racial Affinity', 'Weapon Bins', 'Officer Race',
            'Initial Call Bin','Call Type Bins', 'Officer Squad Bins']

In [None]:
df_6 = f.framer(df, [], excluded)

In [None]:
df_6.head(2)

In [None]:
df_6.info()

In [None]:
X, y = f.Xy(df_6)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train,\
                'LR SMOTE', 'Top Features w/o Time Dropped Nulls')

In [None]:
modeling.history

## Testing on Sub-Splits

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Testing on All Features w/o Time

In [None]:
# Dropping Subject Age Nulls
df = df[(df['Subject Perceived Race']!='NA') &\
            (df['Officer Race']!='NA') &\
           (df['Subject Age Group']!='NA')]

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag',
           'Reported Year', 'Reported Month', 'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
# excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
#              'Reported Year', 'Reported Month','Day of Month',
#              'Day of Week', 'Reported Hour', 'Beat Flag',
#             'Racial Affinity', 'Weapon Bins', 
#             'Initial Call Bin','Call Type Bins', 'Officer Squad Bins']

In [None]:
df_7 = f.framer(df, [], excluded)

In [None]:
df_7.head(2)

In [None]:
df_7.info()

In [None]:
X, y = f.Xy(df_7)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train,'LR SMOTE', 'All Features w/o Time Dropped Nulls')

In [None]:
modeling.history

## Testing on Sub-Splits

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Testing on Top Features w/o Time Dropped Subject Age

In [None]:
# Dropping Subject Age Nulls
df = df[(df['Subject Perceived Race']!='NA') &\
            (df['Officer Race']!='NA') &\
           (df['Subject Age Group']!='NA')]

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
             'Reported Year', 'Reported Month','Day of Month',
             'Day of Week', 'Reported Hour', 'Beat Flag',
            'Racial Affinity', 'Weapon Bins', 'Officer Race',
            'Initial Call Bin','Call Type Bins', 'Officer Squad Bins']

In [None]:
df_8 = f.framer(df, [], excluded)

In [None]:
df_8.head(2)

In [None]:
df_8.info()

In [None]:
X, y = f.Xy(df_8)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train,\
                'LR SMOTE', 'Top Features w/o Time Dpd Nulls S,O Race, S Age')

In [None]:
modeling.history

## Testing on Sub-Splits

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Testing on All Features w/o Time Dropped Subject Age

In [None]:
# Dropping Subject Age Nulls
df = df[(df['Subject Perceived Race']!='NA') &\
            (df['Officer Race']!='NA') &\
           (df['Subject Age Group']!='NA')]

In [None]:
# features engineered during eda
feature_list = ['Weapon Flag', 'Reported Year', 'Reported Month',
                'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag',
           'Reported Year', 'Reported Month', 'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
# excluded = ['Officer ID','Final Call Type', 'Frisk Flag','Weapon Flag', 
#              'Reported Year', 'Reported Month','Day of Month',
#              'Day of Week', 'Reported Hour', 'Beat Flag',
#             'Racial Affinity', 'Weapon Bins', 
#             'Initial Call Bin','Call Type Bins', 'Officer Squad Bins']

In [None]:
df_9 = f.framer(df, [], excluded)

In [None]:
df_9.head(2)

In [None]:
df_9.info()

In [None]:
X, y = f.Xy(df_9)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
baseline_smote.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(baseline_smote.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(baseline_smote, X_train, y_train,\
                'LR SMOTE', 'All Features w/o Time  Dpd Nulls S,O Race, S Age')

In [None]:
modeling.history

## Testing on Sub-Splits

In [None]:
f.subsplit_test(X_train, y_train, baseline_smote)

## Grid Searching Parameters

In [None]:
# C = np.logspace(-3, 7, 7)
# penalty = ['l1', 'l2']
# solver = ['liblinear', 'saga', 'lbfgs']

# grid_values = {'logisticregression__C':C, 
#                'logisticregression__penalty':penalty,
#                'logisticregression__solver':solver }

# grid_clf_f1 = GridSearchCV(baseline_smote, param_grid = grid_values,
#                            n_jobs=-1, scoring = 'f1', cv=3, verbose=4)
# grid_clf_f1.fit(X_train, y_train)

In [None]:
# print('Best C:', grid_clf_f1.best_estimator_.get_params()['logisticregression__C'])
# print('Best penalty:', grid_clf_f1.best_estimator_.get_params()['logisticregression__penalty'])
# print('Best solver:', grid_clf_f1.best_estimator_.get_params()['logisticregression__solver'])

In [None]:
# Logistic regression with the hyper-parameters chosen from the grid search above.
LR_tuned = LogisticRegression(C=2.154434690031884,
                            solver='lbfgs', penalty='l2',
                            max_iter=1000, random_state=2021)

pipeline  = make_sm_pipeline(preprocessing_mm, sm, LR_tuned)
pipeline.fit(X_train, y_train)
# A bare expression is only displayed when it is the last statement of a cell,
# so the training score is printed explicitly. Arguments are passed in
# scikit-learn's (y_true, y_pred) order.
print('Training F1:', f1_score(y_train, pipeline.predict(X_train)))
modeling.report(pipeline, X_train, y_train, 'LR_tuned', 'C=2.154434690031884, solver=lbfgs, penalty=l2')

In [None]:
modeling.history

## Testing Tuned Model on Sub-Splits

In [None]:
f.subsplit_test(X_train, y_train, pipeline)

## Random Forest No Smote

In [None]:
RF = RandomForestClassifier(random_state=2021)

In [None]:
rf_pipeline = make_pipeline(preprocessing_mm, RF)

In [None]:
modeling.report(rf_pipeline, X_train, y_train, 'RandomForest', 'Forest No SMOTE')

In [None]:
f.subsplit_test(X_train, y_train, rf_pipeline)

## Random Forest w/ SMOTE

In [None]:
rf_sm_pipeline = make_sm_pipeline(preprocessing_mm, sm, RF)

In [None]:
modeling.report(rf_sm_pipeline, X_train, y_train, 'RandomForest', 'Forest SMOTE')

In [None]:
f.subsplit_test(X_train, y_train, rf_sm_pipeline)

## Tuning RF no SMOTE

In [None]:
# max_depth = [2, 3, 5, None]
# min_samples_split = [2, 5, 10]
# class_weight = ['balanced', 'balanced_subsample']
# max_features = [None, 'auto', 'log2' ]

# grid_values = {'randomforestclassifier__max_features': max_features,
#                'randomforestclassifier__max_depth':max_depth,
#                'randomforestclassifier__min_samples_split':min_samples_split,
#               'randomforestclassifier__class_weight':class_weight}

# grid_rf_f1 = GridSearchCV(rf_pipeline, param_grid = grid_values, scoring = 'f1',
#                           n_jobs=-1, verbose=2, cv=3)
# grid_rf_f1.fit(X_train, y_train)

In [None]:
# print('Best max_features:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__max_features'])
# print('Best max_depth:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__max_depth'])
# print('Best min_samples_split:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__min_samples_split'])
# print('Best class weight:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__class_weight'])

## Tuned Random Forest

In [None]:
RF2 = RandomForestClassifier(criterion='gini',
                            max_depth=5, min_samples_split=2,
                            class_weight='balanced_subsample',
                            random_state=2021)

In [None]:
rf_pipeline = make_pipeline(preprocessing_mm, RF2)

In [None]:
# Adjacent string literals are used for the notes: a backslash continuation
# inside a single literal keeps the next line's indentation in the stored text.
modeling.report(rf_pipeline, X_train, y_train, 'RandomForest No SMOTE',
                'criterion=gini, max_depth=5, min_samples_split=2, '
                'class_weight=balanced_subsample')

In [None]:
modeling.history

## Testing on Subsplits

In [None]:
f.subsplit_test( X_train, y_train, rf_pipeline)

## Tuning RF SMOTE

In [None]:
# rf_pipeline.get_params().keys()

In [None]:
# Hyper-parameter grid for the random forest step of the SMOTE pipeline.
max_depth = [2, 3, 5, None]
min_samples_split = [2, 5, 10]
class_weight = [None, 'balanced', 'balanced_subsample']
# 'sqrt' is what 'auto' meant for RandomForestClassifier; 'auto' is deprecated
# and rejected by newer scikit-learn releases.
max_features = [None, 'sqrt', 'log2' ]

grid_values = {'randomforestclassifier__max_depth':max_depth,
               'randomforestclassifier__min_samples_split':min_samples_split,
               'randomforestclassifier__class_weight':class_weight,
               'randomforestclassifier__max_features': max_features}

# 3-fold search scored on F1, using all available cores.
grid_rf_f1 = GridSearchCV(rf_sm_pipeline, param_grid = grid_values, scoring = 'f1',
                          n_jobs=-1, verbose=2, cv=3)
grid_rf_f1.fit(X_train, y_train)

In [None]:
print('Best max_features:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__max_features'])
print('Best max_depth:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__max_depth'])
print('Best min_samples_split:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__min_samples_split'])
print('Best class weight:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__class_weight'])

## Tuned Random Forest w/SMOTE

In [None]:
RF3 = RandomForestClassifier(criterion='gini',
                            max_depth=5, min_samples_split=2,
                            class_weight='balanced_subsample',
                             max_features=None,
                            random_state=2021)

In [None]:
# Build the SMOTE pipeline with imblearn's make_pipeline and the `sm` step.
# The previous make_pipeline(preprocessing_mm, RF3) had no oversampling step,
# although this model is named, headed and reported as the SMOTE variant.
rf_sm_pipeline = make_sm_pipeline(preprocessing_mm, sm, RF3)

In [None]:
# Adjacent string literals are used for the notes: a backslash continuation
# inside a single literal keeps the next line's indentation in the stored text.
# The notes list the RF3 settings, including max_features=None.
modeling.report(rf_sm_pipeline, X_train, y_train, 'RandomForest SMOTE',
                'criterion=gini, max_depth=5, min_samples_split=2, '
                'class_weight=balanced_subsample, max_features=None')

In [None]:
modeling.history

## Testing on Subsplits

In [None]:
f.subsplit_test( X_train, y_train, rf_sm_pipeline)

## Looking for Noise in the Data

In [None]:
for col in df.columns:
    print(col, df[col].nunique())

## Small DF

In [None]:
# Columns left out of the small frame: identifiers, raw columns that have a
# binned counterpart, and the time features.
# 'Officer Squad' was listed twice; each column now appears once.
excluded = ['Officer ID','Final Call Type', 'Weapon Type', 'Officer Gender',
            'Officer Race', 'Frisk Flag', 'Weapon Flag', 'Officer Squad', 'Initial Call Type',
            'Call Type', 'Reported Year', 'Reported Month',
            'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
df_10 = f.framer(df, [], excluded)

In [None]:
df_10.head(2)

In [None]:
X, y = f.Xy(df_10)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

In [None]:
# checking the small df
for col in df_10.columns:
    print(col, df_10[col].nunique())

## Binning Subject Race

In [None]:
df['Subject Perceived Race'].unique()

In [None]:
df['Subject Perceived Race'].value_counts(normalize=True)

In [None]:
# Encode the two largest groups as 0 (White) and 1 (Black). The result is
# assigned back because replace(..., inplace=True) on a selected column is
# chained assignment and is not guaranteed to update df.
df['Subject Perceived Race'] = (df['Subject Perceived Race']
                                .replace('White', 0)
                                .replace('Black', 1))

In [None]:
# Every remaining value (anything that is not already 0 or 1) is coded as 3.
types = df['Subject Perceived Race'].unique()
replace_list = [x for x in types if (x!=0) and (x!=1)]
# Assigned back instead of inplace=True, which is chained assignment on a
# selected column and is not guaranteed to update df.
df['Subject Perceived Race'] = df['Subject Perceived Race'].replace(replace_list, 3)

In [None]:
df['Subject Perceived Race'].value_counts(normalize=True)

## Binning Subject Gender

In [None]:
df['Subject Perceived Gender'].unique()

In [None]:
df['Subject Perceived Gender'].value_counts(normalize=True)

In [None]:
# Encode 'Male' as 0 and 'Female' as 1. The result is assigned back because
# replace(..., inplace=True) on a selected column is chained assignment and is
# not guaranteed to update df.
df['Subject Perceived Gender'] = (df['Subject Perceived Gender']
                                  .replace('Male', 0)
                                  .replace('Female', 1))

In [None]:
# Every remaining value (anything that is not already 0 or 1) is coded as 3.
types = df['Subject Perceived Gender'].unique()
replace_list = [x for x in types if (x!=0) and (x!=1)]
# Assigned back instead of inplace=True, which is chained assignment on a
# selected column and is not guaranteed to update df.
df['Subject Perceived Gender'] = df['Subject Perceived Gender'].replace(replace_list, 3)

In [None]:
df['Subject Perceived Gender'].value_counts(normalize=True)

## Small DF With Simple Data

In [None]:
# Columns left out of the simplified frame: identifiers, raw columns that have
# a binned counterpart, and the time features.
# 'Officer Squad' was listed twice; each column now appears once.
excluded = ['Officer ID','Final Call Type', 'Weapon Type', 'Officer Gender',
            'Officer Race', 'Frisk Flag', 'Weapon Flag', 'Officer Squad', 'Initial Call Type',
            'Call Type', 'Reported Year', 'Reported Month',
            'Day of Month', 'Day of Week', 'Reported Hour']

In [None]:
df_11 = f.framer(df, [], excluded)

In [None]:
df_11.head(2)

In [None]:
X, y = f.Xy(df_11)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(X,y)

## Tuning Models on Simplified Data

## LR Simple Data

In [None]:
# Inverse regularisation strengths to try, log-spaced from 1e-4 to 1e2.
C = np.logspace(-4, 2, 10)

# The grid is split by solver so that only supported combinations are fitted:
# lbfgs does not support the l1 penalty, so pairing it with 'l1' only
# produced failed fits.
grid_values = [
    {'logisticregression__C': C,
     'logisticregression__penalty': ['l1', 'l2'],
     'logisticregression__solver': ['liblinear', 'saga']},
    {'logisticregression__C': C,
     'logisticregression__penalty': ['l2'],
     'logisticregression__solver': ['lbfgs']},
]

# 3-fold search scored on F1, using all available cores.
grid_clf_f1 = GridSearchCV(baseline_smote, param_grid = grid_values,
                           n_jobs=-1, scoring = 'f1', cv=3, verbose=4)
grid_clf_f1.fit(X_train, y_train)

In [None]:
print('Best C:', grid_clf_f1.best_estimator_.get_params()['logisticregression__C'])
print('Best penalty:', grid_clf_f1.best_estimator_.get_params()['logisticregression__penalty'])
print('Best solver:', grid_clf_f1.best_estimator_.get_params()['logisticregression__solver'])

In [None]:
# Logistic regression with the hyper-parameters chosen for the simplified data.
LR_tuned = LogisticRegression(C=.01,
                            solver='liblinear', penalty='l2',
                            max_iter=1000, random_state=2021)

pipeline  = make_sm_pipeline(preprocessing_mm, sm, LR_tuned)
pipeline.fit(X_train, y_train)
# A bare expression is only displayed when it is the last statement of a cell,
# so the training score is printed explicitly. Arguments are passed in
# scikit-learn's (y_true, y_pred) order.
print('Training F1:', f1_score(y_train, pipeline.predict(X_train)))
modeling.report(pipeline, X_train, y_train, 'LR_tuned', 'C=.01, solver=liblinear, penalty=l2')

In [None]:
modeling.history

## Testing Tuned Model on Sub-Splits

In [None]:
f.subsplit_test(X_train, y_train, pipeline)

## RF Simple Data

In [None]:
# Hyper-parameter grid for the random forest step, searched on the simplified data.
max_depth = [2, 3, 5, None]
min_samples_split = [2, 5, 10]
class_weight = [None, 'balanced', 'balanced_subsample']
# 'sqrt' is what 'auto' meant for RandomForestClassifier; 'auto' is deprecated
# and rejected by newer scikit-learn releases.
max_features = [None, 'sqrt', 'log2' ]

grid_values = {'randomforestclassifier__max_depth':max_depth,
               'randomforestclassifier__min_samples_split':min_samples_split,
               'randomforestclassifier__class_weight':class_weight,
               'randomforestclassifier__max_features': max_features}

# 3-fold search scored on F1, using all available cores.
grid_rf_f1 = GridSearchCV(rf_sm_pipeline, param_grid = grid_values, scoring = 'f1',
                          n_jobs=-1, verbose=2, cv=3)
grid_rf_f1.fit(X_train, y_train)

In [None]:
print('Best max_features:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__max_features'])
print('Best max_depth:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__max_depth'])
print('Best min_samples_split:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__min_samples_split'])
print('Best class weight:', grid_rf_f1.best_estimator_.get_params()['randomforestclassifier__class_weight'])

## Retuned Random Forest w/SMOTE

In [None]:
RF3 = RandomForestClassifier(criterion='gini',
                            max_depth=5, min_samples_split=5,
                            class_weight='balanced_subsample',
                             max_features=None,
                            random_state=2021)

In [None]:
# Build the SMOTE pipeline with imblearn's make_pipeline and the `sm` step.
# The previous make_pipeline(preprocessing_mm, RF3) had no oversampling step,
# although this model is named, headed and reported as the SMOTE variant.
rf_sm_pipeline = make_sm_pipeline(preprocessing_mm, sm, RF3)

In [None]:
# The notes now match the retuned RF3 settings (min_samples_split=5,
# max_features=None); the previous text still said min_samples_split=2.
# Adjacent string literals avoid the indentation that a backslash continuation
# inside a single literal would store in the text.
modeling.report(rf_sm_pipeline, X_train, y_train, 'RF Retuned SMOTE',
                'criterion=gini, max_depth=5, min_samples_split=5, '
                'class_weight=balanced_subsample, max_features=None')

In [None]:
modeling.history

## Testing on Subsplits

In [None]:
f.subsplit_test( X_train, y_train, rf_sm_pipeline)

## AdaBoost

In [None]:
ada = AdaBoostClassifier(random_state=2021)

ada_pipeline = make_sm_pipeline(preprocessing_mm, sm, ada)

In [None]:
ada_pipeline.fit(X_train, y_train)

In [None]:
# checking cross val scores
modeling.report(ada_pipeline, X_train, y_train,\
                'AdaBoost', 'Simple Data')

In [None]:
#scoring the model
f1_score(ada_pipeline.predict(X_train), y_train)

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, ada_pipeline)

## AdaBoost with LR

In [None]:
ada = AdaBoostClassifier(base_estimator=LR_tuned, random_state=2021)

adaLR_pipeline = make_sm_pipeline(preprocessing_mm, sm, ada)

In [None]:
adaLR_pipeline.fit(X_train, y_train)

In [None]:
# checking cross val scores
# Report the AdaBoost-over-logistic-regression pipeline fitted in the previous
# cell. The earlier name `adaRF_pipeline` is never defined and raised NameError.
modeling.report(adaLR_pipeline, X_train, y_train,
                'AdaBoostLR', 'Simple Data')

In [None]:
#scoring the model
f1_score(adaLR_pipeline.predict(X_train), y_train)

In [None]:
modeling.history

In [None]:
f.subsplit_test(X_train, y_train, adaLR_pipeline)

## Boruta 

In [None]:
X, y = f.Xy(df_7)

In [None]:
X.head()

In [None]:
num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))]

In [None]:
mm = MinMaxScaler()

X[num_cols] = mm.fit_transform(X[num_cols])

In [None]:
X.head(2)

In [None]:
X = pd.get_dummies(X)

In [None]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta 
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=2021)

# fit selector
feat_selector.fit(X.values, y.values)

In [None]:
# Split the columns by Boruta's decision.
# support_ marks the confirmed features and support_weak_ marks the tentative
# ones (undecided, not rejected), so rejected features are those with neither
# mask set.
keep = X.columns[feat_selector.support_].to_list()
tentative = X.columns[feat_selector.support_weak_].to_list()
toss = X.columns[~(feat_selector.support_ | feat_selector.support_weak_)].to_list()
print('features to keep:', keep)
print('tentative features:', tentative)
print('features to toss:', toss)

In [None]:
# Rank of every feature, best first (1 = confirmed, 2 = tentative, higher = rejected).
# ranking_ holds one rank per column, so it is paired with the column names.
# Indexing X.columns with it treated the ranks as column positions.
pd.Series(feat_selector.ranking_, index=X.columns, name='boruta_rank').sort_values()

In [None]:
# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X.values)

In [None]:
# Wrap the filtered array in a frame that keeps the selected column names and
# the original row index, so it stays aligned with y and the features remain
# identifiable downstream.
boruta = pd.DataFrame(X_filtered,
                      columns=X.columns[feat_selector.support_],
                      index=X.index)

In [None]:
X_train, X_test, y_train, y_test = f.splitter(boruta, y)

In [None]:
# NOTE(review): `boruta_selected` is not defined in the visible part of this
# notebook, so this cell raises NameError on a fresh kernel. The report cell
# below labels the run 'RF', so a random forest is assumed here — confirm the
# intended estimator and hyper-parameters.
boruta_selected = RandomForestClassifier(class_weight='balanced', max_depth=5,
                                         random_state=2021)
boruta_selected.fit(X_train, y_train)

In [None]:
#scoring the model
f1_score(boruta_selected.predict(X_train), y_train)

In [None]:
# checking cross val scores
modeling.report(boruta_selected, X_train, y_train, 'RF ', 'Boruta Selected')

In [None]:
modeling.history