In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# setting project path
import os
import sys

gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
import seaborn as sns

# setting style
sns.set_theme('talk')
plt.style.use('fivethirtyeight')
sns.set_palette(palette='Blues_r')

In [3]:
path = os.path.join(gparent, 'data/processed', 'cleaned1.csv')
df = pd.read_csv(path, keep_default_na=False)

In [4]:
df.head(2)

Unnamed: 0,Target,Subject Age Group,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,...,Precinct,Sector,Beat,Weapon Flag,Reported Year,Reported Month,Day of Month,Day of Week,Reported Hour,Beat Flag
0,1,,Arrest,,7500,1984,M,Black,Asian,Male,...,South,O,O2,0,2015,10,16,4,11,1
1,0,,Field Contact,,5670,1965,M,White,,,...,,,,0,2015,3,19,3,7,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46960 entries, 0 to 46959
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Target                    46960 non-null  int64 
 1   Subject Age Group         46960 non-null  object
 2   Stop Resolution           46960 non-null  object
 3   Weapon Type               46960 non-null  object
 4   Officer ID                46960 non-null  object
 5   Officer YOB               46960 non-null  int64 
 6   Officer Gender            46960 non-null  object
 7   Officer Race              46960 non-null  object
 8   Subject Perceived Race    46960 non-null  object
 9   Subject Perceived Gender  46960 non-null  object
 10  Initial Call Type         46960 non-null  object
 11  Final Call Type           46960 non-null  object
 12  Call Type                 46960 non-null  object
 13  Officer Squad             46960 non-null  object
 14  Arrest Flag           

In [6]:
column_list = df.columns
excluded = ['Stop Resolution', 'Weapon Type', 'Officer ID',
          'Initial Call Type', 'Final Call Type', 'Officer Squad',
            'Precinct', 'Sector', 'Beat']
cols = [x for x in column_list if x not in excluded]

In [7]:
cols

['Target',
 'Subject Age Group',
 'Officer YOB',
 'Officer Gender',
 'Officer Race',
 'Subject Perceived Race',
 'Subject Perceived Gender',
 'Call Type',
 'Arrest Flag',
 'Frisk Flag',
 'Weapon Flag',
 'Reported Year',
 'Reported Month',
 'Day of Month',
 'Day of Week',
 'Reported Hour',
 'Beat Flag']

In [8]:
test_df = df[cols]

In [9]:
test_df.head()

Unnamed: 0,Target,Subject Age Group,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Call Type,Arrest Flag,Frisk Flag,Weapon Flag,Reported Year,Reported Month,Day of Month,Day of Week,Reported Hour,Beat Flag
0,1,,1984,M,Black,Asian,Male,,0,0,0,2015,10,16,4,11,1
1,0,,1965,M,White,,,,0,0,0,2015,3,19,3,7,0
2,0,,1961,M,White,White,Male,,0,0,0,2015,3,21,5,19,0
3,0,,1963,M,White,,,,0,0,0,2015,4,1,2,4,0
4,0,,1977,M,White,Black,Male,,0,0,0,2015,4,3,4,0,0


In [10]:
X = test_df.drop('Target', axis=1)
y = test_df['Target']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=2021,
                                                    stratify=y
                                                   )

In [12]:
string_selector = make_column_selector(dtype_include='object')
number_selector = make_column_selector(dtype_include='number', dtype_exclude='object')

In [13]:
preprocessing = make_column_transformer((OneHotEncoder
                                         (handle_unknown='ignore'),string_selector),
                                          (MinMaxScaler(), number_selector))

In [14]:
preprocessing.fit_transform(X_train)

array([[0.        , 1.        , 0.        , ..., 0.5       , 0.08695652,
        1.        ],
       [1.        , 0.        , 0.        , ..., 1.        , 0.73913043,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.43478261,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.5       , 0.13043478,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.7826087 ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.5       , 0.04347826,
        0.        ]])

In [15]:
preprocessing.transformers_[0][1].get_feature_names()

array(['x0_1 - 17', 'x0_18 - 25', 'x0_26 - 35', 'x0_36 - 45',
       'x0_46 - 55', 'x0_56 and Above', 'x0_NA', 'x1_F', 'x1_M', 'x1_N',
       'x2_Asian', 'x2_Black', 'x2_Hispanic', 'x2_Multi-Racial', 'x2_NA',
       'x2_N_American', 'x2_P_Islander', 'x2_Unknown', 'x2_White',
       'x3_Asian', 'x3_Black', 'x3_Hispanic', 'x3_Multi-Racial', 'x3_NA',
       'x3_N_American', 'x3_Other', 'x3_P_Islander', 'x3_Unknown',
       'x3_White', 'x4_Female', 'x4_Gender Diverse', 'x4_Male', 'x4_NA',
       'x4_Undetermined', 'x4_Unknown', 'x5_911',
       'x5_ALARM CALL (NOT POLICE ALARM)', 'x5_NA', 'x5_ONVIEW',
       'x5_TELEPHONE OTHER, NOT 911', 'x5_TEXT MESSAGE'], dtype=object)

In [16]:
sm = SMOTE(random_state=2021, sampling_strategy=.5)
clf = LogisticRegression()
pipeline = make_pipeline(preprocessing, sm, clf)

In [30]:
cross_val_score( pipeline, X_train, y_train, scoring='precision')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.66882591, 0.65571076, 0.65080645, 0.66013072, 0.64855967])

In [18]:
preprocessing2 = make_column_transformer((OneHotEncoder
                                         (handle_unknown='ignore'),string_selector),
                                          (StandardScaler(), number_selector))

In [19]:
pipeline2 = make_pipeline(preprocessing2, sm, clf)

In [31]:
cross_val_score(pipeline2, X_train, y_train, scoring='precision')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.66587302, 0.64331723, 0.64743083, 0.66235864, 0.63555913])

In [21]:
clf2 = LogisticRegression(max_iter=275)

In [22]:
pipeline3 = make_pipeline(preprocessing, sm, clf2)

In [29]:
cross_val_score(pipeline3, X_train, y_train, scoring='precision')

array([0.67017828, 0.65571076, 0.65028203, 0.6593317 , 0.64696223])

In [32]:
pipeline4 = make_pipeline(preprocessing2, sm, clf2)

In [33]:
cross_val_score(pipeline4, X_train, y_train, scoring='precision')

array([0.66587302, 0.64331723, 0.64822134, 0.66235864, 0.63555913])

## Validating on the Testing Data 

In [34]:
cross_val_score(pipeline4, X_test, y_test, scoring='precision')

array([0.66101695, 0.58458244, 0.6617284 , 0.67810026, 0.68280872])