In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import pickle
import warnings
# Suppress every warning category so cell outputs are not cluttered.
warnings.simplefilter('ignore')

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 200

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the pickled object produced upstream (the '../data/NYPD_EDA.pkl' file).
# NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted data.
with open('../data/NYPD_EDA.pkl', mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

# TRAIN AND HOLDOUT DATAFRAMES

In [3]:
# Positional 90/10 split: the first 90% of rows are used for training and the
# remaining 10% are held out. `cutoff` must be an int because it is used as a
# positional slice bound.
cutoff = int(round(len(data) * 9 / 10))

# `.iloc` slices are half-open, so the row at position `cutoff` belongs to the
# holdout set only. (Label-based `.loc[:cutoff]` / `.loc[cutoff:]` are inclusive
# on both ends, which would place that row in both frames and would also
# depend on the index being exactly 0..n-1.)
train = data.iloc[:cutoff]
holdout = data.iloc[cutoff:]

# Storing target variable from holdout dataframe so we can compare it
actuals = holdout['SUSPECT_ARRESTED_FLAG']

# Drop SUSPECT_ARRESTED_FLAG column from holdout dataframe so that we can predict its values.
# `drop` returns a new frame, so the slice of `data` is not mutated in place.
holdout = holdout.drop(['SUSPECT_ARRESTED_FLAG'], axis=1)

# FEATURE SELECTION

In [4]:
# Candidate feature columns offered to feature selection, one name per line.
# Order is significant: it determines the column order of the training matrix.
columns = [
    # officer / encounter flags
    'OFFICER_EXPLAINED_STOP_FLAG',
    'OFFICER_IN_UNIFORM_FLAG',
    'ID_CARD_IDENTIFIES_OFFICER_FLAG',
    'SHIELD_IDENTIFIES_OFFICER_FLAG',
    'VERBAL_IDENTIFIES_OFFICER_FLAG',
    'FRISKED_FLAG',
    'SEARCHED_FLAG',
    'OTHER_CONTRABAND_FLAG',
    # BACKGROUND_*
    'BACKGROUND_VIOLENT_CRIME',
    'BACKGROUND_SUSPECT_KNOWN_CARRY_WPN',
    # SUSPECTS_ACTIONS_*
    'SUSPECTS_ACTIONS_CASING_FLAG',
    'SUSPECTS_ACTIONS_CONCEALED_POSSESSION_WEAPON_FLAG',
    'SUSPECTS_ACTIONS_DRUG_TRANSACTIONS_FLAG',
    'SUSPECTS_ACTIONS_IDENTIFY_CRIME_PATTERN_FLAG',
    'SUSPECTS_ACTIONS_LOOKOUT_FLAG',
    'SUSPECTS_ACTIONS_PROXIMITY_TO_SCENE_FLAG',
    # SEARCH_BASIS_*
    'SEARCH_BASIS_ADMISSION_FLAG',
    'SEARCH_BASIS_HARD_OBJECT_FLAG',
    'SEARCH_BASIS_INCIDENTAL_TO_ARREST_FLAG',
    'SEARCH_BASIS_OTHER_FLAG',
    'SEARCH_BASIS_OUTLINE_FLAG',
    # lower-case engineered columns (presumably built during EDA; verify upstream)
    'weapon',
    'endangerment',
    'physical_harm',
    'theft',
    'property_damage',
    'trespass',
    'auto',
    'sexual_assault',
    'drugs',
    'robbery',
    'other',
    'prostitution',
    'weapon_found',
    'aggressive_force',
    # demeanor_*
    'demeanor_nervous',
    'demeanor_antagonistic',
    'demeanor_surprised',
    'demeanor_other',
    'demeanor_tired',
    'demeanor_upset',
    'demeanor_inebriated',
    'demeanor_unstable',
    'demeanor_fled',
    # appearance_*
    'appearance_bag',
    'appearance_facial_hair',
    'appearance_scars',
    'appearance_braids',
    'appearance_hoodie',
    'appearance_hat',
    'appearance_du_rag',
    'appearance_tattoo',
    'appearance_mask',
    'appearance_piercing',
    'appearance_unusual',
    'appearance_menacing',
    'appearance_bandana',
    'appearance_wounds',
    # STOP_WAS_INITIATED_*
    'STOP_WAS_INITIATED_Based on C/W on Scene',
    # ISSUING_OFFICER_RANK_*
    'ISSUING_OFFICER_RANK_CPT',
    'ISSUING_OFFICER_RANK_DI',
    'ISSUING_OFFICER_RANK_DT1',
    'ISSUING_OFFICER_RANK_DT2',
    'ISSUING_OFFICER_RANK_DT3',
    'ISSUING_OFFICER_RANK_DTS',
    'ISSUING_OFFICER_RANK_INS',
    'ISSUING_OFFICER_RANK_LSA',
    'ISSUING_OFFICER_RANK_LT',
    'ISSUING_OFFICER_RANK_POF',
    'ISSUING_OFFICER_RANK_POM',
    'ISSUING_OFFICER_RANK_SDS',
    'ISSUING_OFFICER_RANK_SGT',
    'ISSUING_OFFICER_RANK_SSA',
    # SUPERVISING_OFFICER_RANK_*
    'SUPERVISING_OFFICER_RANK_CPT',
    'SUPERVISING_OFFICER_RANK_DI',
    'SUPERVISING_OFFICER_RANK_LCD',
    'SUPERVISING_OFFICER_RANK_LSA',
    'SUPERVISING_OFFICER_RANK_LT',
    'SUPERVISING_OFFICER_RANK_POF',
    'SUPERVISING_OFFICER_RANK_POM',
    'SUPERVISING_OFFICER_RANK_SDS',
    'SUPERVISING_OFFICER_RANK_SGT',
    'SUPERVISING_OFFICER_RANK_SSA',
    # JURISDICTION_CODE_*
    'JURISDICTION_CODE_(null)',
    'JURISDICTION_CODE_A',
    'JURISDICTION_CODE_H',
    'JURISDICTION_CODE_P',
    'JURISDICTION_CODE_T',
    # stop_duration_group_*
    'stop_duration_group_Brief',
    'stop_duration_group_Short',
    'stop_duration_group_Medium',
    'stop_duration_group_Long',
    # SUSPECT_BODY_BUILD_TYPE_*
    'SUSPECT_BODY_BUILD_TYPE_HEA',
    'SUSPECT_BODY_BUILD_TYPE_MED',
    'SUSPECT_BODY_BUILD_TYPE_MUSC',
    'SUSPECT_BODY_BUILD_TYPE_THN',
    'SUSPECT_BODY_BUILD_TYPE_unknown',
    # SUSPECT_SEX_*
    'SUSPECT_SEX_MALE',
    # SUSPECT_RACE_DESCRIPTION_*
    'SUSPECT_RACE_DESCRIPTION_AMERICAN INDIAN/ALASKAN NATIVE',
    'SUSPECT_RACE_DESCRIPTION_ASIAN / PACIFIC ISLANDER',
    'SUSPECT_RACE_DESCRIPTION_BLACK',
    'SUSPECT_RACE_DESCRIPTION_BLACK HISPANIC',
    'SUSPECT_RACE_DESCRIPTION_WHITE',
    'SUSPECT_RACE_DESCRIPTION_WHITE HISPANIC',
    # suspect_height_group_*
    'suspect_height_group_very_short',
    'suspect_height_group_short',
    'suspect_height_group_average',
    'suspect_height_group_tall',
    # suspect_weight_group_*
    'suspect_weight_group_very_light',
    'suspect_weight_group_light',
    'suspect_weight_group_average',
    'suspect_weight_group_heavy',
    'suspect_weight_group_very_heavy',
    # suspect_age_group_*
    'suspect_age_group_child',
    'suspect_age_group_teen',
    'suspect_age_group_young_adult',
    'suspect_age_group_adult',
    'suspect_age_group_middle_aged',
    'suspect_age_group_older',
    'suspect_age_group_senior',
    # STOP_LOCATION_PATROL_BORO_NAME_*
    'STOP_LOCATION_PATROL_BORO_NAME_PBBN',
    'STOP_LOCATION_PATROL_BORO_NAME_PBBS',
    'STOP_LOCATION_PATROL_BORO_NAME_PBBX',
    'STOP_LOCATION_PATROL_BORO_NAME_PBMN',
    'STOP_LOCATION_PATROL_BORO_NAME_PBMS',
    'STOP_LOCATION_PATROL_BORO_NAME_PBQN',
    'STOP_LOCATION_PATROL_BORO_NAME_PBQS',
    'STOP_LOCATION_PATROL_BORO_NAME_PBSI',
    # DAY2_*
    'DAY2_Friday',
    'DAY2_Monday',
    'DAY2_Saturday',
    'DAY2_Sunday',
    'DAY2_Thursday',
    'DAY2_Tuesday',
]


# 'DAY2_Wednesday',

# 'stop_time_period_midday',
# 'stop_time_period_late_night',
#  'stop_time_period_early_am',
#  'stop_time_period_morning',

#  'stop_time_period_afternoon',
#  'stop_time_period_evening',
#  'stop_time_period_night',

# PREDICT

## Logistic Regression, Finding Optimized Features

In [5]:
# Feature matrix (all candidate columns) and target taken from the training frame.
all_X = train[columns]
all_y = train["SUSPECT_ARRESTED_FLAG"]

# Recursive feature elimination with 10-fold cross validation around a
# logistic regression; the boolean support mask marks the columns it kept.
lr = LogisticRegression()
selector = RFECV(estimator=lr, cv=10).fit(all_X, all_y)
kept_mask = selector.support_
lr_optimized_columns = all_X.columns[kept_mask]

In [6]:
# Print the selected column names, then display how many were kept.
print(lr_optimized_columns)
lr_optimized_columns.size

Index(['OFFICER_EXPLAINED_STOP_FLAG', 'OFFICER_IN_UNIFORM_FLAG',
       'ID_CARD_IDENTIFIES_OFFICER_FLAG', 'VERBAL_IDENTIFIES_OFFICER_FLAG',
       'FRISKED_FLAG', 'SEARCHED_FLAG', 'OTHER_CONTRABAND_FLAG',
       'BACKGROUND_VIOLENT_CRIME', 'BACKGROUND_SUSPECT_KNOWN_CARRY_WPN',
       'SUSPECTS_ACTIONS_CASING_FLAG',
       'SUSPECTS_ACTIONS_CONCEALED_POSSESSION_WEAPON_FLAG',
       'SUSPECTS_ACTIONS_DRUG_TRANSACTIONS_FLAG',
       'SUSPECTS_ACTIONS_IDENTIFY_CRIME_PATTERN_FLAG',
       'SEARCH_BASIS_ADMISSION_FLAG', 'SEARCH_BASIS_HARD_OBJECT_FLAG',
       'SEARCH_BASIS_INCIDENTAL_TO_ARREST_FLAG', 'SEARCH_BASIS_OTHER_FLAG',
       'SEARCH_BASIS_OUTLINE_FLAG', 'weapon', 'endangerment', 'theft',
       'property_damage', 'trespass', 'auto', 'sexual_assault', 'drugs',
       'robbery', 'other', 'weapon_found', 'aggressive_force',
       'demeanor_nervous', 'demeanor_antagonistic', 'demeanor_tired',
       'demeanor_upset', 'demeanor_inebriated', 'demeanor_fled',
       'appearance_bag', 'ap

89

## Testing accuracy on train dataframe

In [7]:
# Restrict the training matrix to the columns chosen by feature selection.
all_y = train["SUSPECT_ARRESTED_FLAG"]
all_X = train[lr_optimized_columns]

# 10-fold cross-validated scores of the logistic regression on the train frame.
scores = cross_val_score(estimator=lr, X=all_X, y=all_y, cv=10)
print(scores)

# Mean fold score; left as the last expression so the notebook displays it.
accuracy = scores.mean()
accuracy

[0.87641607 0.90937178 0.91246138 0.89392379 0.88259526 0.88453608
 0.87731959 0.87113402 0.86907216 0.87731959]


0.8854149723422553

## Running the model

In [8]:
# Fit a fresh logistic regression on the selected training columns, then
# predict on the same columns of the holdout frame.
holdout_features = holdout[lr_optimized_columns]
lr = LogisticRegression().fit(all_X, all_y)
holdout_predictions = lr.predict(holdout_features)

## Accuracy scores and error metrics

In [9]:
def compute_error(actuals, predictions):
    """Return the accuracy of `predictions` against `actuals`.

    Despite the name, the value returned is sklearn's accuracy score
    (higher is better), not an error rate.
    """
    accuracy = metrics.accuracy_score(y_true=actuals, y_pred=predictions)
    return accuracy

def compute_false_negatives(actuals, predictions):
    """Return the false negative rate: share of actual 1s that were predicted 0.

    Inputs are compared position by position. Any pandas index on either
    argument is ignored, so a Series of actuals with a non-default index can
    safely be compared against a Series or array of predictions.

    Parameters
    ----------
    actuals : array-like of 0/1 labels (list, ndarray or Series)
    predictions : array-like of 0/1 labels, same length as `actuals`

    Returns
    -------
    float
        missed positives / actual positives, or NaN when `actuals` contains
        no positives (the rate is undefined in that case).

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # `.values` strips the index so no label alignment can introduce NaN rows.
    actual_values = pd.Series(actuals).values
    predicted_values = pd.Series(predictions).values
    if len(actual_values) != len(predicted_values):
        raise ValueError("actuals and predictions must have the same length")

    is_positive = actual_values == 1
    positive_count = int(is_positive.sum())
    if positive_count == 0:
        return float("nan")

    missed_count = int((is_positive & (predicted_values == 0)).sum())
    return missed_count / positive_count

def compute_false_positives(actuals, predictions):
    """Return the false positive rate: share of actual 0s that were predicted 1.

    Inputs are compared position by position. Any pandas index on either
    argument is ignored, so a Series of actuals with a non-default index can
    safely be compared against a Series or array of predictions.

    Parameters
    ----------
    actuals : array-like of 0/1 labels (list, ndarray or Series)
    predictions : array-like of 0/1 labels, same length as `actuals`

    Returns
    -------
    float
        false alarms / actual negatives, or NaN when `actuals` contains no
        negatives (the rate is undefined in that case).

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # `.values` strips the index so no label alignment can introduce NaN rows.
    actual_values = pd.Series(actuals).values
    predicted_values = pd.Series(predictions).values
    if len(actual_values) != len(predicted_values):
        raise ValueError("actuals and predictions must have the same length")

    is_negative = actual_values == 0
    negative_count = int(is_negative.sum())
    if negative_count == 0:
        return float("nan")

    false_alarm_count = int((is_negative & (predicted_values == 1)).sum())
    return false_alarm_count / negative_count


# Score the holdout predictions against the stored holdout targets.
acc = compute_error(actuals, holdout_predictions)
fn = compute_false_negatives(actuals, holdout_predictions)
fp = compute_false_positives(actuals, holdout_predictions)

# Each metric is rendered as a percentage with one decimal place, one per line.
report_lines = [
    "Accuracy Score: {0:.1%}".format(acc),
    "False Negatives: {0:.1%}".format(fn),
    "False Positives: {0:.1%}".format(fp),
]
print("\n".join(report_lines))

Accuracy Score: 89.0%
False Negatives: 30.0%
False Positives: 3.6%


# CONCLUSION
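Seeing ~89% accuracy on both the train dataset (mean of 10-fold cross-validation) and the holdout dataset is highly encouraging. A false negative rate of 30% (the share of actual arrests that the model failed to predict) suggests room for improvement. However, a sub-4% false positive rate (the share of non-arrests that the model predicted as arrests) implies that few wrongful arrests (from the perspective of the NYPD) would be made.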

To boost accuracy and minimize the false negative and false positive rates, I had to remove the `stop_time_period` features from the candidate list from which the optimized features were selected.