# Naive Logistic Regression Civilian Death

In [323]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

In [324]:
civilian_df = pd.read_pickle('./dataframes/civ_inc_df.pkl')

In [325]:
# These are all columns I deemed to be post-incident. They are all categories for which the data was collected once
# the incident was over. The remaining columns are categories for which the data could - and is likely to - be
# information the officer/s perceived upon encountering the involved civilian.

drop_cols = ['Incident_ID', 'Civilian_Officer','Injured', 'Injury_Type', 'Medical_Aid',
             'Injury_From_Preexisting_Condition', 'Received_Force_Location',
             'Order_Of_Force_Specified', 'Order_Of_Force_Str', 'Received_Force',
             'Received_Force_Type', 'DISCHARGE_OF_FIREARM_INDIVIDUAL',
             'DISCHARGE_OF_FIREARM_INCIDENT', 'CIVILIAN_Highest_Charge', 'CIVILIAN_Crime_Qualifier',
             'CIVILIAN_Firearm_Type', 'CIVILIAN_Resistance_Type',
             'CIVILIAN_Custody_Status', 'CIVILIAN_Confirmed_Armed',
             'CIVILIAN_Confirmed_Armed_Weapon', 'OFFICER_Officer_Used_Force', 'OFFICER_Officer_Used_Force_Reason',
             'OFFICER_On_Duty', 'OFFICER_Dress', 'Incident_Date_Str', 'Incident_Time_Str', 'Arrest_Made',
             'Crime_Report_Filed', 'In_Custody_Reason', 'Zip_Code', 'Loc_Latitude', 'Loc_Longitude', 'City',
             'County', 'State']

In [326]:
civilian_df_dropped = civilian_df.drop(drop_cols, axis=1)

dummy_civ_df = pd.get_dummies(civilian_df_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X = dummy_civ_df.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y = dummy_civ_df['Injury_Level_Death']

# Split into thirds: 2/3 training, 1/3 test.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [327]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((557, 80), (275, 80), (557,), (275,))

In [328]:
lr = LogisticRegression()

In [329]:
all_incidents_lr = lr.fit(X_train, y_train)

In [330]:
all_incidents_lr.score(X_test, y_test)

0.79272727272727272

In [313]:
all_incidents_lr.best_estimator_.coef_[0]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.03878763,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [331]:
pd.DataFrame(all_incidents_lr.coef_[0], index=X.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
Contact_Reason_Welfare Check,1.525078
"CIVILIAN_Perceived_Armed_Weapon_Knife, blade, or stabbing instrument",1.096761
"Race_Ethnic_Group_Asian Indian, Hispanic",0.856898
Contact_Reason_Civil Assembly,0.790410
CIVILIAN_Perceived_Armed_Weapon_Firearm,0.776638
"Race_Ethnic_Group_Hispanic, White",0.613375
Age_61-65,0.599811
CIVILIAN_K12_Type_Neither,0.517489
Age_71-75,0.507315
Age_36-40,0.434970


## By Top Counties w Highest UoF Death Incident Rates

### San Joaquin County

Highest death rate, with **30%** of UoF incidents resulting in death.

In [332]:
sanjoaquin = civilian_df[civilian_df['County'] == 'San Joaquin County']

sanjoaquin_dropped = sanjoaquin.drop(drop_cols, axis=1)

dummy_sanjoaquin = pd.get_dummies(sanjoaquin_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X_sanjoa = dummy_sanjoaquin.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y_sanjoa = dummy_sanjoaquin['Injury_Level_Death']

X_train_sanjoa, X_test_sanjoa, y_train_sanjoa, y_test_sanjoa = train_test_split(X_sanjoa, y_sanjoa, test_size=0.33)

In [333]:
X_train_sanjoa.shape, X_test_sanjoa.shape, y_train_sanjoa.shape, y_test_sanjoa.shape

((16, 48), (9, 48), (16,), (9,))

In [334]:
sanjoa_lr = grid_logr.fit(X_train_sanjoa, y_train_sanjoa)

sanjoa_lr.score(X_test_sanjoa, y_test_sanjoa)

0.66666666666666663

In [335]:
pd.DataFrame(sanjoa_lr.best_estimator_.coef_[0], index=X_sanjoa.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
Contact_Reason_Crime in Progress / Investigating Suspicious Persons or Circumstances,4.134837
CIVILIAN_Mental_Status_None,2.969172
Age_31-35,2.791871
CIVILIAN_Perceived_Armed_False,2.210346
Race_Ethnic_Group_Hispanic,1.916657
CIVILIAN_Perceived_Armed_Weapon_na,1.152211
Month,0.54056
Hour,0.095684
Contact_Reason_Call for Service,0.0
CIVILIAN_Perceived_Armed_True,0.0


### Los Angeles County

2nd highest death rate, with **26%** of UoF incidents resulting in death.

In [336]:
losangeles = civilian_df[civilian_df['County'] == 'Los Angeles County']

losangeles_dropped = losangeles.drop(drop_cols, axis=1)

dummy_losangeles = pd.get_dummies(losangeles_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X_la = dummy_losangeles.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y_la = dummy_losangeles['Injury_Level_Death']

X_train_la, X_test_la, y_train_la, y_test_la = train_test_split(X_la, y_la, test_size=0.33)

In [337]:
X_train_la.shape, X_test_la.shape, y_train_la.shape, y_test_la.shape

((155, 59), (77, 59), (155,), (77,))

In [338]:
la_lr = grid_logr.fit(X_train_la, y_train_la)

la_lr.score(X_test_la, y_test_la)

0.7142857142857143

In [339]:
pd.DataFrame(la_lr.best_estimator_.coef_[0], index=X_la.columns).sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
Multiple_Locations,0.0
"CIVILIAN_Perceived_Armed_Weapon_Knife, blade, or stabbing instrument",0.0
CIVILIAN_Mental_Status_Signs of mental disability,0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of alcohol impairment",0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of developmental disability, Signs of drug impairment",0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of drug impairment",0.0
CIVILIAN_Assaulted_Officer_False,0.0
CIVILIAN_Assaulted_Officer_True,0.0
CIVILIAN_Perceived_Armed_False,0.0
CIVILIAN_Perceived_Armed_True,0.0


### Riverside County

3rd highest death rate, with **22%** of UoF incidents resulting in death.

In [340]:
riverside = civilian_df[civilian_df['County'] == 'Riverside County']

riverside_dropped = riverside.drop(drop_cols, axis=1)

dummy_riverside = pd.get_dummies(riverside_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X_rs = dummy_riverside.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y_rs = dummy_riverside['Injury_Level_Death']

X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.33)

In [341]:
X_train_rs.shape, X_test_rs.shape, y_train_rs.shape, y_test_rs.shape

((44, 55), (22, 55), (44,), (22,))

In [342]:
rs_lr = grid_logr.fit(X_train_rs, y_train_rs)

rs_lr.score(X_test_rs, y_test_rs)

0.68181818181818177

In [343]:
pd.DataFrame(rs_lr.best_estimator_.coef_[0], index=X_rs.columns).sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
Multiple_Locations,0.0
"CIVILIAN_Perceived_Armed_Weapon_Knife, blade, or stabbing instrument",0.0
CIVILIAN_Mental_Status_Signs of drug impairment,0.0
"CIVILIAN_Mental_Status_Signs of drug impairment, Signs of alcohol impairment",0.0
CIVILIAN_Mental_Status_Signs of mental disability,0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of drug impairment",0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of drug impairment, Signs of alcohol impairment",0.0
CIVILIAN_Assaulted_Officer_False,0.0
CIVILIAN_Assaulted_Officer_True,0.0
CIVILIAN_Perceived_Armed_False,0.0


# GridSearchCV Logistic Regression

In [344]:
civilian_df_dropped = civilian_df.drop(drop_cols, axis=1)

dummy_civ_df = pd.get_dummies(civilian_df_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X = dummy_civ_df.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y = dummy_civ_df['Injury_Level_Death']

# Split into thirds: 2/3 training, 1/3 test.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [345]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((557, 80), (275, 80), (557,), (275,))

In [346]:
lr = LogisticRegression()

In [347]:
logr_params = {'C' : np.logspace(-3,3,7),
               'penalty' : ['l1', 'l2'],}

In [348]:
grid_logr = GridSearchCV(lr, param_grid=logr_params)

In [349]:
all_incidents_lr = grid_logr.fit(X_train, y_train)

In [350]:
all_incidents_lr.best_score_

0.80969479353680429

In [351]:
all_incidents_lr.score(X_test, y_test)

0.81454545454545457

In [352]:
all_incidents_lr.best_estimator_.coef_[0]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.04183101,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [353]:
pd.DataFrame(all_incidents_lr.best_estimator_.coef_[0], index=X.columns).sort_values(by=0)

Unnamed: 0,0
Hour,-0.041831
Multiple_Locations,0.000000
"CIVILIAN_Perceived_Armed_Weapon_Firearm, Other dangerous weapon, Unknown",0.000000
"CIVILIAN_Perceived_Armed_Weapon_Firearm, Other dangerous weapon",0.000000
"CIVILIAN_Perceived_Armed_Weapon_Firearm, Knife, blade, or stabbing instrument",0.000000
CIVILIAN_Perceived_Armed_Weapon_Firearm,0.000000
CIVILIAN_Perceived_Armed_True,0.000000
CIVILIAN_Perceived_Armed_False,0.000000
CIVILIAN_Assaulted_Officer_True,0.000000
CIVILIAN_Assaulted_Officer_False,0.000000


## By Top Counties w Highest UoF Death Incident Rates

### San Joaquin County

Highest death rate, with **30%** of UoF incidents resulting in death.

In [367]:
sanjoaquin = civilian_df[civilian_df['County'] == 'San Joaquin County']

sanjoaquin_dropped = sanjoaquin.drop(drop_cols, axis=1)

dummy_sanjoaquin = pd.get_dummies(sanjoaquin_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X_sanjoa = dummy_sanjoaquin.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y_sanjoa = dummy_sanjoaquin['Injury_Level_Death']

X_train_sanjoa, X_test_sanjoa, y_train_sanjoa, y_test_sanjoa = train_test_split(X_sanjoa, y_sanjoa, test_size=0.33)

In [368]:
X_train_sanjoa.shape, X_test_sanjoa.shape, y_train_sanjoa.shape, y_test_sanjoa.shape

((16, 48), (9, 48), (16,), (9,))

In [369]:
sanjoa_lr = grid_logr.fit(X_train_sanjoa, y_train_sanjoa)

sanjoa_lr.score(X_test_sanjoa, y_test_sanjoa)

0.77777777777777779

In [370]:
pd.DataFrame(sanjoa_lr.best_estimator_.coef_[0], index=X_sanjoa.columns).sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
Multiple_Locations,0.0
On_K12_Campus,0.0
CIVILIAN_Mental_Status_Signs of alcohol impairment,0.0
CIVILIAN_Mental_Status_Signs of drug impairment,0.0
CIVILIAN_Mental_Status_Signs of mental disability,0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of drug impairment",0.0
CIVILIAN_Assaulted_Officer_False,0.0
CIVILIAN_Assaulted_Officer_True,0.0
CIVILIAN_Perceived_Armed_False,0.0
CIVILIAN_Perceived_Armed_True,0.0


### Los Angeles County

2nd highest death rate, with **26%** of UoF incidents resulting in death.

In [358]:
losangeles = civilian_df[civilian_df['County'] == 'Los Angeles County']

losangeles_dropped = losangeles.drop(drop_cols, axis=1)

dummy_losangeles = pd.get_dummies(losangeles_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X_la = dummy_losangeles.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y_la = dummy_losangeles['Injury_Level_Death']

X_train_la, X_test_la, y_train_la, y_test_la = train_test_split(X_la, y_la, test_size=0.33)

In [359]:
X_train_la.shape, X_test_la.shape, y_train_la.shape, y_test_la.shape

((155, 59), (77, 59), (155,), (77,))

In [360]:
la_lr = grid_logr.fit(X_train_la, y_train_la)

la_lr.score(X_test_la, y_test_la)

0.74025974025974028

In [361]:
pd.DataFrame(la_lr.best_estimator_.coef_[0], index=X_la.columns).sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
Multiple_Locations,0.0
"CIVILIAN_Perceived_Armed_Weapon_Knife, blade, or stabbing instrument",0.0
CIVILIAN_Mental_Status_Signs of mental disability,0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of alcohol impairment",0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of developmental disability, Signs of drug impairment",0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of drug impairment",0.0
CIVILIAN_Assaulted_Officer_False,0.0
CIVILIAN_Assaulted_Officer_True,0.0
CIVILIAN_Perceived_Armed_False,0.0
CIVILIAN_Perceived_Armed_True,0.0


### Riverside County

3rd highest death rate, with **22%** of UoF incidents resulting in death.

In [362]:
riverside = civilian_df[civilian_df['County'] == 'Riverside County']

riverside_dropped = riverside.drop(drop_cols, axis=1)

dummy_riverside = pd.get_dummies(riverside_dropped)

# The dropped columns are all too similar to my target column so therefore dropped those too.

X_rs = dummy_riverside.drop(['Injury_Level_Death', 'Injury_Level_Injury',
                       'Injury_Level_Serious bodily injury', 'Injury_Level_na'], axis=1) 
y_rs = dummy_riverside['Injury_Level_Death']

X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.33)

In [363]:
X_train_rs.shape, X_test_rs.shape, y_train_rs.shape, y_test_rs.shape

((44, 55), (22, 55), (44,), (22,))

In [364]:
rs_lr = grid_logr.fit(X_train_rs, y_train_rs)

rs_lr.score(X_test_rs, y_test_rs)

0.77272727272727271

In [365]:
pd.DataFrame(rs_lr.best_estimator_.coef_[0], index=X_rs.columns).sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
Multiple_Locations,0.0
"CIVILIAN_Perceived_Armed_Weapon_Knife, blade, or stabbing instrument",0.0
CIVILIAN_Mental_Status_Signs of drug impairment,0.0
"CIVILIAN_Mental_Status_Signs of drug impairment, Signs of alcohol impairment",0.0
CIVILIAN_Mental_Status_Signs of mental disability,0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of drug impairment",0.0
"CIVILIAN_Mental_Status_Signs of mental disability, Signs of drug impairment, Signs of alcohol impairment",0.0
CIVILIAN_Assaulted_Officer_False,0.0
CIVILIAN_Assaulted_Officer_True,0.0
CIVILIAN_Perceived_Armed_False,0.0
