## Connect to database

In [18]:
# import packages and configure
import psycopg2 as pg
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [19]:
# Postgres info to connect
connection_args = {
    'host': 'localhost',  # We are connecting to our _local_ version of psql
    'dbname': 'complaints_db',    # DB that we are connecting to
    'port': 5432          # port we opened on AWS
}

connection = pg.connect(**connection_args)  # What is that "**" there??

In [57]:
# pull in complaints data
query = "SELECT * FROM complaints;"
df = pd.read_sql(query, connection)
df.shape

(33358, 27)

## Check for missing data

In [58]:
df.isna().sum()

unique_mos_id                  0
first_name                     0
last_name                      0
command_now                    0
shield_no                      0
complaint_id                   0
month_received                 0
year_received                  0
month_closed                   0
year_closed                    0
command_at_incident         1544
rank_abbrev_incident           0
rank_abbrev_now                0
rank_now                       0
rank_incident                  0
mos_ethnicity                  0
mos_gender                     0
mos_age_incident               0
complainant_ethnicity       4464
complainant_gender          4195
complainant_age_incident    4812
fado_type                      0
allegation                     1
precinct                      24
contact_reason               199
outcome_description           56
board_disposition              0
dtype: int64

Looks like we're missing data on complainant demographics for ~15% of all observations. Will drop for now.

In [63]:
df = df.dropna(subset=['complainant_ethnicity','complainant_gender','complainant_age_incident'])
# this took us from 33358 to 28318 rows

## Feature and Target Transformations

In [64]:
# Combine all "substantiated" to a single label
df['disposition_clean'] = df['board_disposition']
mask = df['board_disposition'].str.startswith('Substantiated')
df.loc[mask, 'disposition_clean'] = 'Substantiated'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['disposition_clean'] = df['board_disposition']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [65]:
# create datetime fields; assume first of the month for all
# df['received_datetime'] = pd.to_datetime(df[['month_received','year_received']])

df['received_datetime'] = pd.to_datetime(df['year_received'].astype(str) + '/' + df['month_received'].astype(str) + '/01')
df['closed_datetime'] = pd.to_datetime(df['year_closed'].astype(str) + '/' + df['month_closed'].astype(str) + '/01')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['received_datetime'] = pd.to_datetime(df['year_received'].astype(str) + '/' + df['month_received'].astype(str) + '/01')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['closed_datetime'] = pd.to_datetime(df['year_closed'].astype(str) + '/' + df['month_closed'].astype(str) + '/01')


### Solution for incident groups

In [66]:
# previous incident count
df.head()

Unnamed: 0,unique_mos_id,first_name,last_name,command_now,shield_no,complaint_id,month_received,year_received,month_closed,year_closed,command_at_incident,rank_abbrev_incident,rank_abbrev_now,rank_now,rank_incident,mos_ethnicity,mos_gender,mos_age_incident,complainant_ethnicity,complainant_gender,complainant_age_incident,fado_type,allegation,precinct,contact_reason,outcome_description,board_disposition,disposition_clean,received_datetime,closed_datetime
0,10004,Jonathan,Ruiz,078 PCT,8409,42835,7,2019,5,2020,078 PCT,POM,POM,Police Officer,Police Officer,Hispanic,M,32,Black,Female,38.0,Abuse of Authority,Failure to provide RTKA card,78.0,Report-domestic dispute,No arrest made or summons issued,Substantiated (Command Lvl Instructions),Substantiated,2019-07-01,2020-05-01
1,10007,John,Sears,078 PCT,5952,24601,11,2011,8,2012,PBBS,POM,POM,Police Officer,Police Officer,White,M,24,Black,Male,26.0,Discourtesy,Action,67.0,Moving violation,Moving violation summons issued,Substantiated (Charges),Substantiated,2011-11-01,2012-08-01
2,10007,John,Sears,078 PCT,5952,24601,11,2011,8,2012,PBBS,POM,POM,Police Officer,Police Officer,White,M,24,Black,Male,26.0,Offensive Language,Race,67.0,Moving violation,Moving violation summons issued,Substantiated (Charges),Substantiated,2011-11-01,2012-08-01
3,10007,John,Sears,078 PCT,5952,26146,7,2012,9,2013,PBBS,POM,POM,Police Officer,Police Officer,White,M,25,Black,Male,45.0,Abuse of Authority,Question,67.0,PD suspected C/V of violation/crime - street,No arrest made or summons issued,Substantiated (Charges),Substantiated,2012-07-01,2013-09-01
5,10012,Paula,Smith,078 PCT,4021,37256,5,2017,10,2017,078 PCT,SGT,SGT,Sergeant,Sergeant,Black,F,50,White,Male,31.0,Abuse of Authority,Refusal to process civilian complaint,78.0,C/V telephoned PCT,No arrest made or summons issued,Substantiated (Command Lvl Instructions),Substantiated,2017-05-01,2017-10-01


In [67]:
df.shape

(28318, 30)

In [68]:
df.nunique()

unique_mos_id                3831
first_name                   1176
last_name                    2722
command_now                   410
shield_no                    3213
complaint_id                10497
month_received                 12
year_received                  23
month_closed                   12
year_closed                    21
command_at_incident           346
rank_abbrev_incident           18
rank_abbrev_now                20
rank_now                        8
rank_incident                   8
mos_ethnicity                   5
mos_gender                      2
mos_age_incident               39
complainant_ethnicity           8
complainant_gender              6
complainant_age_incident       87
fado_type                       4
allegation                     82
precinct                       79
contact_reason                 42
outcome_description            15
board_disposition              10
disposition_clean               3
received_datetime             257
closed_datetim

In [69]:
df['complaint_id'].value_counts()

36901    30
31072    20
38927    20
34557    19
41986    19
         ..
22693     1
7717      1
34342     1
32297     1
16814     1
Name: complaint_id, Length: 10497, dtype: int64

In [71]:
df[df['complaint_id'] == 31072]

Unnamed: 0,unique_mos_id,first_name,last_name,command_now,shield_no,complaint_id,month_received,year_received,month_closed,year_closed,command_at_incident,rank_abbrev_incident,rank_abbrev_now,rank_now,rank_incident,mos_ethnicity,mos_gender,mos_age_incident,complainant_ethnicity,complainant_gender,complainant_age_incident,fado_type,allegation,precinct,contact_reason,outcome_description,board_disposition,disposition_clean,received_datetime,closed_datetime
3199,14266,Paul,Chierico,113 DET,30661,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,White,M,30,Hispanic,Male,20.0,Force,Pepper spray,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Unsubstantiated,Unsubstantiated,2014-07-01,2015-04-01
3200,14266,Paul,Chierico,113 DET,30661,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,White,M,30,Hispanic,Male,20.0,Force,Physical force,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Substantiated (Charges),Substantiated,2014-07-01,2015-04-01
3201,14266,Paul,Chierico,113 DET,30661,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,White,M,30,Hispanic,Male,20.0,Abuse of Authority,Threat of force (verbal or physical),105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Unsubstantiated,Unsubstantiated,2014-07-01,2015-04-01
3202,14266,Paul,Chierico,113 DET,30661,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,White,M,30,Hispanic,Male,20.0,Discourtesy,Word,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Unsubstantiated,Unsubstantiated,2014-07-01,2015-04-01
3203,14266,Paul,Chierico,113 DET,30661,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,White,M,30,Hispanic,Male,20.0,Abuse of Authority,Retaliatory arrest,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Substantiated (Charges),Substantiated,2014-07-01,2015-04-01
3216,14266,Paul,Chierico,113 DET,30661,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,White,M,30,Hispanic,Male,20.0,Force,Physical force,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Unsubstantiated,Unsubstantiated,2014-07-01,2015-04-01
3217,14266,Paul,Chierico,113 DET,30661,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,White,M,30,Hispanic,Male,20.0,Abuse of Authority,Property damaged,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Substantiated (Charges),Substantiated,2014-07-01,2015-04-01
4737,16144,Brian,Worthington,AV.UNIT,20109,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,Black,M,35,Black,Male,20.0,Abuse of Authority,Frisk,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Substantiated (Command Discipline),Substantiated,2014-07-01,2015-04-01
4738,16144,Brian,Worthington,AV.UNIT,20109,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,Black,M,35,Black,Male,20.0,Abuse of Authority,Search (of person),105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Substantiated (Command Discipline),Substantiated,2014-07-01,2015-04-01
4739,16144,Brian,Worthington,AV.UNIT,20109,31072,7,2014,4,2015,105 PCT,POM,POM,Police Officer,Police Officer,Black,M,35,Hispanic,Male,20.0,Abuse of Authority,Vehicle search,105.0,PD suspected C/V of violation/crime - auto,Arrest - resisting arrest,Substantiated (Command Discipline),Substantiated,2014-07-01,2015-04-01


**Takeaway:** Each complaint_id can have multiple rows: each row associated with a unique combo of  complaint_ID AND officer AND complainant AND charge

**Temporary solution:** continue to treat each row as a unique observation (since they each have a different outcome), but create new categorical field to indicate "associated with other complaints".

In [72]:
# create column for total rows in each incident
df['complaint_row_count'] =  df.groupby('complaint_id')['complaint_id'].transform('count')
df['complaint_is_grouped'] = df['complaint_row_count'] > 1
df.head()

Unnamed: 0,unique_mos_id,first_name,last_name,command_now,shield_no,complaint_id,month_received,year_received,month_closed,year_closed,command_at_incident,rank_abbrev_incident,rank_abbrev_now,rank_now,rank_incident,mos_ethnicity,mos_gender,mos_age_incident,complainant_ethnicity,complainant_gender,complainant_age_incident,fado_type,allegation,precinct,contact_reason,outcome_description,board_disposition,disposition_clean,received_datetime,closed_datetime,complaint_row_count,complaint_is_grouped
0,10004,Jonathan,Ruiz,078 PCT,8409,42835,7,2019,5,2020,078 PCT,POM,POM,Police Officer,Police Officer,Hispanic,M,32,Black,Female,38.0,Abuse of Authority,Failure to provide RTKA card,78.0,Report-domestic dispute,No arrest made or summons issued,Substantiated (Command Lvl Instructions),Substantiated,2019-07-01,2020-05-01,5,True
1,10007,John,Sears,078 PCT,5952,24601,11,2011,8,2012,PBBS,POM,POM,Police Officer,Police Officer,White,M,24,Black,Male,26.0,Discourtesy,Action,67.0,Moving violation,Moving violation summons issued,Substantiated (Charges),Substantiated,2011-11-01,2012-08-01,2,True
2,10007,John,Sears,078 PCT,5952,24601,11,2011,8,2012,PBBS,POM,POM,Police Officer,Police Officer,White,M,24,Black,Male,26.0,Offensive Language,Race,67.0,Moving violation,Moving violation summons issued,Substantiated (Charges),Substantiated,2011-11-01,2012-08-01,2,True
3,10007,John,Sears,078 PCT,5952,26146,7,2012,9,2013,PBBS,POM,POM,Police Officer,Police Officer,White,M,25,Black,Male,45.0,Abuse of Authority,Question,67.0,PD suspected C/V of violation/crime - street,No arrest made or summons issued,Substantiated (Charges),Substantiated,2012-07-01,2013-09-01,1,False
5,10012,Paula,Smith,078 PCT,4021,37256,5,2017,10,2017,078 PCT,SGT,SGT,Sergeant,Sergeant,Black,F,50,White,Male,31.0,Abuse of Authority,Refusal to process civilian complaint,78.0,C/V telephoned PCT,No arrest made or summons issued,Substantiated (Command Lvl Instructions),Substantiated,2017-05-01,2017-10-01,1,False


### create ordinal variable for rank

In [73]:
# create new ordinal column to represent rank at time of incident 
command_rank_dict = {'Police Officer': 1,
                     'Sergeant': 3,
                     'Detective': 2,
                     'Lieutenant': 4,
                     'Captain': 5,
                     'Deputy Inspector': 6,
                     'Inspector': 6,
                     'Chiefs and other ranks': 7}

df['command_rank_num'] = df['rank_incident'].map(command_rank_dict)

### Create dummy variables

In [74]:
# create copies of columns to be dummified
dummy_columns = ['mos_ethnicity','mos_gender','complainant_ethnicity','complainant_gender',
                                'fado_type','precinct']

for d in dummy_columns:
    copy_name = d+"_copy"
    df[copy_name] = df[d]

In [75]:
df.columns

Index(['unique_mos_id', 'first_name', 'last_name', 'command_now', 'shield_no',
       'complaint_id', 'month_received', 'year_received', 'month_closed',
       'year_closed', 'command_at_incident', 'rank_abbrev_incident',
       'rank_abbrev_now', 'rank_now', 'rank_incident', 'mos_ethnicity',
       'mos_gender', 'mos_age_incident', 'complainant_ethnicity',
       'complainant_gender', 'complainant_age_incident', 'fado_type',
       'allegation', 'precinct', 'contact_reason', 'outcome_description',
       'board_disposition', 'disposition_clean', 'received_datetime',
       'closed_datetime', 'complaint_row_count', 'complaint_is_grouped',
       'command_rank_num', 'mos_ethnicity_copy', 'mos_gender_copy',
       'complainant_ethnicity_copy', 'complainant_gender_copy',
       'fado_type_copy', 'precinct_copy'],
      dtype='object')

In [76]:
df = pd.get_dummies(df, columns=['mos_ethnicity','mos_gender','complainant_ethnicity','complainant_gender',
                                'fado_type','precinct'],drop_first=True)

In [41]:
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [43]:
pd.options.display.max_rows

In [46]:
pd.set_option('display.max_rows', 500)
df.head(5)

Unnamed: 0,unique_mos_id,first_name,last_name,command_now,shield_no,complaint_id,month_received,year_received,month_closed,year_closed,command_at_incident,rank_abbrev_incident,rank_abbrev_now,rank_now,rank_incident,mos_age_incident,complainant_age_incident,allegation,contact_reason,outcome_description,board_disposition,disposition_clean,received_datetime,closed_datetime,complaint_row_count,complaint_is_grouped,command_rank_num,mos_ethnicity_copy,mos_gender_copy,complainant_ethnicity_copy,complainant_gender_copy,fado_type_copy,precinct_copy,mos_ethnicity_Asian,mos_ethnicity_Black,mos_ethnicity_Hispanic,mos_ethnicity_White,mos_gender_M,complainant_ethnicity_Asian,complainant_ethnicity_Black,complainant_ethnicity_Hispanic,complainant_ethnicity_Other Race,complainant_ethnicity_Refused,complainant_ethnicity_Unknown,complainant_ethnicity_White,complainant_gender_Gender non-conforming,complainant_gender_Male,complainant_gender_Not described,complainant_gender_Transman (FTM),complainant_gender_Transwoman (MTF),fado_type_Discourtesy,fado_type_Force,fado_type_Offensive Language,precinct_1.0,precinct_5.0,precinct_6.0,precinct_7.0,precinct_9.0,precinct_10.0,precinct_13.0,precinct_14.0,precinct_17.0,precinct_18.0,precinct_19.0,precinct_20.0,precinct_22.0,precinct_23.0,precinct_24.0,precinct_25.0,precinct_26.0,precinct_28.0,precinct_30.0,precinct_32.0,precinct_33.0,precinct_34.0,precinct_40.0,precinct_41.0,precinct_42.0,precinct_43.0,precinct_44.0,precinct_45.0,precinct_46.0,precinct_47.0,precinct_48.0,precinct_49.0,precinct_50.0,precinct_52.0,precinct_60.0,precinct_61.0,precinct_62.0,precinct_63.0,precinct_66.0,precinct_67.0,precinct_68.0,precinct_69.0,precinct_70.0,precinct_71.0,precinct_72.0,precinct_73.0,precinct_75.0,precinct_76.0,precinct_77.0,precinct_78.0,precinct_79.0,precinct_81.0,precinct_83.0,precinct_84.0,precinct_88.0,precinct_90.0,precinct_94.0,precinct_100.0,precinct_101.0,precinct_102.0,precinct_103.0,precinct_104.0,precinct_105.0,precinct_106.0,precinct_107.0,precinct_108.0,precinct_109.0,precinct_110.0,precinct_111.0,precinct_112.0,precinct_113.0,precinct_114.0,precinct_115.0,precinct_120.0,precinct_121.0,precinct_122.0,precinct_123.0,precinct_1000.0
0,10004,Jonathan,Ruiz,078 PCT,8409,42835,7,2019,5,2020,078 PCT,POM,POM,Police Officer,Police Officer,32,38.0,Failure to provide RTKA card,Report-domestic dispute,No arrest made or summons issued,Substantiated (Command Lvl Instructions),Substantiated,2019-07-01,2020-05-01,5,True,1,Hispanic,M,Black,Female,Abuse of Authority,78.0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,10007,John,Sears,078 PCT,5952,24601,11,2011,8,2012,PBBS,POM,POM,Police Officer,Police Officer,24,26.0,Action,Moving violation,Moving violation summons issued,Substantiated (Charges),Substantiated,2011-11-01,2012-08-01,2,True,1,White,M,Black,Male,Discourtesy,67.0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,10007,John,Sears,078 PCT,5952,24601,11,2011,8,2012,PBBS,POM,POM,Police Officer,Police Officer,24,26.0,Race,Moving violation,Moving violation summons issued,Substantiated (Charges),Substantiated,2011-11-01,2012-08-01,2,True,1,White,M,Black,Male,Offensive Language,67.0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10007,John,Sears,078 PCT,5952,26146,7,2012,9,2013,PBBS,POM,POM,Police Officer,Police Officer,25,45.0,Question,PD suspected C/V of violation/crime - street,No arrest made or summons issued,Substantiated (Charges),Substantiated,2012-07-01,2013-09-01,1,False,1,White,M,Black,Male,Abuse of Authority,67.0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10009,Noemi,Sierra,078 PCT,24058,40253,8,2018,2,2019,078 PCT,POF,POF,Police Officer,Police Officer,39,16.0,Physical force,Report-dispute,Arrest - other violation/crime,Substantiated (Command Discipline A),Substantiated,2018-08-01,2019-02-01,3,True,1,Hispanic,F,,,Force,67.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Exploratory Data Analysis

In [78]:
pd.crosstab(index=df['complainant_ethnicity_copy'], columns=df['disposition_clean'],
            normalize='index').round(3)

disposition_clean,Exonerated,Substantiated,Unsubstantiated
complainant_ethnicity_copy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Indian,0.266,0.359,0.375
Asian,0.28,0.29,0.43
Black,0.283,0.238,0.479
Hispanic,0.263,0.244,0.493
Other Race,0.233,0.268,0.499
Refused,0.258,0.278,0.464
Unknown,0.274,0.278,0.448
White,0.249,0.278,0.473


In [79]:
pd.crosstab(index=df['complainant_ethnicity_copy'], columns=df['fado_type_copy'],
            normalize='index').round(3)

fado_type_copy,Abuse of Authority,Discourtesy,Force,Offensive Language
complainant_ethnicity_copy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
American Indian,0.75,0.125,0.109,0.016
Asian,0.617,0.164,0.181,0.038
Black,0.605,0.134,0.239,0.023
Hispanic,0.57,0.16,0.25,0.02
Other Race,0.578,0.182,0.21,0.03
Refused,0.663,0.159,0.155,0.024
Unknown,0.537,0.158,0.292,0.013
White,0.601,0.175,0.2,0.024


## First Simple Model

In [80]:
df['disposition_clean'].value_counts()

Unsubstantiated    13590
Exonerated          7748
Substantiated       6980
Name: disposition_clean, dtype: int64

In [None]:
import statsmodels.api as sm

In [82]:
# define features and target
X = df[['mos_age_incident',
'complainant_age_incident',
'complaint_is_grouped',
'command_rank_num',
'mos_ethnicity_Asian',
'mos_ethnicity_Black',
'mos_ethnicity_Hispanic',
'mos_ethnicity_White',
'mos_gender_M',
'complainant_ethnicity_Asian',
'complainant_ethnicity_Black',
'complainant_ethnicity_Hispanic',
'complainant_ethnicity_Other Race',
'complainant_ethnicity_Refused',
'complainant_ethnicity_Unknown',
'complainant_ethnicity_White',
'complainant_gender_Gender non-conforming',
'complainant_gender_Male',
'complainant_gender_Not described',
'complainant_gender_Transman (FTM)',
'complainant_gender_Transwoman (MTF)',
'fado_type_Discourtesy',
'fado_type_Force',
'fado_type_Offensive Language',
'precinct_1.0',
'precinct_5.0',
'precinct_6.0',
'precinct_7.0',
'precinct_9.0',
'precinct_10.0',
'precinct_13.0',
'precinct_14.0',
'precinct_17.0',
'precinct_18.0',
'precinct_19.0',
'precinct_20.0',
'precinct_22.0',
'precinct_23.0',
'precinct_24.0',
'precinct_25.0',
'precinct_26.0',
'precinct_28.0',
'precinct_30.0',
'precinct_32.0',
'precinct_33.0',
'precinct_34.0',
'precinct_40.0',
'precinct_41.0',
'precinct_42.0',
'precinct_43.0',
'precinct_44.0',
'precinct_45.0',
'precinct_46.0',
'precinct_47.0',
'precinct_48.0',
'precinct_49.0',
'precinct_50.0',
'precinct_52.0',
'precinct_60.0',
'precinct_61.0',
'precinct_62.0',
'precinct_63.0',
'precinct_66.0',
'precinct_67.0',
'precinct_68.0',
'precinct_69.0',
'precinct_70.0',
'precinct_71.0',
'precinct_72.0',
'precinct_73.0',
'precinct_75.0',
'precinct_76.0',
'precinct_77.0',
'precinct_78.0',
'precinct_79.0',
'precinct_81.0',
'precinct_83.0',
'precinct_84.0',
'precinct_88.0',
'precinct_90.0',
'precinct_94.0',
'precinct_100.0',
'precinct_101.0',
'precinct_102.0',
'precinct_103.0',
'precinct_104.0',
'precinct_105.0',
'precinct_106.0',
'precinct_107.0',
'precinct_108.0',
'precinct_109.0',
'precinct_110.0',
'precinct_111.0',
'precinct_112.0',
'precinct_113.0',
'precinct_114.0',
'precinct_115.0',
'precinct_120.0',
'precinct_121.0',
'precinct_122.0',
'precinct_123.0',
'precinct_1000.0']]
y = df['disposition_clean']

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [84]:
X, X_test, y, y_test = train_test_split(
    X, y, test_size=.2, random_state=33) 

In [85]:
# split again for simple validation (will implement cross validation later)
X, X_val, y, y_val = train_test_split(
    X, y, test_size=.2, random_state=33) 

In [91]:
# standard scale all sets of X

scaler = StandardScaler() #with mean=True?
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.fit_transform(X_test)
X_val_scaled = scaler.fit_transform(X_val)

In [96]:
# fit model on train data
logreg = LogisticRegression(random_state=3333,max_iter=10000).fit(X, y)

In [95]:
# validate model on validate data
logreg.score(X_val,y_val)

0.5087177223570956