Build a classifier to predict whether an arrest was made after a Terry Stop, given information about the presence of weapons, the time of day of the call, etc. Note that this is a *binary* classification problem.

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime, date
# Lift the column cap so all columns of the frame are rendered.
pd.options.display.max_columns = None

# Read the raw Terry stops export from the working directory.
data = pd.read_csv('Terry_stops.csv')

# Preview the first 50 raw records.
data.head(n=50)

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,-,-1,20140000120677,92317,Arrest,,7500,1984,M,Black or African American,Asian,Male,2015-10-16T00:00:00,11:32:00,-,-,-,SOUTH PCT 1ST W - ROBERT,N,N,South,O,O2
1,-,-1,20150000001463,28806,Field Contact,,5670,1965,M,White,-,-,2015-03-19T00:00:00,07:59:00,-,-,-,,N,N,-,-,-
2,-,-1,20150000001516,29599,Field Contact,,4844,1961,M,White,White,Male,2015-03-21T00:00:00,19:12:00,-,-,-,,N,-,-,-,-
3,-,-1,20150000001670,32260,Field Contact,,7539,1963,M,White,-,-,2015-04-01T00:00:00,04:55:00,-,-,-,,N,N,-,-,-
4,-,-1,20150000001739,33155,Field Contact,,6973,1977,M,White,Black or African American,Male,2015-04-03T00:00:00,00:41:00,-,-,-,,N,N,-,-,-
5,-,-1,20150000001755,33571,Field Contact,,7402,1973,M,White,Black or African American,Male,2015-04-05T00:00:00,23:46:00,-,-,-,,N,N,-,-,-
6,-,-1,20150000002351,45252,Field Contact,,7591,1985,M,Hispanic or Latino,Other,Female,2015-05-20T00:00:00,21:39:00,-,-,-,WEST PCT 3RD W - MARY,N,N,-,-,-
7,-,-1,20150000002363,45182,Field Contact,,7591,1985,M,Hispanic or Latino,White,Male,2015-05-20T00:00:00,22:40:00,-,-,-,WEST PCT 3RD W - MARY,N,N,-,-,-
8,-,-1,20150000002392,45365,Field Contact,,7560,1986,M,White,White,Female,2015-05-22T00:00:00,07:39:00,-,-,-,SOUTH PCT 1ST W - R/S RELIEF,N,N,East,E,E2
9,-,-1,20150000002451,46430,Field Contact,,7591,1985,M,Hispanic or Latino,-,-,2015-05-25T00:00:00,01:06:00,-,-,-,WEST PCT 3RD W - MARY,N,N,-,-,-


In [2]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48485 entries, 0 to 48484
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         48485 non-null  object
 1   Subject ID                48485 non-null  int64 
 2   GO / SC Num               48485 non-null  int64 
 3   Terry Stop ID             48485 non-null  int64 
 4   Stop Resolution           48485 non-null  object
 5   Weapon Type               48485 non-null  object
 6   Officer ID                48485 non-null  object
 7   Officer YOB               48485 non-null  int64 
 8   Officer Gender            48485 non-null  object
 9   Officer Race              48485 non-null  object
 10  Subject Perceived Race    48485 non-null  object
 11  Subject Perceived Gender  48485 non-null  object
 12  Reported Date             48485 non-null  object
 13  Reported Time             48485 non-null  object
 14  Initial Call Type     

In [3]:
# Build the working frame without the record identifiers and the reported time.
# NOTE(review): the notebook brief lists "time of day of the call" as a
# predictor, yet 'Reported Time' is dropped here -- confirm this is intended.
columns_to_drop = ['Subject ID', 'GO / SC Num', 'Terry Stop ID', 'Reported Time']
df = data.drop(columns=columns_to_drop)

In [4]:
# Confirm which columns remain in the working frame after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48485 entries, 0 to 48484
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         48485 non-null  object
 1   Stop Resolution           48485 non-null  object
 2   Weapon Type               48485 non-null  object
 3   Officer ID                48485 non-null  object
 4   Officer YOB               48485 non-null  int64 
 5   Officer Gender            48485 non-null  object
 6   Officer Race              48485 non-null  object
 7   Subject Perceived Race    48485 non-null  object
 8   Subject Perceived Gender  48485 non-null  object
 9   Reported Date             48485 non-null  object
 10  Initial Call Type         48485 non-null  object
 11  Final Call Type           48485 non-null  object
 12  Call Type                 48485 non-null  object
 13  Officer Squad             47879 non-null  object
 14  Arrest Flag           

In [5]:
# Switch every column label to snake_case.
snake_case_names = {
    'Subject Age Group': 'subject_age_group',
    'Stop Resolution': 'stop_resolution',
    'Weapon Type': 'weapon_type',
    'Officer ID': 'officer_id',
    'Officer YOB': 'officer_yob',
    'Officer Gender': 'officer_gender',
    'Officer Race': 'officer_race',
    'Subject Perceived Race': 'subject_perceived_race',
    'Subject Perceived Gender': 'subject_perceived_gender',
    'Reported Date': 'reported_date',
    'Initial Call Type': 'initial_call_type',
    'Final Call Type': 'final_call_type',
    'Call Type': 'call_type',
    'Officer Squad': 'officer_squad',
    'Arrest Flag': 'arrest_flag',
    'Frisk Flag': 'frisk_flag',
    'Precinct': 'precinct',
    'Sector': 'sector',
    'Beat': 'beat',
}
df = df.rename(columns=snake_case_names)

In [6]:
# Print the frequency table of every column, each followed by a blank gap.
# The previous bare `except:` re-ran the exact same call, so it could not
# recover from anything and only risked swallowing KeyboardInterrupt; any
# real error should surface here instead.
for col in df.columns:
    print(df[col].value_counts())
    print('\n')

26 - 35         16158
36 - 45         10339
18 - 25          9669
46 - 55          6274
56 and Above     2466
1 - 17           1988
-                1591
Name: subject_age_group, dtype: int64


Field Contact               19691
Offense Report              15854
Arrest                      12032
Referred for Prosecution      728
Citation / Infraction         180
Name: stop_resolution, dtype: int64


None                                    32565
-                                       13021
Lethal Cutting Instrument                1482
Knife/Cutting/Stabbing Instrument         706
Handgun                                   300
Firearm Other                             100
Blunt Object/Striking Implement            93
Club, Blackjack, Brass Knuckles            49
Firearm                                    44
Mace/Pepper Spray                          31
Other Firearm                              27
Firearm (unk type)                         15
Taser/Stun Gun                             10


In [7]:
# Cells holding just '-' are treated as missing: swap them for NaN so pandas
# counts them as such.
df = df.replace(to_replace='-', value=np.nan)

# Missing-value tally per column.
df.isnull().sum()

subject_age_group            1591
stop_resolution                 0
weapon_type                 13021
officer_id                     24
officer_yob                     0
officer_gender                  0
officer_race                    0
subject_perceived_race       1853
subject_perceived_gender      237
reported_date                   0
initial_call_type           13234
final_call_type             13234
call_type                   13234
officer_squad                 606
arrest_flag                     0
frisk_flag                    478
precinct                     9928
sector                      10139
beat                        10073
dtype: int64

In [8]:
# subject_age_group: the subject's age bracket (10-year increments) as
# reported by the officer. Frequencies include the NaN bucket.
df.subject_age_group.value_counts(dropna=False)

26 - 35         16158
36 - 45         10339
18 - 25          9669
46 - 55          6274
56 and Above     2466
1 - 17           1988
NaN              1591
Name: subject_age_group, dtype: int64

In [9]:
# weapon_type: weapon (if any) identified during a search or frisk of the
# subject; "None" means no weapon was found. Frequencies include NaN.
df.weapon_type.value_counts(dropna=False)

None                                    32565
NaN                                     13021
Lethal Cutting Instrument                1482
Knife/Cutting/Stabbing Instrument         706
Handgun                                   300
Firearm Other                             100
Blunt Object/Striking Implement            93
Club, Blackjack, Brass Knuckles            49
Firearm                                    44
Mace/Pepper Spray                          31
Other Firearm                              27
Firearm (unk type)                         15
Taser/Stun Gun                             10
Club                                        9
None/Not Applicable                         9
Fire/Incendiary Device                      8
Rifle                                       7
Shotgun                                     3
Personal Weapons (hands, feet, etc.)        2
Automatic Handgun                           2
Brass Knuckles                              1
Blackjack                         

In [10]:
# Fold the missing entries and the 'None/Not Applicable' label into the single
# 'None' category so all "no weapon" rows share one value.
# NOTE(review): the literal string 'None' may be parsed back as NaN by
# pd.read_csv's default NA markers when the cleaned CSV is reloaded -- confirm
# for the pandas version in use (or reload with keep_default_na=False).
df['weapon_type'] = (
    df['weapon_type']
    .replace(np.nan, 'None')
    .replace('None/Not Applicable', 'None')
)

In [11]:
# officer_id: key that identifies each unique officer in the dataset.
# TODO: decide whether the 24 rows with a missing id should be dropped.
df.officer_id.value_counts(dropna=False)

7456      430
7634      347
7773      333
7765      315
7758      313
         ... 
6162        1
5875        1
7496        1
6070        1
4729        1
Name: officer_id, Length: 1225, dtype: int64

In [12]:
# subject_perceived_race: the subject's race as perceived and reported by the
# officer. Missing entries are to be merged into the 'Unknown' label.
df.subject_perceived_race.value_counts(dropna=False)

White                                        23696
Black or African American                    14450
Unknown                                       2791
NaN                                           1853
Hispanic                                      1684
Asian                                         1590
American Indian or Alaska Native              1391
Multi-Racial                                   809
Other                                          152
Native Hawaiian or Other Pacific Islander       69
Name: subject_perceived_race, dtype: int64

In [13]:
# Missing perceived-race entries are assigned the 'Unknown' label.
df['subject_perceived_race'] = df['subject_perceived_race'].replace(
    to_replace=np.nan, value='Unknown'
)

In [14]:
# subject_perceived_gender: the subject's gender as perceived and reported by
# the officer. NaN and 'Unable to Determine' are to be merged into 'Unknown'.
df.subject_perceived_gender.value_counts(dropna=False)

Male                                                         38085
Female                                                        9793
Unable to Determine                                            326
NaN                                                            237
Unknown                                                         37
Gender Diverse (gender non-conforming and/or transgender)        7
Name: subject_perceived_gender, dtype: int64

In [15]:
# Collapse missing values and 'Unable to Determine' into the 'Unknown' label.
undetermined_labels = [np.nan, 'Unable to Determine']
df['subject_perceived_gender'] = df['subject_perceived_gender'].replace(
    undetermined_labels, 'Unknown'
)

In [16]:
# initial_call_type: first classification of the call (assigned by 911?).
# TODO: decide how to fill the missing entries -- candidates are
# 'SUSPICIOUS STOP - OFFICER INITIATED ONVIEW' or
# 'UNKNOWN - COMPLAINT OF UNKNOWN NATURE'.
df.initial_call_type.value_counts(dropna=False)

NaN                                               13234
SUSPICIOUS STOP - OFFICER INITIATED ONVIEW         3234
SUSPICIOUS PERSON, VEHICLE OR INCIDENT             3097
DISTURBANCE, MISCELLANEOUS/OTHER                   2503
ASLT - IP/JO - WITH OR W/O WPNS (NO SHOOTINGS)     2104
                                                  ...  
MISSING - ADULT                                       1
ESCAPE - PRISONER                                     1
KNOWN KIDNAPPNG                                       1
PROPERTY - FOUND GUN, SHELLCASINGS                    1
ALARM - ATM MACHINE, FREE STANDING                    1
Name: initial_call_type, Length: 168, dtype: int64

In [17]:
# final_call_type frequencies, NaN included.
# TODO: consider filling NaN with 'UNKNOWN - COMPLAINT OF UNKNOWN NATURE'.
df.final_call_type.value_counts(dropna=False)

NaN                                                  13234
--SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON              3915
--PROWLER - TRESPASS                                  3405
--DISTURBANCE - OTHER                                 2799
--ASSAULTS, OTHER                                     2381
                                                     ...  
MVC - UNK INJURIES                                       1
--COMMERCIAL SEXUAL EXPLOITATION OF MINORS (CSEC)        1
ORDER - VIOLATING DV COURT ORDER                         1
BIAS -RACIAL, POLITICAL, SEXUAL MOTIVATION               1
NOISE - DIST, GENERAL (CONST, RESID, BALL PLAY)          1
Name: final_call_type, Length: 211, dtype: int64

In [18]:
# call_type: channel through which the communication center received the call.
df.call_type.value_counts(dropna=False)

911                              22098
NaN                              13234
ONVIEW                            9354
TELEPHONE OTHER, NOT 911          3428
ALARM CALL (NOT POLICE ALARM)      361
TEXT MESSAGE                         9
SCHEDULED EVENT (RECURRING)          1
Name: call_type, dtype: int64

In [19]:
# officer_squad: the officer's functional (not budget) squad assignment, as
# reported by the Data Analytics Platform (DAP).
df.officer_squad.value_counts(dropna=False)

TRAINING - FIELD TRAINING SQUAD          5209
WEST PCT 1ST W - DAVID/MARY              1582
WEST PCT 2ND W - D/M RELIEF              1041
SOUTHWEST PCT 2ND W - FRANK              1007
WEST PCT 1ST W - KING/QUEEN               896
                                         ... 
ZOLD CRIME ANALYSIS UNIT - ANALYSTS         1
RECORDS - DAY SHIFT                         1
BURG/THEFT/JUV - NORTH                      1
SOUTHWEST PCT OPS - BURG/THEFT              1
TRAINING - LEARNING MANAGEMENT SYSTEM       1
Name: officer_squad, Length: 174, dtype: int64

In [20]:
# frisk_flag: whether the officer frisked the subject during the Terry Stop.
df.frisk_flag.value_counts(dropna=False)

N      37025
Y      10982
NaN      478
Name: frisk_flag, dtype: int64

In [21]:
# precinct: precinct of the address tied to the underlying Computer Aided
# Dispatch (CAD) event -- not necessarily where the Terry Stop took place.
# NOTE(review): the tally lists both 'Southwest' and 'SouthWest'; these look
# like the same precinct with inconsistent casing -- confirm and merge.
df.precinct.value_counts(dropna=False)

West         11894
North        10664
NaN           9928
East          6382
South         5814
Southwest     2320
SouthWest     1225
Unknown        200
OOJ             37
FK ERROR        21
Name: precinct, dtype: int64

In [22]:
# sector: sector of the address tied to the underlying Computer Aided Dispatch
# (CAD) event -- not necessarily where the Terry Stop took place.
# NOTE(review): the tally shows the same letter more than once (e.g. 'E', 'K'),
# which suggests variants that differ only in whitespace -- confirm, and
# consider stripping the values before modelling.
df.sector.value_counts(dropna=False)

NaN       10139
E          2337
M          2270
N          2191
K          1877
K          1762
B          1658
L          1639
D          1512
R          1455
F          1378
M          1362
S          1348
U          1302
D          1248
O          1161
J          1119
G          1087
C          1037
E           990
Q           967
W           941
Q           892
N           765
F           753
R           682
O           655
B           596
S           514
G           497
U           489
L           482
W           471
C           433
J           423
99           53
Name: sector, dtype: int64

In [23]:
# beat: beat of the address tied to the underlying Computer Aided Dispatch
# (CAD) event -- not necessarily where the Terry Stop took place.
# NOTE(review): '99' appears twice in the tally, so some labels presumably
# differ only in whitespace -- confirm.
df.beat.value_counts(dropna=False)

NaN       10073
N3         1175
E2         1092
K3         1054
M2          852
          ...  
J2           86
99           53
99           35
OOJ          25
S             2
Name: beat, Length: 107, dtype: int64

In [24]:
# Replace the officer's year of birth with an age measured against 2021. The
# column keeps its 'officer_yob' label here; the rename happens in a later cell.
# NOTE(review): re-running this cell alone flips the values back to birth
# years, so run it exactly once per kernel session.
# NOTE(review): the tally in the next cell shows 35 rows aged 121 (birth year
# 1900), which looks like a placeholder rather than a real age -- confirm.
reference_year = 2021
df['officer_yob'] = reference_year - df['officer_yob']

In [25]:
# Distribution of the derived officer ages (column still named 'officer_yob').
df.officer_yob.value_counts()

35     3382
34     3087
37     2798
30     2742
29     2554
36     2513
31     2341
33     2186
32     2051
39     1868
38     1743
42     1558
28     1516
40     1476
26     1304
50     1236
43     1177
45     1051
44     1030
27      963
48      933
41      838
25      728
54      721
53      632
51      612
52      568
47      565
46      539
24      486
59      457
57      439
49      429
56      420
58      262
55      228
63      220
60      215
62      174
61      161
23       60
67       44
64       43
121      35
68       34
66       21
65       17
73       11
69        9
72        5
75        2
70        1
Name: officer_yob, dtype: int64

In [26]:
# The column now holds ages, so give it a matching label.
df = df.rename(columns={'officer_yob': 'officer_age'})

In [29]:
# Parse the reported dates, then store them in numeric form so the column is
# a plain integer feature (the recorded output shows nanoseconds since the
# Unix epoch).
parsed_dates = pd.to_datetime(df['reported_date'], yearfirst=True)
df['reported_date'] = pd.to_numeric(parsed_dates)

In [30]:
# Encode the Y/N indicator columns as 1/0; any other value is left unchanged.
flag_codes = {'Y': 1, 'N': 0}
for flag_col in ('arrest_flag', 'frisk_flag'):
    df[flag_col] = df[flag_col].replace(flag_codes)

In [31]:
# Class balance of the target after encoding (1 = arrest flagged).
df.arrest_flag.value_counts(dropna=False)

0    44661
1     3824
Name: arrest_flag, dtype: int64

In [37]:
# Share of each frisk_flag value (NaN included), captured BEFORE the rows are
# removed. Previously this was a bare statement in the middle of the cell, so
# its result was computed and silently discarded.
frisk_flag_share = df['frisk_flag'].value_counts(normalize=True, dropna=False)

# Drop the rows with a missing frisk_flag: they are only about 1% of the data
# set (478 of 48,485 rows).
df = df.dropna(how='any', subset=['frisk_flag'])

# Final expression of the cell, so the pre-drop shares that justify the drop
# are actually rendered.
frisk_flag_share

In [38]:
# Final inspection of the cleaned dataframe: (rows, columns) after the
# cleaning steps above.
df.shape

(48007, 19)

In [39]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,subject_age_group,stop_resolution,weapon_type,officer_id,officer_age,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,reported_date,initial_call_type,final_call_type,call_type,officer_squad,arrest_flag,frisk_flag,precinct,sector,beat
0,,Arrest,,7500,37,M,Black or African American,Asian,Male,1444953600000000000,,,,SOUTH PCT 1ST W - ROBERT,0,0.0,South,O,O2
1,,Field Contact,,5670,56,M,White,Unknown,Unknown,1426723200000000000,,,,,0,0.0,,,
3,,Field Contact,,7539,58,M,White,Unknown,Unknown,1427846400000000000,,,,,0,0.0,,,
4,,Field Contact,,6973,44,M,White,Black or African American,Male,1428019200000000000,,,,,0,0.0,,,
5,,Field Contact,,7402,48,M,White,Black or African American,Male,1428192000000000000,,,,,0,0.0,,,


In [40]:
# Column dtypes and non-null counts of the cleaned frame; several columns
# still contain missing values at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48007 entries, 0 to 48484
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   subject_age_group         46499 non-null  object 
 1   stop_resolution           48007 non-null  object 
 2   weapon_type               48007 non-null  object 
 3   officer_id                47983 non-null  object 
 4   officer_age               48007 non-null  int64  
 5   officer_gender            48007 non-null  object 
 6   officer_race              48007 non-null  object 
 7   subject_perceived_race    48007 non-null  object 
 8   subject_perceived_gender  48007 non-null  object 
 9   reported_date             48007 non-null  int64  
 10  initial_call_type         34965 non-null  object 
 11  final_call_type           34965 non-null  object 
 12  call_type                 34965 non-null  object 
 13  officer_squad             47414 non-null  object 
 14  arrest

In [41]:
# Persist the cleaned frame to CSV, leaving the row index out of the file.
cleaned_csv_path = 'Terry_stops_cleaned.csv'
df.to_csv(path_or_buf=cleaned_csv_path, index=False)