In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# setting project path
import os
import sys

gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn as sns

# setting style
sns.set_theme('talk')
plt.style.use('fivethirtyeight')
sns.set_palette(palette='Blues_r')

## Initial Data Cleaning

In [3]:
# creating path to data
path = os.path.join(gparent, 'data/raw', 'Terry_Stops.csv')
df = pd.read_csv(path)

In [4]:
# checking first few rows
df.head()

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,-,-1,20140000120677,92317,Arrest,,7500,1984,M,Black or African American,...,11:32:00,-,-,-,SOUTH PCT 1ST W - ROBERT,N,N,South,O,O2
1,-,-1,20150000001463,28806,Field Contact,,5670,1965,M,White,...,07:59:00,-,-,-,,N,N,-,-,-
2,-,-1,20150000001516,29599,Field Contact,,4844,1961,M,White,...,19:12:00,-,-,-,,N,-,-,-,-
3,-,-1,20150000001670,32260,Field Contact,,7539,1963,M,White,...,04:55:00,-,-,-,,N,N,-,-,-
4,-,-1,20150000001739,33155,Field Contact,,6973,1977,M,White,...,00:41:00,-,-,-,,N,N,-,-,-


In [5]:
#  checking info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47213 entries, 0 to 47212
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         47213 non-null  object
 1   Subject ID                47213 non-null  int64 
 2   GO / SC Num               47213 non-null  int64 
 3   Terry Stop ID             47213 non-null  int64 
 4   Stop Resolution           47213 non-null  object
 5   Weapon Type               47213 non-null  object
 6   Officer ID                47213 non-null  object
 7   Officer YOB               47213 non-null  int64 
 8   Officer Gender            47213 non-null  object
 9   Officer Race              47213 non-null  object
 10  Subject Perceived Race    47213 non-null  object
 11  Subject Perceived Gender  47213 non-null  object
 12  Reported Date             47213 non-null  object
 13  Reported Time             47213 non-null  object
 14  Initial Call Type     

In [6]:
# checking for nulls
df.isna().sum()

Subject Age Group             0
Subject ID                    0
GO / SC Num                   0
Terry Stop ID                 0
Stop Resolution               0
Weapon Type                   0
Officer ID                    0
Officer YOB                   0
Officer Gender                0
Officer Race                  0
Subject Perceived Race        0
Subject Perceived Gender      0
Reported Date                 0
Reported Time                 0
Initial Call Type             0
Final Call Type               0
Call Type                     0
Officer Squad               604
Arrest Flag                   0
Frisk Flag                    0
Precinct                      0
Sector                        0
Beat                          0
dtype: int64

In [7]:
# filling nulls and replacing - values
df['Officer Squad'].fillna('NA', inplace=True)
df.replace('-', 'NA', inplace=True)

In [8]:
#  checking work
df.isna().sum()

Subject Age Group           0
Subject ID                  0
GO / SC Num                 0
Terry Stop ID               0
Stop Resolution             0
Weapon Type                 0
Officer ID                  0
Officer YOB                 0
Officer Gender              0
Officer Race                0
Subject Perceived Race      0
Subject Perceived Gender    0
Reported Date               0
Reported Time               0
Initial Call Type           0
Final Call Type             0
Call Type                   0
Officer Squad               0
Arrest Flag                 0
Frisk Flag                  0
Precinct                    0
Sector                      0
Beat                        0
dtype: int64

In [9]:
# checking uniques
df['Stop Resolution'].unique()

array(['Arrest', 'Field Contact', 'Citation / Infraction',
       'Offense Report', 'Referred for Prosecution'], dtype=object)

In [10]:
# checking weapon types
df['Weapon Type'].unique()

array(['None', 'Firearm Other', 'Lethal Cutting Instrument', 'Handgun',
       'NA', 'Club, Blackjack, Brass Knuckles',
       'Knife/Cutting/Stabbing Instrument', 'Other Firearm', 'Rifle',
       'Fire/Incendiary Device', 'Blunt Object/Striking Implement',
       'Firearm (unk type)', 'Mace/Pepper Spray', 'None/Not Applicable',
       'Club', 'Firearm', 'Taser/Stun Gun', 'Shotgun', 'Brass Knuckles',
       'Automatic Handgun', 'Blackjack',
       'Personal Weapons (hands, feet, etc.)'], dtype=object)

In [11]:
# creating weapons flag
df['Weapon Flag'] = df['Weapon Type']

df['Weapon Flag'] = df['Weapon Flag'].replace(['None', 'NA', 'None/Not Applicable'], [0,0,0])
df['Weapon Flag'] = df['Weapon Flag'].map(lambda x: 1 if x!=0 else 0)

In [12]:
# checking uniques
df['Officer ID'].unique()

array(['7500  ', '5670  ', '4844  ', ..., '6237  ', '6983  ', '5917  '],
      dtype=object)

In [13]:
# stripping white spaces
df['Officer ID']= df['Officer ID'].apply(lambda x: x.strip())

In [14]:
#  checking uniques
df['Officer Race'].unique()

array(['Black or African American', 'White', 'Hispanic or Latino',
       'Asian', 'American Indian/Alaska Native', 'Two or More Races',
       'Not Specified', 'Nat Hawaiian/Oth Pac Islander', 'Unknown'],
      dtype=object)

In [15]:
# renaming categories for brevity
officer_race_list = ['Black or African American', 'White', 'Hispanic or Latino',
       'Asian', 'American Indian/Alaska Native', 'Two or More Races',
       'Not Specified', 'Nat Hawaiian/Oth Pac Islander', 'Unknown']

officer_replace_list = race_list = ['Black', 'White', 'Hispanic',
       'Asian', 'N_American', 'Multi-Racial',
       'NA', 'P_Islander', 'Unknown']

df['Officer Race'] = df['Officer Race'].replace(officer_race_list, officer_replace_list)

In [16]:
# checking uniques
df['Officer Gender'].unique()

array(['M', 'F', 'N'], dtype=object)

In [17]:
# checking uniques
df['Subject Perceived Race'].unique()

array(['Asian', 'NA', 'White', 'Black or African American', 'Other',
       'Unknown', 'American Indian or Alaska Native', 'Hispanic',
       'Multi-Racial', 'Native Hawaiian or Other Pacific Islander'],
      dtype=object)

In [18]:
# renaming categories for brevity
race_list = ['Asian', 'NA', 'White', 'Black or African American', 'Other',
       'Unknown', 'American Indian or Alaska Native', 'Hispanic',
       'Multi-Racial', 'Native Hawaiian or Other Pacific Islander']

replace_list =['Asian', 'NA', 'White', 'Black', 'Other',
       'Unknown', 'N_American', 'Hispanic',
       'Multi-Racial', 'P_Islander']

df['Subject Perceived Race'] = df['Subject Perceived Race'].replace(race_list, replace_list)

In [19]:
# checking uniques
df['Subject Perceived Gender'].unique()

array(['Male', 'NA', 'Female', 'Unable to Determine', 'Unknown',
       'Gender Diverse (gender non-conforming and/or transgender)'],
      dtype=object)

In [20]:
# renaming categories for brevity
gender_list = ['Male', 'NA', 'Female', 'Unable to Determine', 'Unknown',
       'Gender Diverse (gender non-conforming and/or transgender)']

gender_replace = ['Male', 'NA', 'Female', 'Undetermined', 'Unknown',
       'Gender Diverse']

df['Subject Perceived Gender'] = df['Subject Perceived Gender'].replace(gender_list, gender_replace)

In [21]:
# checking date and time columns
print(df['Reported Date'][:5])
df['Reported Time'][:5]

0    2015-10-16T00:00:00
1    2015-03-19T00:00:00
2    2015-03-21T00:00:00
3    2015-04-01T00:00:00
4    2015-04-03T00:00:00
Name: Reported Date, dtype: object


0    11:32:00
1    07:59:00
2    19:12:00
3    04:55:00
4    00:41:00
Name: Reported Time, dtype: object

In [22]:
# converting date & time columns to datetime types
df = df.astype({'Reported Time':'datetime64', 'Reported Date':'datetime64'})

In [23]:
#  getting year, month, day of month, day of week, and hour features
df['Reported Year'] = df['Reported Date'].dt.year
df['Reported Month'] = df['Reported Date'].dt.month
df['Day of Month'] = df['Reported Date'].dt.day

# Monday = 0, Sunday = 6
df['Day of Week'] = df['Reported Date'].dt.weekday
df['Reported Hour'] = df['Reported Time'].dt.hour

In [24]:
# checking work
df.sort_values(by='Reported Year').head(2)

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Frisk Flag,Precinct,Sector,Beat,Weapon Flag,Reported Year,Reported Month,Day of Month,Day of Week,Reported Hour
0,,-1,20140000120677,92317,Arrest,,7500,1984,M,Black,...,N,South,O,O2,0,2015,10,16,4,11
39233,46 - 55,-1,20150000218351,58128,Referred for Prosecution,,7459,1973,M,White,...,Y,North,B,B1,0,2015,6,28,6,5


In [25]:
# dropping date and time columns
df.drop(['Reported Date', 'Reported Time'], axis=1, inplace=True)

In [26]:
df['Initial Call Type'].value_counts()

NA                                                13162
SUSPICIOUS STOP - OFFICER INITIATED ONVIEW         3165
SUSPICIOUS PERSON, VEHICLE OR INCIDENT             2994
DISTURBANCE, MISCELLANEOUS/OTHER                   2429
ASLT - IP/JO - WITH OR W/O WPNS (NO SHOOTINGS)     2007
                                                  ...  
INJURED -  PERSON/INDUSTRIAL ACCIDENT                 1
MISSING - (ALZHEIMER, ENDANGERED, ELDERLY)            1
VICE - PORNOGRAPHY                                    1
REQUEST TO WATCH                                      1
HARBOR - WATER EMERGENCIES                            1
Name: Initial Call Type, Length: 167, dtype: int64

In [27]:
df['Final Call Type'].value_counts()

NA                                              13162
--SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON         3772
--PROWLER - TRESPASS                             3330
--DISTURBANCE - OTHER                            2719
--ASSAULTS, OTHER                                2297
                                                ...  
PEACE-STANDBY TO ASSURE (NO COURT ORDR SVC)         1
FIGHT - VERBAL/ORAL (NO WEAPONS)                    1
TRAFFIC - MOVING VIOLATION                          1
UNDERCOVER OPS, CAUTION (INCLUDES STAKEOUTS)        1
DOWN - CHECK FOR DOWN PERSON                        1
Name: Final Call Type, Length: 207, dtype: int64

In [28]:
df['Call Type'].value_counts()

911                              21284
NA                               13162
ONVIEW                            9110
TELEPHONE OTHER, NOT 911          3309
ALARM CALL (NOT POLICE ALARM)      340
TEXT MESSAGE                         7
SCHEDULED EVENT (RECURRING)          1
Name: Call Type, dtype: int64

In [29]:
df['Officer Squad'].value_counts()

TRAINING - FIELD TRAINING SQUAD                 5098
WEST PCT 1ST W - DAVID/MARY                     1546
WEST PCT 2ND W - D/M RELIEF                     1020
SOUTHWEST PCT 2ND W - FRANK                      970
NORTH PCT 2ND WATCH - NORTH BEATS                885
                                                ... 
VICE - GENERAL INVESTIGATIONS SQUAD                1
SOUTHWEST PCT OPS - BURG/THEFT                     1
HR - BLEA - ACADEMY RECRUITS                       1
COMMUNITY OUTREACH - SPECIAL PROJECTS DETAIL       1
CANINE - DAY SQUAD                                 1
Name: Officer Squad, Length: 173, dtype: int64

In [30]:
df['Officer Squad'].unique()

array(['SOUTH PCT 1ST W - ROBERT', 'NA', 'WEST PCT 3RD W - MARY',
       'SOUTH PCT 1ST W - R/S RELIEF', 'WEST PCT 2ND W - MARY BEATS',
       'WEST PCT 2ND W - SPECIAL BEATS',
       'TRAINING - FIELD TRAINING SQUAD', 'NORTH PCT 3RD W - LINCOLN',
       'SOUTH PCT 3RD W - ROBERT', 'NORTH PCT 2ND W - NORA',
       'NORTH PCT 2ND WATCH - B/N RELIEF', 'WEST PCT 3RD W - K/Q RELIEF',
       'SOUTHWEST PCT 2ND W - WILLIAM', 'WEST PCT OPS - ACT NIGHT',
       'WEST PCT 2ND W - KING', 'SOUTHWEST PCT 3RD W - WILLIAM',
       'WEST PCT 2ND W - KING BEATS', 'SOUTH PCT OPS - NIGHT ACT',
       'WEST PCT 2ND W - D/M RELIEF', 'NORTH PCT 1ST W - LINCOLN',
       'NORTH PCT 1ST W - L/U RELIEF', 'WEST PCT 1ST W - KQ/DM RELIEF',
       'SOUTH PCT 2ND W - OCEAN RELIEF', 'NORTH PCT OPS - ACT DAY',
       'SOUTH PCT 3RD W - OCEAN', 'NORTH PCT 3RD W - NORA',
       'WEST PCT 1ST W - DAVID/MARY', 'EAST PCT 2ND W - EDWARD',
       'SOUTHWEST PCT 1ST W - FRANK', 'SOUTH PCT 1ST W - SAM',
       'SOUTHWEST PCT 

In [31]:
df['Arrest Flag'].value_counts()

N    43754
Y     3459
Name: Arrest Flag, dtype: int64

In [32]:
#  binarizing arrest flag
df['Arrest Flag'] = df['Arrest Flag'].replace('Y', 1)
df['Arrest Flag'] =df['Arrest Flag'].map(lambda x: 0 if x!=1 else 1)

In [33]:
df['Arrest Flag'].value_counts()

0    43754
1     3459
Name: Arrest Flag, dtype: int64

In [34]:
df['Frisk Flag'].value_counts()

N     36147
Y     10588
NA      478
Name: Frisk Flag, dtype: int64

In [35]:
#  binarizing frisk flag
df['Frisk Flag'] = df['Frisk Flag'].replace('Y', 1)
df['Frisk Flag'] =df['Frisk Flag'].map(lambda x: 0 if x!=1 else 1)

In [36]:
df['Frisk Flag'].value_counts()

0    36625
1    10588
Name: Frisk Flag, dtype: int64

In [37]:
df['Precinct'].unique()

array(['South', 'NA', 'East', 'North', 'West', 'Southwest', 'Unknown',
       'SouthWest', 'OOJ', 'FK ERROR'], dtype=object)

In [38]:
df['Precinct'].value_counts()

West         11432
North        10385
NA            9859
East          6213
South         5653
Southwest     2320
SouthWest     1098
Unknown        200
OOJ             33
FK ERROR        20
Name: Precinct, dtype: int64

In [39]:
# fixing Southwest precinct
df['Precinct'] = df['Precinct'].replace('SouthWest', 'Southwest')

In [40]:
drop_list = ['Unknown', 'OOJ', 'FK ERROR']
df = df[~df['Precinct'].isin(drop_list)]

In [41]:
df['Precinct'].value_counts()

West         11432
North        10385
NA            9859
East          6213
South         5653
Southwest     3418
Name: Precinct, dtype: int64

In [42]:
df['Sector'].value_counts()

NA        9865
E         2337
M         2270
N         2191
K         1762
K         1725
B         1658
L         1639
D         1512
R         1455
F         1378
S         1348
U         1302
M         1254
O         1161
D         1144
J         1119
G         1087
C         1037
Q          967
W          941
E          925
Q          794
N          694
F          669
R          616
O          598
B          518
S          476
U          453
G          442
W          428
L          416
J          395
C          384
Name: Sector, dtype: int64

In [43]:
df['Beat'].value_counts()

NA        9859
N3        1175
E2        1092
K3         964
M2         852
          ... 
C2          87
U3          85
J2          75
N1          73
99           2
Name: Beat, Length: 104, dtype: int64

In [44]:
# creating beat flag
df['Beat Flag'] = df['Beat']

df['Beat Flag'] = df['Beat Flag'].replace('NA', 0)
df['Beat Flag'] = df['Beat Flag'].map(lambda x: 1 if x!=0 else 0)

In [45]:
df['Beat Flag'].value_counts()

1    37101
0     9859
Name: Beat Flag, dtype: int64

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46960 entries, 0 to 47212
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         46960 non-null  object
 1   Subject ID                46960 non-null  int64 
 2   GO / SC Num               46960 non-null  int64 
 3   Terry Stop ID             46960 non-null  int64 
 4   Stop Resolution           46960 non-null  object
 5   Weapon Type               46960 non-null  object
 6   Officer ID                46960 non-null  object
 7   Officer YOB               46960 non-null  int64 
 8   Officer Gender            46960 non-null  object
 9   Officer Race              46960 non-null  object
 10  Subject Perceived Race    46960 non-null  object
 11  Subject Perceived Gender  46960 non-null  object
 12  Initial Call Type         46960 non-null  object
 13  Final Call Type           46960 non-null  object
 14  Call Type             

In [47]:
# creating and binarizing the target feature
df['Target'] = df['Stop Resolution']

df['Target'] = df['Target'].replace('Arrest', 1)
df['Target'] = df['Target'].map(lambda x: 0 if (x!=1) else 1)

In [48]:
# moving target to the front of the data frame
column_name = 'Target'
first_column = df.pop(column_name)
df.insert(0, column_name, first_column)

In [49]:
# checking columns
df.columns

Index(['Target', 'Subject Age Group', 'Subject ID', 'GO / SC Num',
       'Terry Stop ID', 'Stop Resolution', 'Weapon Type', 'Officer ID',
       'Officer YOB', 'Officer Gender', 'Officer Race',
       'Subject Perceived Race', 'Subject Perceived Gender',
       'Initial Call Type', 'Final Call Type', 'Call Type', 'Officer Squad',
       'Arrest Flag', 'Frisk Flag', 'Precinct', 'Sector', 'Beat',
       'Weapon Flag', 'Reported Year', 'Reported Month', 'Day of Month',
       'Day of Week', 'Reported Hour', 'Beat Flag'],
      dtype='object')

In [50]:
# dropping columns
df.drop(['Subject ID', 'GO / SC Num',
         'Terry Stop ID'], axis=1, inplace=True)

In [51]:
# path2 = os.path.join(gparent, 'data/processed', 'cleaned1.csv')
# df.to_csv(path2, index=False, na_rep='NA')