In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each cell runs.
%load_ext autoreload
%autoreload 2

In [3]:
# setting project path
import os
import sys

# Relative path two levels above the working directory (presumably the project root).
gparent = os.path.join(os.pardir, os.pardir)
# Guard the append so that re-running this cell does not stack duplicate entries.
if gparent not in sys.path:
    sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np


# setting style
# sns.set_theme('talk')
# plt.style.use('fivethirtyeight')
# sns.set_palette(palette='Blues_r')

## Background Information

This project aims to create and train a predictive model on the Seattle Terry Stops dataset. The initial data was obtained as a .csv file with 47,213 rows and 23 columns.

Data set obtained from:
[Link](https://catalog.data.gov/dataset/terry-stops)

Data Columns Explanations:
[Link](https://data.seattle.gov/Public-Safety/Terry-Stops/28ny-9ts8)

## Data Cleaning

### Initial Steps
Loading the data, checking the info.

In [4]:
# Assemble the raw CSV location under the project directory, then load it.
raw_csv_parts = (gparent, 'data/raw', 'Terry_Stops.csv')
path = os.path.join(*raw_csv_parts)
df = pd.read_csv(filepath_or_buffer=path)

In [5]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,-,-1,20140000120677,92317,Arrest,,7500,1984,M,Black or African American,...,11:32:00,-,-,-,SOUTH PCT 1ST W - ROBERT,N,N,South,O,O2
1,-,-1,20150000001463,28806,Field Contact,,5670,1965,M,White,...,07:59:00,-,-,-,,N,N,-,-,-
2,-,-1,20150000001516,29599,Field Contact,,4844,1961,M,White,...,19:12:00,-,-,-,,N,-,-,-,-
3,-,-1,20150000001670,32260,Field Contact,,7539,1963,M,White,...,04:55:00,-,-,-,,N,N,-,-,-
4,-,-1,20150000001739,33155,Field Contact,,6973,1977,M,White,...,00:41:00,-,-,-,,N,N,-,-,-


In [6]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47213 entries, 0 to 47212
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         47213 non-null  object
 1   Subject ID                47213 non-null  int64 
 2   GO / SC Num               47213 non-null  int64 
 3   Terry Stop ID             47213 non-null  int64 
 4   Stop Resolution           47213 non-null  object
 5   Weapon Type               47213 non-null  object
 6   Officer ID                47213 non-null  object
 7   Officer YOB               47213 non-null  int64 
 8   Officer Gender            47213 non-null  object
 9   Officer Race              47213 non-null  object
 10  Subject Perceived Race    47213 non-null  object
 11  Subject Perceived Gender  47213 non-null  object
 12  Reported Date             47213 non-null  object
 13  Reported Time             47213 non-null  object
 14  Initial Call Type     

## Checking for Nulls

In [7]:
# Count missing (NaN) values per column.
df.isna().sum()

Subject Age Group             0
Subject ID                    0
GO / SC Num                   0
Terry Stop ID                 0
Stop Resolution               0
Weapon Type                   0
Officer ID                    0
Officer YOB                   0
Officer Gender                0
Officer Race                  0
Subject Perceived Race        0
Subject Perceived Gender      0
Reported Date                 0
Reported Time                 0
Initial Call Type             0
Final Call Type               0
Call Type                     0
Officer Squad               604
Arrest Flag                   0
Frisk Flag                    0
Precinct                      0
Sector                        0
Beat                          0
dtype: int64

## Handling Nulls
It appears that there are 604 null values (all in `Officer Squad`), but there is also a placeholder value of `'-'` that we need to deal with as well. Replacing both with the string `'NA'`.

In [8]:
# Fill the missing squad names and normalise the '-' placeholder to 'NA'.
# The result is assigned back instead of using `inplace=True`: an in-place fillna on
# the Series returned by df['Officer Squad'] acts on an intermediate object, which
# pandas 2.2 warns about and which does not update df under copy-on-write.
df['Officer Squad'] = df['Officer Squad'].fillna('NA')
# Only cells whose whole value is '-' are replaced; values that merely contain a
# dash (e.g. squad names) are left untouched.
df = df.replace('-', 'NA')

In [9]:
# Re-count NaNs per column to confirm the fill left none behind.
df.isna().sum()

Subject Age Group           0
Subject ID                  0
GO / SC Num                 0
Terry Stop ID               0
Stop Resolution             0
Weapon Type                 0
Officer ID                  0
Officer YOB                 0
Officer Gender              0
Officer Race                0
Subject Perceived Race      0
Subject Perceived Gender    0
Reported Date               0
Reported Time               0
Initial Call Type           0
Final Call Type             0
Call Type                   0
Officer Squad               0
Arrest Flag                 0
Frisk Flag                  0
Precinct                    0
Sector                      0
Beat                        0
dtype: int64

## Checking Feature Columns
Checking out the data, looking for any patterns & problems.

### Stop Resolution

In [10]:
# List the distinct stop outcomes; 'Arrest' is the value the target is built from.
df['Stop Resolution'].unique()

array(['Arrest', 'Field Contact', 'Citation / Infraction',
       'Offense Report', 'Referred for Prosecution'], dtype=object)

## Creating the Target Feature
Creating a binary feature where a value of 1 means an arrest occurred and moving it to the front of the data frame. 

In [11]:
# Binary target: 1 where the stop resolution is 'Arrest', 0 for every other value.
resolution_coded = df['Stop Resolution'].replace({'Arrest': 1})
df['Target'] = resolution_coded.eq(1).astype('int64')

In [12]:
# Move the target to the front of the data frame:
# pop removes the column and returns it, insert puts it back at position 0.
df.insert(0, 'Target', df.pop('Target'))

### Weapon Type

In [13]:
# List the distinct weapon labels to see which ones mean "no weapon found".
df['Weapon Type'].unique()

array(['None', 'Firearm Other', 'Lethal Cutting Instrument', 'Handgun',
       'NA', 'Club, Blackjack, Brass Knuckles',
       'Knife/Cutting/Stabbing Instrument', 'Other Firearm', 'Rifle',
       'Fire/Incendiary Device', 'Blunt Object/Striking Implement',
       'Firearm (unk type)', 'Mace/Pepper Spray', 'None/Not Applicable',
       'Club', 'Firearm', 'Taser/Stun Gun', 'Shotgun', 'Brass Knuckles',
       'Automatic Handgun', 'Blackjack',
       'Personal Weapons (hands, feet, etc.)'], dtype=object)

## Creating a Weapons Flag
Creating a feature that flags if a weapon was found during the stop.

In [14]:
# Weapon Flag: 0 when Weapon Type is one of the "no weapon" labels, 1 otherwise.
no_weapon_codes = dict.fromkeys(['None', 'NA', 'None/Not Applicable'], 0)
weapon_coded = df['Weapon Type'].replace(no_weapon_codes)
df['Weapon Flag'] = weapon_coded.ne(0).astype('int64')

### Officer ID

In [15]:
# List the distinct officer IDs (stored as strings) to spot formatting problems.
df['Officer ID'].unique()

array(['7500  ', '5670  ', '4844  ', ..., '6237  ', '6983  ', '5917  '],
      dtype=object)

## Fixing the Officer ID Column
Stripping whitespaces from the IDs.

In [16]:
# Strip the padding whitespace around each officer ID (raw values look like '7500  ').
# The vectorised .str accessor replaces the per-row lambda and passes missing values
# through as NaN instead of raising AttributeError.
df['Officer ID'] = df['Officer ID'].str.strip()

### Officer Race

In [17]:
# List the distinct officer race labels before shortening them.
df['Officer Race'].unique()

array(['Black or African American', 'White', 'Hispanic or Latino',
       'Asian', 'American Indian/Alaska Native', 'Two or More Races',
       'Not Specified', 'Nat Hawaiian/Oth Pac Islander', 'Unknown'],
      dtype=object)

## Renaming Officer Race Categories for Brevity

In [18]:
# renaming categories for brevity
# Long labels exactly as they appear in the raw 'Officer Race' column.
officer_race_list = ['Black or African American', 'White', 'Hispanic or Latino',
       'Asian', 'American Indian/Alaska Native', 'Two or More Races',
       'Not Specified', 'Nat Hawaiian/Oth Pac Islander', 'Unknown']

# Short labels, position-aligned with officer_race_list.
# The stray `race_list =` alias was removed: it bound the subject-race name to this
# officer replacement list, and nothing read it before it was redefined.
officer_replace_list = ['Black', 'White', 'Hispanic',
       'Asian', 'N_American', 'Multi-Racial',
       'NA', 'P_Islander', 'Unknown']

# Pair each long label with its short form and apply the mapping in one pass.
officer_race_map = dict(zip(officer_race_list, officer_replace_list))
df['Officer Race'] = df['Officer Race'].replace(officer_race_map)

### Officer Gender

In [19]:
# List the distinct officer gender codes.
df['Officer Gender'].unique()

array(['M', 'F', 'N'], dtype=object)

### Subject Race

In [20]:
# List the distinct perceived-race labels recorded for subjects.
df['Subject Perceived Race'].unique()

array(['Asian', 'NA', 'White', 'Black or African American', 'Other',
       'Unknown', 'American Indian or Alaska Native', 'Hispanic',
       'Multi-Racial', 'Native Hawaiian or Other Pacific Islander'],
      dtype=object)

## Renaming Subject Race Categories for Brevity
Renaming, checking value counts and proportions.

In [21]:
# Shorten the subject race labels.
# Long labels as they appear in the column.
race_list = ['Asian', 'NA', 'White', 'Black or African American', 'Other',
       'Unknown', 'American Indian or Alaska Native', 'Hispanic',
       'Multi-Racial', 'Native Hawaiian or Other Pacific Islander']

# Short labels, position-aligned with race_list.
replace_list =['Asian', 'NA', 'White', 'Black', 'Other',
       'Unknown', 'N_American', 'Hispanic',
       'Multi-Racial', 'P_Islander']

# Pair old and new labels, then apply the mapping.
subject_race_map = dict(zip(race_list, replace_list))
df['Subject Perceived Race'] = df['Subject Perceived Race'].replace(subject_race_map)

In [22]:
# Show absolute counts (printed) and then proportions (cell output) per race label.
subject_race = df['Subject Perceived Race']
print(subject_race.value_counts())
subject_race.value_counts(normalize=True)

White           23052
Black           14046
Unknown          2626
NA               1893
Hispanic         1684
Asian            1536
N_American       1359
Multi-Racial      809
Other             152
P_Islander         56
Name: Subject Perceived Race, dtype: int64


White           0.488255
Black           0.297503
Unknown         0.055620
NA              0.040095
Hispanic        0.035668
Asian           0.032533
N_American      0.028784
Multi-Racial    0.017135
Other           0.003219
P_Islander      0.001186
Name: Subject Perceived Race, dtype: float64

### Subject Gender

In [30]:
# List the distinct perceived-gender labels recorded for subjects.
df['Subject Perceived Gender'].unique()

array(['Male', 'NA', 'Female', 'Unable to Determine', 'Unknown',
       'Gender Diverse (gender non-conforming and/or transgender)'],
      dtype=object)

## Renaming Subject Gender Categories

In [None]:
# Shorten the subject gender labels.
# Long labels as they appear in the column.
gender_list = ['Male', 'NA', 'Female', 'Unable to Determine', 'Unknown',
       'Gender Diverse (gender non-conforming and/or transgender)']

# Short labels, position-aligned with gender_list.
gender_replace = ['Male', 'NA', 'Female', 'Undetermined', 'Unknown',
       'Gender Diverse']

# Pair old and new labels, then apply the mapping.
gender_map = dict(zip(gender_list, gender_replace))
df['Subject Perceived Gender'] = df['Subject Perceived Gender'].replace(gender_map)

### Date & Time Features

In [32]:
# Look at the first two raw values of the date and time columns.
print(df['Reported Date'].head(2))
df['Reported Time'].head(2)

0    2015-10-16T00:00:00
1    2015-03-19T00:00:00
Name: Reported Date, dtype: object


0    11:32:00
1    07:59:00
Name: Reported Time, dtype: object

## Converting Date & Time Columns to Datetime Types

In [33]:
# converting date & time columns to datetime types
# pd.to_datetime is used instead of astype('datetime64'): the unit-less 'datetime64'
# dtype string is rejected by pandas >= 2.0.
df['Reported Date'] = pd.to_datetime(df['Reported Date'])
# The time column holds time-only strings ('HH:MM:SS'). With an explicit format they
# are parsed onto the 1900-01-01 placeholder date, so only the time-of-day part is
# meaningful.
df['Reported Time'] = pd.to_datetime(df['Reported Time'], format='%H:%M:%S')

## Creating Year, Month, DOM, DOW, and Hour Features

In [None]:
# Derive calendar features from the parsed date and time columns.
reported_date = df['Reported Date'].dt
df['Reported Year'] = reported_date.year
df['Reported Month'] = reported_date.month
df['Day of Month'] = reported_date.day

# dayofweek numbering: Monday = 0, Sunday = 6
df['Day of Week'] = reported_date.dayofweek
df['Reported Hour'] = df['Reported Time'].dt.hour

In [None]:
# Spot-check the derived features on the first two rows after sorting by Reported Year (ascending).
df.sort_values(by='Reported Year').head(2)

In [None]:
# The raw date and time columns are no longer needed once the features are derived.
df = df.drop(columns=['Reported Date', 'Reported Time'])

### Checking Call Type Value Counts

In [34]:
# Frequency of each initial call type, most common first.
df['Initial Call Type'].value_counts()

NA                                                13162
SUSPICIOUS STOP - OFFICER INITIATED ONVIEW         3165
SUSPICIOUS PERSON, VEHICLE OR INCIDENT             2994
DISTURBANCE, MISCELLANEOUS/OTHER                   2429
ASLT - IP/JO - WITH OR W/O WPNS (NO SHOOTINGS)     2007
                                                  ...  
VICE - PORNOGRAPHY                                    1
DEMONSTRATIONS                                        1
ORDER - ASSIST DV VIC W/SRVC OF COURT ORDER           1
ALARM - RESIDENTIAL - SILENT/AUD PANIC/DURESS         1
ANIMAL, REPORT - BITE                                 1
Name: Initial Call Type, Length: 167, dtype: int64

In [35]:
# Frequency of each final call type, most common first.
df['Final Call Type'].value_counts()

NA                                                   13162
--SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON              3772
--PROWLER - TRESPASS                                  3330
--DISTURBANCE - OTHER                                 2719
--ASSAULTS, OTHER                                     2297
                                                     ...  
BIAS -RACIAL, POLITICAL, SEXUAL MOTIVATION               1
MVC - REPORT, NON INJ/NON BLKG OR AFTER FACT INJ         1
--COMMERCIAL SEXUAL EXPLOITATION OF MINORS (CSEC)        1
NARCOTICS WARRANT SERVICE                                1
ORDER - VIOLATING DV COURT ORDER                         1
Name: Final Call Type, Length: 207, dtype: int64

## Checking Call Origination Feature

In [38]:
# Show absolute counts (printed) and then proportions (cell output) per call origin.
call_origin = df['Call Type']
print(call_origin.value_counts())
call_origin.value_counts(normalize=True)

911                              21284
NA                               13162
ONVIEW                            9110
TELEPHONE OTHER, NOT 911          3309
ALARM CALL (NOT POLICE ALARM)      340
TEXT MESSAGE                         7
SCHEDULED EVENT (RECURRING)          1
Name: Call Type, dtype: int64


911                              0.450808
NA                               0.278779
ONVIEW                           0.192955
TELEPHONE OTHER, NOT 911         0.070087
ALARM CALL (NOT POLICE ALARM)    0.007201
TEXT MESSAGE                     0.000148
SCHEDULED EVENT (RECURRING)      0.000021
Name: Call Type, dtype: float64

## Call Type Analysis
- 45% of the calls came in as 911 calls.
- 28% are of unknown origin.
- 19% of the stops were based on officer observation. 

`Call Type` seems to be a potentially important feature moving forward.

In [None]:
# Frequency of each officer squad label (missing squads were filled with 'NA' earlier).
df['Officer Squad'].value_counts()

In [None]:
# Distinct officer squad labels.
df['Officer Squad'].unique()

In [None]:
# Distribution of the arrest flag values before binarizing.
df['Arrest Flag'].value_counts()

In [None]:
# Binarize the arrest flag: 'Y' becomes 1, every other value becomes 0.
arrest_coded = df['Arrest Flag'].replace({'Y': 1})
df['Arrest Flag'] = arrest_coded.eq(1).astype('int64')

In [None]:
# Confirm the arrest flag now holds only 0/1 and check the class balance.
df['Arrest Flag'].value_counts()

In [None]:
# Distribution of the frisk flag values before binarizing.
df['Frisk Flag'].value_counts()

In [None]:
# Binarize the frisk flag: 'Y' becomes 1, every other value (including 'NA') becomes 0.
frisk_coded = df['Frisk Flag'].replace({'Y': 1})
df['Frisk Flag'] = frisk_coded.eq(1).astype('int64')

In [None]:
# Confirm the frisk flag now holds only 0/1 and check the class balance.
df['Frisk Flag'].value_counts()

In [None]:
# Distinct precinct labels, to spot inconsistent spellings.
df['Precinct'].unique()

In [None]:
# Frequency of each precinct label before any clean-up.
df['Precinct'].value_counts()

In [None]:
# Merge the 'SouthWest' spelling variant into 'Southwest'.
precinct_fixes = {'SouthWest': 'Southwest'}
df['Precinct'] = df['Precinct'].replace(precinct_fixes)

In [None]:
# Rows whose precinct is one of these labels are discarded.
drop_list = ['Unknown', 'OOJ', 'FK ERROR']
keep_rows = ~df['Precinct'].isin(drop_list)
df = df.loc[keep_rows]

In [None]:
# Confirm the dropped precinct labels are gone and the spelling fix was applied.
df['Precinct'].value_counts()

In [None]:
# Frequency of each sector label.
df['Sector'].value_counts()

In [None]:
# Frequency of each beat label ('NA' marks rows without a recorded beat).
df['Beat'].value_counts()

In [None]:
# Beat Flag: 0 when Beat is the 'NA' placeholder, 1 when a beat was recorded.
beat_coded = df['Beat'].replace({'NA': 0})
df['Beat Flag'] = beat_coded.ne(0).astype('int64')

In [None]:
# Count rows with (1) and without (0) a recorded beat.
df['Beat Flag'].value_counts()

In [None]:
# Re-inspect columns, non-null counts and dtypes after cleaning and feature creation.
df.info()

In [None]:
# List the current column names before dropping the identifier columns.
df.columns

In [None]:
# Remove the identifier columns from the frame.
identifier_columns = ['Subject ID', 'GO / SC Num', 'Terry Stop ID']
df = df.drop(columns=identifier_columns)

In [None]:
# Optional export of the cleaned frame (left disabled).
# NOTE(review): 'NA' is one of pandas' default NaN markers, so reading this file back
# with pd.read_csv will turn the 'NA' placeholders into NaN unless keep_default_na=False
# (or an explicit na_values) is passed.
# path2 = os.path.join(gparent, 'data/processed', 'cleaned1.csv')
# df.to_csv(path2, index=False, na_rep='NA')