In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# setting project path
import os
import sys

gparent = os.path.join(os.pardir, os.pardir)
sys.path.append(gparent)

# imports
import pandas as pd
import numpy as np
pd.options.display.max_rows = 300

## Background Information

This projects aims to create and train a predictive model on a dataset of Seattle Terry Stops Data. The initial data was obtain as a .csv file of 47,213 rows and 23 feature columns of Terry Stops Data.

Data set obtained from:
[Link](https://catalog.data.gov/dataset/terry-stops)

Data Columns Explanations:
[Link](https://data.seattle.gov/Public-Safety/Terry-Stops/28ny-9ts8)

## Data Cleaning

### Initial Steps
Loading the data, checking the info.

In [3]:
# creating path to data
path = os.path.join(gparent, 'data/raw', 'Terry_Stops.csv')
df = pd.read_csv(path)

In [4]:
# checking first few rows
df.head()

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,-,-1,20140000120677,92317,Arrest,,7500,1984,M,Black or African American,...,11:32:00,-,-,-,SOUTH PCT 1ST W - ROBERT,N,N,South,O,O2
1,-,-1,20150000001463,28806,Field Contact,,5670,1965,M,White,...,07:59:00,-,-,-,,N,N,-,-,-
2,-,-1,20150000001516,29599,Field Contact,,4844,1961,M,White,...,19:12:00,-,-,-,,N,-,-,-,-
3,-,-1,20150000001670,32260,Field Contact,,7539,1963,M,White,...,04:55:00,-,-,-,,N,N,-,-,-
4,-,-1,20150000001739,33155,Field Contact,,6973,1977,M,White,...,00:41:00,-,-,-,,N,N,-,-,-


In [5]:
#  checking info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47213 entries, 0 to 47212
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Subject Age Group         47213 non-null  object
 1   Subject ID                47213 non-null  int64 
 2   GO / SC Num               47213 non-null  int64 
 3   Terry Stop ID             47213 non-null  int64 
 4   Stop Resolution           47213 non-null  object
 5   Weapon Type               47213 non-null  object
 6   Officer ID                47213 non-null  object
 7   Officer YOB               47213 non-null  int64 
 8   Officer Gender            47213 non-null  object
 9   Officer Race              47213 non-null  object
 10  Subject Perceived Race    47213 non-null  object
 11  Subject Perceived Gender  47213 non-null  object
 12  Reported Date             47213 non-null  object
 13  Reported Time             47213 non-null  object
 14  Initial Call Type     

### Checking for Nulls

In [6]:
# checking for nulls
df.isna().sum()

Subject Age Group             0
Subject ID                    0
GO / SC Num                   0
Terry Stop ID                 0
Stop Resolution               0
Weapon Type                   0
Officer ID                    0
Officer YOB                   0
Officer Gender                0
Officer Race                  0
Subject Perceived Race        0
Subject Perceived Gender      0
Reported Date                 0
Reported Time                 0
Initial Call Type             0
Final Call Type               0
Call Type                     0
Officer Squad               604
Arrest Flag                   0
Frisk Flag                    0
Precinct                      0
Sector                        0
Beat                          0
dtype: int64

### Handling Nulls
It appears that there 604 null values, but there is also a placeholder value of `'-'` that we need to deal with as well. Replacing both with the string `'NA'`.

In [7]:
# filling nulls and replacing - values
df['Officer Squad'].fillna('NA', inplace=True)
df.replace('-', 'NA', inplace=True)

In [8]:
#  checking work
df.isna().sum()

Subject Age Group           0
Subject ID                  0
GO / SC Num                 0
Terry Stop ID               0
Stop Resolution             0
Weapon Type                 0
Officer ID                  0
Officer YOB                 0
Officer Gender              0
Officer Race                0
Subject Perceived Race      0
Subject Perceived Gender    0
Reported Date               0
Reported Time               0
Initial Call Type           0
Final Call Type             0
Call Type                   0
Officer Squad               0
Arrest Flag                 0
Frisk Flag                  0
Precinct                    0
Sector                      0
Beat                        0
dtype: int64

### Checking Feature Columns
Checking out the data, looking for any patterns & problems.

## Subject Age Group Value Counts

In [9]:
df['Subject Age Group'].value_counts()

26 - 35         15707
36 - 45         10017
18 - 25          9452
46 - 55          6111
56 and Above     2399
1 - 17           1978
NA               1549
Name: Subject Age Group, dtype: int64

## Subject ID Value Counts

In [10]:
df['Subject ID'].value_counts()

-1              34769
 15595726686       19
 7753260438        18
 7727117712        13
 8753759694        11
                ...  
 7729810778         1
 11891047764        1
 8627619147         1
 12120573251        1
 19740561406        1
Name: Subject ID, Length: 9169, dtype: int64

### Stop Resolution

In [11]:
# checking uniques
df['Stop Resolution'].unique()

array(['Arrest', 'Field Contact', 'Citation / Infraction',
       'Offense Report', 'Referred for Prosecution'], dtype=object)

### Creating the Target Feature
Creating a binary feature where a value of 1 means an arrest occurred and moving it to the front of the data frame. 

In [12]:
# creating and binarizing the target feature
df['Target'] = df['Stop Resolution']

df['Target'] = df['Target'].replace('Arrest', 1)
df['Target'] = df['Target'].map(lambda x: 0 if x!=1 else 1)

In [13]:
# moving target to the front of the data frame
column_name = 'Target'
first_column = df.pop(column_name)
df.insert(0, column_name, first_column)

In [14]:
# df['Target'].value_counts()

In [15]:
df['Target'].value_counts(normalize=True)

0    0.752886
1    0.247114
Name: Target, dtype: float64

## Analysis: 24% of Stops End in Arrest

### Creating a Weapons Flag
Creating a feature that flags if a weapon was found during the stop.

In [16]:
df['Weapon Type'].value_counts()

None                                    32565
NA                                      11855
Lethal Cutting Instrument                1482
Knife/Cutting/Stabbing Instrument         633
Handgun                                   291
Firearm Other                             100
Blunt Object/Striking Implement            86
Club, Blackjack, Brass Knuckles            49
Firearm                                    38
Mace/Pepper Spray                          28
Other Firearm                              23
Firearm (unk type)                         15
Taser/Stun Gun                             10
Club                                        9
Rifle                                       7
None/Not Applicable                         7
Fire/Incendiary Device                      6
Shotgun                                     3
Personal Weapons (hands, feet, etc.)        2
Automatic Handgun                           2
Brass Knuckles                              1
Blackjack                         

In [17]:
# creating weapons flag
df['Weapon Flag'] = df['Weapon Type']

In [18]:
df['Weapon Flag'].value_counts()

None                                    32565
NA                                      11855
Lethal Cutting Instrument                1482
Knife/Cutting/Stabbing Instrument         633
Handgun                                   291
Firearm Other                             100
Blunt Object/Striking Implement            86
Club, Blackjack, Brass Knuckles            49
Firearm                                    38
Mace/Pepper Spray                          28
Other Firearm                              23
Firearm (unk type)                         15
Taser/Stun Gun                             10
Club                                        9
Rifle                                       7
None/Not Applicable                         7
Fire/Incendiary Device                      6
Shotgun                                     3
Personal Weapons (hands, feet, etc.)        2
Automatic Handgun                           2
Brass Knuckles                              1
Blackjack                         

In [19]:
df['Weapon Flag'] = df['Weapon Flag'].replace(['None','NA'], 0)

In [20]:
df['Weapon Flag'] = df['Weapon Flag'].map(lambda x: 1 if x!=0 else 0)

In [21]:
df['Weapon Flag'].value_counts(normalize=True)

0    0.940843
1    0.059157
Name: Weapon Flag, dtype: float64

## Analysis: Weapons Found in 6% of Stops

### Officer ID

In [22]:
# checking uniques
df['Officer ID'].unique()

array(['7500  ', '5670  ', '4844  ', ..., '6237  ', '6983  ', '5917  '],
      dtype=object)

### Fixing the Officer ID Column
Stripping whitespaces from the IDs.

In [23]:
# stripping whitespaces
df['Officer ID']= df['Officer ID'].apply(lambda x: x.strip())

In [24]:
len(df['Officer ID'].unique())

1214

## Analysis: 1214 Officers Reported Stops

### Officer Race

In [25]:
#  checking uniques
df['Officer Race'].unique()

array(['Black or African American', 'White', 'Hispanic or Latino',
       'Asian', 'American Indian/Alaska Native', 'Two or More Races',
       'Not Specified', 'Nat Hawaiian/Oth Pac Islander', 'Unknown'],
      dtype=object)

### Renaming Officer Race Categories for Brevity

In [26]:
# renaming categories for brevity
officer_race_list = ['Black or African American', 'White', 'Hispanic or Latino',
       'Asian', 'American Indian/Alaska Native', 'Two or More Races',
       'Not Specified', 'Nat Hawaiian/Oth Pac Islander', 'Unknown']

officer_replace_list = race_list = ['Black', 'White', 'Hispanic',
       'Asian', 'N_American', 'Multi-Racial',
       'NA', 'P_Islander', 'Unknown']

df['Officer Race'] = df['Officer Race'].replace(officer_race_list, officer_replace_list)

In [27]:
df['Officer Race'].value_counts(normalize=True)

White           0.755110
Hispanic        0.058077
Multi-Racial    0.056171
Asian           0.042975
Black           0.039820
NA              0.030775
P_Islander      0.009616
N_American      0.006714
Unknown         0.000741
Name: Officer Race, dtype: float64

## Analysis: 76% of Officers Reporting Stops Were White 

### Officer Gender

In [28]:
# checking uniques
df['Officer Gender'].unique()

array(['M', 'F', 'N'], dtype=object)

In [29]:
df['Officer Gender'].value_counts(normalize=True)

M    0.885053
F    0.114333
N    0.000614
Name: Officer Gender, dtype: float64

## Analysis: 88% of Officers Were Male

### Subject Race

In [30]:
# checking uniques
df['Subject Perceived Race'].unique()

array(['Asian', 'NA', 'White', 'Black or African American', 'Other',
       'Unknown', 'American Indian or Alaska Native', 'Hispanic',
       'Multi-Racial', 'Native Hawaiian or Other Pacific Islander'],
      dtype=object)

### Renaming Subject Race Categories for Brevity
Renaming, checking value counts and proportions.

In [31]:
# renaming categories for brevity
race_list = ['Asian', 'NA', 'White', 'Black or African American', 'Other',
       'Unknown', 'American Indian or Alaska Native', 'Hispanic',
       'Multi-Racial', 'Native Hawaiian or Other Pacific Islander']

replace_list =['Asian', 'NA', 'White', 'Black', 'Other',
       'Unknown', 'N_American', 'Hispanic',
       'Multi-Racial', 'P_Islander']

df['Subject Perceived Race'] = df['Subject Perceived Race']\
                                 .replace(race_list, replace_list)

In [32]:
print(df['Subject Perceived Race'].value_counts())
df['Subject Perceived Race'].value_counts(normalize=True)

White           23052
Black           14046
Unknown          2626
NA               1893
Hispanic         1684
Asian            1536
N_American       1359
Multi-Racial      809
Other             152
P_Islander         56
Name: Subject Perceived Race, dtype: int64


White           0.488255
Black           0.297503
Unknown         0.055620
NA              0.040095
Hispanic        0.035668
Asian           0.032533
N_American      0.028784
Multi-Racial    0.017135
Other           0.003219
P_Islander      0.001186
Name: Subject Perceived Race, dtype: float64

## Analysis: Subject Racial Breakdown

- White 49%

- Black 30%

- Unknown 6%

- NA 4%

- Hispanic 4%

- Asian 3%

- Native American 3%

- Multi-Racial 2%

- Other .3%

- Pacific Islander .1%   

### Subject Gender

In [33]:
# checking uniques
df['Subject Perceived Gender'].unique()

array(['Male', 'NA', 'Female', 'Unable to Determine', 'Unknown',
       'Gender Diverse (gender non-conforming and/or transgender)'],
      dtype=object)

### Renaming Subject Gender Categories

In [34]:
# renaming categories for brevity
gender_list = ['Male', 'NA', 'Female', 'Unable to Determine', 'Unknown',
       'Gender Diverse (gender non-conforming and/or transgender)']

gender_replace = ['Male', 'NA', 'Female', 'Undetermined', 'Unknown',
       'Gender Diverse']

df['Subject Perceived Gender'] = df['Subject Perceived Gender'].replace(gender_list, gender_replace)

In [35]:
df['Subject Perceived Gender'].value_counts(normalize=True)

Male              0.783534
Female            0.202974
Undetermined      0.006905
NA                0.005931
Unknown           0.000530
Gender Diverse    0.000127
Name: Subject Perceived Gender, dtype: float64

## Analysis: 78% of Subjects Were Male

### Date & Time Features

In [36]:
# checking date and time columns
print(df['Reported Date'][:2])
df['Reported Time'][:2]

0    2015-10-16T00:00:00
1    2015-03-19T00:00:00
Name: Reported Date, dtype: object


0    11:32:00
1    07:59:00
Name: Reported Time, dtype: object

### Converting Date & Time Columns to Datetime Types

In [37]:
# converting date & time columns to datetime types
df = df.astype({'Reported Time':'datetime64', 'Reported Date':'datetime64'})

### Creating Year, Month, DOM, DOW, and Hour Features

In [38]:
#  getting year, month, day of month, day of week, and hour features
df['Reported Year'] = df['Reported Date'].dt.year
df['Reported Month'] = df['Reported Date'].dt.month
df['Day of Month'] = df['Reported Date'].dt.day

# Monday = 0, Sunday = 6
df['Day of Week'] = df['Reported Date'].dt.weekday
df['Reported Hour'] = df['Reported Time'].dt.hour

In [39]:
# checking work
df.sort_values(by='Reported Year').head(2)

Unnamed: 0,Target,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,...,Frisk Flag,Precinct,Sector,Beat,Weapon Flag,Reported Year,Reported Month,Day of Month,Day of Week,Reported Hour
0,1,,-1,20140000120677,92317,Arrest,,7500,1984,M,...,N,South,O,O2,0,2015,10,16,4,11
39233,0,46 - 55,-1,20150000218351,58128,Referred for Prosecution,,7459,1973,M,...,Y,North,B,B1,0,2015,6,28,6,5


In [40]:
# dropping date and time columns
df.drop(['Reported Date', 'Reported Time'], axis=1, inplace=True)

### Checking Call Type Feature Value Counts

### Initial Call Feature

In [41]:
df['Initial Call Type'].value_counts()

NA                                                    13162
SUSPICIOUS STOP - OFFICER INITIATED ONVIEW             3165
SUSPICIOUS PERSON, VEHICLE OR INCIDENT                 2994
DISTURBANCE, MISCELLANEOUS/OTHER                       2429
ASLT - IP/JO - WITH OR W/O WPNS (NO SHOOTINGS)         2007
TRESPASS                                               1903
THEFT (DOES NOT INCLUDE SHOPLIFT OR SVCS)              1398
SHOPLIFT - THEFT                                       1366
FIGHT - IP - PHYSICAL (NO WEAPONS)                     1282
WEAPN-IP/JO-GUN,DEADLY WPN (NO THRT/ASLT/DIST)         1150
ROBBERY - IP/JO (INCLUDES STRONG ARM)                   903
BURG - IP/JO - RES (INCL UNOCC STRUCTURES)              803
THREATS (INCLS IN-PERSON/BY PHONE/IN WRITING)           739
DIST - IP/JO - DV DIST - NO ASLT                        705
PROPERTY - DAMAGE                                       699
NARCOTICS - VIOLATIONS (LOITER, USE, SELL, NARS)        698
AUTO RECOVERY                           

In [42]:
df['Initial Call Type'].value_counts(normalize=True)

NA                                                    0.278779
SUSPICIOUS STOP - OFFICER INITIATED ONVIEW            0.067037
SUSPICIOUS PERSON, VEHICLE OR INCIDENT                0.063415
DISTURBANCE, MISCELLANEOUS/OTHER                      0.051448
ASLT - IP/JO - WITH OR W/O WPNS (NO SHOOTINGS)        0.042509
TRESPASS                                              0.040307
THEFT (DOES NOT INCLUDE SHOPLIFT OR SVCS)             0.029610
SHOPLIFT - THEFT                                      0.028933
FIGHT - IP - PHYSICAL (NO WEAPONS)                    0.027154
WEAPN-IP/JO-GUN,DEADLY WPN (NO THRT/ASLT/DIST)        0.024358
ROBBERY - IP/JO (INCLUDES STRONG ARM)                 0.019126
BURG - IP/JO - RES (INCL UNOCC STRUCTURES)            0.017008
THREATS (INCLS IN-PERSON/BY PHONE/IN WRITING)         0.015652
DIST - IP/JO - DV DIST - NO ASLT                      0.014932
PROPERTY - DAMAGE                                     0.014805
NARCOTICS - VIOLATIONS (LOITER, USE, SELL, NARS)      0

## Analysis `NA` is Largest Category at 28% 

### Final Call Feature

In [43]:
df['Final Call Type'].value_counts()[:25]

NA                                              13162
--SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON         3772
--PROWLER - TRESPASS                             3330
--DISTURBANCE - OTHER                            2719
--ASSAULTS, OTHER                                2297
--WARRANT SERVICES - FELONY                      1674
--DV - ARGUMENTS, DISTURBANCE (NO ARREST)        1303
--THEFT - SHOPLIFT                               1235
--DV - DOMESTIC VIOL/ASLT (ARREST MANDATORY)     1190
--ASSAULTS - HARASSMENT, THREATS                 1049
--WARRANT SERVICES - MISDEMEANOR                 1043
--NARCOTICS - OTHER                               897
--SUSPICIOUS CIRCUM. - SUSPICIOUS VEHICLE         896
--CRISIS COMPLAINT - GENERAL                      892
--THEFT - ALL OTHER                               801
--PROPERTY DEST (DAMG)                            737
--ROBBERY - STRONG ARM                            629
--TRAFFIC - D.U.I.                                574
--BURGLARY - NON RESIDENTIAL

In [44]:
df['Final Call Type'].value_counts(normalize=True)[:25]

NA                                              0.278779
--SUSPICIOUS CIRCUM. - SUSPICIOUS PERSON        0.079893
--PROWLER - TRESPASS                            0.070531
--DISTURBANCE - OTHER                           0.057590
--ASSAULTS, OTHER                               0.048652
--WARRANT SERVICES - FELONY                     0.035456
--DV - ARGUMENTS, DISTURBANCE (NO ARREST)       0.027598
--THEFT - SHOPLIFT                              0.026158
--DV - DOMESTIC VIOL/ASLT (ARREST MANDATORY)    0.025205
--ASSAULTS - HARASSMENT, THREATS                0.022218
--WARRANT SERVICES - MISDEMEANOR                0.022091
--NARCOTICS - OTHER                             0.018999
--SUSPICIOUS CIRCUM. - SUSPICIOUS VEHICLE       0.018978
--CRISIS COMPLAINT - GENERAL                    0.018893
--THEFT - ALL OTHER                             0.016966
--PROPERTY DEST (DAMG)                          0.015610
--ROBBERY - STRONG ARM                          0.013323
--TRAFFIC - D.U.I.             

## Analysis: `NA` is Largest Category at 28%

### Checking Call Origination Feature

In [45]:
print(df['Call Type'].value_counts())
df['Call Type'].value_counts(normalize=True)

911                              21284
NA                               13162
ONVIEW                            9110
TELEPHONE OTHER, NOT 911          3309
ALARM CALL (NOT POLICE ALARM)      340
TEXT MESSAGE                         7
SCHEDULED EVENT (RECURRING)          1
Name: Call Type, dtype: int64


911                              0.450808
NA                               0.278779
ONVIEW                           0.192955
TELEPHONE OTHER, NOT 911         0.070087
ALARM CALL (NOT POLICE ALARM)    0.007201
TEXT MESSAGE                     0.000148
SCHEDULED EVENT (RECURRING)      0.000021
Name: Call Type, dtype: float64

## Analysis: Call Type
- 45% of the stops began as 911 calls.
- 28% are of unknown origin.
- 19% were officer initiated based on officer observation. 

`Call Type` seems to be a potentially important feature moving forward.

### Officer Squad Feature

In [46]:
df['Officer Squad'].value_counts()

TRAINING - FIELD TRAINING SQUAD                        5098
WEST PCT 1ST W - DAVID/MARY                            1546
WEST PCT 2ND W - D/M RELIEF                            1020
SOUTHWEST PCT 2ND W - FRANK                             970
NORTH PCT 2ND WATCH - NORTH BEATS                       885
WEST PCT 1ST W - KING/QUEEN                             834
EAST PCT 3RD W - E/G RELIEF                             816
WEST PCT 3RD W - DAVID                                  816
WEST PCT 1ST W - KQ/DM RELIEF                           809
EAST PCT 1ST W - E/G RELIEF (CHARLIE)                   768
NORTH PCT 3RD W - B/N RELIEF                            758
EAST PCT 3RD W - EDWARD                                 748
WEST PCT 2ND W - KING                                   731
NORTH PCT 2ND W - NORA                                  718
WEST PCT 3RD W - KING                                   706
EAST PCT 2ND W - CHARLIE RELIEF                         675
WEST PCT 3RD W - MARY                   

In [47]:
df['Officer Squad'].value_counts(normalize=True)

TRAINING - FIELD TRAINING SQUAD                        0.107979
WEST PCT 1ST W - DAVID/MARY                            0.032745
WEST PCT 2ND W - D/M RELIEF                            0.021604
SOUTHWEST PCT 2ND W - FRANK                            0.020545
NORTH PCT 2ND WATCH - NORTH BEATS                      0.018745
WEST PCT 1ST W - KING/QUEEN                            0.017665
EAST PCT 3RD W - E/G RELIEF                            0.017283
WEST PCT 3RD W - DAVID                                 0.017283
WEST PCT 1ST W - KQ/DM RELIEF                          0.017135
EAST PCT 1ST W - E/G RELIEF (CHARLIE)                  0.016267
NORTH PCT 3RD W - B/N RELIEF                           0.016055
EAST PCT 3RD W - EDWARD                                0.015843
WEST PCT 2ND W - KING                                  0.015483
NORTH PCT 2ND W - NORA                                 0.015208
WEST PCT 3RD W - KING                                  0.014954
EAST PCT 2ND W - CHARLIE RELIEF         

### Focusing on the Training Squad
It seems that a fair number of stops come from the `TRAINING - FIELD TRAINING SQUAD`. Checking if there are more training squads.

In [48]:
squad = df['Officer Squad'].unique()
training = [x for x in squad if 'TRAINING' in x]

### Other Training Squads
Checking the number of stops from all of the training squads.

In [49]:
df['Officer Squad'].value_counts()[training]

TRAINING - FIELD TRAINING SQUAD          5098
TRAINING - LEARNING MANAGEMENT SYSTEM       1
TRAINING - ADVANCED - SQUAD C               1
Name: Officer Squad, dtype: int64

In [50]:
df['Officer Squad'].value_counts(normalize=True)[:100]

TRAINING - FIELD TRAINING SQUAD           0.107979
WEST PCT 1ST W - DAVID/MARY               0.032745
WEST PCT 2ND W - D/M RELIEF               0.021604
SOUTHWEST PCT 2ND W - FRANK               0.020545
NORTH PCT 2ND WATCH - NORTH BEATS         0.018745
WEST PCT 1ST W - KING/QUEEN               0.017665
EAST PCT 3RD W - E/G RELIEF               0.017283
WEST PCT 3RD W - DAVID                    0.017283
WEST PCT 1ST W - KQ/DM RELIEF             0.017135
EAST PCT 1ST W - E/G RELIEF (CHARLIE)     0.016267
NORTH PCT 3RD W - B/N RELIEF              0.016055
EAST PCT 3RD W - EDWARD                   0.015843
WEST PCT 2ND W - KING                     0.015483
NORTH PCT 2ND W - NORA                    0.015208
WEST PCT 3RD W - KING                     0.014954
EAST PCT 2ND W - CHARLIE RELIEF           0.014297
WEST PCT 3RD W - MARY                     0.014255
SOUTH PCT 1ST W - R/S RELIEF              0.014106
WEST PCT 2ND W - MARY BEATS               0.013365
WEST PCT 3RD W - QUEEN         

## Analysis: Training Accounts for 11% of Stops
This is three times more than the next category.

### Binarizing the Arrest Flag

In [51]:
df['Arrest Flag'].value_counts()

N    43754
Y     3459
Name: Arrest Flag, dtype: int64

In [52]:
#  binarizing arrest flag
df['Arrest Flag'] = df['Arrest Flag'].replace('Y', 1)
df['Arrest Flag'] = df['Arrest Flag'].map(lambda x: 0 if x!=1 else 1)

In [53]:
df['Arrest Flag'].value_counts()

0    43754
1     3459
Name: Arrest Flag, dtype: int64

In [54]:
df['Target'].value_counts()

0    35546
1    11667
Name: Target, dtype: int64

In [55]:
df['Arrest Flag'].value_counts(normalize=True)

0    0.926736
1    0.073264
Name: Arrest Flag, dtype: float64

In [56]:
df['Target'].value_counts(normalize=True)

0    0.752886
1    0.247114
Name: Target, dtype: float64

## Analysis: Discrepancy Between Arrest Flag and Target
Oddly, the arrest flag counts are much lower than arrest (target) counts.

## Dropping Arrest Flag

In [57]:
df.drop('Arrest Flag', axis=1, inplace=True)

### Binarizing the Frisk Flag

In [58]:
df['Frisk Flag'].value_counts()

N     36147
Y     10588
NA      478
Name: Frisk Flag, dtype: int64

In [59]:
#  binarizing frisk flag
df['Frisk Flag'] = df['Frisk Flag'].replace('Y', 1)
df['Frisk Flag'] = df['Frisk Flag'].map(lambda x: 0 if x!=1 else 1)

In [60]:
df['Frisk Flag'].value_counts()

0    36625
1    10588
Name: Frisk Flag, dtype: int64

In [61]:
df['Frisk Flag'].value_counts(normalize=True)

0    0.77574
1    0.22426
Name: Frisk Flag, dtype: float64

### Checking the Precinct Feature

In [62]:
df['Precinct'].unique()

array(['South', 'NA', 'East', 'North', 'West', 'Southwest', 'Unknown',
       'SouthWest', 'OOJ', 'FK ERROR'], dtype=object)

In [63]:
df['Precinct'].value_counts()

West         11432
North        10385
NA            9859
East          6213
South         5653
Southwest     2320
SouthWest     1098
Unknown        200
OOJ             33
FK ERROR        20
Name: Precinct, dtype: int64

In [64]:
# fixing Southwest precinct
df['Precinct'] = df['Precinct'].replace('SouthWest', 'Southwest')

In [65]:
drop_list = ['Unknown', 'OOJ', 'FK ERROR']
df = df[~df['Precinct'].isin(drop_list)]

In [66]:
df['Precinct'].value_counts()

West         11432
North        10385
NA            9859
East          6213
South         5653
Southwest     3418
Name: Precinct, dtype: int64

In [67]:
df['Precinct'].value_counts(normalize=True)

West         0.243441
North        0.221146
NA           0.209945
East         0.132304
South        0.120379
Southwest    0.072785
Name: Precinct, dtype: float64

## Analysis: `NA` is Third Largest Category at 20%

### Checking the Sector Feature

In [68]:
df['Sector'].value_counts()

NA        9865
E         2337
M         2270
N         2191
K         1762
K         1725
B         1658
L         1639
D         1512
R         1455
F         1378
S         1348
U         1302
M         1254
O         1161
D         1144
J         1119
G         1087
C         1037
Q          967
W          941
E          925
Q          794
N          694
F          669
R          616
O          598
B          518
S          476
U          453
G          442
W          428
L          416
J          395
C          384
Name: Sector, dtype: int64

In [69]:
df['Sector'].value_counts(normalize=True)

NA        0.210072
E         0.049766
M         0.048339
N         0.046657
K         0.037521
K         0.036733
B         0.035307
L         0.034902
D         0.032198
R         0.030984
F         0.029344
S         0.028705
U         0.027726
M         0.026704
O         0.024723
D         0.024361
J         0.023829
G         0.023147
C         0.022083
Q         0.020592
W         0.020038
E         0.019698
Q         0.016908
N         0.014779
F         0.014246
R         0.013118
O         0.012734
B         0.011031
S         0.010136
U         0.009647
G         0.009412
W         0.009114
L         0.008859
J         0.008411
C         0.008177
Name: Sector, dtype: float64

## Analysis: `NA` is Largest at 21%
Four times larger than the next largest category.

### Checking the Beat Feature

In [70]:
df['Beat'].value_counts()

NA        9859
N3        1175
E2        1092
K3         964
M2         852
M3         792
K3         752
E1         730
N2         709
R2         669
B1         643
U2         636
M1         626
F2         624
M3         592
B2         583
D1         576
L1         569
K2         562
L2         536
L3         534
S2         532
D2         530
E3         515
O1         491
S3         486
D1         464
D2         457
Q3         449
K1         448
J1         443
B3         432
G2         431
F3         422
M1         421
K2         410
U1         409
R1         406
D3         406
Q3         399
G3         386
W2         385
R3         380
C1         379
J3         376
C3         375
K1         351
O2         341
F1         332
S1         330
O3         329
W1         328
E2         322
E1         321
N3         320
Q2         314
N1         307
N2         301
J2         300
C2         283
E3         281
R2         275
O1         270
G1         270
U3         257
M2         245
F1        

In [71]:
df['Beat'].value_counts(normalize=True)

NA        0.209945
N3        0.025021
E2        0.023254
K3        0.020528
M2        0.018143
M3        0.016865
K3        0.016014
E1        0.015545
N2        0.015098
R2        0.014246
B1        0.013693
U2        0.013543
M1        0.013330
F2        0.013288
M3        0.012606
B2        0.012415
D1        0.012266
L1        0.012117
K2        0.011968
L2        0.011414
L3        0.011371
S2        0.011329
D2        0.011286
E3        0.010967
O1        0.010456
S3        0.010349
D1        0.009881
D2        0.009732
Q3        0.009561
K1        0.009540
J1        0.009434
B3        0.009199
G2        0.009178
F3        0.008986
M1        0.008965
K2        0.008731
U1        0.008710
R1        0.008646
D3        0.008646
Q3        0.008497
G3        0.008220
W2        0.008198
R3        0.008092
C1        0.008071
J3        0.008007
C3        0.007986
K1        0.007474
O2        0.007261
F1        0.007070
S1        0.007027
O3        0.007006
W1        0.006985
E2        0.

## Analysis: `NA` is largest Category at 21%
`NA` is over 8 times larger that the next largest category.

### Creating a Beat Flag
Creating a binary flag where 1 indicates that a beat was entered into the record.

In [72]:
# creating beat flag
df['Beat Flag'] = df['Beat']

df['Beat Flag'] = df['Beat Flag'].replace('NA', 0)
df['Beat Flag'] = df['Beat Flag'].map(lambda x: 1 if x!=0 else 0)

In [73]:
df['Beat Flag'].value_counts()

1    37101
0     9859
Name: Beat Flag, dtype: int64

In [74]:
df['Beat Flag'].value_counts(normalize=True)

1    0.790055
0    0.209945
Name: Beat Flag, dtype: float64

In [75]:
df.drop('Beat', axis=1, inplace=True)

## Analysis: 21% of Stops Have no Beat Information

## Beat, Sector & Precinct Have Similar Rates of Nulls 

## Dropping Sector

In [76]:
df.drop('Sector', axis=1, inplace=True)

### Checking Work So Far

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46960 entries, 0 to 47212
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Target                    46960 non-null  int64 
 1   Subject Age Group         46960 non-null  object
 2   Subject ID                46960 non-null  int64 
 3   GO / SC Num               46960 non-null  int64 
 4   Terry Stop ID             46960 non-null  int64 
 5   Stop Resolution           46960 non-null  object
 6   Weapon Type               46960 non-null  object
 7   Officer ID                46960 non-null  object
 8   Officer YOB               46960 non-null  int64 
 9   Officer Gender            46960 non-null  object
 10  Officer Race              46960 non-null  object
 11  Subject Perceived Race    46960 non-null  object
 12  Subject Perceived Gender  46960 non-null  object
 13  Initial Call Type         46960 non-null  object
 14  Final Call Type       

In [78]:
# checking columns
df.columns

Index(['Target', 'Subject Age Group', 'Subject ID', 'GO / SC Num',
       'Terry Stop ID', 'Stop Resolution', 'Weapon Type', 'Officer ID',
       'Officer YOB', 'Officer Gender', 'Officer Race',
       'Subject Perceived Race', 'Subject Perceived Gender',
       'Initial Call Type', 'Final Call Type', 'Call Type', 'Officer Squad',
       'Frisk Flag', 'Precinct', 'Weapon Flag', 'Reported Year',
       'Reported Month', 'Day of Month', 'Day of Week', 'Reported Hour',
       'Beat Flag'],
      dtype='object')

### Dropping Extraneous Columns

In [79]:
# dropping columns
df.drop(['Subject ID', 'GO / SC Num',
         'Terry Stop ID'], axis=1, inplace=True)

## Exporting Data for EDA 

### Stop Resolution Remix

In [80]:
# checking uniques
df['Stop Resolution'].value_counts().to_dict()

{'Field Contact': 18915,
 'Offense Report': 15534,
 'Arrest': 11611,
 'Referred for Prosecution': 722,
 'Citation / Infraction': 178}

In [81]:
remix = {'Field Contact': 0,
 'Offense Report': 1,
 'Arrest': 1,
 'Referred for Prosecution': 1,
 'Citation / Infraction': 0}

### Creating the  Remixed Target Feature
Creating a binary feature where a value of 1 means an major outcome (Offense Report, Arrest, Referred for Prosecution), a value of 0 means a minor outcome (Field Contact, Citation/Infraction), and then moving it to the front of the data frame. 

In [82]:
# creating and binarizing the target feature
df['Target Remixed'] = df['Stop Resolution']


df['Target Remixed'] = df['Target Remixed'].map(remix)

In [83]:
# moving target to the front of the data frame
column_name = 'Target Remixed'
first_column = df.pop(column_name)
df.insert(0, column_name, first_column)

In [84]:
df['Target Remixed'].value_counts()

1    27867
0    19093
Name: Target Remixed, dtype: int64

In [85]:
df['Target Remixed'].value_counts(normalize=True)

1    0.59342
0    0.40658
Name: Target Remixed, dtype: float64

In [86]:
df.head()

Unnamed: 0,Target Remixed,Target,Subject Age Group,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,...,Officer Squad,Frisk Flag,Precinct,Weapon Flag,Reported Year,Reported Month,Day of Month,Day of Week,Reported Hour,Beat Flag
0,1,1,,Arrest,,7500,1984,M,Black,Asian,...,SOUTH PCT 1ST W - ROBERT,0,South,0,2015,10,16,4,11,1
1,0,0,,Field Contact,,5670,1965,M,White,,...,,0,,0,2015,3,19,3,7,0
2,0,0,,Field Contact,,4844,1961,M,White,White,...,,0,,0,2015,3,21,5,19,0
3,0,0,,Field Contact,,7539,1963,M,White,,...,,0,,0,2015,4,1,2,4,0
4,0,0,,Field Contact,,6973,1977,M,White,Black,...,,0,,0,2015,4,3,4,0,0


In [87]:
# # cleaned data with stop resolution column for eda
# path2 = os.path.join(gparent, 'data/processed', 'eda.csv')
# df.to_csv(path2, index=False, na_rep='NA')

In [88]:
# # cleaned data with stop resolution column for eda
# path2 = os.path.join(gparent, 'data/processed', 'eda_remixed.csv')
# df.to_csv(path2, index=False, na_rep='NA')

## Dropping Stop Resolution 
`Stop Resolution` contains target feature data and must be dropped for modeling purposes.

In [89]:
df.drop('Stop Resolution', axis=1, inplace=True)

## Exporting Data for Modeling

In [90]:
# # cleaned data without stop resolution for modeling
# path2 = os.path.join(gparent, 'data/processed', 'modeling.csv')
# df.to_csv(path2, index=False, na_rep='NA')

In [91]:
df.drop('Target', axis=1, inplace=True)

In [92]:
# # cleaned data without stop resolution or original target for modeling
# path2 = os.path.join(gparent, 'data/processed', 'modeling_remixed.csv')
# df.to_csv(path2, index=False, na_rep='NA')