# Cleaning For EDA

## Read in data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Terry Stops export into the working frame.
# The path is relative, so it resolves against the current working directory.
raw_csv_path = 'CSV/Terry_Stops.csv'
df = pd.read_csv(raw_csv_path)

---

## Rename columns

In [3]:
# Raw CSV header -> short snake_case name.
# Prefix convention: subj_* for subject fields, off_* for officer fields.
column_renames = {
    'Subject Age Group': 'subj_age_group',
    'Subject ID': 'subj_id',
    'GO / SC Num': 'go_sc_num',
    'Terry Stop ID': 'terry_stop_id',
    'Stop Resolution': 'stop_resolution',
    'Weapon Type': 'weapon',
    'Officer ID': 'off_id',
    'Officer YOB': 'off_yob',
    'Officer Gender': 'off_gender',
    'Officer Race': 'off_race',
    'Subject Perceived Race': 'subj_perceived_race',
    'Subject Perceived Gender': 'subj_perceived_gender',
    'Reported Date': 'reported_date',
    'Reported Time': 'reported_time',
    'Initial Call Type': 'init_call_type',
    'Final Call Type': 'final_call_type',
    'Call Type': 'call_type',
    'Officer Squad': 'off_squad',
    'Arrest Flag': 'arrest_flag',
    'Frisk Flag': 'frisk_flag',
    'Precinct': 'precinct',
    'Sector': 'sector',
    'Beat': 'beat',
}
# Headers that are not listed in the mapping keep their original names.
df = df.rename(columns=column_renames)

---

### Change "-" values to N/A

In [4]:
# Swap the bare "-" placeholder for the string 'N/A' in every column.
# Only cells whose whole value is exactly "-" match; hyphens inside longer
# strings are untouched.
df = df.replace(to_replace='-', value='N/A')

### Change "-1" in Subject ID to N/A

In [5]:
# A Subject ID of -1 is rewritten to the string 'N/A'.
# Where that happens the column ends up holding a mix of integers and strings.
df['subj_id'] = df['subj_id'].replace(to_replace=-1, value='N/A')

### Change Officer Squad values to general precinct/squad

In [6]:
# Cut each squad label at the first " -" (space followed by a hyphen) and
# keep only the text before it.
squad_suffix_pattern = ' -.*'
df['off_squad'] = df['off_squad'].replace(
    to_replace=squad_suffix_pattern,
    value='',
    regex=True,
)

### Change Gender value of "N" to N/A

In [7]:
# Officer gender "N" is recoded to the notebook's 'N/A' placeholder.
df['off_gender'] = df['off_gender'].replace(to_replace='N', value='N/A')

### Format Date of Report and Create Officer's Age Column

In [8]:
# Parse the report timestamp. pd.to_datetime understands the ISO 8601 "T"
# date/time separator directly, so no string pre-processing is required.
reported_at = pd.to_datetime(df['reported_date'])

# Split the report date into separate year / month / day columns.
df['reported_year'] = reported_at.dt.year
df['reported_month'] = reported_at.dt.month
df['reported_day'] = reported_at.dt.day

# Hour of the report: the first two characters of the time string, kept as
# text (assumes times look like "HH:MM:SS" -- TODO confirm against the raw data).
# The .str accessor leaves a missing time as NaN, where slicing each value in
# a Python loop would raise on it.
df['reported_hour'] = df['reported_time'].str[:2]

# Officer's age at the time of the report = report year - officer's year of birth.
df['off_age'] = df['reported_year'] - df['off_yob']

# The raw date, time and year-of-birth columns are now redundant; drop them.
df = df.drop(columns=['reported_date', 'reported_time', 'off_yob'])

---

### Strip whitespace from all text (object-dtype) columns, including beat

In [9]:
# Trim leading/trailing whitespace from every string value in the
# object-dtype columns; all other columns pass through untouched.
#
# Each element is stripped individually rather than via Series.str.strip(),
# because .str.strip() returns NaN for any non-string element. That would wipe
# the integer IDs in mixed columns such as subj_id, which holds both integers
# and the string 'N/A' after the earlier -1 replacement.
df = df.apply(
    lambda col: col.map(lambda value: value.strip() if isinstance(value, str) else value)
    if col.dtype == "object"
    else col
)

---

### Categorize weapons ordinally

In [10]:
# Collapse the detailed weapon descriptions into a few broad groups
# (None, Blade, Firearm, Blunt Object, Non-Lethal).
#
# The mapping is applied in a single pass to the weapon column only, so an
# identical string in any other column is left alone. Values not listed here
# (including ones already equal to a group name, e.g. 'Firearm') are unchanged.
#
# NOTE(review): the notebook's earlier DataFrame-wide "-" -> 'N/A' replacement
# runs before this cell, so the '-' key below can only match values that became
# "-" after whitespace stripping. Weapons originally recorded as "-" therefore
# stay 'N/A' rather than 'None' -- confirm that this is the intended result.
weapon_groups = {
    '-': 'None',
    'None/Not Applicable': 'None',
    'Lethal Cutting Instrument': 'Blade',
    'Knife/Cutting/Stabbing Instrument': 'Blade',
    'Firearm Other': 'Firearm',
    'Handgun': 'Firearm',
    'Rifle': 'Firearm',
    'Firearm (unk type)': 'Firearm',
    'Other Firearm': 'Firearm',
    'Automatic Handgun': 'Firearm',
    'Shotgun': 'Firearm',
    'Club, Blackjack, Brass Knuckles': 'Blunt Object',
    'Club': 'Blunt Object',
    'Blunt Object/Striking Implement': 'Blunt Object',
    'Brass Knuckles': 'Blunt Object',
    'Blackjack': 'Blunt Object',
    'Fire/Incendiary Device': 'Non-Lethal',
    'Mace/Pepper Spray': 'Non-Lethal',
    'Taser/Stun Gun': 'Non-Lethal',
}
df['weapon'] = df['weapon'].replace(weapon_groups)

---

### Change officer race from "Unknown" and "Not Specified" to "N/A"

In [11]:
# Fold both "no information" officer-race labels into the 'N/A' placeholder.
unspecified_officer_races = ['Unknown', 'Not Specified']
df['off_race'] = df['off_race'].replace(to_replace=unspecified_officer_races, value='N/A')

### Change subject's perceived race from "Unknown" and "Other" to "N/A" 

In [12]:
# Subjects whose perceived race is recorded as "Unknown" or "Other" are
# recoded to the 'N/A' placeholder.
unspecified_subject_races = ['Unknown', 'Other']
df['subj_perceived_race'] = df['subj_perceived_race'].replace(
    to_replace=unspecified_subject_races,
    value='N/A',
)

### Change subject's perceived gender from "Unable to Determine", "Unknown", and "Gender Diverse (gender non-conforming and/or transgender)" to "N/A"

The "gender diverse" label can encompass too wide a range of identities to be useful as a single category. For example, transgender is not the same as intersex, yet both would fall under the same category. Also, someone who identifies as a woman could be mislabeled as gender diverse if they do not have a say in how they are identified.

In [13]:
# Perceived-gender labels that are recoded to the 'N/A' placeholder.
unspecified_subject_genders = [
    'Unable to Determine',
    'Unknown',
    'Gender Diverse (gender non-conforming and/or transgender)',
]
df['subj_perceived_gender'] = df['subj_perceived_gender'].replace(
    to_replace=unspecified_subject_genders,
    value='N/A',
)

## Combine beats to account for general geographical area, except for H1 and H2/H3 as they are on different sides of the map

In [14]:
# Keep the original fine-grained beat code before coarsening it.
df['specific_beat'] = df['beat']

# Beats "<letter>1", "<letter>2" and "<letter>3" collapse to the bare letter
# for each letter listed below. Any other beat value (the H beats, for
# instance) is not in the mapping and is left exactly as it is.
merged_beat_letters = 'BCDEFGJKLMNOQRSUW'
beat_to_area = {
    letter + digit: letter
    for letter in merged_beat_letters
    for digit in '123'
}
df['beat'] = df['beat'].replace(beat_to_area)

---

### Convert to csv

In [15]:
# Write the cleaned frame to disk for the EDA step; the row index is omitted.
# NOTE(review): pandas.read_csv treats the string "N/A" as missing by default,
# so the placeholders written here may load back as NaN unless the reader
# passes keep_default_na=False -- confirm how the downstream notebook reads it.
output_csv_path = 'terry_stops_eda.csv'
df.to_csv(output_csv_path, index=False)