In [26]:
import os
import pandas as pd
import numpy as np
import re

# Data Preprocessing

The data was taken from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4634878/

In [3]:
police_shootings = pd.read_csv('data/U.S. Police Shootings Data (Responses) - Responses.csv')

In [5]:
police_shootings.head()

Unnamed: 0,Timestamp,Date Searched,State,County,City,Agency Name,Victim Name,Victim's Age,Victim's Gender,Race,...,Name of Officer or Officers,Shootings,Was the Shooting Justified?,Receive Updates?,Name,Email Address,Twitter,Date of Incident,Results Page Number,Unnamed: 26
0,8/20/2014 12:06:49,10/15/1986,AZ - Arizona,maricopa,Phoenix,phoenix police,David Valenzuela,24.0,Male,,...,,,,,,,,,,
1,8/20/2014 12:09:29,10/15/1986,TX - Texas,Guadalupe,cibolo,cibolo police department,Kennen Marksbury,41.0,Male,White,...,,,,,,,,,,
2,8/20/2014 12:11:57,10/15/1986,NJ - New Jersey,morris,Mountain lakes,Mountain Lakes PD,Leonardo Parera,39.0,Male,White,...,,,,,,,,,,
3,8/20/2014 13:06:16,7/15/1995,TX - Texas,Harris,Houston,Bellaire Police Department,Travis O'Neill Allen,15.0,Male,White,...,Michael Leal,,,,,,,,,
4,8/20/2014 13:30:17,9/27/2003,OH - Ohio,Preble,West Alexandria,Preble County Emergency Services Unit,Clayton Helriggle,23.0,Male,White,...,Unknown,,,,,,,,,


In [4]:
police_shootings['datetime'] = pd.to_datetime(police_shootings['Timestamp'])

In [5]:
police_shootings['year'] = police_shootings['datetime'].apply(lambda x: x.year)

2014 has much more data then other years

In [6]:
police_shootings_2014 = police_shootings[police_shootings.year == 2014]

In [34]:
police_shootings_2014.head()

Unnamed: 0,Timestamp,Date Searched,State,County,City,Agency Name,Victim Name,Victim's Age,Victim's Gender,Race,...,Was the Shooting Justified?,Receive Updates?,Name,Email Address,Twitter,Date of Incident,Results Page Number,Unnamed: 26,datetime,year
0,8/20/2014 12:06:49,10/15/1986,AZ - Arizona,maricopa,Phoenix,phoenix police,David Valenzuela,24.0,Male,,...,,,,,,,,,2014-08-20 12:06:49,2014
1,8/20/2014 12:09:29,10/15/1986,TX - Texas,Guadalupe,cibolo,cibolo police department,Kennen Marksbury,41.0,Male,White,...,,,,,,,,,2014-08-20 12:09:29,2014
2,8/20/2014 12:11:57,10/15/1986,NJ - New Jersey,morris,Mountain lakes,Mountain Lakes PD,Leonardo Parera,39.0,Male,White,...,,,,,,,,,2014-08-20 12:11:57,2014
3,8/20/2014 13:06:16,7/15/1995,TX - Texas,Harris,Houston,Bellaire Police Department,Travis O'Neill Allen,15.0,Male,White,...,,,,,,,,,2014-08-20 13:06:16,2014
4,8/20/2014 13:30:17,9/27/2003,OH - Ohio,Preble,West Alexandria,Preble County Emergency Services Unit,Clayton Helriggle,23.0,Male,White,...,,,,,,,,,2014-08-20 13:30:17,2014


### Columns:

In [17]:
police_shootings.columns

Index(['Timestamp', 'Date Searched', 'State', 'County', 'City', 'Agency Name',
       'Victim Name', 'Victim's Age', 'Victim's Gender', 'Race',
       'Hispanic or Latino Origin', 'Shots Fired', 'Hit or Killed?',
       'Armed or Unarmed?', 'Weapon', 'Summary', 'Source Link',
       'Name of Officer or Officers', 'Shootings',
       'Was the Shooting Justified?', 'Receive Updates?', 'Name',
       'Email Address', 'Twitter', 'Date of Incident', 'Results Page Number',
       'Unnamed: 26'],
      dtype='object')

### Races:

In [12]:
police_shootings.Race.unique()

array([nan, 'White', 'Unknown', 'Black or African American', 'Asian',
       'American Indian or Alaska Native',
       'Native Hawaiian or Other Pacific Islander'], dtype=object)

In [18]:
police_shootings['Hispanic or Latino Origin'].unique()

array([nan, 'Not of Hispanic or Latino origin', 'Unknown',
       'Hispanic or Latino origin'], dtype=object)

## Justifiability by race

In [49]:
police_shootings_2014[police_shootings_2014.Race == 'White']['Was the Shooting Justified?'].mean()

3.2160815402038505

In [50]:
police_shootings_2014[police_shootings_2014.Race == 'Black or African American']['Was the Shooting Justified?'].mean()

2.629

In [51]:
police_shootings_2014[police_shootings_2014['Hispanic or Latino Origin'] == 'Hispanic or Latino origin']['Was the Shooting Justified?'].mean()

3.273125

In [34]:
df = police_shootings_2014

Unjustified ratio - what percent of the shootings was unjustified  
Black to white ratio - the black unjustifiable ratio compared to the white unjustifiable ratio

In [39]:
white = df[df.Race == 'White']
black = df[df.Race == 'Black or African American']
white_unjustified_ratio = (white['Was the Shooting Justified?']<3).sum()/white['Was the Shooting Justified?'].count()
black_unjustified_ratio = (black['Was the Shooting Justified?']<3).sum()/black['Was the Shooting Justified?'].count()

In [40]:
white_unjustified_ratio

0.3277463193657984

In [41]:
black_unjustified_ratio

0.471

In [42]:
black_unjustified_ratio/white_unjustified_ratio-1

0.43708707671043534

**Black people are 43% more prone to get shot unjustifiably**

# Correlation between income level and unjustifiable shooting

One can argue that black people have lower income and that the income level is the actual predictor to ujustifiable shooting.  This is not the case. There is no correlation between unjustifiable shooting and income level.

In [48]:
income = pd.read_html('https://en.wikipedia.org/wiki/List_of_United_States_counties_by_per_capita_income')

In [11]:
income_df = income[2]

In [12]:
income_df.head()

Unnamed: 0,Rank,County or county-equivalent,"State, federal district or territory",Per capitaincome,Medianhouseholdincome,Medianfamilyincome,Population,Number ofhouseholds
0,1,New York County,New York,"$62,498","$69,659","$84,627",1605272.0,736192.0
1,2,Arlington,Virginia,"$62,018","$103,208","$139,244",214861.0,94454.0
2,3,Falls Church City,Virginia,"$59,088","$120,000","$152,857",12731.0,5020.0
3,4,Marin,California,"$56,791","$90,839","$117,357",254643.0,102912.0
4,5,Alexandria City,Virginia,"$54,608","$85,706","$107,511",143684.0,65369.0


In [14]:
income_df = income_df.set_index('County or county-equivalent')

In [15]:
median_income = income_df['Medianhouseholdincome']

In [17]:
police_shootings_2014 = police_shootings_2014.merge(median_income, right_index=True, left_on='County')

In [30]:
police_shootings_2014['Medianhouseholdincomenumeric'] = pd.to_numeric(police_shootings_2014['Medianhouseholdincome'].apply(lambda x: re.sub('[\$,]','',x) if type(x)==str else 0))

In [31]:
police_shootings_2014['Was the Shooting Justified?'].corr(police_shootings_2014['Medianhouseholdincomenumeric'])

-0.0028343421835552913