In [2]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from scipy.stats import ttest_ind, ks_2samp

In [4]:
os.listdir('Data/')

['.DS_Store',
 'MetroPopulation.csv',
 'PoliceDataInitiativeParticipants.csv',
 'PostKillingsData.csv']

In [24]:
# Load all data in dataframes.
metpop = pd.read_csv('Data/MetroPopulation.csv', thousands=',')
pdi = pd.read_csv('Data/PoliceDataInitiativeParticipants.csv', encoding='latin-1',)
killings = pd.read_csv('Data/PostKillingsData.csv')
officershootings = ['Fairfax', 'Los Angeles', 'Orlando', 'Atlanta',
                    'Bloomington', 'Louisville', 'Hartford', 'Austin',
                    'Henderson', 'Dallas', 'San Francisco', 'Indianapolis',
                    'Tuscon', 'Knoxville', 'Redondo Beach', 'Cincinnati',
                    'Philadelphia', 'Hampton']

In [25]:
metpop.head()

Unnamed: 0,Rank,Metropolitan Statistical Area,2010 Census,2000 Census,Growth (%)
0,1,"New York-Northern New Jersey-Long Island, NY-N...",19006798,18323002,3.7
1,2,"Los Angeles-Long Beach-Santa Ana, CA",12872808,12365627,4.1
2,3,"Chicago-Joliet-Naperville, IL-IN-WI",9569624,9098316,5.2
3,4,"Dallas-Fort Worth-Arlington, TX",6300006,5161544,22.1
4,5,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",5838471,5687147,2.7


In [26]:
pdi.head()

Unnamed: 0,PDI Participant
0,"Albuquerque, NM Police"
1,"Anchorage, AK Police"
2,"Atlanta, GA Police"
3,"Auburn, WA Police"
4,"Austin, TX Police"


In [27]:
killings.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [28]:
# Group killings by city.
kcount = killings.groupby('city')['id'].count().sort_values(ascending=False)

# Create standardized city column for pdi & metropolitan population.
pdi = pdi['PDI Participant'].apply(lambda x: pd.Series(x.split(',')))
pdi.columns = ['City', 'is_PDI']
pdi['is_PDI'] = 1

metpop['City'] = metpop['Metropolitan Statistical Area']
metpop['City'] = metpop['Metropolitan Statistical Area'].apply(lambda x: pd.Series(x.split(',')))
metpop['City'] = metpop['City'].apply(lambda x: x.split('-')[0])

In [29]:
# Merge all data.
result = pd.merge(metpop, pdi, on='City', how='outer')
result[['is_PDI']] = result[['is_PDI']].fillna(value=0)
result = result[np.isfinite(result['2010 Census'])]
result.set_index('City', inplace=True)
result['Killings'] = kcount

# Calculate per capita killings.
result['Killings'] = result['Killings'].fillna(0)
result['KillingsPer100Thou'] = result['Killings'] / (result['2010 Census'] / 100000)

In [30]:
result.head()

Unnamed: 0_level_0,Rank,Metropolitan Statistical Area,2010 Census,2000 Census,Growth (%),is_PDI,Killings,KillingsPer100Thou
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
New York,1.0,"New York-Northern New Jersey-Long Island, NY-N...",19006798.0,18323002.0,3.7,1.0,8.0,0.04209
Los Angeles,2.0,"Los Angeles-Long Beach-Santa Ana, CA",12872808.0,12365627.0,4.1,1.0,31.0,0.240818
Chicago,3.0,"Chicago-Joliet-Naperville, IL-IN-WI",9569624.0,9098316.0,5.2,0.0,21.0,0.219444
Dallas,4.0,"Dallas-Fort Worth-Arlington, TX",6300006.0,5161544.0,22.1,1.0,10.0,0.15873
Philadelphia,5.0,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",5838471.0,5687147.0,2.7,1.0,7.0,0.119894


In [31]:
# Split data based on participation in pdi.
nonpart = result[result['is_PDI'] == 0]
part = result[result['is_PDI'] == 1]

# Take large cities.
nonpartM = nonpart[nonpart['2010 Census'] > 1000000]
partM = part[part['2010 Census'] > 1000000]

# Officer shooting participation.
ospart = result[result.index.isin(officershootings)]
osnonpart = result[~result.index.isin(officershootings)]

# Officer shooting participation for only large cities.
osnonpartM = osnonpart[osnonpart['2010 Census'] > 1000000]
ospartM = ospart[ospart['2010 Census'] > 1000000]

# Officer shooting participation for only small cities.
osnonpartNM = osnonpart[osnonpart['2010 Census'] < 1000000]
ospartNM = ospart[ospart['2010 Census'] < 1000000]

# Statistical Tests.
# All data
print('All data (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(part['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(nonpart['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(nonpart['KillingsPer100Thou'], part['KillingsPer100Thou']))
print(ks_2samp(nonpart['KillingsPer100Thou'], part['KillingsPer100Thou']))

# Metro areas > 1M
print('\nMetro Areas with Pop > 1M (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(partM['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(nonpartM['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(nonpartM['KillingsPer100Thou'], partM['KillingsPer100Thou']))
print(ks_2samp(nonpartM['KillingsPer100Thou'], partM['KillingsPer100Thou']))

# Report officer involved shootings.
print('\nSpecifically Report Officer Involved Shootings (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(ospart['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(osnonpart['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(osnonpart['KillingsPer100Thou'], ospart['KillingsPer100Thou']))
print(ks_2samp(osnonpart['KillingsPer100Thou'], ospart['KillingsPer100Thou']))

# Report officer involved shootings metro areas > 1M
print('\nSpecifically Report Officer Involved Shootings, Metro Areas with Pop > 1M (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(ospartM['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(osnonpartM['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(osnonpartM['KillingsPer100Thou'], ospartM['KillingsPer100Thou']))
print(ks_2samp(osnonpartM['KillingsPer100Thou'], ospartM['KillingsPer100Thou']))

All data (Killings per 100 Thousand Residents):
Participants: Mean 0.36
Non-participants: Mean 0.39
Ttest_indResult(statistic=0.47499103280454535, pvalue=0.63583958474638469)
Ks_2sampResult(statistic=0.14452798663324984, pvalue=0.67057310434701256)

Metro Areas with Pop > 1M (Killings per 100 Thousand Residents):
Participants: Mean 0.31
Non-participants: Mean 0.33
Ttest_indResult(statistic=0.33506132708424402, pvalue=0.73898051601333448)
Ks_2sampResult(statistic=0.16897081413210446, pvalue=0.83188009534273188)

Specifically Report Officer Involved Shootings (Killings per 100 Thousand Residents):
Participants: Mean 0.31
Non-participants: Mean 0.39
Ttest_indResult(statistic=0.7531317525013369, pvalue=0.45315801687279988)
Ks_2sampResult(statistic=0.25151515151515158, pvalue=0.50174418964864875)

Specifically Report Officer Involved Shootings, Metro Areas with Pop > 1M (Killings per 100 Thousand Residents):
Participants: Mean 0.31
Non-participants: Mean 0.32
Ttest_indResult(statistic=0.119