In [2]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from scipy.stats import ttest_ind, ks_2samp

In [4]:
os.listdir('Data/')

['.DS_Store',
 'MetroPopulation.csv',
 'PoliceDataInitiativeParticipants.csv',
 'PostKillingsData.csv']

In [16]:
# Load all data in dataframes.
metpop = pd.read_csv('Data/MetroPopulation.csv', thousands=',')
pdi = pd.read_csv('Data/PoliceDataInitiativeParticipants.csv', encoding='latin-1',)
killings = pd.read_csv('Data/PostKillingsData.csv')
officershootings = ['Fairfax', 'Los Angeles', 'Orlando', 'Atlanta',
                    'Bloomington', 'Louisville', 'Hartford', 'Austin',
                    'Henderson', 'Dallas', 'San Francisco', 'Indianapolis',
                    'Tuscon', 'Knoxville', 'Redondo Beach', 'Cincinnati',
                    'Philadelphia', 'Hampton']

In [17]:
metpop.head()

Unnamed: 0,Rank,Metropolitan Statistical Area,2010 Census,2000 Census,Growth (%)
0,1,"New York-Northern New Jersey-Long Island, NY-N...",19006798,18323002,3.7
1,2,"Los Angeles-Long Beach-Santa Ana, CA",12872808,12365627,4.1
2,3,"Chicago-Joliet-Naperville, IL-IN-WI",9569624,9098316,5.2
3,4,"Dallas-Fort Worth-Arlington, TX",6300006,5161544,22.1
4,5,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",5838471,5687147,2.7


In [18]:
pdi.head()

Unnamed: 0,PDI Participant
0,"Albuquerque, NM Police"
1,"Anchorage, AK Police"
2,"Atlanta, GA Police"
3,"Auburn, WA Police"
4,"Austin, TX Police"


In [19]:
killings.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [20]:
# Group killings by city.
kcount = killings.groupby('city')['id'].count().sort_values(ascending=False)

# Create standardized city column for pdi & metropolitan population.
pdi = pdi['PDI Participant'].apply(lambda x: pd.Series(x.split(',')))
pdi.columns = ['City', 'is_PDI']
pdi['is_PDI'] = 1

metpop['City'] = metpop['Metropolitan Statistical Area']
metpop['City'] = metpop['Metropolitan Statistical Area'].apply(lambda x: pd.Series(x.split(',')))
metpop['City'] = metpop['City'].apply(lambda x: x.split('-')[0])

In [29]:
# Merge all data.
result = pd.merge(metpop, pdi, on='City', how='outer')
result[['is_PDI']] = result[['is_PDI']].fillna(value=0)
result = result[np.isfinite(result['2010 Census'])]
result.set_index('City', inplace=True)
result['Killings'] = kcount

# Calculate per capita killings.
result['Killings'] = result['Killings'].fillna(0)
result['KillingsPer100Thou'] = result['Killings'] / (result['2010 Census'] / 100000)

In [30]:
result.head()

Unnamed: 0_level_0,Rank,Metropolitan Statistical Area,2010 Census,2000 Census,Growth (%),is_PDI,Killings,KillingsPer100Thou
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
New York,1.0,"New York-Northern New Jersey-Long Island, NY-N...",19006798.0,18323002.0,3.7,1.0,8.0,0.04209
Los Angeles,2.0,"Los Angeles-Long Beach-Santa Ana, CA",12872808.0,12365627.0,4.1,1.0,31.0,0.240818
Chicago,3.0,"Chicago-Joliet-Naperville, IL-IN-WI",9569624.0,9098316.0,5.2,0.0,21.0,0.219444
Dallas,4.0,"Dallas-Fort Worth-Arlington, TX",6300006.0,5161544.0,22.1,1.0,10.0,0.15873
Philadelphia,5.0,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",5838471.0,5687147.0,2.7,1.0,7.0,0.119894


In [31]:
# Split data based on participation in pdi.
nonpart = result[result['is_PDI'] == 0]
part = result[result['is_PDI'] == 1]

# Take large cities.
nonpartM = nonpart[nonpart['2010 Census'] > 1000000]
partM = part[part['2010 Census'] > 1000000]

# Officer shooting participation.
ospart = result[result.index.isin(officershootings)]
osnonpart = result[~result.index.isin(officershootings)]

# Officer shooting participation for only large cities.
osnonpartM = osnonpart[osnonpart['2010 Census'] > 1000000]
ospartM = ospart[ospart['2010 Census'] > 1000000]

# Officer shooting participation for only small cities.
osnonpartNM = osnonpart[osnonpart['2010 Census'] < 1000000]
ospartNM = ospart[ospart['2010 Census'] < 1000000]

# Statistical Tests.
# All data
print('All data (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(part['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(nonpart['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(nonpart['KillingsPer100Thou'], part['KillingsPer100Thou']))
print(ks_2samp(nonpart['KillingsPer100Thou'], part['KillingsPer100Thou']))

# Metro areas > 1M
print('\nMetro Areas with Pop > 1M (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(partM['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(nonpartM['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(nonpartM['KillingsPer100Thou'], partM['KillingsPer100Thou']))
print(ks_2samp(nonpartM['KillingsPer100Thou'], partM['KillingsPer100Thou']))

# Report officer involved shootings.
print('\nSpecifically Report Officer Involved Shootings (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(ospart['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(osnonpart['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(osnonpart['KillingsPer100Thou'], ospart['KillingsPer100Thou']))
print(ks_2samp(osnonpart['KillingsPer100Thou'], ospart['KillingsPer100Thou']))

# Report officer involved shootings metro areas > 1M
print('\nSpecifically Report Officer Involved Shootings, Metro Areas with Pop > 1M (Killings per 100 Thousand Residents):')
print('Participants: Mean {}'.format(round(ospartM['KillingsPer100Thou'].mean(), 2)))
print('Non-participants: Mean {}'.format(round(osnonpartM['KillingsPer100Thou'].mean(), 2)))
print(ttest_ind(osnonpartM['KillingsPer100Thou'], ospartM['KillingsPer100Thou']))
print(ks_2samp(osnonpartM['KillingsPer100Thou'], ospartM['KillingsPer100Thou']))

All data (Killings per 100 Thousand Residents):
Participants: Mean 0.36
Non-participants: Mean 0.39
Ttest_indResult(statistic=0.47499103280454535, pvalue=0.63583958474638469)
Ks_2sampResult(statistic=0.14452798663324984, pvalue=0.67057310434701256)

Metro Areas with Pop > 1M (Killings per 100 Thousand Residents):
Participants: Mean 0.31
Non-participants: Mean 0.33
Ttest_indResult(statistic=0.33506132708424402, pvalue=0.73898051601333448)
Ks_2sampResult(statistic=0.16897081413210446, pvalue=0.83188009534273188)

Specifically Report Officer Involved Shootings (Killings per 100 Thousand Residents):
Participants: Mean 0.31
Non-participants: Mean 0.39
Ttest_indResult(statistic=0.7531317525013369, pvalue=0.45315801687279988)
Ks_2sampResult(statistic=0.25151515151515158, pvalue=0.50174418964864875)

Specifically Report Officer Involved Shootings, Metro Areas with Pop > 1M (Killings per 100 Thousand Residents):
Participants: Mean 0.31
Non-participants: Mean 0.32
Ttest_indResult(statistic=0.119

In [40]:
# this is agency-level crime data for jurisdictions of 100K and up
# all data from 2014
metro_crime = pd.read_csv('Data/metro2014crime_100Kandup.csv')

In [46]:
metro_crime.columns

Index([u'Agency', u'State', u'Months', u'Population', u'Violent crime total',
       u'Murder and nonnegligent Manslaughter', u'Legacy rape /1',
       u'Revised rape /2', u'Robbery', u'Aggravated assault',
       u'Property crime total', u'Burglary', u'Larceny-theft',
       u'Motor vehicle theft', u'Violent Crime rate',
       u'Murder and nonnegligent manslaughter rate', u'Legacy rape rate /1',
       u'Revised rape rate /2', u'Robbery rate', u'Aggravated assault rate',
       u'Property crime rate', u'Burglary rate', u'Larceny-theft rate',
       u'Motor vehicle theft rate', u'is_PDI'],
      dtype='object')

In [41]:
metro_crime.head()

Unnamed: 0,Agency,State,Months,Population,Violent crime total,Murder and nonnegligent Manslaughter,Legacy rape /1,Revised rape /2,Robbery,Aggravated assault,...,Violent Crime rate,Murder and nonnegligent manslaughter rate,Legacy rape rate /1,Revised rape rate /2,Robbery rate,Aggravated assault rate,Property crime rate,Burglary rate,Larceny-theft rate,Motor vehicle theft rate
0,Abilene Police Dept,TX,12,120686,571.0,7,,93.0,128,343,...,473.1,5.8,,77.1,106.1,284.2,4428.0,907.3,3220.8,300.0
1,Akron City Police Dept,OH,12,197891,1366.0,26,,176.0,436,728,...,690.3,13.1,,88.9,220.3,367.9,4610.1,1456.9,2860.2,293.1
2,Albuquerque Police Dept,NM,12,558874,4934.0,30,402.0,,1381,3121,...,882.8,5.4,71.9,,247.1,558.4,5446.1,1095.6,3713.9,636.6
3,Alexandria Police Dept,VA,12,151065,280.0,4,,25.0,142,109,...,185.4,2.6,,16.5,94.0,72.2,1974.6,171.4,1626.5,176.7
4,Allentown City Police Dept,PA,12,118710,611.0,9,,66.0,312,224,...,514.7,7.6,,55.6,262.8,188.7,3218.8,791.8,2197.0,230.0


In [42]:
# clean city names
metro_crime.loc[:, 'Agency'] = map(lambda s: s.replace(' Police Dept', ''), metro_crime.loc[:, 'Agency'])

In [43]:
# how many are part of PDI?
metro_crime = metro_crime.merge(pdi, how='left', left_on='Agency', right_on='City')
metro_crime.drop(['City'], axis=1, inplace=True)
metro_crime['is_PDI'].fillna(0, inplace=True)

In [44]:
metro_crime.is_PDI.sum()

62.0

In [48]:
def pop_group(pop):
    if pop < 250000:
        return '100K-250K'
    elif pop < 500000:
        return '250K-500K'
    elif pop < 1000000:
        return '500K-1M'
    else:
        return '1M+'

In [49]:
metro_crime['pop_bin'] = map(pop_group, metro_crime.loc[:, 'Population'])

In [53]:
metro_crime.groupby(['pop_bin', 'is_PDI']).count()['Agency']

pop_bin    is_PDI
100K-250K  0.0       186
           1.0        31
1M+        0.0         5
           1.0         6
250K-500K  0.0        33
           1.0        13
500K-1M    0.0        12
           1.0        12
Name: Agency, dtype: int64

In [51]:
metro_crime.groupby(['pop_bin', 'is_PDI']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Months,Population,Violent crime total,Murder and nonnegligent Manslaughter,Legacy rape /1,Revised rape /2,Robbery,Aggravated assault,Property crime total,Burglary,...,Violent Crime rate,Murder and nonnegligent manslaughter rate,Legacy rape rate /1,Revised rape rate /2,Robbery rate,Aggravated assault rate,Property crime rate,Burglary rate,Larceny-theft rate,Motor vehicle theft rate
pop_bin,is_PDI,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
100K-250K,0.0,12.0,148614.3,651.537634,7.930108,33.214286,69.592308,198.532258,386.435484,4709.704301,964.956989,...,425.504839,5.05,22.866071,47.099231,129.357527,251.302151,3135.187634,633.944624,2200.920968,300.322043
100K-250K,1.0,12.0,150483.5,899.833333,11.580645,68.5,82.0,286.064516,500.387097,6859.064516,1356.0,...,571.663333,7.377419,39.25,54.408,184.329032,315.954839,4320.932258,853.441935,3044.809677,422.680645
1M+,0.0,12.0,3295749.0,23549.25,245.2,,1235.2,8880.8,13304.8,85405.2,16113.0,...,750.275,9.1,,46.04,305.26,417.04,3215.3,711.3,2094.44,409.58
1M+,1.0,12.0,1757511.0,9952.166667,131.833333,338.5,1047.75,3823.666667,5185.333333,51734.166667,9845.833333,...,569.683333,7.4,28.7,60.75,213.433333,298.816667,3152.65,612.55,2058.683333,481.4
250K-500K,0.0,11.787879,348695.7,2174.181818,26.363636,91.2,217.086957,727.848485,1241.030303,12589.1875,2684.606061,...,636.95,7.815625,28.855556,61.291304,212.89375,364.06875,3684.377419,791.66875,2534.148387,385.296875
250K-500K,1.0,12.0,329582.3,3017.769231,59.307692,105.4,196.25,1208.769231,1588.384615,13847.461538,2828.384615,...,870.815385,17.284615,26.68,64.275,345.684615,458.023077,4085.207692,836.438462,2679.315385,569.423077
500K-1M,0.0,11.333333,713402.5,5301.25,71.5,147.0,475.0,1722.333333,3141.75,27123.833333,5814.333333,...,847.481818,11.6,24.7,67.475,274.563636,505.5,4189.018182,907.490909,2882.818182,398.745455
500K-1M,1.0,12.0,703216.2,6128.583333,74.5,314.666667,439.444444,2000.25,3645.583333,32846.909091,6158.833333,...,878.608333,10.541667,52.433333,60.544444,285.641667,523.941667,4586.309091,877.675,3051.527273,625.116667


In [47]:
print metro_crime.Population.min()
print metro_crime.Population.max()

100025
8473938
