In [32]:
import pylab as pl
import pandas as pd
import numpy as np
#imports downloader
import scipy.stats
from scipy.stats import rankdata

from __future__ import print_function, division

try:
    import urllib2 as urllib
except ImportError:
    import urllib.request as urllib

import statsmodels.api as sm
import statsmodels.formula.api as smf


%pylab inline
import json
import os, shutil
import sys
import requests

Populating the interactive namespace from numpy and matplotlib


In [2]:
#Read in data
TXdata = pd.read_csv("https://stacks.stanford.edu/file/druid:py883nd2578/TX-clean.csv.gz", index_col = None, compression = 'gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
## Drop columns not needed

TXdataTrim = TXdata.drop([u'state', u'county_fips', u'fine_grained_location', 
       u'police_department', u'driver_gender', u'driver_age_raw',
       u'driver_age', u'violation_raw',
       u'violation', u'lat', u'lon',
       u'officer_id'], axis=1)

In [4]:
#Group data by county, driver race and whether or not a search was conducted

TXcrs = TXdataTrim.groupby(['county_name', 'driver_race','search_conducted']).count()

In [5]:
## Drop columns not needed

TXcrs.drop([u'stop_date', u'stop_time', u'location_raw', u'driver_race_raw',
       u'search_type_raw', u'search_type', u'contraband_found',
       u'stop_outcome', u'is_arrested', u'driver_race_original'],axis=1, inplace=True)

In [6]:
#For ease of debugging, make copy of initial dataset

TXsearch = TXcrs

In [7]:
#Dataframe for stops where search was conducted (for outcome test)

TXcontraband = TXdataTrim[TXdataTrim.search_conducted == True] 
TXcontraband = TXcontraband.groupby(['county_name', 'driver_race','contraband_found']).count()
TXcontraband.drop([u'stop_date', u'stop_time', u'location_raw', u'driver_race_raw',
       u'search_type_raw', u'search_type', u'search_conducted',
       u'stop_outcome', u'is_arrested', u'driver_race_original'],axis=1, inplace=True)

In [8]:
TXcontraband.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id
county_name,driver_race,contraband_found,Unnamed: 3_level_1
Anderson County,Black,False,210
Anderson County,Black,True,136
Anderson County,Hispanic,False,152
Anderson County,Hispanic,True,77
Anderson County,Other,False,2
Anderson County,White,False,737
Anderson County,White,True,405
Andrews County,Asian,False,1
Andrews County,Asian,True,1
Andrews County,Black,False,27


In [9]:
TXsearch.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id
county_name,driver_race,search_conducted,Unnamed: 3_level_1
Anderson County,Asian,False,638
Anderson County,Black,False,18504
Anderson County,Black,True,346
Anderson County,Hispanic,False,10699
Anderson County,Hispanic,True,229
Anderson County,Other,False,138
Anderson County,Other,True,2
Anderson County,White,False,87781
Anderson County,White,True,1142
Andrews County,Asian,False,119


In [10]:
#List of counties to iterator through

counties = TXsearch.index.get_level_values(0).unique()
counties

Index(['Anderson County', 'Andrews County', 'Angelina County',
       'Aransas County', 'Archer County', 'Armstrong County',
       'Atascosa County', 'Austin County', 'Bailey County', 'Bandera County',
       ...
       'Willacy County', 'Williamson County', 'Wilson County',
       'Winkler County', 'Wise County', 'Wood County', 'Yoakum County',
       'Young County', 'Zapata County', 'Zavala County'],
      dtype='object', name='county_name', length=254)

In [11]:
#List of driver_race to iterator through

driverRace = TXsearch.index.get_level_values(1).unique()
driverRace

Index(['Asian', 'Black', 'Hispanic', 'Other', 'White'], dtype='object', name='driver_race')

In [14]:
#police_department    driver_race   num_stops   num_searches   num_hits   search_rate   hit_rate


In [15]:
TXsearch.loc['Anderson County'].loc['Asian']

Unnamed: 0_level_0,id
search_conducted,Unnamed: 1_level_1
False,638


In [16]:
TXsearch.loc['Anderson County'].loc['Black'].sum()

id    18850
dtype: int64

In [17]:
# Set columns of dataframe (and check the head has your column heading, but empty)

columns=['police_department', 'driver_race', 'num_stops', 'num_searches', 'num_hits']
dfTXstops = pd.DataFrame(columns=columns)

print(dfTXstops.head())


Empty DataFrame
Columns: [police_department, driver_race, num_stops, num_searches, num_hits]
Index: []


In [18]:
#Define function to return stops
def get_stops(cty, rce):
    if len(TXsearch[(TXsearch.index.get_level_values(0) == cty) & (TXsearch.index.get_level_values(1) == rce) & \
                            (TXsearch.index.get_level_values(2) == True)]['id']) < 1:
        return NaN
    else:
        return TXsearch.loc[cty].loc[rce].sum()[0]

In [19]:
#Define function to return searches
def get_searches(cty, rce):
    if len(TXsearch[(TXsearch.index.get_level_values(0) == cty) & (TXsearch.index.get_level_values(1) == rce) & \
                            (TXsearch.index.get_level_values(2) == True)]['id']) < 1:
        return NaN
    else:
        return TXsearch.loc[cty].loc[rce].loc[True][0]

In [20]:
#Define function to return hits
def get_hits(cty, rce):
    if len(TXcontraband[(TXcontraband.index.get_level_values(0) == cty) & (TXcontraband.index.get_level_values(1) == rce) & \
                            (TXcontraband.index.get_level_values(2) == True)]['id']) < 1:
        return NaN
    else:
        return TXcontraband.loc[cty].loc[rce].loc[True][0]

In [21]:
TXsearch.loc['Anderson County'].loc['Black'].sum()

id    18850
dtype: int64

In [23]:

#Build the dataframe using for loop (rows are: i)

i=0

for cty in counties:
    for rce in driverRace:
     
        num_stops = get_stops(cty, rce)   
        num_searches = get_searches(cty, rce)   
        num_hits =  get_hits(cty, rce)
    
        dfTXstops.loc[i]= [cty, rce, num_stops, num_searches, num_hits]

        i = i + 1


In [24]:
dfTXstops.head()

Unnamed: 0,police_department,driver_race,num_stops,num_searches,num_hits
0,Anderson County,Asian,,,
1,Anderson County,Black,18850.0,346.0,136.0
2,Anderson County,Hispanic,10928.0,229.0,77.0
3,Anderson County,Other,140.0,2.0,
4,Anderson County,White,88923.0,1142.0,405.0


In [25]:
cleanTXstops = dfTXstops.dropna()

In [26]:
cleanTXstops.head(20)

Unnamed: 0,police_department,driver_race,num_stops,num_searches,num_hits
1,Anderson County,Black,18850,346,136
2,Anderson County,Hispanic,10928,229,77
4,Anderson County,White,88923,1142,405
5,Andrews County,Asian,121,2,1
6,Andrews County,Black,1046,37,10
7,Andrews County,Hispanic,12331,252,99
8,Andrews County,Other,47,2,1
9,Andrews County,White,21576,282,118
10,Angelina County,Asian,671,3,2
11,Angelina County,Black,19616,395,166


In [51]:
## Calculate 100 largest police departments (counties) by number of stops

totalTXstopsByDept = pd.DataFrame(cleanTXstops.groupby('police_department', as_index=False)['num_stops'].sum())
totalTXstopsByDept.head()

Unnamed: 0,police_department,num_stops
0,Anderson County,118701
1,Andrews County,35121
2,Angelina County,125828
3,Aransas County,34199
4,Archer County,49251


In [52]:
len(totalTXstopsByDept)

254

In [54]:
# Rank police departments (counties) by number of stops
keepLargest = 100

largestDepts = totalTXstopsByDept[len(totalTXstopsByDept) - rankdata(totalTXstopsByDept['num_stops']) < keepLargest]

In [55]:
largestDepts.head()

Unnamed: 0,police_department,num_stops
0,Anderson County,118701
2,Angelina County,125828
6,Atascosa County,126873
10,Bastrop County,148131
13,Bell County,269280


In [56]:
# Merge largest departments on to 'cleanTXstops' and drop the small departments

largeTXstops = pd.merge(cleanTXstops, largestDepts, on=['police_department'], how='left')

In [59]:
largeTXstops.shape

(1020, 6)

In [61]:
largeTXstops.dropna(inplace=True)

In [66]:
largeTXstops.drop(['num_stops_y'], axis=1, inplace=True) #drop total number of stops (county total which got merged on)
largeTXstops.rename(columns = {'num_stops_x':"num_stops"}, inplace = True) #rename no. of stops column



In [69]:
cleanTXstops = largeTXstops

In [71]:
# We want to drop records where driver race is 'Other'
cleanTXstops = cleanTXstops[cleanTXstops['driver_race']!='Other']

In [72]:
cleanTXstops['search_rate'] = cleanTXstops['num_searches'] / cleanTXstops['num_stops']
cleanTXstops['hit_rate'] = cleanTXstops['num_hits'] / cleanTXstops['num_searches']
cleanTXstops.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,police_department,driver_race,num_stops,num_searches,num_hits,search_rate,hit_rate
0,Anderson County,Black,18850,346,136,0.0183554,0.393064
1,Anderson County,Hispanic,10928,229,77,0.0209553,0.336245
2,Anderson County,White,88923,1142,405,0.0128426,0.354641
8,Angelina County,Asian,671,3,2,0.00447094,0.666667
9,Angelina County,Black,19616,395,166,0.0201366,0.420253
10,Angelina County,Hispanic,15589,320,97,0.0205273,0.303125
12,Angelina County,White,89749,1215,468,0.0135378,0.385185
24,Atascosa County,Asian,1043,7,3,0.00671141,0.428571
25,Atascosa County,Black,4232,60,20,0.0141777,0.333333
26,Atascosa County,Hispanic,61501,1226,299,0.0199346,0.243883


In [73]:
cleanTXstops.shape

(385, 7)

In [74]:
cleanTXstops.to_csv('cleanTXstops.csv')