# Introduction

Welcome to the M1 Workshop session in data visualization for Exploratory Data Analysis (EDA) in `Python`.

# Application: the `Stanford Open Policing Project` dataset

The Stanford Open Policing Dataset records traffic stops by US police, including data on the vehicle, driver, violation, outcome, and many more variables. It has been used in research to investigate racial bias and other issues.



In [None]:
# load packages
import pandas as pd
import seaborn as sns

sns.set(color_codes=True)

import matplotlib.pyplot as plt


In [None]:
# Read the Stanford Open Policing Project CSV export into a DataFrame
data = pd.read_csv(filepath_or_buffer='/la_new_orleans_2020_04_01.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Inspect the frame: number of rows, column names, non-null counts and dtypes

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512092 entries, 0 to 512091
Data columns (total 32 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   raw_row_number      512092 non-null  object 
 1   date                512088 non-null  object 
 2   time                512092 non-null  object 
 3   location            416106 non-null  object 
 4   lat                 260408 non-null  float64
 5   lng                 260408 non-null  float64
 6   district            512092 non-null  object 
 7   zone                512092 non-null  object 
 8   subject_age         499306 non-null  float64
 9   subject_race        500362 non-null  object 
 10  subject_sex         500362 non-null  object 
 11  officer_assignment  511969 non-null  object 
 12  type                362185 non-null  object 
 13  arrest_made         512092 non-null  bool   
 14  citation_issued     512092 non-null  bool   
 16  outcome             335605 non-nul

In [None]:
# Join the date and time strings with a space into one "date time" column.
# Rows where either part is missing get a missing value (str.cat's default).
data['datetime'] = data['date'].str.cat(others=data['time'], sep=' ')

In [None]:
# Parse the combined strings into timestamps and use them as the row index
stop_times = pd.to_datetime(data['datetime'])
data.index = stop_times

In [None]:
# Remove the separate date and time columns in place, then preview the first rows
data.drop(['date', 'time'], axis='columns', inplace=True)
data.head()

Unnamed: 0_level_0,raw_row_number,location,lat,lng,district,zone,subject_age,subject_race,subject_sex,officer_assignment,type,arrest_made,citation_issued,warning_issued,outcome,contraband_found,contraband_drugs,contraband_weapons,frisk_performed,search_conducted,search_person,search_vehicle,search_basis,reason_for_stop,vehicle_color,vehicle_make,vehicle_model,vehicle_year,raw_actions_taken,raw_subject_race,datetime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2010-01-01 01:11:00,1,,,,6,E,26.0,black,female,6th District,vehicular,False,False,False,,,,,False,False,False,False,,TRAFFIC VIOLATION,BLACK,DODGE,CARAVAN,2005.0,,BLACK,2010-01-01 01:11:00
2010-01-01 01:29:00,9087,,,,7,C,37.0,black,male,7th District,vehicular,False,False,False,,,,,False,False,False,False,,TRAFFIC VIOLATION,BLUE,NISSAN,MURANO,2005.0,,BLACK,2010-01-01 01:29:00
2010-01-01 01:29:00,9086,,,,7,C,37.0,black,male,7th District,vehicular,False,False,False,,,,,False,False,False,False,,TRAFFIC VIOLATION,BLUE,NISSAN,MURANO,2005.0,,BLACK,2010-01-01 01:29:00
2010-01-01 14:00:00,267,,,,7,I,96.0,black,male,7th District,vehicular,False,False,False,,,,,False,False,False,False,,TRAFFIC VIOLATION,GRAY,JEEP,GRAND CHEROKEE,2003.0,,BLACK,2010-01-01 14:00:00
2010-01-01 02:06:00,2,,,,5,D,17.0,black,male,5th District,,False,False,False,,,,,False,False,False,False,,CALL FOR SERVICE,,,,,,BLACK,2010-01-01 02:06:00


## Ideas to explore

- Crosstabs / Distribution of Sex / Race
- Searches conducted
- Drugs / Weapons found?
- Expensive cars vs cheap cars - proxy by vehicle age (optional)
- Sex / Race / Age vs stop-outcome

In [None]:
# Share of each race within each sex (normalize='index': every row sums to 1)

pd.crosstab(
    index=data['subject_sex'],
    columns=data['subject_race'],
    normalize='index',
)

subject_race,asian/pacific islander,black,hispanic,other,unknown,white
subject_sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,0.007846,0.715175,0.014974,0.000603,0.003362,0.258041
male,0.007471,0.692518,0.031905,0.000717,0.007685,0.259704


In [None]:
# Racial make-up of each stop reason (normalize='columns': every column sums to 1)

pd.crosstab(
    index=data['subject_race'],
    columns=data['reason_for_stop'],
    normalize='columns',
)

reason_for_stop,CALL FOR SERVICE,CITIZEN CONTACT,CRIMINAL VIOLATION,FLAGGED DOWN,JUVENILE VIOLATION,OTHER,PRESENT AT CRIME SCENE,SUSPECT PERSON,SUSPECT VEHICLE,TRAFFIC VIOLATION
subject_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
asian/pacific islander,0.005392,0.004049,0.004166,0.003605,0.000321,0.004916,0.001269,0.00211,0.003965,0.01037
black,0.697255,0.710526,0.535562,0.530644,0.910347,0.738288,0.794416,0.761864,0.760317,0.697728
hispanic,0.025748,0.018029,0.022697,0.034475,0.005463,0.020821,0.030457,0.014344,0.021445,0.031791
other,0.000542,0.000443,0.0005,0.000451,0.0,0.000578,0.001269,0.000315,0.0,0.000866
unknown,0.004037,0.003036,0.002366,0.004957,0.000321,0.003801,0.003807,0.001464,0.003784,0.009122
white,0.267025,0.263917,0.434709,0.425868,0.083548,0.231595,0.168782,0.219902,0.210488,0.250123


In [None]:
# Fraction of stops ending in an arrest for every race/sex combination
# (the mean of the boolean arrest_made column is the share of True values)

(
    data
    .groupby(by=['subject_race', 'subject_sex'])['arrest_made']
    .mean()
)

subject_race            subject_sex
asian/pacific islander  female         0.049738
                        male           0.085757
black                   female         0.169189
                        male           0.211642
hispanic                female         0.117513
                        male           0.162509
other                   female         0.090909
                        male           0.110236
unknown                 female         0.075356
                        male           0.079692
white                   female         0.134734
                        male           0.176940
Name: arrest_made, dtype: float64

In [None]:
# Arrest rate (%) for each race
#
# arrest_made is a boolean column, so its per-group mean is the share of stops
# that ended in an arrest. Unlike dividing two value_counts() results, a group
# without any arrest yields 0 instead of NaN, and no chained indexing is needed.
# Rows with a missing subject_race are excluded from the grouping.

arrest_rate = data.groupby('subject_race')['arrest_made'].mean() * 100

arrest_rate

black                     19.896575
white                     16.467622
hispanic                  15.521459
asian/pacific islander     7.487477
unknown                    7.902925
other                     10.526316
Name: subject_race, dtype: float64

In [None]:
# Arrest rate (%) for each sex
#
# arrest_made is a boolean column, so its per-group mean is the share of stops
# that ended in an arrest. Unlike dividing two value_counts() results, a group
# without any arrest yields 0 instead of NaN, and no chained indexing is needed.
# Rows with a missing subject_sex are excluded from the grouping.

arrest_rate = data.groupby('subject_sex')['arrest_made'].mean() * 100

arrest_rate

male      19.903529
female    15.822481
Name: subject_sex, dtype: float64

In [None]:
# Arrest rate (%) for each age, showing the ten ages with the highest rate
#
# arrest_made is a boolean column, so its per-group mean is the share of stops
# that ended in an arrest. Ages at which nobody was arrested now get a rate of
# 0 instead of NaN (the value_counts() division left them undefined).
# NOTE(review): the rate ignores how many stops each age has, so rarely seen
# ages can rank high on very few observations; check group sizes as well.

arrest_rate = data.groupby('subject_age')['arrest_made'].mean() * 100

arrest_rate.sort_values(ascending=False).head(10)

10.0    43.333333
13.0    41.033755
12.0    40.529532
11.0    37.674419
14.0    37.615207
15.0    34.611289
93.0    33.333333
16.0    30.238147
27.0    20.700529
35.0    20.696795
Name: subject_age, dtype: float64

In [None]:
# Number of weapons found per race, counting only stops where a search was conducted
# (the explicit == True comparison is kept as-is; the dtype of search_conducted
# is not confirmed here)

searched = data[data['search_conducted'] == True]
searched.groupby('subject_race')['contraband_weapons'].sum()

subject_race
asian/pacific islander       9
black                     2742
hispanic                    40
other                        0
unknown                      7
white                      311
Name: contraband_weapons, dtype: int64

In [None]:
# Where do the richest people live?
# Mean vehicle year per district, used as a rough proxy for wealth
# (a newer average car suggests a richer district).
#
# District labels are cast to str before grouping: grouping on the raw column
# listed every district 1-8 twice, presumably because the column mixes integer
# and string labels (TODO confirm), which split each district's stops in two.

data['vehicle_year'].groupby(data['district'].astype(str)).mean().sort_values()

district
5        2003.450792
5        2003.520488
4        2003.806462
3        2003.956794
1        2003.957323
4        2003.966812
7        2003.970504
7        2003.998879
3        2004.020548
8        2004.056494
1        2004.142627
8        2004.341650
2        2004.589421
2        2004.607631
6        2005.188022
6        2005.502055
1|7              NaN
3|2              NaN
5|3|3            NaN
6|2              NaN
Name: vehicle_year, dtype: float64

In [None]:
# Compare the districts with the oldest (5) and newest (6) average vehicle year:
# share of stops with / without a search in each of the two districts.
#
# District labels are cast to str first: isin([5, 6]) only matched
# integer-typed labels, and the per-district means show each district twice,
# presumably integer and string variants of the same label (TODO confirm).
# value_counts() takes a boolean normalize flag; 'columns' only worked because
# it is truthy.

district_label = data['district'].astype(str)
in_5_or_6 = district_label.isin(['5', '6'])

data.loc[in_5_or_6, 'search_conducted'].groupby(district_label[in_5_or_6]).value_counts(normalize=True)

district  search_conducted
5         False               0.875089
          True                0.124911
6         False               0.829455
          True                0.170545
Name: search_conducted, dtype: float64

In [None]:
# Racial make-up of the stops in districts 5 and 6 (shares sum to 1 per district).
#
# District labels are cast to str first: isin([5, 6]) only matched
# integer-typed labels, and the per-district means show each district twice,
# presumably integer and string variants of the same label (TODO confirm).
# value_counts() takes a boolean normalize flag; 'columns' only worked because
# it is truthy.
district_label = data['district'].astype(str)
in_5_or_6 = district_label.isin(['5', '6'])

data.loc[in_5_or_6, 'subject_race'].groupby(district_label[in_5_or_6]).value_counts(normalize=True)

district  subject_race          
5         black                     0.758880
          white                     0.214093
          hispanic                  0.018758
          unknown                   0.004710
          asian/pacific islander    0.003085
          other                     0.000473
6         black                     0.734431
          white                     0.225178
          hispanic                  0.024931
          unknown                   0.009662
          asian/pacific islander    0.005188
          other                     0.000609
Name: subject_race, dtype: float64

In [None]:
# TODO: plot the stops on a map with Folium (to be completed)