# Data Preperation

### Joining with Census Data

In [44]:
import glob
from pathlib import Path  
import numpy as np
import pandas as pd
# import geopandas
from shapely.geometry import Point

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

First let's create the dataframes. 

In [57]:
non_ripa_df = pd.read_csv('Data/Berkeley_PD_-_Stop_Data__Jan_26__2015_to_Sep_30__2020_.csv',keep_default_na=True)
ripa_df = pd.read_csv('Data/Berkeley_PD_-_Stop_Data__October_1__2020_-_Present.csv',keep_default_na=True)

print(f'The non_RIPA dataframe has a shape of {non_ripa_df.shape}.')
non_ripa_df.columns

print(f'\nThe RIPA dataframe has a shape of {ripa_df.shape}.')
ripa_df.columns

The non_RIPA dataframe has a shape of (58951, 13).


Index(['CreateDatetime', 'IncidentNumber', 'Address', 'City', 'Lat', 'Lon',
       'CallType', 'Race', 'Gender', 'Age', 'Reason', 'Enforcement',
       'Car Search'],
      dtype='object')


The RIPA dataframe has a shape of (8454, 45).


Index(['LEA Record ID', 'Person Number', 'Date of Stop', 'Time of Stop',
       'Duration of Stop', 'Is Stop Made in Response to Call for Service',
       'Information Based Stop', 'Type Of Stop', 'Officer Type of Assignment',
       'Location', 'City', 'Is Location a K12 Public School',
       'If K12 School Is Stop of a Student', 'School Name',
       'Education Code Section', 'Education Code Subdivision',
       'Race Perceived Prior To Stop', 'Perceived Race or Ethnicity',
       'Perceived Gender', 'Perceived Gender Nonconforming', 'Is LGBT',
       'Perceived Age', 'Person had Limited or No English Fluency',
       'Perceived or Known Disability', 'City of Residence', 'Reason for Stop',
       'Reason for Stop Narrative', 'Traffic Violation Type',
       'Traffic Violation Offense Codes', 'Suspicion Offense Code',
       'Suspicion SubType', 'Actions Taken', 'Basis for Search',
       'Basis for Search Narrative', 'Basis for Property Seizure',
       'Type of Property Seized', 'C

Now let's format and join them. We'll remove any observations with innacurate location information. 

In [58]:
non_ripa_df.rename(columns={'Lat':'LAT', 
                            'Lon':'LONG',
                            'Race':'Perceived Race or Ethnicity',
                            'Gender':'Perceived Gender',
                            'Age':'Perceived Age',
                            'Reason':'Reason for Stop',
                            'Enforcement':'Result of Stop'}, inplace=True)

non_ripa_df['CreateDatetime'] = pd.to_datetime(non_ripa_df.CreateDatetime)
non_ripa_df['Date of Stop'] = non_ripa_df['CreateDatetime'].dt.date
non_ripa_df['Time of Stop'] = non_ripa_df['CreateDatetime'].dt.time
non_ripa_df = non_ripa_df.drop(columns = ['CreateDatetime'])
ripa_df['Date of Stop'], non_ripa_df['Date of Stop']= pd.to_datetime(ripa_df['Date of Stop']), pd.to_datetime(non_ripa_df['Date of Stop'])

ripa_df = ripa_df[ripa_df.LONG != -361]
non_ripa_df = non_ripa_df[non_ripa_df.LONG != -361]
print(f'The non_RIPA dataframe has a shape of {non_ripa_df.shape}.')
print(f'The RIPA dataframe has a shape of {ripa_df.shape}.')
print(f'The two dataframes have {len(np.intersect1d(non_ripa_df.columns, ripa_df.columns))} shared columns.')

The non_RIPA dataframe has a shape of (55976, 14).
The RIPA dataframe has a shape of (8454, 45).
The two dataframes have 10 shared columns.


In [60]:
all_stops = pd.concat([ripa_df,non_ripa_df], axis=0)
all_stops['Date of Stop']= pd.to_datetime(all_stops['Date of Stop'])

all_stops = all_stops.reindex(columns=[ 'LEA Record ID', 'Incident Number', 'Date of Stop',
                                        'Time of Stop', 'Duration of Stop', 'City',
                                        'LAT', 'LONG', 'Race Perceived Prior To Stop',
                                        'Perceived Race or Ethnicity', 'Perceived Gender',
                                        'Perceived Age', 'Reason for Stop', 
                                        'Reason for Stop Narrative', 'Result of Stop', 'Person Number',  'Is Stop Made in Response to Call for Service',
                                        'Information Based Stop', 'Type Of Stop', 'Officer Type of Assignment',
                                        'Location', 'Is Location a K12 Public School',
                                        'If K12 School Is Stop of a Student', 'School Name',
                                        'Education Code Section', 'Education Code Subdivision',
                                        'Perceived Gender Nonconforming', 'Is LGBT',
                                        'Person had Limited or No English Fluency',
                                        'Perceived or Known Disability', 'City of Residence', 'Traffic Violation Type',
                                        'Traffic Violation Offense Codes', 'Suspicion Offense Code',
                                        'Suspicion SubType', 'Actions Taken', 'Basis for Search',
                                        'Basis for Search Narrative', 'Basis for Property Seizure',
                                        'Type of Property Seized', 'Contraband or Evidence',
                                        'Other Contraband Desc', 'Warning Offense Codes',
                                        'Citation Offense Codes',
                                        'In field cite and release Offense Codes',
                                        'Custodial arrest Offense Codes', 'IncidentNumber',
                                        'Address', 'CallType', 'Car Search'
                                      ])

Alright, great! Now let's find the census tract for each stop. This will be useful later when we merge this data with census data. We'll be checking to see if the latitude and longitude points fall within the map geometries provided by our .geojson file, and if they do, putting in the tract number they fall within.

In [61]:
berkeley = geopandas.read_file('Data/Census Tract Polygons 2010.geojson')

NameError: name 'geopandas' is not defined

In [None]:
stop_point = [Point(all_stops['LONG'][i], all_stops['LAT'][i]) for i in np.arange(len(ripa_df))]
tt = [berkeley['geometry'].contains(stop_point[i]) for i in np.arange(len(stop_point))]
judge = [np.where(tt[i] == True)[0] for i in np.arange(len(tt))]
tract_column = [i[0] if i.size > 0  else -1 for i in judge]
all_stops['tract'] = tract_column
berkeley['order'] = np.arange(len(berkeley))
df = all_stops.merge(berkeley[['name10', 'order']], left_on = 'tract', right_on = 'order')
df.drop(columns=['tract','order'], inplace=True)

# Exploratory Data Analysis

# Modelling