## Stanford Open Policing Project dataset

In this project, we analyze Rhode Island traffic stop data downloaded from the [Stanford Open Policing Project](https://openpolicing.stanford.edu/data/).

In [18]:
import pandas as pd

# Load the Rhode Island traffic-stop records into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, which avoids per-chunk mixed-type
# inference (and the DtypeWarning that comes with it).
ri = pd.read_csv("Data/ri.csv", low_memory=False)
ri.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,raw_row_number,date,time,zone,subject_race,subject_sex,department_id,type,arrest_made,citation_issued,...,reason_for_stop,vehicle_make,vehicle_model,raw_BasisForStop,raw_OperatorRace,raw_OperatorSex,raw_ResultOfStop,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree
0,1,2005-11-22,11:15:00,X3,white,male,200,vehicular,False,True,...,Speeding,,,SP,W,M,M,,,
1,2,2005-10-01,12:20:00,X3,white,male,200,vehicular,False,True,...,Speeding,,,SP,W,M,M,,,
2,3,2005-10-01,12:30:00,X3,white,female,200,vehicular,False,True,...,Speeding,,,SP,W,F,M,,,
3,4,2005-10-01,12:50:00,X3,white,male,200,vehicular,False,True,...,Speeding,,,SP,W,M,M,,,
4,5,2005-10-01,13:10:00,X3,white,female,200,vehicular,False,True,...,Speeding,,,SP,W,F,M,,,


In [19]:
# Summarize the frame: number of rows, column names, non-null counts and dtypes.
ri.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509681 entries, 0 to 509680
Data columns (total 31 columns):
raw_row_number           509681 non-null int64
date                     509671 non-null object
time                     509671 non-null object
zone                     509671 non-null object
subject_race             480608 non-null object
subject_sex              480584 non-null object
department_id            509671 non-null object
type                     509681 non-null object
arrest_made              480608 non-null object
citation_issued          480608 non-null object
outcome                  473840 non-null object
contraband_found         17762 non-null object
contraband_drugs         15988 non-null object
contraband_weapons       11795 non-null object
contraband_alcohol       1217 non-null object
contraband_other         17762 non-null object
frisk_performed          509671 non-null object
search_conducted         509681 non-null bool
search_basis             17762 non

#### Locating missing values

In [20]:
# DataFrame.isna() (an alias of isnull()) returns a frame of the same shape
# holding booleans: True where a value is missing, False where it is present.

ri.isna()

Unnamed: 0,raw_row_number,date,time,zone,subject_race,subject_sex,department_id,type,arrest_made,citation_issued,...,reason_for_stop,vehicle_make,vehicle_model,raw_BasisForStop,raw_OperatorRace,raw_OperatorSex,raw_ResultOfStop,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree
0,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,True,True
1,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,True,True
2,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,True,True
3,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,True,True
4,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509676,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,True
509677,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,True,True
509678,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,True
509679,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,True


In [21]:
# Count the missing values in each column.
# Summing the boolean frame down the rows works because
# every True counts as 1 and every False counts as 0.

ri.isna().sum(axis="index")

raw_row_number                0
date                         10
time                         10
zone                         10
subject_race              29073
subject_sex               29097
department_id                10
type                          0
arrest_made               29073
citation_issued           29073
outcome                   35841
contraband_found         491919
contraband_drugs         493693
contraband_weapons       497886
contraband_alcohol       508464
contraband_other         491919
frisk_performed              10
search_conducted              0
search_basis             491919
reason_for_search        491919
reason_for_stop           29073
vehicle_make             191564
vehicle_model            279593
raw_BasisForStop          29073
raw_OperatorRace          29073
raw_OperatorSex           29073
raw_ResultOfStop          29073
raw_SearchResultOne      491919
raw_SearchResultTwo      508862
raw_SearchResultThree    509513
dtype: int64

In [22]:
# Remove the "contraband_found" and "contraband_alcohol" columns.
# `columns=` already says which axis the labels live on, so no separate
# `axis` argument is needed. The result is reassigned instead of using
# inplace=True so the step reads as "new frame from old frame".
ri = ri.drop(columns=["contraband_found", "contraband_alcohol"])
ri.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509681 entries, 0 to 509680
Data columns (total 29 columns):
raw_row_number           509681 non-null int64
date                     509671 non-null object
time                     509671 non-null object
zone                     509671 non-null object
subject_race             480608 non-null object
subject_sex              480584 non-null object
department_id            509671 non-null object
type                     509681 non-null object
arrest_made              480608 non-null object
citation_issued          480608 non-null object
outcome                  473840 non-null object
contraband_drugs         15988 non-null object
contraband_weapons       11795 non-null object
contraband_other         17762 non-null object
frisk_performed          509671 non-null object
search_conducted         509681 non-null bool
search_basis             17762 non-null object
reason_for_search        17762 non-null object
reason_for_stop          480608 n

#### Drop rows that have missing values in specific columns

In [23]:
# The date, time and zone values are essential for this analysis,
# so rows that are missing any of them are of no use and are dropped.
# dropna() returns a new frame; reassign it rather than mutating in place.
ri = ri.dropna(subset=["date", "time", "zone"])
ri.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509671 entries, 0 to 509679
Data columns (total 29 columns):
raw_row_number           509671 non-null int64
date                     509671 non-null object
time                     509671 non-null object
zone                     509671 non-null object
subject_race             480608 non-null object
subject_sex              480584 non-null object
department_id            509671 non-null object
type                     509671 non-null object
arrest_made              480608 non-null object
citation_issued          480608 non-null object
outcome                  473840 non-null object
contraband_drugs         15988 non-null object
contraband_weapons       11795 non-null object
contraband_other         17762 non-null object
frisk_performed          509671 non-null object
search_conducted         509671 non-null bool
search_basis             17762 non-null object
reason_for_search        17762 non-null object
reason_for_stop          480608 n

### Fixing a data type

In [24]:
# Examine the head of the 'arrest_made' column
print(ri["arrest_made"].head())

# Check the data type of 'arrest_made' before the conversion
print(ri["arrest_made"].dtype)

# Change the data type of 'arrest_made' to 'bool'.
# A plain astype("bool") turns every missing value (NaN) into True, because
# NaN is truthy; that would record an arrest for each stop whose outcome is
# missing. Combining the cast with notna() maps missing values to False.
ri["arrest_made"] = ri["arrest_made"].astype("bool") & ri["arrest_made"].notna()

# Check the data type of 'arrest_made' after the conversion
print(ri["arrest_made"].dtype)

0    False
1    False
2    False
3    False
4    False
Name: arrest_made, dtype: object
object
bool


### Combining columns and Creating a DatetimeIndex

In [25]:
# Preview the separate "date" and "time" text columns.
ri.loc[:, ["date", "time"]].head()

Unnamed: 0,date,time
0,2005-11-22,11:15:00
1,2005-10-01,12:20:00
2,2005-10-01,12:30:00
3,2005-10-01,12:50:00
4,2005-10-01,13:10:00


In [26]:
# Series.str.cat() joins the strings of two Series element by element,
# placing the given separator between each pair.
combined = ri["date"].str.cat(ri["time"], sep=" ")
combined

# The result is still plain text (object dtype), not a datetime.

0         2005-11-22 11:15:00
1         2005-10-01 12:20:00
2         2005-10-01 12:30:00
3         2005-10-01 12:50:00
4         2005-10-01 13:10:00
                 ...         
509675    2015-08-16 13:37:00
509676    2015-08-04 10:40:00
509677    2015-12-20 11:17:00
509678    2015-11-09 23:35:00
509679    2015-10-30 11:09:00
Name: date, Length: 509671, dtype: object

In [13]:
# Parse the combined "date time" strings into pandas datetime values
# and store them in a new "date_and_time" column.

parsed_timestamps = pd.to_datetime(combined)
ri["date_and_time"] = parsed_timestamps
ri.head()

Unnamed: 0,raw_row_number,date,time,zone,subject_race,subject_sex,department_id,type,arrest_made,citation_issued,...,vehicle_make,vehicle_model,raw_BasisForStop,raw_OperatorRace,raw_OperatorSex,raw_ResultOfStop,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree,date_and_time
0,1,2005-11-22,11:15:00,X3,white,male,200,vehicular,False,True,...,,,SP,W,M,M,,,,2005-11-22 11:15:00
1,2,2005-10-01,12:20:00,X3,white,male,200,vehicular,False,True,...,,,SP,W,M,M,,,,2005-10-01 12:20:00
2,3,2005-10-01,12:30:00,X3,white,female,200,vehicular,False,True,...,,,SP,W,F,M,,,,2005-10-01 12:30:00
3,4,2005-10-01,12:50:00,X3,white,male,200,vehicular,False,True,...,,,SP,W,M,M,,,,2005-10-01 12:50:00
4,5,2005-10-01,13:10:00,X3,white,female,200,vehicular,False,True,...,,,SP,W,F,M,,,,2005-10-01 13:10:00


In [14]:
# Use the parsed "date_and_time" column as the row index (a DatetimeIndex).
# set_index() returns a new frame; reassign it rather than mutating in place.

ri = ri.set_index("date_and_time")
ri.head()

Unnamed: 0_level_0,raw_row_number,date,time,zone,subject_race,subject_sex,department_id,type,arrest_made,citation_issued,...,reason_for_stop,vehicle_make,vehicle_model,raw_BasisForStop,raw_OperatorRace,raw_OperatorSex,raw_ResultOfStop,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree
date_and_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-11-22 11:15:00,1,2005-11-22,11:15:00,X3,white,male,200,vehicular,False,True,...,Speeding,,,SP,W,M,M,,,
2005-10-01 12:20:00,2,2005-10-01,12:20:00,X3,white,male,200,vehicular,False,True,...,Speeding,,,SP,W,M,M,,,
2005-10-01 12:30:00,3,2005-10-01,12:30:00,X3,white,female,200,vehicular,False,True,...,Speeding,,,SP,W,F,M,,,
2005-10-01 12:50:00,4,2005-10-01,12:50:00,X3,white,male,200,vehicular,False,True,...,Speeding,,,SP,W,M,M,,,
2005-10-01 13:10:00,5,2005-10-01,13:10:00,X3,white,female,200,vehicular,False,True,...,Speeding,,,SP,W,F,M,,,
