In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

1) Are certain races stopped at a rate disproportionate to their population? 
- Yes. Black people are stopped at 2.3-3.4 times their share of the population.
2) At what times of day are most/least stops made? 
- The most stops happen between 8:00 pm and 10:00 pm.
- The fewest stops happen between 4:00 am and 6:00 am.

3) Are males or females stopped at higher rates?
- ~73-74% of stops are male. This has roughly stayed the same over the past 10 years. In 2010 males made up 69% of the stops. 


4) In which divisions are the most/least stops made?
- The Central division makes the most stops. This is even more apparent when you compare the number of stops to the division's population. However, this ratio isn't the best metric: just because the Central district ... (TODO: this sentence was left unfinished)
5) Have stops increased/decreased over the past 10 years?
- 
6) Are there more traffic stops later in the month (or on a particular day)?
7) Grouped by officer: Are certain officers more likely to make a stop on a person of particular descent?
8) Are certain races more likely to have Post Stop Activity?
- Yes

In [3]:
# Cap how many rows pandas renders when a frame is displayed in the notebook.
pd.set_option('display.max_rows', 20)

In [None]:
# Load the raw stop records; the 'Stop Date' column is parsed as dates while reading.
df = pd.read_csv(
    'Vehicle_and_Pedestrian_Stop_Data_2010_to_Present.csv',
    parse_dates=['Stop Date'],
)

In [None]:
# Convert both the date and the time-of-day column to pandas datetimes.
for _datetime_column in ('Stop Date', 'Stop Time'):
    df[_datetime_column] = pd.to_datetime(df[_datetime_column])

In [None]:
# Break each stop date into calendar parts used by the later group-bys.
stop_date_parts = df['Stop Date'].dt
df['Stop Year'] = stop_date_parts.year
df['Stop Month'] = stop_date_parts.month
df['Stop Day'] = stop_date_parts.day
# Despite the column name, this stores the integer from dt.dayofweek, not a weekday name.
df['Stop Day Name'] = stop_date_parts.dayofweek

# Remove every row whose stop year is 1900 (presumably placeholder dates -- confirm),
# then show how many stops remain per year.
rows_from_1900 = df.index[df['Stop Year'] == 1900]
df = df.drop(rows_from_1900)
df['Stop Year'].value_counts()

In [None]:
# Within each stop year, the share of stops per descent group (shares sum to 1 inside a year).
year_descent_grouped = df.groupby('Stop Year')['Descent Description'].value_counts(normalize=True)

In [None]:
# Number of stops recorded in each year.
# The column names are pinned explicitly: what value_counts().reset_index() calls its
# columns depends on the pandas version ('index'/'Stop Year' before 2.0,
# 'Stop Year'/'count' from 2.0 on), which would break the plot below.
stops_per_year = (
    df['Stop Year']
    .value_counts()
    .rename_axis('index')              # the year becomes column 'index'
    .reset_index(name='Stop Year')     # the count becomes column 'Stop Year'
)
ax_stops_per_year = sns.barplot(x='index', y='Stop Year', data=stops_per_year)
ax_stops_per_year.set(title='Stops per year', xlabel='Year', ylabel='Number of stops');

In [None]:
# Use seaborn's 'darkgrid' style for the figures drawn after this cell.
sns.set_style('darkgrid')

In [None]:
# Tidy frame with one row per (year, descent): Stop Year, Descent Description, Percent.
# The Series is renamed *before* reset_index so the share column is called 'Percent' on
# every pandas version (from pandas 2.0 a normalized value_counts result is named
# 'proportion', so renaming a 'Descent Description' column afterwards does nothing).
grouped_df = year_descent_grouped.rename('Percent').reset_index()
# Integer years so the axis reads 2010 rather than 2010.0 if the year was stored as float.
grouped_df['Stop Year'] = grouped_df['Stop Year'].astype(int)

fig, ax = plt.subplots(figsize = (20, 8))

# One fixed colour per descent label, shared by the bars and the reference lines, so a
# line is matched to its bars by name instead of by position in a list.
descent_order = list(grouped_df['Descent Description'].unique())
palette = dict(zip(descent_order, sns.color_palette(n_colors=len(descent_order))))

sns.barplot(x='Stop Year', y='Percent', data=grouped_df, hue='Descent Description',
            hue_order=descent_order, palette=palette, ax=ax)

# Population share per descent group, keyed with the same labels the data uses.
# NOTE(review): these shares add up to more than 1 and have no cited source -- confirm.
la_race_breakdown_2010 = {'HISPANIC':0.475, 'WHITE':0.524, 'BLACK':0.086, 'OTHER':0.211, 'ASIAN':.138, 'AMERICAN INDIAN':0.005, 'MULTI-DESCENTS':0.034}

# Horizontal reference line at each group's population share, in that group's bar colour.
for descent, population_share in la_race_breakdown_2010.items():
    if descent in palette:
        ax.axhline(population_share, color=palette[descent], alpha=0.7)

# Still defined because a later cell reads these two names for its own tick labels.
xlabels = list(range(2010, 2021))
ylabels = list(x/10.0 for x in range(0, 6))

ax.set_title('Police Stops by Race since 2010', fontsize=20)
# Resize the existing tick labels instead of overwriting them with hard-coded lists:
# replacing labels without fixing tick positions can mislabel the axis (or raise when
# the number of ticks differs from the number of labels).
ax.tick_params(axis='both', labelsize=12)
ax.set_xlabel('Year', fontsize=15)
ax.set_ylabel('Percent', fontsize=15);

As we can see, over the past 10 years Black people have been far more likely to be stopped by officers than their share of the population would suggest.

In [None]:
# Display the per-year descent shares.
grouped_df

In [None]:
# Population share per descent group, keyed by the labels used in 'Descent Description'.
# NOTE(review): these shares add up to more than 1 and have no cited source -- confirm.
la_race_breakdown_2010 = {'HISPANIC':0.475, 'WHITE':0.524, 'BLACK':0.086, 'OTHER':0.211, 'ASIAN':.138, 'AMERICAN INDIAN':0.005, 'MULTI-DESCENTS':0.034}

# Share of stops divided by share of population: 1.0 means a group is stopped exactly in
# proportion to its population share. Computed column-wise rather than row by row; a
# descent label missing from the dict yields NaN for that row.
population_share = grouped_df['Descent Description'].map(la_race_breakdown_2010)
grouped_df['Percent_normalized'] = grouped_df['Percent'] / population_share


In [None]:
# Show only the rows whose descent label is 'BLACK'.
grouped_df.loc[grouped_df['Descent Description']=='BLACK']

In [None]:
# Bar chart of the population-normalized stop share per year, one bar colour per descent group.
fig, ax3 = plt.subplots(figsize=(16, 8))
sns.barplot(x='Stop Year', y='Percent_normalized', hue='Descent Description', data=grouped_df, ax=ax3)

In [None]:
fig, ax2 = plt.subplots(figsize=(16, 8))

# One fixed colour per descent label, used for both the trend lines and the reference
# lines. This replaces reading bar colours from another figure's axes and pairing them
# with the population shares by list position.
descent_order = list(grouped_df['Descent Description'].unique())
palette = dict(zip(descent_order, sns.color_palette(n_colors=len(descent_order))))

sns.lineplot(x='Stop Year', y='Percent', hue='Descent Description', hue_order=descent_order,
             palette=palette, data=grouped_df, ax=ax2)

# Population share per descent group, keyed with the same labels the data uses.
# NOTE(review): these shares add up to more than 1 and have no cited source -- confirm.
la_race_breakdown_2010 = {'HISPANIC':0.475, 'WHITE':0.524, 'BLACK':0.086, 'OTHER':0.211, 'ASIAN':.138, 'AMERICAN INDIAN':0.005, 'MULTI-DESCENTS':0.034}

# Dash-dotted reference line at each group's population share, in that group's line colour.
for descent, population_share in la_race_breakdown_2010.items():
    if descent in palette:
        ax2.axhline(population_share, color=palette[descent], linewidth = 0.6, alpha=1, linestyle='-.')

ax2.set_title('Police Stops by Race since 2010', fontsize=20)
ax2.set_xticks(list(range(2010, 2021)))
# Resize the tick labels matplotlib generates rather than overwriting them with
# hard-coded lists defined in another cell, which can mislabel the y axis.
ax2.tick_params(axis='both', labelsize=12)

ax2.set_xlabel('Year', fontsize=15)
ax2.set_ylabel('Percent', fontsize=15)
ax2.legend(loc=1);

In [None]:
# Share of stops per sex code within each year, as a tidy frame with columns
# Stop Year, Sex Code, Percent. Renaming the Series before reset_index gives the
# 'Percent' column on every pandas version (the value_counts result is named
# 'proportion' from pandas 2.0, so a later column rename of 'Sex Code' would do nothing).
gender_group = (
    df.groupby('Stop Year')['Sex Code']
    .value_counts(normalize=True)
    .rename('Percent')
    .reset_index()
)

In [None]:
# Display the per-year sex-code shares.
gender_group

In [None]:
# Side-by-side bars of the share of stops per sex code for each year.
fig, ax3 = plt.subplots(figsize=(12, 8))
# Draw on the axes created above explicitly instead of relying on the "current axes".
sns.barplot(x='Stop Year', y='Percent', hue='Sex Code', data=gender_group, ax=ax3)
ax3.set_title('Share of stops by sex code per year');


In [None]:
# Stacked-look chart: a full-height bar (drawn first) with the male share drawn over it,
# so the part left visible above the male bar is the non-male share.
male_rows = gender_group.loc[gender_group['Sex Code'] == 'M']
male_stops = male_rows['Percent']
female_stops = gender_group.loc[gender_group['Sex Code'] == 'F', 'Percent']

# Take the years from the data rather than assuming exactly 2010-2020, so each male
# share is labelled with its own year and the bar lists always have matching lengths.
years = male_rows['Stop Year'].astype(int).tolist()
total = [1] * len(years)

sns.barplot(x = years, y = total, color='#ff9933')
bottom_plot = sns.barplot(x = years, y = male_stops.tolist(), color='#008ae6')

# Proxy rectangles so the legend shows one swatch per colour.
topbar = plt.Rectangle((0,0), 1, 1, fc='#ff9933', edgecolor='none')
bottombar = plt.Rectangle((0,0), 1, 1, fc='#008ae6', edgecolor='none')
l = plt.legend([bottombar, topbar], ['Male', 'Female'], loc=0)
l.draw_frame(False)


In [None]:
# 'Stop Time' was already converted with pd.to_datetime in an earlier cell; this repeats it.
df['Stop Time'] = pd.to_datetime(df['Stop Time'])
# Hour of day of each stop, taken from the parsed stop time.
df['Stop Hour'] = df['Stop Time'].dt.hour
df

In [None]:
# Bar chart of the share of all stops made in each hour, ordered by hour.
df['Stop Hour'].value_counts(normalize=True).sort_index().plot.bar()

In [None]:
# Bar chart of the share of all stops made on each day of the month, ordered by day.
df['Stop Day'].value_counts(normalize=True).sort_index().plot.bar()

In [None]:
# 'Stop Day Name' stores dt.dayofweek integers (Monday=0 ... Sunday=6), so translate them
# to weekday names for the axis instead of plotting bare numbers.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
stops_by_weekday = df['Stop Day Name'].value_counts(normalize=True).sort_index()
stops_by_weekday.index = [weekday_names[int(day)] for day in stops_by_weekday.index]
ax_weekday = stops_by_weekday.plot.bar()
ax_weekday.set(title='Share of stops by day of week', xlabel='Day of week', ylabel='Share of stops');

In [None]:
# Bar chart of the share of all stops made in each calendar month, ordered by month.
df['Stop Month'].value_counts(normalize=True).sort_index().plot.bar()

In [None]:
# The same hourly shares as a table, ordered by hour.
df['Stop Hour'].value_counts(normalize=True).sort_index()

In [None]:
# Share of stops per 'Stop Type' value, drawn as a bar chart.
stop_types = df['Stop Type'].value_counts(normalize=True)
stop_types.plot.bar()

In [None]:
# Keep only the stops that have a value in 'Division Description 1'.
df_no_null_div = df.dropna(subset = ['Division Description 1'])

In [None]:
# Stops whose division name ends with 'TRAFFIC', and how many stops each such division made.
traffic_division = df_no_null_div[df_no_null_div['Division Description 1'].str.endswith('TRAFFIC')]
traffic_division['Division Description 1'].value_counts()

In [None]:
# Bar chart of the stop counts for the divisions whose name ends with 'TRAFFIC'.
traffic_division['Division Description 1'].value_counts().plot.bar()

In [None]:
# Every distinct division name that appears in the data.
df_no_null_div['Division Description 1'].unique().tolist()

In [None]:
# Division names to keep for the per-division analysis. The list also contains four
# '... BUREAU' entries, which a later cell drops again before normalizing by population.
la_divisions = ['MISSION', 'WEST LA', 'SEVENTY-SEVENTH', 'NORTH EAST', 'TOPANGA', 'WEST VALLEY', 'OLYMPIC', 'SOUTH EAST', 'VALLEY BUREAU', 'SOUTH WEST', 'FOOTHILL', 
                'NEWTON', 'RAMPART', 'HOLLYWOOD', 'NORTH HOLLYWOOD', 'WILSHIRE', 'DEVONSHIRE', 'VAN NUYS', 'WEST BUREAU', 'CENTRAL', 'PACIFIC', 'HOLLENBECK', 
                'HARBOR', 'CENTRAL BUREAU', 'SOUTH BUREAU']

# Stops made by one of the listed divisions.
la_divisions_df = df_no_null_div.loc[df_no_null_div['Division Description 1'].isin(la_divisions)]

In [None]:
# Bar chart of the number of stops per listed division.
la_divisions_df['Division Description 1'].value_counts().plot.bar(figsize=(12, 8))

In [None]:
# Population used for each division when normalizing stop counts.
# NOTE(review): no source is given for these figures and several look rounded -- confirm.
area_populations = {'CENTRAL':40000, 'HOLLENBECK': 200000, 'NEWTON':150000, 'NORTH EAST':250000, 'RAMPART':164961, 'SOUTH WEST':164552, 'SEVENTY-SEVENTH':175522, 'SOUTH EAST':127984, 'HARBOR':171866,
                   'HOLLYWOOD':300000, 'OLYMPIC': 200000, 'PACIFIC':200000, 'WEST LA':228000, 'WILSHIRE':251000, 
                    'DEVONSHIRE':219136, 'FOOTHILL':182214, 'MISSION':225849, 'NORTH HOLLYWOOD':220000, 'TOPANGA':57032, 'VAN NUYS':325000, 'WEST VALLEY':196840}

# Stops per division, without the bureau-level entries (they have no population figure).
# The column names are pinned to 'index' (division name) and 'Division Description 1'
# (stop count) because the following cells look them up under exactly those names, while
# value_counts().reset_index() names its columns differently from pandas 2.0 on.
normalized_la_divisions_df = (
    la_divisions_df['Division Description 1']
    .value_counts()
    # errors='ignore': a bureau that never occurs in the data must not abort the cell.
    .drop(['VALLEY BUREAU', 'WEST BUREAU', 'SOUTH BUREAU', 'CENTRAL BUREAU'], errors='ignore')
    .rename_axis('index')
    .reset_index(name='Division Description 1')
)


In [None]:
# Stops per resident: the stop count (column 'Division Description 1') divided by the
# population listed for the division named in column 'index'.
normalized_la_divisions_df['Stops/Population'] = normalized_la_divisions_df.apply(lambda x: x['Division Description 1']/area_populations[x['index']], axis=1)
normalized_la_divisions_df

In [None]:
# Bar chart of stops per resident for each division.
normalized_la_divisions_df.plot.bar(x='index', y='Stops/Population', figsize=(12, 8))

In [None]:
# For each descent group, the share of its stops per 'Post Stop Activity Indicator' value.
post_stop_race = df.groupby('Descent Description')['Post Stop Activity Indicator'].value_counts(normalize=True)
post_stop_race_df = pd.DataFrame(post_stop_race)

In [None]:
# Tidy frame: Descent Description, Post Stop Activity Indicator, Percentage.
# It is rebuilt from the grouped Series so that (a) the share column is named
# 'Percentage' on every pandas version (the Series is named 'proportion' from pandas 2.0,
# which made the old column rename a no-op) and (b) re-running this cell gives the same
# frame instead of stacking another index column onto it.
post_stop_race_df = post_stop_race.rename('Percentage').reset_index()

In [None]:
# Keep only the rows where the post-stop-activity indicator is 'Y'.
post_stop_race_df = post_stop_race_df[post_stop_race_df['Post Stop Activity Indicator'] == 'Y']
post_stop_race_df

In [None]:
# Share of each descent group's stops that were flagged with post stop activity.
fig, ax4 = plt.subplots(figsize=(12, 8))
# Draw on the axes created above explicitly instead of relying on the "current axes".
sns.barplot(x = 'Descent Description', y='Percentage', data = post_stop_race_df, ax=ax4)
ax4.set_title('Percentage of Stops that resulted in Post Stop Activity by Race', fontsize=18)
ax4.set_xlabel('Descent')
ax4.set_ylabel('Share of stops with post stop activity');

Black, Hispanic, and people of Multiple Descents are much more likely to be involved in post stop activity compared to Asian, White, American Indian and Other. 

In [None]:
# Serial numbers of the 20 officers with the most recorded stops.
# The per-officer counts are computed once and reused; previously the full count was
# computed twice and the first result was thrown away without being shown.
officer_stop_counts = df['Officer 1 Serial Number'].value_counts()
top20_index = list(officer_stop_counts.head(20).index)
top20_index

In [None]:
# All stops made by one of the 20 selected officers.
top20_df = df[df['Officer 1 Serial Number'].isin(top20_index)]

top20_df


In [None]:
# For each selected officer, the share of their stops per descent group, as a one-column
# frame ('Percent') indexed by (officer, descent). The Series is renamed before it is
# turned into a frame so the column is called 'Percent' on every pandas version (the
# value_counts result is named 'proportion' from pandas 2.0, which made the old column
# rename a no-op).
# Note: this replaces the row-level top20_df, so the cell cannot simply be re-run.
top20_df = (
    top20_df.groupby(['Officer 1 Serial Number'])['Descent Description']
    .value_counts(normalize=True)
    .rename('Percent')
    .to_frame()
)
top20_df

In [None]:
# Move the index levels back into ordinary columns for plotting.
top20_df = top20_df.reset_index()

In [None]:
# Per-officer descent shares for the 20 selected officers, one bar colour per descent group.
# No ax is passed to barplot, so it draws on the current axes (the one just created).
fig, ax5 = plt.subplots(figsize=(20, 8))
sns.barplot(x='Officer 1 Serial Number', y='Percent', hue='Descent Description', data=top20_df)

In [None]:
# Display the working frame.
df

In [None]:
# Share of stops per descent group within each hour of the day, as a tidy frame with
# columns Stop Hour, Descent Description, Percent. Renaming the Series before
# reset_index gives the 'Percent' column on every pandas version (the value_counts
# result is named 'proportion' from pandas 2.0, which made the old column rename a no-op).
hour_descent_group = df.groupby('Stop Hour')['Descent Description'].value_counts(normalize=True)
hour_descent_group_df = hour_descent_group.rename('Percent').reset_index()

# Overall share of stops per descent group; also read by later cells.
stops_by_race = df['Descent Description'].value_counts(normalize=True)

fig, ax = plt.subplots(figsize=(20,8))

# One fixed colour per descent label, shared by the bars and the reference lines, so a
# line is matched to its bars by name. Pairing colours and shares by position is only
# right when the ranking inside the first hour equals the overall ranking.
descent_order = list(stops_by_race.index)
palette = dict(zip(descent_order, sns.color_palette(n_colors=len(descent_order))))

sns.barplot(x='Stop Hour', y='Percent', hue='Descent Description', hue_order=descent_order,
            palette=palette, data=hour_descent_group_df, ax=ax)

# Dash-dotted reference line at each group's overall share of stops.
for descent, overall_share in stops_by_race.items():
    ax.axhline(overall_share, color=palette[descent], linewidth = 0.6, alpha=1, linestyle='-.')

ax.set_title('Stops made each hour by Race', fontsize=20);

In [None]:
# Display the working frame.
df

In [None]:
# Overall share of stops per descent group.
df['Descent Description'].value_counts(normalize=True)


In [None]:
# Scratch check of Python's chained comparison: with x = 7 this evaluates to False.
# NOTE(review): not used by the analysis; it only rebinds the name x.
x = 7
4 < x < 6

In [None]:
def time_category(hour):
    """Return the time-of-day bucket for an hour of the day.

    Buckets are half-open ranges: 6 <= hour < 12 is 'morning', 12 <= hour < 17 is
    'afternoon' and 17 <= hour < 20 is 'evening'. Every other value is 'night',
    including values that compare false against all bounds, such as NaN.
    """
    buckets = (
        ('morning', 6, 12),
        ('afternoon', 12, 17),
        ('evening', 17, 20),
    )
    for label, first_hour, end_hour in buckets:
        if first_hour <= hour < end_hour:
            return label
    return 'night'

# Label every stop with its time-of-day bucket, derived from 'Stop Hour'.
df['Time Category'] = df['Stop Hour'].apply(time_category)
df

In [None]:
# Share of stops per descent group within each time-of-day bucket, as a tidy frame with
# columns Time Category, Descent Description, Percent. Renaming the Series before
# reset_index gives the 'Percent' column on every pandas version (the value_counts
# result is named 'proportion' from pandas 2.0, which made the old column rename a no-op).
time_descent_group = df.groupby('Time Category')['Descent Description'].value_counts(normalize=True)
time_descent_group_df = time_descent_group.rename('Percent').reset_index()

time_descent_group_df



In [None]:
# Display order for the time-of-day buckets and for the descent groups.
d = {'morning':1, 'afternoon':2, 'evening':3, 'night':4}
d1 = {'HISPANIC':1, 'BLACK':2, 'WHITE':3, 'ASIAN':4, 'OTHER':5, 'MULTI-DESCENTS':6, 'AMERICAN INDIAN':7}
# Inverse lookups; no longer needed for the sort below but kept in case other cells use them.
d2 = {1:'morning', 2:'afternoon', 3:'evening', 4:'night'}
d3 = {1:'HISPANIC', 2:'BLACK', 3:'WHITE', 4:'ASIAN', 5:'OTHER', 6:'MULTI-DESCENTS', 7:'AMERICAN INDIAN'}

# Sort by rank columns that exist only for the duration of the sort. The labels are
# never rewritten, unlike replacing them with numbers across the whole frame and
# translating them back, which relies on deprecated downcasting in replace() and fails
# on a label missing from the dicts (such rows now simply sort last).
time_descent_group_df = (
    time_descent_group_df
    .assign(
        _time_rank=lambda frame: frame['Time Category'].map(d),
        _descent_rank=lambda frame: frame['Descent Description'].map(d1),
    )
    .sort_values(['_time_rank', '_descent_rank'])
    .drop(columns=['_time_rank', '_descent_rank'])
)
time_descent_group_df

In [None]:


fig, ax5 = plt.subplots(figsize=(20,8))

# One fixed colour per descent label, shared by the bars and the reference lines.
# Previously the colours were read from `ax`, the axes of a different figure, and
# paired with the overall shares by position, so a line could carry another group's colour.
descent_order = list(time_descent_group_df['Descent Description'].unique())
palette = dict(zip(descent_order, sns.color_palette(n_colors=len(descent_order))))

sns.barplot(x='Time Category', y='Percent', hue='Descent Description', hue_order=descent_order,
            palette=palette, data=time_descent_group_df, ax=ax5)

# Dash-dotted reference line at each group's overall share of stops.
for descent, overall_share in stops_by_race.items():
    if descent in palette:
        ax5.axhline(overall_share, color=palette[descent], linewidth = 0.6, alpha=1, linestyle='-.')

ax5.set_title('Stops made each time of day by Race', fontsize=20)

ax5.set_xlabel('Time Category\n Morning: 6:00-11:59, Afternoon: 12:00-4:59, Evening: 5:00 - 7:59pm, Night: 8:00pm - 5:59am', fontsize=14);


In [None]:
# Restrict to stops whose 'Stop Type' is 'VEH'.
vehicle_stops = df[df['Stop Type']=='VEH']

# Share of vehicle stops per descent group within each time-of-day bucket, as a tidy
# frame with columns Time Category, Descent Description, Percent. Renaming the Series
# before reset_index gives the 'Percent' column on every pandas version (the
# value_counts result is named 'proportion' from pandas 2.0, which made the old column
# rename a no-op).
vehicle_time_descent_group = vehicle_stops.groupby('Time Category')['Descent Description'].value_counts(normalize=True)
vehicle_time_descent_group_df = vehicle_time_descent_group.rename('Percent').reset_index()

# Display order for the time-of-day buckets and for the descent groups.
d = {'morning':1, 'afternoon':2, 'evening':3, 'night':4}
d1 = {'HISPANIC':1, 'BLACK':2, 'WHITE':3, 'ASIAN':4, 'OTHER':5, 'MULTI-DESCENTS':6, 'AMERICAN INDIAN':7}
# Inverse lookups; no longer needed for the sort below but kept in case other cells use them.
d2 = {1:'morning', 2:'afternoon', 3:'evening', 4:'night'}
d3 = {1:'HISPANIC', 2:'BLACK', 3:'WHITE', 4:'ASIAN', 5:'OTHER', 6:'MULTI-DESCENTS', 7:'AMERICAN INDIAN'}

# Sort by rank columns that exist only for the duration of the sort, instead of
# replacing the labels with numbers across the whole frame and translating them back.
# Rows with a label missing from the dicts sort last.
vehicle_time_descent_group_df = (
    vehicle_time_descent_group_df
    .assign(
        _time_rank=lambda frame: frame['Time Category'].map(d),
        _descent_rank=lambda frame: frame['Descent Description'].map(d1),
    )
    .sort_values(['_time_rank', '_descent_rank'])
    .drop(columns=['_time_rank', '_descent_rank'])
)
vehicle_time_descent_group_df

In [None]:
fig, ax6 = plt.subplots(figsize=(20,8))

# One fixed colour per descent label, shared by the bars and the reference lines.
# Previously the colours were read from `ax`, the axes of a different figure, and
# paired with the overall shares by position, so a line could carry another group's colour.
descent_order = list(vehicle_time_descent_group_df['Descent Description'].unique())
palette = dict(zip(descent_order, sns.color_palette(n_colors=len(descent_order))))

sns.barplot(x='Time Category', y='Percent', hue='Descent Description', hue_order=descent_order,
            palette=palette, data=vehicle_time_descent_group_df, ax=ax6)

# Dash-dotted reference line at each group's share of ALL stops (stops_by_race is
# computed over every stop type, not only vehicle stops).
for descent, overall_share in stops_by_race.items():
    if descent in palette:
        ax6.axhline(overall_share, color=palette[descent], linewidth = 0.6, alpha=1, linestyle='-.')

# The title names the vehicle-only subset so this figure is not mistaken for the
# all-stops figure, which used the identical title.
ax6.set_title('Vehicle stops made each time of day by Race', fontsize=20)

ax6.set_xlabel('Time Category\n Morning: 6:00-11:59, Afternoon: 12:00-4:59, Evening: 5:00 - 7:59pm, Night: 8:00pm - 5:59am', fontsize=14);

In [None]:
# Display the working frame.
df

In [None]:
# Columns carried into the modelling frame; 'Post Stop Activity Indicator' is split off as the target in a later cell.
new_df = df[['Sex Code', 'Descent Description', 'Stop Type', 'Stop Hour', 'Time Category', 'Stop Year', 'Stop Month', 'Stop Day', 'Stop Day Name', 'Division Description 1', 'Post Stop Activity Indicator']]

In [None]:
# Display the modelling frame.
new_df

In [None]:
# Same division list as defined earlier in the notebook, repeated here.
# Unlike the population-normalized analysis, the '... BUREAU' entries are kept.
la_divisions = ['MISSION', 'WEST LA', 'SEVENTY-SEVENTH', 'NORTH EAST', 'TOPANGA', 'WEST VALLEY', 'OLYMPIC', 'SOUTH EAST', 'VALLEY BUREAU', 'SOUTH WEST', 'FOOTHILL', 
                'NEWTON', 'RAMPART', 'HOLLYWOOD', 'NORTH HOLLYWOOD', 'WILSHIRE', 'DEVONSHIRE', 'VAN NUYS', 'WEST BUREAU', 'CENTRAL', 'PACIFIC', 'HOLLENBECK', 
                'HARBOR', 'CENTRAL BUREAU', 'SOUTH BUREAU']

# Keep only the stops made by one of the listed divisions.
new_df = new_df.loc[new_df['Division Description 1'].isin(la_divisions)]

new_df

In [None]:
# Drop every row that has a missing value in any of the selected columns.
new_df = new_df.dropna(axis=0)

In [None]:
# Missing values per column; every count should be 0 after the dropna above.
new_df.isnull().sum()

In [None]:
# Share of each 'Post Stop Activity Indicator' value among the remaining rows.
new_df['Post Stop Activity Indicator'].value_counts(normalize=True)

In [None]:
# Display the cleaned modelling frame.
new_df

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
target_column = 'Post Stop Activity Indicator'
y = new_df[target_column]
X = new_df.drop(columns=[target_column])

In [None]:
# Every feature is cast to string first, so numeric columns such as the hour and the year
# are one-hot encoded as categories rather than passed through as numbers.
X_dummy = pd.get_dummies(X.astype(str))
