# Exploring trends in the cleaned data
This notebook will delve into some basic trends of the crash data. 

Before running this notebook, run the cleaning scripts in the `cleaning-workflow/cleaning-scripts` folder on GitHub, following the instructions in the `readme` document. This notebook analyzes the `master-crashes.csv` file, which is generated at the end of the cleaning process.

In [16]:
# basic data/viz libraries
import pandas as pd
from matplotlib import pyplot as plt
from datetime import date, time

import altair as alt
# saving data into a file rather than embedding into the chart
alt.data_transformers.enable('json')
# Select Altair's default renderer. This is the last expression in the cell,
# so its return value is what the cell displays.
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [138]:
# Raw crash export covering 2003-2015 (per the file name), decoded with the
# 'unicode_escape' codec.
df = pd.read_csv(
    filepath_or_buffer='../source-data/moco-crash-2003-2015.csv',
    encoding='unicode_escape',
)

In [144]:
# List the distinct injury categories recorded in the raw 2003-2015 data.
df['Injury Type'].unique()

array(['No injury/unknown', 'Non-incapacitating', 'Incapacitating',
       'Fatal'], dtype=object)

In [8]:
# Load the cleaned master crash table produced by the cleaning scripts.
# low_memory=False has pandas parse the file in one pass instead of in chunks.
crash_df = pd.read_csv(
    filepath_or_buffer='../data-output/master-crashes.csv',
    low_memory=False,
)

In [9]:
# Preview the loaded crash table (pandas shows only the first and last rows of a long frame).
crash_df

Unnamed: 0,Vehicles Involved,Number Injured,Number Dead,Roadway Id,Intersecting Road,Latitude,Longitude,Primary Factor,Manner of Collision,DateTime
0,1.0,0.0,0.0,I69N,STATE RD 37,38.329723,-86.509226,ANIMAL/OBJECT IN ROADWAY,COLLISION WITH ANIMAL OTHER,2022-01-07 05:14:00
1,1.0,0.0,0.0,SR46W,DEER PARK,39.212153,-86.587526,ANIMAL/OBJECT IN ROADWAY,COLLISION WITH DEER,2022-01-08 08:35:00
2,1.0,0.0,0.0,W REEVES,,39.235012,-86.676553,RAN OFF ROAD RIGHT,RAN OFF ROAD,2022-01-17 07:33:00
3,2.0,0.0,0.0,THIRD,S HAWTHORNE,39.156888,-86.520324,UNSAFE LANE MOVEMENT,SAME DIRECTION SIDESWIPE,2022-01-04 12:32:00
4,2.0,0.0,0.0,S HENDERSON,E HILLSIDE,39.150640,-86.526960,FAILURE TO YIELD RIGHT OF WAY,RIGHT ANGLE,2022-01-01 05:33:00
...,...,...,...,...,...,...,...,...,...,...
74617,,0.0,0.0,DUNN,WHITE LOT WEST,0.000000,0.000000,IMPROPER LANE USAGE,,2003-10-06 17:00:00
74618,,0.0,0.0,RED OAK,SR446,0.000000,0.000000,UNSAFE SPEED,,2003-11-03 08:00:00
74619,,0.0,0.0,2ND ST,WALNUT,0.000000,0.000000,BRAKE FAILURE OR DEFECTIVE,,2003-12-05 12:00:00
74620,,0.0,0.0,NINETH,NORTH,0.000000,0.000000,UNSAFE BACKING,,2003-12-01 07:00:00


First, let's see some time trends. How has the frequency of crashes and crash injuries/fatalities changed over the time span of the dataset? 

In [10]:
def get_year(date):
    """Return the calendar year of ``date``.

    ``date`` may be anything ``pd.to_datetime`` can parse, such as a
    ``'YYYY-MM-DD HH:MM:SS'`` string.
    """
    parsed = pd.to_datetime(date)
    return parsed.year

In [18]:
def get_month(date):
    """Return the month number (1-12) of ``date``.

    ``date`` may be anything ``pd.to_datetime`` can parse, such as a
    ``'YYYY-MM-DD HH:MM:SS'`` string.
    """
    parsed = pd.to_datetime(date)
    return parsed.month

In [21]:
# Parse the timestamp column once, vectorized, instead of calling
# pd.to_datetime on every row twice (once for each derived column).
crash_timestamps = pd.to_datetime(crash_df['DateTime'])
crash_df['Year'] = crash_timestamps.dt.year
crash_df['Month'] = crash_timestamps.dt.month

In [34]:
def make_line_chart(source, title, values, x_axis, y_axis, sort=None):
    """Build a 500x300 Altair line chart from a summary table.

    Parameters
    ----------
    source
        Data for the chart (e.g. a pandas DataFrame), one row per plotted point.
    title : str
        Chart title.
    values : list
        Tick values to label on the x axis.
    x_axis, y_axis : str
        Altair shorthand for the x and y encodings, e.g. ``'Year:O'``.
    sort : optional
        Sort order for the x axis, forwarded to ``alt.X``. The default,
        ``None``, keeps the points in the row order of ``source``. Pass
        ``'ascending'``, ``'descending'`` or an explicit list of values to
        override it.

    Returns
    -------
    alt.Chart
        The line chart, ready to be displayed.
    """
    x_encoding = alt.X(
        x_axis,
        # When no sort is given, discrete fields are ordered ascending, which
        # puts string labels such as month names in alphabetical order
        # (Apr, Aug, Dec, ...) rather than the order supplied in `source`.
        sort=sort,
        axis=alt.Axis(values=values, grid=True, labelAngle=0),
    )
    chart = alt.Chart(source, width=500, height=300, title=title)
    return chart.mark_line().encode(x_encoding, alt.Y(y_axis))

In [53]:
# total crashes per year
# value_counts tallies every year in one pass (rows with a missing year are
# ignored) and sort_index puts the years in chronological order.
crashes_per_year = crash_df['Year'].value_counts().sort_index()
years = [str(int(year)) for year in crashes_per_year.index]
counts = crashes_per_year.tolist()

source = pd.DataFrame({
  'Year': years,
  'Number of Crashes': counts
})
title='Car crashes year by year in Monroe County, IN'
values=years
x_axis='Year:O'
y_axis='Number of Crashes:Q'

make_line_chart(source, title, values, x_axis,y_axis)

In [54]:
# total crashes per month
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
# Reindexing on 1..12 guarantees exactly one count per calendar month, in
# calendar order, even if a month has no crashes, so the counts always line up
# with month_names.
crashes_per_month = (
    crash_df['Month']
    .value_counts()
    .reindex(range(1, 13), fill_value=0)
)
months = month_names
counts = crashes_per_month.tolist()

source = pd.DataFrame({
  'Month': months,
  'Number of Crashes': counts
})
# The data set includes 2022 crashes, so the title states the full range.
title='Car crashes by month in Monroe County, IN (data from 2003-22)'
values=month_names
x_axis='Month:O'
y_axis='Number of Crashes:Q'

make_line_chart(source, title, values, x_axis, y_axis)

In [55]:
# fatalities per year
# 2003-2012 ARE LOW ESTIMATES
# Sum the per-crash death tolls so the plotted value is the number of deaths,
# as the title and axis label state, rather than the number of crashes that
# had at least one death.
deaths_per_year = crash_df.groupby('Year')['Number Dead'].sum().sort_index()
years = [int(year) for year in deaths_per_year.index]
counts = deaths_per_year.astype(int).tolist()

source = pd.DataFrame({
  'Year': years,
  'Number of Crash Deaths': counts
})
title='Crash deaths by year in Monroe County, IN (data from 2003-22)'
values=years
x_axis='Year:O'
y_axis='Number of Crash Deaths:Q'

make_line_chart(source, title, values, x_axis, y_axis)

In [59]:
# fatalities per year as a proportion of total crashes
# 2003-2012 ARE LOW ESTIMATES
# The mean of a boolean flag within each year is the share of that year's
# crashes with at least one death; grouping does this in a single pass and
# skips rows whose year is missing.
fatal_share_by_year = (
    crash_df['Number Dead'].gt(0)
    .groupby(crash_df['Year'])
    .mean()
    .sort_index()
)
years = [int(year) for year in fatal_share_by_year.index]
counts = fatal_share_by_year.tolist()

source = pd.DataFrame({
  'Year': years,
  'Proportion of Crashes that Involved Fatalities': counts
})
title='Proportion of Crashes that Involved Fatalities in Monroe County, IN (data from 2003-22)'
values=years
x_axis='Year:O'
y_axis='Proportion of Crashes that Involved Fatalities:Q'

make_line_chart(source, title, values, x_axis, y_axis)

In [57]:
# injuries per year
# 2003-2012 ARE LOW ESTIMATES
# Sum the per-crash injury counts so the plotted value is the number of
# injuries, as the title and axis label state, rather than the number of
# crashes that had at least one injury.
injuries_per_year = crash_df.groupby('Year')['Number Injured'].sum().sort_index()
years = [int(year) for year in injuries_per_year.index]
counts = injuries_per_year.astype(int).tolist()

source = pd.DataFrame({
  'Year': years,
  'Number of Crash Injuries': counts
})
title='Crash injuries by year in Monroe County, IN (data from 2003-22)'
values=years
x_axis='Year:O'
y_axis='Number of Crash Injuries:Q'

make_line_chart(source, title, values, x_axis, y_axis)

In [60]:
# injuries per year as a proportion of total crashes
# 2003-2012 ARE LOW ESTIMATES
# The mean of a boolean flag within each year is the share of that year's
# crashes with at least one injury; grouping does this in a single pass and
# skips rows whose year is missing.
injury_share_by_year = (
    crash_df['Number Injured'].gt(0)
    .groupby(crash_df['Year'])
    .mean()
    .sort_index()
)
years = [int(year) for year in injury_share_by_year.index]
counts = injury_share_by_year.tolist()

source = pd.DataFrame({
  'Year': years,
  'Proportion of Crashes that Involved Injury': counts
})
title='Proportion of Crashes that Involved Injury in Monroe County, IN (data from 2003-22)'
values=years
x_axis='Year:O'
y_axis='Proportion of Crashes that Involved Injury:Q'

make_line_chart(source, title, values, x_axis, y_axis)

In [64]:
# proportion of crashes that resulted in injury each month
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
# Each month's share is computed from that month's own rows: the mean of the
# "had an injury" flag within the month. (The numerator must not come from a
# leftover per-year frame, which would divide one year's injury crashes by a
# month's total.) Reindexing on 1..12 keeps the values aligned with
# month_names; a month with no crashes would show as NaN.
injury_share_by_month = (
    crash_df['Number Injured'].gt(0)
    .groupby(crash_df['Month'])
    .mean()
    .reindex(range(1, 13))
)
months = month_names
counts = injury_share_by_month.tolist()

source = pd.DataFrame({
  'Month': months,
  'Proportion of Injuries': counts
})
title='Proportion of Crashes that Involved Injury by Month'
values=month_names
x_axis='Month:O'
y_axis='Proportion of Injuries:Q'

make_line_chart(source, title, values, x_axis, y_axis)

In [65]:
# proportion of crashes that resulted in death each month
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
# Each month's share is computed from that month's own rows: the mean of the
# "had a death" flag within the month. (The numerator must not come from a
# leftover per-year frame, which would divide one year's fatal crashes by a
# month's total.) Reindexing on 1..12 keeps the values aligned with
# month_names; a month with no crashes would show as NaN.
fatal_share_by_month = (
    crash_df['Number Dead'].gt(0)
    .groupby(crash_df['Month'])
    .mean()
    .reindex(range(1, 13))
)
months = month_names
counts = fatal_share_by_month.tolist()

source = pd.DataFrame({
  'Month': months,
  'Proportion of Fatalities': counts
})
title='Proportion of Crashes that Involved Fatalities by Month'
values=month_names
x_axis='Month:O'
y_axis='Proportion of Fatalities:Q'

make_line_chart(source, title, values, x_axis, y_axis)