## Descriptives of crash/concern data for Vision Zero
### Developed by: bpben
#### Produces visuals of crash/concern data

In [1]:
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
from simpledbf import Dbf5
from sqlalchemy import create_engine

In [None]:
def read_data(filepath):
    """Load a tabular file into a DataFrame with lower-cased column names.

    Paths ending in ``dbf`` (in any letter case) are read with ``Dbf5``;
    every other path is handed to ``pandas.read_csv``.

    Args:
        filepath: Path of the file to read, as a string.

    Returns:
        pandas.DataFrame whose column labels have been lower-cased.
    """
    # Compare case-insensitively so e.g. 'ROADS.DBF' is not sent to read_csv
    if filepath.lower().endswith('dbf'):
        df = Dbf5(filepath).to_dataframe()
    else:
        df = pd.read_csv(filepath)
    df.columns = [col.lower() for col in df.columns]
    return df

In [None]:
# Load the four input tables; read_data lower-cases every column name
crash = read_data(filepath='../data/all_crashes.dbf')
concern = read_data(filepath='../data/all_concerns.dbf')
incidents = read_data(filepath='../data/incidents.csv')
roads = read_data(filepath='../data/boston_road_massdot2015.dbf')

In [None]:
# De-duplicate incidents, then attach the incident detail to the crash rows.
# For each 'incident' id only the first distinct value of every column is
# kept, and that value is converted to a string.
# NOTE(review): the stringification presumably also hits the 'incident' key
# itself -- confirm crash['incident'] has a matching dtype for the merge.
def _first_value_as_str(col):
    # First distinct value of one column within a single incident group
    return str(col.unique()[0])

in_dedup = incidents.groupby('incident').apply(
    lambda grp: grp.apply(_first_value_as_str))
in_dedup = in_dedup.reset_index(drop=True)
cr_in = crash.merge(in_dedup, on='incident', how='left')
# 'datetime' keeps the parsed timestamp; 'date' is reduced to the calendar day
cr_in['datetime'] = pd.to_datetime(cr_in['date'])
cr_in['date'] = pd.to_datetime(cr_in['datetime'].apply(lambda ts: ts.date()))

In [None]:
#Merge crash with concerns
# Only concerns that carry a near_fid can be joined.
# .copy() yields an independent frame, so the column assignments below do not
# operate on a filtered slice (avoids pandas' SettingWithCopyWarning).
concern = concern[concern['near_fid'].notnull()].copy()
concern['near_fid'] = concern['near_fid'].astype('int')
concern['requestdat'] = pd.to_datetime(concern['requestdat'])
# Inner join: keeps only near_fid values present in both frames, pairing
# every crash row with every concern row that shares the near_fid
cr_in_con = pd.merge(cr_in, concern, on='near_fid', how='inner')

In [None]:
#Get dummies for crash type and complaint type
#Look at correlations between complaints and incidents
#Drop any where type or usertype is missing
#(i.e. crashes, no complaints ; complaints, no crashes)
cr_con_nonull = cr_in_con[cr_in_con['type'].notnull() &
                          cr_in_con['usertype'].notnull()]
# Label the indicator columns while they are created, instead of relabelling
# by position afterwards: the old "first 3 columns are crash types" rule
# mislabelled columns whenever the number of crash types was not exactly 3.
crash_dummies = pd.get_dummies(cr_con_nonull['type'],
                               prefix='Crash', prefix_sep=' - ')
concern_dummies = pd.get_dummies(cr_con_nonull['usertype'],
                                 prefix='Concern', prefix_sep=' - ')
cr_con_types = pd.concat([crash_dummies,
                          concern_dummies,
                          cr_con_nonull['near_fid']], axis=1)
#0/1 presence/absence of type per near_fid
cr_con_types = cr_con_types.groupby('near_fid').max()

#Drop assistive device/other
cr_con_types.drop([u'Concern - travels (other)',
                   u'Concern - uses an assistive device'], axis=1, inplace=True)

### Heatmap

In [None]:
# Pairwise correlation between the crash-type and concern-type indicators
cr_con_cor = cr_con_types.corr(method='pearson')

In [None]:
# Draw the correlation matrix with the colour scale limited to [-0.4, 0.4]
heatmap_limits = {'vmin': -0.4, 'vmax': 0.4}
p = sns.heatmap(cr_con_cor, **heatmap_limits)
plt.show()

### Volumes

In [None]:
#Year for focus
# VZ data is for 2016, so it makes sense to constrain to that year
yr = '2016'

# Concern by date: distinct request ids per 'requestdat' value
con_time = concern.set_index('requestdat')
# Row selection by partial date string must go through .loc;
# frame[yr] is treated as a column lookup by pandas >= 2.0
con_time = con_time.loc[yr]
con_time = con_time.groupby(con_time.index).apply(lambda x: len(x['requestid'].unique()))

# Crash by date: distinct incident ids per 'date' value
cr_time = cr_in.set_index('date')
cr_time = cr_time.loc[yr]
cr_time = cr_time.groupby(cr_time.index).apply(lambda x: len(x['incident'].unique()))

In [None]:
# Side-by-side volume plots: concerns on the left, crashes on the right
fig, axs = plt.subplots(1, 2)
# NOTE(review): set_style is called after the axes already exist --
# confirm it actually takes effect for this figure
sns.set_style('white')
ax_con, ax_cr = axs

ax_con.plot(con_time, color='purple')
ax_con.set_title('concerns')
plt.setp(ax_con.xaxis.get_majorticklabels(), rotation=70)
# Earliest concern date; reused as the left edge of the crash panel
l_bound = con_time.index.min()

ax_cr.plot(cr_time, color='crimson')
ax_cr.set_title('crashes')
plt.setp(ax_cr.xaxis.get_majorticklabels(), rotation=70)
ax_cr.set_xlim([l_bound, cr_time.index.max()])

plt.show()

### General data exploration

In [None]:
# Concern by month (restricted to the focus year `yr`)
con_month = concern.set_index('requestdat')
# .loc selects rows by partial date string (frame[yr] is a column lookup in
# pandas >= 2.0); .copy() so the new column is not written to a slice
con_month = con_month.loc[yr].copy()
con_month['month'] = con_month.index.month
# Group on the derived month column. Grouping on 'requestdat' (the index)
# produced one bucket per request date value, not one per month.
con_month = con_month.groupby('month').apply(lambda x: len(x['requestid'].unique()))

# Crash by month
# NOTE(review): unlike the concerns above, crashes are not restricted to
# `yr` here -- confirm whether that is intended.
cr_month = cr_in.copy()
cr_month['month'] = pd.to_datetime(cr_month['date']).dt.month
cr_month = cr_month.groupby('month').apply(lambda x: len(x['incident'].unique()))

In [None]:
# Most dangerous road segments, by mode type
g_fid_type = cr_in_con.groupby(['near_fid', 'type'])
# Distinct incidents for every (near_fid, type) pair
fid_inc_count = g_fid_type.apply(lambda x: len(x['incident'].unique()))
#Total incident by type
inc_tot = fid_inc_count.reset_index().groupby('type')[0].sum().to_dict()
# One descending-sorted frame per type, stacked with a single concat.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
incident_count = pd.concat(
    [fid_inc_count.loc(axis=0)[:, t].sort_values(ascending=False).reset_index()
     for t in ['bike', 'mv', 'pedestrian']])

In [None]:
# Most complained about road segments, by user type
g_fid_type = cr_in_con.groupby(['near_fid', 'usertype'])
# Distinct concern requests for every (near_fid, usertype) pair.
# This previously counted unique 'incident' values (copied from the crash
# cell), which yields crash counts rather than complaint counts.
fid_concern_count = g_fid_type.apply(lambda x: len(x['requestid'].unique()))
#Total complaint by type
con_tot = fid_concern_count.reset_index().groupby('usertype')[0].sum().to_dict()
# One descending-sorted frame per user type, stacked with a single concat.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
concern_count = pd.concat(
    [fid_concern_count.loc(axis=0)[:, t].sort_values(ascending=False).reset_index()
     for t in ['bikes', 'drives', 'walks']])

In [None]:
# Share of each type's total that every near_fid accounts for.
# 'pct' is stored as a fraction (count / total for that type), not times 100.
inc_pct_fid = fid_inc_count.reset_index()
inc_type_total = inc_pct_fid['type'].map(inc_tot)
inc_pct_fid['pct'] = 1. * inc_pct_fid[0] / inc_type_total

con_pct_fid = fid_concern_count.reset_index()
con_type_total = con_pct_fid['usertype'].map(con_tot)
con_pct_fid['pct'] = 1. * con_pct_fid[0] / con_type_total

In [None]:
# Display top 3 incident, complaints and % of complaints/incidents they account for
# print(...) with a single argument behaves the same on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print('Top incident')
# groupby.head keeps the first 3 rows of each type without moving 'type'
# into the index, so the merge key stays unambiguous.
top3inc = incident_count.groupby('type').head(3)
print(top3inc.merge(inc_pct_fid, on=['near_fid', 'type']))
print('Top complaint')
top3con = concern_count.groupby('usertype').head(3)
print(top3con.merge(con_pct_fid, on=['near_fid', 'usertype']))

In [None]:
# Reshape both long count tables to one row per near_fid, one column per
# type, plus a row-total column
def _pivot_with_total(long_counts, type_col, total_col):
    # Wide table indexed by near_fid; total_col holds the sum across types
    wide = long_counts.pivot(index='near_fid', columns=type_col, values=0)
    wide[total_col] = wide.sum(axis=1)
    return wide

concern_count = _pivot_with_total(concern_count, 'usertype', 'tot_con')
incident_count = _pivot_with_total(incident_count, 'type', 'tot_inc')

In [None]:
# Combine concern and incident counts; the outer join keeps a near_fid that
# appears in either table
c_i_count = concern_count.join(incident_count, how='outer')
# Attach the road length, matched on index
# NOTE(review): this presumably relies on near_fid being the row index of
# `roads` -- confirm
c_i_count = c_i_count.join(roads['shape_leng'])
# Length divided by 1000 (assumes shape_leng is in metres -- TODO confirm)
c_i_count['shape_leng_km'] = c_i_count['shape_leng'] / 1000
# Divide every count column by the segment length in km
length_cols = ['shape_leng', 'shape_leng_km']
count_cols = c_i_count.columns.difference(length_cols)
count_norm = c_i_count[count_cols].div(c_i_count['shape_leng_km'], axis=0)