# Some interesting things from VicRoads Crash Stats

#### Import some data

In [3]:
import pandas as pd

df_accident = pd.read_csv(
    'vicroads_data/ACCIDENT/ACCIDENT.csv', # file name and path
    sep=',', # separated by commas
    index_col=0 # first column should be primary key
)

In [4]:
# data read okay?
print(df_accident.head(3))

             ACCIDENTDATE ACCIDENTTIME  ACCIDENT_TYPE  \
ACCIDENT_NO                                             
T20060000010   13/01/2006     12.42.00              1   
T20060000018   13/01/2006     19.10.00              1   
T20060000022   14/01/2006     12.10.00              7   

                          Accident Type Desc  DAY_OF_WEEK  \
ACCIDENT_NO                                                 
T20060000010          Collision with vehicle            6   
T20060000018          Collision with vehicle            6   
T20060000022  Fall from or in moving vehicle            7   

             Day Week Description  DCA_CODE  \
ACCIDENT_NO                                   
T20060000010               Friday       113   
T20060000018               Friday       113   
T20060000022             Saturday       190   

                                             DCA Description DIRECTORY  \
ACCIDENT_NO                                                              
T20060000010  RIGHT NEAR

## charts

#### Include Libraries

In [5]:
import pandas as pd
import numpy as np
import plotly.offline as py
from plotly.offline import *
import plotly.graph_objs as go

init_notebook_mode(connected=True) # render plotly charts in the notebook on the fly

#### data prep

Manipulate data: create a copy of df_accident, add columns for:
* Year
* Month

Filter out 2006 and 2017, as they are incomplete in this data

In [6]:
# Manipulate data
df_accident_mod = df_accident
#print(df_accident_mod.head(3))

## DATES

# make a string representing date time
df_accident_mod['date'] = df_accident_mod['ACCIDENTDATE'] + ' ' + df_accident_mod['ACCIDENTTIME']

# convert to datetime
df_accident_mod['date'] = pd.to_datetime(df_accident_mod['date'], format="%d/%m/%Y %H.%M.%S")

# make some useful date fields
df_accident_mod['year'] = pd.DatetimeIndex(df_accident_mod['date']).year
df_accident_mod['month'] = pd.DatetimeIndex(df_accident_mod['date']).month
df_accident_mod['hour'] = pd.DatetimeIndex(df_accident_mod['date']).hour

#print(df_accident_mod.head(3))

## FILTER

# remove < 2007 and > 2016, as 2006 and 2017 are incomplete years
df_accident_mod = df_accident_mod[df_accident_mod['year'] >= 2007]
df_accident_mod = df_accident_mod[df_accident_mod['year'] <= 2016]

# remove speed limits > 110 km/h - 777, 888, and 999 are used to code:
# 777 = other
# 888 = camping grounds or off road
# 999 = not known

df_accident_mod = df_accident_mod[df_accident_mod['SPEED_ZONE'] <= 110]

print(df_accident_mod.head(3))

             ACCIDENTDATE ACCIDENTTIME  ACCIDENT_TYPE  \
ACCIDENT_NO                                             
T20070000004    1/01/2007     02.55.00              4   
T20070000008    1/01/2007     03.59.00              1   
T20070000009    1/01/2007     03.30.00              4   

                         Accident Type Desc  DAY_OF_WEEK Day Week Description  \
ACCIDENT_NO                                                                     
T20070000004  Collision with a fixed object            2               Monday   
T20070000008         Collision with vehicle            2               Monday   
T20070000009  Collision with a fixed object            2               Monday   

              DCA_CODE                                    DCA Description  \
ACCIDENT_NO                                                                 
T20070000004       171  LEFT OFF CARRIAGEWAY INTO OBJECT/PARKED VEHICL...   
T20070000008       140                  U TURN                              


#### Make some charts

* run group bys in pandas
* configure chart layout
* display

In [7]:
accidents_by_year =  df_accident_mod.groupby([ 'year']).size().reset_index(name="cnt")
print(accidents_by_year.head(3))

   year    cnt
0  2007  12980
1  2008  13494
2  2009  13097


In [8]:
seriesCrashesbyYearBar = go.Bar(
    x = accidents_by_year['year'],
    y = accidents_by_year['cnt'],
    #mode = 'scatter',
    name = 'Crashes per Year',
    marker = dict(
        color = '#ee2737'
    )
)

layout = go.Layout(
    title = 'VicRoads Accident - Accidents per Year',
    titlefont=dict(
            family='Open Sans',
            size=22#,
            #color='#7f7f7f'
    ),
    xaxis=dict(
        tickangle = -45, # angle lables at 45 deg
        dtick = 1, # label every 1 bars
        title = 'Year',
            titlefont=dict(
                family='Open Sans',
                size=16#,
                #color='#7f7f7f'
            )
    ),
    yaxis = dict(
        title = 'Count',
        titlefont=dict(
            family='Open Sans',
            size=16#,
            #color='#7f7f7f'
        )
    )
)

data = [seriesCrashesbyYearBar] # can be multiple series

fig = go.Figure(data=data, layout=layout)

### Number of accidents over time
The number of accidents per year has been relatively steady for the last decade

In [9]:
py.iplot(fig, filename='figureYear')

#### Accidents by Month

# TODO
* Control for days in month
* label months Jan, Feb, etc

In [10]:
accidents_by_month =  df_accident_mod.groupby([ 'month']).size().reset_index(name="cnt")
print(accidents_by_month.head(3))

monthSeries = go.Bar(
    x = accidents_by_month['cnt'],
    y = accidents_by_month['month'],
    
    #mode = 'scatter',
    name = 'Crashes per Month',
    marker = dict(
        color = '#ee2737'
    ),
    orientation = 'h'
)

layout = go.Layout(
    title = 'VicRoads Accident - Accidents per Month',
    titlefont=dict(
            family='Open Sans',
            size=22#,
            #color='#7f7f7f'
    ),
    xaxis=dict(
        tickangle = -45, # angle lables at 45 deg
        dtick = 1, # label every 1 bars
        title = 'Month',
            titlefont=dict(
                family='Open Sans',
                size=16#,
                #color='#7f7f7f'
            )
    ),
    yaxis = dict(
        title = 'Count',
        titlefont=dict(
            family='Open Sans',
            size=16#,
            #color='#7f7f7f'
        )
    )
)

data = [monthSeries] # can be multiple series

figureMonth = go.Figure(data=data, layout=layout)

#py.iplot(figureMonth, filename='figureMonth')

   month    cnt
0      1  10048
1      2  10993
2      3  11963


#### Accidents by Speed Zone

In [11]:
# 777, 888, and 999 exist in the SPEED_ZONE column
# remove >110


accidents_by_speed =  df_accident_mod.groupby([ 'SPEED_ZONE']).size().reset_index(name="cnt")
print(accidents_by_speed.head(3))

speedLimitZoneSeries = go.Bar(
    y = accidents_by_speed['cnt'],
    x = accidents_by_speed['SPEED_ZONE'],
    name = 'Num. Crashes',
    marker = dict(
        color = '#ee2737'
    )
)

layout = go.Layout(
    title = 'VicRoads Accident - Accidents by Speed Limit Zone',
    titlefont=dict(
            family='Open Sans',
            size=22#,
            #color='#7f7f7f'
    ),
    xaxis=dict(
        tickangle = -45, # angle lables at 45 deg
        dtick = 10, # label every 1 bars
        title = 'Speed Limit Zone',
            titlefont=dict(
                family='Open Sans',
                size=16#,
                #color='#7f7f7f'
            )
    ),
    yaxis = dict(
        title = 'Count',
        titlefont=dict(
            family='Open Sans',
            size=16#,
            #color='#7f7f7f'
        )
    )
)

data = [speedLimitZoneSeries] # can be multiple series

figurespeedLimitZone = go.Figure(data=data, layout=layout)

   SPEED_ZONE    cnt
0          30    170
1          40   5388
2          50  25395


### Most accidents happen at lower speeds

In [12]:
py.iplot(figurespeedLimitZone, filename='figurespeedLimitZone')

#### All accidents vs Fatal accidents

In [13]:
# group by severity and speed zone
accidents_by_speed_by_severity =  df_accident_mod.groupby(['SPEED_ZONE','SEVERITY']).size().reset_index(name="cnt")
print(accidents_by_speed_by_severity.head(3))

# only keep severe
accidents_by_speed_severity_1 = accidents_by_speed_by_severity[accidents_by_speed_by_severity['SEVERITY'] == 1]
print(accidents_by_speed_severity_1.head(3))

speedLimitZoneSeriesSevere = go.Bar(
    y = accidents_by_speed_severity_1['cnt'],
    x = accidents_by_speed_severity_1['SPEED_ZONE'],
    name = 'Severe Crashes by Speed Limit Zone',
    marker = dict(
        color = '#0077c8'
    )
)

layout = go.Layout(
    #barmode = 'stack',
    title = 'VicRoads Accident - Accidents by Speed Limit Zone',
    titlefont=dict(
            family='Open Sans',
            size=22#,
            #color='#7f7f7f'
    ),
    xaxis=dict(
        tickangle = -45, # angle lables at 45 deg
        dtick = 10, # label every 1 bars
        title = 'Speed Limit Zone',
            titlefont=dict(
                family='Open Sans',
                size=16#,
                #color='#7f7f7f'
            )
    ),
    yaxis = dict(
        title = 'Count',
        titlefont=dict(
            family='Open Sans',
            size=16#,
            #color='#7f7f7f'
        )
    )
)

data = [speedLimitZoneSeries, speedLimitZoneSeriesSevere] # can be multiple series

figurespeedLimitZoneSevere = go.Figure(data=data, layout=layout)

py.iplot(figurespeedLimitZoneSevere, filename='figurespeedLimitZoneSevere')

   SPEED_ZONE  SEVERITY  cnt
0          30         1    1
1          30         2   52
2          30         3  117
   SPEED_ZONE  SEVERITY  cnt
0          30         1    1
3          40         1   24
6          50         1  222


#### % Crashes Lethal

This snippet of code:
* Groups by both Speed Zone and Severity
* Applys a function to calculate a fields percentage of the first group by field (speed zone), so all speed zone 30 sum to 100, all speed zone 40 sum to 100, etc
* filter for just severe crashes (severity = 1), indicating someone died

In [14]:
# group by severity and speed zone
accidents_by_speed_by_severity =  df_accident_mod.groupby(['SPEED_ZONE','SEVERITY']).size()
# divide by the first level you grouped by, multiply by 100, to end up with each severitys % of speed zone crashes
accidents_by_speed_by_severity_pc = accidents_by_speed_by_severity.groupby(level=0).apply(lambda x:round(
                                                 100 * x / float(x.sum()),2))

# reset index and name the new column count
accidents_by_speed_by_severity_pc = accidents_by_speed_by_severity_pc.reset_index(name="count")

# filter for just severe
accidents_by_speed_severe_pc = accidents_by_speed_by_severity_pc[accidents_by_speed_by_severity_pc['SEVERITY'] == 1]

# drop now-redundant column
del accidents_by_speed_severe_pc['SEVERITY']

print(accidents_by_speed_severe_pc)


    SPEED_ZONE  count
0           30   0.59
3           40   0.45
6           50   0.87
9           60   1.01
13          70   1.74
16          75   3.57
19          80   2.11
22          90   3.77
25         100   5.27
28         110   6.23


In [15]:
speedLimitZoneSeriesSeverePc = go.Scatter(
    y = accidents_by_speed_severe_pc['count'],
    x = accidents_by_speed_severe_pc['SPEED_ZONE'],
    name = '% Crashes Lethal',
    marker = dict(
        color = '#0077c8'
    ),
    yaxis='y2'
)

layout = go.Layout(
    barmode = 'group',
    title = 'VicRoads Accident - Accidents by Speed Limit Zone',
    titlefont=dict(
            family='Open Sans',
            size=22
    ),
    xaxis=dict(
        tickangle = -45, # angle lables at 45 deg
        dtick = 10, # label every 10th number after 0
        title = 'Speed Limit Zone',
            titlefont=dict(
                family='Open Sans',
                size=16
            )
    ),
    yaxis = dict(
        title = 'Num. Crashes',
        titlefont=dict(
            family='Open Sans',
            size=16
        ),
        dtick=5000,
        range=[0,65000]
    ),
    yaxis2=dict(
        title='% Crashes Lethal',
        overlaying='y',
        side='right',
        titlefont=dict(
            family='Open Sans',
            size=16
        ),
        range=[0,6.5],
        dtick = 1
    )
)

data = [speedLimitZoneSeries, speedLimitZoneSeriesSeverePc] # can be multiple series

figurespeedLimitZoneSeverePc = go.Figure(data=data, layout=layout)

### While most accidents happen at slower speeds, accidents in higher-speed zones are more likely to be deadly

In [16]:
py.iplot(figurespeedLimitZoneSeverePc, filename='figurespeedLimitZoneSeverePc')

### Crashes by Hour

In [17]:
accidents_by_hour = df_accident_mod.groupby(['hour']).size().reset_index(name="cnt")
print(accidents_by_hour.head(3))

   hour   cnt
0     0  2067
1     1  1755
2     2  1415


In [18]:
seriesAccidentsByHourBar = go.Bar(
    y = accidents_by_hour['cnt'],
    x = accidents_by_hour['hour'],
    name = 'Num. Crashes',
    marker = dict(
        color = '#FF6900'
    )
)

layout = go.Layout(
    barmode = 'group',
    title = 'VicRoads Accident - Accidents by Hour',
    titlefont=dict(
            family='Open Sans',
            size=22
    ),
    xaxis=dict(
        #tickangle = -45, # angle lables at 45 deg
        dtick = 1,
        title = 'Speed Limit Zone',
            titlefont=dict(
                family='Open Sans',
                size=16
            )
    ),
    yaxis = dict(
        title = 'Num. Crashes',
        titlefont=dict(
            family='Open Sans',
            size=16
        )
    )
)

data = [seriesAccidentsByHourBar]

figureAccidentsByHour = go.Figure(data=data, layout=layout)

### Most accidents happen during daylight hours
(presumably this reflects more people being on the road)

In [19]:
py.iplot(figureAccidentsByHour, filename='figureAccidentsByHour')

### Crashes by Hour by Severity

In [20]:
# group by hour by severity
accidents_by_hour_by_severity =  df_accident_mod.groupby(['hour','SEVERITY']).size()

# divide by the first level you grouped by, multiply by 100, to end up with each severitys % of speed zone crashes
accidents_by_hour_by_severity_pc = accidents_by_hour_by_severity.groupby(level=0).apply(lambda x:round(
                                                 100 * x / float(x.sum()),2))
# reset index and name the new column count
accidents_by_hour_by_severity_pc = accidents_by_hour_by_severity_pc.reset_index(name="count")

# filter for just severe
accidents_by_hour_by_severity_1 = accidents_by_hour_by_severity_pc[accidents_by_hour_by_severity_pc['SEVERITY'] == 1]

# drop now-redundant column
del accidents_by_hour_by_severity_1['SEVERITY']

print(accidents_by_hour_by_severity_1.head(3))

   hour  count
0     0   3.97
3     1   4.33
6     2   4.59


In [21]:
seriesAccidentsByHourSevereLine = go.Scatter(
    y = accidents_by_hour_by_severity_1['count'],
    x = accidents_by_hour_by_severity_1['hour'],
    name = 'Num. Crashes',
    marker = dict(
        color = '#0077C8'
    ),
    yaxis = 'y2'
)

layout = go.Layout(
    barmode = 'group',
    title = 'VicRoads Accident - Accidents by Hour - Lethal vs All',
    titlefont=dict(
            family='Open Sans',
            size=22
    ),
    xaxis=dict(
        title = 'Speed Limit Zone',
        titlefont=dict(
            family='Open Sans',
            size=16
        )
    ),
    yaxis = dict(
        title = 'Num. Crashes',
        titlefont=dict(
            family='Open Sans',
            size=16
        ),
        range=[0,12000]
    ),
    yaxis2=dict(
        title='% Crashes Lethal',
        overlaying='y',
        side='right',
        titlefont=dict(
            family='Open Sans',
            size=16
        ),
        range=[0,6],
        dtick = 1
    )
)

data = [seriesAccidentsByHourBar, seriesAccidentsByHourSevereLine]

figureAccidentsByHourWithSeverePc = go.Figure(data=data, layout=layout)

### While more accidents do happen during the day, those at night are more likley to be lethal

In [22]:
py.iplot(figureAccidentsByHourWithSeverePc, filename='figureAccidentsByHourWithSeverePc')

### Crashes by Population Estimates

Crashes look pretty stable over this period, but we know Victoria has grown a lot.

Have more or less crashes happened per person?

To estimate, I:
* Import the ABS ERP dataset for Victoria, code 3101.0
    * Available at: http://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/3101.0Jun%202016?OpenDocument
* Tidy that up, and average by year (as it is quarterly)
* Join to crash dataset, and divide crashes by population to estimate crashes per population

#### Get ERP Data

In [23]:
# read from ABS Excel series
df_erp = pd.read_excel(
    'abs_data/erp_2016.xlsx', # file name and path
    sheetname = 'Data1',
    header = 0,
    skiprows = 9,
    index_col=0 # first column should be primary key
)

# just keep the Victoria Persons series, which has the id A2060844K
df_erp = df_erp['A2060844K']

# fix column headers
df_erp = df_erp.reset_index()

# rename columns
df_erp.columns = ['date','erp']

# add a year column
df_erp['year'] = pd.DatetimeIndex(df_erp['date']).year

print(df_erp.head(3))

# group by year
erp_by_year = df_erp.groupby(['year']).mean().round()

# stop pandas displaying output in sceintific notation
# pinched from https://stackoverflow.com/questions/21137150/format-suppress-scientific-notation-from-python-pandas-aggregation-results/21140339

#pd.set_option('display.float_format', lambda x: '%.0f' % x) # .0f is to 0 decimal places

# fix column headers
erp_by_year = erp_by_year.reset_index()

print(erp_by_year.head(3))

        date      erp  year
0 1981-06-01  3946917  1981
1 1981-09-01  3957333  1981
2 1981-12-01  3968398  1981
   year        erp
0  1981  3957549.0
1  1982  3997278.0
2  1983  4040160.0


#### Merge ERP data with Crash data

In [24]:
# dataframe accidents_by_year defined earlier in the file
print(accidents_by_year.head(3))
print(erp_by_year.head(3))

# take all rows from accidents by year, left join with erp_by_year on year=year
df_accidents_by_year_with_erp = pd.merge(accidents_by_year, erp_by_year, on='year', how='left')

# did we get what we expect?
print(df_accidents_by_year_with_erp.head(3))

# make column for crashes per thousand people
df_accidents_by_year_with_erp['crashes_per_100k'] = round(
    100000 * (
        df_accidents_by_year_with_erp['cnt']/df_accidents_by_year_with_erp['erp']
    ),0
)

   year    cnt
0  2007  12980
1  2008  13494
2  2009  13097
   year        erp
0  1981  3957549.0
1  1982  3997278.0
2  1983  4040160.0
   year    cnt        erp
0  2007  12980  5166404.0
1  2008  13494  5272302.0
2  2009  13097  5384432.0


In [25]:
# seriesCrashesbyYearBar defined above

seriesCrashesbyYearBy100KPopulationScatter = go.Scatter(
    y = df_accidents_by_year_with_erp['crashes_per_100k'],
    x = df_accidents_by_year_with_erp['year'],
    name = 'Crashes Per 100k Population',
    marker = dict(
        color = '#0077C8',
        size=10
    ),
    yaxis = 'y2',
    line = dict(
        color = '#0077C8',
        width=4
    )
)

layout = go.Layout(
    title = 'VicRoads Accident - Crashes by Year',
    titlefont=dict(
            family='Open Sans',
            size=22
    ),
    xaxis=dict(
        title = 'Year',
        titlefont=dict(
            family='Open Sans',
            size=16
        )
    ),
    yaxis = dict(
        title = 'Num. Crashes',
        titlefont=dict(
            family='Open Sans',
            size=16
        ),
        range=[0,15000],
        dtick = 2500
    ),
    yaxis2=dict(
        title='Crashes per 100k Popn',
        overlaying='y',
        side='right',
        titlefont=dict(
            family='Open Sans',
            size=16
        ),
        range=[0,300]
    ),
    legend=dict(
        orientation="h",
        x=.24,
        y=1.1
    )
)

data = [seriesCrashesbyYearBar, seriesCrashesbyYearBy100KPopulationScatter]

figureCrashesbyYearWithPer100k = go.Figure(data=data, layout=layout)

## Accidents per population are steadily declining
While the number of Accidents has remained relatively steady, the number of acidents per 100,000 population has fallen from a peak of 256 per 100,000 in 2008 to 208 in 2016, a fall of 19%

In [26]:
py.iplot(figureCrashesbyYearWithPer100k, filename='figureCrashesbyYearWithPer100k')

# Accident Locations

### Data prep

In [27]:
df_node = pd.read_csv(
    'vicroads_data/ACCIDENT/NODE.csv', # file name and path
    sep=','#, # separated by commas
    #index_col=0, # first column should be primary key
    #low_memory = False
)

# filter out all values in the LGA_NAME column that don't represent LGAs
# read valid lga list
valid_lgas = pd.read_csv(
    'vic_lgas.csv',
    names=['LGA']                        
)

# make upper case to match vicroads data
valid_lgas['LGA'] = valid_lgas['LGA'].str.upper()

# convert to a list
valid_lgas = valid_lgas['LGA'].values.tolist()

# normalise LGA names
df_node['LGA_NAME'].replace('DANDENONG', 'GREATER DANDENONG',inplace=True)
df_node['LGA_NAME'].replace('GEELONG', 'GREATER GEELONG',inplace=True)
df_node['LGA_NAME'].replace('COLAC OTWAY', 'COLAC-OTWAY',inplace=True)
df_node['LGA_NAME'].replace('SHEPPARTON', 'GREATER SHEPPARTON',inplace=True)
df_node['LGA_NAME'].replace('BENDIGO', 'GREATER BENDIGO',inplace=True)

# filter where name not in list
df_node = df_node[df_node['LGA_NAME'].isin(valid_lgas)]

# merge node columns onto limited dataset
df_tmp = df_accident_mod.reset_index()
df_accident_node = pd.merge(df_tmp, df_node, on='ACCIDENT_NO', how='left')

print(df_accident_node.head(3))

    ACCIDENT_NO ACCIDENTDATE ACCIDENTTIME  ACCIDENT_TYPE  \
0  T20070000004    1/01/2007     02.55.00              4   
1  T20070000008    1/01/2007     03.59.00              1   
2  T20070000009    1/01/2007     03.30.00              4   

              Accident Type Desc  DAY_OF_WEEK Day Week Description  DCA_CODE  \
0  Collision with a fixed object            2               Monday       171   
1         Collision with vehicle            2               Monday       140   
2  Collision with a fixed object            2               Monday       181   

                                     DCA Description DIRECTORY     ...       \
0  LEFT OFF CARRIAGEWAY INTO OBJECT/PARKED VEHICL...       VCS     ...        
1                  U TURN                                  MEL     ...        
2  OFF RIGHT BEND INTO OBJECT/PARKED VEHICLE     ...       MEL     ...        

   NODE_TYPE        AMG_X        AMG_Y  LGA_NAME  Lga Name All  \
0          N  2477943.291  2596972.868  CAMPASPE      C

## LGA

In [28]:
# group by hour by severity
accidents_by_lga =  df_accident_node.groupby(['LGA_NAME']).size()

# reset index and name the new column count
accidents_by_lga = accidents_by_lga.reset_index(name="count")

### The most accidents occur in the following LGAs:

In [29]:
accidents_by_lga.nlargest(5,'count')

Unnamed: 0,LGA_NAME,count
43,MELBOURNE,8430
13,CASEY,5348
26,GREATER GEELONG,4673
25,GREATER DANDENONG,4577
9,BRIMBANK,4257


### And the least occur in:

In [30]:
accidents_by_lga.nsmallest(5,'count')

Unnamed: 0,LGA_NAME,count
60,QUEENSCLIFFE,24
78,YARRIAMBIACK,103
29,HINDMARSH,120
71,WEST WIMMERA,121
10,BULOKE,155


### This is fairly unsurprising given that the Borough of Queenscliffe had only 2,853 residents as at the 2016 census!

### Number of accidents versus population by LGA

It should be more informative to compare accident rates to population.

For the sake of simplicity, this compares accidents from 2007 to 2016 against 2016 census population counts by LGA.

This doesn't account for the boundary changes that may have happened to LGAs in that time.

In [31]:
df_lga_population = pd.read_csv(
    'abs_data/popn_by_lga_vic_census_2016.csv',
    names = ['LGA_NAME','popn'],
    skiprows=1 # first row is names, which we're setting manually
)

# make upper case to match vicroads data
df_lga_population['LGA_NAME'] = df_lga_population['LGA_NAME'].str.upper()

# merge into new data frame
accidents_by_lga_with_popn = pd.merge(accidents_by_lga, df_lga_population, on='LGA_NAME', how='left')

# rate per popn
accidents_by_lga_with_popn['crashes_per_100k_per_year'] = round(
    100000 * (
        accidents_by_lga_with_popn['count']/accidents_by_lga_with_popn['popn']/9
    ),0
)

#print(accidents_by_lga_with_popn)

In [37]:
seriesLGAPopnVsCrashesPer100K = go.Scatter(
    y = accidents_by_lga_with_popn['crashes_per_100k_per_year'],
    x = accidents_by_lga_with_popn['popn'],
    name = 'Crashes Per 100k Population vs. Population',
    mode = 'markers',
    text= accidents_by_lga_with_popn['LGA_NAME']
)

layout = go.Layout(
    title = 'VicRoads Accident - Accidents per 100,000 population vs. Population',
    titlefont=dict(
            family='Open Sans',
            size=22
    ),
    xaxis=dict(
        title = '2016 Population',
        titlefont=dict(
            family='Open Sans',
            size=16
        )
    ),
    yaxis = dict(
        title = 'Accidents per 100,000 people',
        titlefont=dict(
            family='Open Sans',
            size=16
        )
    )
    legend=dict(
        orientation="h",
        x=.24,
        y=1.1
    ),
    annotations=[
        dict(
            x=135959,
            y=689,
            xref='x',
            yref='y',
            text='Melbourne LGA',
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1.5,
            ax=0,
            ay=-40
        ),
        dict(
            x=86675,
            y=397,
            xref='x',
            yref='y',
            text='Yarra LGA',
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1.5,
            ax=0,
            ay=-40
        )
    ]
)

data = [seriesLGAPopnVsCrashesPer100K]

figureLGAPopnVsCrashesPer100K = go.Figure(data=data, layout=layout)

SyntaxError: invalid syntax (<ipython-input-37-bd15a6f3585b>, line 29)

### Smaller LGAs have more accidents per person

* This would align with the hypothesis that country roads are more dangerous
* Outliers
    * Melbourne LGA is a clear outlier
    * Yarra is more marginal
    * Is it perhaps the case minor accidents are more likely to be reported in the city?

In [33]:
py.iplot(figureLGAPopnVsCrashesPer100K, filename='figureLGAPopnVsCrashesPer100K')

### What's this look like on a log scale?

# TODO
* colour above chart by LGA
* make map of rates per LGA
* do these for lethality, is the country worse for death?
* minor accidents by lga - are melb, yarra near the top?

# Vehicle statistics

### Data prep

In [34]:
df_vehicle = pd.read_csv(
    'vicroads_data/ACCIDENT/VEHICLE.csv', # file name and path
    sep=',', # separated by commas
    #index_col=0, # first column should be primary key
    low_memory = False
)

print(df_vehicle.head(3))

    ACCIDENT_NO VEHICLE_ID  VEHICLE_YEAR_MANUF VEHICLE_DCA_CODE  \
0  T20060000010          A              1996.0                2   
1  T20060000010          B              2003.0                1   
2  T20060000010          C              2001.0                8   

  INITIAL_DIRECTION  ROAD_SURFACE_TYPE Road Surface Type Desc REG_STATE  \
0                SW                  1                  Paved         V   
1                NW                  1                  Paved         V   
2                NW                  1                  Paved         V   

  VEHICLE_BODY_STYLE VEHICLE_MAKE          ...          VEHICLE_COLOUR_1  \
0             SEDAN        MITSUB          ...                       MRN   
1             COUPE        UNKN            ...                       BLU   
2             SEDAN        FORD            ...                       YLW   

   VEHICLE_COLOUR_2  CAUGHT_FIRE INITIAL_IMPACT  LAMPS LEVEL_OF_DAMAGE  \
0               ZZ             2              F    

In [35]:
df_accident.head(2)

Unnamed: 0_level_0,ACCIDENTDATE,ACCIDENTTIME,ACCIDENT_TYPE,Accident Type Desc,DAY_OF_WEEK,Day Week Description,DCA_CODE,DCA Description,DIRECTORY,EDITION,...,NO_PERSONS_NOT_INJ,POLICE_ATTEND,ROAD_GEOMETRY,Road Geometry Desc,SEVERITY,SPEED_ZONE,date,year,month,hour
ACCIDENT_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T20060000010,13/01/2006,12.42.00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,40.0,...,5,1,1,Cross intersection,3,60,2006-01-13 12:42:00,2006,1,12
T20060000018,13/01/2006,19.10.00,1,Collision with vehicle,6,Friday,113,RIGHT NEAR (INTERSECTIONS ONLY),MEL,40.0,...,3,1,2,T intersection,3,70,2006-01-13 19:10:00,2006,1,19
