In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from dbfread import DBF
import pandas as pd
import numpy as np
from pandas import DataFrame
import shelve
import seaborn as sns
# Use the dark seaborn style and a larger default figure size for all plots.
plt.style.use('seaborn-dark')
plt.rcParams['figure.figsize'] = (10, 6)
# Allow up to 100 columns when a wide dataframe is displayed.
pd.set_option('display.max_columns', 100)
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas' SettingWithCopyWarning -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [1]:
load_ext watermark

In [4]:
watermark -v -m -p numpy,pandas,seaborn,matplotlib,sklearn,statsmodels,dbfread

CPython 3.6.1
IPython 5.3.0

numpy 1.11.2
pandas 0.20.1
seaborn 0.7.1
matplotlib 2.0.2
sklearn 0.18.1
statsmodels 0.8.0
dbfread 2.0.7

compiler   : GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)
system     : Darwin
release    : 17.3.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit


## Load 10 years of related data, from 2007 to 2016

We also want to study the mortality of fatal accidents. The data element “Fatalities in Vehicle” in the Vehicle data file from the `U.S. Department of Transportation` website gives the number of deaths in each vehicle. We load the accident and vehicle data files for 2007 to 2016 (the person data is currently not loaded) and add a `YEAR` column to each vehicle dataframe. The columns 'latitude' and 'longitud' in the 2007 accident file are named differently from the other years, so we rename them to `LATITUDE` and `LONGITUD` to match.

In [2]:
# Per-year frames in chronological order: position 0 is 2007, position -1 is 2016.
vehicle = []
accident = []
for year in range(2007, 2017):
    # The 2015 vehicle file is read from CSV; every other year is read from DBF.
    if year == 2015:
        vehicle_year = pd.read_csv('data/vehicle/vehicle2015.csv', encoding="ISO-8859-1")
    else:
        vehicle_year = DataFrame(iter(DBF('data/vehicle/vehicle{}.dbf'.format(year))))
    # Tag every vehicle row with the year of the file it came from.
    vehicle_year['YEAR'] = year
    vehicle.append(vehicle_year)

    accident_year = DataFrame(iter(DBF('data/accident/accident{}.dbf'.format(year))))
    if year == 2007:
        # The 2007 file spells the coordinate columns in lower case; align them
        # with the other years so the inner join below keeps them.
        accident_year.rename(columns={'latitude': 'LATITUDE', 'longitud': 'LONGITUD'}, inplace=True)
    accident.append(accident_year)

# Person files are not loaded here (that step is disabled).
# join='inner' keeps only the columns that every yearly frame has in common.
all_accidents = pd.concat(accident, axis=0, join='inner')
all_vehicle = pd.concat(vehicle, axis=0, join='inner')

First, we want to combine the yearly accident tables (2007–2016) into one dataframe. Since not all of the accident data downloaded from the U.S. Department of Transportation have the same features, we use the `join='inner'` option of the `pd.concat` function to keep only the intersection of features.

The `all_accidents` table records 320874 accidents from 2007–2016, and it has 42 features. Here are the meanings of some of the features according to the `FARS Analytical User’s Manual`.

### Explanation of variables
*VE_TOTAL*: Number of Vehicle in crash <br/>
*VE_FORMS*: Number of Motor Vehicles in Transport (MVIT) <br/>
*PED*: Number of Persons Not in Motor Vehicles <br/>
*NHS*: National Highway System<br/>
*ROUTE*: Route Signing <br/>
*SP_JUR*: Special Jurisdiction <br/>
*HARM_EV*: First Harmful Event<br/>
*TWAY_ID , TWAY_ID2* : Trafficway Identifier <br/>
*MILEPT*: Milepoint <br/>
*SP_JUR*: Special Jurisdiction<br/>
*HARM_EV*: injury or damage producing First Harmful Event <br/> 
*MAN_COLL*:Manner of Collision  <br/> 
*RELJCT1, RELJCT2*: Relation to Junction- Within Interchange Area, Specific Location. <br/>
*TYP_INT*: Type of Intersection <br/>
*REL_ROAD*: Relation to Trafficway <br/>
*LGT_COND*: Light Condition<br/> 
*NOT_HOUR,MIN*: Min, Hour of Notification <br/>
*ARR_HOUR,MIN*: Hour, Min arrival at scene <br/>
*HOSP_HR,MIN*: Hour, Min arrival at hospital <br/>
*CF1, CF2, CF3*:Related Factors- Crash Level, factors related to the crash <br/>
*FATALS*: Fatalities<br/>
*DRUNK_DR*: Number of Drinking Drivers<br/> 
*RAIL*: Rail Grade Crossing Identifier<br/>

For more detailed information, please refer to `FARS Analytical User’s Manual`.

## Select variables and rename variables
As the table above shows, some of the variable names are not very readable. To make them easier to understand, we rename some of the variables according to the `FARS Analytical User’s Manual` downloaded from the `U.S. Department of Transportation` website. To make all column values informative, we select the important columns from `all_accidents` and replace their numeric codes with meaningful text descriptions according to the `FARS Analytical User’s Manual`.

To make these dataframes more informative, we recode the values of some columns using the helper functions defined below.

In [3]:
import calendar
def change_body_type_value(x):
    """Collapse a numeric BODY_TYP code into a coarse vehicle category.

    The ranges are half-open so that no code falls between two categories:

        x < 20        -> 'Sedan'
        20 <= x < 30  -> 'Van'
        30 <= x < 70  -> 'Truck'
        x >= 70       -> 'Motor'

    The previous strict comparisons (``20<x<30``, ``30<x<70``, ``x>70``)
    returned None for the boundary codes 20, 30 and 70, silently dropping
    those vehicles from any grouping on this column.
    NOTE(review): 20 and 30 appear to be real FARS body types (minivan /
    compact pickup) -- confirm against the FARS Analytical User's Manual.

    Returns None when x is NaN, because every comparison with NaN is False.
    """
    if x < 20:
        return 'Sedan'
    if x < 30:
        return 'Van'
    if x < 70:
        return 'Truck'
    if x >= 70:
        return 'Motor'
    # Only reached for NaN (or anything else that compares False everywhere).
    return None
def change_month(x):
    """Translate a month number (1-12) into its English month name.

    Values that do not equal any of 1..12 (including NaN) give None.
    """
    matches = (calendar.month_name[month] for month in range(1, 13) if x == month)
    return next(matches, None)
def change_light_condition(x):
    """Translate a numeric LGT_COND code into a short text label.

    Codes 2, 3 and 6 all collapse to 'Dark'; 8 and 9 to 'Not report'.
    Any value not listed below (including NaN) is labelled 'Other'.
    """
    labelled_codes = (
        ((1,), 'Daylight'),
        ((2, 3, 6), 'Dark'),
        ((4,), 'Dawn'),
        ((5,), 'Dusk'),
        ((8, 9), 'Not report'),
    )
    for codes, label in labelled_codes:
        if x in codes:
            return label
    return 'Other'

In [4]:
# Keep only the columns used in the analysis. The explicit .copy() makes
# `accidents` an independent frame, so the recoding below never writes into
# (or warns about) a slice of all_accidents, and re-running this cell always
# starts again from the raw data.
accidents = all_accidents[['YEAR','ST_CASE','STATE','VE_TOTAL','FATALS','MONTH','DAY_WEEK','HOUR','NHS','LATITUDE','LONGITUD','MAN_COLL','LGT_COND','WEATHER','CF1','DRUNK_DR']].copy()

# Give the coded FARS columns readable names.
accidents = accidents.rename(columns={'ST_CASE':'CASE_NUM','VE_TOTAL':'NUM_VEHICLE','NHS': 'HIGHWAY', 'MAN_COLL': 'COLLISION_TYPE','LGT_COND':'LIGHT_CONDITION','CF1':'CRASH_FACTOR','DRUNK_DR':'DRUNK_DRIVE'})

# Replace numeric codes with text labels. Series.map turns any code that is
# missing from a dictionary into NaN.
# NOTE(review): confirm in the FARS manual that these code tables are valid
# for every year from 2007 to 2016.
accidents['DAY_WEEK'] = accidents['DAY_WEEK'].map({1.0:'Sunday',2.0:'Monday', 3.0:'Tuesday', 4.0: 'Wednesday', 5.0:'Thursday', 6.0:'Friday', 7.0:'Saturday'})
accidents['HIGHWAY'] = accidents['HIGHWAY'].map({1.0:'On',0.0:'Off',9.0:'Unknow'})
accidents['COLLISION_TYPE'] = accidents['COLLISION_TYPE'].map({0.0:'Not Collision',1.0:'Rear-End',2.0:'Head-On',3.0:'Rear-to-Rear',4.0:'Angle',5.0:'Sideswipe, Same Direction',6.0:'Sideswipe, Opposite Direction',7.0:'Sideswipe, Unknown Direction',9.0:'Unknown'})
accidents['WEATHER'] = accidents['WEATHER'].map({0.0:'Normal',1.0:'Clear',2.0:'Rain', 3.0: 'Sleet,Hail', 4.0:'Snow', 5.0:'Fog, Smog, Smoke',6.0:'Severe Crosswinds',7.0:'Blowing Sand, Soil, Dirt',8.0:'other',10.0:'Cloudy',11.0:'Blowing Snow',12.0:'Freezing Rain or Drizzle',98.0:'Not Reported', 99.0:'Unkown' })
accidents['MONTH'] = accidents['MONTH'].apply(change_month)
accidents['LIGHT_CONDITION'] = accidents['LIGHT_CONDITION'].apply(change_light_condition)

In [5]:
# Select the vehicle-level columns of interest. The explicit .copy() makes
# `vehicles` an independent frame, so the edits below never touch (or warn
# about) a slice of all_vehicle.
vehicles = (
    all_vehicle[['STATE','YEAR','ST_CASE','TRAV_SP','ROLLOVER','FIRE_EXP','BODY_TYP','DEATHS','DR_DRINK']]
    .copy()
    # SPEEDREL is not among the selected columns, so it needs no rename entry.
    .rename(columns={'ST_CASE':'CASE_NUM','TRAV_SP':'SPEED','FIRE_EXP': 'FIRE','DR_DRINK':'DRINKING_INDICATOR'})
)
# Collapse the numeric body type code into a coarse text category.
vehicles['BODY_TYP'] = vehicles['BODY_TYP'].apply(change_body_type_value)

Combine `YEAR` and `CASE_NUM` into a single string index for the accidents and vehicles dataframes.

In [6]:
def make_index(df):
    """Cast the key columns to int and index `df` by YEAR + CASE_NUM.

    Works in place and returns None. STATE, CASE_NUM and YEAR are converted
    to integers, then the index becomes the year and case number written
    next to each other as one string (for example '200710001').
    """
    for column in ('STATE', 'CASE_NUM', 'YEAR'):
        df[column] = df[column].astype(int)
    year_text = df['YEAR'].astype(str)
    case_text = df['CASE_NUM'].astype(str)
    df.index = (year_text + case_text).tolist()

In [7]:
# Re-index both cleaned frames in place with the YEAR + CASE_NUM key.
df = [accidents, vehicles]
for frame in df:
    make_index(frame)

Take a look at the important variables.

In [8]:
accidents.head()

Unnamed: 0,YEAR,CASE_NUM,STATE,NUM_VEHICLE,FATALS,MONTH,DAY_WEEK,HOUR,HIGHWAY,LATITUDE,LONGITUD,COLLISION_TYPE,LIGHT_CONDITION,WEATHER,CRASH_FACTOR,DRUNK_DRIVE
200710001,2007,10001,1,1.0,1.0,January,Tuesday,23.0,Off,33.455839,-87.017928,Not Collision,Dark,Clear,0.0,0.0
200710002,2007,10002,1,3.0,2.0,January,Tuesday,13.0,Off,34.160597,-85.678075,"Sideswipe, Same Direction",Daylight,Clear,0.0,0.0
200710003,2007,10003,1,2.0,1.0,January,Sunday,12.0,Off,33.975717,-86.496347,"Sideswipe, Unknown Direction",Daylight,Clear,0.0,0.0
200710004,2007,10004,1,2.0,1.0,January,Monday,15.0,On,33.853258,-85.915622,Angle,Daylight,Clear,0.0,0.0
200710005,2007,10005,1,2.0,1.0,January,Saturday,19.0,Off,32.488833,-86.408936,"Sideswipe, Same Direction",Dark,Clear,0.0,0.0


In [9]:
accidents.shape

(320874, 16)

In [10]:
vehicles.head()

Unnamed: 0,STATE,YEAR,CASE_NUM,SPEED,ROLLOVER,FIRE,BODY_TYP,DEATHS,DRINKING_INDICATOR
200710001,1,2007,10001,65.0,0.0,0.0,Sedan,1.0,0.0
200710002,1,2007,10002,10.0,0.0,0.0,Sedan,0.0,0.0
200710002,1,2007,10002,55.0,0.0,0.0,Sedan,2.0,0.0
200710002,1,2007,10002,55.0,0.0,0.0,Truck,0.0,0.0
200710003,1,2007,10003,55.0,0.0,0.0,Sedan,0.0,0.0


In [11]:
vehicles.shape

(479153, 9)

In [12]:
# Store the cleaned dataframes on disk. `key` is passed by keyword because
# newer pandas releases deprecate passing it positionally.
accidents.to_hdf('results/accidents.h5', key='accidents')
vehicles.to_hdf('results/vehicles.h5', key='vehicles')
# The person dataframe is not stored here (its export is disabled).

# The `accident` list is ordered 2007 -> 2016, so [-1] is the 2016 table and
# [-2] is the 2015 table; both are stored with all of their original columns.
accidents16 = accident[-1]
accidents15 = accident[-2]
accidents16.to_hdf('results/accidents16.h5', key='accidents16')
accidents15.to_hdf('results/accidents15.h5', key='accidents15')