## COVID Drivers: Describe Data

### Table of Contents
* [Read the Data](#read)
* [Create Table of Columns and Data Types](#dtypes)
* [Select CRASH Columns to Use](#select-crash)
* [Select FLAGS Columns to Use](#select-flags)
* [Display Decoding Dictionary](#enum-df)
* [Create Data Dictionary](#data-dict)
* [Merge CRASH and FLAGS](#merge)
* [Evaluate Related Flags and Enumerated Fields](#eval)
* [Write Final Compiled Dataset to File](#write)

Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import ruptures as rpt
import altair as alt

import statsmodels.api as sm

In [2]:
# Import project specific utilities
from utils.functions import *

### <a id='read'>Read the data</a>

In [3]:
# Raw table names read from every statewide folder, in the order they are read.
TABLE_NAMES = ['COMMVEH', 'CRASH', 'CYCLE', 'FLAGS', 'PERSON', 'ROADWAY', 'TRAILVEH', 'VEHICLE']

# Collect the per-folder frames first and concatenate once per table; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
yearly_frames = {name: [] for name in TABLE_NAMES}

# NOTE(review): `path_raw` and `read_data` are not defined in this notebook;
# presumably they come from the star import of utils.functions -- confirm.
# sorted(): os.listdir() returns entries in arbitrary order, so sort to make the
# row order of the combined tables reproducible.
for folder in sorted(os.listdir(path_raw)):
    # Only the statewide extracts are read (all years, 2020 included).
    if 'statewide' not in folder.lower():
        continue
    year = folder[-4:]  # last four characters of the folder name are used as the year
    for name in TABLE_NAMES:
        yearly_frames[name].append(read_data(folder, year, name))


def _combine(frames):
    """Concatenate the non-empty frames of one table, keeping their original indexes.

    If every frame is empty, a copy of the last frame read is returned (this
    mirrors what the pairwise approach produced); with no frames at all an
    empty DataFrame is returned.
    """
    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        return pd.concat(non_empty)
    return frames[-1].copy() if frames else pd.DataFrame()


comm = _combine(yearly_frames['COMMVEH'])
crash = _combine(yearly_frames['CRASH'])
cycle = _combine(yearly_frames['CYCLE'])
flags = _combine(yearly_frames['FLAGS'])
person = _combine(yearly_frames['PERSON'])
road = _combine(yearly_frames['ROADWAY'])
trail = _combine(yearly_frames['TRAILVEH'])
veh = _combine(yearly_frames['VEHICLE'])
            

In [4]:
# Number of rows in the combined CRASH table.
len(crash)

2461193

In [5]:
# Fraction of CRASH rows with a missing ARRIVAL_TM.
crash['ARRIVAL_TM'].isna().mean()

np.float64(0.6959482657394198)

In [6]:
# Fraction of CRASH rows with a missing LATITUDE.
crash['LATITUDE'].isna().mean()

np.float64(0.024513721597615466)

In [7]:
# Fraction of CRASH rows with a missing LONGITUDE.
crash['LONGITUDE'].isna().mean()

np.float64(0.024514534211660766)

In [8]:
# Fraction of CRASH rows with a missing DISTRICT.
crash['DISTRICT'].isna().mean()

np.float64(0.0)

In [9]:
# Fraction of CRASH rows with a missing MUNICIPALITY.
crash['MUNICIPALITY'].isna().mean()

np.float64(0.0)

In [10]:
# True when no CRN value appears more than once in CRASH.
crash['CRN'].is_unique

True

In [11]:
# All CRN values of CRASH as a plain list, used below for membership checks.
crash_crn = crash.loc[:, 'CRN'].to_list()

In [12]:
# Number of COMMVEH rows whose CRN is absent from CRASH.
int((~comm['CRN'].isin(crash_crn)).sum())

0

In [13]:
# Number of CYCLE rows whose CRN is absent from CRASH.
int((~cycle['CRN'].isin(crash_crn)).sum())

0

In [14]:
# Number of FLAGS rows whose CRN is absent from CRASH.
int((~flags['CRN'].isin(crash_crn)).sum())

0

In [15]:
# Number of PERSON rows whose CRN is absent from CRASH.
int((~person['CRN'].isin(crash_crn)).sum())

0

In [16]:
# Distinct (CRN, UNIT_NUM) combinations present in PERSON.
person[['CRN', 'UNIT_NUM']].drop_duplicates()

Unnamed: 0,CRN,UNIT_NUM
0,2005066315,1
1,2005109861,1
2,2005185756,1
3,2005185756,2
4,2005015297,1
...,...,...
244168,2024115154,1
244169,2024090876,1
244170,2024090876,2
244171,2024090876,3


In [17]:
# Column names of the PERSON table.
list(person.columns)

['CRN',
 'AGE',
 'AIRBAG1',
 'AIRBAG2',
 'AIRBAG3',
 'AIRBAG4',
 'AIRBAG_PADS',
 'DVR_LIC_STATE',
 'DVR_PED_CONDITION',
 'EJECTION_IND',
 'EJECT_PATH_CD',
 'EXTRIC_IND',
 'INJ_SEVERITY',
 'NON_MOTORIST',
 'PERSON_NUM',
 'PERSON_TYPE',
 'RESTRAINT_HELMET',
 'SEAT_POSITION',
 'SEX',
 'TRANSPORTED',
 'TRANSPORTED_BY',
 'UNIT_NUM',
 'VULNERABLE_ROAD_USER']

In [18]:
# Number of ROADWAY rows whose CRN is absent from CRASH.
int((~road['CRN'].isin(crash_crn)).sum())

0

In [19]:
# Number of TRAILVEH rows whose CRN is absent from CRASH.
int((~trail['CRN'].isin(crash_crn)).sum())

0

In [20]:
# Number of VEHICLE rows whose CRN is absent from CRASH.
int((~veh['CRN'].isin(crash_crn)).sum())

0

In [21]:
# Distinct CRN values of each child table, as plain lists.
flags_crn, per_crn, road_crn, veh_crn = (
    table['CRN'].unique().tolist() for table in (flags, person, road, veh)
)

In [22]:
# Number of CRASH rows with no matching CRN in ROADWAY.
int((~crash['CRN'].isin(road_crn)).sum())

22

In [23]:
# Number of CRASH rows with no matching CRN in VEHICLE.
int((~crash['CRN'].isin(veh_crn)).sum())

1

In [24]:
# Number of CRASH rows with no matching CRN in FLAGS.
int((~crash['CRN'].isin(flags_crn)).sum())

0

In [25]:
# Number of CRASH rows with no matching CRN in PERSON.
int((~crash['CRN'].isin(per_crn)).sum())

2378

In [26]:
# Column names of the ROADWAY table.
list(road.columns)

['CRN',
 'ACCESS_CTRL',
 'COUNTY',
 'LANE_COUNT',
 'OFFSET',
 'RAMP',
 'RDWY_ORIENT',
 'RDWY_SEQ_NUM',
 'ROAD_OWNER',
 'ROUTE',
 'SEGMENT',
 'SPEED_LIMIT',
 'STREET_NAME']

In [27]:
# 1-based running position of each ROADWAY row within its CRN group.
road['rank'] = road.groupby('CRN').cumcount().add(1)

In [28]:
# How many ROADWAY rows occupy each within-CRN position.
road.loc[:, 'rank'].value_counts()

rank
1     2461171
2      937326
3      299959
4       91204
5       22308
6        7145
7        1901
8         614
9         152
10         45
Name: count, dtype: int64

### <a id='dtypes'>Create Table of Columns and Data Types</a>

In [29]:
# Column-name list of every raw table (crash_cols and flags_cols are reassigned
# later in the notebook with hand-picked subsets).
(comm_cols, crash_cols, cycle_cols, flags_cols,
 person_cols, road_cols, trail_cols, veh_cols) = (
    list(table.columns)
    for table in (comm, crash, cycle, flags, person, road, trail, veh)
)

In [30]:
# Raw tables keyed by their source name; the two parallel lists used by the
# inventory loop are derived from this mapping so they cannot drift apart.
_tables_by_name = {
    'COMMVEH': comm,
    'CRASH': crash,
    'CYCLE': cycle,
    'FLAGS': flags,
    'PERSON': person,
    'ROADWAY': road,
    'TRAILVEH': trail,
    'VEHICLE': veh,
}
dsnames = list(_tables_by_name)
datasets = list(_tables_by_name.values())

In [31]:
# Build three parallel lists (dataset name, column name, dtype name) covering
# every column of every table, and remember which columns have dtype 'object'.
cols, dtypes, dsnamesx = [], [], []
chk = {}  # object-dtype column name -> dataset it was seen in

for ds_name, table in zip(dsnames, datasets):
    ds_cols = list(table.columns)
    print(ds_name)
    print(ds_cols)
    print()
    dsnamesx += [ds_name] * len(ds_cols)
    cols += ds_cols
    for col in ds_cols:
        type_name = table[col].dtype.name
        dtypes.append(type_name)
        if type_name == 'object':
            chk[col] = ds_name

COMMVEH
['CRN', 'AXLE_CNT', 'CARGO_BD_TYPE', 'CARRIER_ADDR_1', 'CARRIER_ADDR_2', 'CARRIER_ADDR_CITY', 'CARRIER_ADDR_STATE', 'CARRIER_ADDR_ZIP', 'CARRIER_NM', 'CARRIER_TEL', 'GVWR', 'HAZMAT_CD1', 'HAZMAT_CD2', 'HAZMAT_CD3', 'HAZMAT_CD4', 'HAZMAT_IND', 'HAZMAT_REL_IND1', 'HAZMAT_REL_IND2', 'HAZMAT_REL_IND3', 'HAZMAT_REL_IND4', 'ICC_NUM', 'OSIZE_LOAD_IND', 'PERMITTED', 'PUC_NUM', 'SPECIAL_SIZING1', 'SPECIAL_SIZING2', 'SPECIAL_SIZING3', 'SPECIAL_SIZING4', 'TYPE_OF_CARRIER', 'UNIT_NUM', 'USDOT_NUM', 'VEH_CONFIG_CD']

CRASH
['CRN', 'ARRIVAL_TM', 'AUTOMOBILE_COUNT', 'BELTED_DEATH_COUNT', 'BELTED_SUSP_SERIOUS_INJ_COUNT', 'BICYCLE_COUNT', 'BICYCLE_DEATH_COUNT', 'BICYCLE_SUSP_SERIOUS_INJ_COUNT', 'BUS_COUNT', 'CHLDPAS_DEATH_COUNT', 'CHLDPAS_SUSP_SERIOUS_INJ_COUNT', 'COLLISION_TYPE', 'COMM_VEH_COUNT', 'CONS_ZONE_SPD_LIM', 'COUNTY', 'CRASH_MONTH', 'CRASH_YEAR', 'DAY_OF_WEEK', 'DEC_LATITUDE', 'DEC_LONGITUDE', 'DISPATCH_TM', 'DISTRICT', 'DRIVER_COUNT_16YR', 'DRIVER_COUNT_17YR', 'DRIVER_COUNT_18YR', '

In [32]:
# Sanity check: the three parallel lists built above should have equal length.
len(dsnamesx)

368

In [33]:
# Total number of columns across all tables.
len(cols)

368

In [34]:
# Number of dtype names recorded (one per column).
len(dtypes)

368

In [35]:
# Columns whose dtype name is 'object', mapped to the dataset they belong to.
chk

{'CARRIER_TEL': 'COMMVEH',
 'GVWR': 'COMMVEH',
 'ICC_NUM': 'COMMVEH',
 'INTERSECTION_RELATED': 'CRASH',
 'SECONDARY_CRASH': 'CRASH',
 'WZ_WORKERS_INJ_KILLED': 'CRASH',
 'MC_ENGINE_SIZE': 'CYCLE',
 'AIRBAG1': 'PERSON',
 'TRAILER_PARTIAL_VIN': 'TRAILVEH',
 'TRL_VEH_TAG_YR': 'TRAILVEH',
 'BODY_TYPE': 'VEHICLE',
 'TRAVEL_SPD': 'VEHICLE'}

In [36]:
# Print the distinct values of every object-dtype column recorded in `chk`.
# Only the six tables listed here are inspected; any other dataset name just
# prints its header lines.
_inspectable = {
    'COMMVEH': comm,
    'CRASH': crash,
    'CYCLE': cycle,
    'PERSON': person,
    'TRAILVEH': trail,
    'VEHICLE': veh,
}

for col, ds_name in chk.items():
    print(ds_name)
    print(col)
    if ds_name in _inspectable:
        print(_inspectable[ds_name][col].unique())
    print()
    

COMMVEH
CARRIER_TEL
[8003349608.0 9999999999.0 8008762016.0 ... 7247835035.0 4124252682.0
 9733441010.0]

COMMVEH
GVWR
[80000.0 999999.0 73280.0 ... '009715' '016120' '056001']

COMMVEH
ICC_NUM
[nan '000173354' '999999999' ... '000866225' '000035236' '000264237']

CRASH
INTERSECTION_RELATED
[nan 'N' 'Y']

CRASH
SECONDARY_CRASH
[nan 'N' 'Y']

CRASH
WZ_WORKERS_INJ_KILLED
[nan 'N' 'Y']

CYCLE
MC_ENGINE_SIZE
[nan '99999' '01000' ... 48500.0 116.0 11400.0]

PERSON
AIRBAG1
[nan 'M' '02' '03' '04' '01' '00' '97']

TRAILVEH
TRAILER_PARTIAL_VIN
[nan '99999999999' '3H3V532C8FT' ... '4X4TVBC25F5' '43YDC2224CC'
 '2DM42JA48SS']

TRAILVEH
TRL_VEH_TAG_YR
[nan '2005' '9999' '2002' '205' '2006' '2007' '2004' '1205' '854' '0405'
 '06' '2009' '1996' '2000' '1990' '999' '9' '2001' '25' '05' '99-9' '1995'
 '2010' '02' '26' '2008' '1997' '6' '22' '2011' '0000' '1999' '1973'
 '8884' '0360' '08' '012' '011' '206' '104' '209' '7100' '60' '201' '0705'
 '2015' '402' '236' '2504' '24' '2016' '1998' '099' '20' '12

In [37]:
# One row per (dataset, column) with the dtype name pandas inferred for it.
col_dict = pd.DataFrame(
    {
        'Dataset': dsnamesx,
        'Column': cols,
        'Data Type': dtypes,
    }
)

In [38]:
# Preview the first rows of the column/dtype table.
col_dict.head()

Unnamed: 0,Dataset,Column,Data Type
0,COMMVEH,CRN,int64
1,COMMVEH,AXLE_CNT,float64
2,COMMVEH,CARGO_BD_TYPE,float64
3,COMMVEH,CARRIER_ADDR_1,str
4,COMMVEH,CARRIER_ADDR_2,str


In [39]:
# Persist the column/dtype table (path is relative to the working directory).
col_dict.to_csv('data/aux/column_dict.csv', index=False)

### <a id='select-crash'>Select CRASH Columns to Use</a>

In [40]:
# CRASH columns carried into the compiled dataset (order defines column order).
crash_cols = [
    'COUNTY',
    'CRASH_MONTH',
    'CRASH_YEAR',
    'CRN',
    'DAY_OF_WEEK',
    'DRIVER_COUNT_16YR',
    'DRIVER_COUNT_17YR',
    'DRIVER_COUNT_18YR',
    'DRIVER_COUNT_19YR',
    'DRIVER_COUNT_20YR',
    'DRIVER_COUNT_50_64YR',
    'DRIVER_COUNT_65_74YR',
    'DRIVER_COUNT_75PLUS',
    'ILLUMINATION',
    'INTERSECT_TYPE',
    'LOCATION_TYPE',
    'ROAD_CONDITION',
    'SECONDARY_CRASH',
    'URBAN_RURAL',
    'WEATHER1',
    'WEATHER2',
]

In [41]:
# Sort the selected CRASH columns into buckets by dtype name; anything that is
# not int64/float64, str or object lands in oth_cols.
num_cols, str_cols, obj_cols, oth_cols = [], [], [], []
_bucket_for = {
    'int64': num_cols,
    'float64': num_cols,
    'str': str_cols,
    'object': obj_cols,
}

for col in crash_cols:
    _bucket_for.get(crash[col].dtype.name, oth_cols).append(col)

In [42]:
# Number of selected CRASH columns with an unexpected dtype.
len(oth_cols)

0

In [43]:
# Selected CRASH columns stored with dtype 'object'.
obj_cols

['SECONDARY_CRASH']

In [44]:
# Distinct values of INTERSECTION_RELATED (not among the selected columns).
crash['INTERSECTION_RELATED'].unique()

array([nan, 'N', 'Y'], dtype=object)

In [45]:
# Distinct values of SECONDARY_CRASH, the one object-dtype selected column.
crash['SECONDARY_CRASH'].unique()

array([nan, 'N', 'Y'], dtype=object)

In [46]:
# Independent copy of the selected CRASH columns with a fresh 0..n-1 index.
crash_selected = (
    crash[crash_cols]
    .reset_index(drop=True)
    .copy()
)

### <a id='select-flags'>Select FLAGS Columns to Use</a>

In [47]:
# FLAGS columns carried into the compiled dataset (order defines column order).
flags_cols = [
    'AGGRESSIVE_DRIVING', 'ALCOHOL_RELATED', 'CELL_PHONE', 'CRN',
    'CROSS_MEDIAN', 'CURVE_DVR_ERROR', 'CURVED_ROAD', 'DISTRACTED',
    'DRINKING_DRIVER',
    'DRIVER_16YR', 'DRIVER_17YR', 'DRIVER_18YR', 'DRIVER_19YR', 'DRIVER_20YR',
    'DRIVER_50_64YR', 'DRIVER_65_74YR', 'DRIVER_75PLUS',
    'DRUG_RELATED', 'DRUGGED_DRIVER', 'FATIGUE_ASLEEP', 'HIT_RUN', 'ICY_ROAD',
    'ILLEGAL_DRUG_RELATED', 'ILLUMINATION_DARK', 'IMPAIRED_DRIVER',
    'IMPAIRED_NONMOTORIST', 'INTERSECTION', 'LANE_DEPARTURE',
    'MARIJUANA_DRUGGED_DRIVER', 'MARIJUANA_RELATED', 'MATURE_DRIVER',
    'MC_DRINKING_DRIVER', 'NHTSA_AGG_DRIVING', 'NON_INTERSECTION',
    'OPIOID_RELATED', 'RAMP', 'RAMP_SEGMENT', 'RAMP_TERMINAL', 'ROUNDABOUT',
    'RUNNING_RED_LT', 'RUNNING_STOP_SIGN', 'RURAL', 'SIGNALIZED_INT',
    'SNOW_SLUSH_ROAD', 'SPEED_CHANGE_LANE', 'SPEEDING', 'SPEEDING_RELATED',
    'STOP_CONTROLLED_INT', 'SUDDEN_DEER', 'TAILGATING', 'UNDERAGE_DRNK_DRV',
    'UNLICENSED', 'UNSIGNALIZED_INT', 'URBAN', 'WET_ROAD', 'YOUNG_DRIVER',
]

In [48]:
# Sort the selected FLAGS columns into buckets by dtype name; anything that is
# not int64/float64, str or object lands in oth_cols.
num_cols, str_cols, obj_cols, oth_cols = [], [], [], []
_bucket_for = {
    'int64': num_cols,
    'float64': num_cols,
    'str': str_cols,
    'object': obj_cols,
}

for col in flags_cols:
    _bucket_for.get(flags[col].dtype.name, oth_cols).append(col)

In [49]:
# Number of selected FLAGS columns with an unexpected dtype.
len(oth_cols)

0

In [50]:
# Show the dtype of every FLAGS column that fell outside the expected buckets.
# oth_cols was filled from flags_cols, so the dtype must be read from `flags`.
for col in oth_cols:
    print(col)
    print(flags[col].dtype.name)
    print()

In [51]:
# Independent copy of the selected FLAGS columns with a fresh 0..n-1 index.
flags_selected = (
    flags[flags_cols]
    .reset_index(drop=True)
    .copy()
)

### <a id='enum-df'>Display Decoding Dictionary</a>

In [52]:
#enum_df = pd.read_excel('data/aux/enumeration_dictionary.xlsx')

In [53]:
#enum_df.COLUMN.unique()

In [54]:
# Columns whose integer codes have a text enumeration.
enum_cols = [
    'DAY_OF_WEEK',
    'ILLUMINATION',
    'INTERSECT_TYPE',
    'LOCATION_TYPE',
    'ROAD_CONDITION',
    'COUNTY',
    'URBAN_RURAL',
    'WEATHER1',
    'WEATHER2',
]

In [55]:
# NOTE(review): the code below is wrapped in a string literal, so this cell only
# evaluates to a string and never builds `enum_dict` (and `enum_df` is commented
# out above). Later cells still use `enum_dict`; presumably it is supplied by
# the star import from utils.functions -- TODO confirm, otherwise the notebook
# fails on a fresh kernel.
"""enum_dict = {}

for c in enum_cols:
    enum = enum_df.loc[enum_df['COLUMN']==c]
    tmp_dict = dict(zip(enum['VALUE'], enum['CONTENT'].str.strip()))
    enum_dict[c] = tmp_dict"""

"enum_dict = {}\n\nfor c in enum_cols:\n    enum = enum_df.loc[enum_df['COLUMN']==c]\n    tmp_dict = dict(zip(enum['VALUE'], enum['CONTENT'].str.strip()))\n    enum_dict[c] = tmp_dict"

In [56]:
# Print every enumeration: column name, its code -> label mapping, blank line.
for column_name, code_labels in enum_dict.items():
    print(column_name, code_labels, '', sep='\n')
    print()

DAY_OF_WEEK
{1: 'Sunday', 2: 'Monday', 3: 'Tuesday', 4: 'Wednesday', 5: 'Thursday', 6: 'Friday', 7: 'Saturday', 9: 'Unknown'}

LOCATION_TYPE
{0: 'Not applicable', 1: 'Underpass', 2: 'Ramp', 3: 'Bridge', 4: 'Tunnel', 5: 'Toll Booth', 6: 'Cross over related', 7: 'Driveway or Parking Lot', 8: 'Ramp and bridge', 99: 'Unknown'}

RELATION_TO_ROAD
{1: 'On roadway', 2: 'Shoulder', 3: 'Median', 4: 'Roadside (off trafficway; on vehicle area)', 5: 'Outside trafficway (in area not meant for vehicles)', 6: 'In parking lane', 7: 'Gore (intersection of ramp and highway)', 9: 'Unknown'}

ROAD_CONDITION
{1: 'Dry', 2: 'Ice/Frost', 3: 'Mud, Dirt, Gravel', 4: 'Oil', 5: 'Sand', 6: 'Slush', 7: 'Snow', 8: 'Water (Standing or Moving)', 9: 'Wet', 22: 'Mud, Sand, Dirt, Oil (Expired 1-1-20)', 98: 'Other', 99: 'Unknown'}

COUNTY
{1: 'ADAMS', 2: 'ALLEGHENY', 3: 'ARMSTRONG', 4: 'BEAVER', 5: 'BEDFORD', 6: 'BERKS', 7: 'BLAIR', 8: 'BRADFORD', 9: 'BUCKS', 10: 'BUTLER', 11: 'CAMBRIA', 12: 'CAMERON', 13: 'CARBON', 14: 'C

In [57]:
# Code -> label mapping for DAY_OF_WEEK.
enum_dict['DAY_OF_WEEK']

{1: 'Sunday',
 2: 'Monday',
 3: 'Tuesday',
 4: 'Wednesday',
 5: 'Thursday',
 6: 'Friday',
 7: 'Saturday',
 9: 'Unknown'}

In [58]:
# Example lookup: enumerations are keyed by integer code.
enum_dict['DAY_OF_WEEK'][1]

'Sunday'

### <a id='data-dict'>Create Data Dictionary</a>

In [59]:
# Columns that have an enumeration (defined in the previous section).
enum_cols

['DAY_OF_WEEK',
 'ILLUMINATION',
 'INTERSECT_TYPE',
 'LOCATION_TYPE',
 'ROAD_CONDITION',
 'COUNTY',
 'URBAN_RURAL',
 'WEATHER1',
 'WEATHER2']

In [60]:
# Build the data dictionary for the selected CRASH columns.
# NOTE(review): create_data_dict is not defined in this notebook; presumably it
# comes from utils.functions -- confirm.
crash_dict = create_data_dict(crash_selected, enum_dict)

In [61]:
# Preview the CRASH data dictionary.
crash_dict.head()

Unnamed: 0,Column,Data Type,Number of Unique,Missing,Range/Unique,Enumeration
5,COUNTY,int64,67,0 (0.0%),"[1, 67]","{1: 'ADAMS', 2: 'ALLEGHENY', 3: 'ARMSTRONG', 4..."
11,CRASH_MONTH,int64,12,0 (0.0%),"[1, 12]",
18,CRASH_YEAR,int64,20,0 (0.0%),"[2005, 2024]",
3,CRN,int64,2461193,0 (0.0%),"[2005000003, 2025050430]",
12,DAY_OF_WEEK,int64,7,0 (0.0%),"[1, 7]","{1: 'Sunday', 2: 'Monday', 3: 'Tuesday', 4: 'W..."


In [62]:
# Persist the CRASH data dictionary (path is relative to the working directory).
crash_dict.to_csv('data/aux/crash_data_dict.csv', index=False)

In [63]:
# Build the data dictionary for the selected FLAGS columns (same helper as CRASH).
flags_dict = create_data_dict(flags_selected, enum_dict)

In [64]:
# Persist the FLAGS data dictionary (path is relative to the working directory).
flags_dict.to_csv('data/aux/flags_data_dict.csv', index=False)

### <a id='merge'>Merge CRASH and FLAGS</a>

In [65]:
# Combine the selected CRASH and FLAGS columns on CRN, keeping CRNs that appear
# in either table. validate='one_to_one' makes pandas raise a MergeError if CRN
# repeats in either input, instead of silently multiplying rows in the join.
# pd.merge already returns a new frame, so no extra .copy() is needed.
df = pd.merge(
    crash_selected,
    flags_selected,
    on='CRN',
    how='outer',
    validate='one_to_one',
)

In [66]:
# Column names of crash_selected as a set.
set(crash_selected.columns)

{'COUNTY',
 'CRASH_MONTH',
 'CRASH_YEAR',
 'CRN',
 'DAY_OF_WEEK',
 'DRIVER_COUNT_16YR',
 'DRIVER_COUNT_17YR',
 'DRIVER_COUNT_18YR',
 'DRIVER_COUNT_19YR',
 'DRIVER_COUNT_20YR',
 'DRIVER_COUNT_50_64YR',
 'DRIVER_COUNT_65_74YR',
 'DRIVER_COUNT_75PLUS',
 'ILLUMINATION',
 'INTERSECT_TYPE',
 'LOCATION_TYPE',
 'ROAD_CONDITION',
 'SECONDARY_CRASH',
 'URBAN_RURAL',
 'WEATHER1',
 'WEATHER2'}

In [67]:
# Column names of flags_selected as a set.
set(flags_selected.columns)

{'AGGRESSIVE_DRIVING',
 'ALCOHOL_RELATED',
 'CELL_PHONE',
 'CRN',
 'CROSS_MEDIAN',
 'CURVED_ROAD',
 'CURVE_DVR_ERROR',
 'DISTRACTED',
 'DRINKING_DRIVER',
 'DRIVER_16YR',
 'DRIVER_17YR',
 'DRIVER_18YR',
 'DRIVER_19YR',
 'DRIVER_20YR',
 'DRIVER_50_64YR',
 'DRIVER_65_74YR',
 'DRIVER_75PLUS',
 'DRUGGED_DRIVER',
 'DRUG_RELATED',
 'FATIGUE_ASLEEP',
 'HIT_RUN',
 'ICY_ROAD',
 'ILLEGAL_DRUG_RELATED',
 'ILLUMINATION_DARK',
 'IMPAIRED_DRIVER',
 'IMPAIRED_NONMOTORIST',
 'INTERSECTION',
 'LANE_DEPARTURE',
 'MARIJUANA_DRUGGED_DRIVER',
 'MARIJUANA_RELATED',
 'MATURE_DRIVER',
 'MC_DRINKING_DRIVER',
 'NHTSA_AGG_DRIVING',
 'NON_INTERSECTION',
 'OPIOID_RELATED',
 'RAMP',
 'RAMP_SEGMENT',
 'RAMP_TERMINAL',
 'ROUNDABOUT',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'RURAL',
 'SIGNALIZED_INT',
 'SNOW_SLUSH_ROAD',
 'SPEEDING',
 'SPEEDING_RELATED',
 'SPEED_CHANGE_LANE',
 'STOP_CONTROLLED_INT',
 'SUDDEN_DEER',
 'TAILGATING',
 'UNDERAGE_DRNK_DRV',
 'UNLICENSED',
 'UNSIGNALIZED_INT',
 'URBAN',
 'WET_ROAD',
 'YOUNG

### <a id='eval'>Evaluate Related Flags and Enumerated Fields</a>

COUNTY

In [68]:
# Decode COUNTY codes to names; an unknown code raises KeyError.
county_labels = enum_dict['COUNTY']
df['COUNTYx'] = [county_labels[code] for code in df['COUNTY']]

URBAN_RURAL

In [69]:
# Decode URBAN_RURAL codes with the URBAN_RURAL enumeration; an unknown code
# raises KeyError.
df['URBAN_RURALx'] = [enum_dict['URBAN_RURAL'][i] for i in df['URBAN_RURAL']]

DAY_OF_WEEK

In [70]:
# Decode DAY_OF_WEEK codes to day names; an unknown code raises KeyError.
day_labels = enum_dict['DAY_OF_WEEK']
df['DAY_OF_WEEKx'] = [day_labels[code] for code in df['DAY_OF_WEEK']]

In [71]:
# Number of merged rows per decoded day of week.
df['DAY_OF_WEEKx'].value_counts()

DAY_OF_WEEKx
Friday       408675
Saturday     360443
Thursday     356049
Wednesday    351283
Tuesday      346787
Monday       332554
Sunday       305402
Name: count, dtype: int64

ROAD_CONDITION and related flags

In [72]:
# Code -> label mapping for ROAD_CONDITION.
enum_dict['ROAD_CONDITION']

{1: 'Dry',
 2: 'Ice/Frost',
 3: 'Mud, Dirt, Gravel',
 4: 'Oil',
 5: 'Sand',
 6: 'Slush',
 7: 'Snow',
 8: 'Water (Standing or Moving)',
 9: 'Wet',
 22: 'Mud, Sand, Dirt, Oil (Expired 1-1-20)',
 98: 'Other',
 99: 'Unknown'}

In [73]:
# Decode ROAD_CONDITION codes to labels; an unknown code raises KeyError.
road_condition_labels = enum_dict['ROAD_CONDITION']
df['ROAD_CONDITIONx'] = [road_condition_labels[code] for code in df['ROAD_CONDITION']]

In [74]:
# Number of merged rows per decoded road condition.
df['ROAD_CONDITIONx'].value_counts()

ROAD_CONDITIONx
Dry                           1794903
Wet                            429278
Snow                            97813
Ice/Frost                       83886
Slush                           32143
Water (Standing or Moving)       8194
Mud, Dirt, Gravel                6948
Other                            5859
Unknown                          2009
Oil                               139
Sand                               21
Name: count, dtype: int64

In [75]:
# First rows flagged ICY_ROAD, shown next to the decoded road condition and
# the other two road-surface flags.
surface_view_cols = ['ROAD_CONDITIONx', 'SNOW_SLUSH_ROAD', 'WET_ROAD', 'ICY_ROAD']
df.loc[df['ICY_ROAD'].eq(1), surface_view_cols].head()

Unnamed: 0,ROAD_CONDITIONx,SNOW_SLUSH_ROAD,WET_ROAD,ICY_ROAD
117,Ice/Frost,0,0,1
124,Ice/Frost,0,0,1
159,Ice/Frost,0,0,1
162,Ice/Frost,0,0,1
196,Ice/Frost,0,0,1


In [76]:
# Road conditions that occur on rows flagged SNOW_SLUSH_ROAD.
df.loc[df['SNOW_SLUSH_ROAD'].eq(1), 'ROAD_CONDITIONx'].unique()

<StringArray>
['Snow', 'Slush']
Length: 2, dtype: str

In [77]:
# Road conditions that occur on rows flagged WET_ROAD.
df.loc[df['WET_ROAD'].eq(1), 'ROAD_CONDITIONx'].unique()

<StringArray>
['Wet', 'Water (Standing or Moving)']
Length: 2, dtype: str

In [78]:
# Road conditions that occur on rows flagged ICY_ROAD.
df.loc[df['ICY_ROAD'].eq(1), 'ROAD_CONDITIONx'].unique()

<StringArray>
['Ice/Frost']
Length: 1, dtype: str

LOCATION_TYPE and related flags

In [79]:
# Code -> label mapping for LOCATION_TYPE.
enum_dict['LOCATION_TYPE']

{0: 'Not applicable',
 1: 'Underpass',
 2: 'Ramp',
 3: 'Bridge',
 4: 'Tunnel',
 5: 'Toll Booth',
 6: 'Cross over related',
 7: 'Driveway or Parking Lot',
 8: 'Ramp and bridge',
 99: 'Unknown'}

In [80]:
# FLAGS columns compared against LOCATION_TYPE below.
loc_flags = [
    'CROSS_MEDIAN',
    'CURVE_DVR_ERROR',
    'CURVED_ROAD',
    'RAMP',
    'RAMP_SEGMENT',
    'RAMP_TERMINAL',
    'ROUNDABOUT',
    'SPEED_CHANGE_LANE',
]

In [81]:
# Decode LOCATION_TYPE codes to labels; an unknown code raises KeyError.
location_labels = enum_dict['LOCATION_TYPE']
df['LOCATION_TYPEx'] = [location_labels[code] for code in df['LOCATION_TYPE']]

In [82]:
# Number of merged rows per decoded location type.
df['LOCATION_TYPEx'].value_counts()

LOCATION_TYPEx
Not applicable             2216661
Driveway or Parking Lot     117893
Ramp                         69495
Bridge                       32793
Underpass                     7974
Ramp and bridge               4487
Cross over related            4290
Unknown                       4049
Tunnel                        2315
Toll Booth                    1236
Name: count, dtype: int64

In [83]:
# For each location-related flag, list the distinct (location type, flag value)
# combinations that occur in the merged data.
for flag in loc_flags:
    combos = df[['LOCATION_TYPEx', flag]].drop_duplicates()
    print(flag, combos, '', sep='\n')

CROSS_MEDIAN
                 LOCATION_TYPEx  CROSS_MEDIAN
0                Not applicable             0
7       Driveway or Parking Lot             0
11                       Bridge             0
19                         Ramp             0
29               Not applicable             1
32                    Underpass             0
72                      Unknown             0
187             Ramp and bridge             0
214          Cross over related             0
768                      Tunnel             0
1700    Driveway or Parking Lot             1
2778                       Ramp             1
4383                 Toll Booth             0
4843                     Bridge             1
4845                  Underpass             1
9340         Cross over related             1
27277                   Unknown             1
70052                    Tunnel             1
565418               Toll Booth             1
733360          Ramp and bridge             1

CURVE_DVR_ERROR
    

INTERSECT_TYPE and related flags

In [84]:
# FLAGS columns compared against INTERSECT_TYPE below.
int_flags = [
    'INTERSECTION',
    'NON_INTERSECTION',
    'RUNNING_RED_LT',
    'RUNNING_STOP_SIGN',
    'SIGNALIZED_INT',
    'STOP_CONTROLLED_INT',
    'UNSIGNALIZED_INT',
]

In [85]:
# Decode INTERSECT_TYPE codes to labels; an unknown code raises KeyError.
intersect_labels = enum_dict['INTERSECT_TYPE']
df['INTERSECT_TYPEx'] = [intersect_labels[code] for code in df['INTERSECT_TYPE']]

In [86]:
# Number of merged rows per decoded intersection type.
df['INTERSECT_TYPEx'].value_counts()

INTERSECT_TYPEx
Mid-block                 1524430
Four-way intersection      522497
“T” intersection           311912
“Y” intersection            40654
Ramp End                    21380
Ramp Begin                  14965
Multi-leg intersection      14067
Other                        7392
Railroad crossing            1375
“L” Intersection             1243
Roundabout                    759
Crossover                     286
Traffic Circle                228
Unknown                         5
Name: count, dtype: int64

In [87]:
# For each intersection-related flag, list the distinct (intersection type,
# flag value) combinations that occur in the merged data.
for flag in int_flags:
    combos = df[['INTERSECT_TYPEx', flag]].drop_duplicates()
    print(flag, combos, '', sep='\n')

INTERSECTION
                INTERSECT_TYPEx  INTERSECTION
0                     Mid-block             0
2        Multi-leg intersection             1
3              “T” intersection             1
5         Four-way intersection             1
8              “Y” intersection             1
20                        Other             1
50                   Ramp Begin             1
214                   Crossover             1
519                    Ramp End             1
4605          Railroad crossing             1
5681             Traffic Circle             1
64684                Roundabout             1
86006                   Unknown             1
1610179        “L” Intersection             1

NON_INTERSECTION
                INTERSECT_TYPEx  NON_INTERSECTION
0                     Mid-block                 1
2        Multi-leg intersection                 0
3              “T” intersection                 0
5         Four-way intersection                 0
8              “Y” intersecti

DRIVER_COUNT_age and related flags

In [88]:
# CRASH driver-count columns, in the same order as flags_age.
crash_age = [
    'DRIVER_COUNT_16YR',
    'DRIVER_COUNT_17YR',
    'DRIVER_COUNT_18YR',
    'DRIVER_COUNT_19YR',
    'DRIVER_COUNT_20YR',
    'DRIVER_COUNT_50_64YR',
    'DRIVER_COUNT_65_74YR',
    'DRIVER_COUNT_75PLUS',
]

In [89]:
# FLAGS driver-age indicator columns, in the same order as crash_age.
flags_age = [
    'DRIVER_16YR',
    'DRIVER_17YR',
    'DRIVER_18YR',
    'DRIVER_19YR',
    'DRIVER_20YR',
    'DRIVER_50_64YR',
    'DRIVER_65_74YR',
    'DRIVER_75PLUS',
]

In [90]:
# Compare each CRASH driver-count column with its FLAGS indicator: show the
# distinct value pairs, count rows where the count is 0 but the flag is 1, and
# count rows where both values are present.
for count_col, flag_col in zip(crash_age, flags_age):
    print(f'{count_col}/{flag_col}')
    print(df[[count_col, flag_col]].drop_duplicates())
    zero_count_but_flagged = df[count_col].eq(0) & df[flag_col].eq(1)
    print(f'Inconsistent: {int(zero_count_but_flagged.sum())}')
    both_present = df[count_col].notna() & df[flag_col].notna()
    print(f'Total non-null: {int(both_present.sum())}')
    print()

DRIVER_COUNT_16YR/DRIVER_16YR
         DRIVER_COUNT_16YR  DRIVER_16YR
0                        0            0
2                        1            1
1928                     2            1
3945                     3            1
71987                    4            1
1643713                  0            1
Inconsistent: 2
Total non-null: 2461193

DRIVER_COUNT_17YR/DRIVER_17YR
         DRIVER_COUNT_17YR  DRIVER_17YR
0                        0            0
5                        1            1
165                      2            1
28390                    3            1
218825                   4            1
472158                   6            1
1224249                  5            1
1703090                  0            1
Inconsistent: 4
Total non-null: 2461193

DRIVER_COUNT_18YR/DRIVER_18YR
         DRIVER_COUNT_18YR  DRIVER_18YR
0                        1            1
1                        0            0
246                      2            1
4539                     3  

URBAN_RURAL and related flags

In [91]:
# Code -> label mapping for URBAN_RURAL.
enum_dict['URBAN_RURAL']

{1: 'Rural', 2: 'Urbanized', 3: 'Urban'}

In [92]:
# Number of merged rows per raw URBAN_RURAL code.
df['URBAN_RURAL'].value_counts()

URBAN_RURAL
2    1643082
1     818111
Name: count, dtype: int64

In [93]:
# Decode URBAN_RURAL codes to labels; an unknown code raises KeyError.
urban_rural_labels = enum_dict['URBAN_RURAL']
df['URBAN_RURALx'] = [urban_rural_labels[code] for code in df['URBAN_RURAL']]

In [94]:
# Decoded URBAN_RURAL labels that occur on rows flagged URBAN.
df.loc[df['URBAN'].eq(1), 'URBAN_RURALx'].unique()

<StringArray>
['Urbanized', 'Rural']
Length: 2, dtype: str

In [95]:
# Distinct combinations of the URBAN_RURAL field and the URBAN / RURAL flags.
df[['URBAN_RURALx', 'URBAN_RURAL', 'URBAN', 'RURAL']].drop_duplicates()

Unnamed: 0,URBAN_RURALx,URBAN_RURAL,URBAN,RURAL
0,Rural,1,0,1
2,Urbanized,2,1,0
87680,Urbanized,2,0,1
90131,Rural,1,1,0


In [96]:
# Number of merged rows per decoded URBAN_RURAL label.
df['URBAN_RURALx'].value_counts()

URBAN_RURALx
Urbanized    1643082
Rural         818111
Name: count, dtype: int64

In [97]:
# Decoded URBAN_RURAL label counts among rows flagged RURAL.
df.loc[df['RURAL'].eq(1), 'URBAN_RURALx'].value_counts()

URBAN_RURALx
Rural        818102
Urbanized      6220
Name: count, dtype: int64

In [98]:
# Decoded URBAN_RURAL label counts among rows flagged URBAN.
df.loc[df['URBAN'].eq(1), 'URBAN_RURALx'].value_counts()

URBAN_RURALx
Urbanized    1636862
Rural              9
Name: count, dtype: int64

ILLUMINATION and related flags

In [99]:
# Code -> label mapping for ILLUMINATION.
enum_dict['ILLUMINATION']

{1: 'Daylight',
 2: 'Dark - no streetlights',
 3: 'Dark streetlights',
 4: 'Dusk',
 5: 'Dawn',
 6: 'Dark unknown roadway lighting',
 8: 'Other',
 9: 'Unknown'}

In [100]:
# Number of merged rows per raw ILLUMINATION code (missing values are not counted).
df['ILLUMINATION'].value_counts()

ILLUMINATION
1.0    7379
3.0    2414
2.0    1940
5.0     232
4.0     229
6.0      72
8.0      19
Name: count, dtype: int64

In [101]:
# Decode ILLUMINATION codes to labels, leaving missing codes as NaN.
# pd.notna is used for the missing-value test rather than comparing the string
# form of the value to 'nan'. An unknown non-missing code raises KeyError.
illumination_labels = enum_dict['ILLUMINATION']
df['ILLUMINATIONx'] = [
    illumination_labels[code] if pd.notna(code) else np.nan
    for code in df['ILLUMINATION']
]

In [102]:
# Decoded ILLUMINATION labels that occur on rows flagged ILLUMINATION_DARK.
df.loc[df['ILLUMINATION_DARK'].eq(1), 'ILLUMINATIONx'].unique()

<StringArray>
[                            nan,             'Dark streetlights',
        'Dark - no streetlights', 'Dark unknown roadway lighting']
Length: 4, dtype: str

In [103]:
# Distinct combinations of the ILLUMINATION field and the ILLUMINATION_DARK flag.
df[['ILLUMINATIONx', 'ILLUMINATION', 'ILLUMINATION_DARK']].drop_duplicates()

Unnamed: 0,ILLUMINATIONx,ILLUMINATION,ILLUMINATION_DARK
0,,,1
1,,,0
930437,Daylight,1.0,0
930438,Dusk,4.0,0
930445,Dark streetlights,3.0,1
930471,Dark - no streetlights,2.0,1
930483,Other,8.0,0
930564,Dawn,5.0,0
930589,Dark unknown roadway lighting,6.0,1


In [104]:
# Save the distinct ILLUMINATION / ILLUMINATION_DARK combinations to file.
illumination_combos = df[['ILLUMINATIONx', 'ILLUMINATION', 'ILLUMINATION_DARK']].drop_duplicates()
illumination_combos.to_csv('data/aux/illumination.csv', index=False)

In [105]:
# Rows with no ILLUMINATION code that are nevertheless flagged dark.
int((df['ILLUMINATION'].isna() & df['ILLUMINATION_DARK'].eq(1)).sum())

817191

In [106]:
# Rows with no ILLUMINATION code that are flagged not dark.
int((df['ILLUMINATION'].isna() & df['ILLUMINATION_DARK'].eq(0)).sum())

1631717

In [107]:
# Rows where both the ILLUMINATION code and the ILLUMINATION_DARK flag are missing.
int((df['ILLUMINATION'].isna() & df['ILLUMINATION_DARK'].isna()).sum())

0

In [108]:
# Number of merged rows per ILLUMINATION_DARK flag value.
df['ILLUMINATION_DARK'].value_counts()

ILLUMINATION_DARK
0    1639576
1     821617
Name: count, dtype: int64

In [109]:
# Rows where the ILLUMINATION_DARK flag is missing.
int(df['ILLUMINATION_DARK'].isna().sum())

0

In [110]:
# Raw ILLUMINATION code counts again, for comparison with the total row count below.
df['ILLUMINATION'].value_counts()

ILLUMINATION
1.0    7379
3.0    2414
2.0    1940
5.0     232
4.0     229
6.0      72
8.0      19
Name: count, dtype: int64

In [111]:
# Total number of CRASH rows, for scale against the ILLUMINATION counts above.
len(crash)

2461193

In [112]:
# Decode WEATHER1 codes to labels, leaving missing codes as NaN.
# pd.notna is used for the missing-value test rather than comparing the string
# form of the value to 'nan'. An unknown non-missing code raises KeyError.
weather1_labels = enum_dict['WEATHER1']
df['WEATHER1x'] = [
    weather1_labels[code] if pd.notna(code) else np.nan
    for code in df['WEATHER1']
]

In [113]:
# Decode WEATHER2 codes to labels, leaving missing codes as NaN.
# pd.notna is used for the missing-value test rather than comparing the string
# form of the value to 'nan'. An unknown non-missing code raises KeyError.
weather2_labels = enum_dict['WEATHER2']
df['WEATHER2x'] = [
    weather2_labels[code] if pd.notna(code) else np.nan
    for code in df['WEATHER2']
]

In [114]:
# Distinct weather combinations among rows flagged dark but lacking an
# ILLUMINATION code.
dark_without_code = df['ILLUMINATION_DARK'].eq(1) & df['ILLUMINATION'].isna()
df.loc[
    dark_without_code,
    ['WEATHER1x', 'WEATHER2x', 'ILLUMINATION', 'ILLUMINATION_DARK'],
].drop_duplicates()

Unnamed: 0,WEATHER1x,WEATHER2x,ILLUMINATION,ILLUMINATION_DARK
0,,,,1
100,,"Fog, Smog, Smoke",,1
1808248,,Clear,,1
1873855,,Severe Crosswinds,,1
1875509,,Rain,,1
...,...,...,...,...
2378997,Other,Rain,,1
2380574,Rain,Blowing Snow,,1
2422853,Cloudy,Sleet or Hail,,1
2445454,Severe Crosswinds,Clear,,1


In [115]:
# Number of rows flagged dark but lacking an ILLUMINATION code.
int((df['ILLUMINATION_DARK'].eq(1) & df['ILLUMINATION'].isna()).sum())

817191

In [116]:
# Distinct decoded WEATHER1 labels across all merged rows.
df['WEATHER1x'].unique().tolist()

[nan,
 'Clear',
 'Rain',
 'Freezing Rain or Freezing Drizzle',
 'Snow',
 'Cloudy',
 'Other',
 'Fog, Smog, Smoke',
 'Unknown',
 'Blowing Snow',
 'Sleet or Hail',
 'Severe Crosswinds',
 'Blowing Sand, Soil, Dirt']

In [117]:
# Distinct decoded WEATHER1 labels among rows flagged ILLUMINATION_DARK.
df.loc[df['ILLUMINATION_DARK'].eq(1), 'WEATHER1x'].unique().tolist()

[nan,
 'Clear',
 'Rain',
 'Freezing Rain or Freezing Drizzle',
 'Snow',
 'Other',
 'Cloudy',
 'Fog, Smog, Smoke',
 'Unknown',
 'Blowing Snow',
 'Sleet or Hail',
 'Blowing Sand, Soil, Dirt',
 'Severe Crosswinds']

In [118]:
# Distinct decoded WEATHER2 labels among rows flagged ILLUMINATION_DARK.
df.loc[df['ILLUMINATION_DARK'].eq(1), 'WEATHER2x'].unique().tolist()

[nan,
 'Fog, Smog, Smoke',
 'Clear',
 'Severe Crosswinds',
 'Rain',
 'Other',
 'Cloudy',
 'Snow',
 'Freezing Rain or Freezing Drizzle',
 'Blowing Snow',
 'Sleet or Hail',
 'Blowing Sand, Soil, Dirt']

### <a id='write'>Write Final Compiled Dataset to File</a>

In [119]:
# Write the merged CRASH + FLAGS frame, including the decoded "...x" columns
# added above, to CSV (path is relative to the working directory).
df.to_csv('data/aux/selected_crash_and_flags.csv', index=False)