## COVID Drivers: Collect Initial Data

### Table of Contents
* [Read the Data](#read)</BR>
* [VEHICLE](#veh)</BR>
* [COMMVEH](#comm)</BR>
* [CYCLE](#cycle)</BR>
* [TRAILVEH](#trail)</BR>
* [VEHICLE, COMMVEH, TRAILVEH, and CYCLE](#veh-mrg)</BR>
* [PERSON](#person)</BR>
* [ROADWAY](#road)</BR>
* [CRASH](#crash)</BR>
* [FLAGS](#flags)

Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import ruptures as rpt
import altair as alt

import statsmodels.api as sm

In [2]:
# Import project specific utilities
from utils.functions import *

### <a id='read'>Read the data</a>

In [3]:
comm = pd.DataFrame()
crash = pd.DataFrame()
cycle = pd.DataFrame()
flags = pd.DataFrame()
person = pd.DataFrame()
road = pd.DataFrame()
trail = pd.DataFrame()
veh = pd.DataFrame()

for folder in os.listdir(path_raw):
    #print(folder)
    year = folder[-4:]
    #print(year)
    if 'statewide' in folder.lower():
    #if year != '2020' and 'statewide' in folder.lower():
        tmp = read_data(folder, year, 'COMMVEH')
        comm = (tmp.copy() if comm.empty else comm.copy() if tmp.empty
            else pd.concat([comm, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'CRASH')
        crash = (tmp.copy() if crash.empty else crash.copy() if tmp.empty
            else pd.concat([crash, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'CYCLE')
        cycle = (tmp.copy() if cycle.empty else cycle.copy() if tmp.empty
            else pd.concat([cycle, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'FLAGS')
        flags = (tmp.copy() if flags.empty else flags.copy() if tmp.empty
            else pd.concat([flags, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'PERSON')
        person = (tmp.copy() if person.empty else person.copy() if tmp.empty
            else pd.concat([person, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'ROADWAY')
        road = (tmp.copy() if road.empty else road.copy() if tmp.empty
            else pd.concat([road, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'TRAILVEH')
        trail = (tmp.copy() if trail.empty else trail.copy() if tmp.empty
            else pd.concat([trail, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'VEHICLE')
        veh = (tmp.copy() if veh.empty else veh.copy() if tmp.empty
            else pd.concat([veh, tmp]) # if both DataFrames non empty
        )
            

In [4]:
specs = pd.DataFrame({'Year':years,
                     'File Path':filepaths,
                     'File Size (MB)':filesizes,
                     'Number of Rows':nrows,
                     'Number of Columns':ncols})

In [5]:
specs.sort_values(['File Path', 'Year']).to_csv('data/aux/data_specs.csv', index=False)

### <a id='veh'>VEHICLE</a>

In [6]:
veh.columns

Index(['CRN', 'AVOID_MAN_CD', 'BODY_TYPE', 'COMM_VEH_IND', 'DAMAGE_IND',
       'DVR_PRES_IND', 'EMERG_VEH_USE_CD', 'GRADE', 'HAZMAT_IND',
       'IMPACT_POINT', 'INS_IND', 'MAKE_CD', 'MODEL_YR', 'NM_AT_INTERSECTION',
       'NM_CROSSING_TCD', 'NM_DISTRACTION', 'NM_IN_CROSSWALK', 'NM_LIGHTING',
       'NM_POWERED', 'NM_REFLECT', 'NON_MOTORIST', 'OWNER_DRIVER',
       'PARTIAL_VIN', 'PEOPLE_IN_UNIT', 'PRIN_IMP_PT', 'RDWY_ALIGNMENT',
       'SPECIAL_USAGE', 'TOW_IND', 'TRAVEL_DIRECTION', 'TRAVEL_SPD',
       'TRL_VEH_CNT', 'UNDER_RIDE_IND', 'UNIT_NUM', 'UNIT_TYPE',
       'VEH_COLOR_CD', 'VEH_MOVEMENT', 'VEH_POSITION', 'VEH_REG_STATE',
       'VEH_ROLE_CD', 'VEH_TYPE', 'VINA_BODY_TYPE_CD'],
      dtype='str')

In [7]:
veh.loc[:,['CRN','UNIT_NUM']].head()

Unnamed: 0,CRN,UNIT_NUM
0,2005096326,1
1,2005132408,1
2,2005132408,2
3,2005153012,1
4,2005153012,2


In [8]:
veh['rank'] = veh.groupby(['CRN']).cumcount() + 1

In [9]:
veh['rank'].value_counts()

rank
1     2461192
2     1574437
3      256397
4       54076
5       13271
       ...   
60          1
61          1
62          1
63          1
64          1
Name: count, Length: 64, dtype: int64

In [10]:
veh_crn = veh['CRN'].unique().tolist()

### <a id='comm'>COMMVEH</a>

In [11]:
comm.loc[~comm['CRN'].isin(veh_crn)].shape[0]

0

In [12]:
comm.columns

Index(['CRN', 'AXLE_CNT', 'CARGO_BD_TYPE', 'CARRIER_ADDR_1', 'CARRIER_ADDR_2',
       'CARRIER_ADDR_CITY', 'CARRIER_ADDR_STATE', 'CARRIER_ADDR_ZIP',
       'CARRIER_NM', 'CARRIER_TEL', 'GVWR', 'HAZMAT_CD1', 'HAZMAT_CD2',
       'HAZMAT_CD3', 'HAZMAT_CD4', 'HAZMAT_IND', 'HAZMAT_REL_IND1',
       'HAZMAT_REL_IND2', 'HAZMAT_REL_IND3', 'HAZMAT_REL_IND4', 'ICC_NUM',
       'OSIZE_LOAD_IND', 'PERMITTED', 'PUC_NUM', 'SPECIAL_SIZING1',
       'SPECIAL_SIZING2', 'SPECIAL_SIZING3', 'SPECIAL_SIZING4',
       'TYPE_OF_CARRIER', 'UNIT_NUM', 'USDOT_NUM', 'VEH_CONFIG_CD'],
      dtype='str')

In [13]:
comm.info()

<class 'pandas.DataFrame'>
Index: 162702 entries, 0 to 8715
Data columns (total 32 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   CRN                 162702 non-null  int64  
 1   AXLE_CNT            161734 non-null  float64
 2   CARGO_BD_TYPE       162631 non-null  float64
 3   CARRIER_ADDR_1      162453 non-null  str    
 4   CARRIER_ADDR_2      6623 non-null    str    
 5   CARRIER_ADDR_CITY   162440 non-null  str    
 6   CARRIER_ADDR_STATE  162313 non-null  str    
 7   CARRIER_ADDR_ZIP    162246 non-null  str    
 8   CARRIER_NM          162626 non-null  str    
 9   CARRIER_TEL         161366 non-null  object 
 10  GVWR                137455 non-null  object 
 11  HAZMAT_CD1          3367 non-null    float64
 12  HAZMAT_CD2          427 non-null     float64
 13  HAZMAT_CD3          206 non-null     float64
 14  HAZMAT_CD4          191 non-null     float64
 15  HAZMAT_IND          154287 non-null  str    
 16  HA

In [14]:
comm.loc[:,['CRN','UNIT_NUM']].head()

Unnamed: 0,CRN,UNIT_NUM
0,2005105557,2
1,2005199544,2
2,2005104218,1
3,2005075974,2
4,2005088473,1


In [15]:
comm['rank'] = comm.groupby(['CRN']).cumcount() + 1

In [16]:
comm['rank'].value_counts()

rank
1     150338
2      10927
3        963
4        207
5         89
6         44
7         30
8         23
9         18
10        13
11        10
12         8
13         6
14         4
15         2
16         2
17         2
18         2
19         2
20         1
21         1
22         1
23         1
24         1
25         1
26         1
27         1
28         1
29         1
30         1
31         1
Name: count, dtype: int64

### <a id='cycle'>CYCLE</a>

In [17]:
cycle.loc[~cycle['CRN'].isin(veh_crn)].shape[0]

0

In [18]:
cycle.columns

Index(['CRN', 'MC_BAG_IND', 'MC_DVR_BOOTS_IND', 'MC_DVR_EDC_IND',
       'MC_DVR_EYEPRT_IND', 'MC_DVR_HLMTDOT_IND', 'MC_DVR_HLMTON_IND',
       'MC_DVR_HLMT_TYPE', 'MC_DVR_LNGPNTS_IND', 'MC_DVR_LNGSLV_IND',
       'MC_ENGINE_SIZE', 'MC_PASSNGR_IND', 'MC_PAS_BOOTS_IND',
       'MC_PAS_EYEPRT_IND', 'MC_PAS_HLMTDOT_IND', 'MC_PAS_HLMTON_IND',
       'MC_PAS_HLMT_TYPE', 'MC_PAS_LNGPNTS_IND', 'MC_PAS_LNGSLV_IND',
       'MC_TRAIL_IND', 'UNIT_NUM'],
      dtype='str')

In [19]:
cycle.loc[:,['CRN','UNIT_NUM']].head()

Unnamed: 0,CRN,UNIT_NUM
0,2005157748,1
1,2005145325,2
2,2005152192,1
3,2005131003,2
4,2005131003,3


In [20]:
cycle['rank'] = cycle.groupby(['CRN']).cumcount() + 1

In [21]:
cycle['rank'].value_counts()

rank
1    90544
2     2168
3      137
4       19
5        8
6        3
7        1
8        1
Name: count, dtype: int64

### <a id='trail'>TRAILVEH</a>

In [22]:
trail.loc[~trail['CRN'].isin(veh_crn)].shape[0]

0

In [23]:
trail.columns

Index(['CRN', 'TRAILER_PARTIAL_VIN', 'TRL_SEQ_NUM', 'TRL_VEH_REG_STATE',
       'TRL_VEH_TAG_NUM', 'TRL_VEH_TAG_YR', 'TRL_VEH_TYPE_CD', 'UNIT_NUM'],
      dtype='str')

In [24]:
trail.loc[:,['CRN','UNIT_NUM']].head()

Unnamed: 0,CRN,UNIT_NUM
0,2005159564,2
1,2006009775,1
2,2005120303,2
3,2005083797,1
4,2005144488,2


In [25]:
trail['rank'] = trail.groupby(['CRN']).cumcount() + 1

In [26]:
trail['rank'].value_counts()

rank
1     107626
2       8790
3        856
4        193
5         79
6         41
7         29
8         23
9         18
10        12
11         9
12         8
13         5
14         4
15         2
16         2
17         2
18         2
19         2
20         1
21         1
22         1
23         1
24         1
25         1
26         1
27         1
28         1
Name: count, dtype: int64

In [27]:
veh['TRL_VEH_CNT'].value_counts()

TRL_VEH_CNT
0.0    2898006
1.0     113663
2.0       3038
9.0        164
6.0         14
3.0          8
5.0          8
8.0          5
7.0          2
Name: count, dtype: int64

### <a id='veh-mrg'>VEHICLE, COMMVEH, TRAILVEH, and CYCLE</a>

In [28]:
comm.HAZMAT_IND.value_counts()

HAZMAT_IND
N    150966
Y      3312
U         9
Name: count, dtype: int64

In [29]:
veh.HAZMAT_IND.value_counts()

HAZMAT_IND
N    4355554
Y       3312
U          9
Name: count, dtype: int64

In [30]:
veh2 = pd.merge(veh.rename(columns={'rank':'rank_veh'}).copy(), comm.rename(columns={'rank':'rank_comm'}).copy(), on=['CRN','UNIT_NUM'], how='left').copy()

In [31]:
veh2.columns

Index(['CRN', 'AVOID_MAN_CD', 'BODY_TYPE', 'COMM_VEH_IND', 'DAMAGE_IND',
       'DVR_PRES_IND', 'EMERG_VEH_USE_CD', 'GRADE', 'HAZMAT_IND_x',
       'IMPACT_POINT', 'INS_IND', 'MAKE_CD', 'MODEL_YR', 'NM_AT_INTERSECTION',
       'NM_CROSSING_TCD', 'NM_DISTRACTION', 'NM_IN_CROSSWALK', 'NM_LIGHTING',
       'NM_POWERED', 'NM_REFLECT', 'NON_MOTORIST', 'OWNER_DRIVER',
       'PARTIAL_VIN', 'PEOPLE_IN_UNIT', 'PRIN_IMP_PT', 'RDWY_ALIGNMENT',
       'SPECIAL_USAGE', 'TOW_IND', 'TRAVEL_DIRECTION', 'TRAVEL_SPD',
       'TRL_VEH_CNT', 'UNDER_RIDE_IND', 'UNIT_NUM', 'UNIT_TYPE',
       'VEH_COLOR_CD', 'VEH_MOVEMENT', 'VEH_POSITION', 'VEH_REG_STATE',
       'VEH_ROLE_CD', 'VEH_TYPE', 'VINA_BODY_TYPE_CD', 'rank_veh', 'AXLE_CNT',
       'CARGO_BD_TYPE', 'CARRIER_ADDR_1', 'CARRIER_ADDR_2',
       'CARRIER_ADDR_CITY', 'CARRIER_ADDR_STATE', 'CARRIER_ADDR_ZIP',
       'CARRIER_NM', 'CARRIER_TEL', 'GVWR', 'HAZMAT_CD1', 'HAZMAT_CD2',
       'HAZMAT_CD3', 'HAZMAT_CD4', 'HAZMAT_IND_y', 'HAZMAT_REL_IND1',
     

In [32]:
veh2.loc[veh2['HAZMAT_IND_x'] != veh2['HAZMAT_IND_y'], ['CRN','HAZMAT_IND_x','HAZMAT_IND_y']].shape[0]

4212997

In [33]:
veh2.loc[veh2['HAZMAT_IND_x'] == veh2['HAZMAT_IND_y'], ['CRN','HAZMAT_IND_x','HAZMAT_IND_y']].shape[0]

154286

In [34]:
veh2.shape[0]

4367283

In [35]:
veh2.loc[(veh2['COMM_VEH_IND']=='Y') & veh2['rank_comm'].isna()].shape[0]

318

In [36]:
veh3 = pd.merge(veh2, trail.rename(columns={'rank':'rank_trail'}).copy(), on=['CRN','UNIT_NUM'], how='left').copy()

In [37]:
veh3.columns

Index(['CRN', 'AVOID_MAN_CD', 'BODY_TYPE', 'COMM_VEH_IND', 'DAMAGE_IND',
       'DVR_PRES_IND', 'EMERG_VEH_USE_CD', 'GRADE', 'HAZMAT_IND_x',
       'IMPACT_POINT', 'INS_IND', 'MAKE_CD', 'MODEL_YR', 'NM_AT_INTERSECTION',
       'NM_CROSSING_TCD', 'NM_DISTRACTION', 'NM_IN_CROSSWALK', 'NM_LIGHTING',
       'NM_POWERED', 'NM_REFLECT', 'NON_MOTORIST', 'OWNER_DRIVER',
       'PARTIAL_VIN', 'PEOPLE_IN_UNIT', 'PRIN_IMP_PT', 'RDWY_ALIGNMENT',
       'SPECIAL_USAGE', 'TOW_IND', 'TRAVEL_DIRECTION', 'TRAVEL_SPD',
       'TRL_VEH_CNT', 'UNDER_RIDE_IND', 'UNIT_NUM', 'UNIT_TYPE',
       'VEH_COLOR_CD', 'VEH_MOVEMENT', 'VEH_POSITION', 'VEH_REG_STATE',
       'VEH_ROLE_CD', 'VEH_TYPE', 'VINA_BODY_TYPE_CD', 'rank_veh', 'AXLE_CNT',
       'CARGO_BD_TYPE', 'CARRIER_ADDR_1', 'CARRIER_ADDR_2',
       'CARRIER_ADDR_CITY', 'CARRIER_ADDR_STATE', 'CARRIER_ADDR_ZIP',
       'CARRIER_NM', 'CARRIER_TEL', 'GVWR', 'HAZMAT_CD1', 'HAZMAT_CD2',
       'HAZMAT_CD3', 'HAZMAT_CD4', 'HAZMAT_IND_y', 'HAZMAT_REL_IND1',
     

There are commercial vehicles with trailing units...makes sense.

In [38]:
veh3.loc[veh3['rank_comm'].notna() & veh3['rank_trail'].notna()].shape[0]

86108

In [39]:
veh4 = pd.merge(veh3, cycle.rename(columns={'rank':'rank_cyc'}).copy(), on=['CRN','UNIT_NUM'], how='left').copy()

In [40]:
veh4.columns

Index(['CRN', 'AVOID_MAN_CD', 'BODY_TYPE', 'COMM_VEH_IND', 'DAMAGE_IND',
       'DVR_PRES_IND', 'EMERG_VEH_USE_CD', 'GRADE', 'HAZMAT_IND_x',
       'IMPACT_POINT', 'INS_IND', 'MAKE_CD', 'MODEL_YR', 'NM_AT_INTERSECTION',
       'NM_CROSSING_TCD', 'NM_DISTRACTION', 'NM_IN_CROSSWALK', 'NM_LIGHTING',
       'NM_POWERED', 'NM_REFLECT', 'NON_MOTORIST', 'OWNER_DRIVER',
       'PARTIAL_VIN', 'PEOPLE_IN_UNIT', 'PRIN_IMP_PT', 'RDWY_ALIGNMENT',
       'SPECIAL_USAGE', 'TOW_IND', 'TRAVEL_DIRECTION', 'TRAVEL_SPD',
       'TRL_VEH_CNT', 'UNDER_RIDE_IND', 'UNIT_NUM', 'UNIT_TYPE',
       'VEH_COLOR_CD', 'VEH_MOVEMENT', 'VEH_POSITION', 'VEH_REG_STATE',
       'VEH_ROLE_CD', 'VEH_TYPE', 'VINA_BODY_TYPE_CD', 'rank_veh', 'AXLE_CNT',
       'CARGO_BD_TYPE', 'CARRIER_ADDR_1', 'CARRIER_ADDR_2',
       'CARRIER_ADDR_CITY', 'CARRIER_ADDR_STATE', 'CARRIER_ADDR_ZIP',
       'CARRIER_NM', 'CARRIER_TEL', 'GVWR', 'HAZMAT_CD1', 'HAZMAT_CD2',
       'HAZMAT_CD3', 'HAZMAT_CD4', 'HAZMAT_IND_y', 'HAZMAT_REL_IND1',
     

There are records with motorcycle data that correspond to records with commercial vehicle</BR>
Not sure if this makes sense

In [41]:
veh4.loc[veh4['rank_comm'].notna() & veh4['rank_cyc'].notna()].shape[0]

9

There are records with motorcycle data that correspond to records with trailing units</BR>
Not sure if this makes sense 

In [42]:
veh4.loc[veh4['rank_trail'].notna() & veh4['rank_cyc'].notna()].shape[0]

326

### <a id='person'>PERSON</a>

In [43]:
person.loc[~person['CRN'].isin(veh_crn)].shape[0]

0

In [44]:
person.columns

Index(['CRN', 'AGE', 'AIRBAG1', 'AIRBAG2', 'AIRBAG3', 'AIRBAG4', 'AIRBAG_PADS',
       'DVR_LIC_STATE', 'DVR_PED_CONDITION', 'EJECTION_IND', 'EJECT_PATH_CD',
       'EXTRIC_IND', 'INJ_SEVERITY', 'NON_MOTORIST', 'PERSON_NUM',
       'PERSON_TYPE', 'RESTRAINT_HELMET', 'SEAT_POSITION', 'SEX',
       'TRANSPORTED', 'TRANSPORTED_BY', 'UNIT_NUM', 'VULNERABLE_ROAD_USER'],
      dtype='str')

In [45]:
person.loc[:,['CRN','UNIT_NUM']].head()

Unnamed: 0,CRN,UNIT_NUM
0,2005066315,1
1,2005109861,1
2,2005185756,1
3,2005185756,2
4,2005015297,1


In [46]:
person['rank'] = person.groupby(['CRN']).cumcount() + 1

In [47]:
person['rank'].value_counts()

rank
1      2458815
2      1657658
3       772847
4       361788
5       174336
        ...   
126          2
127          2
128          1
129          1
130          1
Name: count, Length: 130, dtype: int64

### <a id='road'>ROADWAY</a>

In [48]:
road.loc[~road['CRN'].isin(veh_crn)].shape[0]

0

In [49]:
road.columns

Index(['CRN', 'ACCESS_CTRL', 'COUNTY', 'LANE_COUNT', 'OFFSET', 'RAMP',
       'RDWY_ORIENT', 'RDWY_SEQ_NUM', 'ROAD_OWNER', 'ROUTE', 'SEGMENT',
       'SPEED_LIMIT', 'STREET_NAME'],
      dtype='str')

In [50]:
road.loc[:,['CRN']].head()

Unnamed: 0,CRN
0,2005104346
1,2005109406
2,2005136005
3,2005146762
4,2005151993


In [51]:
road['rank'] = road.groupby(['CRN']).cumcount() + 1

In [52]:
road['rank'].value_counts()

rank
1     2461171
2      937326
3      299959
4       91204
5       22308
6        7145
7        1901
8         614
9         152
10         45
Name: count, dtype: int64

### <a id='crash'>CRASH</a>

In [53]:
crash_crn = crash['CRN'].unique().tolist()

In [54]:
crash.loc[~crash['CRN'].isin(veh_crn)].shape[0]

1

One crash does not involve any vehicles..?

In [55]:
crash.loc[~crash['CRN'].isin(veh_crn)]

Unnamed: 0,CRN,ARRIVAL_TM,AUTOMOBILE_COUNT,BELTED_DEATH_COUNT,BELTED_SUSP_SERIOUS_INJ_COUNT,BICYCLE_COUNT,BICYCLE_DEATH_COUNT,BICYCLE_SUSP_SERIOUS_INJ_COUNT,BUS_COUNT,CHLDPAS_DEATH_COUNT,...,WORK_ZONE_LOC,WORK_ZONE_TYPE,WZ_CLOSE_DETOUR,WZ_FLAGGER,WZ_LAW_OFFCR_IND,WZ_LN_CLOSURE,WZ_MOVING,WZ_OTHER,WZ_SHLDER_MDN,WZ_WORKERS_INJ_KILLED
7404,2008020159,,0,0,0,0,0,0,0,0,...,,,,,,,,,,


In [56]:
crash['rank'] = crash.groupby(['CRN']).cumcount() + 1

In [57]:
crash['rank'].value_counts()

rank
1    2461193
Name: count, dtype: int64

### <a id='flags'>FLAGS</a>

In [58]:
flags_crn = flags['CRN'].unique().tolist()

In [59]:
crash.loc[~crash['CRN'].isin(flags_crn)].shape[0]

0

In [60]:
flags.loc[~flags['CRN'].isin(crash_crn)].shape[0]

0

In [61]:
flags['rank'] = flags.groupby(['CRN']).cumcount() + 1

  flags['rank'] = flags.groupby(['CRN']).cumcount() + 1


In [62]:
flags['rank'].value_counts()

rank
1    2461193
Name: count, dtype: int64