## COVID Drivers: Data Preparation

### Table of Contents
* [Read the Data](#read)</BR>
* [Select Columns and Merge CRASH and FLAGS](#cols)</BR>
* [Select Rows with Vehicles in Transit or Hit and Run](#rows)</BR>
* [Create New Date Variables](#dates)</BR>
* [Calculate Driver Age Flags](#ages)</BR>
* [Decode Categorical Variables](#decode)</BR>
* [Prepare Data by Date](#prep-dt)</BR>
* [Write Final Aggregated Datasets to Files](#write)

Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import ruptures as rpt
import altair as alt

import statsmodels.api as sm

In [2]:
# Import project specific utilities
from utils.functions import *

### <a id='read'>Read the data</a>

In [3]:
crash = pd.DataFrame()
flags = pd.DataFrame()
person = pd.DataFrame()
vehicle = pd.DataFrame()

for folder in os.listdir(path_raw):
    #print(folder)
    year = folder[-4:]
    #print(year)
    if 'statewide' in folder.lower():
    #if year != '2020' and 'statewide' in folder.lower():
        tmp = read_data(folder, year, 'CRASH')
        crash = (tmp.copy() if crash.empty else crash.copy() if tmp.empty
            else pd.concat([crash, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'FLAGS')
        flags = (tmp.copy() if flags.empty else flags.copy() if tmp.empty
            else pd.concat([flags, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'PERSON')
        person = (tmp.copy() if person.empty else person.copy() if tmp.empty
            else pd.concat([person, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'VEHICLE')
        vehicle = (tmp.copy() if vehicle.empty else vehicle.copy() if tmp.empty
            else pd.concat([vehicle, tmp]) # if both DataFrames non empty
        )
            

### <a id='dates'>Create New Date Variables</a>

In [4]:
crash['POST_COVID'] = [1 if (i > 2020) or (i == 2020 and j >= 3) else 0 for (i, j) in zip(crash['CRASH_YEAR'], crash['CRASH_MONTH'])]

In [5]:
crash['CRASH_MN_NAME'] = pd.to_datetime(crash['CRASH_MONTH'], format='%m').dt.month_name().str[:3]

In [6]:
crash['CRASH_YEAR'] = crash['CRASH_YEAR'].astype('int64').astype('str')

In [7]:
crash['CRASH_DATE'] = crash.CRASH_YEAR.str.cat( crash.CRASH_MN_NAME, sep = '-' )

In [8]:
crash['CRASH_DATE'].head()

0    2005-Jan
1    2005-Jan
2    2005-Jan
3    2005-Jan
4    2005-Jan
Name: CRASH_DATE, dtype: str

In [9]:
crash['CRASH_DATE'] = pd.to_datetime( crash.CRASH_DATE )

  crash['CRASH_DATE'] = pd.to_datetime( crash.CRASH_DATE )


In [10]:
crash['CRASH_DATE'].head()

0   2005-01-01
1   2005-01-01
2   2005-01-01
3   2005-01-01
4   2005-01-01
Name: CRASH_DATE, dtype: datetime64[us]

### <a id='cols'>Select Columns and Merge CRASH and FLAGS</a>

In [11]:
crash_cols = ['CRN',
 'CRASH_DATE',
 'POST_COVID',
 'COUNTY',
 'URBAN_RURAL']

flags_cols = ['CRN',
 'AGGRESSIVE_DRIVING',
 'NHTSA_AGG_DRIVING',
 'NO_CLEARANCE',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'SPEEDING',
 'SPEEDING_RELATED',
 'TAILGATING']

per_cols = ['CRN',
            'UNIT_NUM',
            'PERSON_TYPE',
            'AGE']

veh_cols = ['CRN',
            'UNIT_NUM',
            'UNIT_TYPE']



In [12]:
df_init = pd.merge(crash.loc[:,crash_cols], flags.loc[:,flags_cols], on='CRN', how='left').copy()

### <a id='rows'>Select Rows with Vehicles in Transit or Hit and Run</a>

In [13]:
enum_dict['UNIT_TYPE']

{1: 'Motor vehicle in transport',
 2: 'Legally parked',
 3: 'Illegally parked',
 5: 'Hit and run vehicle',
 6: 'Disabled from a previous crash',
 21: 'Train',
 30: 'Non-Motorist',
 33: 'Personal Delivery Device',
 51: 'Phantom vehicle'}

In [14]:
moving = vehicle.loc[vehicle['UNIT_TYPE'].isin([1, 5])].copy()

In [15]:
select_crn = moving['CRN'].unique().tolist()

In [16]:
df = df_init.loc[df_init['CRN'].isin(select_crn)].copy()

In [17]:
df_init.shape[0]

2461193

In [18]:
df.shape[0]

2460484

### <a id='ages'>Calculate Driver Age Flags</a>

In [19]:
enum_dict['PERSON_TYPE']

{1: 'Driver',
 2: 'Passenger',
 4: 'Non-Motorist Operator',
 5: 'Non-Motorist Occupant',
 7: 'Pedestrian',
 8: 'Other',
 9: 'Unknown'}

Drviers of moving vehicles - in transit or hit and run

In [20]:
drivers = person.loc[(person['PERSON_TYPE']==1) & (person['CRN'].isin(select_crn)), ['CRN','PERSON_TYPE', 'AGE']].copy()

In [21]:
drivers.shape[0]

3995186

In [22]:
drivers.head()

Unnamed: 0,CRN,PERSON_TYPE,AGE
0,2005066315,1.0,26
1,2005109861,1.0,17
2,2005185756,1.0,46
3,2005185756,1.0,38
4,2005015297,1.0,47


In [23]:
drivers.AGE.isna().sum()

np.int64(0)

In [24]:
drivers['TRACK'] = [1 if (x >=16 and x <= 20) or (x >=50) else 0 for x in drivers['AGE']]

In [25]:
drivers2 = drivers.loc[(drivers['TRACK']==1)].copy()

In [26]:

drivers2['GROUP'] = drivers2['AGE']

In [27]:
drivers2['GROUP'].value_counts()

GROUP
18    124235
19    122532
20    117803
17    106744
99     86941
50     59175
51     58882
52     57347
53     56886
54     55333
55     53859
56     52260
57     50732
58     49509
59     47158
60     45376
61     42536
16     41759
62     40366
63     38049
64     35296
65     32553
66     30368
67     28587
68     26585
69     24935
70     22975
71     21840
72     20363
73     19209
74     18435
75     17143
76     16024
77     15273
78     14185
79     13263
80     12288
81     11524
82     10349
83      9425
84      8626
85      7520
86      6481
87      5484
88      4406
89      3483
90      2528
91      1986
92      1298
93       932
98       820
94       602
95       442
96       246
97       161
Name: count, dtype: int64

In [28]:
drivers2['GROUP'] = ['No Track' if (x >= 21 and x <= 49) else 'DRV_CNT_50_64YR' if (x >= 50 and x <= 64) else 'DRV_CNT_65_74YR' if (x >= 65 and x <= 74) else 'DRV_CNT_75PLUS' if x >= 75 else 'DRV_CNT_' + str(x) + 'YR' for x in drivers2['GROUP']]

In [29]:
drv_age_cnt = drivers2.groupby(['CRN','GROUP']).\
    aggregate(DRIVER_COUNTS=('CRN','count')).\
    reset_index().\
    copy()

In [30]:
drv_age_cnt

Unnamed: 0,CRN,GROUP,DRIVER_COUNTS
0,2005000003,DRV_CNT_18YR,1
1,2005000006,DRV_CNT_19YR,1
2,2005000010,DRV_CNT_16YR,1
3,2005000012,DRV_CNT_65_74YR,1
4,2005000020,DRV_CNT_17YR,1
...,...,...,...
1653985,2025043500,DRV_CNT_75PLUS,1
1653986,2025043942,DRV_CNT_50_64YR,1
1653987,2025045416,DRV_CNT_75PLUS,1
1653988,2025045750,DRV_CNT_75PLUS,1


In [31]:
drv_age_cnt.GROUP.value_counts()

GROUP
DRV_CNT_50_64YR    669464
DRV_CNT_75PLUS     243714
DRV_CNT_65_74YR    236085
DRV_CNT_18YR       122219
DRV_CNT_19YR       120957
DRV_CNT_20YR       116311
DRV_CNT_17YR       103942
DRV_CNT_16YR        41298
Name: count, dtype: int64

In [32]:
drv_age_cnt.CRN.nunique()

1414268

In [33]:
drv_age_cnt.shape[0]

1653990

In [34]:
drv_age_cnt2 = drv_age_cnt.pivot(index=['CRN'], values='DRIVER_COUNTS', columns='GROUP').reset_index().copy()

In [35]:
drv_age_cnt2

GROUP,CRN,DRV_CNT_16YR,DRV_CNT_17YR,DRV_CNT_18YR,DRV_CNT_19YR,DRV_CNT_20YR,DRV_CNT_50_64YR,DRV_CNT_65_74YR,DRV_CNT_75PLUS
0,2005000003,,,1.0,,,,,
1,2005000006,,,,1.0,,,,
2,2005000010,1.0,,,,,,,
3,2005000012,,,,,,,1.0,
4,2005000020,,1.0,,,1.0,,,
...,...,...,...,...,...,...,...,...,...
1414263,2025043500,,,,,,,,1.0
1414264,2025043942,,,,,,1.0,,
1414265,2025045416,,,,,,,,1.0
1414266,2025045750,,,,,,,,1.0


In [36]:
drv_age_cnt2.CRN.nunique()

1414268

In [37]:
drv_age_cnt2.shape[0]

1414268

In [38]:
drv_counts = [
       'DRV_CNT_16YR', 
       'DRV_CNT_17YR',
       'DRV_CNT_18YR',
       'DRV_CNT_19YR',
       'DRV_CNT_20YR',
       'DRV_CNT_50_64YR',
       'DRV_CNT_65_74YR',
       'DRV_CNT_75PLUS']

In [39]:
df2 = pd.merge(drv_age_cnt2, df, on=['CRN'], how='left').\
    copy()

In [40]:
df2.columns

Index(['CRN', 'DRV_CNT_16YR', 'DRV_CNT_17YR', 'DRV_CNT_18YR', 'DRV_CNT_19YR',
       'DRV_CNT_20YR', 'DRV_CNT_50_64YR', 'DRV_CNT_65_74YR', 'DRV_CNT_75PLUS',
       'CRASH_DATE', 'POST_COVID', 'COUNTY', 'URBAN_RURAL',
       'AGGRESSIVE_DRIVING', 'NHTSA_AGG_DRIVING', 'NO_CLEARANCE',
       'RUNNING_RED_LT', 'RUNNING_STOP_SIGN', 'SPEEDING', 'SPEEDING_RELATED',
       'TAILGATING'],
      dtype='str')

### <a id='decode'>Decode Categorical Variables</a>

In [41]:
df3 = df2.copy()

COUNTY

In [42]:
df3['COUNTYx'] = [enum_dict['COUNTY'][i] for i in df3['COUNTY']]

URBAN_RURAL

In [43]:
df3['URBAN_RURALx'] = [enum_dict['COUNTY'][i] for i in df3['URBAN_RURAL']]

### <a id='prep-dt'>Prepare Data by Date</a>

In [44]:
cols_to_keep = ['CRASH_DATE', 
            'DRV_CNT_16YR', 
            'DRV_CNT_17YR',
            'DRV_CNT_18YR', 
            'DRV_CNT_19YR', 
            'DRV_CNT_20YR',
            'DRV_CNT_50_64YR', 
            'DRV_CNT_65_74YR', 
            'DRV_CNT_75PLUS',
            'AGGRESSIVE_DRIVING', 
            'NHTSA_AGG_DRIVING', 
            'NO_CLEARANCE',
            'RUNNING_RED_LT', 
            'RUNNING_STOP_SIGN', 
            'SPEEDING',
            'SPEEDING_RELATED', 
            'TAILGATING',
            'POST_COVID']

In [45]:
df_dates = df3.loc[:,cols_to_keep].copy()

In [46]:
summ_idx = df_dates.set_index('CRASH_DATE')

In [47]:
summ_by_date = summ_idx.copy().resample('MS').mean()

### <a id='write'>Write Final Aggregated Datasets to Files</a>

In [48]:
ready_path = 'data/ready/'

In [49]:
summ_by_date.to_csv(ready_path + 'by_dates.csv', index=False)

In [50]:
#df3.columns.tolist()

In [51]:
out_df = df3.loc[:, ['CRN',
 'CRASH_DATE',
 'POST_COVID',
 'AGGRESSIVE_DRIVING',
 'NHTSA_AGG_DRIVING',
 'NO_CLEARANCE',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'SPEEDING',
 'SPEEDING_RELATED',
 'TAILGATING',
 'COUNTYx',
 'URBAN_RURALx',
 'DRV_CNT_16YR',
 'DRV_CNT_17YR',
 'DRV_CNT_18YR',
 'DRV_CNT_19YR',
 'DRV_CNT_20YR',
 'DRV_CNT_50_64YR',
 'DRV_CNT_65_74YR',
 'DRV_CNT_75PLUS']].\
 copy()

In [52]:
out_df.to_csv(ready_path + 'full_dataset.csv', index=False)