## COVID Drivers: Data Quality Assessment

### Table of Contents
* [Summary of Decisions](#summ)</BR>
* [Read the Data](#read)</BR>
* [Create New Date Variables](#dates)</BR>
* [Evaluate Variables](#eval)</BR>
* [Calculate Driver Counts and Compare](#drv-calc)

Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import ruptures as rpt
import altair as alt

import statsmodels.api as sm

In [2]:
# Import project specific utilities
from utils.functions import *

In [3]:
path_in = 'data/aux/selected_crash_and_flags.csv'

### <a id='read'>Read the data</a>

Read dataset created in covid_drivers_02_DESC.ipynb

In [4]:
df_init = pd.read_csv(path_in, low_memory=False)

Read PERSON and VEHICLE to evaluate DRIVER_COUNT_<age> and DRIVER_<age> variables

In [5]:
person = pd.DataFrame()
vehicle = pd.DataFrame()

for folder in os.listdir(path_raw):
    #print(folder)
    year = folder[-4:]
    #print(year)
    if 'statewide' in folder.lower():
    #if year != '2020' and 'statewide' in folder.lower():
        tmp = read_data(folder, year, 'PERSON')
        person = (tmp.copy() if person.empty else person.copy() if tmp.empty
            else pd.concat([person, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'VEHICLE')
        vehicle = (tmp.copy() if vehicle.empty else vehicle.copy() if tmp.empty
            else pd.concat([vehicle, tmp]) # if both DataFrames non empty
        )
            

### <a id='dates'>Create New Date Variables</a>

In [6]:
df = df_init.copy()

In [7]:
df['POST_COVID'] = [1 if (i > 2020) or (i == 2020 and j >= 3) else 0 for (i, j) in zip(df['CRASH_YEAR'], df['CRASH_MONTH'])]

In [8]:
df['CRASH_MN_NAME'] = pd.to_datetime(df['CRASH_MONTH'], format='%m').dt.month_name().str[:3]

In [9]:
df['CRASH_YEAR'] = df['CRASH_YEAR'].astype('int64').astype('str')

In [10]:
df['CRASH_DATE'] = df.CRASH_YEAR.str.cat( df.CRASH_MN_NAME, sep = '-' )

In [11]:
df['CRASH_DATE'].head()

0    2005-Jan
1    2005-Jan
2    2005-Jan
3    2005-Jan
4    2005-Jan
Name: CRASH_DATE, dtype: str

In [12]:
df['CRASH_DATE'] = pd.to_datetime( df.CRASH_DATE )

  df['CRASH_DATE'] = pd.to_datetime( df.CRASH_DATE )


In [13]:
df['CRASH_DATE'].head()

0   2005-01-01
1   2005-01-01
2   2005-01-01
3   2005-01-01
4   2005-01-01
Name: CRASH_DATE, dtype: datetime64[us]

In [14]:
date_cols = ['CRASH_DATE', 
             'CRASH_MONTH', 
             'CRASH_MN_NAME', 
             'CRASH_YEAR', 
             'POST_COVID']

In [15]:
outcome = ['POST_COVID']

### <a id='eval'>Evaluate Variables</a>

In [16]:
df.columns.tolist()

['COUNTY',
 'CRASH_MONTH',
 'CRASH_YEAR',
 'CRN',
 'DRIVER_COUNT_16YR',
 'DRIVER_COUNT_17YR',
 'DRIVER_COUNT_18YR',
 'DRIVER_COUNT_19YR',
 'DRIVER_COUNT_20YR',
 'DRIVER_COUNT_50_64YR',
 'DRIVER_COUNT_65_74YR',
 'DRIVER_COUNT_75PLUS',
 'URBAN_RURAL',
 'AGGRESSIVE_DRIVING',
 'DRIVER_16YR',
 'DRIVER_17YR',
 'DRIVER_18YR',
 'DRIVER_19YR',
 'DRIVER_20YR',
 'DRIVER_50_64YR',
 'DRIVER_65_74YR',
 'DRIVER_75PLUS',
 'NHTSA_AGG_DRIVING',
 'NO_CLEARANCE',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'SPEEDING',
 'SPEEDING_RELATED',
 'TAILGATING',
 'COUNTYx',
 'URBAN_RURALx',
 'POST_COVID',
 'CRASH_MN_NAME',
 'CRASH_DATE']

In [17]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 2461193 entries, 0 to 2461192
Data columns (total 34 columns):
 #   Column                Dtype         
---  ------                -----         
 0   COUNTY                int64         
 1   CRASH_MONTH           int64         
 2   CRASH_YEAR            str           
 3   CRN                   int64         
 4   DRIVER_COUNT_16YR     int64         
 5   DRIVER_COUNT_17YR     int64         
 6   DRIVER_COUNT_18YR     int64         
 7   DRIVER_COUNT_19YR     int64         
 8   DRIVER_COUNT_20YR     int64         
 9   DRIVER_COUNT_50_64YR  int64         
 10  DRIVER_COUNT_65_74YR  int64         
 11  DRIVER_COUNT_75PLUS   int64         
 12  URBAN_RURAL           int64         
 13  AGGRESSIVE_DRIVING    int64         
 14  DRIVER_16YR           int64         
 15  DRIVER_17YR           int64         
 16  DRIVER_18YR           int64         
 17  DRIVER_19YR           int64         
 18  DRIVER_20YR           int64         
 19  DRIVER_50_6

In [18]:
df.COUNTY.nunique()

67

In [19]:
df.COUNTY.value_counts()

COUNTY
2     236677
67    214621
46    170534
9     120646
36    111555
       ...  
47      4182
52      2672
56      1522
27      1383
12       992
Name: count, Length: 67, dtype: int64

#### Evaluate Variables from FLAGS

In [20]:
flags_cols = ['AGGRESSIVE_DRIVING',
'CRN',
'DRIVER_16YR',
'DRIVER_17YR',
'DRIVER_18YR',
'DRIVER_19YR',
'DRIVER_20YR',
'DRIVER_50_64YR',
'DRIVER_65_74YR',
'DRIVER_75PLUS',
'NHTSA_AGG_DRIVING',
'NO_CLEARANCE',
'RUNNING_RED_LT',
'RUNNING_STOP_SIGN',
'SPEEDING',
'SPEEDING_RELATED',
'TAILGATING']

In [21]:
for c in flags_cols:
    miss = df[c].isna().sum()
    pct = round((df.loc[df[c].isna()].shape[0]/df.shape[0])*100, 2)
    if miss > 0:
        print()
        print(c)
        print(str(miss) + ' missing')
        print(str(pct) + '% missing')
        print(df[c].value_counts())
        print()
    else:
        print(c + ' has no missing values')

AGGRESSIVE_DRIVING has no missing values
CRN has no missing values
DRIVER_16YR has no missing values
DRIVER_17YR has no missing values
DRIVER_18YR has no missing values
DRIVER_19YR has no missing values
DRIVER_20YR has no missing values
DRIVER_50_64YR has no missing values
DRIVER_65_74YR has no missing values
DRIVER_75PLUS has no missing values
NHTSA_AGG_DRIVING has no missing values
NO_CLEARANCE has no missing values
RUNNING_RED_LT has no missing values
RUNNING_STOP_SIGN has no missing values
SPEEDING has no missing values
SPEEDING_RELATED has no missing values
TAILGATING has no missing values


#### Evaluate month and year variables

In [22]:
date_cols = ['CRASH_MONTH', 'CRASH_YEAR']

In [23]:
for c in date_cols:
    print(c + ': ' + str(df[c].isna().sum()))

CRASH_MONTH: 0
CRASH_YEAR: 0


In [24]:
df['CRASH_YEAR'].value_counts()

CRASH_YEAR
2005    134261
2007    132152
2016    129607
2006    129253
2018    128541
2017    128441
2015    127470
2008    126184
2011    125616
2019    125452
2012    124501
2013    124366
2009    121794
2010    121612
2014    121547
2021    118100
2022    116147
2024    110813
2023    110736
2020    104600
Name: count, dtype: int64

In [25]:
for c in date_cols:
    print(c)
    print(df[c].isna().sum())
    print()

CRASH_MONTH
0

CRASH_YEAR
0



#### Evaluate location variables

In [26]:
loc_cols = ['COUNTY',
 'URBAN_RURAL']

In [27]:
for c in loc_cols:
    print(c)
    print(df[c].isna().sum())
    print()

COUNTY
0

URBAN_RURAL
0



#### Evaluate aggressive driving variables

In [28]:
agg_cols = [
'AGGRESSIVE_DRIVING',
'NHTSA_AGG_DRIVING',
'NO_CLEARANCE',
'RUNNING_RED_LT',
'RUNNING_STOP_SIGN',
'SPEEDING',
'SPEEDING_RELATED',
'TAILGATING'
]

In [29]:
for c in agg_cols:
    miss = df[c].isna().sum()
    pct = round((df.loc[df[c].isna()].shape[0]/df.shape[0])*100, 2)
    if miss > 0:
        print()
        print(c)
        print(str(miss) + ' missing')
        print(str(pct) + '% missing')
        print(df[c].value_counts())
        print()
    else:
        print(c + ' has no missing values')

AGGRESSIVE_DRIVING has no missing values
NHTSA_AGG_DRIVING has no missing values
NO_CLEARANCE has no missing values
RUNNING_RED_LT has no missing values
RUNNING_STOP_SIGN has no missing values
SPEEDING has no missing values
SPEEDING_RELATED has no missing values
TAILGATING has no missing values


### <a id='drv-calc'>Calculate Driver Counts and Compare</a>

Select Rows with Vehicles in Transit or Hit and Run

In [30]:
enum_dict['UNIT_TYPE']

{1: 'Motor vehicle in transport',
 2: 'Legally parked',
 3: 'Illegally parked',
 5: 'Hit and run vehicle',
 6: 'Disabled from a previous crash',
 21: 'Train',
 30: 'Non-Motorist',
 33: 'Personal Delivery Device',
 51: 'Phantom vehicle'}

In [31]:
moving = vehicle.loc[vehicle['UNIT_TYPE'].isin([1, 5])].copy()

In [32]:
select_crn = moving['CRN'].unique().tolist()

In [33]:
df2 = df.loc[df_init['CRN'].isin(select_crn)].copy()

In [34]:
df.shape[0]

2461193

In [35]:
df2.shape[0]

2460484

Limit to persons driving

In [36]:
enum_dict['PERSON_TYPE']

{1: 'Driver',
 2: 'Passenger',
 4: 'Non-Motorist Operator',
 5: 'Non-Motorist Occupant',
 7: 'Pedestrian',
 8: 'Other',
 9: 'Unknown'}

In [37]:
drivers = person.loc[(person['PERSON_TYPE']==1) & (person['CRN'].isin(select_crn)), ['CRN','PERSON_TYPE', 'AGE']].copy()

In [38]:
drivers.shape[0]

3995186

In [39]:
drivers.head()

Unnamed: 0,CRN,PERSON_TYPE,AGE
0,2005066315,1.0,26
1,2005109861,1.0,17
2,2005185756,1.0,46
3,2005185756,1.0,38
4,2005015297,1.0,47


In [40]:
drivers.AGE.isna().sum()

np.int64(0)

In [41]:
drivers['TRACK'] = [1 if (x >=16 and x <= 20) or (x >=50) else 0 for x in drivers['AGE']]

In [42]:

drivers['GROUP'] = drivers['AGE']

In [43]:
drivers['GROUP'].value_counts()

GROUP
18    124235
19    122532
21    119626
20    117803
22    116074
       ...  
7         30
2         23
3         22
4         21
6         18
Name: count, Length: 99, dtype: int64

In [44]:
drivers['GROUP'] = ['No Track' if (x <= 15) or (x >= 21 and x <= 49) else 'CNT_50_64YR' if (x >= 50 and x <= 64) else 'CNT_65_74YR' if (x >= 65 and x <= 74) else 'CNT_75PLUS' if x >= 75 else 'CNT_' + str(x) + 'YR' for x in drivers['GROUP']]

In [45]:
drv_age_cnt = drivers.groupby(['CRN','GROUP']).\
    aggregate(DRIVER_COUNTS=('CRN','count')).\
    reset_index().\
    copy()

In [46]:
drv_age_cnt

Unnamed: 0,CRN,GROUP,DRIVER_COUNTS
0,2005000003,CNT_18YR,1
1,2005000006,CNT_19YR,1
2,2005000010,CNT_16YR,1
3,2005000012,CNT_65_74YR,1
4,2005000012,No Track,1
...,...,...,...
3375603,2025047213,No Track,1
3375604,2025047762,No Track,2
3375605,2025048164,No Track,1
3375606,2025049635,No Track,2


In [47]:
drv_age_cnt.GROUP.value_counts()

GROUP
No Track       1721618
CNT_50_64YR     669464
CNT_75PLUS      243714
CNT_65_74YR     236085
CNT_18YR        122219
CNT_19YR        120957
CNT_20YR        116311
CNT_17YR        103942
CNT_16YR         41298
Name: count, dtype: int64

In [48]:
drv_age_cnt.CRN.nunique()

2451586

In [49]:
drv_age_cnt.shape[0]

3375608

In [50]:
drv_age_cnt2 = drv_age_cnt.pivot(index=['CRN'], values='DRIVER_COUNTS', columns='GROUP').reset_index().copy()

In [51]:
drv_age_cnt2

GROUP,CRN,CNT_16YR,CNT_17YR,CNT_18YR,CNT_19YR,CNT_20YR,CNT_50_64YR,CNT_65_74YR,CNT_75PLUS,No Track
0,2005000003,,,1.0,,,,,,
1,2005000006,,,,1.0,,,,,
2,2005000010,1.0,,,,,,,,
3,2005000012,,,,,,,1.0,,1.0
4,2005000013,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...
2451581,2025047213,,,,,,,,,1.0
2451582,2025047762,,,,,,,,,2.0
2451583,2025048164,,,,,,,,,1.0
2451584,2025049635,,,,,,,,,2.0


In [52]:
df2.columns

Index(['COUNTY', 'CRASH_MONTH', 'CRASH_YEAR', 'CRN', 'DRIVER_COUNT_16YR',
       'DRIVER_COUNT_17YR', 'DRIVER_COUNT_18YR', 'DRIVER_COUNT_19YR',
       'DRIVER_COUNT_20YR', 'DRIVER_COUNT_50_64YR', 'DRIVER_COUNT_65_74YR',
       'DRIVER_COUNT_75PLUS', 'URBAN_RURAL', 'AGGRESSIVE_DRIVING',
       'DRIVER_16YR', 'DRIVER_17YR', 'DRIVER_18YR', 'DRIVER_19YR',
       'DRIVER_20YR', 'DRIVER_50_64YR', 'DRIVER_65_74YR', 'DRIVER_75PLUS',
       'NHTSA_AGG_DRIVING', 'NO_CLEARANCE', 'RUNNING_RED_LT',
       'RUNNING_STOP_SIGN', 'SPEEDING', 'SPEEDING_RELATED', 'TAILGATING',
       'COUNTYx', 'URBAN_RURALx', 'POST_COVID', 'CRASH_MN_NAME', 'CRASH_DATE'],
      dtype='str')

In [53]:
drv_age_cnt2.columns

Index(['CRN', 'CNT_16YR', 'CNT_17YR', 'CNT_18YR', 'CNT_19YR', 'CNT_20YR',
       'CNT_50_64YR', 'CNT_65_74YR', 'CNT_75PLUS', 'No Track'],
      dtype='str', name='GROUP')

In [54]:
derived_cnt = ['CNT_16YR', 
       'CNT_17YR',
       'CNT_18YR',
       'CNT_19YR',
       'CNT_20YR', 
       'CNT_50_64YR',
       'CNT_65_74YR',
       'CNT_75PLUS']

crash_cnt = ['DRIVER_COUNT_16YR', 
       'DRIVER_COUNT_17YR',
       'DRIVER_COUNT_18YR',
       'DRIVER_COUNT_19YR',
       'DRIVER_COUNT_20YR',
       'DRIVER_COUNT_50_64YR',
       'DRIVER_COUNT_65_74YR',
       'DRIVER_COUNT_75PLUS']

ages = [16, 17, 18, 19, 20, 50, 65, 75]

drv_counts = ['DRIVER_COUNT_16YR',
       'CNT_16YR', 
       'DRIVER_COUNT_17YR',
       'CNT_17YR',
       'DRIVER_COUNT_18YR', 
       'CNT_18YR',
       'DRIVER_COUNT_19YR', 
       'CNT_19YR',
       'DRIVER_COUNT_20YR',
       'CNT_20YR',
       'DRIVER_COUNT_50_64YR', 
       'CNT_50_64YR',
       'DRIVER_COUNT_65_74YR', 
       'CNT_65_74YR',
       'DRIVER_COUNT_75PLUS',
       'CNT_75PLUS']

In [55]:
drv_age_cnt3 = pd.merge(drv_age_cnt2, df2, on=['CRN'], how='left').\
    loc[:,['CRN', 'CRASH_DATE'] + drv_counts].\
    copy()

In [56]:
diff_cols = []

for i in range(0, len(ages)):
    drv_age_cnt3['DIFF_' + str(ages[i])] = [i - j for (i, j) in zip(drv_age_cnt3[crash_cnt[i]], drv_age_cnt3[derived_cnt[i]])]
    diff_cols.append('DIFF_' + str(ages[i]))
    print(diff_cols)

['DIFF_16']
['DIFF_16', 'DIFF_17']
['DIFF_16', 'DIFF_17', 'DIFF_18']
['DIFF_16', 'DIFF_17', 'DIFF_18', 'DIFF_19']
['DIFF_16', 'DIFF_17', 'DIFF_18', 'DIFF_19', 'DIFF_20']
['DIFF_16', 'DIFF_17', 'DIFF_18', 'DIFF_19', 'DIFF_20', 'DIFF_50']
['DIFF_16', 'DIFF_17', 'DIFF_18', 'DIFF_19', 'DIFF_20', 'DIFF_50', 'DIFF_65']
['DIFF_16', 'DIFF_17', 'DIFF_18', 'DIFF_19', 'DIFF_20', 'DIFF_50', 'DIFF_65', 'DIFF_75']


In [57]:
drv_age_cnt3.head()

Unnamed: 0,CRN,CRASH_DATE,DRIVER_COUNT_16YR,CNT_16YR,DRIVER_COUNT_17YR,CNT_17YR,DRIVER_COUNT_18YR,CNT_18YR,DRIVER_COUNT_19YR,CNT_19YR,...,DRIVER_COUNT_75PLUS,CNT_75PLUS,DIFF_16,DIFF_17,DIFF_18,DIFF_19,DIFF_20,DIFF_50,DIFF_65,DIFF_75
0,2005000003,2005-01-01,0,,0,,1,1.0,0,,...,0,,,,0.0,,,,,
1,2005000006,2005-01-01,0,,0,,0,,1,1.0,...,0,,,,,0.0,,,,
2,2005000010,2005-01-01,1,1.0,0,,0,,0,,...,0,,0.0,,,,,,,
3,2005000012,2005-01-01,0,,0,,0,,0,,...,0,,,,,,,,0.0,
4,2005000013,2005-01-01,0,,0,,0,,0,,...,0,,,,,,,,,


In [58]:
drv_age_cnt3['TOTAL_DIFF'] = drv_age_cnt3[diff_cols].sum(axis=1)

In [59]:
drv_age_cnt3['TOTAL_DIFF'].unique()

array([ 0., -1., -2., -3., -4.])

In [60]:
drv_age_cnt3.loc[drv_age_cnt3['TOTAL_DIFF']!=0]

Unnamed: 0,CRN,CRASH_DATE,DRIVER_COUNT_16YR,CNT_16YR,DRIVER_COUNT_17YR,CNT_17YR,DRIVER_COUNT_18YR,CNT_18YR,DRIVER_COUNT_19YR,CNT_19YR,...,CNT_75PLUS,DIFF_16,DIFF_17,DIFF_18,DIFF_19,DIFF_20,DIFF_50,DIFF_65,DIFF_75,TOTAL_DIFF
19,2005000056,2005-01-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0
79,2005001449,2005-01-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0
302,2005004715,2005-01-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0
524,2005005500,2005-01-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0
871,2005011903,2005-01-01,0,,0,,0,,0,,...,1.0,,,,,,0.0,,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451565,2025043247,2024-07-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0
2451571,2025043498,2024-08-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0
2451573,2025043500,2024-08-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0
2451576,2025045416,2024-08-01,0,,0,,0,,0,,...,1.0,,,,,,,,-1.0,-1.0


In [61]:
drv_age_cnt3.loc[drv_age_cnt3['TOTAL_DIFF']!=0].to_csv('data/aux/age_grp_diff_by_crn.csv', index=False)

In [62]:
summ = drv_age_cnt3.groupby(['CRASH_DATE']).\
    aggregate(TOTAL_CRASHES=('CRN','count')).\
    reset_index().\
    copy()

In [63]:
summ

Unnamed: 0,CRASH_DATE,TOTAL_CRASHES
0,2005-01-01,12516
1,2005-02-01,11060
2,2005-03-01,10617
3,2005-04-01,10216
4,2005-05-01,10818
...,...,...
235,2024-08-01,8725
236,2024-09-01,8702
237,2024-10-01,10642
238,2024-11-01,10240


In [64]:
for c in drv_counts:
    summ = pd.concat([summ, summarize(drv_age_cnt3, 'CRASH_DATE', c, agg_func='sum')], axis=1).copy()

In [None]:
summ

In [None]:
summ.to_csv('data/aux/age_grp_diff_by_dt.csv', index=False)