## COVID Drivers: Data Quality Assessment

### Table of Contents
* [Summary of Decisions](#summ)<br>
* [Read the Data](#read)<br>
* [Evaluate Variables](#eval)<br>
* [Create Final Dataframe](#final)<br>
* [Write Final Dataset to File](#write)

Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import ruptures as rpt
import altair as alt

import statsmodels.api as sm

In [2]:
# Import project specific utilities
from utils.functions import *

In [3]:
# Relative path of the input CSV that is loaded in the "Read the data" section.
# NOTE(review): `aux` is a reserved device name on Windows, so this directory
# may not be creatable there — confirm the platforms this notebook must run on.
path_in = 'data/aux/selected_crash_and_flags.csv'

### <a id='read'>Read the data</a>

In [4]:
# Load the whole CSV into memory. low_memory=False makes pandas parse the file
# in a single pass rather than in chunks, which avoids mixed-dtype inference
# within a column.
df = pd.read_csv(path_in, low_memory=False)

### <a id='eval'>Evaluate Variables</a>

In [5]:
# Show every column name in the loaded frame as a plain list.
list(df.columns)

['COUNTY',
 'CRASH_MONTH',
 'CRASH_YEAR',
 'CRN',
 'DAY_OF_WEEK',
 'DRIVER_COUNT_16YR',
 'DRIVER_COUNT_17YR',
 'DRIVER_COUNT_18YR',
 'DRIVER_COUNT_19YR',
 'DRIVER_COUNT_20YR',
 'DRIVER_COUNT_50_64YR',
 'DRIVER_COUNT_65_74YR',
 'DRIVER_COUNT_75PLUS',
 'ILLUMINATION',
 'INTERSECT_TYPE',
 'LOCATION_TYPE',
 'ROAD_CONDITION',
 'SECONDARY_CRASH',
 'URBAN_RURAL',
 'WEATHER1',
 'WEATHER2',
 'AGGRESSIVE_DRIVING',
 'ALCOHOL_RELATED',
 'CELL_PHONE',
 'CROSS_MEDIAN',
 'CURVE_DVR_ERROR',
 'CURVED_ROAD',
 'DISTRACTED',
 'DRINKING_DRIVER',
 'DRIVER_16YR',
 'DRIVER_17YR',
 'DRIVER_18YR',
 'DRIVER_19YR',
 'DRIVER_20YR',
 'DRIVER_50_64YR',
 'DRIVER_65_74YR',
 'DRIVER_75PLUS',
 'DRUG_RELATED',
 'DRUGGED_DRIVER',
 'FATIGUE_ASLEEP',
 'HIT_RUN',
 'ICY_ROAD',
 'ILLEGAL_DRUG_RELATED',
 'ILLUMINATION_DARK',
 'IMPAIRED_DRIVER',
 'IMPAIRED_NONMOTORIST',
 'INTERSECTION',
 'LANE_DEPARTURE',
 'MARIJUANA_DRUGGED_DRIVER',
 'MARIJUANA_RELATED',
 'MATURE_DRIVER',
 'MC_DRINKING_DRIVER',
 'NHTSA_AGG_DRIVING',
 'NON_INTER

In [6]:
# Print the frame summary: index range, column count, and each column's dtype.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 2461193 entries, 0 to 2461192
Data columns (total 85 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   COUNTY                    int64  
 1   CRASH_MONTH               int64  
 2   CRASH_YEAR                int64  
 3   CRN                       int64  
 4   DAY_OF_WEEK               int64  
 5   DRIVER_COUNT_16YR         int64  
 6   DRIVER_COUNT_17YR         int64  
 7   DRIVER_COUNT_18YR         int64  
 8   DRIVER_COUNT_19YR         int64  
 9   DRIVER_COUNT_20YR         int64  
 10  DRIVER_COUNT_50_64YR      int64  
 11  DRIVER_COUNT_65_74YR      int64  
 12  DRIVER_COUNT_75PLUS       int64  
 13  ILLUMINATION              float64
 14  INTERSECT_TYPE            int64  
 15  LOCATION_TYPE             int64  
 16  ROAD_CONDITION            int64  
 17  SECONDARY_CRASH           str    
 18  URBAN_RURAL               int64  
 19  WEATHER1                  float64
 20  WEATHER2                  float64
 

In [7]:
# Number of distinct COUNTY codes present in the data.
df['COUNTY'].nunique()

67

In [8]:
# Row count for each COUNTY code, most frequent first.
df['COUNTY'].value_counts()

COUNTY
2     236677
67    214621
46    170534
9     120646
36    111555
       ...  
47      4182
52      2672
56      1522
27      1383
12       992
Name: count, Length: 67, dtype: int64

#### Evaluate Variables from FLAGS

In [9]:
# Columns checked for missing values in the FLAGS evaluation below.
# NOTE(review): LANE_DEPARTURE is present in df.columns but is not listed
# here — confirm whether the omission is intentional.
flags_cols = [
    'AGGRESSIVE_DRIVING',
    'ALCOHOL_RELATED',
    'CELL_PHONE',
    'CRN',
    'CROSS_MEDIAN',
    'CURVE_DVR_ERROR',
    'CURVED_ROAD',
    'DISTRACTED',
    'DRINKING_DRIVER',
    'DRIVER_16YR',
    'DRIVER_17YR',
    'DRIVER_18YR',
    'DRIVER_19YR',
    'DRIVER_20YR',
    'DRIVER_50_64YR',
    'DRIVER_65_74YR',
    'DRIVER_75PLUS',
    'DRUG_RELATED',
    'DRUGGED_DRIVER',
    'FATIGUE_ASLEEP',
    'HIT_RUN',
    'ICY_ROAD',
    'ILLEGAL_DRUG_RELATED',
    'ILLUMINATION_DARK',
    'IMPAIRED_DRIVER',
    'IMPAIRED_NONMOTORIST',
    'INTERSECTION',
    'MARIJUANA_DRUGGED_DRIVER',
    'MARIJUANA_RELATED',
    'MATURE_DRIVER',
    'MC_DRINKING_DRIVER',
    'NHTSA_AGG_DRIVING',
    'NON_INTERSECTION',
    'OPIOID_RELATED',
    'RAMP',
    'RAMP_SEGMENT',
    'RAMP_TERMINAL',
    'ROUNDABOUT',
    'RUNNING_RED_LT',
    'RUNNING_STOP_SIGN',
    'RURAL',
    'SIGNALIZED_INT',
    'SNOW_SLUSH_ROAD',
    'SPEED_CHANGE_LANE',
    'SPEEDING',
    'SPEEDING_RELATED',
    'STOP_CONTROLLED_INT',
    'SUDDEN_DEER',
    'TAILGATING',
    'UNDERAGE_DRNK_DRV',
    'UNLICENSED',
    'UNSIGNALIZED_INT',
    'URBAN',
    'WET_ROAD',
    'YOUNG_DRIVER',
]

In [10]:
# Report missingness for every FLAGS column: a one-line confirmation when the
# column is complete, otherwise the null count, the null percentage and the
# distribution of the non-null values.
for c in flags_cols:
    # Count nulls once and reuse the count for the percentage, instead of
    # filtering the whole frame a second time just to measure its length.
    # int() keeps the printed text identical to a plain Python integer.
    miss = int(df[c].isna().sum())
    if miss == 0:
        # Skipping here also keeps an empty frame from dividing by zero below.
        print(c + ' has no missing values')
        continue
    pct = round((miss / df.shape[0]) * 100, 2)
    print()
    print(c)
    print(str(miss) + ' missing')
    print(str(pct) + '% missing')
    print(df[c].value_counts())
    print()

AGGRESSIVE_DRIVING has no missing values
ALCOHOL_RELATED has no missing values
CELL_PHONE has no missing values
CRN has no missing values
CROSS_MEDIAN has no missing values
CURVE_DVR_ERROR has no missing values
CURVED_ROAD has no missing values
DISTRACTED has no missing values
DRINKING_DRIVER has no missing values
DRIVER_16YR has no missing values
DRIVER_17YR has no missing values
DRIVER_18YR has no missing values
DRIVER_19YR has no missing values
DRIVER_20YR has no missing values
DRIVER_50_64YR has no missing values
DRIVER_65_74YR has no missing values
DRIVER_75PLUS has no missing values
DRUG_RELATED has no missing values
DRUGGED_DRIVER has no missing values
FATIGUE_ASLEEP has no missing values
HIT_RUN has no missing values
ICY_ROAD has no missing values
ILLEGAL_DRUG_RELATED has no missing values
ILLUMINATION_DARK has no missing values
IMPAIRED_DRIVER has no missing values
IMPAIRED_NONMOTORIST has no missing values
INTERSECTION has no missing values
MARIJUANA_DRUGGED_DRIVER has no mis

#### Evaluate date variables (month, year, and day of week)

In [11]:
# Calendar columns carried into the final dataset.
date_cols = [
    "CRASH_MONTH",
    "CRASH_YEAR",
    "DAY_OF_WEEK",
]

In [12]:
# One line per date column: "<name>: <number of nulls>".
for col in date_cols:
    n_null = df[col].isna().sum()
    print(f"{col}: {n_null}")

CRASH_MONTH: 0
CRASH_YEAR: 0
DAY_OF_WEEK: 0


In [13]:
# Row count for each CRASH_YEAR value, most frequent first.
df['CRASH_YEAR'].value_counts()

CRASH_YEAR
2005    134261
2007    132152
2016    129607
2006    129253
2018    128541
2017    128441
2015    127470
2008    126184
2011    125616
2019    125452
2012    124501
2013    124366
2009    121794
2010    121612
2014    121547
2021    118100
2022    116147
2024    110813
2023    110736
2020    104600
Name: count, dtype: int64

In [14]:
# For each date column print its name, its null count, and a blank separator,
# each on its own line.
for col in date_cols:
    print(col, df[col].isna().sum(), '', sep='\n')

CRASH_MONTH
0

CRASH_YEAR
0

DAY_OF_WEEK
0



#### Evaluate location variables

In [15]:
# Location columns carried into the final dataset.
loc_cols = [
    "COUNTY",
    "URBAN_RURAL",
]

In [16]:
# For each location column print its name, then its null count, then a blank line.
for col in loc_cols:
    n_null = df[col].isna().sum()
    print(f"{col}\n{n_null}\n")

COUNTY
0

URBAN_RURAL
0



#### Evaluate condition variables

In [17]:
# Candidate condition columns to screen for missing values.
cond_cols = [
    'ILLUMINATION',
    'ILLUMINATION_DARK',
    'INTERSECT_TYPE',
    'INTERSECTION',
    'NON_INTERSECTION',
    'LOCATION_TYPE',
    'ROAD_CONDITION',
    'WEATHER1',
    'WEATHER2',
]

In [18]:
# Report missingness for every condition column: a one-line confirmation when
# the column is complete, otherwise the null count, the null percentage and
# the distribution of the non-null values.
for c in cond_cols:
    # Count nulls once and reuse the count for the percentage, instead of
    # filtering the whole frame a second time just to measure its length.
    # int() keeps the printed text identical to a plain Python integer.
    miss = int(df[c].isna().sum())
    if miss == 0:
        # Skipping here also keeps an empty frame from dividing by zero below.
        print(c + ' has no missing values')
        continue
    pct = round((miss / df.shape[0]) * 100, 2)
    print()
    print(c)
    print(str(miss) + ' missing')
    print(str(pct) + '% missing')
    print(df[c].value_counts())
    print()


ILLUMINATION
2448908 missing
99.5% missing
ILLUMINATION
1.0    7379
3.0    2414
2.0    1940
5.0     232
4.0     229
6.0      72
8.0      19
Name: count, dtype: int64

ILLUMINATION_DARK has no missing values
INTERSECT_TYPE has no missing values
INTERSECTION has no missing values
NON_INTERSECTION has no missing values
LOCATION_TYPE has no missing values
ROAD_CONDITION has no missing values

WEATHER1
2315023 missing
94.06% missing
WEATHER1
3.0     111621
7.0      17887
4.0       7732
10.0      5259
2.0       1011
5.0        966
99.0       577
6.0        558
98.0       337
9.0        145
8.0         66
1.0         11
Name: count, dtype: int64


WEATHER2
2161825 missing
87.84% missing
WEATHER2
3.0     228624
4.0      21461
7.0      20253
5.0      10638
10.0      6302
98.0      5103
2.0       2718
6.0       2323
9.0       1120
8.0        669
1.0        157
Name: count, dtype: int64



In [19]:
# Narrow the condition columns to those with no missing values. ILLUMINATION,
# WEATHER1 and WEATHER2 are left out: the check above reported them as 99.5%,
# 94.06% and 87.84% missing respectively.
cond_cols = [
    'ILLUMINATION_DARK',
    'INTERSECT_TYPE',
    'INTERSECTION',
    'NON_INTERSECTION',
    'LOCATION_TYPE',
    'ROAD_CONDITION',
]

#### Evaluate driver age variables

In [20]:
# Driver-age columns: per-age driver counts first, then the matching
# DRIVER_* columns, then the MATURE_DRIVER / YOUNG_DRIVER columns.
driv_cols = [
    'DRIVER_COUNT_16YR',
    'DRIVER_COUNT_17YR',
    'DRIVER_COUNT_18YR',
    'DRIVER_COUNT_19YR',
    'DRIVER_COUNT_20YR',
    'DRIVER_COUNT_50_64YR',
    'DRIVER_COUNT_65_74YR',
    'DRIVER_COUNT_75PLUS',
    'DRIVER_16YR',
    'DRIVER_17YR',
    'DRIVER_18YR',
    'DRIVER_19YR',
    'DRIVER_20YR',
    'DRIVER_50_64YR',
    'DRIVER_65_74YR',
    'DRIVER_75PLUS',
    'MATURE_DRIVER',
    'YOUNG_DRIVER',
]

In [21]:
# Report missingness for every driver-age column: a one-line confirmation when
# the column is complete, otherwise the null count, the null percentage and
# the distribution of the non-null values.
for c in driv_cols:
    # Count nulls once and reuse the count for the percentage, instead of
    # filtering the whole frame a second time just to measure its length.
    # int() keeps the printed text identical to a plain Python integer.
    miss = int(df[c].isna().sum())
    if miss == 0:
        # Skipping here also keeps an empty frame from dividing by zero below.
        print(c + ' has no missing values')
        continue
    pct = round((miss / df.shape[0]) * 100, 2)
    print()
    print(c)
    print(str(miss) + ' missing')
    print(str(pct) + '% missing')
    print(df[c].value_counts())
    print()

DRIVER_COUNT_16YR has no missing values
DRIVER_COUNT_17YR has no missing values
DRIVER_COUNT_18YR has no missing values
DRIVER_COUNT_19YR has no missing values
DRIVER_COUNT_20YR has no missing values
DRIVER_COUNT_50_64YR has no missing values
DRIVER_COUNT_65_74YR has no missing values
DRIVER_COUNT_75PLUS has no missing values
DRIVER_16YR has no missing values
DRIVER_17YR has no missing values
DRIVER_18YR has no missing values
DRIVER_19YR has no missing values
DRIVER_20YR has no missing values
DRIVER_50_64YR has no missing values
DRIVER_65_74YR has no missing values
DRIVER_75PLUS has no missing values
MATURE_DRIVER has no missing values
YOUNG_DRIVER has no missing values


#### Evaluate aggressive driving variables

In [22]:
# Driver-behaviour columns screened in this section and carried into the
# final dataset.
agg_cols = [
    'AGGRESSIVE_DRIVING',
    'ALCOHOL_RELATED',
    'CELL_PHONE',
    'DISTRACTED',
    'DRINKING_DRIVER',
    'DRUG_RELATED',
    'DRUGGED_DRIVER',
    'FATIGUE_ASLEEP',
    'HIT_RUN',
    'ILLEGAL_DRUG_RELATED',
    'IMPAIRED_DRIVER',
    'MARIJUANA_DRUGGED_DRIVER',
    'MARIJUANA_RELATED',
    'MC_DRINKING_DRIVER',
    'NHTSA_AGG_DRIVING',
    'OPIOID_RELATED',
    'RUNNING_RED_LT',
    'RUNNING_STOP_SIGN',
    'SPEEDING',
    'SPEEDING_RELATED',
    'TAILGATING',
    'UNDERAGE_DRNK_DRV',
    'UNLICENSED',
]

In [23]:
# Report missingness for every driver-behaviour column: a one-line
# confirmation when the column is complete, otherwise the null count, the null
# percentage and the distribution of the non-null values.
for c in agg_cols:
    # Count nulls once and reuse the count for the percentage, instead of
    # filtering the whole frame a second time just to measure its length.
    # int() keeps the printed text identical to a plain Python integer.
    miss = int(df[c].isna().sum())
    if miss == 0:
        # Skipping here also keeps an empty frame from dividing by zero below.
        print(c + ' has no missing values')
        continue
    pct = round((miss / df.shape[0]) * 100, 2)
    print()
    print(c)
    print(str(miss) + ' missing')
    print(str(pct) + '% missing')
    print(df[c].value_counts())
    print()

AGGRESSIVE_DRIVING has no missing values
ALCOHOL_RELATED has no missing values
CELL_PHONE has no missing values
DISTRACTED has no missing values
DRINKING_DRIVER has no missing values
DRUG_RELATED has no missing values
DRUGGED_DRIVER has no missing values
FATIGUE_ASLEEP has no missing values
HIT_RUN has no missing values
ILLEGAL_DRUG_RELATED has no missing values
IMPAIRED_DRIVER has no missing values
MARIJUANA_DRUGGED_DRIVER has no missing values
MARIJUANA_RELATED has no missing values
MC_DRINKING_DRIVER has no missing values
NHTSA_AGG_DRIVING has no missing values
OPIOID_RELATED has no missing values
RUNNING_RED_LT has no missing values
RUNNING_STOP_SIGN has no missing values
SPEEDING has no missing values
SPEEDING_RELATED has no missing values
TAILGATING has no missing values
UNDERAGE_DRNK_DRV has no missing values
UNLICENSED has no missing values


### <a id='final'>Create Final Dataframe</a>

In [24]:
# Assemble the final column selection: CRN first, then each screened group in
# a fixed order (dates, behaviour, conditions, location, driver age).
use_cols = ['CRN', *date_cols, *agg_cols, *cond_cols, *loc_cols, *driv_cols]
use_cols

['CRN',
 'CRASH_MONTH',
 'CRASH_YEAR',
 'DAY_OF_WEEK',
 'AGGRESSIVE_DRIVING',
 'ALCOHOL_RELATED',
 'CELL_PHONE',
 'DISTRACTED',
 'DRINKING_DRIVER',
 'DRUG_RELATED',
 'DRUGGED_DRIVER',
 'FATIGUE_ASLEEP',
 'HIT_RUN',
 'ILLEGAL_DRUG_RELATED',
 'IMPAIRED_DRIVER',
 'MARIJUANA_DRUGGED_DRIVER',
 'MARIJUANA_RELATED',
 'MC_DRINKING_DRIVER',
 'NHTSA_AGG_DRIVING',
 'OPIOID_RELATED',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'SPEEDING',
 'SPEEDING_RELATED',
 'TAILGATING',
 'UNDERAGE_DRNK_DRV',
 'UNLICENSED',
 'ILLUMINATION_DARK',
 'INTERSECT_TYPE',
 'INTERSECTION',
 'NON_INTERSECTION',
 'LOCATION_TYPE',
 'ROAD_CONDITION',
 'COUNTY',
 'URBAN_RURAL',
 'DRIVER_COUNT_16YR',
 'DRIVER_COUNT_17YR',
 'DRIVER_COUNT_18YR',
 'DRIVER_COUNT_19YR',
 'DRIVER_COUNT_20YR',
 'DRIVER_COUNT_50_64YR',
 'DRIVER_COUNT_65_74YR',
 'DRIVER_COUNT_75PLUS',
 'DRIVER_16YR',
 'DRIVER_17YR',
 'DRIVER_18YR',
 'DRIVER_19YR',
 'DRIVER_20YR',
 'DRIVER_50_64YR',
 'DRIVER_65_74YR',
 'DRIVER_75PLUS',
 'MATURE_DRIVER',
 'YOUNG_DRIVER']

Verify which columns were not selected

In [25]:
# Every column of the source frame, used below to list what was left out.
oth_cols = list(df.columns)


In [26]:
# Print, in source order, each column that is not part of the selection.
for col in (name for name in oth_cols if name not in use_cols):
    print(col)

ILLUMINATION
SECONDARY_CRASH
WEATHER1
WEATHER2
CROSS_MEDIAN
CURVE_DVR_ERROR
CURVED_ROAD
ICY_ROAD
IMPAIRED_NONMOTORIST
LANE_DEPARTURE
RAMP
RAMP_SEGMENT
RAMP_TERMINAL
ROUNDABOUT
RURAL
SIGNALIZED_INT
SNOW_SLUSH_ROAD
SPEED_CHANGE_LANE
STOP_CONTROLLED_INT
SUDDEN_DEER
UNSIGNALIZED_INT
URBAN
WET_ROAD
COUNTYx
URBAN_RURALx
DAY_OF_WEEKx
ROAD_CONDITIONx
LOCATION_TYPEx
INTERSECT_TYPEx
ILLUMINATIONx
WEATHER1x
WEATHER2x


In [27]:
# Build the final frame: keep only the selected columns, discard any column
# that contains a null, and take a copy so that later edits to `final` do not
# affect `df`.
final = (
    df.loc[:, use_cols]
    .dropna(axis='columns', how='any')
    .copy()
)

In [28]:
# Show the columns that survived into the final frame.
list(final.columns)

['CRN',
 'CRASH_MONTH',
 'CRASH_YEAR',
 'DAY_OF_WEEK',
 'AGGRESSIVE_DRIVING',
 'ALCOHOL_RELATED',
 'CELL_PHONE',
 'DISTRACTED',
 'DRINKING_DRIVER',
 'DRUG_RELATED',
 'DRUGGED_DRIVER',
 'FATIGUE_ASLEEP',
 'HIT_RUN',
 'ILLEGAL_DRUG_RELATED',
 'IMPAIRED_DRIVER',
 'MARIJUANA_DRUGGED_DRIVER',
 'MARIJUANA_RELATED',
 'MC_DRINKING_DRIVER',
 'NHTSA_AGG_DRIVING',
 'OPIOID_RELATED',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'SPEEDING',
 'SPEEDING_RELATED',
 'TAILGATING',
 'UNDERAGE_DRNK_DRV',
 'UNLICENSED',
 'ILLUMINATION_DARK',
 'INTERSECT_TYPE',
 'INTERSECTION',
 'NON_INTERSECTION',
 'LOCATION_TYPE',
 'ROAD_CONDITION',
 'COUNTY',
 'URBAN_RURAL',
 'DRIVER_COUNT_16YR',
 'DRIVER_COUNT_17YR',
 'DRIVER_COUNT_18YR',
 'DRIVER_COUNT_19YR',
 'DRIVER_COUNT_20YR',
 'DRIVER_COUNT_50_64YR',
 'DRIVER_COUNT_65_74YR',
 'DRIVER_COUNT_75PLUS',
 'DRIVER_16YR',
 'DRIVER_17YR',
 'DRIVER_18YR',
 'DRIVER_19YR',
 'DRIVER_20YR',
 'DRIVER_50_64YR',
 'DRIVER_65_74YR',
 'DRIVER_75PLUS',
 'MATURE_DRIVER',
 'YOUNG_DRIVER']