## COVID Drivers: Data Preparation

### Table of Contents
* [Read the Data](#read)</BR>
* [Create New Date Variables](#dates)</BR>
* [Select Columns and Merge CRASH and FLAGS](#cols)</BR>
* [Select Rows with Vehicles in Transit or Hit and Run](#rows)</BR>
* [Decode Categorical Variables](#decode)</BR>
* [Prepare Data by Date](#prep-dt)</BR>
* [Write Final Aggregated Datasets to Files](#write)

Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import ruptures as rpt
import altair as alt

import statsmodels.api as sm

In [2]:
# Import project specific utilities
from utils.functions import *

### <a id='read'>Read the Data</a>

In [3]:
crash = pd.DataFrame()
flags = pd.DataFrame()
person = pd.DataFrame()
vehicle = pd.DataFrame()

for folder in os.listdir(path_raw):
    #print(folder)
    year = folder[-4:]
    #print(year)
    if 'statewide' in folder.lower():
    #if year != '2020' and 'statewide' in folder.lower():
        tmp = read_data(folder, year, 'CRASH')
        crash = (tmp.copy() if crash.empty else crash.copy() if tmp.empty
            else pd.concat([crash, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'FLAGS')
        flags = (tmp.copy() if flags.empty else flags.copy() if tmp.empty
            else pd.concat([flags, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'PERSON')
        person = (tmp.copy() if person.empty else person.copy() if tmp.empty
            else pd.concat([person, tmp]) # if both DataFrames non empty
        )

        tmp = read_data(folder, year, 'VEHICLE')
        vehicle = (tmp.copy() if vehicle.empty else vehicle.copy() if tmp.empty
            else pd.concat([vehicle, tmp]) # if both DataFrames non empty
        )
            

### <a id='dates'>Create New Date Variables</a>

In [4]:
crash2 = crash.copy()

In [5]:
crash2['POST_COVID'] = [1 if (i > 2020) or (i == 2020 and j >= 3) else 0 for (i, j) in zip(crash2['CRASH_YEAR'], crash2['CRASH_MONTH'])]

In [None]:
crash2['COVID'] = ['Post_COVID' if (i > 2020) or (i == 2020 and j >= 3) else 'Pre_COVID' for (i, j) in zip(crash2['CRASH_YEAR'], crash2['CRASH_MONTH'])]

In [6]:
crash2['CRASH_MN_NAME'] = pd.to_datetime(crash2['CRASH_MONTH'], format='%m').dt.month_name().str[:3]

In [7]:
crash2['CRASH_YEAR'] = crash2['CRASH_YEAR'].astype('int64').astype('str')

In [8]:
crash2['CRASH_DATE'] = crash2.CRASH_YEAR.str.cat( crash2.CRASH_MN_NAME, sep = '-' )

In [9]:
crash2['CRASH_DATE'].head()

0    2005-Jan
1    2005-Jan
2    2005-Jan
3    2005-Jan
4    2005-Jan
Name: CRASH_DATE, dtype: str

In [10]:
crash2['CRASH_DATE'] = pd.to_datetime( crash2.CRASH_DATE )

  crash2['CRASH_DATE'] = pd.to_datetime( crash2.CRASH_DATE )


In [11]:
crash2['CRASH_DATE'].head()

0   2005-01-01
1   2005-01-01
2   2005-01-01
3   2005-01-01
4   2005-01-01
Name: CRASH_DATE, dtype: datetime64[us]

### <a id='cols'>Select Columns and Merge CRASH and FLAGS</a>

In [None]:
crash_cols = ['CRN',
 'CRASH_DATE',
 'POST_COVID',
 'COUNTY',
 'URBAN_RURAL']

flags_cols = ['CRN',
 'AGGRESSIVE_DRIVING',
 'NHTSA_AGG_DRIVING',
 'NO_CLEARANCE',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'SPEEDING',
 'SPEEDING_RELATED',
 'TAILGATING']

veh_cols = ['CRN',
            'UNIT_NUM',
            'UNIT_TYPE']



In [13]:
df_init = pd.merge(crash2.loc[:,crash_cols], flags.loc[:,flags_cols], on='CRN', how='left').copy()

### <a id='rows'>Select Rows with Vehicles in Transit or Hit and Run</a>

In [14]:
vehicle2 = vehicle.copy()

In [15]:
enum_dict['UNIT_TYPE']

{1: 'Motor vehicle in transport',
 2: 'Legally parked',
 3: 'Illegally parked',
 5: 'Hit and run vehicle',
 6: 'Disabled from a previous crash',
 21: 'Train',
 30: 'Non-Motorist',
 33: 'Personal Delivery Device',
 51: 'Phantom vehicle'}

In [16]:
moving = vehicle2.loc[vehicle2['UNIT_TYPE'].isin([1, 5])].copy()

In [17]:
select_crn = moving['CRN'].unique().tolist()

In [18]:
df = df_init.loc[df_init['CRN'].isin(select_crn)].copy()

In [19]:
df_init.shape[0]

2461193

In [20]:
df.shape[0]

2460484

### <a id='decode'>Decode Categorical Variables</a>

In [None]:
df2 = df.copy()

COUNTY

In [None]:
df2['COUNTYx'] = [enum_dict['COUNTY'][i] for i in df2['COUNTY']]

URBAN_RURAL

In [None]:
df2['URBAN_RURALx'] = [enum_dict['COUNTY'][i] for i in df2['URBAN_RURAL']]

### <a id='prep-dt'>Prepare Data by Date</a>

In [None]:
cols_to_keep = ['CRASH_DATE', 
            'AGGRESSIVE_DRIVING', 
            'NHTSA_AGG_DRIVING', 
            'NO_CLEARANCE',
            'RUNNING_RED_LT', 
            'RUNNING_STOP_SIGN', 
            'SPEEDING',
            'SPEEDING_RELATED', 
            'TAILGATING',
            'POST_COVID']

In [None]:
df_dates = df2.loc[:,cols_to_keep].copy()

In [64]:
summ_idx = df_dates.set_index('CRASH_DATE')

In [65]:
summ_by_date = summ_idx.copy().resample('MS').mean()

### <a id='write'>Write Final Aggregated Datasets to Files</a>

In [66]:
ready_path = 'data/ready/'

In [67]:
summ_by_date.to_csv(ready_path + 'by_dates.csv', index=False)

In [None]:
#df2.columns.tolist()

In [None]:
out_df = df2.loc[:, ['CRN',
 'CRASH_DATE',
 'POST_COVID',
 'COVID',
 'AGGRESSIVE_DRIVING',
 'NHTSA_AGG_DRIVING',
 'NO_CLEARANCE',
 'RUNNING_RED_LT',
 'RUNNING_STOP_SIGN',
 'SPEEDING',
 'SPEEDING_RELATED',
 'TAILGATING',
 'COUNTYx',
 'URBAN_RURALx']].\
 copy()

In [70]:
out_df.to_csv(ready_path + 'full_dataset.csv', index=False)