# Model Output File

Clean up the data and prep for modeling.
* Create time series
* Calculate distance to coast
* Create holiday flag (window of 1 day before and 1 day after each US federal holiday)
* Clean up missing values and create flags for imputed values if necessary

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import datetime
from datetime import datetime

import boto3
import awswrangler
# set name of S3 bucket
s3_bucket = 'traffic-data-bucket'

import re

import os
from os.path import isfile, join
from pathlib import Path
from os import listdir
import os
os.getcwd()

#from commons import download_data, find_vcs_root

# Directory the kernel was started in (normally the folder holding this notebook).
path = Path.cwd()

# Absolute path of the parent directory; local output files are written relative to it.
root = path.parent.absolute()

root


## 1. Create Boto3 session
Start by creating a boto3 session so that we can connect to the S3 bucket.

In [None]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# Credentials come from the local `aws_secrets` module rather than being
# hardcoded in the notebook; they are passed to boto3 as keyword arguments.
session_credentials = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'aws_session_token': aws_session_token,
}

# Session reused by every S3 read/write in this notebook.
my_session = boto3.Session(**session_credentials)

## 2. Import joined data set

In [None]:
joined_df = awswrangler.s3.read_parquet(path = f's3://{s3_bucket}/model_data/model_data_pre_transformation.parquet', boto3_session=my_session, use_threads=True)

In [None]:
joined_df.shape

In [None]:
joined_df.sample(3)

In [None]:
joined_df_copy = joined_df.copy()

Perform general exploration of the data

In [None]:
joined_df.accident_count.value_counts()

In [None]:
joined_df.collision_year.value_counts().sort_index()

## 3. Create time series features
### 3.1 Transform time series features into sine and cosine components

In [None]:
def cyclical_encode(data, col, max_val):
    """Add sine/cosine encodings of a cyclical column to ``data``.

    Two columns are written: ``drv_<col>_sin`` and ``drv_<col>_cos``, holding
    ``sin(2*pi*x/max_val)`` and ``cos(2*pi*x/max_val)`` for each value ``x``
    of ``data[col]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``. It is modified in place.
    col : str
        Name of the column to encode.
    max_val : number
        Divisor of the angle. Values that differ by exactly ``max_val`` map
        to the same point, so for a wrap-around feature this must be the
        length of the cycle (number of distinct values), not the largest value.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two new columns added.
    """
    angle = 2 * np.pi * data[col] / max_val
    prefix = 'drv_' + col
    data[prefix + '_sin'] = np.sin(angle)
    data[prefix + '_cos'] = np.cos(angle)
    return data

Inspect the variables

In [None]:
# Print the observed maximum and minimum of each calendar column that will be
# cyclically encoded, so the value range can be checked before encoding.
columns_to_inspect = [
    ('hour', 'collision_hour'),
    ('day of week', 'collision_dayofweek'),
    ('month', 'collision_month'),
]

for label, column_name in columns_to_inspect:
    print('max and min ' + label)
    print(np.max(joined_df[column_name]))
    print(np.min(joined_df[column_name]))

In [None]:
# Cycle length (number of distinct values in one full cycle) of each calendar
# feature. This must be the period, not the largest observed value: dividing
# hours by 23 maps hour 0 and hour 23 to the same point on the circle, and
# dividing weekdays by 6 merges the first and last day of the week.
# NOTE: this changes the drv_collision_hour_* and drv_collision_dayofweek_*
# values relative to earlier runs; models trained on the old encoding need
# to be refit.
sin_cos_transformed_list = {'collision_hour': 24,
                            'collision_dayofweek': 7,
                            'collision_month': 12
                           }

# Adds drv_<column>_sin / drv_<column>_cos to joined_df for every entry above.
for date_type, period in sin_cos_transformed_list.items():
    joined_df = cyclical_encode(joined_df, date_type, period)

In [None]:
joined_df[['collision_hour', 'drv_collision_hour_sin', 'drv_collision_hour_cos']].sample(10)

In [None]:
# Visual sanity check of the hour encoding: a random sample of 500 rows is
# plotted as (sin, cos) pairs, which all lie on the unit circle.
hour_tr_sample = joined_df[['collision_hour', 'drv_collision_hour_sin', 'drv_collision_hour_cos']].sample(500)

# Explicit figure/axes so the figure size applies to this plot only instead of
# changing the global matplotlib default.
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(hour_tr_sample['drv_collision_hour_sin'],
        hour_tr_sample['drv_collision_hour_cos'],
        'o',
        color='blue',
        alpha=.01)  # very low alpha: positions shared by many rows appear darker
ax.set_xlim(-1.25, 1.25)
ax.set_ylim(-1.25, 1.25)
ax.set_xlabel('drv_collision_hour_sin')
ax.set_ylabel('drv_collision_hour_cos')
ax.set_title('Cyclical encoding of collision_hour');

### 3.2 Make a holiday indicator

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
# Kept from the original cell: rebinds `datetime` to the module (the setup cell's
# `from datetime import datetime` had bound the name to the class).
import datetime

# Number of days flagged on each side of a US federal holiday.
# 1 reproduces the dates this cell has always produced (day before, holiday,
# day after).
HOLIDAY_WINDOW_DAYS = 1

# Date span for which holidays are generated.
dr = pd.date_range(start='2014-01-01', end='2020-12-31')

cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

# 'YYYY-MM-DD' strings for every holiday and its surrounding window.
# A set removes duplicates (the earlier version appended the previous day twice);
# sorting ISO-formatted dates keeps the list chronological.
holiday_list = sorted({
    (holiday + pd.Timedelta(days=offset)).strftime('%Y-%m-%d')
    for holiday in holidays
    for offset in range(-HOLIDAY_WINDOW_DAYS, HOLIDAY_WINDOW_DAYS + 1)
})

holiday_list[0:10]


In [None]:
joined_df['collision_date'].sample(5)

In [None]:
joined_df['collision_date'] = joined_df['collision_date'].map(str)

Create a new column called `drv_holiday_flag` which indicates whether the collision date falls on a US federal holiday or on the day immediately before or after one.

In [None]:
joined_df['drv_holiday_flag'] = joined_df['collision_date'].isin(holiday_list)

In [None]:
joined_df['drv_holiday_flag'].value_counts()

## 4. Reduce dataset to selected columns

In [None]:
#joined_df.columns[100:150]

In [None]:
# Columns carried forward into the modeling dataset, in output order.
keep_cols_list = [
    # hexagon id, collision calendar fields, target count and split label
    'hex_id',
    'collision_date',
    'collision_year',
    'collision_month',
    'collision_dayofweek',
    'collision_hour',
    'accident_count',
    'ttv_split',
    # node attributes
    'node_street_count',
    'node_stop',
    'node_traffic_signals',
    'CITY_NAME',
    # edge attributes
    'edge_speed_kph_max',
    # NOTE(review): spelled "speek" — presumably matches the upstream column name; confirm.
    'edge_speek_kph_min',
    'edge_lanes_max',
    'edge_motorway_id',
    'edge_motorway_link_id',
    'edge_living_street_id',
    'edge_bridge_id',
    'edge_oneway_id',
    'edge_tunnel_id',
    # amenity counts
    'amenities_bar_cnt',
    'amenities_school_cnt',
    'amenities_restaurant_cnt',
    'amenities_college_cnt',
    # collision counts from previous years (cell and neighbors)
    'prev1_yr_coll_cnt',
    'prev2_yr_coll_cnt',
    'prev1_yr_coll_neighbor1',
    'prev1_yr_coll_neighbor2',
    'prev2_yr_coll_neighbor1',
    'prev2_yr_coll_neighbor2',
    # weather columns (renamed to noaa_* further down)
    'awnd',
    'prcp',
    'tavg',
    'tmax',
    'tmin',
    # derived features
    # NOTE(review): only the hour sin/cos pair is kept; the dayofweek and month
    # encodings created earlier are dropped here — confirm this is intended.
    'drv_collision_hour_sin',
    'drv_collision_hour_cos',
    'drv_holiday_flag',
]

In [None]:
print("The number of columns to keep are:", len(keep_cols_list))

In [None]:
# Show every column when displaying frames from here on.
pd.set_option("display.max_columns", None)

# Take an explicit copy of the selected columns: the following cells rename,
# fill and add columns on this frame, and doing that on a bare selection of
# joined_df raises SettingWithCopyWarning.
joined_trimmed_df = joined_df[keep_cols_list].copy()
joined_trimmed_df.sample(3)

Rename column names.

In [None]:
# Rename to the final modeling names (weather -> noaa_*, *_id -> *_flag,
# accident_count -> target). Reassigning the result instead of using
# inplace=True avoids mutating a frame derived from joined_df.
# Keys that are not present as columns are ignored by rename; with the current
# keep_cols_list that applies to 'wdf2' and 'drv_holiday_id'.
joined_trimmed_df = joined_trimmed_df.rename(columns = {'awnd':'noaa_wind_speed'
                                    ,'prcp':'noaa_precipitation'
                                    ,'tavg':'noaa_temperature_average'
                                    ,'tmax':'noaa_temperature_max'
                                    ,'tmin':'noaa_temperature_min'
                                    ,'wdf2':'noaa_wind_direction'
                                    ,'CITY_NAME':'la_data_city_name'
                                    ,'edge_motorway_id':'edge_motorway_flag'
                                    ,'edge_motorway_link_id':'edge_motorway_link_flag'
                                    ,'edge_living_street_id':'edge_living_street_flag'
                                    ,'edge_bridge_id':'edge_bridge_flag'
                                    ,'edge_oneway_id':'edge_oneway_flag'
                                    ,'edge_tunnel_id':'edge_tunnel_flag'
                                    ,'drv_holiday_id':'drv_holiday_flag'
                                    ,'accident_count':'target'
                                   })

In [None]:
joined_trimmed_df.columns

## 5. Check for missing values
Here we check for missing values in each column and if missing values are present, we will fill them with zero.

In [None]:
na_cnt = joined_trimmed_df.isnull().sum()
na_cnt[na_cnt>0]

In [None]:
# Columns whose missing values are replaced with 0.
zero_fill_cols = [
    'prev1_yr_coll_cnt',
    'prev2_yr_coll_cnt',
    'prev1_yr_coll_neighbor1',
    'prev1_yr_coll_neighbor2',
    'prev2_yr_coll_neighbor1',
    'prev2_yr_coll_neighbor2',
    'amenities_restaurant_cnt',
    'amenities_bar_cnt',
    'amenities_school_cnt',
    'amenities_college_cnt',
]

# Bracket access (not attribute assignment) so a misspelled or missing column
# fails with a KeyError instead of silently creating an attribute.
for col_name in zero_fill_cols:
    joined_trimmed_df[col_name] = joined_trimmed_df[col_name].fillna(0)


Check if any columns contain missing values.

In [None]:
na_cnt = joined_trimmed_df.isnull().sum()
na_cnt[na_cnt>0]

In [None]:
max_edge_mode = joined_trimmed_df.edge_lanes_max.value_counts().index[0]
max_edge_mode

In [None]:
joined_trimmed_df['drv_edge_lanes_max_imputed_flag'] = joined_trimmed_df['edge_lanes_max'].isna()

In [None]:
joined_trimmed_df['drv_edge_lanes_max_imputed_flag'].value_counts()

In [None]:
joined_trimmed_df.edge_lanes_max = joined_trimmed_df.edge_lanes_max.fillna(max_edge_mode)

In [None]:
# Convert every *_flag column to integer 0/1.
# astype('int') already maps True -> 1 and False -> 0, so no separate .loc
# writes are needed; writing ints into a bool column element-wise triggers
# pandas' incompatible-dtype deprecation warning.
flag_cols = [col for col in joined_trimmed_df.columns if col.endswith('_flag')]
joined_trimmed_df = joined_trimmed_df.astype({col: 'int' for col in flag_cols})
        

In [None]:
joined_trimmed_df.sample(2)

## 6. Review number of records in final dataset
Get count of records for train-test-validation

In [None]:
joined_trimmed_df.ttv_split.value_counts()

Get count of records for accident_count

In [None]:
joined_trimmed_df.target.value_counts()

In [None]:
# Print the describe() summary of every column whose summary contains a 'top'
# row (pandas reports 'top' for object/categorical columns, not numeric ones),
# together with the column's name and position.
for var_numb, col_name in enumerate(joined_trimmed_df.columns):
    # Compute the summary once and reuse it for both the check and the printout.
    col_summary = joined_trimmed_df[col_name].describe()
    if 'top' in col_summary.index:
        print(col_name)
        print(var_numb)
        print(pd.DataFrame(col_summary))

In [None]:
joined_trimmed_df.describe()

## 7. Save final dataset
### 7.1 Save to parquet and upload to S3 bucket

In [None]:
# Upload the transformed dataset to the S3 bucket as Parquet (index omitted),
# using the boto3 session created at the top of the notebook.
awswrangler.s3.to_parquet(
    df=joined_trimmed_df,
    path=f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet',
    index=False,
    boto3_session=my_session,
    use_threads=True,
)

### 7.1 Store local copy of sample dataset

In [None]:
# Write the first 50 rows (positional slice) to a local CSV without the index.
# NOTE(review): this is the same path the full-dataset export below writes to,
# so running both cells leaves only the full dataset on disk — confirm whether
# the sample should get its own filename.
joined_trimmed_df[0:50].to_csv(root / 'X.data' / 'model_data' / 'model_data_post_transformation.csv', index = False )

### 7.2. Store local copy of full dataset

In [None]:
# Create a local CSV copy of the full dataset (index omitted).
# NOTE(review): same path as the 50-row sample export above, which this overwrites.
joined_trimmed_df.to_csv(root / 'X.data' / 'model_data' / 'model_data_post_transformation.csv', index = False )