### Model Data Initial Join
* Base hex data with node and edge characteristics and target variable (hex_id, collision year/month/day of week/hour).
* Join prior year accident data information (hex_id and year)
* Join TTV split column (hex_id, collision year/month/day of week/hour, accident id, ttv_split)
* Join weather data - needs to be created

In [None]:
import pandas as pd
import numpy as np

import datetime
from datetime import datetime

import boto3
import awswrangler

import re

import os
from os.path import isfile, join
from pathlib import Path
from os import listdir
import os
# `path` is the notebook's current working directory; `root` is the parent of
# that directory as an absolute path.
path = Path.cwd()
root = path.parent.absolute()

# Bare expression so the notebook cell displays the resolved root.
root


In [None]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# boto3 session built from the credentials imported from `aws_secrets`;
# the S3 reads and writes in this notebook pass it as `boto3_session`.
my_session = boto3.Session(
    aws_session_token=aws_session_token,
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

### 1. Import Base data
This includes most of the base data for modeling
* Node and edge information (intersections and streets).
* Includes the accident data as well.

In [None]:
# S3 bucket name and key prefix of the joined-data files.
raw_s3_bucket, raw_path_dir = 'traffic-data-bucket', 'joined_data'

In [None]:
# Read the base location CSV from S3 into a DataFrame.
base_df = awswrangler.s3.read_csv(
    path='s3://traffic-data-bucket/joined_data/base_location_data.csv',
    use_threads=True,
    boto3_session=my_session,
)

In [None]:
# Preview five randomly chosen rows of the base table.
base_df.sample(5)

### 2. Prior Years Accident Information
Various stats regarding collision history for the hexagon and its neighbors

In [None]:
# S3 bucket name and key prefix used to build the per-year collision file paths.
# NOTE(review): same values as the assignment in section 1.
raw_s3_bucket = 'traffic-data-bucket'
raw_path_dir = 'joined_data'

In [None]:
# Load one prior-collision-history CSV per year from S3, keyed by year.
prior_collision_hist_dict = {}
# NOTE: despite the `_dict` suffix this is a list of years; the name is kept
# because the following cell iterates over it.
coll_year_dict = [2014, 2015, 2016, 2017, 2018, 2019, 2020]

for key in coll_year_dict:
    print(key)  # progress indicator: one S3 read per year

    # Object key pattern: <prefix>/base_location_<year>_collision_data.csv
    raw_path = f"s3://{raw_s3_bucket}/{raw_path_dir}/base_location_{key}_collision_data.csv"
    prior_collision_hist_dict[key] = awswrangler.s3.read_csv(
        path=raw_path, boto3_session=my_session, use_threads=True
    )


In [None]:
# Tag each yearly frame (in place) with the year it was loaded for, so the
# year is still available as a column once the frames are stacked.
for year in coll_year_dict:
    prior_collision_hist_dict[year]['collision_year'] = year

In [None]:
# Concatenate the per-year frames row-wise into one table; the original
# per-file indexes are discarded in favour of a fresh 0..n-1 index.
prior_collision_hist_df = pd.concat(
    prior_collision_hist_dict.values(),
    ignore_index=True,
)

In [None]:
# Preview three randomly chosen rows of the stacked collision history.
prior_collision_hist_df.sample(3)

### 3. Import Train-Test-Validation split data
This includes all collision date and time
* accident_count = 1 is a positive sample
* accident_count = 0 is a negative sample
* ttv_split values are Train, Test, Validate

In [None]:
# Read the train/test/validation split table from S3.
ttv_df = awswrangler.s3.read_csv(
    path='s3://traffic-data-bucket/TTV_splits/TTV_data.csv',
    use_threads=True,
    boto3_session=my_session,
)

In [None]:
# Preview one randomly chosen row (sample() defaults to a single row).
ttv_df.sample()

### 4. Import weather
Weather data for LA county.

In [None]:
# Read the weather CSV from S3; its `date` column is the join key used later.
weather_df = awswrangler.s3.read_csv(
    path='s3://traffic-data-bucket/weather/LA_weather_data_updated.csv',
    use_threads=True,
    boto3_session=my_session,
)

In [None]:
# Preview two randomly chosen rows of the weather table.
weather_df.sample(2)

### 4. Amenities
Amenity counts from Open Streets data, joined later on hex_id
* Restaurants, bars, colleges and schools

In [None]:
# Read the amenities CSV from S3.
# NOTE: the variable name and the file name are spelled "ammenities"; both are
# kept as-is because the S3 object key and a later cell depend on them.
ammenities_df = awswrangler.s3.read_csv(
    path='s3://traffic-data-bucket/nodes_and_edges/la_county_amenities/la_county_ammenities.csv',
    use_threads=True,
    boto3_session=my_session,
)

In [None]:
# Preview two randomly chosen rows of the amenities table.
ammenities_df.sample(2)

### 5. Join Data
* train-test-validate (TTV) split has hex id and date/hour
* base data - join on hex id
* collision history - join on hex id, collision year


In [None]:
# Columns carried over from the TTV table: hex id, collision date/time parts,
# the accident_count label and the split assignment.
ttv_columns = [
    'hex_id',
    'collision_date',
    'collision_year',
    'collision_month',
    'collision_dayofweek',
    'collision_hour',
    'accident_count',
    'ttv_split',
]

# Left join so every TTV row is kept; base attributes are attached by hex_id.
joined_df1 = ttv_df[ttv_columns].merge(base_df, how='left', on='hex_id')

In [None]:
# Row-count check: True when the left join onto base_df added no rows,
# i.e. no hex_id matched more than one base row.
len(joined_df1) == len(ttv_df)

In [None]:
# Attach collision-history columns, matching on both hex_id and collision_year.
# Left join keeps every sample even when no history row matches.
joined_df2 = joined_df1.merge(
    prior_collision_hist_df,
    how='left',
    on=['hex_id', 'collision_year'],
)

In [None]:
# Row-count check: True when the collision-history join added no rows.
len(joined_df1) == len(joined_df2)

In [None]:
# Convert both join keys to plain `datetime.date` values (time of day dropped)
# so they compare equal. This overwrites the columns in place on both frames.
joined_df2['collision_date'] = pd.to_datetime(joined_df2['collision_date']).dt.date
weather_df['date'] = pd.to_datetime(weather_df['date']).dt.date

# Left join on the calendar date; the result carries both `collision_date`
# and the weather table's `date` column.
joined_df3 = joined_df2.merge(
    weather_df,
    how='left',
    left_on=['collision_date'],
    right_on=['date'],
)

In [None]:
# Row-count check: True when no collision_date matched more than one weather row.
len(joined_df3) == len(joined_df2)

In [None]:
# Attach the amenities columns by hex_id; left join keeps every sample row.
joined_df4 = joined_df3.merge(
    ammenities_df,
    how='left',
    on=['hex_id'],
)

In [None]:
# Row-count check: True when the amenities join added no rows.
len(joined_df3) == len(joined_df4)

In [None]:
# joined_df4.sample(3)

In [None]:
# Display (rows, columns) of the final joined table.
joined_df4.shape

## 6. Save to parquet and upload to S3 bucket

In [None]:
# Write the fully joined frame to S3 as Parquet under the model_data/ prefix
# (not the bucket root), without the DataFrame index.
awswrangler.s3.to_parquet(
    df=joined_df4,
    path='s3://traffic-data-bucket/model_data/model_data_pre_transformation.parquet',
    index=False,
    use_threads=True,
    boto3_session=my_session,
)

### Create a local copy

In [None]:
#create a local copy
# joined_df4.to_csv(root / 'X.data' / 'model_data_pre_transformation.csv', index = False )