# Train, Test, Validation

In [None]:
import pandas as pd
import numpy as np

import datetime
from datetime import datetime

import boto3
import awswrangler

import re

import os
from os.path import isfile, join
from pathlib import Path
from os import listdir
import os
# Resolve the notebook's working directory and its parent folder. `root` is the
# base used by the commented-out local-file alternatives further down.
# from commons import download_data, find_vcs_root
path = Path.cwd()
root = path.parent.absolute()

# Display the resolved parent directory as the cell output.
root

## 1. Create Boto3 session
Start by creating a boto3 session so that we can connect to the S3 bucket.

In [None]:
# Credentials come from the local `aws_secrets` module instead of being
# written into the notebook.
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

session_credentials = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'aws_session_token': aws_session_token,
}

# Passed as `boto3_session` to the awswrangler S3 calls below.
my_session = boto3.Session(**session_credentials)

## 2. Import data

### 2.1  Base location data
##### LA County shape file transposed to Uber H3 hexagons at resolution 8 (~0.75 square km each).
##### Import all hexes, build a list over day, hour and year, and attach a random number to each entry.
##### https://h3geo.org/docs/core-library/restable/

In [None]:
# Base location table, read from S3.
gdf_df = awswrangler.s3.read_csv(
    path='s3://traffic-data-bucket/joined_data/base_location_data.csv',
    boto3_session=my_session,
)

# gdf_df = pd.read_csv(root / 'X.data' / 'joined_data' / 'base_location_data.csv')
print(gdf_df.shape)

# Keep the rows whose `valid_accident_location_filter` flag is True in a
# separate frame; it is used later to filter the positive samples by hex_id.
valid_mask = gdf_df['valid_accident_location_filter'].eq(True)
gdf_valid_df = gdf_df.loc[valid_mask]
gdf_valid_df.shape

In [None]:
# Drop rows whose hex_id is the string '0'. `gdf_valid_df` was derived before
# this filter and is not affected by it.
gdf_df = gdf_df.loc[gdf_df['hex_id'] != '0']

In [None]:
# Preview the first rows of the base location table.
gdf_df.head()

### 2.2 Negative sample data
Download negative sample dataframes generated by notebook `13.base_neg_sample_build.ipynb` and concatenate into one dataframe.

In [None]:
# S3 bucket and folder that hold the negative-sample CSV files
raw_s3_bucket = 'traffic-data-bucket'
raw_path_dir = 'neg_samples'

# S3 prefix under which the negative-sample files are stored
raw_path = f"s3://{raw_s3_bucket}/{raw_path_dir}"

# Read the '.csv' objects under that prefix into one dataframe
neg_sample_df = awswrangler.s3.read_csv(
    path=raw_path,
    path_suffix=['.csv'],
    dataset=True,
    boto3_session=my_session,
    use_threads=True,
    low_memory=False,
)

In [None]:
# collision_year_list = [2014, 2015, 2016, 2017, 2018, 2019, 2020]
# neg_sample_dict = {}
# for year in collision_year_list:
#     neg_sample_dict[year] = pd.read_csv(root / 'X.data' / 'neg_samples' / ('neg_samples_' + str(year) + '.csv'),low_memory = False)
# neg_sample_df = pd.concat(neg_sample_dict.values(), ignore_index=True)

In [None]:
# Show three randomly chosen rows of the raw negative samples.
neg_sample_df.sample(3)

In [None]:
def convert_doy_to_date(row):
    """Convert a row's year and day-of-year into an ``MM-DD-YYYY`` date string.

    Args:
        row: Object exposing ``year`` and ``doy`` (1-based day of the year)
            attributes, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.
            Both values must be integral numbers (ints or floats such as ``45.0``).

    Returns:
        str: The calendar date formatted with ``"%m-%d-%Y"``.

    Raises:
        ValueError: If a value cannot be converted to an integer or the pair
            is not a valid date (e.g. day 366 of a non-leap year).
    """
    # Cast through int so float-typed values (a row that was upcast to float)
    # do not end up as "45.0", which strptime rejects.
    year = int(row.year)
    doy = int(row.doy)
    # Zero-pad the day of year to the three digits described by %j. The earlier
    # version called str.rjust() but discarded the result, so nothing was padded.
    parsed = datetime.strptime(f"{year}-{doy:03d}", "%Y-%j")
    return parsed.strftime("%m-%d-%Y")

In [None]:
from random import Random, choices


def sample_date_time_creation(frame, seed=None):
    """Build the negative-sample feature table from dated rows.

    The input frame is left untouched; an independent dataframe is returned,
    so later column assignments on the result do not raise
    SettingWithCopyWarning.

    Args:
        frame: Dataframe with at least the columns ``hex_id``, ``year`` and
            ``date`` (``date`` must be parseable by ``pd.to_datetime``).
        seed: Optional seed for the random hour draw. When ``None`` (default)
            the module-level ``random`` generator is used, exactly as before;
            when given, a private generator seeded with it makes the hours
            reproducible.

    Returns:
        pandas.DataFrame: Columns ``hex_id``, ``collision_year``,
        ``collision_month``, ``collision_dayofweek``, ``collision_hour`` and
        ``accident_count`` (always 0), indexed like ``frame``.
    """
    collision_date = pd.to_datetime(frame['date'])
    draw = choices if seed is None else Random(seed).choices

    # Start from a real copy of the key column instead of adding columns to
    # the caller's frame.
    result = frame[['hex_id']].copy()
    result['collision_year'] = frame['year']
    result['collision_month'] = collision_date.dt.month
    result['collision_dayofweek'] = collision_date.dt.dayofweek
    # Hours run from 0 to 23; one is drawn uniformly at random per row.
    result['collision_hour'] = draw(range(24), k=len(frame))
    result['accident_count'] = 0
    return result

In [None]:
# Turn each row's (year, doy) pair into an MM-DD-YYYY date string.
neg_sample_df['date'] = neg_sample_df.apply(convert_doy_to_date, axis=1)
# Derive the collision_* feature columns. sample_date_time_creation already
# sets accident_count to 0, so it is not assigned again here (doing so on the
# returned column slice raised SettingWithCopyWarning).
neg_sample_df = sample_date_time_creation(neg_sample_df)
neg_sample_df.sample(3)

### 2.3 Positive sample data

In [None]:
# Positive samples: the collisions_hex.csv table, read from S3.
pos_sample_df = awswrangler.s3.read_csv(
    path='s3://traffic-data-bucket/h3_processed_data/collisions_hex.csv',
    boto3_session=my_session,
)

# pos_sample_df = pd.read_csv(root / 'X.data' / 'h3_processed_data' / 'collisions_hex.csv', low_memory = False)
# Keep the same six columns that the negative samples carry.
sample_columns = [
    'hex_id',
    'collision_year',
    'collision_month',
    'collision_dayofweek',
    'collision_hour',
    'accident_count',
]
pos_sample_df = pos_sample_df[sample_columns]
print(pos_sample_df.shape)
pos_sample_df.sample(3)

In [None]:
# Keep only the positive samples whose hex_id appears in the valid-location table.
in_valid_hex = pos_sample_df['hex_id'].isin(gdf_valid_df['hex_id'])
pos_sample_df = pos_sample_df[in_valid_hex]
pos_sample_df.shape

In [None]:
# Stack positive and negative samples into one table; the train/test/validate
# label is attached in the cells below.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the two inputs keep their own labels, so labels repeat and any later
# index-aligned column assignment pairs values with the wrong rows.
neg_pos_sample_df = pd.concat([pos_sample_df, neg_sample_df], ignore_index=True)
neg_pos_sample_df.sample(2)

In [None]:
# Years covered by the negative samples (accident_count == 0).
neg_pos_sample_df.loc[neg_pos_sample_df['accident_count'] == 0, 'collision_year'].unique()

In [None]:
# Years covered by rows with accident_count == 1.
# NOTE(review): rows with a larger accident_count, if any exist, are not
# included here - confirm positive samples always carry exactly 1.
neg_pos_sample_df.loc[neg_pos_sample_df['accident_count'] == 1, 'collision_year'].unique()

In [None]:
# Total number of rows (positive + negative samples).
len(neg_pos_sample_df)

## 3. Create train-test-validation split column
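Each row is given a uniform random number in [0, 1) and labelled `Train` (value ≤ 0.5), `Test` (value ≤ 0.8) or `Validate` (otherwise), i.e. roughly a 50/30/20 split. The label is stored in the `ttv_split` column.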

In [None]:
# Draw one pseudo-random float in [0, 1) per sample row.
from random import random, seed

# Fixed seed so every run draws the same sequence.
seed(1)
random_list = [random() for _ in range(len(neg_pos_sample_df))]
len(random_list)

In [None]:
# Assign the draws by position. Wrapping the list in pd.Series would align it
# on index labels instead; because the concatenated frame can carry repeated or
# missing labels, rows would then share a draw or receive NaN (which falls
# through to 'Validate'). A plain list must also match the row count exactly.
neg_pos_sample_df['random'] = random_list
# <= 0.5 -> Train, (0.5, 0.8] -> Test, everything above -> Validate.
neg_pos_sample_df['ttv_split'] = np.where(
    neg_pos_sample_df['random'] <= .5,
    'Train',
    np.where(neg_pos_sample_df['random'] <= .8, 'Test', 'Validate'),
)

In [None]:
# Share of rows per split: label counts divided by the total row count.
split_counts = neg_pos_sample_df['ttv_split'].value_counts()
split_counts / len(neg_pos_sample_df)

In [None]:
# Show ten randomly chosen rows, including the new `random` and `ttv_split` columns.
neg_pos_sample_df.sample(10)

In [None]:
# Write the labelled samples to S3 as CSV, without the dataframe index.
awswrangler.s3.to_csv(
    df=neg_pos_sample_df,
    path='s3://traffic-data-bucket/TTV_splits/TTV_data.csv',
    index=False,
    boto3_session=my_session,
    use_threads=True,
)

# neg_pos_sample_df.to_csv(root / 'X.data' / 'TTV_splits' / 'TTV_data.csv', index = False )