# Negative Sampling

In [1]:
import geopandas  as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import h3 as h3

import re

import os
from os.path import isfile, join
from pathlib import Path
from os import listdir
import os
# Locate the notebook's working directory and, one level up, its parent
# directory (kept as `root`).
path = Path.cwd()
root = path.parent.absolute()

# import aws libraries
import boto3
import awswrangler
# Name of the S3 bucket; the s3:// paths read and written below are built from it.
s3_bucket = 'traffic-data-bucket'

# 1. Import Data

#### 1.1 Import Base Table with Valid Street ID for sampling
##### LA County shape file transposed to Uber H3 hexagons at level 8 (~0.75 square km each).
##### This process takes a shape file and maps it to hexagons at a given level. The output of the mapping is a unique hex_id for each hexagon and its shape geometry.
##### https://h3geo.org/docs/core-library/restable/

In [2]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# Build the boto3 session from the credentials imported from aws_secrets;
# this session is handed to every awswrangler call in the notebook.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

In [3]:
# Load the base location table from S3 (hex ids plus the
# valid_accident_location_filter flag used in the next cell).
gdf_all = awswrangler.s3.read_csv(
    path=f's3://{s3_bucket}/joined_data/base_location_data.csv',
    boto3_session=my_session,
)


In [None]:
# Keep only the hexagons flagged as valid accident locations; these are the
# candidate locations for negative sampling.
# The element-wise `== True` is deliberate: it yields a clean boolean mask
# even if the flag column was parsed with missing values.
valid_mask = gdf_all['valid_accident_location_filter'] == True  # noqa: E712

# Select rows and the hex_id column in a single .loc call and take an explicit
# copy, so that columns added to gdf_valid later are written to a real frame
# rather than to a slice of gdf_all (avoids pandas' SettingWithCopyWarning).
gdf_valid = gdf_all.loc[valid_mask, ['hex_id']].copy()
print(gdf_valid.shape)
display(gdf_valid.sample(3))

# The full table is no longer needed; drop the reference.
del gdf_all

#### 1.2 Collision Data

In [None]:
# Load the hex-mapped collision records from S3.
# (Local alternative: root / 'X.data' / 'h3_processed_data' / 'collisions_hex.csv'.)
collisions = awswrangler.s3.read_csv(
    path=f's3://{s3_bucket}/h3_processed_data/collisions_hex.csv',
    boto3_session=my_session,
)

# Discard records whose hex_id is the string '0'
# (presumably rows that could not be mapped to a hexagon; confirm upstream).
collisions = collisions[collisions.hex_id != '0']
print(collisions.shape)
collisions.sample(2)

# 2. Collisions 

### 2.1 Prep collision file by making a wide table.  One unique row per hex id

In [None]:
# Number of collision records per year, counted before de-duplication.
# The sampling loop uses these counts to size each year's negative sample.
collision_year_count = (
    collisions.groupby("collision_year")
    .hex_id.agg("count")
    .to_frame("count")
    .reset_index()
)
display(collision_year_count)

# Keep one row per (hex id, year, day of year).
collisions = collisions[["hex_id", "collision_year", "collision_dayofyear"]]
collisions = collisions.drop_duplicates()
print(collisions.columns)

# sorted() returns the ordered list. list.sort() sorts in place and returns
# None, so printing its result would only ever show "None".
coll_years = sorted(collisions.collision_year.unique())
print(coll_years)
collisions.sample(2)

In [None]:
# Spot check: the collision count recorded for 2018.
collision_year_count.loc[
    collision_year_count.collision_year == 2018, 'count'
].values[0]

In [None]:
def rking_neighbors(row, skins):
    """Return the hex ids within ``skins`` rings of ``row.hex_id`` as a list.

    ``row`` is any object exposing a ``hex_id`` attribute (here, a DataFrame
    row); ``skins`` is the ring distance handed to ``h3.k_ring``.  The result
    is a list built from the collection that ``h3.k_ring`` returns.
    """
    return list(h3.k_ring(row.hex_id, skins))

# For every collision row, store the list of hex ids within two rings of its
# hexagon. Extra keyword arguments given to DataFrame.apply are forwarded to
# the applied function, so skins=2 reaches rking_neighbors directly.
collisions['hex_neighbors_2_ids'] = collisions.apply(rking_neighbors, axis=1, skins=2)
collisions.sample(2)


In [None]:
import calendar

def get_fence_range(doy):
    """Return the exclusion window around a collision day as a string.

    The window covers three days either side of ``doy`` (seven days in
    total), formatted as space-separated integers, e.g. ``10`` gives
    ``"7 8 9 10 11 12 13"``.

    Values are not wrapped at year boundaries: the result may contain days
    below 1 or beyond the end of the year.  ``collision_fence`` maps those
    onto the adjacent year.
    """
    # range() excludes its stop value, so the stop must be doy + 4 for the
    # window to reach doy + 3.
    return " ".join(str(day) for day in range(doy - 3, doy + 4))

def collision_fence(row):
    """Map a fence day that falls outside its year onto the adjacent year.

    ``row`` must expose ``doy_fence`` (a day of year, possibly below 1 or
    beyond the end of the year) and ``collision_year``; both are converted
    with ``int``.

    Returns a ``pd.Series`` ``[year_out, doy_out]``:

    * days past the end of the year roll into the following year;
    * days below 1 roll back into the previous year, where day 0 is that
      year's last day, day -1 the day before it, and so on;
    * all other days are returned unchanged.

    Leap years are taken into account for both the collision year and the
    year before it.
    """
    doy = int(row.doy_fence)
    year = int(row.collision_year)

    days_in_year = 366 if calendar.isleap(year) else 365
    days_in_prev_year = 366 if calendar.isleap(year - 1) else 365

    # Days after the year ended.
    if doy > days_in_year:
        return pd.Series([year + 1, doy - days_in_year])

    # Days before the year began. Day-of-year is 1-based, so 0 is already in
    # the previous year; testing `doy < 0` would leave day 0 unadjusted.
    if doy < 1:
        return pd.Series([year - 1, days_in_prev_year + doy])

    return pd.Series([year, doy])
    


In [None]:
# Turn each collision's day of year into its exclusion window: the list of
# day-of-year strings produced by get_fence_range. No negative sample will be
# drawn from the days just before or just after a collision.
collisions['doy_fence'] = (
    collisions['collision_dayofyear'].apply(get_fence_range).str.split()
)
collisions['doy_fence']

In [None]:
# Explode the list column so there is one row per collision and fence day.
fence_columns = ['hex_id', 'collision_year', 'hex_neighbors_2_ids', 'doy_fence']
collisions = collisions[fence_columns].explode('doy_fence')
collisions.sample(2)

In [None]:
# Row/column count after the explode (one row per collision and fence day).
collisions.shape

The collision fence takes the collision year and the fence day and adjusts days that run into another year. For example, day -1 of 2010 becomes day 364 of 2009 (365 - 1).

In [None]:
# Normalise fence days that spill over a year boundary and record the year
# they actually belong to; e.g. day -1 of 2010 becomes day 364 of 2009
# (365 days in 2009, plus -1).
collisions[['year_fence', 'doy_fence']] = collisions.apply(collision_fence ,axis=1)
collisions.sample(3)

In [None]:
# Split the fenced collisions into one DataFrame per year, keyed by the year
# the fence day falls in (year_fence), for every year present in coll_years.
collision_dict = {
    fence_year: collisions[collisions['year_fence'] == fence_year]
    for fence_year in coll_years
}


In [None]:
# The per-year frames now live in collision_dict; drop the combined frame's name.
del collisions

## 3. Generate negative samples
Here we will generate 4 negative samples for each collision.

Start by designating a folder to store the negative sample data in the S3 bucket.

In [None]:
# Folder (key prefix) inside the S3 bucket that receives the negative-sample CSV files.
raw_path_dir = 'neg_samples'

In [None]:
# Number of negative samples to draw per recorded collision.
sample_multiplier = 4

neg_sample_dict = {}

for sample_year in coll_years:
    print(sample_year)

    # Days in this year, corrected for leap years.
    daysinyear = 366 if calendar.isleap(sample_year) else 365

    # Space-separated string of every day of the year (day-of-year is 1-based).
    doy = " ".join(str(day) for day in range(1, daysinyear + 1))

    # Attach the full list of days of the year to every valid hex id.
    gdf_valid['doy'] = doy
    gdf_valid['doy'] = gdf_valid['doy'].str.split()
    print(gdf_valid.shape)
    display(gdf_valid.head(1))

    # Tall table: one row for every (hex id, day of year) pair, tagged with the year.
    gdf_valid_exp = gdf_valid[['hex_id', 'doy']].explode('doy')
    gdf_valid_exp['year'] = sample_year

    # Build the exclusion reference for this year: every (hex, day) pair that
    # lies within two rings and within the day fence of a collision.
    coll_ref = collision_dict[sample_year][['hex_neighbors_2_ids', 'doy_fence', 'year_fence']]
    print('exploding collisions')
    coll_ref = coll_ref.explode('hex_neighbors_2_ids').drop_duplicates()

    # Both join keys must be strings for the merge to match.
    coll_ref['doy_fence'] = coll_ref['doy_fence'].map(str)
    gdf_valid_exp['doy'] = gdf_valid_exp['doy'].map(str)
    print('merging data')
    gdf_valid_exp = gdf_valid_exp.merge(
        coll_ref,
        left_on=['hex_id', 'doy'],
        right_on=['hex_neighbors_2_ids', 'doy_fence'],
        how='left',
    )

    # After the left join, rows inside a collision fence carry a neighbor id,
    # while rows with no match are NaN. Only the NaN rows are eligible as
    # negative samples, so keep the NaN rows and drop the matched ones.
    # (Indexing with ~nas_mask would keep exactly the rows meant to be excluded.)
    nas_mask = gdf_valid_exp.hex_neighbors_2_ids.isna()
    gdf_valid_exp = gdf_valid_exp[nas_mask]

    # Draw sample_multiplier negatives per collision record of this year.
    sample_year_initial_count = collision_year_count[
        collision_year_count.collision_year == sample_year
    ]['count'].values[0]
    negative_year_samples = gdf_valid_exp.sample(
        n=(sample_year_initial_count * sample_multiplier),
        replace=True,
        random_state=42,
    )

    negative_year_samples = negative_year_samples[['hex_id', 'doy', 'year']]
    print(negative_year_samples.shape)

    # Upload this year's negative samples to the S3 bucket as a CSV file.
    raw_path = f"s3://{s3_bucket}/{raw_path_dir}/neg_samples_{sample_year}.csv"
    awswrangler.s3.to_csv(
        df=negative_year_samples,
        path=raw_path,
        index=False,
        boto3_session=my_session,
        use_threads=True,
    )
    