# Transportation Injury Mapping System (TIMS) Data Preparation
## Import Libraries

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from h3 import h3
import os
from os.path import isfile, join
from pathlib import Path
from os import listdir

import boto3
import awswrangler

# Working directory of the notebook and the directory one level above it.
path = Path(os.getcwd())
root = path.absolute().parent

import warnings

# Turn off pandas' chained-assignment warning (pandas' default for this option is 'warn').
pd.set_option('mode.chained_assignment', None)

# H3 resolution used when collisions are indexed into hexagons.
h3_level = 8

## 1. Connect to AWS Services

In [3]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# Build the boto3 session from the credentials imported from aws_secrets;
# this session is passed to every AWS Wrangler call below.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

## 2. Download collision data from S3 bucket using AWS Wrangler
AWS Wrangler is used to read all files in the S3 Bucket with a .csv suffix into a single Pandas dataframe.

In [2]:
# Bucket and folder (key prefix) that hold the raw collision CSV files.
raw_s3_bucket = 'traffic-data-bucket'
raw_path_dir = 'TIMS_raw_crashes/'

# Full S3 URI of the folder containing the raw collision data.
raw_path = f"s3://{raw_s3_bucket}/{raw_path_dir}"

# Read every object under raw_path whose key ends in .csv into one dataframe.
collision_df = awswrangler.s3.read_csv(
    path=raw_path,
    path_suffix=['.csv'],
    dataset=True,
    boto3_session=my_session,
    use_threads=True,
    low_memory=False,
)

In [6]:
# Number of rows and columns read from S3.
collision_df.shape

(404681, 80)

In [7]:
# Preview the first rows of the raw collision data.
collision_df.head()

Unnamed: 0,CASE_ID,ACCIDENT_YEAR,PROC_DATE,JURIS,COLLISION_DATE,COLLISION_TIME,OFFICER_ID,REPORTING_DISTRICT,DAY_OF_WEEK,CHP_SHIFT,...,COUNT_MC_KILLED,COUNT_MC_INJURED,PRIMARY_RAMP,SECONDARY_RAMP,LATITUDE,LONGITUDE,COUNTY,CITY,POINT_X,POINT_Y
0,5912010,2014,2015-12-21,1900,2014-01-10,2230,525744,1132.0,5,5,...,0,0,-,-,,,LOS ANGELES,LANCASTER,-118.118594,34.68957
1,5912033,2014,2014-01-23,1942,2014-01-07,2115,39586,497.0,2,5,...,0,0,-,-,,,LOS ANGELES,LOS ANGELES,-118.215508,34.016986
2,5912119,2014,2014-02-10,1976,2014-01-09,712,890,,4,5,...,0,0,-,-,,,LOS ANGELES,SANTA FE SPRINGS,-118.066151,33.931708
3,5912128,2014,2015-12-16,1900,2014-01-04,202,515585,1332.0,6,5,...,0,0,-,-,,,LOS ANGELES,BELLFLOWER,-118.14282,33.90379
4,5912130,2014,2014-02-14,1942,2014-01-04,1425,36752,559.0,6,5,...,0,0,-,-,,,LOS ANGELES,LOS ANGELES,-118.2521,33.75383


## Clean and create time features
The dataset contains some invalid collision times; those records are dropped. From the `COLLISION_DATE` variable we then extract the year, month, day, day of the week, and day of the year.

In [16]:
# Left-pad the raw collision time to four characters so it can be parsed as HHMM,
# e.g. 23 becomes '0023'.
collision_df['COLLISION_TIME'] = collision_df['COLLISION_TIME'].astype(str).str.rjust(4, '0')

# Parse the time once. errors='coerce' turns every value that is not a real HHMM
# time into NaT, so the '2500' entries present in the data, and any other
# malformed value, are flagged here instead of making the parse raise.
parsed_time = pd.to_datetime(collision_df['COLLISION_TIME'], format='%H%M', errors='coerce')

# mask is True for records whose collision time is invalid
mask = parsed_time.isna()

# records with invalid times; .assign returns a new frame, so no slice of
# collision_df is written to
nonvalid_time = collision_df[mask].assign(COLLISION_HOUR='none')
# print number of records to drop
print(nonvalid_time.shape)

# records with valid times, with the parsed hour added and COLLISION_DATE
# converted from text to datetime
valid_time = collision_df[~mask].assign(
    COLLISION_HOUR=parsed_time[~mask].dt.hour,
    COLLISION_DATE=lambda frame: pd.to_datetime(frame['COLLISION_DATE'], format='%Y-%m-%d'),
)

# create date features
collision_date = valid_time['COLLISION_DATE'].dt
valid_time = valid_time.assign(
    COLLISION_YEAR=collision_date.year,
    COLLISION_MONTH=collision_date.month,
    COLLISION_DAY=collision_date.day,
    COLLISION_DAYOFWEEK=collision_date.dayofweek,
    COLLISION_DAYOFYEAR=collision_date.dayofyear,
)

valid_time.head(2)

(353, 81)


Unnamed: 0,CASE_ID,ACCIDENT_YEAR,PROC_DATE,JURIS,COLLISION_DATE,COLLISION_TIME,OFFICER_ID,REPORTING_DISTRICT,DAY_OF_WEEK,CHP_SHIFT,...,COUNTY,CITY,POINT_X,POINT_Y,COLLISION_HOUR,COLLISION_YEAR,COLLISION_MONTH,COLLISION_DAY,COLLISION_DAYOFWEEK,COLLISION_DAYOFYEAR
0,5912010,2014,2015-12-21,1900,2014-01-10,2230,525744,1132,5,5,...,LOS ANGELES,LANCASTER,-118.118594,34.68957,22,2014,1,10,4,10
1,5912033,2014,2014-01-23,1942,2014-01-07,2115,39586,497,2,5,...,LOS ANGELES,LOS ANGELES,-118.215508,34.016986,21,2014,1,7,1,7


Next, lowercase each column name.

In [17]:
# Normalise every column label to lowercase for the steps that follow.
valid_time.columns = list(map(str.lower, valid_time.columns))

## Use H3 library to create and attach hexagons
`h3_level` is the H3 resolution, which controls the size of each hexagon. A higher value produces smaller hexagons (a finer grid), and a lower value produces larger hexagons that each cover a larger area.

In [3]:
# Show the H3 resolution set in the setup cell.
print('H3 Level:', h3_level)

H3 Level: 8


In [20]:
# Columns that identify one hexagon / date / hour combination.
hex_time_keys = ['hex_id', 'collision_year', 'collision_dayofyear',
                 'collision_month', 'collision_dayofweek', 'collision_hour']


def lat_lng_to_h3(row):
    """Return the H3 index of one collision record.

    Parameters
    ----------
    row : a dataframe row exposing `point_y` (passed as latitude) and
        `point_x` (passed as longitude).

    Returns
    -------
    The value of `h3.geo_to_h3` for that coordinate at resolution `h3_level`.
    """
    return h3.geo_to_h3(row.point_y, row.point_x, h3_level)


# attach the hexagon id to every collision record
valid_time['hex_id'] = valid_time.apply(lat_lng_to_h3, axis=1)
print(valid_time.shape)

# keep one row per hexagon / date / hour combination; .first() takes the first
# non-null value of each remaining column within a group
valid_time = valid_time.groupby(hex_time_keys).first().reset_index()

# NOTE(review): accident_count is always 1 because several collisions that fall
# in the same hexagon and hour are collapsed into a single row above. If a true
# count is wanted, groupby(hex_time_keys).size() would give it - confirm what
# the downstream consumer expects before changing.
valid_time_select = valid_time[hex_time_keys].assign(accident_count=1)
print(valid_time_select.shape)

(404328, 87)
(400243, 7)


## Upload processed data to S3 Bucket
Convert the processed data to CSV and upload to the S3 bucket.

In [26]:
# Write the hexagon-level collision table to the h3_processed_data folder of the bucket.
awswrangler.s3.to_csv(
    df=valid_time_select,
    path='s3://traffic-data-bucket/h3_processed_data/collisions_hex.csv',
    index=False,
    boto3_session=my_session,
    use_threads=True,
)
# Write a second copy to the root of the bucket.
# NOTE(review): this uploads the same hexagon-level frame as above, although the
# object is named all_collisions_points.csv - confirm this is intended.
awswrangler.s3.to_csv(
    df=valid_time_select,
    path='s3://traffic-data-bucket/all_collisions_points.csv',
    index=False,
    boto3_session=my_session,
    use_threads=True,
)

{'paths': ['s3://traffic-data-bucket/all_collisions_points.csv'],
 'partitions_values': {}}