In [1]:
# package imports
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import shapely

In [2]:
# Load every interim feature table produced by the earlier pipeline steps.
# Unpickling can run arbitrary code, so only read files this project created.
street_by_time, daylight, holidays, street_info, rush_hour, weather = (
    pd.read_pickle(feature_pickle)
    for feature_pickle in (
        '../../data/interim/features/street_by_time.pickle',
        '../../data/interim/features/daylight.pickle',
        '../../data/interim/features/holiday_dates.pickle',
        '../../data/interim/features/streets_by_nhood.pickle',
        '../../data/interim/features/rush_hour.pickle',
        '../../data/interim/features/weather.pickle',
    )
)

**Goal: Dataset for all accidents that is indexed by ROAD SEGMENT and DATETIME**

1. Merge information from other datasets into accidents dataset
2. Select 3 negative samples for each positive (accident) sample, following the methodology from the article
3. Append negative samples to positive samples and clean up final output dataset

In [3]:
# Split the `day_hour` datetime column into a calendar date and an hour of day,
# then keep only the sample keys and the accident flag.
street_by_time = street_by_time.assign(
    date=lambda frame: frame['day_hour'].dt.date,
    hour=lambda frame: frame['day_hour'].dt.hour,
)[['date', 'hour', 'segment_id', 'accident_yn']]

In [4]:
# Every row of street_by_time becomes a positive sample, keyed by
# (date, hour, segment_id).
# NOTE(review): this assumes street_by_time holds accident rows only; its
# accident_yn column is not carried over and is set to 1 for all rows later.
accident_samples = street_by_time.loc[:, ['date', 'hour', 'segment_id']].copy()

# Three negative samples are drawn for each positive sample.
negative_sample_size = 3 * len(accident_samples)

In [5]:
# Seed NumPy's global generator so the random draws below are reproducible.
np.random.seed(42)

# Candidate values for each dimension of a sample: every calendar day between
# 2012-01-01 and 2018-06-30 (inclusive), every hour of the day (0-23), and the
# sorted segment ids from street_info.
# NOTE(review): segment ids are not de-duplicated here, so an id that occurs
# several times in street_info would be drawn more often — confirm uniqueness.
sample_space_date = pd.Series(pd.date_range(start='2012-01-01', end='2018-06-30')).dt.date
sample_space_hour = pd.Series(range(24))
sample_space_location = street_info['segment_id'].sort_values().reset_index(drop=True)

In [6]:
# Number of random (date, hour, segment) candidates to generate. It has to be
# large enough that, after accident rows and duplicates are removed, at least
# negative_sample_size candidates remain.
n_rows = 1_000_000

# The draws happen in the order date, hour, location; keep that order so the
# seeded random stream yields the same samples.
sample_dates, sample_hours, sample_locations = [
    np.random.choice(candidates, size=n_rows)
    for candidates in (sample_space_date, sample_space_hour, sample_space_location)
]

In [7]:
# Assemble the random candidates into one frame, one row per draw.
# The frame is built directly from the arrays rather than by creating an empty
# frame and filling it through attribute assignment (`df.date = ...`), which
# only sets a column when that column already exists and otherwise silently
# creates a plain Python attribute.
generated_samples = pd.DataFrame({
    'date': sample_dates,
    'hour': sample_hours,
    'segment_id': sample_locations,
})

In [8]:
# Anti-join against the accidents: the merge uses the columns shared by both
# frames as keys, the indicator marks rows found only among the generated
# candidates, and those rows are kept. Repeated candidates are then collapsed.
negative_samples = (
    generated_samples
    .merge(accident_samples, how='left', indicator=True)
    .loc[lambda joined: joined['_merge'] == 'left_only']
    .drop_duplicates()
)

In [9]:
# Choose which candidate rows to keep: negative_sample_size distinct row labels.
# Sampling without replacement raises ValueError when fewer candidates than
# that are available.
negative_sample_idx = np.random.choice(
    a=negative_samples.index,
    size=negative_sample_size,
    replace=False,
)

In [10]:
# Keep only the drawn rows (in their existing order) and trim the frame down to
# the three sample-key columns, dropping the merge indicator.
is_selected = negative_samples.index.isin(negative_sample_idx)
negative_samples = negative_samples[is_selected][['date', 'hour', 'segment_id']]

In [11]:
# Label the two groups: 1 for accident rows, 0 for generated non-accident rows.
accident_samples = accident_samples.assign(accident_yn=1)
negative_samples = negative_samples.assign(accident_yn=0)

# Stack positives on top of negatives and renumber the rows from 0.
all_samples = pd.concat(
    [accident_samples, negative_samples],
    ignore_index=True,
)

In [12]:
# Fill missing sunrise/sunset entries with the most recent earlier value.
daylight['sunrise'] = daylight['sunrise'].ffill()
daylight['sunset'] = daylight['sunset'].ffill()

# Combine each index entry (as text) with that row's sunrise/sunset value into
# a full timestamp, rounded to the nearest hour so it can be compared with the
# hourly sample timestamps.
index_text = daylight.index.astype('str')
daylight['dt_sunrise'] = pd.to_datetime(
    index_text + ' ' + daylight['sunrise'].astype('str')
).dt.round('1h')
daylight['dt_sunset'] = pd.to_datetime(
    index_text + ' ' + daylight['sunset'].astype('str')
).dt.round('1h')

# Plain calendar-date column, used as the join key against the samples.
daylight['date'] = daylight.index.date

In [13]:
# Rebuild a full timestamp for each sample from its date and hour of day.
all_samples['datetime'] = pd.to_datetime(
    all_samples['date'].astype('str') + ' ' + all_samples['hour'].astype('str') + ':00'
)

# Attach the sunrise/sunset timestamps for each sample's date.
# NOTE(review): this is an inner join, so samples whose date has no row in
# daylight are dropped silently — confirm that is intended.
all_samples = all_samples.merge(daylight, how='inner', on='date')

# A sample counts as daylight when its hour falls between the rounded sunrise
# and sunset, both ends included.
all_samples['daylight_yn'] = np.where(
    all_samples['datetime'].between(all_samples['dt_sunrise'], all_samples['dt_sunset']),
    1,
    0,
)

# Discard the helper columns brought in from daylight.
all_samples = all_samples[['date', 'hour', 'segment_id', 'accident_yn', 'datetime', 'daylight_yn']]

In [14]:
# Make sure holidays is a DataFrame and add `dt_date`, the plain calendar date
# taken from its datetime `date` column, for matching against sample dates.
holidays = pd.DataFrame(holidays).assign(
    dt_date=lambda frame: frame['date'].dt.date,
)

In [15]:
# Flag samples whose calendar date appears among the holiday dates.
# A membership test replaces the earlier left-merge against the full holidays
# frame. The merge would duplicate sample rows whenever a date is listed more
# than once in holidays, and it relied on the `date_x` suffix that only exists
# because both frames have a `date` column. The flag values are unchanged:
# 1 when the date is a holiday, 0 otherwise.
# NOTE(review): assumes holidays.dt_date and all_samples.date both hold
# datetime.date objects, as produced by the `.dt.date` conversions upstream.
all_samples['holiday_yn'] = np.where(all_samples['date'].isin(holidays['dt_date']), 1, 0)

# Keep the column set and order that the following steps expect.
all_samples = all_samples[['date', 'hour', 'segment_id', 'accident_yn', 'datetime', 'daylight_yn', 'holiday_yn']]

In [16]:
# Flag samples whose timestamp is present in the rush_hour index, then drop the
# helper columns the merge brought in.
# NOTE(review): the flag depends only on whether the timestamp exists in
# rush_hour's index, not on the value of its `rush_hour` column — confirm that
# rush_hour lists rush-hour timestamps only.
all_samples = (
    all_samples
    .merge(rush_hour, left_on='datetime', right_index=True, how='left', indicator=True)
    .assign(rush_hour_yn=lambda joined: np.where(joined['_merge'] == 'left_only', 0, 1))
    .drop(columns=['rush_hour', '_merge'])
)

In [17]:
# Attach the weather columns by matching each sample's timestamp against the
# weather index; samples without a match keep missing values.
all_samples = pd.merge(
    all_samples,
    weather,
    how='left',
    left_on='datetime',
    right_index=True,
)

In [18]:
# Per-segment attributes to attach: segment length and the road-class columns.
street_feature_cols = [
    'segment_id',
    'shape_leng',
    'class_Freeway',
    'class_Local',
    'class_Major Arterial',
    'class_Other',
    'class_Umimproved',
]
streets = street_info[street_feature_cols]

# NOTE(review): inner join — samples with an unknown segment_id are dropped,
# and a segment_id that appears on several street_info rows would multiply its
# samples. Confirm segment_id is unique in street_info.
all_samples = all_samples.merge(streets, how='inner', on='segment_id')

In [19]:
# Output schema as one mapping from the current column name to the final
# column name. Dictionary order defines the column order of the saved dataset.
column_renames = {
    'date': 'date',
    'hour': 'hour',
    'segment_id': 'segment_id',
    'daylight_yn': 'daylight_yn',
    'holiday_yn': 'holiday_yn',
    'rush_hour_yn': 'rush_hour_yn',
    'temp': 'temp',
    'wind_speed': 'wind_speed',
    'precipitation': 'precipitation',
    'shape_leng': 'road_length',
    'class_Freeway': 'class_freeway',
    'class_Local': 'class_local',
    'class_Major Arterial': 'class_major',
    'class_Other': 'class_other',
    'class_Umimproved': 'class_unimproved',
    'accident_yn': 'accident_yn',
}

# Same two lists as before, now derived from the single mapping.
final_columns = list(column_renames)
renamed_cols = list(column_renames.values())

# Select the final columns in order and give them their output names.
all_samples = all_samples[final_columns].rename(columns=column_renames)

In [20]:
# Persist the finished sample table for the modelling step.
all_samples.to_pickle(path='../../data/processed/all_samples.pickle')