# Out of Time Predictions

In [None]:
# !pip install -U pip
# !pip install -U setuptools wheel
# !pip install -U "mxnet<2.0.0" bokeh==2.0.1
# !pip install autogluon --no-cache-dir

In [None]:
import pandas as pd
import numpy as np
import boto3
# import awswrangler
from autogluon.tabular import TabularPredictor

# S3 bucket that holds both the model input data and the exported predictions.
s3_bucket = 'traffic-data-bucket'

# Let pandas render up to 500 columns and 500 rows so wide frames are not truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

## 1. Import data

In [None]:
# Read the out-of-time validation set straight from S3.
validation_uri = f's3://{s3_bucket}/model_data/out_of_time_validation.parquet'
df = pd.read_parquet(validation_uri, engine='auto')

In [None]:
# (rows, columns) of the frame that was just loaded.
df.shape

In [None]:
# Cyclical encoding of the day of week: map the day number onto a circle with
# period 7 and store its sine and cosine as two separate features.
day_of_week_angle = df['collision_dayofweek'] * (2 * np.pi / 7)
df['day_of_week_sin'] = np.sin(day_of_week_angle)
df['day_of_week_cos'] = np.cos(day_of_week_angle)

#### Select feature columns

In [None]:
# Street-network and amenity attributes.
# NOTE(review): 'edge_speek_kph_min' looks like a misspelling of "speed"; it is
# kept verbatim because it presumably matches the column name in the data — confirm.
street_features = [
    'la_data_city_name',
    'node_street_count',
    'node_stop',
    'node_traffic_signals',
    'edge_speed_kph_max',
    'edge_speek_kph_min',
    'edge_lanes_max',
    'edge_motorway_flag',
    'edge_motorway_link_flag',
    'edge_living_street_flag',
    'edge_bridge_flag',
    'edge_oneway_flag',
    'edge_tunnel_flag',
    'amenities_bar_cnt',
    'amenities_school_cnt',
    'amenities_restaurant_cnt',
    'amenities_college_cnt',
    'drv_edge_lanes_max_imputed_flag',
]

# Calendar / clock features, including the sine and cosine day-of-week encoding.
time_features = [
    'drv_collision_hour_sin',
    'drv_collision_hour_cos',
    'collision_month',
    'drv_holiday_flag',
    'day_of_week_sin',
    'day_of_week_cos',
]

# Collision-history features for the previous year.
hex_history_features = [
    'prev1_yr_coll_cnt',
    'prev1_yr_coll_neighbor1',
]

# Weather features.
weather_features = [
    'noaa_wind_speed',
    'noaa_precipitation',
    'noaa_temperature_average',
    'noaa_temperature_max',
    'noaa_temperature_min',
]

# Full, ordered feature list: street, then time, then history, then weather.
model_features = [
    *street_features,
    *time_features,
    *hex_history_features,
    *weather_features,
]

Create a dataframe of selected features.

In [None]:
# Keep every row, restricted to the model's feature columns (in model_features order).
X_all = df.loc[:, model_features]

In [None]:
# Inspect the dtype of each selected feature column.
X_all.dtypes

In [None]:
# Convert pandas nullable extension dtypes ('Int64' / 'Float64') in the feature
# columns to their plain NumPy equivalents.
#
# The conversion is done on `df`, and `X_all` is then rebuilt from it. `X_all`
# was created earlier as an independent selection of `df`, so converting `df`
# alone would leave the frame that is passed to the model with its original
# nullable dtypes.
#
# Dtypes are matched by name, so NumPy 'int64' / 'float64' columns are left
# untouched and the cell is safe to re-run.
# NOTE: casting an 'Int64' column that still contains missing values raises;
# such columns need to be imputed first.
nullable_to_numpy = {'Int64': 'int64', 'Float64': 'float64'}

for column, dtype in X_all.dtypes.items():
    numpy_dtype = nullable_to_numpy.get(str(dtype))
    if numpy_dtype is not None:
        df[column] = df[column].astype(numpy_dtype)

# Re-select the features so the converted dtypes reach the model input.
X_all = df[model_features]

In [None]:
# Number of missing values per feature column.
# Despite its name, `percent_missing` holds absolute counts, not percentages.
percent_missing = X_all.isna().sum()

missing_value_df = pd.DataFrame(
    {
        'column_name': X_all.columns,
        'number_missing': percent_missing,
    }
)

# Show the columns with the most missing values first.
display(missing_value_df.sort_values(by='number_missing', ascending=False))

## 3. Generate predictions on out-of-time data
#### 3.1 Load saved AutoGluon model

In [None]:
# Directory of the saved AutoGluon model, given as a path relative to the working directory.
load_path = 'agModels-final_model_updated'

# NOTE(review): loading a saved predictor presumably deserializes pickled
# artifacts — only load model directories from a trusted source; confirm.
predictor = TabularPredictor.load(load_path)

#### 3.2 Generate predictions across the out of time validation set

In [None]:
# Class-probability predictions for every row of the feature frame; the result
# is indexed by class label in a later cell.
predictions = predictor.predict_proba(X_all)

Add predictions from the positive class.

In [None]:
# Store the positive-class probability on the full frame. The positive label is
# looked up on the predictor instead of hard-coding 1, so the correct column is
# selected regardless of how the class labels were encoded at training time.
positive_label = predictor.positive_class
df['prediction'] = predictions[positive_label]

In [None]:
# Preview the first rows, now including the 'prediction' column.
df.head()

Calculate the mean prediction for each hexagon, collision hour group, and date.

In [None]:
# Average the row-level probabilities within each
# (collision hour group, collision date, hexagon) combination.
grouping_keys = ['collision_hour_grp', 'collision_date', 'hex_id']
df_predictions = (
    df.groupby(grouping_keys, as_index=False)['prediction']
    .mean()
)

#### 3.3 Calculate the relative probability
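The relative probability expresses how far a prediction lies above or below the average probability, as a fraction of that average: `(prediction - 0.2) / 0.2`, where 0.2 is the average probability. For example, a prediction of 0.3 has a relativity of 0.5 (50% above average).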

In [None]:
# Average probability used as the baseline for the relative measure.
average_probability = 0.2

# Deviation from the baseline, expressed as a fraction of the baseline.
deviation_from_average = df_predictions['prediction'] - average_probability
df_predictions['relativity'] = deviation_from_average / average_probability

In [None]:
# Check column dtypes of the aggregated frame before export.
df_predictions.dtypes

In [None]:
# Cast the collision date to a nanosecond-resolution datetime column.
# NOTE(review): presumably done so the exported parquet carries a timestamp type — confirm.
collision_dates = df_predictions['collision_date']
df_predictions['collision_date'] = collision_dates.astype('datetime64[ns]')

In [None]:
# Preview the aggregated predictions that will be uploaded.
df_predictions.head()

## 4. Upload to S3

In [None]:
# Write the aggregated predictions to S3 as a gzip-compressed parquet file,
# without the DataFrame index.
output_uri = f"s3://{s3_bucket}/power_bi/out_of_time_predictions.parquet"
df_predictions.to_parquet(output_uri, index=False, compression='GZIP')