In [1]:
from tensorflow.keras import backend as K
K.clear_session()

In [2]:
import os
os.chdir("/media/seconddrive/mta_stationing_problem")
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
import sys
import datetime as dt
import importlib
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf
import numpy as np
import pickle
import joblib
import matplotlib.pyplot as plt
import matplotlib as mpl
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, concatenate, GlobalAveragePooling1D
from tensorflow.keras.layers import LayerNormalization, MultiHeadAttention, Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.models import Model
import IPython
from copy import deepcopy
from tqdm import trange, tqdm

from src import tf_utils, config, data_utils, models, linklevel_utils

mpl.rcParams['figure.facecolor'] = 'white'

import warnings

import pandas as pd
import swifter
pd.set_option('display.max_columns', None)
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
tf.get_logger().setLevel('INFO')

In [4]:
importlib.reload(tf_utils)
importlib.reload(models)

<module 'src.models' from '/media/seconddrive/mta_stationing_problem/src/models.py'>

In [5]:
import pyspark
print(pyspark.__version__)

3.3.0


In [6]:
spark = SparkSession.builder.config('spark.executor.cores', '8').config('spark.executor.memory', '80g')\
        .config("spark.sql.session.timeZone", "UTC").config('spark.driver.memory', '40g').master("local[26]")\
        .appName("wego-daily").config('spark.driver.extraJavaOptions', '-Duser.timezone=UTC').config('spark.executor.extraJavaOptions', '-Duser.timezone=UTC')\
        .config("spark.sql.datetime.java8API.enabled", "true").config("spark.sql.execution.arrow.pyspark.enabled", "true")\
        .config("spark.sql.autoBroadcastJoinThreshold", -1)\
        .config("spark.driver.maxResultSize", 0)\
        .config("spark.shuffle.spill", "true")\
        .getOrCreate()

22/08/19 22:48:48 WARN Utils: Your hostname, scope-vanderbilt resolves to a loopback address: 127.0.1.1; using 10.2.218.69 instead (on interface enp8s0)
22/08/19 22:48:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/19 22:48:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
f = os.path.join('data', 'processed', 'apc_weather_gtfs.parquet')
apcdata = spark.read.load(f)

In [8]:
todelete = apcdata.filter('(load < 0) OR (load IS NULL)').select('transit_date','trip_id','overload_id').distinct()
todelete=todelete.withColumn('marker',F.lit(1))

#joining and whereever the records are not found in sync error table the marker will be null
apcdataafternegdelete=apcdata.join(todelete,on=['trip_id','transit_date','overload_id'],how='left').filter('marker is null').drop('marker')

In [9]:
apcdataafternegdelete = apcdataafternegdelete.sort(['trip_id', 'overload_id'])

In [10]:
apcdataafternegdelete.show(1, vertical=True, truncate=False)

22/08/19 22:48:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.




-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 trip_id                                | 193637                                                                                                                                                                                                                                                                                                                                                                                                                   

                                                                                

In [11]:
get_columns = ['trip_id', 'transit_date', 'arrival_time', 
               'block_abbr', 'stop_sequence', 'stop_id_original',
               'load', 
               'darksky_temperature', 
               'darksky_humidity', 
               'darksky_precipitation_probability', 
               'route_direction_name', 'route_id',
               'dayofweek',  'year', 'month', 'hour', 'zero_load_at_trip_end',
               'sched_hdwy']
get_str = ", ".join([c for c in get_columns])

apcdataafternegdelete.createOrReplaceTempView("apc")

# # filter subset
query = f"""
SELECT {get_str}
FROM apc
"""
print(query)

apcdataafternegdelete = spark.sql(query)
apcdataafternegdelete = apcdataafternegdelete.na.fill(value=0,subset=["zero_load_at_trip_end"])


SELECT trip_id, transit_date, arrival_time, block_abbr, stop_sequence, stop_id_original, load, darksky_temperature, darksky_humidity, darksky_precipitation_probability, route_direction_name, route_id, dayofweek, year, month, hour, zero_load_at_trip_end, sched_hdwy
FROM apc



In [12]:
df = apcdataafternegdelete.toPandas()
print(df.shape)
old_shape = df.shape[0]
df.head(1)

                                                                                

(16683167, 18)


Unnamed: 0,trip_id,transit_date,arrival_time,block_abbr,stop_sequence,stop_id_original,load,darksky_temperature,darksky_humidity,darksky_precipitation_probability,route_direction_name,route_id,dayofweek,year,month,hour,zero_load_at_trip_end,sched_hdwy
0,193637,2020-02-21,2020-02-21 23:38:48,1403,39,MEALYTNN,11.0,29.067,0.66,0.0,FROM DOWNTOWN,14,6,2020,2,23.0,0,3480.0


In [13]:
df = df[df.arrival_time.notna()]
df = df[df.sched_hdwy.notna()]
df = df[df.darksky_temperature.notna()]

df['route_id_dir'] = df["route_id"].astype("str") + "_" + df["route_direction_name"]
df['day'] = df["arrival_time"].dt.day
df = df.sort_values(by=['block_abbr', 'arrival_time']).reset_index(drop=True)

# Adding extra features
# Holidays
fp = os.path.join('data', 'others', 'US Holiday Dates (2004-2021).csv')
holidays_df = pd.read_csv(fp)
holidays_df['Date'] = pd.to_datetime(holidays_df['Date'])
holidays_df['is_holiday'] = True
df = df.merge(holidays_df[['Date', 'is_holiday']], left_on='transit_date', right_on='Date', how='left')
df['is_holiday'] = df['is_holiday'].fillna(False)
df = df.drop(columns=['Date'])
    
# School breaks
fp = os.path.join('data', 'others', 'School Breaks (2019-2022).pkl')
school_break_df = pd.read_pickle(fp)
school_break_df['is_school_break'] = True
df = df.merge(school_break_df[['Date', 'is_school_break']], left_on='transit_date', right_on='Date', how='left')
df['is_school_break'] = df['is_school_break'].fillna(False)
df = df.drop(columns=['Date'])

# Traffic
# Causes 3M data points to be lost
fp = os.path.join('data', 'traffic', 'triplevel_speed.pickle')
speed_df = pd.read_pickle(fp)
speed_df = speed_df.rename({'route_id_direction':'route_id_dir'}, axis=1)
speed_df = speed_df[['transit_date', 'trip_id', 'route_id_dir', 'traffic_speed']]
df = df.merge(speed_df, how='left', 
                left_on=['transit_date', 'trip_id', 'route_id_dir'], 
                right_on=['transit_date', 'trip_id', 'route_id_dir'])
# df = df[~df['traffic_speed'].isna()]
df['traffic_speed'].bfill(inplace=True)

In [14]:
old_shape - df.shape[0]

2242087

In [15]:
sorted_df = []
for ba in tqdm(df.block_abbr.unique()):
    ba_df = df[df['block_abbr'] == ba]
    end_stop = ba_df.stop_sequence.max()
    # Same result as creating a fixed_arrival_time (but faster)
    ba_df = ba_df[ba_df.stop_sequence != end_stop].reset_index(drop=True)
    sorted_df.append(ba_df)
        
overall_df = pd.concat(sorted_df)
drop_cols = ['route_direction_name', 'route_id', 'trip_id']
drop_cols = [col for col in drop_cols if col in overall_df.columns]
overall_df = overall_df.drop(drop_cols, axis=1)

# overall_df = overall_df.rename({"fixed_arrival_time": "arrival_time"}, axis=1)

100%|██████████| 227/227 [00:07<00:00, 31.11it/s]


In [16]:
TIMEWINDOW = 15
overall_df['minute'] = overall_df['arrival_time'].dt.minute
overall_df['minuteByWindow'] = overall_df['minute'] // TIMEWINDOW
overall_df['temp'] = overall_df['minuteByWindow'] + (overall_df['hour'] * 60 / TIMEWINDOW)
overall_df['time_window'] = np.floor(overall_df['temp']).astype('int')
overall_df = overall_df.drop(columns=['minute', 'minuteByWindow', 'temp'])

## Aggregate stops by time window

In [17]:
print(overall_df.shape)
overall_df.head(1)

(14371083, 21)


Unnamed: 0,transit_date,arrival_time,block_abbr,stop_sequence,stop_id_original,load,darksky_temperature,darksky_humidity,darksky_precipitation_probability,dayofweek,year,month,hour,zero_load_at_trip_end,sched_hdwy,route_id_dir,day,is_holiday,is_school_break,traffic_speed,time_window
0,2020-01-01,2020-01-01 10:01:17,300,1,MCC5_5,7.0,40.15,0.616,0.0,4,2020,1,10.0,0,4800.0,3_FROM DOWNTOWN,1,True,True,18.639604,40


In [18]:
# Group by time windows and get the maximum of the aggregate load/class/sched
# Get mean of temperature (mostly going to be equal)
# TODO: Double check this! 
overall_df = overall_df.groupby(['transit_date', 
                                 'route_id_dir', 
                                 'stop_id_original',
                                 'time_window']).agg({"block_abbr":"first",
                                                      "arrival_time":"first",
                                                      "year":"first", 
                                                      "month":"first",
                                                      "day": "first",
                                                      "hour":"first",
                                                      "is_holiday": "first",
                                                      "is_school_break":"first",
                                                      "dayofweek":"first",
                                                      "zero_load_at_trip_end":"first",
                                                      "stop_sequence":"first",
                                                      "darksky_temperature":"mean", 
                                                      "darksky_humidity":"mean",
                                                      "darksky_precipitation_probability": "mean",
                                                      "traffic_speed":"mean",
                                                      "sched_hdwy": "max",
                                                      "load": "sum" })
overall_df = overall_df.reset_index(level=[0,1,2,3])
overall_df = overall_df.sort_values(by=['block_abbr', 'arrival_time']).reset_index(drop=True)

In [19]:
print(overall_df.shape)
overall_df[100:120]

(13984203, 21)


Unnamed: 0,transit_date,route_id_dir,stop_id_original,time_window,block_abbr,arrival_time,year,month,day,hour,is_holiday,is_school_break,dayofweek,zero_load_at_trip_end,stop_sequence,darksky_temperature,darksky_humidity,darksky_precipitation_probability,traffic_speed,sched_hdwy,load
100,2020-01-01,3_FROM DOWNTOWN,WES18AWN,63,300,2020-01-01 15:46:14,2020,1,1,15.0,True,True,4,0,11,52.555,0.358,0.0,17.727612,2400.0,6.0
101,2020-01-01,3_FROM DOWNTOWN,WES19AWN,63,300,2020-01-01 15:46:26,2020,1,1,15.0,True,True,4,0,12,52.555,0.358,0.0,17.727612,2400.0,5.0
102,2020-01-01,3_FROM DOWNTOWN,WES20AWN,63,300,2020-01-01 15:46:58,2020,1,1,15.0,True,True,4,0,13,52.555,0.358,0.0,17.727612,2400.0,5.0
103,2020-01-01,3_FROM DOWNTOWN,WES21AWN,63,300,2020-01-01 15:47:12,2020,1,1,15.0,True,True,4,0,14,52.555,0.358,0.0,17.727612,2400.0,6.0
104,2020-01-01,3_FROM DOWNTOWN,WES23AWN,63,300,2020-01-01 15:48:24,2020,1,1,15.0,True,True,4,0,15,52.555,0.358,0.0,17.727612,2400.0,6.0
105,2020-01-01,3_FROM DOWNTOWN,WES24AWN,63,300,2020-01-01 15:48:28,2020,1,1,15.0,True,True,4,0,16,52.555,0.358,0.0,17.727612,2400.0,6.0
106,2020-01-01,3_FROM DOWNTOWN,WES25AWN,63,300,2020-01-01 15:48:50,2020,1,1,15.0,True,True,4,0,17,52.555,0.358,0.0,17.727612,2400.0,4.0
107,2020-01-01,3_FROM DOWNTOWN,WESNATWN,63,300,2020-01-01 15:49:18,2020,1,1,15.0,True,True,4,0,18,52.555,0.358,0.0,17.727612,2400.0,4.0
108,2020-01-01,3_FROM DOWNTOWN,WES27AWN,63,300,2020-01-01 15:49:30,2020,1,1,15.0,True,True,4,0,19,52.555,0.358,0.0,17.727612,2400.0,4.0
109,2020-01-01,3_FROM DOWNTOWN,WES29AWN,63,300,2020-01-01 15:49:42,2020,1,1,15.0,True,True,4,0,20,52.555,0.358,0.0,17.727612,2400.0,4.0


In [20]:
drop_cols = ['arrival_time', 'block_abbr']
drop_cols = [col for col in drop_cols if col in overall_df.columns]
overall_df = overall_df.drop(drop_cols, axis=1)

In [21]:
# checking bins of loads for possible classification problem
loads = overall_df[overall_df.load <= config.TARGET_MAX]['load']
percentiles = []
for cbin in config.CLASS_BINS:
    percentile = np.percentile(loads.values, cbin)
    percentiles.append(percentile)

# percentiles = [(percentiles[0], percentiles[1]), (percentiles[1] + 1, percentiles[2]), (percentiles[2] + 1, percentiles[3])]
percentiles = [(percentiles[0], percentiles[1]), (percentiles[1] + 1, percentiles[2]), (percentiles[2] + 1, 55.0), (56.0, 75.0), (76.0, 100.0)]
print(f"Percentiles: {percentiles}")
overall_df[config.TARGET_COLUMN_CLASSIFICATION] = overall_df['load'].apply(lambda x: data_utils.get_class(x, percentiles))
overall_df = overall_df[overall_df[config.TARGET_COLUMN_CLASSIFICATION].notna()]
overall_df.y_class.unique()

Percentiles: [(0.0, 6.0), (7.0, 12.0), (13.0, 55.0), (56.0, 75.0), (76.0, 100.0)]


array([1., 0., 2., 3., 4.])

In [22]:
overall_df.y_class.value_counts()

0.0    5088996
2.0    4517975
1.0    4293333
3.0      64488
4.0      15133
Name: y_class, dtype: int64

In [23]:
## Hyperparameters
past = 10 # Past stops observed
future = 1 # Future stops predicted
offset = 0

learning_rate = 1e-4
batch_size = 256
epochs = 200

feature_label = config.TARGET_COLUMN_CLASSIFICATION
patience = 10

hyperparams_dict = {'past': past,
                    'future': future,
                    'offset': offset,
                    'learning_rate': learning_rate,
                    'batch_size': batch_size,
                    'epochs': epochs,
                    'patience': patience}
hyperparams_dict

{'past': 10,
 'future': 1,
 'offset': 0,
 'learning_rate': 0.0001,
 'batch_size': 256,
 'epochs': 200,
 'patience': 10}

In [24]:
# target = config.TARGET_COLUMN_CLASSIFICATION
target = 'y_class'

num_columns = ['darksky_temperature', 'darksky_humidity', 'darksky_precipitation_probability', 'sched_hdwy', 'traffic_speed']
cat_columns = ['month', 'hour', 'day', 'stop_sequence', 'stop_id_original', 'year', 'time_window', target]
ohe_columns = ['dayofweek', 'route_id_dir', 'is_holiday', 'is_school_break', 'zero_load_at_trip_end']

columns = num_columns + cat_columns + ohe_columns
print(f"Numerical columns: {num_columns}")
print(f"Categorical columns: {cat_columns}")
print(f"One Hot Encode columns: {ohe_columns}")

Numerical columns: ['darksky_temperature', 'darksky_humidity', 'darksky_precipitation_probability', 'sched_hdwy', 'traffic_speed']
Categorical columns: ['month', 'hour', 'day', 'stop_sequence', 'stop_id_original', 'year', 'time_window', 'y_class']
One Hot Encode columns: ['dayofweek', 'route_id_dir', 'is_holiday', 'is_school_break', 'zero_load_at_trip_end']


In [25]:
overall_df.head(1)

Unnamed: 0,transit_date,route_id_dir,stop_id_original,time_window,year,month,day,hour,is_holiday,is_school_break,dayofweek,zero_load_at_trip_end,stop_sequence,darksky_temperature,darksky_humidity,darksky_precipitation_probability,traffic_speed,sched_hdwy,load,y_class
0,2020-01-01,3_FROM DOWNTOWN,MCC5_5,40,2020,1,1,10.0,True,True,4,0,1,40.15,0.616,0.0,18.639604,4800.0,7.0,1.0


In [26]:
overall_df.hour.unique(), overall_df.stop_sequence.unique()

(array([10., 11., 12., 15., 16., 17.,  6.,  7.,  8., 18.,  5.,  9., 13.,
        14., 19., 20.,  0., 21., 22., 23.,  4.,  1.]),
 array([ 1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,  2, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
        69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83],
       dtype=int32))

In [27]:
train_dates = ('2020-01-01', '2021-06-30')
val_dates =   ('2021-06-30', '2021-10-31')
test_dates =  ('2021-10-31', '2022-04-06')

ohe_encoder, label_encoder, num_scaler, train_df, val_df, test_df = linklevel_utils.prepare_linklevel(overall_df, 
                                                                                                 train_dates=train_dates, 
                                                                                                 val_dates=val_dates, 
                                                                                                 test_dates=test_dates,
                                                                                                 cat_columns=cat_columns,
                                                                                                 num_columns=num_columns,
                                                                                                 ohe_columns=ohe_columns,
                                                                                                 feature_label='y_class',
                                                                                                 time_feature_used='transit_date',
                                                                                                 scaler='minmax')

drop_cols = ['transit_date', 'load', 'arrival_time']
drop_cols = [col for col in drop_cols if col in train_df.columns]
train_df = train_df.drop(drop_cols, axis=1)
val_df = val_df.drop(drop_cols, axis=1)
test_df = test_df.drop(drop_cols, axis=1)

arrange_cols = [target] + [col for col in train_df.columns if col != target]
train_df = train_df[arrange_cols]
val_df = val_df[arrange_cols]
test_df = test_df[arrange_cols]

Train df:  (7749389, 102)
Val df:  (2710767, 102)
Test df:  (3563519, 102)
Columns to drop: ['transit_date', 'load', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofweek_7', 'route_id_dir_14_FROM DOWNTOWN', 'route_id_dir_14_TO DOWNTOWN', 'route_id_dir_17_FROM DOWNTOWN', 'route_id_dir_17_TO DOWNTOWN', 'route_id_dir_18_FROM DOWNTOWN', 'route_id_dir_18_TO DOWNTOWN', 'route_id_dir_19_FROM DOWNTOWN', 'route_id_dir_19_TO DOWNTOWN', 'route_id_dir_21_NORTHBOUND', 'route_id_dir_21_SOUTHBOUND', 'route_id_dir_22_FROM DOWNTOWN', 'route_id_dir_22_TO DOWNTOWN', 'route_id_dir_23_FROM DOWNTOWN', 'route_id_dir_23_TO DOWNTOWN', 'route_id_dir_24_FROM DOWNTOWN', 'route_id_dir_24_TO DOWNTOWN', 'route_id_dir_25_NORTHBOUND', 'route_id_dir_25_SOUTHBOUND', 'route_id_dir_28_FROM DOWNTOWN', 'route_id_dir_28_TO DOWNTOWN', 'route_id_dir_29_FROM DOWNTOWN', 'route_id_dir_29_TO DOWNTOWN', 'route_id_dir_34_FROM DOWNTOWN', 'route_id_dir_34_TO DOWNTOWN', 'route_id_dir_35_F

In [28]:
train_df['y_class'] = train_df.y_class.astype('int')
val_df['y_class']   = val_df.y_class.astype('int')
test_df['y_class']  = test_df.y_class.astype('int')

In [29]:
print(train_df.shape)
train_df.head(1)

(7749389, 100)


Unnamed: 0,y_class,stop_id_original,time_window,year,month,day,hour,stop_sequence,darksky_temperature,darksky_humidity,darksky_precipitation_probability,traffic_speed,sched_hdwy,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7,route_id_dir_14_FROM DOWNTOWN,route_id_dir_14_TO DOWNTOWN,route_id_dir_17_FROM DOWNTOWN,route_id_dir_17_TO DOWNTOWN,route_id_dir_18_FROM DOWNTOWN,route_id_dir_18_TO DOWNTOWN,route_id_dir_19_FROM DOWNTOWN,route_id_dir_19_TO DOWNTOWN,route_id_dir_21_NORTHBOUND,route_id_dir_21_SOUTHBOUND,route_id_dir_22_FROM DOWNTOWN,route_id_dir_22_TO DOWNTOWN,route_id_dir_23_FROM DOWNTOWN,route_id_dir_23_TO DOWNTOWN,route_id_dir_24_FROM DOWNTOWN,route_id_dir_24_TO DOWNTOWN,route_id_dir_25_NORTHBOUND,route_id_dir_25_SOUTHBOUND,route_id_dir_28_FROM DOWNTOWN,route_id_dir_28_TO DOWNTOWN,route_id_dir_29_FROM DOWNTOWN,route_id_dir_29_TO DOWNTOWN,route_id_dir_34_FROM DOWNTOWN,route_id_dir_34_TO DOWNTOWN,route_id_dir_35_FROM DOWNTOWN,route_id_dir_35_TO DOWNTOWN,route_id_dir_38_FROM DOWNTOWN,route_id_dir_38_TO DOWNTOWN,route_id_dir_3_FROM DOWNTOWN,route_id_dir_3_TO DOWNTOWN,route_id_dir_41_FROM DOWNTOWN,route_id_dir_41_TO DOWNTOWN,route_id_dir_42_FROM DOWNTOWN,route_id_dir_42_TO DOWNTOWN,route_id_dir_43_FROM DOWNTOWN,route_id_dir_43_TO DOWNTOWN,route_id_dir_4_FROM DOWNTOWN,route_id_dir_4_TO DOWNTOWN,route_id_dir_50_FROM DOWNTOWN,route_id_dir_50_TO DOWNTOWN,route_id_dir_52_FROM DOWNTOWN,route_id_dir_52_TO DOWNTOWN,route_id_dir_55_FROM DOWNTOWN,route_id_dir_55_TO DOWNTOWN,route_id_dir_56_FROM DOWNTOWN,route_id_dir_56_TO DOWNTOWN,route_id_dir_5_FROM DOWNTOWN,route_id_dir_5_TO DOWNTOWN,route_id_dir_64_FROM RIVERFRONT,route_id_dir_64_TO RIVERFRONT,route_id_dir_6_FROM DOWNTOWN,route_id_dir_6_TO DOWNTOWN,route_id_dir_72_EDMONDSON,route_id_dir_72_GRASSMERE,route_id_dir_75_NORTHBOUND,route_id_dir_75_SOUTHBOUND,route_id_dir_76_LOOP,route_id_dir_79_EASTBOUND,route_id_dir_79_NORTHBOUND,route_id_dir_7_FROM DOWNTOWN,route_id_dir_7_TO DOWNTOWN,route_id_dir_84_FROM NASHVILLE,route_id_dir_84_TO NASHVILLE,route_id_dir_86_FROM NASHVILLE,route_id_dir_86_TO NASHVILLE,route_id_dir_8_FROM DOWNTOWN,route_id_dir_8_TO DOWNTOWN,route_id_dir_93_LOOP,route_id_dir_94_FROM NASHVILLE,route_id_dir_95_FROM NASHVILLE,route_id_dir_96_FROM NASHVILLE,route_id_dir_96_TO NASHVILLE,route_id_dir_9_FROM DOWNTOWN,route_id_dir_9_TO DOWNTOWN,is_holiday_False,is_holiday_True,is_school_break_False,is_school_break_True,zero_load_at_trip_end_0,zero_load_at_trip_end_1
0,1,1362,30,0,0,0,8,0,0.364066,0.52,0.0,0.238351,0.08342,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [30]:
## Saving encoders, scalers and column arrangement
fp = os.path.join('models', 'same_day', 'LL_OHE_encoder.joblib')
joblib.dump(ohe_encoder, fp)
fp = os.path.join('models', 'same_day', 'LL_Label_encoders.joblib')
joblib.dump(label_encoder, fp)
fp = os.path.join('models', 'same_day', 'LL_Num_scaler.joblib')
joblib.dump(num_scaler, fp)
fp = os.path.join('models', 'same_day', 'LL_X_columns.joblib')
joblib.dump(train_df.columns, fp)

['models/same_day/LL_X_columns.joblib']

In [31]:
# Can add shuffle in the future
@tf.autograph.experimental.do_not_convert
def timeseries_dataset_from_dataset(df, feature_slice, label_slice, input_sequence_length, output_sequence_length, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices(df.values)
    ds = dataset.window(input_sequence_length + output_sequence_length, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda x: x).batch(input_sequence_length + output_sequence_length)
     
    def split_feature_label(x):
        return x[:input_sequence_length:, feature_slice], x[input_sequence_length:,label_slice]
     
    ds = ds.map(split_feature_label)
     
    return ds.batch(batch_size)


In [32]:
label_index = train_df.columns.tolist().index(target)
print(label_index)

0


In [33]:
label_slice = slice(label_index, label_index + 1, None) # which column the label/labels are
feature_slice = slice(None, None, None) # Which feature columns are included, by default includes all (even label)
input_sequence_length = past # number of past information to look at
output_sequence_length = future # number of time steps to predict

dataset_train = timeseries_dataset_from_dataset(train_df, 
                                                feature_slice=feature_slice,
                                                label_slice=label_slice,
                                                input_sequence_length=input_sequence_length, 
                                                output_sequence_length=output_sequence_length, 
                                                batch_size=batch_size)

dataset_val = timeseries_dataset_from_dataset(val_df, 
                                              feature_slice=feature_slice,
                                              label_slice=label_slice,
                                              input_sequence_length=input_sequence_length, 
                                              output_sequence_length=output_sequence_length, 
                                              batch_size=batch_size)

dataset_test = timeseries_dataset_from_dataset(test_df,
                                               feature_slice=feature_slice,
                                               label_slice=label_slice,
                                               input_sequence_length=input_sequence_length, 
                                               output_sequence_length=output_sequence_length, 
                                               batch_size=batch_size)
for batch in dataset_train.take(1):
    (x, y) = batch
    display(pd.DataFrame(x[100], columns=train_df.columns))
    print(x[100].shape)
    display(pd.DataFrame(y[100], columns=['y_class']))

2022-08-19 18:03:03.992007: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2022-08-19 18:03:03.992051: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: scope-vanderbilt
2022-08-19 18:03:03.992059: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: scope-vanderbilt
2022-08-19 18:03:03.996547: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.65.1
2022-08-19 18:03:03.996580: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.48.7
2022-08-19 18:03:03.996587: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 515.48.7 does not match DSO version 515.65.1 -- cannot find working devices in this configuration
2022-08-19 18:03:03.998423: I tensorflow/core/platform/cpu_feature_guard.cc:151]

Unnamed: 0,y_class,stop_id_original,time_window,year,month,day,hour,stop_sequence,darksky_temperature,darksky_humidity,darksky_precipitation_probability,traffic_speed,sched_hdwy,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7,route_id_dir_14_FROM DOWNTOWN,route_id_dir_14_TO DOWNTOWN,route_id_dir_17_FROM DOWNTOWN,route_id_dir_17_TO DOWNTOWN,route_id_dir_18_FROM DOWNTOWN,route_id_dir_18_TO DOWNTOWN,route_id_dir_19_FROM DOWNTOWN,route_id_dir_19_TO DOWNTOWN,route_id_dir_21_NORTHBOUND,route_id_dir_21_SOUTHBOUND,route_id_dir_22_FROM DOWNTOWN,route_id_dir_22_TO DOWNTOWN,route_id_dir_23_FROM DOWNTOWN,route_id_dir_23_TO DOWNTOWN,route_id_dir_24_FROM DOWNTOWN,route_id_dir_24_TO DOWNTOWN,route_id_dir_25_NORTHBOUND,route_id_dir_25_SOUTHBOUND,route_id_dir_28_FROM DOWNTOWN,route_id_dir_28_TO DOWNTOWN,route_id_dir_29_FROM DOWNTOWN,route_id_dir_29_TO DOWNTOWN,route_id_dir_34_FROM DOWNTOWN,route_id_dir_34_TO DOWNTOWN,route_id_dir_35_FROM DOWNTOWN,route_id_dir_35_TO DOWNTOWN,route_id_dir_38_FROM DOWNTOWN,route_id_dir_38_TO DOWNTOWN,route_id_dir_3_FROM DOWNTOWN,route_id_dir_3_TO DOWNTOWN,route_id_dir_41_FROM DOWNTOWN,route_id_dir_41_TO DOWNTOWN,route_id_dir_42_FROM DOWNTOWN,route_id_dir_42_TO DOWNTOWN,route_id_dir_43_FROM DOWNTOWN,route_id_dir_43_TO DOWNTOWN,route_id_dir_4_FROM DOWNTOWN,route_id_dir_4_TO DOWNTOWN,route_id_dir_50_FROM DOWNTOWN,route_id_dir_50_TO DOWNTOWN,route_id_dir_52_FROM DOWNTOWN,route_id_dir_52_TO DOWNTOWN,route_id_dir_55_FROM DOWNTOWN,route_id_dir_55_TO DOWNTOWN,route_id_dir_56_FROM DOWNTOWN,route_id_dir_56_TO DOWNTOWN,route_id_dir_5_FROM DOWNTOWN,route_id_dir_5_TO DOWNTOWN,route_id_dir_64_FROM RIVERFRONT,route_id_dir_64_TO RIVERFRONT,route_id_dir_6_FROM DOWNTOWN,route_id_dir_6_TO DOWNTOWN,route_id_dir_72_EDMONDSON,route_id_dir_72_GRASSMERE,route_id_dir_75_NORTHBOUND,route_id_dir_75_SOUTHBOUND,route_id_dir_76_LOOP,route_id_dir_79_EASTBOUND,route_id_dir_79_NORTHBOUND,route_id_dir_7_FROM DOWNTOWN,route_id_dir_7_TO DOWNTOWN,route_id_dir_84_FROM NASHVILLE,route_id_dir_84_TO NASHVILLE,route_id_dir_86_FROM NASHVILLE,route_id_dir_86_TO NASHVILLE,route_id_dir_8_FROM DOWNTOWN,route_id_dir_8_TO DOWNTOWN,route_id_dir_93_LOOP,route_id_dir_94_FROM NASHVILLE,route_id_dir_95_FROM NASHVILLE,route_id_dir_96_FROM NASHVILLE,route_id_dir_96_TO NASHVILLE,route_id_dir_9_FROM DOWNTOWN,route_id_dir_9_TO DOWNTOWN,is_holiday_False,is_holiday_True,is_school_break_False,is_school_break_True,zero_load_at_trip_end_0,zero_load_at_trip_end_1
0,0.0,1916.0,53.0,0.0,0.0,0.0,13.0,10.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,1918.0,53.0,0.0,0.0,0.0,13.0,11.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,1920.0,53.0,0.0,0.0,0.0,13.0,12.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.0,1922.0,53.0,0.0,0.0,0.0,13.0,13.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.0,1924.0,53.0,0.0,0.0,0.0,13.0,14.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
5,0.0,1926.0,53.0,0.0,0.0,0.0,13.0,15.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
6,0.0,1928.0,53.0,0.0,0.0,0.0,13.0,16.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
7,0.0,1957.0,53.0,0.0,0.0,0.0,13.0,17.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
8,0.0,1929.0,53.0,0.0,0.0,0.0,13.0,18.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
9,0.0,1932.0,53.0,0.0,0.0,0.0,13.0,19.0,0.508382,0.1975,0.0,0.214028,0.04171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


(10, 100)


Unnamed: 0,y_class
0,0.0


In [34]:
num_classes = len(train_df.y_class.unique())
num_classes

5

In [35]:
# define model
model = tf.keras.Sequential()
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# compile model
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=["sparse_categorical_accuracy"],
)
# model.compile(
#     loss="mean_absolute_error",
#     optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
#     metrics=tf.keras.metrics.MeanSquaredError(),
# )

input_shape = (None, None, len(train_df.columns))
model.build(input_shape)
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, None, 256)         365568    
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 5)                 3

In [36]:
from tensorflow.keras import backend as K
K.clear_session()

In [38]:
# checkpoint_filepath = 'models/same_day/model/CLA_cp-epoch{epoch:02d}-loss{val_loss:.2f}.ckpt'
checkpoint_filepath = 'models/same_day/school_zero_load/CLA_cp-epoch{epoch:02d}-loss{val_loss:.2f}.ckpt'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [39]:
# fit model
callbacks = [keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True), model_checkpoint_callback]

history = model.fit(dataset_train, validation_data=dataset_val, epochs=epochs, callbacks=callbacks, verbose=1)

2022-08-19 18:07:39.376700: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 6199511200 exceeds 10% of free system memory.


Epoch 1/200


In [None]:
# plt.plot(history)

## Inference
* Load model and encoders,scalers,converters

In [29]:
import datetime as dt

In [30]:

TIMEWINDOW = 15

In [31]:
# Load model
latest = tf.train.latest_checkpoint('models/same_day/model')
columns = joblib.load('models/same_day/LL_X_columns.joblib')
label_encoders = joblib.load('models/same_day/LL_Label_encoders.joblib')
ohe_encoder = joblib.load('models/same_day/LL_OHE_encoder.joblib')
num_scaler = joblib.load('models/same_day/LL_Num_scaler.joblib')

In [32]:
def get_apc_data_for_date(filter_date):
    print("Running this...")
    filepath = os.path.join('data', 'processed', 'apc_weather_gtfs.parquet')
    apcdata = spark.read.load(filepath)
    apcdata.createOrReplaceTempView("apc")

    plot_date = filter_date.strftime('%Y-%m-%d')
    get_columns = ['trip_id', 'transit_date', 'arrival_time', 'vehicle_id', 'ons',
                   'block_abbr', 'stop_sequence', 'stop_name', 'stop_id_original',
                   'load', 
                   'darksky_temperature', 
                   'darksky_humidity', 
                   'darksky_precipitation_probability', 
                   'route_direction_name', 'route_id', 'gtfs_direction_id',
                   'dayofweek',  'year', 'month', 'hour',
                   'sched_hdwy']
    get_str = ", ".join([c for c in get_columns])
    query = f"""
    SELECT {get_str}
    FROM apc
    WHERE (transit_date == '{plot_date}')
    ORDER BY arrival_time
    """
    apcdata = spark.sql(query)
    apcdata = apcdata.withColumn("route_id_dir", F.concat_ws("_", apcdata.route_id, apcdata.route_direction_name))
    apcdata = apcdata.withColumn("day", F.dayofmonth(apcdata.arrival_time))
    apcdata = apcdata.drop("route_direction_name")
    apcdata = apcdata.withColumn("load", F.when(apcdata.load < 0, 0).otherwise(apcdata.load))
    return apcdata

In [38]:
def prepare_input_data(input_df, ohe_encoder, label_encoders, num_scaler, columns, keep_columns=[], target='y_class'):
    num_columns = ['darksky_temperature', 'darksky_humidity', 'darksky_precipitation_probability', 'sched_hdwy', 'traffic_speed']
    cat_columns = ['month', 'hour', 'day', 'stop_sequence', 'stop_id_original', 'year', 'time_window']
    ohe_columns = ['dayofweek', 'route_id_dir', 'is_holiday']

    # OHE
    input_df[ohe_encoder.get_feature_names_out()] = ohe_encoder.transform(input_df[ohe_columns]).toarray()
    # input_df = input_df.drop(columns=ohe_columns)

    # Label encoder
    for cat in cat_columns:
        encoder = label_encoders[cat]
        input_df[cat] = encoder.transform(input_df[cat])
    
    # Num scaler
    input_df[num_columns] = num_scaler.transform(input_df[num_columns])
    input_df['y_class']  = input_df.y_class.astype('int')

    if keep_columns:
        columns = keep_columns + columns
    # Rearrange columns
    input_df = input_df[columns]
    
    return input_df

def assign_data_to_bins(df, TARGET='load'):
    bins = pd.IntervalIndex.from_tuples([(-1, 6.0), (7.0, 12.0), (13.0, 55.0), (56.0, 75.0), (76.0, 100.0)])
    mycut = pd.cut(df[TARGET].tolist(), bins=bins)
    df['y_class'] = mycut.codes
    return df

In [34]:
date_to_predict = dt.date(2021, 10, 18)
apcdata = get_apc_data_for_date(date_to_predict)

Running this...


In [47]:
df = apcdata.toPandas()
df = df[df.arrival_time.notna()]
df = df[df.sched_hdwy.notna()]
df = df[df.darksky_temperature.notna()]

df['day'] = df["arrival_time"].dt.day
df = df.sort_values(by=['block_abbr', 'arrival_time']).reset_index(drop=True)

# Adding extra features
# Holidays
fp = os.path.join('data', 'others', 'US Holiday Dates (2004-2021).csv')
holidays_df = pd.read_csv(fp)
holidays_df['Date'] = pd.to_datetime(holidays_df['Date'])
holidays_df['is_holiday'] = True
df = df.merge(holidays_df[['Date', 'is_holiday']], left_on='transit_date', right_on='Date', how='left')
df['is_holiday'] = df['is_holiday'].fillna(False)
df = df.drop(columns=['Date'])

# Traffic
# Causes 3M data points to be lost
fp = os.path.join('data', 'traffic', 'triplevel_speed.pickle')
speed_df = pd.read_pickle(fp)
speed_df = speed_df.rename({'route_id_direction':'route_id_dir'}, axis=1)
speed_df = speed_df[['transit_date', 'trip_id', 'route_id_dir', 'traffic_speed']]
df = df.merge(speed_df, how='left', 
                left_on=['transit_date', 'trip_id', 'route_id_dir'], 
                right_on=['transit_date', 'trip_id', 'route_id_dir'])
# df = df[~df['traffic_speed'].isna()]
df['traffic_speed'].bfill(inplace=True)

df['minute'] = df['arrival_time'].dt.minute
df['minuteByWindow'] = df['minute'] // TIMEWINDOW
df['temp'] = df['minuteByWindow'] + (df['hour'] * 60 / TIMEWINDOW)
df['time_window'] = np.floor(df['temp']).astype('int')
df = df.drop(columns=['minute', 'minuteByWindow', 'temp'])

# HACK
df = df[df['hour'] != 3]
df = df[df['stop_sequence'] != 0]

df = df.sort_values(by=['block_abbr', 'arrival_time']).reset_index(drop=True)

df = assign_data_to_bins(df, TARGET='load')

                                                                                

In [48]:
input_df = prepare_input_data(df, ohe_encoder, label_encoders, num_scaler, columns, target='y_class')

month
hour
day
stop_sequence
stop_id_original
year
time_window


In [55]:
def generate_simple_lstm_predictions(input_df, model, past, future):
    past_df = input_df[0:past]
    future_df = input_df[past:]
    predictions = []
    for f in range(future):
        pred = model.predict(past_df.to_numpy().reshape(1, *past_df.shape))
        y_pred = np.argmax(pred)
        predictions.append(y_pred)
        
        # Add information from future
        last_row = future_df.iloc[[0]]
        last_row['y_class'] = y_pred
        past_df = pd.concat([past_df[1:], last_row])
        
        # Move future to remove used row
        future_df = future_df[1:]
    return predictions

In [56]:
tdf = input_df[0:20]
model = linklevel_utils.setup_simple_lstm_generator(input_df.shape[1], 5)
model.load_weights(latest)
y_pred = generate_simple_lstm_predictions(tdf, model, 10, 10)

In [57]:
y_pred

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [2]:
import pandas as pd
import os

# fp = os.path.join('../models/same_day/evaluation/SIMPLE_LSTM_multi_stop_5P_xF_results.pkl')
fp = os.path.join('../models/same_day/evaluation/baseline_multi_stop_10P_xF_results.pkl')
df = pd.read_pickle(fp)
df

Unnamed: 0,trip_id,y_true,y_pred
0,268878_2022-04-04,1.0,
1,268878_2022-04-04,1.0,2.0
2,268878_2022-04-04,1.0,2.0
3,268878_2022-04-04,1.0,2.0
4,268878_2022-04-04,1.0,2.0
5,268878_2022-04-04,1.0,2.0
6,268878_2022-04-04,1.0,2.0
7,268878_2022-04-04,0.0,2.0
8,268878_2022-04-04,0.0,2.0
9,268878_2022-04-04,0.0,2.0
