In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import sys
sys.path.insert(0,'..')
from copy import deepcopy
import numpy as np
from pprint import pprint
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from src import data_utils, triplevel_utils
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
from pandas.core.common import SettingWithCopyWarning
import datetime as dt
import swifter
import xgboost as xgb
from keras import backend as K 
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, concatenate, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
import os
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

import pandas as pd
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
from pandas import MultiIndex, Int64Index
from xgboost import XGBClassifier

In [3]:
spark = SparkSession.builder.config('spark.executor.cores', '8').config('spark.executor.memory', '40g')\
        .config("spark.sql.session.timeZone", "UTC").config('spark.driver.memory', '20g').master("local[26]")\
        .appName("wego-daily").config('spark.driver.extraJavaOptions', '-Duser.timezone=UTC').config('spark.executor.extraJavaOptions', '-Duser.timezone=UTC')\
        .config("spark.sql.datetime.java8API.enabled", "true").config("spark.sql.execution.arrow.pyspark.enabled", "true")\
        .getOrCreate()

22/09/15 15:52:11 WARN Utils: Your hostname, scope-vanderbilt resolves to a loopback address: 127.0.1.1; using 10.2.218.69 instead (on interface enp8s0)
22/09/15 15:52:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/15 15:52:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/15 15:52:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# "Utils"
def get_time_window(row, window):
    minute = row.arrival_time.minute
    minuteByWindow = minute//window
    temp = minuteByWindow + (row.hour * (60/window))
    return round(temp, 2)

class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [5]:
# load the APC data from a prepared file
processed_triplevel = os.path.join('../data', 'processed', 'triplevel_df.parquet')
if not os.path.exists(processed_triplevel):
# if True:
    filepath = os.path.join(os.getcwd(), "../data", "processed", "apc_weather_gtfs.parquet")
    apcdata = spark.read.load(filepath)
    apcdata.createOrReplaceTempView("apc")

    # filter subset
    query = f"""
                SELECT *
                FROM apc
            """
    apcdata=spark.sql(query)
    apcdata = data_utils.remove_nulls_from_apc(apcdata)
    apcdata.createOrReplaceTempView('apcdata')
    apcdata_per_trip = data_utils.get_apc_per_trip_sparkview(spark)
    df = apcdata_per_trip.toPandas()
    
    # Adding extra features
    # Holidays
    fp = os.path.join('../data', 'others', 'US Holiday Dates (2004-2021).csv')
    holidays_df = pd.read_csv(fp)
    holidays_df['Date'] = pd.to_datetime(holidays_df['Date'])
    holidays_df['is_holiday'] = True
    df = df.merge(holidays_df[['Date', 'is_holiday']], left_on='transit_date', right_on='Date', how='left')
    df['is_holiday'] = df['is_holiday'].fillna(False)
    df = df.drop(columns=['Date'])
    
    # School breaks
    fp = os.path.join('../data', 'others', 'School Breaks (2019-2022).pkl')
    school_break_df = pd.read_pickle(fp)
    school_break_df['is_school_break'] = True
    df = df.merge(school_break_df[['Date', 'is_school_break']], left_on='transit_date', right_on='Date', how='left')
    df['is_school_break'] = df['is_school_break'].fillna(False)
    df = df.drop(columns=['Date'])

    # Traffic
    fp = os.path.join('../data', 'traffic', 'triplevel_speed.pickle')
    speed_df = pd.read_pickle(fp)
    speed_df = speed_df[['transit_date', 'trip_id', 'route_id_direction', 'traffic_speed']]
    df = df.merge(speed_df, how='left', 
                  left_on =['transit_date', 'trip_id', 'route_id_direction'], 
                  right_on=['transit_date', 'trip_id', 'route_id_direction'])
    df = df[~df['traffic_speed'].isna()]
    df.to_parquet(processed_triplevel, engine='auto', compression='gzip')
else:
    df = pd.read_parquet(processed_triplevel, engine='auto')
    df = df.dropna()
    # Removing time_window in case a different one will be used
df = df.drop(['time_window', 'load'], axis=1)
df = df.reset_index(drop=True)

                                                                                

22/09/15 15:52:21 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [7]:
cat_features = ['route_id_direction', 'is_holiday', 'dayofweek', 'is_school_break', 'time_window']
ord_features = ['year', 'month', 'hour', 'day']
# num_features = ['temperature', 'humidity', 'precipitation_intensity', 'scheduled_headway', 'traffic_speed']
num_features = ['temperature', 'humidity', 'precipitation_intensity', 'scheduled_headway']

RANDOM_SEED = 100
WINDOW = 30
PAST_TRIPS = 5
TARGET = 'y_reg100'
FOLDS = 3

learning_rate = 0.001
batch_size = 512
epochs = 200
patience = 10

In [8]:
tdf = deepcopy(df)

In [9]:
tdf = triplevel_utils.generate_new_features(tdf, time_window=WINDOW, past_trips=PAST_TRIPS, target=TARGET)

In [10]:
# Group by time windows and get the maximum of the aggregate load/class/sched
# Get mean of temperature (mostly going to be equal)
tdf = tdf.groupby(['transit_date', 'route_id_direction', 'time_window']).agg({"trip_id":"first",
                                                                              "year":"first", 
                                                                              "month":"first",
                                                                              "day": "first",
                                                                              "dayofweek":"first", 
                                                                              "hour":"first",
                                                                              "is_holiday": "first",
                                                                              "is_school_break": "first",
                                                                              "temperature":"mean", 
                                                                              "humidity":"mean",
                                                                              "precipitation_intensity": "mean",
                                                                              "scheduled_headway": "max",
                                                                              TARGET: "max"})
                                                                            #   "traffic_speed":"mean",
tdf = tdf.reset_index(level=[0,1,2])

In [11]:
print("ohe_encoder is for the following column order:", cat_features)

rf_df, ix_map, ohe_encoder, percentiles = triplevel_utils.prepare_df_for_training(tdf, cat_features, ord_features, target=TARGET)
rf_df, percentiles = triplevel_utils.adjust_bins(rf_df, TARGET=TARGET, percentiles=percentiles)

original_rf = deepcopy(rf_df)
original_rf['time_window'] = tdf['time_window']

drop_cols = ['time_window', 'route_id', 'route_direction_name', 'block_abbr', 'y_reg100', 'y_reg095', 'transit_date', 'is_holiday', 'route_id_direction', 'actual_headways', 'trip_id', 'arrival_time']
drop_cols = [col for col in drop_cols if col in rf_df.columns]
rf_df = rf_df.drop(drop_cols, axis=1)

percentiles

ohe_encoder is for the following column order: ['route_id_direction', 'is_holiday', 'dayofweek', 'is_school_break', 'time_window']


[(0.0, 9.0), (10.0, 16.0), (16.0, 55.0), (56.0, 75.0), (76.0, 100.0)]

In [12]:
# Drop informational columns
y = rf_df.pop('y_class')
X = rf_df

In [13]:
def reconstruct_original_data_from_RF(df, ix_map, ohe_encoder):
    df[cat_features] = ohe_encoder.inverse_transform(df.filter(regex='route_id_direction_|is_holiday_|dayofweek_|is_school_break_|time_window_'))
    
    for col in ord_features:
        inv_map = {v: k for k, v in ix_map[col].items()}
        df[col] = df[f"{col}_ix"].apply(lambda x: inv_map[x])
        
    df = df.drop(columns=df.filter(regex='route_id_direction_|is_holiday_|dayofweek_|is_school_break_|time_window_|_ix').columns, axis=1)
    return df

# Random Forest

In [14]:
sss = StratifiedShuffleSplit(n_splits=FOLDS, test_size=0.3, random_state=RANDOM_SEED)
sss.get_n_splits(X, y)

kfold = 0
results_df_arr = []
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
    model = RandomForestClassifier(n_estimators=100, max_depth=8)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    res_df = original_rf.iloc[test_index]
    res_df['y_pred'] = y_pred
    res_df['y_true'] = y_test
    res_df['kfold'] = kfold
    res_df = reconstruct_original_data_from_RF(res_df, ix_map, ohe_encoder)
    kfold = kfold + 1
    results_df_arr.append(res_df)
    
fp = os.path.join('../evaluation', 'any_day_comparisons', 'RF_raw_results.pkl')
pd.concat(results_df_arr).to_pickle(fp)

In [15]:
fp = os.path.join('../evaluation', 'any_day_comparisons', 'RF_raw_results.pkl')
rf_results = pd.read_pickle(fp)

# Xgboost

In [16]:
sss = StratifiedShuffleSplit(n_splits=FOLDS, test_size=0.3, random_state=RANDOM_SEED)
sss.get_n_splits(X, y)

objective = 'multi:softmax'
kfold = 0
results_df_arr = []
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = xgb.XGBClassifier(n_estimators=600,
                              max_depth=12,
                              use_label_encoder=False, 
                              objective=objective, 
                              eval_metric='mlogloss', 
                              num_class=len(y.unique()))
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    res_df = original_rf.iloc[test_index]
    res_df['y_pred'] = y_pred
    res_df['y_true'] = y_test
    res_df['kfold'] = kfold
    res_df = reconstruct_original_data_from_RF(res_df, ix_map, ohe_encoder)
    kfold = kfold + 1
    results_df_arr.append(res_df)
fp = os.path.join('../evaluation', 'any_day_comparisons', 'XGB_raw_results.pkl')
pd.concat(results_df_arr).to_pickle(fp)

In [None]:
fp = os.path.join('../evaluation', 'any_day_comparisons', 'XGB_raw_results.pkl')
xgb_results = pd.read_pickle(fp)

# MLP

In [39]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
tf.get_logger().setLevel('ERROR')

def create_MLP(X, y):
    # Create embeddings for categorical features
    number_of_unique_rids = X.filter(like='route_id_direction_').shape[1]
    number_of_unique_holidays = X.filter(like='is_holiday_').shape[1]
    number_of_unique_dow = X.filter(like='dayofweek_').shape[1]
    number_of_unique_break = X.filter(like='is_school_break_').shape[1]
    number_of_unique_time_window = X.filter(like='time_window_').shape[1]
    classes = len(y.unique())
    print(f"No. of classes: {classes}")

    numerical_input = Input(len(num_features) + len(ord_features), name='numerical_input')
    
    onehot_input1 = Input((number_of_unique_rids), name='ohe_rid')
    ohe_layer1 = Dense(64)(onehot_input1)
    
    onehot_input2 = Input((number_of_unique_holidays), name='ohe_holiday')
    ohe_layer2 = Dense(64)(onehot_input2)
    
    onehot_input3 = Input((number_of_unique_dow), name='ohe_dow')
    ohe_layer3 = Dense(64)(onehot_input3)
    
    onehot_input4 = Input((number_of_unique_break), name='ohe_break')
    ohe_layer4 = Dense(64)(onehot_input4)
    
    onehot_input5 = Input((number_of_unique_time_window), name='ohe_tw')
    ohe_layer5 = Dense(64)(onehot_input5)

    merged_input = concatenate([numerical_input, ohe_layer1, ohe_layer2, ohe_layer3, ohe_layer4, ohe_layer5], axis=1)
    x = Dense(64, activation='relu')(merged_input)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(32, activation='relu')(merged_input)
    output = Dense(classes, activation='softmax')(x)

    model = Model(inputs=[numerical_input, onehot_input1, onehot_input2, onehot_input3, onehot_input4, onehot_input5], outputs=output, name='Simple_NN')

    return model

In [40]:
batch_size = 128
epochs = 10

sss = StratifiedShuffleSplit(n_splits=FOLDS, test_size=0.3, random_state=RANDOM_SEED)
sss.get_n_splits(X, y)

kfold = 0
results_df_arr = []
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    scaler = MinMaxScaler()
    X_train[num_features] = scaler.fit_transform(X_train[num_features])
    X_test[num_features] = scaler.transform(X_test[num_features])
    
    model = create_MLP(X_train, y_train)
    es_callback = EarlyStopping(monitor="val_sparse_categorical_accuracy", min_delta=0, patience=patience)

    checkpoint_filepath = os.path.join('../evaluation', 'any_day_comparisons', 'CLA_cp-epoch{epoch:02d}-loss{val_loss:.2f}.ckpt')
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True)
    
    model.compile(loss=SparseCategoricalCrossentropy(), 
                optimizer=Adam(learning_rate=learning_rate), 
                metrics='sparse_categorical_accuracy')
    
    input_train_x = {}    
    input_train_x['ohe_rid'] = X_train.filter(like='route_id_direction_')
    input_train_x['ohe_holiday'] = X_train.filter(like='is_holiday_')
    input_train_x['ohe_dow'] = X_train.filter(like='dayofweek_')
    input_train_x['ohe_break'] = X_train.filter(like='is_school_break_')
    input_train_x['ohe_tw'] = X_train.filter(like='time_window_')
    input_train_x['numerical_input'] = X_train[num_features + [f"{o}_ix" for o in ord_features]]
    
    model.fit(x=input_train_x,
              y=y_train,
              epochs=epochs,
              batch_size=batch_size,
              validation_split=0.2,
              callbacks=[es_callback],
              verbose=1)
    
    input_test_x = {}
    input_test_x['ohe_rid'] = X_test.filter(like='route_id_direction_')
    input_test_x['ohe_holiday'] = X_test.filter(like='is_holiday_')
    input_test_x['ohe_dow'] = X_test.filter(like='dayofweek_')
    input_test_x['ohe_break'] = X_test.filter(like='is_school_break_')
    input_test_x['ohe_tw'] = X_test.filter(like='time_window_')
    input_test_x['numerical_input'] = X_test[num_features + [f"{o}_ix" for o in ord_features]]

    predictions = model.predict(x=input_test_x)
    y_pred = np.argmax(predictions, axis=1)
    res_df = original_rf.iloc[test_index]
    res_df['y_pred'] = y_pred
    res_df['y_true'] = y_test.to_numpy()
    res_df['kfold'] = kfold
    res_df = reconstruct_original_data_from_RF(res_df, ix_map, ohe_encoder)
    kfold = kfold + 1
    
    results_df_arr.append(res_df)
    K.clear_session()

fp = os.path.join('../evaluation', 'any_day_comparisons', 'MLP_raw_results.pkl')
pd.concat(results_df_arr).to_pickle(fp)

No. of classes: 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
No. of classes: 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
No. of classes: 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [41]:
fp = os.path.join('../evaluation', 'any_day_comparisons', 'MLP_raw_results.pkl')
mlp_results = pd.read_pickle(fp)
mlp_results.head()

Unnamed: 0,transit_date,trip_id,temperature,humidity,traffic_speed,precipitation_intensity,scheduled_headway,y_reg100,y_class,time_window,y_pred,y_true,kfold,route_id_direction,is_holiday,dayofweek,is_school_break,year,month,hour,day
144340,2021-01-18,231502,45.342601,0.5374,19.8893,0.0,5028.0,7.0,0,37,0,0,0,5_TO DOWNTOWN,True,2,False,2021,1,18,18
348350,2022-01-22,263828,24.513094,0.678635,19.617309,0.002347,4710.15625,41.0,2,22,2,2,0,56_TO DOWNTOWN,False,7,False,2022,1,11,22
221509,2021-06-24,242337,85.094709,0.544875,17.135672,0.0,1898.916667,13.0,1,22,0,1,0,4_TO DOWNTOWN,False,5,True,2021,6,11,24
189121,2021-04-28,242197,75.056664,0.851,17.565668,0.098147,1800.0,14.0,1,34,0,1,0,29_TO DOWNTOWN,False,4,False,2021,4,17,28
293491,2021-10-23,260851,69.902532,0.554235,17.608281,0.0,7200.0,10.0,1,33,0,1,0,28_TO DOWNTOWN,False,7,False,2021,10,16,23


# LSTM

In [21]:
tdf = deepcopy(df)
tdf = triplevel_utils.generate_new_features(tdf, time_window=WINDOW, past_trips=PAST_TRIPS, target=TARGET)
# Group by time windows and get the maximum of the aggregate load/class/sched
# Get mean of temperature (mostly going to be equal)
tdf = tdf.groupby(['transit_date', 'route_id_direction', 'time_window']).agg({"arrival_time":"first",
                                                                              "block_abbr":"first",
                                                                              "trip_id":"first",
                                                                              "year":"first", 
                                                                              "month":"first",
                                                                              "day": "first",
                                                                              "dayofweek":"first", 
                                                                              "hour":"first",
                                                                              "is_holiday": "first",
                                                                              "is_school_break": "first",
                                                                              "temperature":"mean", 
                                                                              "humidity":"mean",
                                                                              "precipitation_intensity": "mean",
                                                                              "scheduled_headway": "max",
                                                                              TARGET: "max"})
                                                                            #   "traffic_speed":"mean",
tdf = tdf.reset_index(level=[0,1,2])
tdf = tdf.sort_values(by=['block_abbr', 'arrival_time'])

In [22]:
print("ohe_encoder is for the following column order:", cat_features)

rf_df, ix_map, ohe_encoder, percentiles = triplevel_utils.prepare_df_for_training(tdf, cat_features, ord_features, target=TARGET)
rf_df, percentiles = triplevel_utils.adjust_bins(rf_df, TARGET=TARGET, percentiles=percentiles)

original_rf = deepcopy(rf_df)
original_rf['time_window'] = tdf['time_window']

drop_cols = ['time_window', 'route_id', 'route_direction_name', 'block_abbr', 'y_reg100', 'y_reg095', 'is_holiday', 'route_id_direction', 'actual_headways', 'trip_id', 'arrival_time']
drop_cols = [col for col in drop_cols if col in rf_df.columns]
rf_df = rf_df.drop(drop_cols, axis=1)

ohe_encoder is for the following column order: ['route_id_direction', 'is_holiday', 'dayofweek', 'is_school_break', 'time_window']


In [23]:
train_dates = ('2020-01-01', '2021-09-30')
val_dates =   ('2021-09-30', '2021-11-30')
test_dates =  ('2021-11-30', '2022-04-06')

train = rf_df[(rf_df['transit_date'] >= train_dates[0]) &\
            (rf_df['transit_date'] < train_dates[1])]
val   = rf_df[(rf_df['transit_date'] >= val_dates[0]) &\
            (rf_df['transit_date'] < val_dates[1])]
test  = rf_df[(rf_df['transit_date'] >= test_dates[0]) &\
            (rf_df['transit_date'] <= test_dates[1])]

scaler = MinMaxScaler()
train[num_features] = scaler.fit_transform(train[num_features])
val[num_features] = scaler.transform(val[num_features])
test[num_features] = scaler.transform(test[num_features])

drop_cols = ['transit_date']
train = train.drop(columns=drop_cols, axis=1)
val = val.drop(columns=drop_cols, axis=1)
test = test.drop(columns=drop_cols, axis=1)

In [24]:
@tf.autograph.experimental.do_not_convert
def timeseries_dataset_from_dataset(df, label_slice, input_sequence_length, output_sequence_length, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices(df.values)
    ds = dataset.window(input_sequence_length + output_sequence_length, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda x: x).batch(input_sequence_length + output_sequence_length)
    def split_feature_label(x):
        return x[:input_sequence_length], x[input_sequence_length:,label_slice]
    ds = ds.map(split_feature_label)
    return ds.batch(batch_size)

num_of_classes = len(y.unique())
def create_LSTM(n_lstm_units, n_timesteps, n_features, num_of_classes=num_of_classes):
    model = tf.keras.Sequential()
    model.add(LSTM(n_lstm_units, input_shape=(n_timesteps, n_features), return_sequences=False))
    model.add(Dense(32))
    model.add(Dense(num_of_classes, activation='softmax'))
    return model

In [27]:
input_sequence_length = 5
output_sequence_length = 1
learning_rate = 0.001
batch_size = 256
epochs = 10
patience = 3

# label_slice = slice(5, 6, None)
label_index = train.columns.tolist().index('y_class')
print("Label index:", label_index)
label_slice = slice(label_index, label_index + 1, None) # which column the label/labels are

dataset_train = timeseries_dataset_from_dataset(train, label_slice, 
                                                input_sequence_length=input_sequence_length,
                                                output_sequence_length=output_sequence_length,
                                                batch_size=batch_size)
dataset_val  = timeseries_dataset_from_dataset(val, label_slice, 
                                                input_sequence_length=input_sequence_length,
                                                output_sequence_length=output_sequence_length,
                                                batch_size=batch_size)
dataset_test  = timeseries_dataset_from_dataset(test, label_slice, 
                                                input_sequence_length=input_sequence_length,
                                                output_sequence_length=output_sequence_length,
                                                batch_size=batch_size)

Label index: 5


In [28]:
results_df_arr = []
for n_lstm_units in [64, 128, 256]:
    number_of_features = len(train.columns)
    model = create_LSTM(n_lstm_units, input_sequence_length, number_of_features)

    model.compile(loss=SparseCategoricalCrossentropy(), 
                    optimizer=Adam(learning_rate=learning_rate), 
                    metrics=SparseCategoricalAccuracy())

    es_callback = EarlyStopping(monitor="val_sparse_categorical_accuracy",
                                min_delta=0.1, patience=patience)
    model.fit(dataset_train,
              epochs=epochs,
              validation_data=dataset_val,
              callbacks=[es_callback],
              verbose=1)

    predictions = model.predict(dataset_test)
    y_pred = np.argmax(predictions, axis=1)
    
    res_df = original_rf.loc[test.index][5:]
    res_df['y_pred'] = y_pred
    res_df['y_true'] = test[input_sequence_length:]['y_class'].to_numpy()
    res_df['n_lstm_units'] = n_lstm_units
    res_df = reconstruct_original_data_from_RF(res_df, ix_map, ohe_encoder)
    
    results_df_arr.append(res_df)
    # model.reset_states()
    K.clear_session()

fp = os.path.join('../evaluation', 'any_day_comparisons', 'LSTM_raw_results.pkl')
pd.concat(results_df_arr).to_pickle(fp)

Epoch 1/20
     10/Unknown - 1s 6ms/step - loss: 1.4109 - sparse_categorical_accuracy: 0.3086

2022-09-14 15:26:33.904644: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8401


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


In [29]:
fp = os.path.join('../evaluation', 'any_day_comparisons', 'LSTM_raw_results.pkl')
lstm_results = pd.read_pickle(fp)
lstm_results.shape

(151533, 23)

# Baseline

In [47]:
def get_statistical_prediction(row, df, percentile, lookback_duration, TARGET='y_reg100'):
    trip_id = row.trip_id
    transit_date = row.transit_date
    route_id_direction = row.route_id_direction
    lookback_date = transit_date - pd.Timedelta(lookback_duration)
    tdf = df[(df['transit_date'] >= lookback_date) & \
             (df['transit_date'] < transit_date)]
    tdf = tdf[(tdf['trip_id'] == trip_id) & \
              (tdf['route_id_direction'] == route_id_direction)]
    if tdf.empty:
        return -1
    return np.percentile(tdf[TARGET].to_numpy(), percentile)

processed_triplevel = os.path.join('../data', 'processed', 'triplevel_df.parquet')
df = pd.read_parquet(processed_triplevel, engine='auto')
df = df.dropna()
df = df.drop(['time_window', 'load'], axis=1)
df = df.reset_index(drop=True)
df = df.sort_values(['block_abbr', 'transit_date', 'arrival_time', 'route_id_direction'])

percentiles = [(0.0, 9.0), (10.0, 16.0), (17.0, 55.0), (56.0, 75.0), (76.0, 100.0)]
df['y_class'] = df[TARGET].swifter.apply(lambda x: data_utils.get_class(x, percentiles))
df['y_class'] = df['y_class'].astype('int')

df['minute'] = df['arrival_time'].dt.minute
df['minuteByWindow'] = df['minute'] // WINDOW
df['temp'] = df['minuteByWindow'] + (df['hour'] * 60 / WINDOW)
df['time_window'] = np.floor(df['temp']).astype('int')
df = df.drop(columns=['minute', 'minuteByWindow', 'temp'], axis=1)

skf = StratifiedKFold(n_splits=FOLDS, random_state=RANDOM_SEED, shuffle=True)
X, y = df[['transit_date', 'trip_id', 'arrival_time', 'route_id_direction', 'time_window', TARGET]], df['y_class']
skf.get_n_splits(X, y)

lookback_distances = ['4W', '2W', '1W']
percentile = 1.0
results_df_arr = []
for _, test_index in skf.split(X, y):
    for lookback_distance in lookback_distances:
        baseline_X = X.iloc[test_index]
        baseline_Y = y.iloc[test_index]
        
        y_pred = baseline_X.swifter.apply(lambda x: get_statistical_prediction(x, df, percentile, lookback_distance, TARGET=TARGET), axis=1)
        y_true = baseline_Y.to_numpy()
        # res_df = deepcopy(X.loc[test_index])
        res_df = deepcopy(baseline_X)
        res_df['y_pred'] = y_pred.to_numpy()
        res_df['y_true'] = y_true
        # res_df = reconstruct_original_data_from_RF(res_df, ix_map, ohe_encoder)
        results_df_arr.append(res_df)
    break

results_df_arr[0]['y_pred_class'] = results_df_arr[0]['y_pred'].apply(lambda x: data_utils.get_class(x, percentiles))
results_df_arr[1]['y_pred_class'] = results_df_arr[1]['y_pred'].apply(lambda x: data_utils.get_class(x, percentiles))
results_df_arr[2]['y_pred_class'] = results_df_arr[2]['y_pred'].apply(lambda x: data_utils.get_class(x, percentiles))
df1 = results_df_arr[0].dropna(subset=['y_pred_class'])
df2 = results_df_arr[1].dropna(subset=['y_pred_class'])
df3 = results_df_arr[2].dropna(subset=['y_pred_class'])
df1 = df1.rename(columns={'y_pred_class': 'y_pred'})
df2 = df2.rename(columns={'y_pred_class': 'y_pred'})
df3 = df3.rename(columns={'y_pred_class': 'y_pred'})
df1['past'] = 1
df2['past'] = 2
df3['past'] = 4

fp = os.path.join('../evaluation', 'any_day_comparisons', 'BASELINE_raw_results.pkl')
pd.concat([df1, df2, df3]).to_pickle(fp)


Pandas Apply:   0%|          | 0/430404 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/143468 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/143468 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/143468 [00:00<?, ?it/s]

In [48]:
fp = os.path.join('../evaluation', 'any_day_comparisons', 'BASELINE_raw_results.pkl')
baseline_results = pd.read_pickle(fp)
baseline_results.head()

Unnamed: 0,transit_date,trip_id,arrival_time,route_id_direction,time_window,y_reg100,y_pred,y_true,y_pred.1,past
301336,2020-01-03,195743,2020-01-03 07:03:16,3_FROM DOWNTOWN,14,18.0,14.0,2,1.0,1
44946,2020-01-14,195729,2020-01-14 13:06:16,3_FROM DOWNTOWN,26,24.0,20.0,2,2.0,1
241070,2020-01-14,195766,2020-01-14 13:48:36,3_TO DOWNTOWN,27,24.0,15.0,2,1.0,1
44947,2020-01-14,195762,2020-01-14 15:19:38,3_TO DOWNTOWN,30,49.0,33.0,2,2.0,1
90135,2020-01-14,195757,2020-01-14 16:45:39,3_TO DOWNTOWN,33,19.0,23.0,2,2.0,1


# for plotting results, see `plotting_paper.ipynb`

In [46]:
res_df

Unnamed: 0,transit_date,trip_id,arrival_time,route_id_direction,time_window,y_reg100,y_pred,y_true
286072,2020-01-01,195844,2020-01-01 11:50:51,3_FROM DOWNTOWN,23,3.0,-1.00,0
407007,2020-01-02,195717,2020-01-02 18:04:02,3_FROM DOWNTOWN,36,9.0,-1.00,0
301336,2020-01-03,195743,2020-01-03 07:03:16,3_FROM DOWNTOWN,14,18.0,14.00,2
40,2020-01-03,195778,2020-01-03 07:41:40,3_TO DOWNTOWN,15,13.0,-1.00,1
135589,2020-01-03,195775,2020-01-03 09:28:04,3_TO DOWNTOWN,18,21.0,-1.00,2
...,...,...,...,...,...,...,...,...
145294,2021-07-30,246346,2021-07-30 22:54:48,50_TO DOWNTOWN,45,13.0,5.16,1
206042,2021-08-23,246346,2021-08-23 22:54:40,50_TO DOWNTOWN,45,15.0,6.06,1
296678,2021-08-26,246346,2021-08-26 22:43:42,50_TO DOWNTOWN,45,12.0,6.06,1
251619,2021-09-01,246345,2021-09-01 23:34:34,50_FROM DOWNTOWN,47,2.0,2.04,0
