In [1]:
import logging
import os
import pandas as pd
pd.set_option('display.max_columns', None)
from copy import deepcopy
from pathlib import Path
import joblib
import xgboost as xgb
import sys
sys.path.insert(0,'..')
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from pandas.core.common import SettingWithCopyWarning
from src import data_utils, triplevel_utils
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit, StratifiedKFold
import datetime as dt
import swifter
import numpy as np
import argparse
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
# Requires the preprocessed dataset `triplevel_df.parquet`

OUTPUT_DIR = os.path.join('../models', 'any_day', 'variable_timewindow')
ohe_features = ['route_id_direction', 'is_holiday', 'dayofweek', 'is_school_break', 'time_window']
ord_features = ['year', 'month', 'hour', 'day']
num_features = ['temperature', 'humidity', 'precipitation_intensity', 'avg_sched_headway', 'traffic_speed']
feature_label = 'y_class'

  from pandas import MultiIndex, Int64Index


In [2]:
processed_triplevel = os.path.join('../data', 'processed', 'triplevel_df.parquet')
df = pd.read_parquet(processed_triplevel, engine='auto')
df = df.dropna()
# Removing time_window in case a different one will be used
df = df.drop(['time_window', 'load'], axis=1)
df = df.reset_index(drop=True)
df = df.sort_values(['block_abbr', 'transit_date', 'arrival_time', 'route_id_direction'])
df['day'] = df.transit_date.dt.day
df['year'] = df.transit_date.dt.year
df = df[df["y_reg100"] < 100]

In [10]:
time_windows = [1, 10, 20, 30, 40, 50, 60, 120]
percentiles = [(0.0, 9.0), (10.0, 16.0), (17.0, 55.0), (56.0, 75.0), (76.0, 100.0)]
# tdf = triplevel_utils.generate_new_features(df, time_window=config.time_window, past_trips=config.past_trips, target=config.target)

In [11]:
def reconstruct_original_data(df, ix_map, ohe_encoder):
    df[ord_features] = ohe_encoder.inverse_transform(df.filter(regex='route_id_direction_|is_holiday_|dayofweek_|is_school_break_|time_window_'))
    
    for col in ord_features:
        inv_map = {v: k for k, v in ix_map[col].items()}
        df[col] = df[f"{col}_ix"].apply(lambda x: inv_map[x])
        
    df = df.drop(columns=df.filter(regex='route_id_direction_|is_holiday_|dayofweek_|is_school_break_|time_window_|_ix').columns, axis=1)
    return df

In [12]:
ix_map = {}
for col in ord_features:
    ix_map[col] = triplevel_utils.create_ix_map(df, df, col)
    df[f"{col}_ix"] = df[col].apply(lambda x: ix_map[col][x])
# df = df.drop(columns=ord_features)
df['y_class'] = df['y_reg100'].apply(lambda x: data_utils.get_class(x, percentiles)) 
df.head(1)

Unnamed: 0,trip_id,transit_date,arrival_time,year,month,route_id,route_direction_name,block_abbr,dayofweek,hour,temperature,humidity,precipitation_intensity,scheduled_headway,actual_headways,y_reg100,y_reg095,route_id_direction,is_holiday,is_school_break,traffic_speed,day,year_ix,month_ix,hour_ix,day_ix,y_class,time_window
392045,195842,2020-01-01,2020-01-01 10:27:12,2020,1,3,FROM DOWNTOWN,300,4,10,40.150002,0.616,0.0,4800.0,4811.8,8.0,8.0,3_FROM DOWNTOWN,True,True,18.639604,1,0,0,8,0,0,20


In [13]:
objective = 'multi:softmax'
    

for tw in time_windows:
    # sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=100)
    sss = StratifiedKFold(n_splits=3, random_state=100, shuffle=True)

    sss.get_n_splits(df)
    
    # columns = X.columns
    
    reconstructed_df_arr = []
    kfold = 0
    for train_index, test_index in sss.split(df, df['y_class'].to_numpy()):
        # y_train, y_test = df['y_class'].iloc[train_index], df['y_class'].iloc[test_index]
        
        df = triplevel_utils.generate_new_features(df, time_window=tw, target='y_reg100')
        # test = triplevel_utils.generate_new_features(test, time_window=tw, target='y_reg100')
        
        ohe_encoder = OneHotEncoder()
        ohe_encoder = ohe_encoder.fit(df[ohe_features])
        
        train, test = df.iloc[train_index], df.iloc[test_index]
        # TRAINING
        train = train.groupby(['transit_date', 'route_id_direction', 'time_window']).agg({"trip_id":"first",
                                                                                    "year_ix":"first", 
                                                                                    "month_ix":"first",
                                                                                    "day_ix": "first",
                                                                                    "hour_ix":"first",
                                                                                    "is_holiday": "first",
                                                                                    "is_school_break": "first",
                                                                                    "dayofweek":"first",
                                                                                    "temperature":"mean", 
                                                                                    "humidity":"mean",
                                                                                    "precipitation_intensity": "mean",
                                                                                    "traffic_speed":"mean",
                                                                                    "scheduled_headway": "max",
                                                                                    "y_reg100": "max" })
        train = train.reset_index(level=[0,1,2])
        train[ohe_encoder.get_feature_names_out()] = ohe_encoder.transform(train[ohe_features]).toarray()
        train = train.drop(columns=ohe_features, axis=1)
        
        drop_cols = ['time_window', 'route_id', 'route_direction_name', 'block_abbr', 'y_reg100', 'y_reg095', 'transit_date', 'is_holiday', 'route_id_direction', 'actual_headways', 'trip_id', 'arrival_time']
        
        train['y_class'] = train['y_reg100'].apply(lambda x: data_utils.get_class(x, percentiles))
        
        drop_cols = [col for col in drop_cols if col in train.columns]
        rf_df = train.drop(drop_cols, axis=1)
        
        y_train = rf_df.pop('y_class')
        X_train = rf_df
        train_columns = X_train.columns
        model = xgb.XGBClassifier(use_label_encoder=False, 
                                  objective=objective, 
                                  eval_metric='mlogloss', 
                                  num_class=5)

        model.fit(X_train, y_train, verbose=1)

        # TESTING
        test = test[test["y_reg100"] < 100]
        test['y_class'] = test['y_reg100'].apply(lambda x: data_utils.get_class(x, percentiles)) 
        
        test = test.drop(columns=ord_features, axis=1)
        test[ohe_encoder.get_feature_names_out()] = ohe_encoder.transform(test[ohe_features]).toarray()
        test = test.drop(columns=ohe_features, axis=1)
        
        drop_cols = ['time_window', 'route_id', 'route_direction_name', 'block_abbr', 'y_reg100', 'y_reg095', 'transit_date', 'is_holiday', 'route_id_direction', 'actual_headways', 'trip_id', 'arrival_time']
        drop_cols = [col for col in drop_cols if col in test.columns]
        test = test.drop(drop_cols, axis=1)
        
        y_test = test.pop("y_class")
        X_test = test[train_columns]
        
        y_pred = model.predict(X_test)
        _original_rf = deepcopy(df.iloc[test_index])
        _original_rf['y_pred'] = y_pred
        _original_rf['y_true'] = y_test
        _original_rf['kfold'] = kfold
        _original_rf['time_window_param'] = tw
        kfold = kfold + 1
        reconstructed_df_arr.append(_original_rf)
        fp = f'/home/jptalusan/mta_stationing_problem/evaluation/any_day_time_windows_fixed/{tw}_raw_res.pkl'
        _original_rf.to_pickle(fp)

In [None]:
reconstructed_df_arr = pd.concat(reconstructed_df_arr)
fp = f'/home/jptalusan/mta_stationing_problem/evaluation/any_day_time_windows_fixed/all_concat.pkl'
reconstructed_df_arr.to_pickle(fp)

Unnamed: 0,trip_id,transit_date,arrival_time,year,month,route_id,route_direction_name,block_abbr,dayofweek,hour,temperature,humidity,precipitation_intensity,scheduled_headway,actual_headways,y_reg100,y_reg095,route_id_direction,is_holiday,is_school_break,traffic_speed,day,year_ix,month_ix,hour_ix,day_ix,y_class,y_pred,y_true
311408,245280,2021-08-13,2021-08-13 07:13:50,2021,8,5,FROM DOWNTOWN,502,6,7,76.484908,0.868795,0.000000,1837.659091,1932.227273,17.0,16.85,5_FROM DOWNTOWN,False,False,20.541879,13,1,7,5,12,2,1,2
424918,231963,2020-12-27,2020-12-27 07:38:26,2020,12,52,TO DOWNTOWN,5200,1,7,30.648324,0.855765,0.000000,4146.852941,4038.117647,10.0,10.00,52_TO DOWNTOWN,False,True,20.738681,27,0,11,5,26,1,1,1
397303,229662,2021-01-06,2021-01-06 10:59:22,2021,1,17,TO DOWNTOWN,1701,4,10,35.217092,0.821758,0.000000,1800.000000,1773.030303,12.0,12.00,17_TO DOWNTOWN,False,False,17.242890,6,1,0,8,5,1,2,1
241973,203779,2020-03-01,2020-03-01 19:07:48,2020,3,4,TO DOWNTOWN,400,1,19,59.962542,0.469375,0.000308,2400.000000,2404.145833,10.0,10.00,4_TO DOWNTOWN,False,False,17.653482,1,0,2,17,0,1,0,1
25497,245713,2021-09-01,2021-09-01 11:38:38,2021,9,7,TO DOWNTOWN,711,4,11,76.987000,0.851000,0.000000,1018.928571,1129.500000,15.0,15.00,7_TO DOWNTOWN,False,False,15.659355,1,1,8,9,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289042,225270,2020-07-29,2020-07-29 07:47:46,2020,7,52,FROM DOWNTOWN,5206,4,7,75.093002,0.940000,0.000000,3600.000000,3492.640000,5.0,5.00,52_FROM DOWNTOWN,False,True,19.717783,29,0,6,5,28,0,2,0
323322,231332,2021-03-28,2021-03-28 18:23:46,2021,3,42,FROM DOWNTOWN,2200,1,18,54.153000,0.510000,0.000000,3600.000000,3609.964286,8.0,8.00,42_FROM DOWNTOWN,False,False,18.191172,28,1,2,16,27,0,0,0
401168,240451,2021-07-11,2021-07-11 09:43:18,2021,7,55,FROM DOWNTOWN,5503,1,9,73.702003,0.922000,0.000000,3600.000000,3401.815789,16.0,16.00,55_FROM DOWNTOWN,False,True,23.791136,11,1,6,7,10,1,2,1
304549,225798,2020-08-26,2020-08-26 18:59:10,2020,8,18,TO DOWNTOWN,1800,4,18,84.924003,0.668242,0.000000,3600.000000,3610.424242,26.0,26.00,18_TO DOWNTOWN,False,False,20.552574,26,0,7,16,25,2,0,2
