### Bus Arrival Time Prediction

#### Running Time Prediction

##### Meta Learning with Stacked Generalization

setting up the environment

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn import metrics
from joblib import dump


In [2]:
## Read data files
df_predictions = pd.read_csv('../../data/predicted_running_times.csv')
df_features = pd.read_csv('../../data/bus_running_time_all_features.csv')

In [3]:
df_predictions.columns

Index(['trip_id', 'deviceid', 'direction', 'segment', 'date', 'start_time',
       'end_time', 'run_time', 'run_time_in_seconds', 'length', 'day_of_week',
       'time_of_day', 'Sunday/holiday', 'saturday', 'weekday/end', 'week_no',
       'rt(w-1)', 'rt(w-2)', 'rt(w-3)', 'rt(t-1)', 'rt(t-2)', 'rt(n-1)',
       'rt(n-2)', 'rt(n-3)', 'hour_of_day', 'day', 'month', 'temp', 'precip',
       'windspeed', 'conditions', 'XGBoost_rand', 'LightGBM_rand',
       'Random forest', 'Linear Reg', 'DateTime', 'DateTimeRef', 'convlstm',
       'lstm', 'XGBoost', 'LightGBM', 'med_convlstm', 'DayOfWeek',
       'DowTimeRef', 'out_convlstm', 'es_convlstm', 'ES_convlstm_bfill',
       'ES_purelstm_med', 'purelstm_med', 'HA', 'weight_avg', 'simple_avg',
       'metalr'],
      dtype='object')

In [4]:
rmse_ref = np.sqrt(metrics.mean_squared_error(df_predictions['run_time_in_seconds'], df_predictions['metalr']))
mae_ref = metrics.mean_absolute_error(df_predictions['run_time_in_seconds'],df_predictions['metalr'])
mape_ref = metrics.mean_absolute_percentage_error(df_predictions['run_time_in_seconds'],df_predictions['metalr'])
print(f"Ref Root Mean Squared Error (on entire dataset): {rmse_ref}")
print(f"Ref Mean Absolute Error (on entire dataset): {mae_ref}")
print(f"Ref Mean Absolute Percentage Error (on entire dataset): {mape_ref}")

Ref Root Mean Squared Error (on entire dataset): 53.40331672350828
Ref Mean Absolute Error (on entire dataset): 32.37119229157685
Ref Mean Absolute Percentage Error (on entire dataset): 0.19200333277298803


In [3]:
df_predictions.head()

Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time,run_time_in_seconds,length,...,DowTimeRef,out_convlstm,es_convlstm,ES_convlstm_bfill,ES_purelstm_med,purelstm_med,HA,weight_avg,simple_avg,metalr
0,22733,117,1,1,9/20/2022,6:19:59,6:21:21,0:01:22,82,0.63,...,1970-01-02 06:15:00,88.6,87.074704,84.069713,85.666289,85.996378,83.916667,93.548105,94.093316,96.098501
1,22733,117,1,2,9/20/2022,6:22:12,6:25:32,0:03:20,200,1.28,...,1970-01-02 06:15:00,241.0,234.027539,235.173474,238.754493,242.329224,233.923077,238.46502,238.570328,232.249795
2,22733,117,1,3,9/20/2022,6:25:47,6:35:11,0:09:24,564,2.11,...,1970-01-02 06:15:00,415.52381,412.090909,412.090909,396.285632,407.264836,412.090909,435.984704,437.50984,432.908372
3,22733,117,1,4,9/20/2022,6:35:26,6:38:13,0:02:47,167,1.55,...,1970-01-02 06:30:00,211.9,196.233462,195.272727,192.328069,194.341785,195.272727,216.783598,218.061745,220.150008
4,22733,117,1,5,9/20/2022,6:38:13,6:40:34,0:02:21,141,0.84,...,1970-01-02 06:30:00,121.272727,121.923077,121.923077,123.587151,121.787791,121.923077,123.614569,123.722536,121.761664


In [4]:
df_predictions.columns

Index(['trip_id', 'deviceid', 'direction', 'segment', 'date', 'start_time',
       'end_time', 'run_time', 'run_time_in_seconds', 'length', 'day_of_week',
       'time_of_day', 'Sunday/holiday', 'saturday', 'weekday/end', 'week_no',
       'rt(w-1)', 'rt(w-2)', 'rt(w-3)', 'rt(t-1)', 'rt(t-2)', 'rt(n-1)',
       'rt(n-2)', 'rt(n-3)', 'hour_of_day', 'day', 'month', 'temp', 'precip',
       'windspeed', 'conditions', 'XGBoost_rand', 'LightGBM_rand',
       'Random forest', 'Linear Reg', 'DateTime', 'DateTimeRef', 'convlstm',
       'lstm', 'XGBoost', 'LightGBM', 'med_convlstm', 'DayOfWeek',
       'DowTimeRef', 'out_convlstm', 'es_convlstm', 'ES_convlstm_bfill',
       'ES_purelstm_med', 'purelstm_med', 'HA', 'weight_avg', 'simple_avg',
       'metalr'],
      dtype='object')

In [5]:
input_features = ['deviceid','week_no','segment','length',
            'rt(n-1)','rt(n-2)', 'rt(n-3)','rt(t-1)', 'rt(t-2)', 'rt(w-1)', 'rt(w-2)', 'rt(w-3)', 'dt(n-1)','max_elevation','avg_elevation',
'total_poi_count', 'average_poi_distance','segment_gradient_average', 'stop_to_stop_gradient','significant_bend_count']
df_train = df_features[input_features]
df_train = df_train[df_train['week_no']<=36]
df_train = df_train.drop(['week_no'],axis = 1)

In [6]:
top_poi_features = ['trip_id','deviceid','segment','week_no','dt(n-1)','max_elevation','avg_elevation','total_poi_count', 'average_poi_distance','segment_gradient_average', 'stop_to_stop_gradient','significant_bend_count']
df_features = df_features[top_poi_features]

In [7]:
df = df_predictions.merge(right=df_features,on=['trip_id','deviceid','segment','week_no'],how='left')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18402 entries, 0 to 18401
Data columns (total 61 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   trip_id                   18402 non-null  int64  
 1   deviceid                  18402 non-null  int64  
 2   direction                 18402 non-null  int64  
 3   segment                   18402 non-null  int64  
 4   date                      18402 non-null  object 
 5   start_time                18402 non-null  object 
 6   end_time                  18402 non-null  object 
 7   run_time                  18402 non-null  object 
 8   run_time_in_seconds       18402 non-null  int64  
 9   length                    18402 non-null  float64
 10  day_of_week               18402 non-null  int64  
 11  time_of_day               18402 non-null  float64
 12  Sunday/holiday            18402 non-null  int64  
 13  saturday                  18402 non-null  int64  
 14  weekda

In [9]:
df_input = df[input_features]
df_input = df_input.drop(['week_no'],axis = 1)

In [10]:
df_input.head()

Unnamed: 0,deviceid,segment,length,rt(n-1),rt(n-2),rt(n-3),rt(t-1),rt(t-2),rt(w-1),rt(w-2),rt(w-3),dt(n-1),max_elevation,avg_elevation,total_poi_count,average_poi_distance,segment_gradient_average,stop_to_stop_gradient,significant_bend_count
0,117,1,0.63,105,105,105,105,105,105,92,105,0.0,532.0,511.65,16,0.002356,0.004662,0.079618,2.0
1,117,2,1.28,82,242,242,242,242,242,222,242,51.0,532.0,529.5,19,0.006437,0.015948,-0.007985,7.0
2,117,3,2.11,200,82,439,439,439,439,474,439,15.0,544.0,529.1,13,0.003714,-0.011355,-0.004628,11.0
3,117,4,1.55,564,200,82,216,216,216,173,216,15.0,527.0,515.714286,7,0.007617,0.008623,-0.01095,4.0
4,117,5,0.84,167,564,200,128,128,128,114,128,0.0,488.0,480.333333,6,0.003597,-0.04085,-0.050612,2.0


In [11]:
#Scaling
scaler = StandardScaler()
df_train_scaled = scaler.fit_transform(df_train)
df_input_scaled = scaler.transform(df_input)

In [12]:
#PCA
pca = PCA(n_components=16)
pca.fit(df_train_scaled)
df_input_pca = pca.transform(df_input_scaled)

In [13]:
dump(scaler,filename='../../models/running_scaler.bin')
dump(pca,filename='../../models/pca.bin')

['../../models/pca.bin']

Make Predictions

In [13]:
model = xgb.XGBRegressor()
model.load_model('../../models/run_time_model_improved.json')

In [14]:
xgboost_pred = model.predict(df_input_pca)
xgboost_pred

array([103.84273 , 241.60274 , 444.385   , ..., 113.338745, 207.47966 ,
       396.7875  ], dtype=float32)

In [15]:
xgb_predictions = pd.DataFrame(data=xgboost_pred,columns=['XGBoost'])
xgb_predictions.head()

Unnamed: 0,XGBoost
0,103.842728
1,241.602737
2,444.38501
3,216.669769
4,129.72377


In [16]:
df_predictions['XGBoost'] = xgb_predictions['XGBoost']

In [17]:
df['XGBoost'] = xgb_predictions['XGBoost']

In [18]:
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18402 entries, 0 to 18401
Data columns (total 53 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   trip_id              18402 non-null  int64  
 1   deviceid             18402 non-null  int64  
 2   direction            18402 non-null  int64  
 3   segment              18402 non-null  int64  
 4   date                 18402 non-null  object 
 5   start_time           18402 non-null  object 
 6   end_time             18402 non-null  object 
 7   run_time             18402 non-null  object 
 8   run_time_in_seconds  18402 non-null  int64  
 9   length               18402 non-null  float64
 10  day_of_week          18402 non-null  int64  
 11  time_of_day          18402 non-null  float64
 12  Sunday/holiday       18402 non-null  int64  
 13  saturday             18402 non-null  int64  
 14  weekday/end          18402 non-null  int64  
 15  week_no              18402 non-null 

In [19]:
features = ['trip_id', 'deviceid', 'direction', 'segment', 'date', 'start_time',
       'end_time', 'run_time', 'run_time_in_seconds', 'length', 'day_of_week',
       'time_of_day', 'Sunday/holiday', 'saturday', 'weekday/end', 'week_no',
       'rt(w-1)', 'rt(w-2)', 'rt(w-3)', 'rt(t-1)', 'rt(t-2)', 'rt(n-1)',
       'rt(n-2)', 'rt(n-3)', 'hour_of_day', 'day', 'month', 'temp', 'precip',
       'windspeed','max_elevation','avg_elevation','total_poi_count', 'average_poi_distance','segment_gradient_average', 
       'stop_to_stop_gradient','significant_bend_count','conditions','convlstm','XGBoost']
df_meta = df[features]

In [20]:
df_meta.head()

Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time,run_time_in_seconds,length,...,max_elevation,avg_elevation,total_poi_count,average_poi_distance,segment_gradient_average,stop_to_stop_gradient,significant_bend_count,conditions,convlstm,XGBoost
0,22733,117,1,1,9/20/2022,6:19:59,6:21:21,0:01:22,82,0.63,...,532.0,511.65,16,0.002356,0.004662,0.079618,2.0,Overcast,85.006463,103.842728
1,22733,117,1,2,9/20/2022,6:22:12,6:25:32,0:03:20,200,1.28,...,532.0,529.5,19,0.006437,0.015948,-0.007985,7.0,Overcast,236.815187,241.602737
2,22733,117,1,3,9/20/2022,6:25:47,6:35:11,0:09:24,564,2.11,...,544.0,529.1,13,0.003714,-0.011355,-0.004628,11.0,Overcast,412.090909,444.38501
3,22733,117,1,4,9/20/2022,6:35:26,6:38:13,0:02:47,167,1.55,...,527.0,515.714286,7,0.007617,0.008623,-0.01095,4.0,Overcast,196.759309,216.669769
4,22733,117,1,5,9/20/2022,6:38:13,6:40:34,0:02:21,141,0.84,...,488.0,480.333333,6,0.003597,-0.04085,-0.050612,2.0,Overcast,121.923077,129.72377


In [21]:
df_predictions.to_csv('../../data/predicted_running_times_group_27.csv',index=False)

Meta Learning

In [22]:
X = df_meta[['convlstm','XGBoost']]
y = df_meta['run_time_in_seconds']

In [23]:
kfolds = 5

meta_predictions = np.zeros(X.shape[0])

kf = KFold(n_splits=kfolds, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the metalearner on the training set for this fold
    metalearner = RandomForestRegressor(n_estimators=80,criterion='absolute_error',max_depth=10)
    metalearner.fit(X_train, y_train)
    
    # Make predictions on the test set for this fold
    fold_predictions = metalearner.predict(X_test)
    
    # Store the predictions for this fold
    meta_predictions[test_index] = fold_predictions

# Calculate the Mean Squared Error for the entire dataset
rmse = np.sqrt(metrics.mean_squared_error(y, meta_predictions))
mae = metrics.mean_absolute_error(y,meta_predictions)
mape = metrics.mean_absolute_percentage_error(y,meta_predictions)
print(f"Metalearner Root Mean Squared Error (on entire dataset): {rmse}")
print(f"Metalearner Mean Absolute Error (on entire dataset): {mae}")
print(f"Metalearner Mean Absolute Percentage Error (on entire dataset): {mape}")



Metalearner Root Mean Squared Error (on entire dataset): 55.21776645093742
Metalearner Mean Absolute Error (on entire dataset): 32.93355273883274
Metalearner Mean Absolute Percentage Error (on entire dataset): 0.18411003714160273


In [28]:
df_meta_pred = pd.DataFrame(data=meta_predictions,columns=['meta_rr'])
df_meta['meta_rr'] = df_meta_pred[['meta_rr']]
df_meta.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['meta_rr'] = df_meta_pred[['meta_rr']]


Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time,run_time_in_seconds,length,...,total_poi_count,average_poi_distance,segment_gradient_average,stop_to_stop_gradient,significant_bend_count,conditions,convlstm,XGBoost,meta_lr,meta_rr
0,22733,117,1,1,9/20/2022,6:19:59,6:21:21,0:01:22,82,0.63,...,16,0.002356,0.004662,0.079618,2.0,Overcast,85.006463,103.842728,97.4,97.4
1,22733,117,1,2,9/20/2022,6:22:12,6:25:32,0:03:20,200,1.28,...,19,0.006437,0.015948,-0.007985,7.0,Overcast,236.815187,241.602737,246.8125,246.8125
2,22733,117,1,3,9/20/2022,6:25:47,6:35:11,0:09:24,564,2.11,...,13,0.003714,-0.011355,-0.004628,11.0,Overcast,412.090909,444.38501,386.375,386.375
3,22733,117,1,4,9/20/2022,6:35:26,6:38:13,0:02:47,167,1.55,...,7,0.007617,0.008623,-0.01095,4.0,Overcast,196.759309,216.669769,188.6625,188.6625
4,22733,117,1,5,9/20/2022,6:38:13,6:40:34,0:02:21,141,0.84,...,6,0.003597,-0.04085,-0.050612,2.0,Overcast,121.923077,129.72377,118.71875,118.71875


In [33]:
kfolds = 5

meta_predictions_lr = np.zeros(X.shape[0])

kf = KFold(n_splits=kfolds, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the metalearner_lr on the training set for this fold
    metalearner_lr = LinearRegression()
    metalearner_lr.fit(X_train, y_train)
    
    # Make predictions on the test set for this fold
    fold_predictions = metalearner_lr.predict(X_test)
    
    # Store the predictions for this fold
    meta_predictions_lr[test_index] = fold_predictions

# Calculate the Mean Squared Error for the entire dataset
rmse = np.sqrt(metrics.mean_squared_error(y, meta_predictions_lr))
mae = metrics.mean_absolute_error(y,meta_predictions_lr)
mape = metrics.mean_absolute_percentage_error(y,meta_predictions_lr)
print(f"metalearner_lr Root Mean Squared Error (on entire dataset): {rmse}")
print(f"metalearner_lr Mean Absolute Error (on entire dataset): {mae}")
print(f"metalearner_lr Mean Absolute Percentage Error (on entire dataset): {mape}")



metalearner_lr Root Mean Squared Error (on entire dataset): 54.5794480239168
metalearner_lr Mean Absolute Error (on entire dataset): 33.585384262558115
metalearner_lr Mean Absolute Percentage Error (on entire dataset): 0.19986624962757363


In [34]:
df_meta_pred_lr = pd.DataFrame(data=meta_predictions_lr,columns=['meta_lr'])
df_meta['meta_lr'] = df_meta_pred_lr[['meta_lr']]
df_meta.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['meta_lr'] = df_meta_pred_lr[['meta_lr']]


Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time,run_time_in_seconds,length,...,total_poi_count,average_poi_distance,segment_gradient_average,stop_to_stop_gradient,significant_bend_count,conditions,convlstm,XGBoost,meta_lr,meta_rr
0,22733,117,1,1,9/20/2022,6:19:59,6:21:21,0:01:22,82,0.63,...,16,0.002356,0.004662,0.079618,2.0,Overcast,85.006463,103.842728,99.876809,97.4
1,22733,117,1,2,9/20/2022,6:22:12,6:25:32,0:03:20,200,1.28,...,19,0.006437,0.015948,-0.007985,7.0,Overcast,236.815187,241.602737,239.67664,246.8125
2,22733,117,1,3,9/20/2022,6:25:47,6:35:11,0:09:24,564,2.11,...,13,0.003714,-0.011355,-0.004628,11.0,Overcast,412.090909,444.38501,430.84258,386.375
3,22733,117,1,4,9/20/2022,6:35:26,6:38:13,0:02:47,167,1.55,...,7,0.007617,0.008623,-0.01095,4.0,Overcast,196.759309,216.669769,210.348248,188.6625
4,22733,117,1,5,9/20/2022,6:38:13,6:40:34,0:02:21,141,0.84,...,6,0.003597,-0.04085,-0.050612,2.0,Overcast,121.923077,129.72377,128.294363,118.71875


In [35]:
df_meta.to_csv("../../data/predicted_running_times_xgboost_convlstm.csv")

In [26]:
dump(value=metalearner,filename='../../models/running_time_metalearner.joblib')

['../../models/running_time_metalearner.joblib']

In [36]:
dump(metalearner_lr,filename='../../models/running_time_metalearner_lr.joblib')

['../../models/running_time_metalearner_lr.joblib']