### Bus Arrival Time Prediction

#### Dwell Time Prediction

##### Meta Learning with Stacked Generalization

Setting up the environment

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from joblib import dump
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
df_predictions = pd.read_csv('../../data/predicted_dwell_times.csv')
df_features = pd.read_csv('../../data/bus_stop_times_poi_data_added_modified.csv')

In [3]:
df_predictions.columns

Index(['trip_id', 'deviceid', 'direction', 'bus_stop', 'date', 'arrival_time',
       'departure_time', 'dwell_time', 'dwell_time_in_seconds_old',
       'day_of_week', 'time_of_day', 'Sunday/holiday', 'saturday',
       'weekday/end', 'week_no', 'dt(w-1)', 'dt(w-2)', 'dt(w-3)', 'dt(t-1)',
       'dt(t-2)', 'dt(n-1)', 'dt(n-2)', 'dt(n-3)', 'hour_of_day', 'day',
       'month', 'temp', 'precip', 'windspeed', 'conditions', 'rt(n-1)',
       'stop_type', 'dwell_time_in_seconds', 'XGBoost', 'Random forest',
       'Linear Reg', 'LightGBM', 'DateTime', 'DateTimeRef', 'convlstm',
       'XGBoost_Bay_Opt', 'LightGBM_Bay_Opt', 'XGBoost_class', 'simple_avg',
       'weight_avg', 'metalr', 'ES'],
      dtype='object')

In [5]:
rmse_ref = np.sqrt(metrics.mean_squared_error(df_predictions['dwell_time_in_seconds'], df_predictions['metalr']))
mae_ref = metrics.mean_absolute_error(df_predictions['dwell_time_in_seconds'],df_predictions['metalr'])

print("Dwell Time Prediction")
print(f"Ref Root Mean Squared Error (on entire dataset): {rmse_ref}")
print(f"Ref Mean Absolute Error (on entire dataset): {mae_ref}")


Dwell Time Prediction
Ref Root Mean Squared Error (on entire dataset): 29.56011918046916
Ref Mean Absolute Error (on entire dataset): 14.496970777819744


In [4]:
df_predictions.columns

Index(['trip_id', 'deviceid', 'direction', 'bus_stop', 'date', 'arrival_time',
       'departure_time', 'dwell_time', 'dwell_time_in_seconds_old',
       'day_of_week', 'time_of_day', 'Sunday/holiday', 'saturday',
       'weekday/end', 'week_no', 'dt(w-1)', 'dt(w-2)', 'dt(w-3)', 'dt(t-1)',
       'dt(t-2)', 'dt(n-1)', 'dt(n-2)', 'dt(n-3)', 'hour_of_day', 'day',
       'month', 'temp', 'precip', 'windspeed', 'conditions', 'rt(n-1)',
       'stop_type', 'dwell_time_in_seconds', 'XGBoost', 'Random forest',
       'Linear Reg', 'LightGBM', 'DateTime', 'DateTimeRef', 'convlstm',
       'XGBoost_Bay_Opt', 'LightGBM_Bay_Opt', 'XGBoost_class', 'simple_avg',
       'weight_avg', 'metalr', 'ES'],
      dtype='object')

In [5]:
rel_features = ['deviceid','week_no','day_of_week','time_of_day','Sunday/holiday','weekday/end','bus_stop','precip','temp',
            'dt(n-1)','dt(n-2)', 'dt(n-3)','dt(t-1)', 'dt(t-2)', 'dt(w-1)', 'dt(w-2)', 'dt(w-3)', 'rt(n-1)','total_poi_count']
df_train = df_features[rel_features]
df_train = df_train[df_train['week_no']<36]
df_train = df_train.drop(['week_no'],axis = 1)
df_train.head()

Unnamed: 0,deviceid,day_of_week,time_of_day,Sunday/holiday,weekday/end,bus_stop,precip,temp,dt(n-1),dt(n-2),dt(n-3),dt(t-1),dt(t-2),dt(w-1),dt(w-2),dt(w-3),rt(n-1),total_poi_count
0,262,4,6.5,0,1,101,0.0,20.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,92.0,69.0,13
1,262,4,6.75,0,1,102,0.0,20.0,74.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,210.0,6
2,262,4,6.75,0,1,103,0.0,20.0,0.0,74.0,28.0,28.0,28.0,28.0,28.0,28.0,496.0,5
3,262,4,6.75,0,1,104,0.0,20.0,6.0,0.0,74.0,1.0,1.0,1.0,1.0,1.0,195.0,3
4,262,4,6.75,0,1,105,0.0,20.0,0.0,6.0,0.0,230.0,230.0,230.0,230.0,230.0,97.0,3


In [6]:
poi_features = ['trip_id','deviceid','week_no','bus_stop','direction','dt(n-1)','total_poi_count']
df_features = df_features[poi_features]

In [7]:
df = df_predictions.merge(right=df_features,how='left',on=['trip_id','deviceid','bus_stop','week_no','direction','dt(n-1)'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17189 entries, 0 to 17188
Data columns (total 48 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trip_id                    17189 non-null  int64  
 1   deviceid                   17189 non-null  int64  
 2   direction                  17189 non-null  int64  
 3   bus_stop                   17189 non-null  int64  
 4   date                       17189 non-null  object 
 5   arrival_time               17189 non-null  object 
 6   departure_time             17189 non-null  object 
 7   dwell_time                 17189 non-null  object 
 8   dwell_time_in_seconds_old  17189 non-null  float64
 9   day_of_week                17189 non-null  int64  
 10  time_of_day                17189 non-null  float64
 11  Sunday/holiday             17189 non-null  int64  
 12  saturday                   17189 non-null  int64  
 13  weekday/end                17189 non-null  int

In [8]:
input_features = ['deviceid','day_of_week','time_of_day','Sunday/holiday','weekday/end','bus_stop','precip','temp',
            'dt(n-1)','dt(n-2)', 'dt(n-3)','dt(t-1)', 'dt(t-2)', 'dt(w-1)', 'dt(w-2)', 'dt(w-3)', 'rt(n-1)','total_poi_count']
df_input = df[input_features]


In [9]:
#Scaling
scaler = StandardScaler()
scaler.fit(df_train)
df_input_scaled = scaler.transform(df_input)

In [10]:
dump(scaler,filename='../../models/dwell_scaler.bin')

['../../models/dwell_scaler.bin']

Predicting Dwell Times

In [10]:
model = xgb.XGBRegressor()
model.load_model('../../models/dwell_time_model_improved.json')
pred = model.predict(df_input_scaled)
pred

array([18.544977, 21.211845, 27.184288, ..., 26.943935, 39.888527,
       13.740579], dtype=float32)

In [11]:
xgb_pred_df = pd.DataFrame(data=pred, columns=['XGBoost'])
df_predictions["XGBoost"] = xgb_pred_df
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17189 entries, 0 to 17188
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trip_id                    17189 non-null  int64  
 1   deviceid                   17189 non-null  int64  
 2   direction                  17189 non-null  int64  
 3   bus_stop                   17189 non-null  int64  
 4   date                       17189 non-null  object 
 5   arrival_time               17189 non-null  object 
 6   departure_time             17189 non-null  object 
 7   dwell_time                 17189 non-null  object 
 8   dwell_time_in_seconds_old  17189 non-null  float64
 9   day_of_week                17189 non-null  int64  
 10  time_of_day                17189 non-null  float64
 11  Sunday/holiday             17189 non-null  int64  
 12  saturday                   17189 non-null  int64  
 13  weekday/end                17189 non-null  int

In [12]:
df['XGBoost'] = xgb_pred_df['XGBoost']

In [13]:
features =['trip_id', 'deviceid', 'direction', 'bus_stop', 'date', 'arrival_time',
       'departure_time', 'dwell_time', 'dwell_time_in_seconds_old',
       'day_of_week', 'time_of_day', 'Sunday/holiday', 'saturday',
       'weekday/end', 'week_no', 'dt(w-1)', 'dt(w-2)', 'dt(w-3)', 'dt(t-1)',
       'dt(t-2)', 'dt(n-1)', 'dt(n-2)', 'dt(n-3)', 'hour_of_day', 'day',
       'month', 'temp', 'precip', 'windspeed', 'conditions', 'rt(n-1)',
       'stop_type','total_poi_count', 'dwell_time_in_seconds', 'XGBoost','convlstm']
df_meta = df[features]

In [14]:
df_meta.head()

Unnamed: 0,trip_id,deviceid,direction,bus_stop,date,arrival_time,departure_time,dwell_time,dwell_time_in_seconds_old,day_of_week,...,temp,precip,windspeed,conditions,rt(n-1),stop_type,total_poi_count,dwell_time_in_seconds,XGBoost,convlstm
0,22733,117,1,112,2022-09-20,07:00:04,07:00:19,0:00:15,15.0,1,...,19.5,0.0,7.6,Overcast,180.0,br,1,15.0,18.544977,9.938972
1,22733,117,1,113,2022-09-20,07:02:58,07:03:13,0:00:15,15.0,1,...,19.5,0.0,7.6,Overcast,159.0,mod,1,15.0,21.211845,33.644896
2,22733,117,1,114,2022-09-20,07:04:40,07:05:07,0:00:27,27.0,1,...,19.5,0.0,7.6,Overcast,87.0,br,3,27.0,27.184288,37.179429
3,22736,505,1,101,2022-09-20,07:02:32,07:03:18,0:00:46,46.0,1,...,19.5,0.0,7.6,Overcast,110.0,pro,13,46.0,42.047146,54.615385
4,22736,505,1,102,2022-09-20,07:07:38,07:12:26,0:04:48,288.0,1,...,19.5,0.0,7.6,Overcast,260.0,mod,6,288.0,47.063343,19.062432


In [15]:
df_predictions.to_csv('../../data/predicted_dwell_times_group_27.csv',index=False)

Meta Learning

In [16]:
X = df_meta[['convlstm','XGBoost']]
y = df_meta['dwell_time_in_seconds']

In [17]:
kfolds = 5

meta_predictions = np.zeros(X.shape[0])

kf = KFold(n_splits=kfolds, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the metalearner on the training set for this fold
    metalearner = RandomForestRegressor(n_estimators=80,criterion='absolute_error',max_depth=10)
    metalearner.fit(X_train, y_train)
    
    # Make predictions on the test set for this fold
    fold_predictions = metalearner.predict(X_test)
    
    # Store the predictions for this fold
    meta_predictions[test_index] = fold_predictions

# Calculate the Mean Squared Error for the entire dataset
rmse = np.sqrt(metrics.mean_squared_error(y, meta_predictions))
mae = metrics.mean_absolute_error(y,meta_predictions)
#mape = metrics.mean_absolute_percentage_error(y,meta_predictions)
print(f"Metalearner Root Mean Squared Error (on entire dataset): {rmse}")
print(f"Metalearner Root Mean Absolute Error (on entire dataset): {mae}")
#print(f"Metalearner Root Mean Absolute Percentage Error (on entire dataset): {mape}")



Metalearner Root Mean Squared Error (on entire dataset): 34.12091497332408
Metalearner Root Mean Absolute Error (on entire dataset): 12.946553028099364


In [22]:
df_meta_pred = pd.DataFrame(data=meta_predictions,columns=['meta_rr'])
df_meta['meta_rr'] = df_meta_pred[['meta_rr']]
df_meta.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['meta_rr'] = df_meta_pred[['meta_rr']]


Unnamed: 0,trip_id,deviceid,direction,bus_stop,date,arrival_time,departure_time,dwell_time,dwell_time_in_seconds_old,day_of_week,...,windspeed,conditions,rt(n-1),stop_type,total_poi_count,dwell_time_in_seconds,XGBoost,convlstm,meta_lr,meta_rr
0,22733,117,1,112,2022-09-20,07:00:04,07:00:19,0:00:15,15.0,1,...,7.6,Overcast,180.0,br,1,15.0,18.544977,9.938972,14.34375,14.34375
1,22733,117,1,113,2022-09-20,07:02:58,07:03:13,0:00:15,15.0,1,...,7.6,Overcast,159.0,mod,1,15.0,21.211845,33.644896,15.0,15.0
2,22733,117,1,114,2022-09-20,07:04:40,07:05:07,0:00:27,27.0,1,...,7.6,Overcast,87.0,br,3,27.0,27.184288,37.179429,15.0,15.0
3,22736,505,1,101,2022-09-20,07:02:32,07:03:18,0:00:46,46.0,1,...,7.6,Overcast,110.0,pro,13,46.0,42.047146,54.615385,15.0,15.0
4,22736,505,1,102,2022-09-20,07:07:38,07:12:26,0:04:48,288.0,1,...,7.6,Overcast,260.0,mod,6,288.0,47.063343,19.062432,15.0,15.0


In [25]:
kfolds = 5

meta_predictions_lr = np.zeros(X.shape[0])

kf = KFold(n_splits=kfolds, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the metalearner_lr on the training set for this fold
    metalearner_lr = LinearRegression()
    metalearner_lr.fit(X_train, y_train)
    
    # Make predictions on the test set for this fold
    fold_predictions = metalearner_lr.predict(X_test)
    
    # Store the predictions for this fold
    meta_predictions_lr[test_index] = fold_predictions

# Calculate the Mean Squared Error for the entire dataset
rmse = np.sqrt(metrics.mean_squared_error(y, meta_predictions_lr))
mae = metrics.mean_absolute_error(y,meta_predictions_lr)
#mape = metrics.mean_absolute_percentage_error(y,meta_predictions_lr)
print(f"metalearner_lr Root Mean Squared Error (on entire dataset): {rmse}")
print(f"metalearner_lr Mean Absolute Error (on entire dataset): {mae}")
#print(f"metalearner_lr Mean Absolute Percentage Error (on entire dataset): {mape}")



metalearner_lr Root Mean Squared Error (on entire dataset): 30.230682136183376
metalearner_lr Mean Absolute Error (on entire dataset): 15.046581233089045


In [26]:
df_meta_pred_lr = pd.DataFrame(data=meta_predictions_lr,columns=['meta_lr'])
df_meta['meta_lr'] = df_meta_pred_lr[['meta_lr']]
df_meta.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['meta_lr'] = df_meta_pred_lr[['meta_lr']]


Unnamed: 0,trip_id,deviceid,direction,bus_stop,date,arrival_time,departure_time,dwell_time,dwell_time_in_seconds_old,day_of_week,...,windspeed,conditions,rt(n-1),stop_type,total_poi_count,dwell_time_in_seconds,XGBoost,convlstm,meta_lr,meta_rr
0,22733,117,1,112,2022-09-20,07:00:04,07:00:19,0:00:15,15.0,1,...,7.6,Overcast,180.0,br,1,15.0,18.544977,9.938972,10.621514,14.34375
1,22733,117,1,113,2022-09-20,07:02:58,07:03:13,0:00:15,15.0,1,...,7.6,Overcast,159.0,mod,1,15.0,21.211845,33.644896,17.074417,15.0
2,22733,117,1,114,2022-09-20,07:04:40,07:05:07,0:00:27,27.0,1,...,7.6,Overcast,87.0,br,3,27.0,27.184288,37.179429,21.803112,15.0
3,22736,505,1,101,2022-09-20,07:02:32,07:03:18,0:00:46,46.0,1,...,7.6,Overcast,110.0,pro,13,46.0,42.047146,54.615385,35.3964,15.0
4,22736,505,1,102,2022-09-20,07:07:38,07:12:26,0:04:48,288.0,1,...,7.6,Overcast,260.0,mod,6,288.0,47.063343,19.062432,32.462473,15.0


In [27]:
df_meta.to_csv("../../data/predicted_dwell_times_xgboost_convlstm.csv")

In [21]:
dump(value=metalearner,filename='../../models/dwell_time_metalearner.joblib')

['../../models/dwell_time_metalearner.joblib']

In [28]:
dump(metalearner_lr,filename='../../models/dwell_time_metalearner_lr.joblib')

['../../models/dwell_time_metalearner_lr.joblib']