In [1]:
import bus_prediction
import os
import pandas as pd
from constants import DATA_DIR

def load_bus_data():
    path = os.path.join(DATA_DIR, "dataset_routes.csv")
    df = pd.read_csv(path)
    return df

bus_data = load_bus_data()
bus_data["time_bucket"] = pd.to_datetime(bus_data["time_bucket"], utc=True)
X_train, y_train, X_test, y_test = bus_prediction.prepare_bus_data(bus_data)

  _torch_pytree._register_pytree_node(
  df = pd.read_csv(path)


In [2]:
print(X_train[0:5])

  vehicle.trip.route_id  vehicle.trip.direction_id               time_bucket  \
0                     1                        0.0 2023-11-20 12:20:00+00:00   
1                     1                        0.0 2023-11-20 12:40:00+00:00   
2                     1                        0.0 2023-11-20 13:00:00+00:00   
3                     1                        0.0 2023-11-20 13:20:00+00:00   
4                     1                        0.0 2023-11-20 13:40:00+00:00   

   holiday  weekday  temp  dwpt  rhum  prcp   wdir  wspd    pres  coco  \
0        0        0  15.0   3.5  46.0   0.0   20.0  27.7  1023.5   1.0   
1        0        0  15.0   3.5  46.0   0.0   20.0  27.7  1023.5   1.0   
2        0        0  12.2   4.4  59.0   0.0  140.0   9.4  1023.2   1.0   
3        0        0  12.2   4.4  59.0   0.0  140.0   9.4  1023.2   1.0   
4        0        0  12.2   4.4  59.0   0.0  140.0   9.4  1023.2   1.0   

   avg_delay  month  day  hour  minute  
0        0.0     11   20    12   

In [4]:
route_ids = bus_data['vehicle.trip.route_id'].unique()
print(route_ids)

[1 12 14 '14' '14R' '15' '18' 18 19 '19' '1X' '2' 2 21 22 23 24 25 27 28
 '28' '28R' '29' 29 30 '30' '30X' '31' 31 33 35 36 37 38 '38' '38R' '39'
 39 43 44 45 48 49 5 52 54 55 56 57 58 '58' '5R' '6' 6 66 67 7 714 8 '8'
 '8AX' '8BX' '9' 9 90 91 '91' '9R' 'CA' 'F' 'FBUS' 'J' 'K' 'KBUS' 'KLM'
 'L' 'LBUS' 'LOWL' 'M' 'MBUS' 'N' 'NBUS' 'NOWL' 'PH' 'PM' 'S' 'T' 'TBUS']


In [8]:
route_train_sizes = {}
route_test_mses = {}

for route_id in route_ids:
    route_X_train = X_train[X_train['vehicle.trip.route_id'] == route_id]
    route_y_train = y_train[X_train['vehicle.trip.route_id'] == route_id]
    route_X_test = X_test[X_test['vehicle.trip.route_id'] == route_id]
    route_y_test = y_test[X_test['vehicle.trip.route_id'] == route_id]
    if route_X_train.shape[0] == 0 or route_X_test.shape[0] == 0:
        print(f"Skipping route {route_id} due to empty train/test set.")
        continue
    route_train_sizes[route_id] = route_X_train.shape[0]
    print(f"Route ID: {route_id}")
    print(f"Train set size: {route_X_train.shape}")
    print(f"Test set size: {route_X_test.shape}")
    route_X_train = route_X_train.drop(columns=['vehicle.trip.route_id'])
    route_X_test = route_X_test.drop(columns=['vehicle.trip.route_id'])
    xgboost_predictor = bus_prediction.XGBoostPredictor(lagged_features=["avg_delay"], lag_offsets=[1, 2, 3, 24*3], avg_features=["avg_delay"], avg_ranges=[24*3, 48*3, 72*3])
    xgboost_predictor.train(route_X_train, route_y_train)
    xgboost_mse = xgboost_predictor.evaluate(route_X_test, route_y_test)
    route_test_mses[route_id] = xgboost_mse
    print(f"XGBoost Test MSE: {xgboost_mse:.4f}")


# Compute overall test MSE
overall_test_mse = 0
for route_id in route_ids:
    if route_id in route_test_mses:
        overall_test_mse += route_test_mses[route_id] * route_train_sizes[route_id]
overall_test_mse /= sum(route_train_sizes.values())
print(f"Overall Test MSE: {overall_test_mse:.4f}")

Route ID: 1
Train set size: (30833, 18)
Test set size: (3759, 18)
Validation MSE: 2.9038
XGBoost Test MSE: 3.4116
Route ID: 12
Train set size: (25824, 18)
Test set size: (3157, 18)
Validation MSE: 9.5742
XGBoost Test MSE: 8.5528
Route ID: 14
Train set size: (32529, 18)
Test set size: (2202, 18)
Validation MSE: 2.6590
XGBoost Test MSE: 4.0659
Route ID: 14
Train set size: (3683, 18)
Test set size: (2201, 18)
Validation MSE: 2.8579
XGBoost Test MSE: 7.3426
Route ID: 14R
Train set size: (27188, 18)
Test set size: (3320, 18)
Validation MSE: 3.3537
XGBoost Test MSE: 7.8527
Route ID: 15
Train set size: (25663, 18)
Test set size: (3104, 18)
Validation MSE: 6.6722
XGBoost Test MSE: 7.1675
Skipping route 18 due to empty train/test set.
Route ID: 18
Train set size: (24984, 18)
Test set size: (3090, 18)
Validation MSE: 3.5045
XGBoost Test MSE: 4.9588
Skipping route 19 due to empty train/test set.
Route ID: 19
Train set size: (22600, 18)
Test set size: (3336, 18)
Validation MSE: 18.1465
XGBoost Tes