In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import GRU, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
from xgboost import XGBRegressor

In [21]:
# Load dataset
# Read the bus-delay workbook into a DataFrame and preview the first rows.
DATA_FILE = "ttc_bus_delay_2023_Udanie.xlsx"
df = pd.read_excel(DATA_FILE)
df.head(5)

Unnamed: 0,date,month,year,route,time,day,location,incident,min delay,delay_type,min gap,vehicle,direction
0,1,1,2023,69,02:34:00,Sunday,WARDEN STATION,Security,22,medium,44,8407,S
1,1,1,2023,35,03:06:00,Sunday,JANE STATION,Cleaning - Unsanitary,30,medium,60,1051,N
2,1,1,2023,52,04:25:00,Sunday,LAWRENCE AND YONGE,Emergency Services,30,medium,60,3520,E
3,1,1,2023,24,04:35:00,Sunday,DANFORTH AND MAIN,Cleaning - Unsanitary,20,medium,40,8404,W
4,1,1,2023,36,05:50:00,Sunday,FINCH STATION,Cleaning - Unsanitary,11,medium,26,3561,W


In [None]:
# Drop unnecessary columns if needed
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after 'year' is already gone no longer raises KeyError.
df = df.drop(columns=['year'], errors='ignore')

In [23]:
# Label-encode the categorical columns.
# The column list is defined here so the cell runs on a fresh kernel; it must not rely on
# a list that is only created by a later cell.
categorical_cols = ['month', 'day', 'route', 'location', 'incident', 'delay_type', 'vehicle', 'direction']

# One fitted encoder is kept per column so the integer codes can be mapped back to the
# original labels later (label_encoders[col].inverse_transform(...)).
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Convert everything to string first so columns holding mixed value types can be sorted
    # and encoded.
    # NOTE(review): numeric-looking columns such as 'month' are therefore ordered
    # lexicographically ('1', '10', '11', ...), not numerically - confirm this is intended.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [24]:
# Encoding categorical variables
# Each listed column is replaced by the integer codes LabelEncoder assigns to its values.
# The same encoder instance is refitted per column, so after this cell `le` only holds the
# mapping of the last column in the list.
categorical_cols = ['month', 'day', 'route', 'location', 'incident', 'delay_type', 'vehicle', 'direction']
le = LabelEncoder()
df = df.assign(**{name: le.fit_transform(df[name]) for name in categorical_cols})

In [25]:
# Selecting features and target
# The delay in minutes is the regression target; every remaining column is a feature.
# NOTE(review): 'delay_type' and 'min gap' look closely tied to the delay itself - confirm
# they are really available at prediction time, otherwise they leak the target.
TARGET = 'min delay'
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Inspect the feature dtypes to spot columns that are still non-numeric.
feature_dtypes = X_train.dtypes
print(feature_dtypes)


date           int64
month          int64
route          int64
time          object
day            int64
location       int64
incident       int64
delay_type     int64
min gap        int64
vehicle        int64
direction      int64
dtype: object


In [29]:
# Preview the only remaining object-typed feature, the 'time' column.
time_preview = X_train['time'].head(5)
print(time_preview)

31300    22:24:00
8166     09:29:00
28896    21:06:00
39738    09:03:00
10381    09:29:00
Name: time, dtype: object


In [31]:
# Parse the 'HH:MM:SS' clock values into datetime.time objects on BOTH splits, so the
# train and test frames hold the same value type in 'time'.
# assign() builds new frames instead of writing into the objects returned by
# train_test_split.
X_train = X_train.assign(time=pd.to_datetime(X_train['time'], format='%H:%M:%S').dt.time)
X_test = X_test.assign(time=pd.to_datetime(X_test['time'], format='%H:%M:%S').dt.time)

In [None]:
# Scaling features
# Scaling is deferred: at this point X_train / X_test still contain the non-numeric 'time'
# column, which StandardScaler cannot convert to float. Fitting here would also replace
# both DataFrames with NumPy arrays and break the column-based 'time' handling in the cells
# that follow. The scaler is fitted and applied once 'time' has been converted to seconds
# and dropped.
scaler = StandardScaler()

In [32]:
def _seconds_since_midnight(clock_times):
    """Convert a Series of clock times to whole seconds since midnight.

    Accepts either 'HH:MM:SS' strings or datetime.time objects: values are rendered as
    text and parsed as durations, so both representations give the same result and the
    train and test splits do not need to share a value type.

    Parameters
    ----------
    clock_times : pandas.Series
        Clock times as strings or datetime.time objects.

    Returns
    -------
    pandas.Series
        int64 seconds (hour * 3600 + minute * 60 + second), same index as the input.
    """
    as_durations = pd.to_timedelta(clock_times.astype(str))
    # total_seconds() is float; truncating drops any sub-second part, as x.second would.
    return as_durations.dt.total_seconds().astype('int64')


# Add the numeric time-of-day feature to both splits without mutating the frames in place.
X_train = X_train.assign(time_seconds=_seconds_since_midnight(X_train['time']))
X_test = X_test.assign(time_seconds=_seconds_since_midnight(X_test['time']))


In [33]:
# The raw clock-time column is no longer needed once 'time_seconds' exists; remove it
# from both splits so only numeric features remain.
X_train, X_test = [split.drop(columns=['time']) for split in (X_train, X_test)]

In [34]:
# Standardise the features. The scaler learns its statistics from the training split only
# and the same fitted transform is then applied to both splits.
# From here on X_train / X_test are NumPy arrays, not DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [35]:
# 1. Random Forest Regression
# 100 trees with a fixed seed; fit() returns the estimator, so construction and training
# are chained before predicting on the held-out split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

In [36]:
# 2. XGBoost Regression
# Gradient-boosted trees with the hyper-parameters gathered in one place.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'random_state': 42,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [39]:
# 5. LSTM Model (Deep Learning)
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout and batch
# shuffling are repeatable, in line with the random_state=42 used by the tree-based models.
set_random_seed(42)

# Recurrent layers take 3-D input (samples, timesteps, features): every scaled feature
# column becomes one timestep carrying a single value.
X_train_lstm = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Two stacked LSTM layers (the first returns the full sequence so the second can consume
# it), dropout after each, and a single linear output unit for the regression target.
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(50),
    Dropout(0.2),
    Dense(1)
])

lstm_model.compile(optimizer='adam', loss='mse')
# NOTE(review): the test split doubles as validation data, so val_loss is not an
# independent estimate; consider carving a validation split out of the training data.
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, validation_data=(X_test_lstm, y_test))

lstm_pred = lstm_model.predict(X_test_lstm)

Epoch 1/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2400.2285 - val_loss: 1828.6562
Epoch 2/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - loss: 2470.3396 - val_loss: 1828.5933
Epoch 3/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 13ms/step - loss: 2118.7554 - val_loss: 1830.1263
Epoch 4/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - loss: 2403.3630 - val_loss: 1829.7362
Epoch 5/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 2525.5039 - val_loss: 1829.8413
Epoch 6/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - loss: 2604.8054 - val_loss: 1828.5317
Epoch 7/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 2332.6035 - val_loss: 1829.1705
Epoch 8/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - 

In [41]:
# 6. GRU Model (Deep Learning)
# Same layout as the LSTM model with GRU cells. Reuses the 3-D arrays X_train_lstm /
# X_test_lstm built in the LSTM cell. All Keras names come from the import cell at the top
# of the notebook.

# Seed Python, NumPy and the Keras backend so this run is repeatable.
set_random_seed(42)

gru_model = Sequential([
    GRU(50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    GRU(50),
    Dropout(0.2),
    Dense(1)
])

gru_model.compile(optimizer='adam', loss='mse')
# NOTE(review): the test split doubles as validation data, so val_loss is not an
# independent estimate.
gru_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, validation_data=(X_test_lstm, y_test))

gru_pred = gru_model.predict(X_test_lstm)

Epoch 1/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 2224.4792 - val_loss: 1829.1262
Epoch 2/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 2161.0925 - val_loss: 1593.1658
Epoch 3/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - loss: 2044.8723 - val_loss: 1348.0713
Epoch 4/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 1924.7007 - val_loss: 1169.9684
Epoch 5/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - loss: 1582.3514 - val_loss: 1084.5065
Epoch 6/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - loss: 1342.7173 - val_loss: 921.5781
Epoch 7/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - loss: 1355.5210 - val_loss: 825.8176
Epoch 8/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - lo

In [42]:
# Evaluating models
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate_model(name, y_true, y_pred):
    """Print the mean absolute error and mean squared error of one model.

    Parameters
    ----------
    name : str
        Label shown at the start of the printed line.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    None
        The metrics are only printed, each rounded to two decimals.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    print(f"{name} - MAE: {mae:.2f}, MSE: {mse:.2f}")

# Report every model against the same held-out targets. The Keras predictions have shape
# (n, 1) and are flattened to 1-D before scoring.
model_predictions = [
    ("Random Forest", rf_pred),
    ("XGBoost", xgb_pred),
    ("LSTM", lstm_pred.flatten()),
    ("GRU", gru_pred.flatten()),
]
for model_name, predictions in model_predictions:
    evaluate_model(model_name, y_test, predictions)

Random Forest - MAE: 1.15, MSE: 115.95
XGBoost - MAE: 1.89, MSE: 215.43
LSTM - MAE: 14.08, MSE: 1828.61
GRU - MAE: 3.83, MSE: 613.10
