In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load dataset
df = pd.read_excel("ttc_bus_delay_2023_Udanie.xlsx")
df.head()

Unnamed: 0,date,month,year,route,time,day,location,incident,min delay,delay_type,min gap,vehicle,direction
0,1,1,2023,69,02:34:00,Sunday,WARDEN STATION,Security,22,medium,44,8407,S
1,1,1,2023,35,03:06:00,Sunday,JANE STATION,Cleaning - Unsanitary,30,medium,60,1051,N
2,1,1,2023,52,04:25:00,Sunday,LAWRENCE AND YONGE,Emergency Services,30,medium,60,3520,E
3,1,1,2023,24,04:35:00,Sunday,DANFORTH AND MAIN,Cleaning - Unsanitary,20,medium,40,8404,W
4,1,1,2023,36,05:50:00,Sunday,FINCH STATION,Cleaning - Unsanitary,11,medium,26,3561,W


In [4]:
# Drop unnecessary columns if needed (e.g., 'year' might be redundant if all data is from 2023)
df.drop(columns=['year'], inplace=True)

# Define delay severity (classification target)
def classify_delay(min_delay):
    if min_delay < 10:
        return "short"
    elif 10 <= min_delay < 30:
        return "medium"
    else:
        return "long"

df['delay_severity'] = df['min delay'].apply(classify_delay)


In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in categorical_cols:
    df[col] = df[col].astype(str)  # Convert to string before encoding
    df[col] = le.fit_transform(df[col])

df['delay_severity'] = df['delay_severity'].astype(str)  # Convert target column to string
df['delay_severity'] = le.fit_transform(df['delay_severity'])


In [7]:
# Selecting features and target
X = df.drop(columns=['min delay', 'delay_severity'])  # Features
y = df['delay_severity']  # Target variable

In [8]:
# Splitting dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(X_train.dtypes)

date           int64
month          int64
route          int64
time          object
day            int64
location       int64
incident       int64
delay_type     int64
min gap        int64
vehicle        int64
direction      int64
dtype: object


In [12]:
def time_to_seconds(t):
    return t.hour * 3600 + t.minute * 60 + t.second

X_train['time'] = X_train['time'].apply(time_to_seconds)
X_test['time'] = X_test['time'].apply(time_to_seconds)

In [13]:
# Scaling features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# 1. Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

In [15]:
# 2. XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [16]:
# 4. Multi-Layer Perceptron (MLP - Neural Network)
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(3, activation='softmax')  # 3 output classes (short, medium, long)
])

nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

nn_pred = np.argmax(nn_model.predict(X_test), axis=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6948 - loss: 0.6355 - val_accuracy: 0.8021 - val_loss: 0.4025
Epoch 2/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7863 - loss: 0.4165 - val_accuracy: 0.8342 - val_loss: 0.3704
Epoch 3/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8190 - loss: 0.3730 - val_accuracy: 0.9243 - val_loss: 0.2884
Epoch 4/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8745 - loss: 0.3032 - val_accuracy: 0.9538 - val_loss: 0.1927
Epoch 5/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9391 - loss: 0.2042 - val_accuracy: 0.9717 - val_loss: 0.1369
Epoch 6/10
[1m1091/1091[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9573 - loss: 0.1658 - val_accuracy: 0.9732 - val_loss: 0.1178
Epoch 7/10
[1m1

In [17]:
# Evaluating models
def evaluate_model(name, y_true, y_pred):
    print(f"{name} Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    print(classification_report(y_true, y_pred))

evaluate_model("Random Forest", y_test, rf_pred)
evaluate_model("XGBoost", y_test, xgb_pred)
evaluate_model("Neural Network (MLP)", y_test, nn_pred)

Random Forest Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1010
           1       0.98      0.99      0.98      5626
           2       0.98      0.97      0.98      2087

    accuracy                           0.98      8723
   macro avg       0.98      0.96      0.97      8723
weighted avg       0.98      0.98      0.98      8723

XGBoost Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      1010
           1       0.98      0.99      0.99      5626
           2       0.98      0.99      0.98      2087

    accuracy                           0.98      8723
   macro avg       0.98      0.97      0.97      8723
weighted avg       0.98      0.98      0.98      8723

Neural Network (MLP) Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.97      0.92      0.95      1010
           1       0.98      0.99      0.