In [1]:
import sqlite3
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [2]:
# Load data
conn = sqlite3.connect("../smart_transit.db")
df = pd.read_sql_query("SELECT * FROM labeled_with_weather", conn)
conn.close()

# Clean up times
df = df[df["arrival_time"].str.match(r"^\d{2}:\d{2}:\d{2}$", na=False)]
df["hour"] = df["arrival_time"].str.slice(0, 2).astype(int)
df = df[df["delayed"].isin([0, 1])]

# Sample down for memory
df = df.sample(n=5000, random_state=42)

# Numeric feature: stop_sequence
df["stop_sequence"] = pd.to_numeric(df["stop_sequence"], errors="coerce")

# Simulate day of week (placeholder)
df["day_of_week"] = (df["hour"] // 4) % 7  # Cycles through 0–6

# One-hot encode weather condition
df = pd.get_dummies(df, columns=["conditions"], drop_first=True)

# Limit stop_id to top 50 and one-hot encode
top_stops = df["stop_id"].value_counts().nlargest(50).index
df = df[df["stop_id"].isin(top_stops)]
df = pd.get_dummies(df, columns=["stop_id"], drop_first=True)

# Define features and target
drop_cols = [
    "arrival_time", "departure_time", "trip_id", "delayed",
    "stop_headsign", "pickup_type", "shape_dist_traveled",
    "icon", "timestamp"
]
X = df.drop(columns=[col for col in drop_cols if col in df.columns])
y = df["delayed"]

In [3]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Apply SMOTE to balance training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [None]:
# Train Logistic Regression
log_model = LogisticRegression(max_iter=1000, class_weight="balanced")
log_model.fit(X_train_balanced, y_train_balanced)
y_log_pred = log_model.predict(X_test)

print("Logistic Regression")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_log_pred))
print("\nClassification Report:\n", classification_report(y_test, y_log_pred))


In [None]:
# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)
y_rf_pred = rf_model.predict(X_test)

print("\nRandom Forest")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_rf_pred))
print("\nClassification Report:\n", classification_report(y_test, y_rf_pred))

In [4]:
from xgboost import XGBClassifier

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
xgb_model.fit(X_train_balanced, y_train_balanced)
y_xgb_pred = xgb_model.predict(X_test)

print("\nXGBoost")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_xgb_pred))
print("\nClassification Report:\n", classification_report(y_test, y_xgb_pred))


XGBoost
Confusion Matrix:
 [[28  6]
 [ 8  2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.82      0.80        34
           1       0.25      0.20      0.22        10

    accuracy                           0.68        44
   macro avg       0.51      0.51      0.51        44
weighted avg       0.66      0.68      0.67        44



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
