# 🚍 Refactored Demand Prediction Notebook
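This notebook trains and evaluates a Random Forest regressor that predicts passenger demand (`passenger_count`) from GTFS-derived features (`route_id`, `stop_id`, `hour`, `weekday`).
It reads `synthetic_demand.csv`, which is generated by the GTFS `simulate_demand.py` pipeline, and saves the trained model to `../models/demand_model.pkl`.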

In [None]:
# Install required libraries (uncomment if needed)
# %pip install pandas scikit-learn joblib matplotlib seaborn


In [None]:
# Imports
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Single seed shared by NumPy's global RNG and the seeded scikit-learn calls
# below, so that a fresh "Restart & Run All" reproduces the same numbers.
SEED = 42
np.random.seed(seed=SEED)


## 1. Load Synthetic Demand Data

In [None]:
# Read the synthetic demand table from disk and preview its first rows.
data_path = '../data/synthetic_demand.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(5)


## 2. Preprocessing
One-hot encode the categorical features (`route_id`, `stop_id`) and separate the data into the feature matrix `X` and the target `y` (`passenger_count`).

In [None]:
# Columns used as model inputs, and the subset of them that is one-hot encoded.
feature_cols = ['route_id', 'stop_id', 'hour', 'weekday']
categorical_cols = ['route_id', 'stop_id']

# Feature matrix and regression target.
X = df[feature_cols]
y = df['passenger_count']

# Expand the identifier columns into indicator columns; `hour` and `weekday`
# are passed through unchanged.
X_encoded = pd.get_dummies(X, columns=categorical_cols)
X_encoded.head(5)


## 3. Train/Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the split is seeded with SEED so it
# is the same on every run.
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=SEED)
X_train, X_test, y_train, y_test = split_parts


## 4. Train Random Forest Model

In [None]:
# Hyperparameters for the forest, gathered in one place so they are easy to tune.
rf_params = {
    'n_estimators': 200,
    'max_depth': 12,
    'random_state': SEED,
}
model = RandomForestRegressor(**rf_params)

# Fit on the training split; the fitted estimator is the cell's displayed output.
model.fit(X_train, y_train)


## 5. Evaluate Model Performance

In [None]:
# Score the model on the held-out test split.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

# RMSE is derived as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
# TypeError. Taking the square root works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")


## 6. Save Trained Model

In [None]:
# Destination for the serialized model, defined once so the log message below
# always matches where the file was actually written.
MODEL_PATH = '../models/demand_model.pkl'

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after training has finished.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, MODEL_PATH)
print(f"✅ Model saved to {MODEL_PATH}")


## 7. Feature Importance (Optional Insight)

In [None]:
# Pair each encoded feature column with the importance the fitted forest
# assigned to it.
feat_names = X_encoded.columns
importances = model.feature_importances_
feat_imp_df = pd.DataFrame(
    {
        'Feature': feat_names,
        'Importance': importances,
    }
)

# Show the ten features with the highest importance.
feat_imp_df.sort_values(by='Importance', ascending=False).iloc[:10]


In [None]:
# Horizontal bar chart of the ten most important features.
top_features = feat_imp_df.sort_values(by='Importance', ascending=False).iloc[:10]

fig, ax = plt.subplots()
sns.barplot(data=top_features, x='Importance', y='Feature', ax=ax)
ax.set_title("Top 10 Feature Importances")
fig.tight_layout()
plt.show()
