In [51]:
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [52]:
# Single seed reused by the train/test split and by the tree-based models below,
# so that repeated runs produce the same partition and the same fitted ensembles.
RANDOM_STATE: int = 42

In [53]:
##UTILITIES
# def rmse(y_true, y_pred):
#     return root_mean_squared_error(y_true, y_pred, squared=False)

def evaluate_predictions(y_true_log, y_pred_log):
    """Score predictions on the log scale and on the back-transformed price scale.

    Parameters
    ----------
    y_true_log : array-like
        Ground-truth target values on the log scale.
    y_pred_log : array-like
        Predicted target values on the log scale.

    Returns
    -------
    dict
        Keys ``rmse_log``, ``mae_log``, ``r2_log`` hold RMSE, MAE and R^2 computed
        directly on the inputs; ``rmse_price``, ``mae_price``, ``r2_price`` hold the
        same three metrics after applying ``np.expm1`` to both inputs.

    Notes
    -----
    ``np.expm1`` is the inverse of ``np.log1p``; this presumes the log target was
    built with ``log1p`` -- TODO confirm against the preprocessing step.
    """

    def _score(actual, predicted):
        # Same three metrics for either scale, returned as (rmse, mae, r2).
        return (
            root_mean_squared_error(actual, predicted),
            mean_absolute_error(actual, predicted),
            r2_score(actual, predicted),
        )

    # Metrics on the scale the models were actually trained on.
    rmse_log, mae_log, r2_log = _score(y_true_log, y_pred_log)

    # Undo the log transform so errors are expressed in price units.
    rmse_price, mae_price, r2_price = _score(
        np.expm1(y_true_log),
        np.expm1(y_pred_log),
    )

    return {
        "rmse_log": rmse_log,
        "mae_log": mae_log,
        "r2_log": r2_log,
        "rmse_price": rmse_price,
        "mae_price": mae_price,
        "r2_price": r2_price,
    }


In [54]:
# Load processed_flight_data.csv (path is relative to the notebook's working directory)
# and report (rows, columns) as a quick sanity check.
df = pd.read_csv("processed_flight_data.csv")
print("Shape:", df.shape)

Shape: (10682, 29)


In [55]:
## Prepare X and y
# X: every column except the three price columns listed below, so the target (and the
# other price-named columns) cannot leak into the features. errors="ignore" lets the
# drop succeed even if one of the listed columns is absent from df.
X = df.drop(
    columns=["Price", "Price_capped", "log_Price"],
    errors="ignore",
)
# y: the log-price column as a plain NumPy array.
y = df["log_Price"].to_numpy()

In [56]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (8545, 26) Test shape: (2137, 26)


In [57]:
### Define models
# Candidate regressors keyed by the name that is printed during training and stored
# in the "Model" column of the results. Insertion order is the training order.
models = {
    # 1) Linear regression; features are standardized inside the pipeline, so the
    #    scaler is fitted only on the data the pipeline itself is fitted on.
    "LinearRegression": Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("lr", LinearRegression()),
        ]
    ),
    # 2) Random forest: 300 trees, depth capped at 12, n_jobs=-1 for parallel fitting.
    "RandomForest": RandomForestRegressor(
        n_estimators=300,
        max_depth=12,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    ),
    # 3) Gradient boosting (scikit-learn implementation): 200 stages of depth-6 trees
    #    with a 0.05 learning rate.
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        random_state=RANDOM_STATE,
    ),
}

In [58]:
# ---------- TRAIN & EVALUATE ----------
# Fit each candidate on the training split and score it on the held-out test split.
results = []        # one metrics row (dict) per model
fitted_models = {}  # model name -> fitted estimator

for name, model in models.items():
    print(f"\nTraining {name} ...")
    model.fit(X_train, y_train)

    # The models are trained on the log target, so predictions are on the log scale.
    preds_log = model.predict(X_test)
    metrics = evaluate_predictions(y_test, preds_log)

    # Re-key the metrics with the column names used for the results rows.
    metrics_summary = dict(
        Model=name,
        RMSE_log=metrics["rmse_log"],
        MAE_log=metrics["mae_log"],
        R2_log=metrics["r2_log"],
        RMSE_price=metrics["rmse_price"],
        MAE_price=metrics["mae_price"],
        R2_price=metrics["r2_price"],
    )
    results.append(metrics_summary)
    fitted_models[name] = model
    print(
        f"Done {name} — RMSE (price scale): {metrics['rmse_price']:.2f}, "
        f"MAE (price scale): {metrics['mae_price']:.2f}, "
        f"R2_price: {metrics['r2_price']:.4f}"
    )



Training LinearRegression ...
Done LinearRegression — RMSE (price scale): 2693.93, MAE (price scale): 1865.06, R2_price: 0.6110

Training RandomForest ...
Done RandomForest — RMSE (price scale): 1763.23, MAE (price scale): 1195.95, R2_price: 0.8333

Training GradientBoosting ...
Done GradientBoosting — RMSE (price scale): 1691.54, MAE (price scale): 1191.99, R2_price: 0.8466


In [60]:
# ---------- RESULTS TABLE ----------
# One row per model, lowest price-scale RMSE first, re-indexed 0..n-1 after sorting.
results_df = (
    pd.DataFrame(results)
    .sort_values("RMSE_price")
    .reset_index(drop=True)
)
print("\nModel comparison (sorted by RMSE on original price):")
results_df


Model comparison (sorted by RMSE on original price):


Unnamed: 0,Model,RMSE_log,MAE_log,R2_log,RMSE_price,MAE_price,R2_price
0,GradientBoosting,0.171184,0.128764,0.889763,1691.538835,1191.987384,0.846616
1,RandomForest,0.178276,0.127769,0.88044,1763.233331,1195.954865,0.833338
2,LinearRegression,0.266268,0.200785,0.73329,2693.931884,1865.061918,0.610964


In [63]:
# Pick the winner explicitly as the row with the lowest price-scale RMSE, instead of
# relying on row label 0. Label 0 is only the best model if results_df has already been
# sorted by RMSE_price and re-indexed, which breaks under out-of-order cell execution.
# NOTE(review): the ranking uses metrics computed on the test split, so the score reported
# for the selected model is optimistically biased; consider selecting on a validation
# split or with cross-validation and keeping the test split for the final estimate only.
best_row = results_df.loc[results_df["RMSE_price"].idxmin()]
best_model_name = best_row["Model"]
best_model = fitted_models[best_model_name]
print(f"Best model: {best_model_name}")

Best model: GradientBoosting
