In [267]:
pip install xgboost lightgbm catboost

Note: you may need to restart the kernel to use updated packages.


In [268]:
import joblib
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [269]:
try:
    from xgboost import XGBRegressor
    xgb_available = True
except Exception:
    xgb_available = False

try:
    from lightgbm import LGBMRegressor
    lgb_available = True
except Exception:
    lgb_available = False

try:
    from catboost import CatBoostRegressor
    cb_available = True
except Exception:
    cb_available = False

In [270]:
RANDOM_STATE = 42
TEST_SIZE = 0.3

In [271]:
##UTILITIES
# def rmse(y_true, y_pred):
#     return root_mean_squared_error(y_true, y_pred, squared=False)

def evaluate_predictions_log(y_true_log, y_pred_log):
    """Return metrics on log scale and on original price scale (expm1)."""
    # Log-target metrics
    rmse_log = root_mean_squared_error(y_true_log, y_pred_log)
    mae_log = mean_absolute_error(y_true_log, y_pred_log)
    r2_log = r2_score(y_true_log, y_pred_log)

    # Convert back to price scale
    y_true_price = np.expm1(y_true_log)
    y_pred_price = np.expm1(y_pred_log)

    rmse_price = root_mean_squared_error(y_true_price, y_pred_price)
    mae_price = mean_absolute_error(y_true_price, y_pred_price)
    r2_price = r2_score(y_true_price, y_pred_price)

    return {
        "rmse_log": rmse_log,
        "mae_log": mae_log,
        "r2_log": r2_log,
        "rmse_price": rmse_price,
        "mae_price": mae_price,
        "r2_price": r2_price
    }


In [272]:
def fit_booster_with_fallback(name, model, X_tr_t, y_tr, X_val_t, y_val):
    """
    Try multiple ways to fit booster with early stopping across library versions:
      1) model.fit(..., eval_set=[...], early_stopping_rounds=...)
      2) model.fit(..., eval_set=[...], callbacks=[...]) (XGBoost / LightGBM callback)
      3) CatBoost: use_best_model=True
      4) fallback: model.fit(...) without early stopping
    Returns: (fitted_model, method_string)
    """
    default_es = 30
    # 1) try classic API
    try:
        model.fit(X_tr_t, y_tr, eval_set=[(X_val_t, y_val)], early_stopping_rounds=default_es, verbose=False)
        return model, "fit(early_stopping_rounds)"
    except Exception:
        pass

    # 2) try callbacks for XGBoost/LightGBM
    try:
        if "XGB" in name and xgb_available:
            try:
                # attempt xgboost callback import
                from xgboost.callback import EarlyStopping
                model.fit(X_tr_t, y_tr, eval_set=[(X_val_t, y_val)], callbacks=[EarlyStopping(rounds=default_es)])
                return model, "xgb.callback.EarlyStopping"
            except Exception:
                # alternate import path
                import xgboost as xgbpkg
                try:
                    model.fit(X_tr_t, y_tr, eval_set=[(X_val_t, y_val)], callbacks=[xgbpkg.callback.EarlyStopping(rounds=default_es)])
                    return model, "xgbpkg.callback.EarlyStopping"
                except Exception:
                    pass

        if "LightGBM" in name and lgb_available:
            try:
                from lightgbm import early_stopping
                model.fit(X_tr_t, y_tr, eval_set=[(X_val_t, y_val)], callbacks=[early_stopping(default_es)])
                return model, "lightgbm.callback.early_stopping"
            except Exception:
                # try package callback
                try:
                    import lightgbm as lgbpkg
                    model.fit(X_tr_t, y_tr, eval_set=[(X_val_t, y_val)], callbacks=[lgbpkg.callback.early_stopping(default_es)])
                    return model, "lgbpkg.callback.early_stopping"
                except Exception:
                    pass

        if "CatBoost" in name and cb_available:
            try:
                model.fit(X_tr_t, y_tr, eval_set=(X_val_t, y_val), use_best_model=True, verbose=False)
                return model, "catboost.use_best_model"
            except Exception:
                pass
    except Exception:
        pass

    # 3) fallback: fit without early stopping
    try:
        model.fit(X_tr_t, y_tr)
        return model, "fallback_no_early_stopping"
    except Exception as e:
        # If it fails to fit at all, raise
        raise RuntimeError(f"Booster {name} failed to fit in all attempts: {e}")


In [273]:
df = pd.read_csv('processed_flight_data.csv')
print("Shape:", df.shape)

Shape: (10682, 29)


In [274]:
# ---------- Detect whether data is OHE or needs preprocessing ----------
# If categorical columns exist as names, we will build a ColumnTransformer.
categorical_candidates = ["Airline", "Source", "Destination"]
has_categorical = any(c in df.columns for c in categorical_candidates)
has_categorical

False

In [275]:
# If data appears to be OHE already (no categorical columns), we'll skip ColumnTransformer.
use_preprocessor = has_categorical

if use_preprocessor:
    numeric_features = ["Duration_minutes", "stops_num", "Dep_Hour", "Arrival_Hour", "Journey_Day", "Journey_Month"]
    cat_features = [c for c in categorical_candidates if c in df.columns]
    print("Using ColumnTransformer with numeric:", numeric_features, "and categorical:", cat_features)

    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, cat_features)
    ], remainder="drop")
    # Feature names after transform will be built where needed.
else:
    print("Detected OHE data or no categorical columns. Skipping ColumnTransformer.")
    preprocessor = None

Detected OHE data or no categorical columns. Skipping ColumnTransformer.


In [276]:
##Prepare X and Y
X_all = df.drop(columns=["Price", "Price_capped", "log_Price"], errors="ignore")
y_all = df["log_Price"].values

In [277]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (7477, 26) Test shape: (3205, 26)


In [278]:
# ---------- Models to train ----------
models = {
    "LinearRegression": {"type": "sklearn", "est": LinearRegression()},
    "RandomForest": {"type": "sklearn", "est": RandomForestRegressor(n_estimators=150, max_depth=12, random_state=RANDOM_STATE, n_jobs=-1)},
    "GradientBoosting": {"type": "sklearn", "est": GradientBoostingRegressor(n_estimators=150, learning_rate=0.05, max_depth=6, random_state=RANDOM_STATE)}
}
if xgb_available:
    models["XGBoost"] = {"type": "booster", "est": XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, objective="reg:squarederror", random_state=RANDOM_STATE, n_jobs=-1, verbosity=0)}
if lgb_available:
    models["LightGBM"] = {"type": "booster", "est": LGBMRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=RANDOM_STATE, n_jobs=-1)}
if cb_available:
    models["CatBoost"] = {"type": "booster", "est": CatBoostRegressor(iterations=200, learning_rate=0.05, depth=6, random_state=RANDOM_STATE, verbose=False)}

print("Models to run:", list(models.keys()))

Models to run: ['LinearRegression', 'RandomForest', 'GradientBoosting', 'XGBoost', 'LightGBM', 'CatBoost']


In [279]:
# ---------- TRAIN & EVAL ----------
trained_pipelines = {}
results = []

for name, info in models.items():
    model = info["est"]
    mtype = info["type"]
    print(f"\nTraining {name} (type={mtype}) ...")
    t0 = time.time()
    try:
        if mtype == "sklearn":
            # For sklearn models fit on full outer training set
            if preprocessor is not None:
                pipe = Pipeline([("preprocessor", preprocessor), ("reg", model)])
                pipe.fit(X_train, y_train)
                preds_log = pipe.predict(X_test)
                final_pipeline = pipe  # ready to predict on raw rows
            else:
                # If no preprocessor, wrap scaler+lr only for LR
                if name == "LinearRegression":
                    lr_pipe = Pipeline([("scaler", StandardScaler()), ("lr", model)])
                    lr_pipe.fit(X_train, y_train)
                    preds_log = lr_pipe.predict(X_test)
                    final_pipeline = lr_pipe
                else:
                    model.fit(X_train, y_train)
                    preds_log = model.predict(X_test)
                    final_pipeline = Pipeline([("reg", model)])

        else:
            # boosters: inner split for early stopping
            X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=RANDOM_STATE)
            if preprocessor is not None:
                # fit preprocessor on X_tr then transform
                preprocessor.fit(X_tr)
                X_tr_t = preprocessor.transform(X_tr)
                X_val_t = preprocessor.transform(X_val)
                X_test_t = preprocessor.transform(X_test)
                # robust fit with fallback
                fitted_booster, method = fit_booster_with_fallback(name, model, X_tr_t, y_tr, X_val_t, y_val)
                print(f"{name}: fitted using method -> {method}")
                preds_log = fitted_booster.predict(X_test_t)
                final_pipeline = Pipeline([("preprocessor", preprocessor), ("reg", fitted_booster)])
            else:
                X_tr_arr = X_tr.values if hasattr(X_tr, "values") else X_tr
                X_val_arr = X_val.values if hasattr(X_val, "values") else X_val
                X_test_arr = X_test.values if hasattr(X_test, "values") else X_test
                fitted_booster, method = fit_booster_with_fallback(name, model, X_tr_arr, y_tr, X_val_arr, y_val)
                print(f"{name}: fitted using method -> {method}")
                preds_log = fitted_booster.predict(X_test_arr)
                final_pipeline = Pipeline([("reg", fitted_booster)])

        # Evaluate
        metrics = evaluate_predictions_log(y_test, preds_log)
        metrics.update({"Model": name, "time_sec": time.time() - t0})
        results.append(metrics)
        trained_pipelines[name] = final_pipeline
        print(f"{name} done — RMSE_price: {metrics['rmse_price']:.2f}, MAE_price: {metrics['mae_price']:.2f}, time: {metrics['time_sec']:.1f}s")

    except Exception as e:
        import traceback
        print(f"Error training {name}: {e}")
        traceback.print_exc()
        results.append({"Model": name, "rmse_log": None, "mae_log": None, "r2_log": None, "rmse_price": None, "mae_price": None, "r2_price": str(e), "time_sec": None})



Training LinearRegression (type=sklearn) ...
LinearRegression done — RMSE_price: 2625.62, MAE_price: 1826.49, time: 0.0s

Training RandomForest (type=sklearn) ...
RandomForest done — RMSE_price: 1723.00, MAE_price: 1170.91, time: 0.3s

Training GradientBoosting (type=sklearn) ...
GradientBoosting done — RMSE_price: 1707.48, MAE_price: 1216.72, time: 1.0s

Training XGBoost (type=booster) ...
XGBoost: fitted using method -> fallback_no_early_stopping
XGBoost done — RMSE_price: 1689.42, MAE_price: 1191.51, time: 0.2s

Training LightGBM (type=booster) ...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 343
[LightGBM] [Info] Number of data points in the train set: 5981, number of used features: 22
[LightGBM] [Info] Start training from score 8.991873
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iterat



In [280]:
res_df = pd.DataFrame(results).sort_values("rmse_price").reset_index(drop=True)


In [281]:
print("\n========= MODEL PERFORMANCE COMPARISON =========")
print(res_df[["Model", "rmse_price", "mae_price", "r2_price", "time_sec"]])

# Pick best model based on RMSE on original price scale
best_idx = res_df["rmse_price"].idxmin()
best_model_name = res_df.loc[best_idx, "Model"]
best_model = trained_pipelines[best_model_name]

print("\n========= BEST MODEL SELECTED =========")
print(f"Best Model: {best_model_name}")
print(f"RMSE (Price): {res_df.loc[best_idx, 'rmse_price']:.4f}")
print(f"MAE  (Price): {res_df.loc[best_idx, 'mae_price']:.4f}")
print(f"R²   (Price): {res_df.loc[best_idx, 'r2_price']:.4f}")
print(f"Training Time: {res_df.loc[best_idx, 'time_sec']:.2f} seconds")


              Model   rmse_price    mae_price  r2_price  time_sec
0           XGBoost  1689.419908  1191.505379  0.844077  0.191951
1  GradientBoosting  1707.481507  1216.724772  0.840726  0.953026
2      RandomForest  1722.998261  1170.914918  0.837818  0.257384
3          LightGBM  1740.739690  1234.314434  0.834460  0.401225
4          CatBoost  1876.616660  1350.058798  0.807609  0.149789
5  LinearRegression  2625.618777  1826.491241  0.623385  0.007508

Best Model: XGBoost
RMSE (Price): 1689.4199
MAE  (Price): 1191.5054
R²   (Price): 0.8441
Training Time: 0.19 seconds
