## 🧠 Model Training

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_predict, KFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Read the four pre-processed CSVs from ../dataset. Each variable name mirrors
# the "wdrop"/"wfill" and "log"/"iso" parts of the file it is loaded from.
(
    trainwdrop_log,
    trainwfill_log,
    trainwdrop_iso,
    trainwfill_iso,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/cleanedwdrop_log_encode_sales_data.csv',
        '../dataset/cleanedwfill_log_encode_sales_data.csv',
        '../dataset/cleanedwdrop_iso_encode_sales_data.csv',
        '../dataset/cleanedwfill_iso_encode_sales_data.csv',
    )
)

In [None]:
# The "drop" frames lose only 'year'.
trainwdrop_log = trainwdrop_log.drop(columns=['year'])
trainwdrop_iso = trainwdrop_iso.drop(columns=['year'])

# The "fill" frames additionally lose 'manufacturer' and 'model'.
trainwfill_log = trainwfill_log.drop(columns=['year', 'manufacturer', 'model'])
trainwfill_iso = trainwfill_iso.drop(columns=['year', 'manufacturer', 'model'])

In [None]:
# Cast car_age and odometer to int in every frame
# (car_age first across all frames, then odometer).
for int_col in ('car_age', 'odometer'):
    for frame in (trainwdrop_log, trainwdrop_iso, trainwfill_log, trainwfill_iso):
        frame[int_col] = frame[int_col].astype(int)

In [None]:
# Baseline estimators, keyed by the label that appears in the result tables.
# NOTE(review): the XGBoost entry asks for tree_method='gpu_hist' — presumably
# this needs a GPU-enabled xgboost build; confirm for the target environment.
models = dict(
    LinearRegression=LinearRegression(),
    RidgeRegression=Ridge(alpha=1.0),
    RandomForest=RandomForestRegressor(random_state=42, n_jobs=-1),
    XGBoost=XGBRegressor(
        tree_method='gpu_hist',
        random_state=42,
        n_jobs=-1,
        verbosity=0,
    ),
)

In [None]:
# Label -> independent copy of the corresponding loaded frame.
datasets = {
    label: frame.copy()
    for label, frame in [
        ("Drop Log", trainwdrop_log),
        ("Drop Iso", trainwdrop_iso),
        ("Fill Log", trainwfill_log),
        ("Fill Iso", trainwfill_iso),
    ]
}

In [None]:
def evaluate(y, y_pred):
    """Score predictions against the true targets.

    Returns
    -------
    tuple
        ``(r2, rmse, mae, pmae)`` where ``rmse`` is the square root of the
        mean squared error and ``pmae`` is the MAE expressed as a percentage
        of ``y.mean()``.
    """
    r2 = r2_score(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    # Percentage MAE: absolute error relative to the average target value.
    return r2, np.sqrt(mse), mae, (mae / y.mean()) * 100

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import KFold, cross_val_predict

# Shared 5-fold splitter (shuffled, fixed seed); the tuning helpers further
# down read this same `kf`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results_no_hyper_tune = []

# One progress tick per (dataset, model) pair.
total_iters = len(datasets) * len(models)

with tqdm(total=total_iters, desc="Evaluating Models", leave=True) as pbar:
    for name, df in datasets.items():
        # Everything except the target column is used as a feature.
        X = df.drop(columns=["price"])
        y = df["price"]

        for model_name, model in models.items():
            # Cross-validated predictions for every row, scored in one pass.
            y_pred = cross_val_predict(model, X, y, cv=kf, n_jobs=-1)
            r2, rmse, mae, pmae = evaluate(y, y_pred)

            row = {"Dataset": name, "Model": model_name}
            row["R²"] = round(r2, 4)
            row["RMSE"] = round(rmse, 2)
            row["MAE"] = round(mae, 2)
            row["PMAE (%)"] = round(pmae, 2)
            results_no_hyper_tune.append(row)

            pbar.update(1)


In [None]:
import pandas as pd

# Tabulate the baseline scores collected by the evaluation loop.
results_df = pd.DataFrame(results_no_hyper_tune)

# CSS rules for the rendered table, grouped by the element they target.
header_css = [
    ("background-color", "#1976D2"),
    ("color", "white"),
    ("font-weight", "bold"),
    ("text-align", "center"),
    ("white-space", "nowrap"),
    ("padding", "10px"),
]
cell_css = [
    ("border", "1px solid #ddd"),
    ("text-align", "center"),
    ("padding", "8px"),
    ("white-space", "nowrap"),
]
table_css = [
    ("width", "100%"),
    ("table-layout", "fixed"),
]

# Number formats per metric column.
number_formats = {
    "R²": "{:.4f}",
    "RMSE": "{:,.2f}",
    "MAE": "{:,.2f}",
    "PMAE (%)": "{:.2f}",
}

styled_results = (
    results_df.style
    .format(number_formats)
    # R² uses "Blues"; the error metrics use the reversed "Reds" map.
    .background_gradient(subset=["R²"], cmap="Blues")
    .background_gradient(subset=["RMSE", "MAE", "PMAE (%)"], cmap="Reds_r")
    .set_table_styles([
        {"selector": "thead th", "props": header_css},
        {"selector": "tbody td", "props": cell_css},
        {"selector": "table", "props": table_css},
    ])
    .set_properties(**{"text-align": "center"})
)

styled_results


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# for result in results_no_hyper_tune:
#     dataset_name = result["Dataset"]
#     model_name = result["Model"]

#     # Retrieve the corresponding dataset and model
#     df = datasets[dataset_name]
#     model = models[model_name]

#     X = df.drop(columns=["price"])
#     y = df["price"]

#     # Generate predictions using cross_val_predict
#     y_pred = cross_val_predict(model, X, y, cv=kf, n_jobs=-1)

#     # Plot the results
#     plt.figure(figsize=(18, 14))
#     plt.hexbin(y, y_pred, gridsize=70, mincnt=1, linewidths=0.5, edgecolors='gray', cmap='Blues', bins='log')
#     plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2, label='Perfect Prediction')
#     plt.xlabel("Actual Price")
#     plt.ylabel("Predicted Price")
#     plt.title(f"{dataset_name} - {model_name}")
#     plt.colorbar(label="Density")
#     plt.legend()
#     plt.tight_layout()
#     plt.show()


In [None]:
# Baseline scores as a table, then reshaped to Model x Dataset on R².
results_df = pd.DataFrame(results_no_hyper_tune)
heatmap_data = results_df.pivot(index="Model", columns="Dataset", values="R²")

# Order the rows by their mean R² across datasets, best first.
row_order = heatmap_data.mean(axis=1).sort_values(ascending=False).index
heatmap_data = heatmap_data.loc[row_order]

# Red-themed heatmap of the sorted scores.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".3f",
    cmap="Reds",
    linewidths=0.5,
    linecolor='white',
    cbar_kws={"label": "R² Score"},
    ax=ax,
)

ax.set_title("Model Performance by R² (Sorted)", fontsize=16)
ax.set_xlabel("Dataset", fontsize=12)
ax.set_ylabel("Model", fontsize=12)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)
plt.show()

In [None]:
from sklearn.base import clone

# Feature matrix / target for the two "Log" datasets used in the importance plots.
X_Fill = datasets["Fill Log"].drop(columns=["price"])
y_Fill = datasets["Fill Log"]["price"]

X_Drop = datasets["Drop Log"].drop(columns=["price"])
y_Drop = datasets["Drop Log"]["price"]

# Fit a fresh copy of the baseline RandomForest on the Fill Log data.
# The fit must name X_Fill / y_Fill explicitly: the bare globals `X` and `y`
# are leftovers from the evaluation loop and hold whichever dataset that loop
# visited last, which is not the dataset the Fill Log plot is labelled with.
model = clone(models["RandomForest"]).fit(X_Fill, y_Fill)

In [None]:
# Pair each importance from the fitted forest with its Fill Log column name,
# rank them, and keep the 20 largest.
importances_fill = (
    pd.Series(model.feature_importances_, index=X_Fill.columns)
    .sort_values(ascending=False)
    .iloc[:20]
)

# Two-column frame (Feature, Importance) for seaborn.
top_features_df = importances_fill.rename_axis('Feature').reset_index(name='Importance')

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_features_df,
    x='Importance',
    y='Feature',
    hue='Feature',
    palette='Reds_r',
    dodge=False,
    legend=False,  # hue repeats the y axis, so the legend adds nothing
    ax=ax,
)
ax.set_title("Top 20 Features - RandomForest (Fill Log Dataset)", fontsize=14)
ax.set_xlabel("Importance", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Fit a separate RandomForest on the Drop Log data. The `model` fitted in the
# earlier cell was not trained on X_Drop, so its feature_importances_ cannot be
# labelled with X_Drop.columns (the "fill" frames had extra columns removed, so
# the two feature sets are not the same).
model_drop = clone(models["RandomForest"]).fit(X_Drop, y_Drop)

# Rank the Drop Log importances and keep the top 20.
importances_drop = (
    pd.Series(model_drop.feature_importances_, index=X_Drop.columns)
    .sort_values(ascending=False)
    .head(20)
)

# Convert to DataFrame for seaborn
top_features_df = importances_drop.reset_index()
top_features_df.columns = ['Feature', 'Importance']

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(
    data=top_features_df,
    x='Importance',
    y='Feature',
    hue='Feature',
    palette='Reds_r',
    dodge=False,
    legend=False  # hide redundant legend
)
plt.title("Top 20 Features - RandomForest (Drop Log Dataset)", fontsize=14)
plt.xlabel("Importance", fontsize=12)
plt.ylabel("Feature", fontsize=12)
plt.tight_layout()
plt.show()

Note: `car_age` and `year` are redundant, so `year` is dropped.

In [None]:
# Top Fill Log importances scaled by 100 for easier reading.
importances_fill.mul(100)

In [None]:
# Top Drop Log importances scaled by 100 for easier reading.
importances_drop.mul(100)

In [None]:
# Names of the first 9 entries of importances_fill (already sorted descending).
select_feature_fill = importances_fill.index[:9]
select_feature_fill

In [None]:
# Names of the first 9 entries of importances_drop (already sorted descending).
select_feature_drop = importances_drop.index[:9]
select_feature_drop

In [None]:
# Total importance carried by the selected Fill Log features.
importances_fill.loc[select_feature_fill].sum()

In [None]:
# Total importance carried by the selected Drop Log features.
importances_drop.loc[select_feature_drop].sum()

## 🛠️ Feature Selection & Hyperparameter Tuning

In [None]:
# Narrow every dataset to its selected features plus the target column.
# Both "Drop" sets share select_feature_drop; both "Fill" sets share
# select_feature_fill.
keep_drop = select_feature_drop.tolist() + ['price']
keep_fill = select_feature_fill.tolist() + ['price']

datasets = {
    label: datasets[label][keep_cols].copy()
    for label, keep_cols in [
        ("Drop Log", keep_drop),
        ("Drop Iso", keep_drop),
        ("Fill Log", keep_fill),
        ("Fill Iso", keep_fill),
    ]
}

In [None]:
from tqdm.notebook import tqdm
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_predict, KFold
import numpy as np
import pandas as pd
import optuna

# Module-level stores filled in by run_tuning:
#   results_tuned - one metrics dict per tuned (dataset, model) pair
#   best_params   - (dataset_name, model_name) -> best Optuna parameters
results_tuned, best_params = [], {}

# --- Early stopping callback ---
class EarlyStoppingCallback:
    """Optuna callback that stops a study once it stops improving.

    After every trial the study's best value is compared with the best value
    this callback has seen so far. A strictly lower value resets the counter;
    anything else increments it. When the counter reaches ``patience`` the
    study is asked to stop.
    """

    def __init__(self, patience):
        self.patience = patience
        # Lowest best_value observed so far; starts at +inf so the first
        # trial always counts as an improvement.
        self.best_value = float('inf')
        # Consecutive callback invocations without improvement.
        self.counter = 0

    def __call__(self, study, trial):
        current_best = study.best_value
        improved = current_best < self.best_value

        if improved:
            self.best_value = current_best
        self.counter = 0 if improved else self.counter + 1

        # Still within the allowed number of stale trials: nothing to do.
        if self.counter < self.patience:
            return

        print(f"⛔️ Early stopping triggered after {self.patience} trials.")
        study.stop()


In [None]:
# --- Tuning functions ---
def tune_ridge(trial, X, y):
    """Optuna objective for Ridge: cross-validated sum of squared errors.

    Samples ``alpha`` log-uniformly from [1e-4, 1e3], predicts with the
    module-level ``kf`` splitter, and returns the SSE of those predictions.
    """
    ridge = Ridge(alpha=trial.suggest_float("alpha", 1e-4, 1e3, log=True))
    cv_pred = cross_val_predict(ridge, X, y, cv=kf, n_jobs=-1)
    squared_errors = (y - cv_pred) ** 2
    return np.sum(squared_errors)

def tune_rf(trial, X, y):
    """Optuna objective for RandomForest: cross-validated sum of squared errors.

    The search space is kept small: 50-150 trees, depth 3-10,
    ``min_samples_split`` 2-5 and ``min_samples_leaf`` 1-3. Predictions come
    from the module-level ``kf`` splitter.
    """
    # Suggest in a fixed order so the trial's parameter sequence is stable.
    n_estimators = trial.suggest_int("n_estimators", 50, 150)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 3)

    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        n_jobs=-1,
        random_state=42,
    )
    cv_pred = cross_val_predict(forest, X, y, cv=kf, n_jobs=-1)
    squared_errors = (y - cv_pred) ** 2
    return np.sum(squared_errors)

def tune_xgb(trial, X, y):
    """Optuna objective for XGBoost: cross-validated sum of squared errors.

    Samples learning rate (log scale), number of trees, depth, and the row /
    column subsampling ratios; ``tree_method`` and ``random_state`` are fixed.
    Predictions come from the module-level ``kf`` splitter.
    """
    # Suggest in a fixed order so the trial's parameter sequence is stable.
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.3, log=True)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 32)
    subsample = trial.suggest_float('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1.0)

    booster = XGBRegressor(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        tree_method='gpu_hist',
        random_state=42,
        n_jobs=-1,
    )
    cv_pred = cross_val_predict(booster, X, y, cv=kf, n_jobs=-1)
    squared_errors = (y - cv_pred) ** 2
    return np.sum(squared_errors)


In [None]:
# --- Main tuning loop ---
def run_tuning(dataset_name, model_name, datasets, n_trials=None, patience=None):
    """Tune one model on one dataset with Optuna and record its CV scores.

    Parameters
    ----------
    dataset_name : str
        Key into ``datasets``.
    model_name : str
        One of "RidgeRegression", "RandomForest" or "XGBoost".
    datasets : dict
        Maps dataset names to DataFrames that contain a "price" column.
    n_trials : int, optional
        Maximum number of Optuna trials. Defaults to 100 for RidgeRegression
        and 25 for the other models.
    patience : int, optional
        Number of non-improving trials tolerated by the early-stopping
        callback. Defaults to 40 for RidgeRegression and 10 otherwise.

    Side effects
    ------------
    Stores the best parameters in the module-level ``best_params`` under
    ``(dataset_name, model_name)`` and appends a metrics row to the
    module-level ``results_tuned`` (or replaces the existing row for the same
    dataset/model pair). Reads the module-level ``kf`` splitter.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported models.
    KeyError
        If ``dataset_name`` is not a key of ``datasets``.
    """
    # Model label -> Optuna objective helper. Validating up front gives a clear
    # error instead of an objective that silently returns None for every trial.
    tuners = {
        "RidgeRegression": tune_ridge,
        "RandomForest": tune_rf,
        "XGBoost": tune_xgb,
    }
    if model_name not in tuners:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(tuners)}"
        )

    X = datasets[dataset_name].drop(columns=["price"])
    y = datasets[dataset_name]["price"]

    # Position of an earlier result for this dataset/model pair, if any, so a
    # re-run replaces that row instead of adding a duplicate.
    existing_result_index = next(
        (i for i, result in enumerate(results_tuned)
         if result["Dataset"] == dataset_name and result["Model"].startswith(model_name)),
        None
    )

    # Set default trial/patience if not specified
    if n_trials is None:
        n_trials = 100 if model_name == "RidgeRegression" else 25
    if patience is None:
        patience = 40 if model_name == "RidgeRegression" else 10

    if existing_result_index is not None:
        print(f"✅ {model_name} on {dataset_name} has already been tuned. Retraining and updating results.")
    else:
        print(f"🔧 Starting {model_name} on {dataset_name} with {n_trials} trials")

    def objective(trial):
        return tuners[model_name](trial, X, y)

    early_stopping = EarlyStoppingCallback(patience=patience)

    # Progress bar wrapper
    with tqdm(total=n_trials, desc=f"{dataset_name} | {model_name}", leave=False) as pbar:
        def progress_bar_callback(study, trial):
            pbar.update(1)

        # Seed the sampler so repeated runs explore the same trial sequence,
        # in line with the fixed seeds used for the models and the splitter.
        study = optuna.create_study(
            direction="minimize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        # Run exactly n_trials so the count matches both the progress-bar
        # total and the number announced in the start message.
        study.optimize(objective, n_trials=n_trials,
                       callbacks=[early_stopping, progress_bar_callback])

    best_params[(dataset_name, model_name)] = study.best_params

    # Rebuild the winning configuration and score it with the same splitter.
    tuned = study.best_params
    if model_name == "RidgeRegression":
        model = Ridge(**tuned)
    elif model_name == "RandomForest":
        model = RandomForestRegressor(**tuned, random_state=42, n_jobs=-1)
    else:
        # tree_method is fixed (not sampled) in tune_xgb, so it is absent from
        # best_params; pass it again so the reported score comes from the same
        # configuration that was tuned.
        model = XGBRegressor(**tuned, tree_method='gpu_hist',
                             random_state=42, n_jobs=-1, verbosity=0)

    y_pred = cross_val_predict(model, X, y, cv=kf, n_jobs=-1)

    # NOTE(review): the dataset keys used in this notebook are "Drop Log",
    # "Drop Iso", "Fill Log" and "Fill Iso", so this equality never matches and
    # no back-transform is applied. Left unchanged because whether `price` is
    # stored on a log1p scale cannot be confirmed from here — TODO confirm and
    # either remove this branch or correct the condition.
    if dataset_name == "Log":
        y_true = np.expm1(y)
        y_pred = np.expm1(y_pred)
    else:
        y_true = y

    r2, rmse, mae, pmae = evaluate(y_true, y_pred)

    result = {
        "Dataset": dataset_name,
        "Model": model_name + " (Tuned)",
        "R²": round(r2, 4),
        "RMSE": round(rmse, 2),
        "MAE": round(mae, 2),
        "PMAE (%)": round(pmae, 2)
    }

    if existing_result_index is not None:
        # Update the existing result in results_tuned
        results_tuned[existing_result_index] = result
    else:
        # Append as a new result if it's not already present
        results_tuned.append(result)

    print(f"✅ Finished {model_name} on {dataset_name}")


In [None]:
# Tune every tunable model on both "Drop" datasets.
# The loop variables are named dataset_name / model_name so they do not
# overwrite the module-level `model` (the fitted RandomForest) with a string.
for dataset_name in ["Drop Log", "Drop Iso"]:
    for model_name in ["RidgeRegression", "RandomForest", "XGBoost"]:
        run_tuning(dataset_name, model_name, datasets)

In [None]:
# Tune every tunable model on both "Fill" datasets.
# The loop variables are named dataset_name / model_name so they do not
# overwrite the module-level `model` (the fitted RandomForest) with a string.
for dataset_name in ["Fill Log", "Fill Iso"]:
    for model_name in ["RidgeRegression", "RandomForest", "XGBoost"]:
        run_tuning(dataset_name, model_name, datasets)

In [None]:
# To tune a single dataset/model pair, call
# run_tuning(dataset, model, datasets), e.g. run_tuning("Fill Log", "XGBoost", datasets).
# If that pair already has a row in results_tuned, the row is replaced.
run_tuning(dataset_name="Fill Log", model_name="XGBoost", datasets=datasets)

In [None]:
import pandas as pd

# Tabulate the tuned scores collected by run_tuning.
results_df = pd.DataFrame(results_tuned)

# Table-level CSS, one rule per targeted element.
tuned_table_styles = [
    dict(selector="thead th", props=[
        ("background-color", "#1976D2"),
        ("color", "white"),
        ("font-weight", "bold"),
        ("text-align", "center"),
        ("white-space", "nowrap"),
        ("padding", "10px"),
    ]),
    dict(selector="tbody td", props=[
        ("border", "1px solid #ddd"),
        ("text-align", "center"),
        ("padding", "8px"),
        ("white-space", "nowrap"),
    ]),
    dict(selector="table", props=[
        ("width", "100%"),
        ("table-layout", "fixed"),
    ]),
]

styler = results_df.style.format({
    "R²": "{:.4f}",
    "RMSE": "{:,.2f}",
    "MAE": "{:,.2f}",
    "PMAE (%)": "{:.2f}",
})
# R² uses "Blues"; the error metrics use the reversed "Reds" map.
styler = styler.background_gradient(subset=["R²"], cmap="Blues")
styler = styler.background_gradient(subset=["RMSE", "MAE", "PMAE (%)"], cmap="Reds_r")
styler = styler.set_table_styles(tuned_table_styles)
styled_results = styler.set_properties(**{"text-align": "center"})

styled_results


In [None]:
# Tuned scores as a table.
results_tuned_df = pd.DataFrame(results_tuned)

if not results_tuned_df.empty:
    # Model x Dataset grid of R² values.
    heatmap_data = results_tuned_df.pivot(index="Model", columns="Dataset", values="R²")

    # Order the rows by their mean R² across datasets, best first.
    row_order = heatmap_data.mean(axis=1).sort_values(ascending=False).index
    heatmap_data = heatmap_data.loc[row_order]

    # Red-themed heatmap of the sorted scores.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        heatmap_data,
        annot=True,
        fmt=".3f",
        cmap="Reds",
        linewidths=0.5,
        linecolor='white',
        cbar_kws={"label": "R² Score"},
        ax=ax,
    )

    ax.set_title("Model Performance by R² (Sorted)", fontsize=16)
    ax.set_xlabel("Dataset", fontsize=12)
    ax.set_ylabel("Model", fontsize=12)
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.show()
else:
    # Nothing has been tuned yet, so there is nothing to pivot.
    print("No tuned results available to create a heatmap.")


In [None]:
# Baseline and tuned results as DataFrames.
df_no_tune = pd.DataFrame(results_no_hyper_tune)
df_tuned = pd.DataFrame(results_tuned)

# Persist both tables so the comparison can be rebuilt without re-training.
df_no_tune.to_csv('../dataset/results_no_hyper_tune.csv', index=False)
df_tuned.to_csv('../dataset/result_tuned.csv', index=False)


# df_no_tune = pd.read_csv('../dataset/results_no_hyper_tune.csv') #use for called from saved result 
# df_tuned = pd.read_csv('../dataset/result_tuned.csv')

# Strip the " (Tuned)" / " Selected" suffixes so the Model and Dataset keys
# line up with the baseline table for the merge.
df_tuned["Model"] = df_tuned["Model"].str.replace(r" \(Tuned\)", "", regex=True)
df_tuned["Dataset"] = df_tuned["Dataset"].str.replace(r" Selected", "", regex=True)

# Inner merge: only dataset/model pairs present in both tables are compared.
comparison = pd.merge(
    df_no_tune,
    df_tuned,
    on=["Dataset", "Model"],
    suffixes=("_NoTune", "_Tuned")
)

# Deltas are oriented so that a positive value always means "tuning helped":
# higher R² after tuning, lower error after tuning.
comparison["Δ R²"] = comparison["R²_Tuned"] - comparison["R²_NoTune"]
comparison["Δ RMSE"] = comparison["RMSE_NoTune"] - comparison["RMSE_Tuned"]
comparison["Δ MAE"] = comparison["MAE_NoTune"] - comparison["MAE_Tuned"]
comparison["Δ PMAE"] = comparison["PMAE (%)_NoTune"] - comparison["PMAE (%)_Tuned"]

# Round to 4 decimals: the R² columns are stored and displayed with 4 decimals,
# and rounding to 3 before sorting would collapse small R² differences into
# ties and lose their ranking.
comparison = comparison.round(4)

# Largest R² improvement first.
comparison = (
    comparison
    .sort_values(by="Δ R²", ascending=False)
    .reset_index(drop=True)
)

# Style the comparison DataFrame
styled_comparison = comparison.style.format({
    "R²_NoTune": "{:.4f}",
    "RMSE_NoTune": "{:,.2f}",
    "MAE_NoTune": "{:,.2f}",
    "PMAE (%)_NoTune": "{:.2f}",
    "R²_Tuned": "{:.4f}",
    "RMSE_Tuned": "{:,.2f}",
    "MAE_Tuned": "{:,.2f}",
    "PMAE (%)_Tuned": "{:.2f}",
    "Δ R²": "{:.4f}",
    "Δ RMSE": "{:,.2f}",
    "Δ MAE": "{:,.2f}",
    "Δ PMAE": "{:.2f}"
}).background_gradient(subset=["R²_NoTune", "R²_Tuned", "Δ R²"], cmap="Blues") \
  .background_gradient(subset=["RMSE_NoTune", "MAE_NoTune", "PMAE (%)_NoTune", "RMSE_Tuned", "MAE_Tuned", "PMAE (%)_Tuned"], cmap="Reds_r") \
  .background_gradient(subset=["Δ RMSE", "Δ MAE", "Δ PMAE"], cmap="Reds") \
  .set_table_styles([
    {"selector": "thead th", "props": [
        ("background-color", "#1976D2"),
        ("color", "white"),
        ("font-weight", "bold"),
        ("text-align", "center"),
        ("white-space", "nowrap"),
        ("padding", "10px")
    ]},
    {"selector": "tbody td", "props": [
        ("border", "1px solid #ddd"),
        ("text-align", "center"),
        ("padding", "8px"),
        ("white-space", "nowrap")
    ]},
    {"selector": "table", "props": [
        ("width", "100%"),
        ("table-layout", "fixed")
    ]}
]) \
  .set_properties(**{"text-align": "center"})

styled_comparison

In [None]:
# Long-format view of the comparison table: one row per
# (dataset, model, metric, version) with the score in a single column.
metrics = ["R²", "RMSE", "MAE", "PMAE (%)"]
versions = ["NoTune", "Tuned"]
score_columns = [f"{metric}_{version}" for metric in metrics for version in versions]

melted = comparison.melt(
    id_vars=["Dataset", "Model"],
    value_vars=score_columns,
    var_name="Metric_Version",
    value_name="Score",
)

# Split e.g. "RMSE_Tuned" into Metric="RMSE" and Version="Tuned".
melted[["Metric", "Version"]] = melted["Metric_Version"].str.extract(r"(.*)_((?:NoTune)|(?:Tuned))")
melted = melted.drop(columns=["Metric_Version"])

# Grid of bar charts: one row per dataset, one column per metric, with
# tuned vs. untuned bars side by side for each model.
catplot_options = dict(
    kind="bar",
    x="Score",
    y="Model",
    hue="Version",
    col="Metric",
    row="Dataset",
    palette="Set2",
    sharex=False,  # each metric is on its own scale
    height=4,
    aspect=1.6,
)
g = sns.catplot(data=melted, **catplot_options)

# Leave headroom above the grid for the overall title.
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Model Performance Comparison: Tuned vs. NoTune")
plt.show()