# LightGBM Model (log-target)

Train a tuned LightGBM regressor using the leakage-free preprocessing pipeline, then export predictions for Kaggle.


In [None]:
# === Imports ===
import os
import sys
# Make the parent directory importable so the `src` package resolves.
# The path is resolved relative to the current working directory.
sys.path.append(os.path.abspath("../"))

# Reload modules to pick up latest changes
import importlib
import src.preprocess.preprocessing_pipeline
import src.preprocess.encoding
import src.preprocess.feature_engineering
# encoding and feature_engineering are reloaded before preprocessing_pipeline;
# presumably the pipeline module imports from them -- TODO confirm.
importlib.reload(src.preprocess.encoding)
importlib.reload(src.preprocess.feature_engineering)
importlib.reload(src.preprocess.preprocessing_pipeline)

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error
import joblib

from src.load_data import load_train_data, load_test_data
# Imported after the reloads above so the name binds to the reloaded class.
from src.preprocess.preprocessing_pipeline import PreprocessingPipeline

print("✅ Modules reloaded - ready to use new features!")


In [None]:
# === Load data & create holdout split ===
df = load_train_data()

# The raw price is the target; every other column is a predictor.
y_raw = df["price"]
X_raw = df.drop(columns=["price"])

# Hold out 20% of the rows for validation; the seed is fixed so the split
# is reproducible across runs.
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw, y_raw, random_state=42, test_size=0.2
)

print(f"Train rows: {X_train_raw.shape[0]} | Val rows: {X_val_raw.shape[0]}")


In [None]:
# === Preprocess training data ===
# Re-attach the target so the pipeline receives features and price together.
train_df = X_train_raw.copy()
train_df["price"] = y_train_raw

pipeline = PreprocessingPipeline(
    use_log_target=True,
    drop_low_importance=True,
    encode_data=False,  # keep categoricals for native LightGBM handling
    use_target_encoding=True  # Add target encoding as additional features
)
train_processed = pipeline.fit_transform(train_df)

# Features are everything except the raw and log targets; the model is
# trained on the log target.
X_train = train_processed.drop(columns=["price", "log_price"], errors="ignore").copy()
y_train = train_processed["log_price"].copy()

# LightGBM requires: numeric (int/float/bool) or category dtype.
categorical_candidates = ["model", "brand", "transmission", "fuelType", "brand_model"]

# Only the known candidates are passed to LightGBM as categorical features,
# listed in the order they appear in X_train.
cat_features = [col for col in X_train.columns if col in categorical_candidates]

# Cast the candidates and any other object/string column to category.
# After this loop no object column can remain, so no numeric-coercion
# fallback is needed.
for col in X_train.columns:
    if X_train[col].dtype == "object" or col in categorical_candidates:
        X_train[col] = X_train[col].astype("category")

print(f"✅ X_train shape: {X_train.shape}")
print(f"✅ Categorical features (for native handling): {cat_features}")
print(f"✅ Target-encoded features: {[col for col in X_train.columns if 'target_enc' in col]}")
print(f"✅ Tax features: {[col for col in X_train.columns if 'tax' in col.lower()]}")
print(f"✅ Polynomial features: {[col for col in X_train.columns if 'squared' in col or 'interaction' in col]}")
print(f"✅ Column dtypes: {X_train.dtypes.value_counts().to_dict()}")


In [None]:
# === Preprocess validation data ===
# Apply the pipeline fitted on the training split; it is not refit here.
val_df = X_val_raw.copy()
val_df["price"] = y_val_raw
val_processed = pipeline.transform(val_df)

X_val = val_processed.drop(columns=["price", "log_price"], errors="ignore").copy()
if "log_price" in val_processed.columns:
    y_val = val_processed["log_price"].copy()
else:
    # Fall back to computing the log target directly from the raw prices.
    y_val = np.log1p(y_val_raw)

# Fail with an explicit message if the transform did not produce every
# training column (plain column selection would raise a less readable KeyError).
missing_in_val = [col for col in X_train.columns if col not in X_val.columns]
if missing_in_val:
    raise KeyError(f"Validation data is missing training columns: {missing_in_val}")

# Ensure X_val has same columns (and column order) as X_train
X_val = X_val[X_train.columns].copy()

# Use the training schema as the source of truth: every column that is
# categorical in X_train is re-coded with the training categories, so
# levels unseen during training become NaN.
for col in X_train.columns:
    if X_train[col].dtype.name == "category":
        X_val[col] = pd.Categorical(X_val[col], categories=X_train[col].cat.categories)

print(f"✅ X_val shape: {X_val.shape}")
print(f"✅ X_val column dtypes: {X_val.dtypes.value_counts().to_dict()}")


In [None]:
# === Hyperparameter search (RandomizedSearchCV) ===
# Every entry is a discrete list of candidate values to sample from.
param_dist = {
    "num_leaves": [31, 63, 95, 127],
    "max_depth": [-1, 8, 10, 12],
    "learning_rate": [0.01, 0.02, 0.03, 0.05],
    "n_estimators": [600, 900, 1200, 1500],
    "subsample": [0.7, 0.85, 1.0],
    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is non-zero; with the default of 0 the
    # subsample candidates above are no-ops. Keeping it in the search space
    # (rather than on the base estimator) makes it part of best_params, so
    # any model built from best_params gets the same bagging setup.
    "subsample_freq": [1],
    "colsample_bytree": [0.7, 0.85, 1.0],
    "reg_alpha": [0.0, 0.1, 0.3, 0.5],
    "reg_lambda": [0.0, 0.1, 0.3, 0.5],
    "min_child_weight": [1, 5, 10],
    "min_child_samples": [20, 40, 60]
}

# NOTE(review): both the estimator and the search use n_jobs=-1, which may
# oversubscribe CPU cores -- consider n_jobs=1 here while searching.
base_lgbm = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1,
    boosting_type="gbdt"
)

# 40 sampled configurations, each scored with 5-fold CV on the log target.
# NOTE(review): X_train was produced by a pipeline fitted on the whole
# training split (including target encoding), so fold scores may be
# optimistic -- confirm how PreprocessingPipeline computes those features.
random_search = RandomizedSearchCV(
    estimator=base_lgbm,
    param_distributions=param_dist,
    n_iter=40,
    scoring="neg_root_mean_squared_error",
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Forwarded to LGBMRegressor.fit for every CV fit and the final refit.
fit_params = {"categorical_feature": cat_features}
random_search.fit(X_train, y_train, **fit_params)

best_model = random_search.best_estimator_
best_params = random_search.best_params_
# The scorer is negated RMSE, so flip the sign to report a positive error.
best_cv_rmse = -random_search.best_score_

print("Best params:", best_params)
print(f"Best CV RMSE (log): {best_cv_rmse:.5f}")


In [None]:
# === Validation performance ===
val_preds = best_model.predict(X_val)

# Error measured directly on the log-scale target and predictions.
val_rmse_log = root_mean_squared_error(y_true=y_val, y_pred=val_preds)
# Same error after mapping both sides back with expm1.
val_rmse_raw = root_mean_squared_error(
    y_true=np.expm1(y_val),
    y_pred=np.expm1(val_preds),
)
print(f"Validation RMSE (log): {val_rmse_log:.5f}")
print(f"Validation RMSE ($): {val_rmse_raw:.2f}")


In [None]:
# === Train final model on full dataset ===
# Refit preprocessing and the model on every labelled row (train + holdout).
full_df = load_train_data()
final_pipeline = PreprocessingPipeline(
    use_log_target=True,
    drop_low_importance=True,
    encode_data=False,
    # Same setting as the pipeline used for tuning, so the final model is
    # trained on the feature set the hyperparameters were selected for.
    use_target_encoding=True
)
full_processed = final_pipeline.fit_transform(full_df)

X_full = full_processed.drop(columns=["price", "log_price"], errors="ignore").copy()
y_full = full_processed["log_price"].copy()

# Native categorical handling for the known candidates that survived preprocessing.
cat_features_full = [col for col in categorical_candidates if col in X_full.columns]
for col in cat_features_full:
    X_full[col] = X_full[col].astype("category")

final_model = LGBMRegressor(
    **best_params,
    random_state=42,
    n_jobs=-1,
    boosting_type="gbdt"
)
final_model.fit(X_full, y_full, categorical_feature=cat_features_full)

model_path = "../models/lightgbm_model.joblib"
# joblib.dump does not create parent directories; make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Persist everything needed to reproduce predictions: the model, the fitted
# pipeline, the categorical column list and the training column order.
joblib.dump({
    "model": final_model,
    "pipeline": final_pipeline,
    "categorical_features": cat_features_full,
    "feature_columns": X_full.columns.tolist()
}, model_path)
print(f"Saved LightGBM model + pipeline to {model_path}")


In [None]:
# === Generate Kaggle submission ===
test_df = load_test_data()
test_processed = final_pipeline.transform(test_df)

X_test = test_processed.copy()
# Columns the model was trained on but the transformed test set lacks are
# filled with 0 (best effort). Report them so the fallback is never silent.
missing_cols = [col for col in X_full.columns if col not in X_test.columns]
if missing_cols:
    print(f"WARNING: filling columns missing from test data with 0: {missing_cols}")
for col in missing_cols:
    X_test[col] = 0
# Keep exactly the training columns, in training order.
X_test = X_test[X_full.columns]

# Re-code categoricals with the training categories; unseen levels become NaN.
for col in cat_features_full:
    if col in X_test.columns:
        X_test[col] = pd.Categorical(X_test[col], categories=X_full[col].cat.categories)

# The model predicts the log target; expm1 maps predictions back to prices.
test_preds_log = final_model.predict(X_test)
test_preds = np.expm1(test_preds_log)

submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Actual": test_preds
})
submission_path = "../results/lightgbm_test_preds.csv"
# to_csv does not create parent directories; make sure the target exists.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Saved LightGBM submission to {submission_path}")
