In [None]:
# === XGBoost Modeling Notebook ===
# Testing new features: Target Encoding + Tax Features + Polynomial Features
# Using XGBoost as a fast proxy (2-3 min) to test improvements before running LightGBM (16 min)

# === 1. Imports ===
import sys
import os
# The notebook is expected to run from a sub-directory of the project root;
# adding the parent directory to sys.path makes the `src` package importable.
sys.path.append(os.path.abspath("../"))  # ensure src/ is importable

# Reload modules to pick up latest changes
# (lets edits to the preprocessing code take effect without restarting the kernel).
import importlib
import src.preprocess.preprocessing_pipeline
import src.preprocess.encoding
import src.preprocess.feature_engineering
# encoding and feature_engineering are reloaded before preprocessing_pipeline --
# presumably because the pipeline module imports from them; verify if the order is changed.
importlib.reload(src.preprocess.encoding)
importlib.reload(src.preprocess.feature_engineering)
importlib.reload(src.preprocess.preprocessing_pipeline)

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# NOTE(review): root_mean_squared_error only exists in recent scikit-learn releases -- confirm the pinned version.
from sklearn.metrics import root_mean_squared_error, r2_score

# Project-local imports come after the reloads above so the freshly reloaded
# PreprocessingPipeline class is the one bound in this notebook.
from src.load_data import load_train_data, load_test_data
from src.preprocess.preprocessing_pipeline import PreprocessingPipeline

print("✅ Modules reloaded - ready to use new features!")

In [None]:
# === 2. Load data ===
# Load the training data through the project loader. Later cells index it like a
# DataFrame and require a "price" column (the prediction target).
df = load_train_data()

In [None]:
# === 3. Prepare features and target (before preprocessing) ===
# Pull out the raw target first; everything except "price" is kept as a raw feature.
y_raw = df["price"]
X_raw = df.drop(columns=["price"])

In [None]:
# === 4. Train-test split (on raw data) ===
# 80/20 hold-out split with a fixed seed. Splitting happens before any
# preprocessing so the pipeline can be fitted on the training portion only.
(X_train_raw, X_test_raw,
 y_train_raw, y_test_raw) = train_test_split(X_raw, y_raw, random_state=42, test_size=0.2)


In [None]:
# === 4a. Preprocess training data ===
# The pipeline consumes a single frame that still carries the target, so the
# training target is re-attached to a copy of the training features.
train_df = X_train_raw.assign(price=y_train_raw)

# Pipeline configuration for this experiment (target encoding switched on).
pipeline_options = dict(
    use_log_target=True,
    drop_low_importance=True,
    encode_data=True,
    use_target_encoding=True,
)
pipeline = PreprocessingPipeline(**pipeline_options)

# Fit on the training split only; the fitted pipeline is reused for other data later.
train_processed = pipeline.fit_transform(train_df)

# The model is trained on the log-scale target; both target columns are removed
# from the feature matrix ("price" may already be absent, hence errors='ignore').
y_train = train_processed["log_price"]
X_train = train_processed.drop(columns=["price", "log_price"], errors='ignore')

# Quick sanity report on which engineered features made it into the matrix.
print(f"✅ Training data shape: {X_train.shape}")
print(f"✅ Features with target encoding: {[col for col in X_train.columns if 'target_enc' in col]}")
print(f"✅ New tax features: {[col for col in X_train.columns if 'tax' in col.lower()]}")
print(f"✅ Polynomial features: {[col for col in X_train.columns if 'squared' in col or 'interaction' in col]}")

In [None]:
# === 4b. Preprocess test data (using fitted pipeline) ===
# Only transform() is called here: the hold-out split must not influence any
# statistics learned by the pipeline.
test_df = X_test_raw.copy()
test_df["price"] = y_test_raw  # For consistency with the frame layout used when fitting
test_processed = pipeline.transform(test_df)
X_test = test_processed.drop(columns=["price", "log_price"], errors='ignore')

if "log_price" in test_processed.columns:
    y_test = test_processed["log_price"]
else:
    # The model predicts on the log scale and the evaluation cell applies
    # np.expm1 to y_test, so the fallback target must be log1p-transformed too.
    # (Falling back to the raw prices would silently corrupt every metric.)
    # NOTE(review): assumes transform() keeps the rows/index of X_test_raw -- confirm.
    y_test = np.log1p(y_test_raw)

In [None]:
# === 5. Hyperparameter tuning with RandomizedSearchCV ===
# Discrete candidate values; RandomizedSearchCV samples combinations from these lists.
param_dist = {
    "n_estimators": [300, 500, 800, 1000, 1200],
    "max_depth": [5, 7, 9, 11],
    "learning_rate": [0.01, 0.03, 0.05, 0.08, 0.1],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "reg_alpha": [0.0, 0.1, 0.3, 0.5],
    "reg_lambda": [0.0, 0.1, 0.3, 0.5, 1.0],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2]
}

# Parallelism is applied at the search level (n_jobs=-1 below), so each individual
# XGBoost fit stays single-threaded. Requesting all cores at both levels makes the
# worker processes compete for the same CPUs and slows the whole search down.
base_model = XGBRegressor(random_state=42, n_jobs=1)

# RandomizedSearchCV: 50 sampled candidates x 3-fold CV = 150 fits, plus one final
# refit of the best candidate on all of X_train. The full grid above contains
# 384,000 combinations, so an exhaustive GridSearchCV is not practical.
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring="neg_root_mean_squared_error",
    verbose=2,
    n_jobs=-1,
    random_state=42
)

print("Starting hyperparameter search...")
random_search.fit(X_train, y_train)

xgb_model = random_search.best_estimator_
# The search is finished, so the final model may use every core again for
# prediction (and for any later refit of the saved estimator).
xgb_model.set_params(n_jobs=-1)
best_params = random_search.best_params_
# The scorer is the *negated* RMSE (higher is better), so flip the sign back.
best_cv_rmse = -random_search.best_score_

print("\n" + "="*50)
print("Best parameters:", best_params)
print(f"Best CV RMSE (log scale): {best_cv_rmse:.5f}")
print("="*50)

In [None]:
# === 6. Predict and evaluate ===
y_pred = xgb_model.predict(X_test)

# Metrics computed directly on the values the model was trained to predict.
r2 = r2_score(y_test, y_pred)
rmse_log = root_mean_squared_error(y_test, y_pred)

# np.expm1 undoes the log1p-style target transform so the error is also
# reported in the original price units.
actual_price = np.expm1(y_test)
predicted_price = np.expm1(y_pred)
rmse_raw = root_mean_squared_error(actual_price, predicted_price)

print(f"XGBoost RMSE (log scale): {rmse_log:.5f}")
print(f"XGBoost RMSE ($): {rmse_raw:.2f}")
print(f"XGBoost R2 Score: {r2:.3f}")

In [None]:
# === 7. Save model ===
import joblib

# Persist the tuned estimator. The target directory is created first so the
# cell also works on a fresh checkout where ../models does not exist yet
# (joblib.dump does not create missing parent directories).
MODEL_PATH = "../models/xgboost_model.joblib"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(xgb_model, MODEL_PATH)

In [None]:
# === 8. Generate predictions for stacking ===
# Note: This generates predictions on full training set, not OOF
# (the column and file keep their "oof" names only for downstream compatibility).
# For proper OOF predictions, use the oof_xgboost.py script
#
# transform() -- not fit_transform() -- is used on purpose: xgb_model was trained
# on features produced by the pipeline as fitted on the training split. Refitting
# the pipeline on the full data would change the learned encodings underneath the
# model and would also alter the pipeline state used by every later cell.
full_train_processed = pipeline.transform(df)
X_full = full_train_processed.drop(columns=["price", "log_price"], errors='ignore')
train_preds = xgb_model.predict(X_full)  # predictions are on the log-price scale

# Make sure the output directory exists before writing.
os.makedirs("../results", exist_ok=True)
pd.DataFrame({
    "xgb_oof_pred": train_preds
}).to_csv("../results/xgb_oof_train_preds.csv", index=False)


In [None]:
# === 9. Load and preprocess test data ===
test_df = load_test_data()
# Reuse the already-fitted pipeline (transform only, no refit).
test_clean = pipeline.transform(test_df)

# Hand the model exactly the feature columns it was trained on, in the same order.
# Target columns are dropped if the pipeline emitted them (same handling as the
# train / hold-out cells); a missing training feature raises a KeyError here
# instead of an opaque feature-mismatch error inside XGBoost.
test_features = test_clean.drop(columns=["price", "log_price"], errors='ignore')[X_train.columns]

# === 10. Predict and save submission ===
test_preds = np.expm1(xgb_model.predict(test_features))  # Convert log(price) back

# NOTE(review): assumes transform() preserves the row count and order of test_df,
# otherwise IDs and predictions would be misaligned -- confirm.
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Actual": test_preds
})

# Make sure the output directory exists before writing.
os.makedirs("../results", exist_ok=True)
submission.to_csv("../results/xgb_test_preds.csv", index=False)
print("✅ XGBoost submission saved to: ../results/xgb_test_preds.csv")
