In [None]:
# === XGBoost Modeling Notebook ===

# === 1. Imports ===
import sys
import os
sys.path.append(os.path.abspath("../"))  # ensure src/ is importable

import joblib
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error, r2_score

from src.load_data import load_train_data, load_test_data
from src.preprocess.preprocessing_pipeline import PreprocessingPipeline

In [None]:
# === 2. Load data ===
# Load the labelled training set through the project's loader. Later cells
# read a "price" column from this frame as the prediction target.
df = load_train_data()

In [None]:
# === 3. Prepare features and target (before preprocessing) ===
# Pull the target out first, then keep every remaining column as a raw feature.
y_raw = df["price"]
X_raw = df.drop(columns=["price"])

In [None]:
# === 4. Train-test split (on raw data) ===
# Hold out 20% of the rows before any preprocessing is fitted, so the
# preprocessing statistics are learned from the training part only.
# The fixed seed keeps the split reproducible across kernel restarts.
split_parts = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = split_parts


In [None]:
# === 4a. Preprocess training data ===
# The pipeline is fitted on a frame that still carries the target, so the
# target is re-attached to the training features before fit_transform().
train_df = X_train_raw.copy()
train_df["price"] = y_train_raw

pipeline_options = {
    "use_log_target": True,
    "drop_low_importance": True,
    "encode_data": True,
}
pipeline = PreprocessingPipeline(**pipeline_options)
train_processed = pipeline.fit_transform(train_df)

# The model is trained on the log-scale target; both target columns are
# removed from the feature matrix (errors='ignore' tolerates a missing one).
y_train = train_processed["log_price"]
X_train = train_processed.drop(columns=["price", "log_price"], errors='ignore')

In [None]:
# === 4b. Preprocess test data (using fitted pipeline) ===
# Re-attach the target so the hold-out frame has the same columns the pipeline
# was fitted on; only transform() is called, so nothing is learned from it.
test_df = X_test_raw.copy()
test_df["price"] = y_test_raw  # For consistency
test_processed = pipeline.transform(test_df)
X_test = test_processed.drop(columns=["price", "log_price"], errors='ignore')

# The model predicts on the log scale, so the evaluation target must be on the
# log scale as well. Falling back to the raw price would silently compare
# log-scale predictions against raw-scale targets and yield meaningless metrics.
if "log_price" in test_processed.columns:
    y_test = test_processed["log_price"]
else:
    # Prefer the price column that went through the pipeline (keeps row
    # alignment with X_test); otherwise use the raw hold-out target.
    # NOTE(review): assumes the pipeline's log transform is log1p, matching the
    # np.expm1 inverse applied when building the submission - confirm.
    price_for_eval = (
        test_processed["price"] if "price" in test_processed.columns else y_test_raw
    )
    y_test = np.log1p(price_for_eval)

In [None]:
# === 5. Train basic XGBoost model ===
# Exhaustive grid search: 3 values for each of 5 hyperparameters
# (243 combinations), each scored with 3-fold cross-validation on RMSE.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1, 0.2],
    subsample=[0.7, 0.9, 1.0],
    colsample_bytree=[0.7, 0.9, 1.0],
)

# NOTE(review): both the estimator and the search request n_jobs=-1; this may
# oversubscribe CPU threads - consider parallelising only one of the two.
base_model = XGBRegressor(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator from the best-scoring parameter combination.
xgb_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

In [None]:
# === 6. Predict and evaluate ===
# Score the tuned model on the hold-out split; y_test and the predictions are
# compared directly, so both metrics are reported on the target's own scale.
y_pred = xgb_model.predict(X_test)
rmse, r2 = root_mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"XGBoost RMSE (log-target): {rmse:.2f}")
print(f"XGBoost R2 Score: {r2:.3f}")

In [None]:
# === 7. Save model ===
# Persist the tuned estimator. The target directory is created first so the
# dump does not fail with FileNotFoundError when ../models does not exist yet.
# joblib is imported in the notebook's top import cell.
model_path = "../models/xgboost_model.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(xgb_model, model_path)

In [None]:
# === 8. Generate predictions for stacking ===
# Note: This generates predictions on full training set, not OOF
# For proper OOF predictions, use the oof_xgboost.py script
#
# transform() is used instead of fit_transform(): xgb_model was trained on
# features produced by the pipeline fitted on the training split, so refitting
# the pipeline here could yield features that no longer match the model, and
# would also change the fitted state reused for the test set further down.
full_train_processed = pipeline.transform(df)
X_full = full_train_processed.drop(columns=["price", "log_price"], errors='ignore')
train_preds = xgb_model.predict(X_full)

# Create the output directory if needed so to_csv does not fail on a missing path.
train_preds_path = "../results/xgb_oof_train_preds.csv"
os.makedirs(os.path.dirname(train_preds_path), exist_ok=True)
pd.DataFrame({
    "xgb_oof_pred": train_preds
}).to_csv(train_preds_path, index=False)


In [None]:
# === 9. Load and preprocess test data ===
test_df = load_test_data()
# Reuse the already-fitted pipeline; only transform() is applied to unseen data.
test_clean = pipeline.transform(test_df)
# Mirror the feature selection used for training and validation: drop the
# target columns if the pipeline emitted them (no-op when they are absent).
X_submit = test_clean.drop(columns=["price", "log_price"], errors='ignore')

# === 10. Predict and save submission ===
test_preds = np.expm1(xgb_model.predict(X_submit))  # Convert log(price) back

submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Actual": test_preds
})

# Create the output directory if needed so to_csv does not fail on a missing path.
submission_path = "../results/xgb_test_preds.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
# The check-mark emoji was stored as mis-decoded UTF-8 bytes; restored here.
print("✅ XGBoost submission saved to: ../results/xgb_test_preds.csv")
