## XGBoost baseline model with default parameters


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [11]:

# === Helper function ===
def run_baseline(train_path, test_path, label='SalePrice', name=''):
    """Fit a default-parameter XGBRegressor and report its training RMSE.

    Reads the training CSV, fits ``XGBRegressor(random_state=42)`` on every
    column except ``label``, prints the RMSE measured on the training data
    itself, and shows an actual-vs-predicted scatter plot of up to 100
    randomly chosen training rows.

    Args:
        train_path: Path to the training CSV; must contain the ``label`` column.
        test_path: Path to the test CSV. It is read but not otherwise used.
        label: Name of the target column in the training CSV.
        name: Tag used in the printed messages and in the plot title.

    Returns:
        Tuple ``(model, rmse_train)``: the fitted model and its RMSE on the
        training set.
    """
    print(f"\n=== Running {name} ===")

    # Load the data. The test set is not used by this baseline; the read is
    # kept only so a missing or unreadable test file still fails here.
    train_df = pd.read_csv(train_path)
    pd.read_csv(test_path)

    # Split features and target.
    X_train = train_df.drop(columns=[label])
    y_train = train_df[label]

    # Build the model with default hyper-parameters (fixed seed only).
    model = XGBRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Predictions and RMSE on the training set.
    preds_train = model.predict(X_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, preds_train))
    print(f"[{name}] Training RMSE = {rmse_train:.4f}")

    # Visualise the fit on a random subset. Sampling without replacement
    # cannot draw more rows than exist, so cap the subset at the data size.
    n_points = min(100, len(y_train))
    sample_idx = np.random.choice(len(y_train), size=n_points, replace=False)
    plt.figure(figsize=(6, 6))
    plt.scatter(y_train.iloc[sample_idx], preds_train[sample_idx], alpha=0.7, edgecolor='k')
    # Diagonal reference line: points on it are predicted exactly.
    plt.plot([y_train.min(), y_train.max()],
             [y_train.min(), y_train.max()], 'r--', lw=2)
    plt.title(f"{name} — Actual vs Predicted (Train Set)\nRMSE = {rmse_train:.4f}")
    plt.xlabel(f"Actual {label}")
    plt.ylabel(f"Predicted {label}")
    plt.show()

    return model, rmse_train

In [16]:
# Baseline 1: cleaned data set with all features kept.
run_baseline(
    train_path="./data/cleaned/train.csv",
    test_path="./data/cleaned/test.csv",
    name="Cleaned_all_features",
)


=== Running Cleaned_all_features ===


ImportError: sklearn needs to be installed in order to use this module

In [None]:

# Baseline 2: Method 1 feature set (mutual information, 40 features).
run_baseline(
    train_path="./data/selected/train_filter.csv",
    test_path="./data/selected/test_filter.csv",
    name="Method1_Mutual_Info",
)


In [None]:
# Baseline 3: Method 2 feature set (XGBoost CV, 30 features).
run_baseline(
    train_path="./data/selected/train_Method2_selected.csv",
    test_path="./data/selected/test_Method2_selected.csv",
    name="Method2_XGBoost_CV",
)