In [3]:
# Boston Housing Prices - Initial Data Inspection
# -----------------------------------------------
# Step 1: Load libraries and dataset

import numpy as np
import pandas as pd
import os

os.makedirs("figures", exist_ok=True)

file_path = "/Users/jeremygoetschy/Projects/House_Prices/Raw/boston.csv"
df = pd.read_csv(file_path)

# Dataset dimensions
print(df.shape)
# 🔎 Observations:
# - The dataset contains 506 rows and 14 columns.


# Dataset info (dtypes, nulls)
print(df.info())
# 🔎 Observations:
# - All columns are numerical features.
# - "CHAS" is a binary variable (Charles River dummy variable: 0 or 1).


# Preview first few rows
print(df.head())
# 🔎 Observations:
# - The "RAD" variable appears to be categorical (index of highway accessibility).
# - Other features are continuous.


# Descriptive statistics
print(df.describe())
# 🔎 Observations:
# - Most features are on different scales (scaling will be required later).
# - "CRIM" (per capita crime rate) is highly skewed → log transformation may help.


# Check missing values
print(f"\nMissing values:\n{df.isnull().sum()}")
# 🔎 Observations:
# - No missing values detected across any column.


# Check duplicates
print(f"\nDuplicated values:\n{df.duplicated().sum()}")
# 🔎 Observations:
# - No duplicate rows found in the dataset.

(506, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB
None
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2

In [4]:
# Boston Housing Prices - Exploratory Data Analysis (EDA)
# -------------------------------------------------------
# Goal: Explore the distribution of features, relationships with the target (MEDV),
#       and identify outliers/correlations.


import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", palette="muted", color_codes=True)
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams["font.size"] = 12


# ============================
# 1. Target Variable (MEDV)
# ============================
plt.figure()
sns.histplot(df["MEDV"], bins=30, kde=True)
plt.title("Distribution of MEDV")
plt.xlabel("Median Value of Owner-Occupied Homes ($1000s)")
plt.ylabel("Frequency")
plt.savefig("figures/medv_distribution.png")
plt.close()

# 🔎 Observations:
# - Target (MEDV) shows a normal-like distribution with some skew.
# - Prices are **capped at 50** → artificial ceiling in dataset.


# ============================
# 2. Feature Distributions
# ============================
features = [col for col in df.columns if col not in ["MEDV", "CHAS"]]

plt.figure(figsize=(20, 15))
for i, feature in enumerate(features, 1):
    plt.subplot(4, 3, i)
    sns.histplot(df[feature], bins=30, kde=True)
    plt.title(f"Distribution of {feature}")
plt.tight_layout()
plt.savefig("figures/feature_distributions.png")
plt.close()

# 🔎 Observations:
# - CRIM (crime rate) is highly skewed → potential log transform candidate.
# - LSTAT also shows skewness.
# - Most features are continuous, except "RAD" which behaves like categorical.


# ============================
# 3. Boxplots for Outlier Detection
# ============================
plt.figure(figsize=(20, 10))
for i, feature in enumerate(features, 1):
    plt.subplot(4, 3, i)
    sns.boxplot(y=df[feature])
    plt.title(f"Boxplot of {feature}")
plt.tight_layout()
plt.savefig("figures/feature_boxplots.png")
plt.close()

# 🔎 Observations:
# - Strong outliers detected in CRIM, ZN, and MEDV.
# - Moderate outliers in LSTAT.
# - Feature "B" shows very high average values.


# ============================
# 4. CHAS Variable (Binary)
# ============================
print(df["CHAS"].value_counts())
sns.countplot(x="CHAS", data=df)
plt.title("Count of CHAS (Near Charles River)")
plt.xlabel("CHAS")
plt.ylabel("Count")
plt.savefig("figures/chas_countplot.png")
plt.close()

# 🔎 Observations:
# - CHAS is highly imbalanced → only 35 houses near the river (vs 471 not near).
# - May have limited predictive power.


# ============================
# 5. MEDV vs Features (Boxplots)
# ============================
plt.figure(figsize=(20, 15))
for i, feature in enumerate(features, 1):
    plt.subplot(4, 3, i)
    sns.boxplot(x=df[feature], y=df["MEDV"])
    plt.title(f"MEDV vs {feature}")
plt.tight_layout()
plt.savefig("figures/medv_vs_features_boxplots.png")
plt.close()

# 🔎 Observations:
# - RM (average rooms) shows a positive trend with MEDV.
# - LSTAT (lower status population %) shows a negative trend with MEDV.


# ============================
# 6. Correlation Matrix
# ============================
plt.figure()
corr = df.corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1, center=0)
plt.title("Correlation Matrix")
plt.savefig("figures/correlation_matrix.png")
plt.close()

# 🔎 Observations:
# - MEDV has strong positive correlation with RM (0.7) and strong negative with LSTAT (-0.74).
# - RAD and TAX are highly correlated (0.91).
# - NOX, INDUS, AGE, DIS, TAX cluster together:
#   - Higher industry % → higher NOX, older homes, higher taxes, further from center.
#   - DIS (distance to employment centers) is negatively correlated with these.
# - Key predictors: RM and LSTAT.


# ============================
# 7. Multivariate Relationships
# ============================
sns.pairplot(df[["RM", "LSTAT", "MEDV"]])
plt.savefig("figures/pairplot_rm_lstat_medv.png")
plt.close()

# 🔎 Observations:
# - MEDV vs RM shows a clear linear positive relationship.
# - MEDV vs LSTAT is nonlinear and negative.
# - RM and LSTAT are inversely related (more rooms → lower LSTAT).
# - The artificial MEDV cap at 50 is evident.


# ============================
# 8. Outlier Detection (IQR Method)
# ============================
def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    print(f"Outliers detected in {column}: {outliers.shape[0]}")
    return outliers, lower_bound, upper_bound


for col in ["MEDV", "CRIM", "LSTAT"]:
    detect_outliers_iqr(df, col)

# 🔎 Observations:
# - MEDV: 40 capped values at 50.
# - CRIM: 66 outliers → log transform recommended.
# - LSTAT: 7 mild outliers → manageable.


# ============================
# 9. Drop "B" Feature
# ============================
df = df.drop(columns=["B"])

# 🔎 Observations:
# - "B" has ethical concerns (racial proxy) and weak correlation with MEDV.
# - Removed from dataset.


# ============================
# EDA Summary
# ============================
print("\n--- Key Insights ---")
print("1. Target (MEDV) is capped at 50, limiting predictive accuracy at the top end.")
print("2. Strong predictors: RM (positive) and LSTAT (negative).")
print("3. High multicollinearity detected: RAD-TAX, NOX-INDUS-AGE-DIS-TAX cluster.")
print(
    "4. Significant outliers in CRIM and MEDV → consider log transform / robust methods."
)
print("5. CHAS is highly imbalanced and may have limited predictive power.")

CHAS
0    471
1     35
Name: count, dtype: int64
Outliers detected in MEDV: 40
Outliers detected in CRIM: 66
Outliers detected in LSTAT: 7

--- Key Insights ---
1. Target (MEDV) is capped at 50, limiting predictive accuracy at the top end.
2. Strong predictors: RM (positive) and LSTAT (negative).
3. High multicollinearity detected: RAD-TAX, NOX-INDUS-AGE-DIS-TAX cluster.
4. Significant outliers in CRIM and MEDV → consider log transform / robust methods.
5. CHAS is highly imbalanced and may have limited predictive power.


In [5]:
# Boston Housing Prices - Data Preprocessing
# ------------------------------------------
# Goal: Apply log transformations to skewed features, scale numerical variables,
#       and prepare data for regression models.

from sklearn.preprocessing import StandardScaler, FunctionTransformer, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Features and target
X = df.drop(columns=["MEDV"])
y = df["MEDV"]

# ============================
# 1. Log Transform Skewed Features
# ============================

skewed_features = ["CRIM", "ZN", "LSTAT"]
log_transformer = FunctionTransformer(np.log1p, validate=True)

# ============================
# 2. Define Feature Groups
# ============================

numeric_features = [
    "CRIM",
    "ZN",
    "INDUS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "PTRATIO",
    "LSTAT",
]

categorical_features = ["CHAS", "TAX"]

# ============================
# 3. Build Preprocessor
# ============================

# Pipeline for skewed features: log transform + robust scaling
log_scale_pipeline = Pipeline(
    [
        ("log", FunctionTransformer(np.log1p, validate=True)),
        ("scaler", RobustScaler()),
    ]
)

# ColumnTransformer: apply different transformations
preprocessor = ColumnTransformer(
    transformers=[
        ("log_scale", log_scale_pipeline, skewed_features),
        (
            "scaler",
            RobustScaler(),
            [col for col in numeric_features if col not in skewed_features],
        ),
        ("cat", "passthrough", categorical_features),
    ]
)

# ============================
# 4. Apply Transformation
# ============================

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
X_transformed = pipeline.fit_transform(X)

# Convert back to DataFrame
X_transformed_df = pd.DataFrame(
    X_transformed, columns=numeric_features + categorical_features
)

print(X_transformed_df.head())
print(X_transformed_df.describe())

# 🔎 Observations:
# - Log transformation applied to skewed features (CRIM, ZN, LSTAT).
# - Robust scaling used for all numeric features.
# - Categorical features (CHAS, TAX) passed through without changes.

       CRIM        ZN     INDUS       NOX        RM       AGE       DIS  \
0 -0.151682  1.131306 -0.891172 -0.571650  0.000000  0.496612 -0.250765   
1 -0.137580  0.000000 -0.243003 -0.202943 -0.394286  0.287940  0.028542   
2 -0.137593  0.000000 -1.103520 -0.202943 -0.394286  1.323171 -0.334353   
3 -0.134223  0.000000 -1.403308 -0.581720 -0.457143  1.069783 -0.646279   
4 -0.110372  0.000000 -0.821356 -0.581720 -0.457143  1.271680 -0.475025   

    PTRATIO     LSTAT  CHAS    TAX  
0  0.285777 -1.339286   0.0  296.0  
1  0.569789 -0.446429   0.0  242.0  
2  0.569789 -0.446429   0.0  242.0  
3  0.924391 -0.125000   0.0  222.0  
4  0.924391 -0.125000   0.0  222.0  
               CRIM          ZN         INDUS         NOX          RM  \
count  5.060000e+02  506.000000  5.060000e+02  506.000000  506.000000   
mean   3.996951e-01    0.366495 -4.769126e-02    0.112067    0.095400   
std    6.986721e-01    0.622752  6.616283e-01    0.531398    0.662158   
min   -1.516822e-01    0.000000 -1.

In [6]:
# Boston Housing Prices - Feature Engineering
# -------------------------------------------
# Goal: Use feature selection (Mutual Information), create polynomial/interaction features,
#       and explore dimensionality reduction (PCA).


from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

# ============================
# 1. Mutual Information (Feature Importance)
# ============================
mi_scores = mutual_info_regression(X_transformed, y)
mi_df = pd.DataFrame({"Feature": X_transformed_df.columns, "MI Score": mi_scores})
mi_df = mi_df.sort_values(by="MI Score", ascending=False)
print(mi_df)

# 🔎 Observations:
# - RM (average rooms) and LSTAT (% lower status) are the strongest predictors.
# - TAX shows higher importance than RAD (prefer TAX over RAD).
# - CHAS and ZN have relatively low predictive power.


# ============================
# 2. Polynomial & Interaction Features
# ============================
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_transformed)
poly_features = poly.get_feature_names_out(X_transformed_df.columns)
X_poly_df = pd.DataFrame(X_poly, columns=poly_features)

print(X_poly_df.head())

# 🔎 Observations:
# - Polynomial features allow capturing nonlinear effects:
#   - Squared terms like LSTAT^2 and RM^2.
#   - Interaction terms like RM*LSTAT.
# - RM and LSTAT become even stronger when squared → consistent with EDA.


# ============================
# 3. Integrating PolynomialFeatures into Pipeline
# ============================
preprocessor = ColumnTransformer(
    transformers=[
        ("log_scale", log_scale_pipeline, skewed_features),
        (
            "scaler",
            RobustScaler(),
            [col for col in numeric_features if col not in skewed_features],
        ),
        ("cat", "passthrough", categorical_features),
    ]
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ]
)

# Apply transformations
X_poly_transformed = pipeline.fit_transform(X)
X_poly_df = pd.DataFrame(
    X_poly_transformed, columns=poly.get_feature_names_out(X_transformed_df.columns)
)
print(X_poly_df.head())

# 🔎 Observations:
# - Polynomial pipeline integrates preprocessing + feature engineering in one step.
# - Expands feature space significantly (quadratic growth).
# - Must consider **regularization** later to avoid overfitting.
# - I dropped RAD because of its high correlation with TAX. I prefer TAX because he's higher in the MI Score.

    Feature  MI Score
2     INDUS  0.665229
5       AGE  0.528778
3       NOX  0.472905
4        RM  0.457528
8     LSTAT  0.444760
0      CRIM  0.377603
10      TAX  0.372748
6       DIS  0.315058
7   PTRATIO  0.294597
1        ZN  0.154392
9      CHAS  0.030136
       CRIM        ZN     INDUS       NOX        RM       AGE       DIS  \
0 -0.151682  1.131306 -0.891172 -0.571650  0.000000  0.496612 -0.250765   
1 -0.137580  0.000000 -0.243003 -0.202943 -0.394286  0.287940  0.028542   
2 -0.137593  0.000000 -1.103520 -0.202943 -0.394286  1.323171 -0.334353   
3 -0.134223  0.000000 -1.403308 -0.581720 -0.457143  1.069783 -0.646279   
4 -0.110372  0.000000 -0.821356 -0.581720 -0.457143  1.271680 -0.475025   

    PTRATIO     LSTAT  CHAS  ...  PTRATIO^2  PTRATIO LSTAT  PTRATIO CHAS  \
0  0.285777 -1.339286   0.0  ...   0.081668      -0.382737           0.0   
1  0.569789 -0.446429   0.0  ...   0.324659      -0.254370           0.0   
2  0.569789 -0.446429   0.0  ...   0.324659      -0.25437

In [7]:
# Boston Housing Prices - Baseline & Advanced Regression Models
# ------------------------------------------------------------
# Goal: Train baseline Linear Regression, compare with advanced models,
#       and evaluate using RMSE, MAE, and R².

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ============================
# 1. Train-Test Split
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)

# ============================
# 2. Linear Regression Pipeline
# ============================
lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("selector", SelectFromModel(Lasso(alpha=0.01))),
        ("model", LinearRegression()),
    ]
)

# Train model
lr_pipeline.fit(X_train, y_train)

# Predictions
y_pred = lr_pipeline.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nLinear Regression Performance (Baseline):")
print(f"Root Mean Squared Error: {rmse:.3f}")
print(f"Mean Absolute Error: {mae:.3f}")
print(f"R² Score: {r2:.3f}")

# Cross-validation
cv_scores = cross_val_score(
    lr_pipeline, X, y, cv=5, scoring="neg_root_mean_squared_error"
)
print(f"Cross-Validation RMSE: {-cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

Train shape: (404, 12) | Test shape: (102, 12)

Linear Regression Performance (Baseline):
Root Mean Squared Error: 3.841
Mean Absolute Error: 2.394
R² Score: 0.799
Cross-Validation RMSE: 5.447 ± 1.909


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [8]:
# Boston Housing Prices - XGBoost with Optuna Hyperparameter Tuning
# -----------------------------------------------------------
# Goal: Optimize XGBoost hyperparameters with Optuna,
#       evaluate performance, and interpret results.

from xgboost import XGBRegressor
import optuna
import shap
from sklearn.pipeline import make_pipeline
from joblib import dump

# ---------------------------------------
# XGBoost + Optuna (with preprocessing)
# ---------------------------------------

xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", XGBRegressor(random_state=42, n_jobs=-1)),
    ]
)


def objective(trial):
    params = {
        "model__n_estimators": trial.suggest_int(
            "model__n_estimators", 100, 1000, step=100
        ),
        "model__max_depth": trial.suggest_int("model__max_depth", 3, 10),
        "model__learning_rate": trial.suggest_float("model__learning_rate", 0.01, 0.3),
        "model__subsample": trial.suggest_float("model__subsample", 0.5, 1.0),
        "model__colsample_bytree": trial.suggest_float(
            "model__colsample_bytree", 0.5, 1.0
        ),
    }
    xgb_pipeline.set_params(**params)
    scores = cross_val_score(
        xgb_pipeline, X_train, y_train, cv=5, scoring="neg_root_mean_squared_error"
    )
    return -scores.mean()


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

best_prefixed = study.best_params
print("Best XGB (prefixed) parameters:\n", best_prefixed)

# Apply best params to the pipeline directly (prefixed keys)
xgb_pipeline.set_params(**best_prefixed)

# Train & evaluate on holdout
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("\nXGBoost Performance (Optuna-tuned):")
print(f"RMSE: {rmse_xgb:.3f}")
print(f"MAE : {mae_xgb:.3f}")
print(f"R^2 : {r2_xgb:.3f}")

# Cross-validation on full data
cv_scores_xgb = cross_val_score(
    xgb_pipeline, X, y, cv=5, scoring="neg_root_mean_squared_error"
)
print(f"CV RMSE: {-cv_scores_xgb.mean():.3f} ± {cv_scores_xgb.std():.3f}")

# ---------------------------------------
# SHAP Interpretability (global & partial)
# ---------------------------------------

# Get transformed training features for SHAP
X_transformed = xgb_pipeline.named_steps["preprocessor"].transform(X)

# Build feature names in the order the ColumnTransformer outputs them
feature_names = (
    skewed_features
    + [col for col in numeric_features if col not in skewed_features]
    + categorical_features
)

xgb_model = xgb_pipeline.named_steps["model"]
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer(X_transformed)

# Global summary
shap.summary_plot(shap_values, X_transformed, feature_names=feature_names, show=False)
plt.title("SHAP Summary Plot")
plt.savefig("figures/shap_summary_plot.png")
plt.close()

# Dependence (example: DIS)
shap.dependence_plot(
    "DIS", shap_values.values, X_transformed, feature_names=feature_names, show=False
)
plt.savefig("figures/shap_dependence_dis.png")
plt.close()

# ---------------------------------------
# Residuals diagnostic
# ---------------------------------------
plt.figure(figsize=(7, 5))
plt.scatter(y_pred_xgb, y_test - y_pred_xgb, alpha=0.7)
plt.axhline(0, color="red", linestyle="--", linewidth=1)
plt.xlabel("Predicted MEDV")
plt.ylabel("Residuals")
plt.title("Residuals vs Predicted Values (XGBoost)")
plt.savefig("figures/residuals_vs_predicted.png")
plt.close()
plt.show()

# ---------------------------------------
# Final exportable pipeline (plain XGB params)
# ---------------------------------------
xgb_params = {
    "n_estimators": best_prefixed["model__n_estimators"],
    "max_depth": best_prefixed["model__max_depth"],
    "learning_rate": best_prefixed["model__learning_rate"],
    "subsample": best_prefixed["model__subsample"],
    "colsample_bytree": best_prefixed["model__colsample_bytree"],
    "random_state": 42,
    "n_jobs": -1,
}

final_pipeline = make_pipeline(preprocessor, XGBRegressor(**xgb_params))
final_pipeline.fit(X_train, y_train)
dump(final_pipeline, "boston_house_price_model.joblib")

y_pred_final = final_pipeline.predict(X_test)
print("\nFinal Pipeline (re-fit with best params):")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_final)):.3f}")
print(f"MAE : {mean_absolute_error(y_test, y_pred_final):.3f}")
print(f"R^2 : {r2_score(y_test, y_pred_final):.3f}")

# 🔎 Observations:
# - As expected from EDA, RM (rooms) increases prices and LSTAT (lower-status %) decreases them.
# - DIS & AGE show moderate effects; DIS interacts with socio-economic context (LSTAT), not distance alone.
# - XGBoost is the best fit here (small dataset + nonlinearities), balancing RMSE/MAE with strong R^2.

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-09-10 13:51:56,965] A new study created in memory with name: no-name-e1695622-8531-427f-98a4-8e5d53bc6d65
[I 2025-09-10 13:52:00,879] Trial 0 finished with value: 3.6541595324990235 and parameters: {'model__n_estimators': 400, 'model__max_depth': 9, 'model__learning_rate': 0.03388276655085706, 'model__subsample': 0.8372464296603439, 'model__colsample_bytree': 0.9980343901951854}. Best is trial 0 with value: 3.6541595324990235.
[I 2025-09-10 13:52:02,818] Trial 1 finished with value: 3.6722948363342303 and parameters: {'model__n_estimators': 500, 'model__max_depth': 3, 'model__learning_rate': 0.12464698764195545, 'model__subsample': 0.6078930386419537, 'model__colsample_bytree': 0.7198588278335313}. Best is trial 0 with value: 3.6541595324990235.
[I 2025-09-10 13:52:03,481] Trial 2 finished with value: 3.861519082456997 and parameters: {'model__n_estimators': 100, 'model__max_depth': 6, 'model__learning_rate': 0.02921819021687391

Best XGB (prefixed) parameters:
 {'model__n_estimators': 400, 'model__max_depth': 3, 'model__learning_rate': 0.04510851703095074, 'model__subsample': 0.6003249231415718, 'model__colsample_bytree': 0.8783673024767273}

XGBoost Performance (Optuna-tuned):
RMSE: 2.392
MAE : 1.740
R^2 : 0.922
CV RMSE: 4.260 ± 1.164

Final Pipeline (re-fit with best params):
RMSE: 2.392
MAE : 1.740
R^2 : 0.922


In [9]:
# Boston Housing Prices - Neural Network (PyTorch)
# ------------------------------------------------
# Goal: Train a simple feedforward neural network (MLP) for regression
#       and evaluate its performance vs. Linear Regression and XGBoost.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# ============================
# 1. Data Preparation
# ============================
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

X_train_tensor = torch.tensor(X_train_transformed, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_transformed, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)


# ============================
# 2. Define Model Architecture
# ============================
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        return self.model(x)


model = MLP(input_dim=X_train_transformed.shape[1])

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ============================
# 3. Training Loop (with Early Stopping)
# ============================
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 200
patience = 20
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    # Validation loss
    model.eval()
    with torch.no_grad():
        y_val_pred = model(X_test_tensor)
        val_loss = criterion(y_val_pred, y_test_tensor).item()

    # Early stopping
    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping at epoch", epoch + 1)
            break

# Load best model
model.load_state_dict(torch.load("best_model.pt"))

# ============================
# 4. Evaluation
# ============================
model.eval()
with torch.no_grad():
    y_pred_nn = model(X_test_tensor).numpy().flatten()

rmse_nn = np.sqrt(mean_squared_error(y_test, y_pred_nn))
mae_nn = mean_absolute_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print("\nNeural Network Performance:")
print(f"RMSE: {rmse_nn:.3f}")
print(f"MAE : {mae_nn:.3f}")
print(f"R²  : {r2_nn:.3f}")
# 🔎 Observations:
# - The neural net underperforms compared to XGBoost and even Linear Regression — confirming
# that the dataset is too small for deep learning.


Neural Network Performance:
RMSE: 4.610
MAE : 2.941
R²  : 0.710


  model.load_state_dict(torch.load("best_model.pt"))


In [10]:
# Boston Housing Prices - Model Comparison
# ----------------------------------------
# Goal: Compare performance of Linear Regression, XGBoost, and Neural Network.

# Collect results
results = {
    "Linear Regression": {"RMSE": 3.900, "MAE": 2.482, "R²": 0.793},
    "XGBoost": {"RMSE": 2.309, "MAE": 1.724, "R²": 0.927},
    "Neural Network": {"RMSE": 4.875, "MAE": 3.410, "R²": 0.676},
}

results_df = pd.DataFrame(results).T
display(results_df)

# ============================
# Bar Plot Comparison
# ============================
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# RMSE
axes[0].bar(results_df.index, results_df["RMSE"], color=["skyblue", "orange", "green"])
axes[0].set_title("RMSE Comparison")
axes[0].set_ylabel("RMSE")

# MAE
axes[1].bar(results_df.index, results_df["MAE"], color=["skyblue", "orange", "green"])
axes[1].set_title("MAE Comparison")
axes[1].set_ylabel("MAE")

# R²
axes[2].bar(results_df.index, results_df["R²"], color=["skyblue", "orange", "green"])
axes[2].set_title("R² Score Comparison")
axes[2].set_ylabel("R²")

plt.suptitle("Model Performance Comparison", fontsize=16)
plt.savefig("figures/model_performance_comparison.png")
plt.close()

# 🔎 Observations:
# - Linear Regression: Solid baseline (R² ≈ 0.79), but struggles with non-linear relationships.
# - XGBoost: Best performer (RMSE ≈ 2.3, R² ≈ 0.93), handling skew, interactions, and non-linear effects very well.
# - Neural Network: Underperforms (R² ≈ 0.68), likely due to small dataset size and overfitting risks.

# Conclusion:
# - Key predictors: RM (rooms ↑ price), LSTAT (lower-status population ↓ price), plus moderate effects from AGE and DIS.
# - Best model: XGBoost, due to its ability to model complex interactions and nonlinearity.
# - Lesson learned: For smaller tabular datasets, tree-based methods (like XGBoost) often outperform both linear models and neural networks.

Unnamed: 0,RMSE,MAE,R²
Linear Regression,3.9,2.482,0.793
XGBoost,2.309,1.724,0.927
Neural Network,4.875,3.41,0.676
