In [None]:
import pandas as pd
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Read the preprocessed training set and discard the 'Unnamed: 0' column
# (presumably the index written out by an earlier `to_csv` -- verify upstream).
processed_train_data = (
    pd.read_csv('../data/processed/processed_train_data.csv')
    .drop(columns=['Unnamed: 0'])
)
processed_train_data.info()

In [None]:
# Quick schema check: list the columns available after loading.
processed_train_data.columns

## Model 1: XGBoost Model with Standard Hyperparameters

### Steps to Prevent Data Leakage in the Neighborhood Variable
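For each fold, the neighborhood statistics are computed from that fold's training data only, after excluding training rows whose sale price lies three or more standard deviations from the training mean. A smoothed average of the log-transformed sale price is computed for each neighborhood by blending the neighborhood mean with the global training mean, which reduces noise from neighborhoods with few samples: `smoothed = (mean * count + global_mean * alpha) / (count + alpha)`, with `alpha = 10`. These smoothed values are then mapped onto both the training and validation sets. Neighborhoods that appear in validation but were not seen in training receive the global mean. This process ensures that no target information from the validation set leaks into training. Finally, the original neighborhood column is removed, leaving only the numerical smoothed feature for modeling.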

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import zscore

# 10-fold cross-validation of an XGBoost regressor with hand-picked hyperparameters.
# The model is trained on log1p(SalePrice); all metrics are reported on the original price scale.
X = processed_train_data.drop(columns=['SalePrice'])
y = np.log1p(processed_train_data['SalePrice'])  # Log-transform target

kf = KFold(n_splits=10, shuffle=True, random_state=42)

mse_list, rmse_list, mae_list, r2_list = [], [], [], []

for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Keep only training rows whose sale price (original scale) has a z-score
    # strictly between -3 and 3.
    # NOTE(review): this mask only feeds the neighborhood statistics below; the
    # model is still fitted on the unfiltered X_train / y_train. Confirm that
    # this is the intended behaviour.
    y_train_prices = np.expm1(y_train)
    z_scores = zscore(y_train_prices)
    mask = (z_scores > -3) & (z_scores < 3)

    X_train_filtered = X_train[mask]
    y_train_filtered = y_train[mask]

    # Smoothed mean encoding for Neighborhood, built from training rows only.
    # 'SalePrice' here is the log1p target, so the encoding lives in log space.
    neighborhood_stats = (
        X_train_filtered.assign(SalePrice=y_train_filtered)
        .groupby('Neighborhood')['SalePrice']
        .agg(['mean', 'count'])
    )
    global_mean = y_train_filtered.mean()

    alpha = 10  # smoothing parameter: weight given to the global mean
    neighborhood_stats['smoothed'] = (
        neighborhood_stats['mean'] * neighborhood_stats['count'] + global_mean * alpha
    ) / (neighborhood_stats['count'] + alpha)

    # Map the encoding onto both splits; neighborhoods missing from the filtered
    # training rows fall back to the global mean. The filled Series is assigned
    # back explicitly: `df[col].fillna(..., inplace=True)` operates on a
    # temporary under pandas Copy-on-Write and would silently leave NaNs behind.
    smoothed = neighborhood_stats['smoothed']
    X_train['Neighborhood_avg_price'] = X_train['Neighborhood'].map(smoothed).fillna(global_mean)
    X_val['Neighborhood_avg_price'] = X_val['Neighborhood'].map(smoothed).fillna(global_mean)

    X_train = X_train.drop(columns=['Neighborhood'])
    X_val = X_val.drop(columns=['Neighborhood'])

    # Model definition and training
    model = XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=0
    )
    model.fit(X_train, y_train)

    # Predict in log space, then invert the log1p transform
    y_pred_log = model.predict(X_val)
    y_pred = np.expm1(y_pred_log)
    y_val_orig = np.expm1(y_val)

    # Calculate metrics on the original price scale
    mse = mean_squared_error(y_val_orig, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val_orig, y_pred)
    r2 = r2_score(y_val_orig, y_pred)

    print(f"Fold {fold} -> MSE: {mse:.2f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}")

    mse_list.append(mse)
    rmse_list.append(rmse)
    mae_list.append(mae)
    r2_list.append(r2)

print("\n--- 10-Fold CV Average Results ---")
print(f"Average MSE: {np.mean(mse_list):.2f}")
print(f"Average RMSE: {np.mean(rmse_list):.2f}")
print(f"Average MAE: {np.mean(mae_list):.2f}")
print(f"Average R2: {np.mean(r2_list):.4f}")

## Hyperparameter Tuning Using Optuna

In [None]:
import optuna
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    """Optuna objective: mean 5-fold CV RMSE of an XGBoost regressor.

    Reads the notebook-level feature frame ``X`` (which must contain a
    ``Neighborhood`` column) and the log1p-transformed target ``y``. For each
    fold, ``Neighborhood`` is replaced by a smoothed mean encoding computed
    from that fold's training rows only, the model is fitted in log space, and
    the RMSE is measured after inverting the transform.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        The RMSE on the original price scale, averaged over all 5 folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
        # plain suggest_float replaces the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'random_state': 42,
        'verbosity': 0
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []

    for train_idx, val_idx in kf.split(X):
        # Index with this fold's own split. The previous code read the globals
        # `train_index` / `val_index` left behind by another cell, so every
        # trial was scored on one stale split.
        X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Keep only training rows whose sale price (original scale) has a
        # z-score strictly between -3 and 3. As in the other CV cells, the mask
        # feeds the encoding statistics only; the model sees all training rows.
        y_train_prices = np.expm1(y_train)
        z_scores = zscore(y_train_prices)
        mask = (z_scores > -3) & (z_scores < 3)

        X_train_filtered = X_train[mask]
        y_train_filtered = y_train[mask]

        # Smoothed mean encoding for Neighborhood (values are in log space)
        neighborhood_stats = (
            X_train_filtered.assign(SalePrice=y_train_filtered)
            .groupby('Neighborhood')['SalePrice']
            .agg(['mean', 'count'])
        )
        global_mean = y_train_filtered.mean()

        alpha = 10  # smoothing parameter: weight given to the global mean
        neighborhood_stats['smoothed'] = (
            neighborhood_stats['mean'] * neighborhood_stats['count'] + global_mean * alpha
        ) / (neighborhood_stats['count'] + alpha)

        # Unknown neighborhoods fall back to the global mean. Assign the filled
        # Series back instead of calling fillna(inplace=True) on a column
        # selection, which is a no-op under pandas Copy-on-Write.
        smoothed = neighborhood_stats['smoothed']
        X_train['Neighborhood_avg_price'] = X_train['Neighborhood'].map(smoothed).fillna(global_mean)
        X_val['Neighborhood_avg_price'] = X_val['Neighborhood'].map(smoothed).fillna(global_mean)

        X_train = X_train.drop(columns=['Neighborhood'])
        X_val = X_val.drop(columns=['Neighborhood'])

        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        # Score on the original price scale
        y_pred = np.expm1(model.predict(X_val))
        y_val_orig = np.expm1(y_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val_orig, y_pred)))

    # Return only after every fold has been evaluated (the return used to sit
    # inside the loop, ending the trial after the first fold).
    return np.mean(rmse_scores)

# Seed the sampler so that the sequence of sampled hyperparameters is repeatable
# across kernel restarts. TPESampler is Optuna's default single-objective
# sampler, so the search algorithm itself is unchanged.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Report the trial with the lowest objective value and its sampled parameters.
print("Best trial:")
trial = study.best_trial
print(f"  RMSE: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

## Model 2: Fine-Tuned XGBoost Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import zscore

# 10-fold cross-validation of the XGBoost regressor with tuned hyperparameters.
# The model is trained on log1p(SalePrice); all metrics are reported on the original price scale.
X = processed_train_data.drop(columns=['SalePrice'])
y = np.log1p(processed_train_data['SalePrice'])  # Log-transform target

kf = KFold(n_splits=10, shuffle=True, random_state=42)

mse_list, rmse_list, mae_list, r2_list = [], [], [], []

for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Keep only training rows whose sale price (original scale) has a z-score
    # strictly between -3 and 3.
    # NOTE(review): this mask only feeds the neighborhood statistics below; the
    # model is still fitted on the unfiltered X_train / y_train. Confirm that
    # this is the intended behaviour.
    y_train_prices = np.expm1(y_train)
    z_scores = zscore(y_train_prices)
    mask = (z_scores > -3) & (z_scores < 3)

    X_train_filtered = X_train[mask]
    y_train_filtered = y_train[mask]

    # Smoothed mean encoding for Neighborhood, built from training rows only.
    # 'SalePrice' here is the log1p target, so the encoding lives in log space.
    neighborhood_stats = (
        X_train_filtered.assign(SalePrice=y_train_filtered)
        .groupby('Neighborhood')['SalePrice']
        .agg(['mean', 'count'])
    )
    global_mean = y_train_filtered.mean()

    alpha = 10  # smoothing parameter: weight given to the global mean
    neighborhood_stats['smoothed'] = (
        neighborhood_stats['mean'] * neighborhood_stats['count'] + global_mean * alpha
    ) / (neighborhood_stats['count'] + alpha)

    # Map the encoding onto both splits; neighborhoods missing from the filtered
    # training rows fall back to the global mean. The filled Series is assigned
    # back explicitly: `df[col].fillna(..., inplace=True)` operates on a
    # temporary under pandas Copy-on-Write and would silently leave NaNs behind.
    smoothed = neighborhood_stats['smoothed']
    X_train['Neighborhood_avg_price'] = X_train['Neighborhood'].map(smoothed).fillna(global_mean)
    X_val['Neighborhood_avg_price'] = X_val['Neighborhood'].map(smoothed).fillna(global_mean)

    X_train = X_train.drop(columns=['Neighborhood'])
    X_val = X_val.drop(columns=['Neighborhood'])

    # Model definition and training (hyperparameter values hard-coded from a
    # tuning run; they are not read from the `study` object at runtime)
    model = XGBRegressor(
        n_estimators=1248,
        learning_rate=0.05499096292974256,
        max_depth=4,
        subsample=0.577013993087534,
        colsample_bytree=0.9068997064085227,
        gamma=0.0004916084930462287,
        reg_alpha=1.0951310305247596e-05,
        reg_lambda=0.03878934490687091,
        random_state=42,
        verbosity=0
    )

    model.fit(X_train, y_train)

    # Predict in log space, then invert the log1p transform
    y_pred_log = model.predict(X_val)
    y_pred = np.expm1(y_pred_log)
    y_val_orig = np.expm1(y_val)

    # Calculate metrics on the original price scale
    mse = mean_squared_error(y_val_orig, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val_orig, y_pred)
    r2 = r2_score(y_val_orig, y_pred)

    print(f"Fold {fold} -> MSE: {mse:.2f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}")

    mse_list.append(mse)
    rmse_list.append(rmse)
    mae_list.append(mae)
    r2_list.append(r2)

print("\n--- 10-Fold CV Average Results ---")
print(f"Average MSE: {np.mean(mse_list):.2f}")
print(f"Average RMSE: {np.mean(rmse_list):.2f}")
print(f"Average MAE: {np.mean(mae_list):.2f}")
print(f"Average R2: {np.mean(r2_list):.4f}")

import joblib

# Persist the fitted XGBoost model to disk.
# NOTE(review): `model` is whatever the last CV iteration left behind, i.e. it
# was fitted on that fold's training split rather than on the full training set.
# Only the model is written here; the fold's neighborhood encoding map
# (`neighborhood_stats` / `global_mean`) is not saved alongside it.
joblib.dump(model, '../models/xgb_model_final.joblib')
print("Final model successfully saved.")

In [None]:
# Reload the persisted model as a sanity check and print the feature names it
# was fitted with. Relies on `joblib` having been imported by the previous cell.
XGB_model = joblib.load('../models/xgb_model_final.joblib')
print(XGB_model.feature_names_in_)

## Model 3: TPOT

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import zscore
from tpot import TPOTRegressor
import joblib

# 10-fold cross-validation of a TPOT AutoML search.
# Assumes processed_train_data is already loaded and preprocessed.
# A fresh search is run per fold, so with max_time_mins=30 the whole cell can
# take up to roughly 10 x 30 minutes.
X = processed_train_data.drop(columns=['SalePrice'])
y = np.log1p(processed_train_data['SalePrice'])  # Log-transform target

kf = KFold(n_splits=10, shuffle=True, random_state=42)

mse_list, rmse_list, mae_list, r2_list = [], [], [], []

for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Keep only training rows whose sale price (original scale) has a z-score
    # strictly between -3 and 3.
    # NOTE(review): this mask only feeds the neighborhood statistics below; the
    # search is still fitted on the unfiltered X_train / y_train. Confirm that
    # this is the intended behaviour.
    y_train_prices = np.expm1(y_train)
    z_scores = zscore(y_train_prices)
    mask = (z_scores > -3) & (z_scores < 3)
    X_train_filtered = X_train[mask]
    y_train_filtered = y_train[mask]

    # Smoothed mean encoding for 'Neighborhood', built from training rows only.
    # 'SalePrice' here is the log1p target, so the encoding lives in log space.
    neighborhood_stats = (
        X_train_filtered.assign(SalePrice=y_train_filtered)
        .groupby('Neighborhood')['SalePrice']
        .agg(['mean', 'count'])
    )
    global_mean = y_train_filtered.mean()
    alpha = 10  # smoothing parameter: weight given to the global mean

    neighborhood_stats['smoothed'] = (
        neighborhood_stats['mean'] * neighborhood_stats['count'] + global_mean * alpha
    ) / (neighborhood_stats['count'] + alpha)

    # Map the encoding onto both splits; neighborhoods missing from the filtered
    # training rows fall back to the global mean. The filled Series is assigned
    # back explicitly: `df[col].fillna(..., inplace=True)` operates on a
    # temporary under pandas Copy-on-Write and would silently leave NaNs behind.
    smoothed = neighborhood_stats['smoothed']
    X_train['Neighborhood_avg_price'] = X_train['Neighborhood'].map(smoothed).fillna(global_mean)
    X_val['Neighborhood_avg_price'] = X_val['Neighborhood'].map(smoothed).fillna(global_mean)

    X_train = X_train.drop(columns=['Neighborhood'])
    X_val = X_val.drop(columns=['Neighborhood'])

    # TPOTRegressor - AutoML with genetic programming
    tpot = TPOTRegressor(
        generations=10,          # Number of iterations to evolve pipelines
        population_size=50,      # Number of pipelines per generation
        verbosity=2,             # Show progress logs
        scoring='neg_mean_squared_error',  # Optimize for MSE
        random_state=42,
        n_jobs=-1,               # Use all CPU cores
        max_time_mins=30         # Max time in minutes per fold
    )

    tpot.fit(X_train, y_train)

    # Predict on validation set and inverse log-transform
    y_pred_log = tpot.predict(X_val)
    y_pred = np.expm1(y_pred_log)
    y_val_orig = np.expm1(y_val)

    # Calculate performance metrics on the original price scale
    mse = mean_squared_error(y_val_orig, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val_orig, y_pred)
    r2 = r2_score(y_val_orig, y_pred)

    print(f"Fold {fold} -> MSE: {mse:.2f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}")

    mse_list.append(mse)
    rmse_list.append(rmse)
    mae_list.append(mae)
    r2_list.append(r2)

print("\n--- 10-Fold CV Average Results ---")
print(f"Average MSE: {np.mean(mse_list):.2f}")
print(f"Average RMSE: {np.mean(rmse_list):.2f}")
print(f"Average MAE: {np.mean(mae_list):.2f}")
print(f"Average R2: {np.mean(r2_list):.4f}")

# Save the best pipeline found by TPOT in the last fold.
# NOTE(review): `tpot` is re-created on every CV iteration, so this persists
# only the pipeline fitted on the final fold's training split; pipelines from
# the other folds are discarded and nothing is refitted on the full data.
joblib.dump(tpot.fitted_pipeline_, '../models/tpot_model_final.joblib')
print("Final TPOT model successfully saved.")