# Advanced Regression Pipeline


This notebook builds on the LightGBM pipeline to compare several regression algorithms (including SVR, linear and polynomial regression, a PyTorch MLP, and TabNet) using data from the local database. It includes data loading, preprocessing, model training, evaluation, and visualization.

In [1]:
import os
import json
import math
from datetime import datetime
from pathlib import Path
from typing import Dict, Tuple

## Environment
from dotenv import load_dotenv

## Core Scientific Stack
import numpy as np
import pandas as pd

## Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Database
import psycopg2

## Machine Learning / Preprocessing (scikit-learn)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, Pipeline as SkPipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Gradient Boosting Libraries
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
from xgboost import XGBRegressor

## Deep Learning / Tabular
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pytorch_tabnet.tab_model import TabNetRegressor

## Optimization & Persistence
import optuna
import joblib

# Read settings from the .env file; values there override variables already set
load_dotenv(override=True)

# Work out where the project lives. Scripts expose __file__; notebooks do not,
# in which case the current working directory is probed instead.
try:
    PROJECT_ROOT = Path(__file__).resolve().parent
except NameError:
    cwd = Path.cwd().resolve()
    notebook_names = ('supervised.ipynb', 'unsupervised.ipynb')
    if any((cwd / nb_name).exists() for nb_name in notebook_names):
        # The working directory already is the notebook folder
        PROJECT_ROOT = cwd
    else:
        # Otherwise take the nearest ancestor holding a repo marker file and use its src/
        marker_files = ('requirements.txt', 'README.md')
        repo_root = next(
            (
                ancestor
                for ancestor in cwd.parents
                if any((ancestor / marker).exists() for marker in marker_files)
            ),
            None,
        )
        # No marker found anywhere: fall back to the working directory
        PROJECT_ROOT = cwd if repo_root is None else repo_root / 'src'

# Model artifacts go to a `supervised` folder that is a sibling of PROJECT_ROOT
MODEL_DIR = (PROJECT_ROOT / '..' / 'supervised').resolve()
MODEL_DIR.mkdir(parents=True, exist_ok=True)
print(f"Models will be saved to: {MODEL_DIR}")

def save_model(model, name: str, extra: dict | None = None):
    """Persist a model plus a companion JSON metadata file inside ``MODEL_DIR``.

    Both files share the base name ``{name}_{UTC timestamp}``: the model is
    written with ``joblib.dump`` (``.joblib``) and the metadata as indented
    JSON (``.json``).

    Args:
        model: Object to serialize with joblib.
        name: Prefix of the artifact file names; also stored as ``model_name``.
        extra: Optional metadata entries merged over the default ones
            (``model_name`` and ``saved_utc``).
    """
    # Local import: the notebook's import cell only brings in ``datetime`` itself.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware UTC
    # "now" formats to exactly the same timestamp string.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    base_name = f"{name}_{timestamp}"
    model_path = MODEL_DIR / f"{base_name}.joblib"
    meta_path = MODEL_DIR / f"{base_name}.json"

    meta = {'model_name': name, 'saved_utc': timestamp}
    if extra:
        meta.update(extra)

    # Serialize the metadata before touching the disk so a metadata problem cannot
    # leave a model file without its JSON companion. ``default=str`` stringifies
    # values json cannot encode natively (e.g. NumPy scalars, Path objects).
    meta_text = json.dumps(meta, indent=2, default=str)

    joblib.dump(model, model_path)
    meta_path.write_text(meta_text, encoding='utf-8')
    print(f"Saved model -> {model_path.name}; metadata -> {meta_path.name}")

# Database connection parameters: (connection keyword, environment variable) pairs
_DB_ENV_VARS = (
    ("host", "LOCAL_HOST"),
    ("user", "LOCAL_USER"),
    ("password", "LOCAL_PW"),
    ("port", "LOCAL_PORT"),
    ("dbname", "LOCAL_DB"),
)
# A variable that is not set yields None for that key
db_params = {key: os.getenv(env_name) for key, env_name in _DB_ENV_VARS}

# Report the versions of the gradient boosting libraries in use
for _label, _module in (('LightGBM version:', lgb), ('XGBoost version:', xgb)):
    print(_label, _module.__version__)

Models will be saved to: D:\docs\MADS\696-Milestone 2\supervised
LightGBM version: 4.6.0
XGBoost version: 3.0.5


In [2]:
# Report the PyTorch build and whether a CUDA device can be used
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    # No GPU: explain how to install a CUDA-enabled build
    for hint in (
        "CUDA is NOT available - PyTorch will use CPU only",
        "To enable GPU training, install PyTorch with CUDA support:",
        "  conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia",
        "  or",
        "  pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118",
    ):
        print(hint)
else:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.5.1
CUDA available: True
CUDA version: 12.1
Device count: 1
Current device: 0
Device name: NVIDIA GeForce GTX 1660 Ti


### Auto Load / Conditional Training
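If a previously saved optimized model exists in `MODEL_DIR` (the `supervised` directory that sits next to the project root folder, as printed by the first cell), the notebook loads the most recent artifact (by timestamp in the filename) and skips retraining unless `FORCE_RETRAIN=True`. Note that `FORCE_RETRAIN` defaults to `True` in the cell below, so set it to `False` beforehand to reuse saved models.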

In [15]:
# Auto-load previously saved optimized models (XGBoost / RandomForest / SVR / LinearRegression / Polynomial / MLP / TabNet)
from pathlib import Path as _Path  # not used in this cell; kept in case other cells reference the alias
import json as _json

# Retrain by default unless the user already chose a value earlier in the session
globals().setdefault('FORCE_RETRAIN', True)

# Global variable name -> glob pattern of its saved artifact inside MODEL_DIR
MODEL_GLOB_PATTERNS = {
    'xgb_model': 'xgboost_opt_*.joblib',
    'rf_model': 'random_forest_opt_*.joblib',
    # NOTE(review): save_model() writes `.joblib` files, so nothing saved through it
    # matches this `.pt` pattern - confirm where `.pt` checkpoints are produced.
    'mlp_model': 'mlp_opt_*.pt',
    # 'tabnet_model': DEFERRED - skip here, load after wrapper class defined
    'svr_model': 'svr_opt_*.joblib',
    'lr_model': 'linear_regression_opt_*.joblib',
    # Polynomial regression is saved as 'poly_reg_opt' by the linear-models cell
    'poly_model': 'poly_reg_opt_*.joblib',
}

# Create a None placeholder for every model variable that does not exist yet, so
# later `if <model> is None or FORCE_RETRAIN` checks never hit a NameError and
# models already loaded in this session are not clobbered.
for _model_var in (*MODEL_GLOB_PATTERNS, 'tabnet_model'):
    globals().setdefault(_model_var, None)

loaded_flags = {}
for var, pattern in MODEL_GLOB_PATTERNS.items():
    if globals().get(var) is not None:
        loaded_flags[var] = 'pre-existing'
        continue
    # File names embed a UTC timestamp, so the lexicographically last match is the newest
    matches = sorted(MODEL_DIR.glob(pattern))
    if not matches:
        loaded_flags[var] = 'not found'
        continue
    latest = matches[-1]
    try:
        is_torch_checkpoint = latest.suffix == '.pt'
        if is_torch_checkpoint:
            # PyTorch checkpoints are only staged here: the estimator class is defined
            # in a later cell, so the model itself is recreated there.
            checkpoint = torch.load(latest, map_location='cpu')
            globals()[f"{var}_params"] = checkpoint['model_params']
            globals()[f"{var}_checkpoint"] = checkpoint
            globals()[var] = None  # Will recreate later
        else:
            # NOTE: joblib.load unpickles arbitrary objects - only load trusted artifacts
            globals()[var] = joblib.load(latest)

        # Companion metadata written next to the artifact, if present
        meta_file = latest.with_suffix('.json')
        if meta_file.exists():
            with open(meta_file) as f:
                globals()[f"{var}_meta"] = _json.load(f)

        if is_torch_checkpoint:
            loaded_flags[var] = f"✓ {latest.name} (PyTorch checkpoint)"
            print(f"  {var}: {latest} (will be recreated on use)")
        else:
            loaded_flags[var] = f"✓ {latest.name}"
            print(f"  {var}: {latest}")
    except Exception as e:
        # Best effort: a broken artifact only means the model gets retrained later
        loaded_flags[var] = f"failed: {e}"
        globals()[var] = None

loaded_flags['tabnet_model'] = 'deferred (class not yet defined)'

print("Auto-load status:")
for k, v in loaded_flags.items():
    print(f"  {k}: {v}")
print("FORCE_RETRAIN=", FORCE_RETRAIN)

Auto-load status:
  xgb_model: pre-existing
  rf_model: pre-existing
  mlp_model: pre-existing
  svr_model: pre-existing
  lr_model: pre-existing
  tabnet_model: deferred (class not yet defined)
FORCE_RETRAIN= True


In [4]:
# Connect to database and load data
# NOTE: pandas officially supports SQLAlchemy connectables; a raw psycopg2
# connection works here but makes pandas emit a UserWarning.
conn = None
try:
    conn = psycopg2.connect(**db_params)
    print("Database connection successful")
    sql_query = "SELECT * FROM dev.base_data;"
    df = pd.read_sql_query(sql_query, conn)
    print("Golden data loaded into DataFrame:")
    # DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
    df.info()
except Exception as e:
    print(f"An error occurred: {e}")
    # Every later cell needs `df`; stop here instead of failing later with a NameError
    raise
finally:
    # Release the connection even when the query fails
    if conn is not None:
        conn.close()

Database connection successful


  df = pd.read_sql_query(sql_query, conn)


Golden data loaded into DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23038 entries, 0 to 23037
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   school_name             23038 non-null  object 
 1   school_type             23038 non-null  object 
 2   teachers_fte            22550 non-null  float64
 3   enrollment              22863 non-null  float64
 4   grade_eight_enrollment  21613 non-null  float64
 5   math_counts             22507 non-null  float64
 6   math_high_pct           22507 non-null  float64
 7   read_counts             22386 non-null  float64
 8   read_high_pct           22386 non-null  float64
 9   pct_hhi_150k_200k       23038 non-null  float64
 10  pct_hhi_220k_plus       23038 non-null  float64
 11  avg_natwalkind          23038 non-null  float64
 12  total_10_14             23038 non-null  int64  
 13  pct_10_14               23038 non-null  int64  
 14  pct

## 3. Data Splitting: Train, Validation, Test
Split the dataset into train, validation, and test sets, ensuring proper handling of the target variable.

In [5]:
# Display the column labels of the loaded frame
df.columns

Index(['school_name', 'school_type', 'teachers_fte', 'enrollment',
       'grade_eight_enrollment', 'math_counts', 'math_high_pct', 'read_counts',
       'read_high_pct', 'pct_hhi_150k_200k', 'pct_hhi_220k_plus',
       'avg_natwalkind', 'total_10_14', 'pct_10_14', 'pct_female_10_14',
       'total_pop', 'schools_in_zip', 'dup_rank'],
      dtype='object')

In [6]:
# Choose the target column, then keep only complete rows, indexed by school name
TARGET = 'math_high_pct' if 'math_high_pct' in df.columns else 'target'
data = df.dropna().reset_index(drop=True).set_index('school_name')

# Every remaining column is a feature, except the target and these two columns
_excluded_cols = {TARGET, 'dup_rank', 'math_low_pct'}
feature_cols = [col for col in data.columns if col not in _excluded_cols]
X, y = data[feature_cols], data[TARGET]

# 60% train; the remaining 40% is halved into validation and test (20% each)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)
print(f'Train shape: {X_train.shape}, Validation shape: {X_valid.shape}, Test shape: {X_test.shape}')

Train shape: (12378, 15), Validation shape: (4126, 15), Test shape: (4126, 15)


## 4. Feature Engineering and Preprocessing Pipeline
Identify numeric and categorical features, set up StandardScaler and OneHotEncoder, and build a ColumnTransformer pipeline.

In [7]:
# Split the feature columns by dtype: numeric ones are scaled, object/category ones are one-hot encoded
numeric_features = list(X_train.select_dtypes(include=[np.number]).columns)
categorical_features = list(X_train.select_dtypes(include=['category', 'object']).columns)

print('Numeric features:', numeric_features)
print('Categorical features:', categorical_features)

# One small pipeline per column group, combined in a ColumnTransformer
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',  # columns in neither group are discarded
)

# Learn the scaling / encoding from the training split only, then apply it to all three splits
preprocessor.fit(X_train)
X_train_enc, X_valid_enc, X_test_enc = (
    preprocessor.transform(split) for split in (X_train, X_valid, X_test)
)

Numeric features: ['teachers_fte', 'enrollment', 'grade_eight_enrollment', 'math_counts', 'read_counts', 'read_high_pct', 'pct_hhi_150k_200k', 'pct_hhi_220k_plus', 'avg_natwalkind', 'total_10_14', 'pct_10_14', 'pct_female_10_14', 'total_pop', 'schools_in_zip']
Categorical features: ['school_type']


## Model Optimization function

In [8]:
# Reusable Optuna-based optimizer for cross-validated hyperparameter tuning
from typing import Callable, Dict, Tuple

def optimize_model_with_optuna(
    model_name: str,
    estimator_builder: Callable[[Dict], object],
    param_space_fn: Callable[[optuna.trial.Trial], Dict],
    X,
    y,
    scoring: str = 'neg_root_mean_squared_error',
    cv: int = 3,
    n_trials: int = 5,
    direction: str = 'minimize',
    random_state: int = 42,
    n_jobs: int = -1,
) -> Tuple[optuna.study.Study, Dict]:
    """Optimize a model's hyperparameters using Optuna and cross-validation.

    Objective value reported to Optuna:

    * ``direction='minimize'``: for ``neg_*`` scorers the sign is flipped so the
      study minimizes the positive loss (e.g. RMSE); for any other scorer the raw
      mean score is minimized.
    * ``direction='maximize'``: the raw mean scikit-learn score is maximized.
      scikit-learn scorers are "greater is better" (losses are exposed negated),
      so this is correct for both ``neg_*`` and plain scorers such as ``'r2'``.

    Args:
        model_name: Name used to label the Optuna study
        estimator_builder: Callable that receives a params dict and returns an unfitted estimator
        param_space_fn: Callable that maps an Optuna trial to a hyperparameter dictionary
        X, y: Training features and targets used for cross-validation
        scoring: scikit-learn scoring string guiding optimization
        cv: Number of cross-validation folds
        n_trials: Number of Optuna trials to run
        direction: 'minimize' or 'maximize' depending on the objective
        random_state: Seed for the Optuna sampler
        n_jobs: Parallelism for cross_val_score

    Returns:
        The completed Optuna study and the best hyperparameters discovered
        (``study.best_params``, i.e. only the values suggested through the trial).
    """
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(study_name=f"{model_name}_opt", direction=direction, sampler=sampler)
    is_negated_scorer = scoring.startswith('neg')

    def objective(trial: optuna.trial.Trial) -> float:
        params = param_space_fn(trial)
        estimator = estimator_builder(params)
        scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
        mean_score = float(np.mean(scores))
        if direction == 'maximize':
            # Previously a plain scorer (e.g. 'r2') was negated here, which made
            # the study pick the *worst* configuration.
            return mean_score
        # Minimizing: turn a negated loss back into a positive loss
        return -mean_score if is_negated_scorer else mean_score

    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study, study.best_params

## Model metrics

## LINEAR MODELS


In [9]:
# Linear & Kernel-based Models: SVR + Polynomial Regression (with Optuna tuning + CV)
def build_svr_estimator(params: Dict) -> SVR:
    """Create an unfitted SVR from a hyperparameter dictionary.

    ``kernel`` falls back to ``'rbf'``; ``C``, ``epsilon``, ``gamma`` and ``degree``
    are forwarded only when present, and any other key is ignored.
    """
    optional_keys = ('C', 'epsilon', 'gamma', 'degree')
    svr_kwargs = {key: params[key] for key in optional_keys if key in params}
    return SVR(kernel=params.get('kernel', 'rbf'), **svr_kwargs)

def svr_param_space(trial: optuna.trial.Trial) -> Dict:
    """Sample SVR hyperparameters from an Optuna trial.

    ``kernel``, ``C`` and ``epsilon`` are always sampled. ``gamma`` is sampled from
    [0.1, 0.5] for the rbf/sigmoid kernels and from [0.1, 1] for the poly kernel,
    which additionally samples ``degree``.
    """
    chosen_kernel = trial.suggest_categorical('kernel', ['rbf','poly','sigmoid'])
    penalty = trial.suggest_float('C', 0.5, 10, log=True)
    tube_width = trial.suggest_float('epsilon', 0.05, 0.3)
    sampled = {'kernel': chosen_kernel, 'C': penalty, 'epsilon': tube_width}

    if chosen_kernel == 'poly':
        sampled['degree'] = trial.suggest_int('degree', 2, 5)
        sampled['gamma'] = trial.suggest_float('gamma', 0.1, 1, log=True)
    elif chosen_kernel in ('rbf', 'sigmoid'):
        sampled['gamma'] = trial.suggest_float('gamma', 0.1, 0.5, log=True)
    return sampled

def build_poly_estimator(params: Dict):
    """Create an unfitted polynomial-features + linear-regression pipeline.

    Reads ``degree`` (default 2), ``include_bias`` (default False) and
    ``interaction_only`` (default False) from ``params``.
    """
    expansion = PolynomialFeatures(
        degree=params.get('degree', 2),
        include_bias=params.get('include_bias', False),
        interaction_only=params.get('interaction_only', False),
    )
    return SkPipeline([('poly', expansion), ('lr', LinearRegression())])

def poly_param_space(trial: optuna.trial.Trial) -> Dict:
    """Sample polynomial-regression hyperparameters from an Optuna trial.

    ``degree`` (2-5) and ``interaction_only`` are sampled; ``include_bias`` is fixed to False.
    """
    sampled_degree = trial.suggest_int('degree', 2, 5)
    only_interactions = trial.suggest_categorical('interaction_only', [False, True])
    return {
        'degree': sampled_degree,
        'include_bias': False,
        'interaction_only': only_interactions,
    }

# Run / reuse SVR optimization
if svr_model is None or FORCE_RETRAIN:
    print('[SVR] Starting Optuna optimization...')
    svr_study, svr_best_params = optimize_model_with_optuna(
        model_name='SVR',
        estimator_builder=build_svr_estimator,
        param_space_fn=svr_param_space,
        X=X_train_enc,
        y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_trials=3,
        direction='minimize',
        random_state=42,
        n_jobs=-1,
    )
    print('Best SVR params:', svr_best_params)
    svr_model = build_svr_estimator(svr_best_params)
    svr_model.fit(X_train_enc, y_train)
    save_model(svr_model, 'svr_opt', {'best_params': svr_best_params})
else:
    print('[SVR] Using preloaded optimized model; generating predictions.')
# Predictions are needed whether the model was just trained or reused
svr_valid_pred = svr_model.predict(X_valid_enc)
svr_test_pred = svr_model.predict(X_test_enc)

# Baseline Linear Regression (also optionally re-optimized via polynomial)
if lr_model is None or FORCE_RETRAIN:
    # Keep a simple baseline linear regression for reference
    lr_model = LinearRegression()
    lr_model.fit(X_train_enc, y_train)
    save_model(lr_model, 'linear_regression_opt', {'params': lr_model.get_params(), 'baseline': True})
lr_valid_pred = lr_model.predict(X_valid_enc)
lr_test_pred = lr_model.predict(X_test_enc)

# Polynomial Regression optimization
# `poly_model` may not exist yet on a fresh kernel; look it up without raising NameError
poly_model = globals().get('poly_model')
if poly_model is None or FORCE_RETRAIN:
    print('[PolynomialRegression] Starting Optuna optimization...')
    poly_study, poly_best_params = optimize_model_with_optuna(
        model_name='PolynomialRegression',
        estimator_builder=build_poly_estimator,
        param_space_fn=poly_param_space,
        X=X_train_enc,
        y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_trials=3,
        direction='minimize',
        random_state=42,
        n_jobs=-1,
    )
    print('Best Polynomial Regression params:', poly_best_params)
    poly_model = build_poly_estimator(poly_best_params)
    poly_model.fit(X_train_enc, y_train)
    save_model(poly_model, 'poly_reg_opt', {'best_params': poly_best_params})
else:
    print('[PolynomialRegression] Using preloaded optimized model; generating predictions.')
poly_valid_pred = poly_model.predict(X_valid_enc)
poly_test_pred = poly_model.predict(X_test_enc)

[I 2025-10-12 18:20:32,815] A new study created in memory with name: SVR_opt


[SVR] Starting Optuna optimization...


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-10-12 18:20:39,953] Trial 0 finished with value: 19.380419730474674 and parameters: {'kernel': 'poly', 'C': 3.0049873591901566, 'epsilon': 0.08900466011060913, 'degree': 2, 'gamma': 0.1143098387631322}. Best is trial 0 with value: 19.380419730474674.
[I 2025-10-12 18:20:46,877] Trial 1 finished with value: 21.00995200439677 and parameters: {'kernel': 'rbf', 'C': 0.5318033256270142, 'epsilon': 0.2924774630404986, 'gamma': 0.3818145165896869}. Best is trial 0 with value: 19.380419730474674.
[I 2025-10-12 18:20:52,736] Trial 2 finished with value: 19.135056005123072 and parameters: {'kernel': 'rbf', 'C': 1.2439367209907215, 'epsilon': 0.18118910790805948, 'gamma': 0.2004087187654156}. Best is trial 2 with value: 19.135056005123072.
Best SVR params: {'kernel': 'rbf', 'C': 1.2439367209907215, 'epsilon': 0.18118910790805948, 'gamma': 0.2004087187654156}


[I 2025-10-12 18:21:06,087] A new study created in memory with name: PolynomialRegression_opt


Saved model -> svr_opt_20251012T222106Z.joblib; metadata -> svr_opt_20251012T222106Z.json
Saved model -> linear_regression_opt_20251012T222106Z.joblib; metadata -> linear_regression_opt_20251012T222106Z.json
[PolynomialRegression] Starting Optuna optimization...


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-10-12 18:21:10,036] Trial 0 finished with value: 5991224411.412984 and parameters: {'degree': 3, 'interaction_only': False}. Best is trial 0 with value: 5991224411.412984.
[I 2025-10-12 18:27:34,980] Trial 1 finished with value: 917618917.5113349 and parameters: {'degree': 4, 'interaction_only': False}. Best is trial 1 with value: 917618917.5113349.
[I 2025-10-12 18:27:35,489] Trial 2 finished with value: 224636540636.54575 and parameters: {'degree': 2, 'interaction_only': False}. Best is trial 1 with value: 917618917.5113349.
Best Polynomial Regression params: {'degree': 4, 'interaction_only': False}
Saved model -> poly_reg_opt_20251012T222954Z.joblib; metadata -> poly_reg_opt_20251012T222954Z.json


# NEURAL NETWORKS


## MLP

In [10]:
# Torch MLP Regressor with sklearn API

def _ensure_dense_np(X):
    return X.toarray() if hasattr(X, 'toarray') else X

class TorchMLPRegressor(BaseEstimator, RegressorMixin):
    """PyTorch MLP for regression with sklearn API and CUDA support.

    The network is a stack of ``Linear`` + activation blocks followed by a
    single-output ``Linear`` layer, trained with Adam on an MSE loss.

    Parameters:
        hidden_layer_sizes: Width of each hidden layer, in order.
        activation: ``'relu'`` selects ``nn.ReLU``; any other value selects ``nn.Tanh``.
        learning_rate_init: Learning rate passed to Adam.
        alpha: Passed to Adam as ``weight_decay``.
        batch_size: Mini-batch size of the training ``DataLoader``.
        max_iter: Maximum number of training epochs.
        early_stopping: When truthy (and ``0 < validation_fraction < 0.5``) a
            validation subset is held out and training stops early on it.
        validation_fraction: Fraction of rows held out for validation.
        n_iter_no_change: Consecutive epochs without validation improvement
            tolerated before training stops.
        random_state: Seeds torch and the permutation used for the holdout split.
        verbose: Stored as a parameter; not read anywhere in this class.
        device: Torch device string; when falsy, ``'cuda'`` is used if available,
            otherwise ``'cpu'``.
    """
    
    def __init__(self, hidden_layer_sizes=(128, 64), activation='relu', 
                 learning_rate_init=1e-3, alpha=0.0, batch_size=128, max_iter=100,
                 early_stopping=True, validation_fraction=0.1, n_iter_no_change=10,
                 random_state=42, verbose=False, device=None):
        # Store every hyperparameter unchanged under its own name
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.learning_rate_init = learning_rate_init
        self.alpha = alpha
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.early_stopping = early_stopping
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state
        self.verbose = verbose
        self.device = device

    def _build_network(self, in_features):
        """Build the ``nn.Sequential`` network for ``in_features`` input columns."""
        layers = []
        # Anything other than 'relu' falls back to Tanh
        act_fn = nn.ReLU if self.activation == 'relu' else nn.Tanh
        prev = in_features
        for h in self.hidden_layer_sizes:
            layers.extend([nn.Linear(prev, h), act_fn()])
            prev = h
        # Single output unit for regression
        layers.append(nn.Linear(prev, 1))
        return nn.Sequential(*layers)

    def fit(self, X, y):
        """Train the network on ``X`` / ``y`` and return ``self``.

        Sets ``model_``, ``optimizer_``, ``_device_`` and ``n_features_in_``. With an
        internal validation split, the weights with the lowest validation loss
        are restored at the end.
        """
        # Seed torch (CPU and, when available, all CUDA devices)
        torch.manual_seed(self.random_state)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(self.random_state)
        
        # Dense float32 features; targets as a float32 column vector
        X_np = _ensure_dense_np(X).astype('float32')
        y_np = (y.values if hasattr(y, 'values') else y).astype('float32').reshape(-1, 1)
        
        # Hold out the first `val_size` rows of a seeded permutation for validation
        if self.early_stopping and 0 < self.validation_fraction < 0.5:
            val_size = max(1, int(len(X_np) * self.validation_fraction))
            idx = np.random.RandomState(self.random_state).permutation(len(X_np))
            X_train, X_val = X_np[idx[val_size:]], X_np[idx[:val_size]]
            y_train, y_val = y_np[idx[val_size:]], y_np[idx[:val_size]]
        else:
            # No holdout: train on everything; the validation / early-stopping code below is skipped
            X_train, y_train, X_val, y_val = X_np, y_np, None, None
        
        # Device and model
        device = self.device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self._device_ = device
        self.model_ = self._build_network(X_np.shape[1]).to(device)
        self.optimizer_ = torch.optim.Adam(self.model_.parameters(), 
                                          lr=self.learning_rate_init, 
                                          weight_decay=self.alpha)
        criterion = nn.MSELoss()
        
        # Mini-batches are reshuffled every epoch
        train_loader = DataLoader(
            TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train)),
            batch_size=self.batch_size, shuffle=True
        )
        
        # best_val: lowest validation loss so far; patience: epochs since it last improved
        best_val, best_state, patience = math.inf, None, 0
        for epoch in range(1, self.max_iter + 1):
            self.model_.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                self.optimizer_.zero_grad()
                criterion(self.model_(xb), yb).backward()
                self.optimizer_.step()
            
            # Validation loss over the whole holdout set in a single forward pass
            if X_val is not None:
                self.model_.eval()
                with torch.no_grad():
                    val_loss = criterion(
                        self.model_(torch.from_numpy(X_val).to(device)),
                        torch.from_numpy(y_val).to(device)
                    ).item()
                
                # Count as an improvement only when the loss drops by more than 1e-9
                if val_loss < best_val - 1e-9:
                    best_val, patience = val_loss, 0
                    # Snapshot the weights on the CPU so later updates do not alter them
                    best_state = {k: v.cpu().clone() for k, v in self.model_.state_dict().items()}
                else:
                    patience += 1
                
                if patience >= self.n_iter_no_change:
                    break
        
        # Restore the best snapshot (None when no validation split was used)
        if best_state:
            self.model_.load_state_dict(best_state)
        self.n_features_in_ = X_np.shape[1]
        return self

    def predict(self, X):
        """Return predictions for ``X`` as a flat NumPy array (requires a prior ``fit``)."""
        check_is_fitted(self, ['model_'])
        X_np = _ensure_dense_np(X).astype('float32')
        self.model_.eval()
        with torch.no_grad():
            # All rows are sent to the training device in one forward pass
            preds = self.model_(torch.from_numpy(X_np).to(self._device_))
        return preds.cpu().numpy().ravel()

# Helper functions
def build_mlp_estimator(params):
    """Create an unfitted ``TorchMLPRegressor`` from a hyperparameter dictionary.

    Accepts both shapes of dictionary used in this notebook:

    * the output of ``mlp_param_space`` (already contains ``hidden_layer_sizes``);
    * Optuna's ``study.best_params``, which holds the raw ``hl1`` / ``hl2``
      suggestions instead. These are rebuilt into ``hidden_layer_sizes``
      (widest layer first, as in ``mlp_param_space``) so the tuned architecture
      is not silently replaced by the constructor default.

    Fixed, non-suggested entries of ``mlp_param_space`` (e.g.
    ``validation_fraction``) are not part of ``study.best_params``; for such
    dictionaries they fall back to the ``TorchMLPRegressor`` defaults.

    The input dictionary is not modified.
    """
    params = params.copy()
    # hl1 / hl2 are not constructor arguments, so always remove them
    first_width = params.pop('hl1', None)
    second_width = params.pop('hl2', None)
    if 'hidden_layer_sizes' not in params:
        widths = [w for w in (first_width, second_width) if w is not None]
        if widths:
            params['hidden_layer_sizes'] = tuple(sorted(widths, reverse=True))
    return TorchMLPRegressor(**params)

def mlp_param_space(trial):
    """Sample ``TorchMLPRegressor`` hyperparameters from an Optuna trial.

    Two hidden-layer widths are sampled (``hl1``, ``hl2``) and ordered widest
    first; early stopping, the validation fraction, the seed and verbosity are fixed.
    """
    # Keep the suggestion order stable: the sampler's draws depend on it
    first_width = trial.suggest_int('hl1', 64, 256, step=32)
    second_width = trial.suggest_int('hl2', 32, 192, step=32)
    layer_widths = [first_width, second_width]
    layer_widths.sort(reverse=True)

    sampled = {'hidden_layer_sizes': tuple(layer_widths)}
    sampled['learning_rate_init'] = trial.suggest_float('learning_rate_init', 1e-4, 1e-2, log=True)
    sampled['alpha'] = trial.suggest_float('alpha', 1e-6, 1e-2, log=True)
    sampled['batch_size'] = trial.suggest_categorical('batch_size', [64, 128, 256])
    sampled['max_iter'] = trial.suggest_int('max_iter', 150, 600, step=75)
    sampled['n_iter_no_change'] = trial.suggest_int('n_iter_no_change', 5, 25, step=5)
    sampled['early_stopping'] = True
    sampled['validation_fraction'] = 0.15
    sampled['activation'] = trial.suggest_categorical('activation', ['relu', 'tanh'])
    sampled['random_state'] = 42
    sampled['verbose'] = False
    return sampled

# Train the MLP: tune a fresh model, or refit the one already in memory
_mlp_needs_search = mlp_model is None or FORCE_RETRAIN
if _mlp_needs_search:
    print(f"[MLP] Starting optimization on {'CUDA' if torch.cuda.is_available() else 'CPU'}")
    mlp_study, mlp_best_params = optimize_model_with_optuna(
        model_name='TorchMLPRegressor',
        estimator_builder=build_mlp_estimator,
        param_space_fn=mlp_param_space,
        X=X_train_enc,
        y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_trials=5,
        direction='minimize',
        random_state=42,
        n_jobs=-1,
    )
    print(f'[MLP] Best params: {mlp_best_params}')
    mlp_model = build_mlp_estimator(mlp_best_params)
    _mlp_artifact = ('mlp_opt', {'best_params': mlp_best_params, 'framework': 'torch'})
else:
    print("[MLP] Refitting preloaded model")
    _mlp_artifact = ('mlp_opt_refit', {'refit': True, 'framework': 'torch'})

# Both paths fit on the training split, predict, and then persist the model
mlp_model.fit(X_train_enc, y_train)
mlp_valid_pred = mlp_model.predict(X_valid_enc)
mlp_test_pred = mlp_model.predict(X_test_enc)
save_model(mlp_model, *_mlp_artifact)


[I 2025-10-12 18:29:54,625] A new study created in memory with name: TorchMLPRegressor_opt


[MLP] Starting optimization on CUDA


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-10-12 18:30:17,084] Trial 0 finished with value: 17.610844195036623 and parameters: {'hl1': 128, 'hl2': 192, 'learning_rate_init': 0.0029106359131330704, 'alpha': 0.0002481040974867811, 'batch_size': 64, 'max_iter': 600, 'n_iter_no_change': 20, 'activation': 'relu'}. Best is trial 0 with value: 17.610844195036623.
[I 2025-10-12 18:30:49,130] Trial 1 finished with value: 17.574408454538453 and parameters: {'hl1': 256, 'hl2': 160, 'learning_rate_init': 0.00026587543983272726, 'alpha': 5.337032762603957e-06, 'batch_size': 256, 'max_iter': 375, 'n_iter_no_change': 10, 'activation': 'relu'}. Best is trial 1 with value: 17.574408454538453.
[I 2025-10-12 18:30:49,130] Trial 1 finished with value: 17.574408454538453 and parameters: {'hl1': 256, 'hl2': 160, 'learning_rate_init': 0.00026587543983272726, 'alpha': 5.337032762603957e-06, 'batch_size': 256, 'max_iter': 375, 'n_iter_no_change': 10, 'activation': 'relu'}. Best is trial 1 with value: 17.574408454538453.
[I 2025-10-12 18:31:14,4

## TABNET

In [11]:
# Simplified TabNet Implementation
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_is_fitted

def prepare_data_for_tabnet(X, y=None):
    """Convert features (and optionally targets) to the arrays TabNet is fed here.

    Features become a dense float32 array. When ``y`` is given, a 1-D target is
    reshaped to a column vector (other shapes pass through ``np.asarray``) and
    the pair ``(features, targets)`` is returned; otherwise only the features.
    """
    if hasattr(X, 'toarray'):
        features = X.toarray().astype(np.float32)
    else:
        features = np.asarray(X, dtype=np.float32)

    if y is None:
        return features

    targets = np.asarray(y)
    if targets.ndim == 1:
        targets = targets.reshape(-1, 1)
    return features, targets

class SimpleTabNetWrapper(BaseEstimator, RegressorMixin):
    """Simplified TabNet wrapper with sensible defaults and an sklearn-style API.

    Parameters:
        n_d, n_a, n_steps, gamma, lambda_sparse: Forwarded to ``TabNetRegressor``.
        lr: Learning rate, forwarded as ``optimizer_params={'lr': lr}``.
        max_epochs, patience, batch_size: Forwarded to ``TabNetRegressor.fit``.
        seed: Seeds both ``TabNetRegressor`` and the internal validation split.

    Attributes:
        model_: The fitted ``TabNetRegressor``; only exists after ``fit``.
    """

    def __init__(self, n_d=16, n_a=16, n_steps=5, gamma=1.3, lambda_sparse=1e-4,
                 lr=0.02, max_epochs=100, patience=20, batch_size=1024, seed=42):
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.lambda_sparse = lambda_sparse
        self.lr = lr
        self.max_epochs = max_epochs
        self.patience = patience
        self.batch_size = batch_size
        self.seed = seed
        # ``model_`` is deliberately NOT created here: check_is_fitted() only tests
        # that the attribute exists, so pre-setting it to None would let predict()
        # run on an unfitted wrapper and fail with an AttributeError on None.

    def fit(self, X, y):
        """Fit a fresh ``TabNetRegressor`` on ``X`` / ``y`` and return ``self``.

        15% of the rows (at least one) are held out with a seeded permutation and
        used as the evaluation set for TabNet's early stopping.
        """
        X_prep, y_prep = prepare_data_for_tabnet(X, y)

        # Create internal validation split (15% for early stopping)
        n_samples = X_prep.shape[0]
        val_size = max(1, int(n_samples * 0.15))
        rng = np.random.default_rng(self.seed)
        indices = rng.permutation(n_samples)

        train_idx, val_idx = indices[val_size:], indices[:val_size]
        X_train, y_train = X_prep[train_idx], y_prep[train_idx]
        X_val, y_val = X_prep[val_idx], y_prep[val_idx]

        self.model_ = TabNetRegressor(
            n_d=self.n_d, n_a=self.n_a, n_steps=self.n_steps,
            gamma=self.gamma, lambda_sparse=self.lambda_sparse,
            optimizer_params={'lr': self.lr}, seed=self.seed
        )

        self.model_.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric=['rmse'],
            max_epochs=self.max_epochs,
            patience=self.patience,
            batch_size=self.batch_size,
            virtual_batch_size=128
        )
        return self

    def predict(self, X):
        """Return predictions for ``X`` as a flat array.

        Raises:
            sklearn.exceptions.NotFittedError: if ``fit`` has not been called.
        """
        check_is_fitted(self, 'model_')
        X_prep = prepare_data_for_tabnet(X)
        return self.model_.predict(X_prep).ravel()

def get_tabnet_param_space(trial):
    """Sample ``SimpleTabNetWrapper`` hyperparameters from an Optuna trial.

    Six values are sampled; ``max_epochs`` and ``patience`` are fixed.
    """
    width_choices = [8, 16, 24]
    sampled = {}
    # Keep the suggestion order stable: the sampler's draws depend on it
    sampled['n_d'] = trial.suggest_categorical('n_d', width_choices)
    sampled['n_a'] = trial.suggest_categorical('n_a', width_choices)
    sampled['n_steps'] = trial.suggest_int('n_steps', 3, 6)
    sampled['gamma'] = trial.suggest_float('gamma', 1.0, 1.8)
    sampled['lambda_sparse'] = trial.suggest_float('lambda_sparse', 1e-6, 1e-3, log=True)
    sampled['lr'] = trial.suggest_float('lr', 1e-3, 2e-2, log=True)
    sampled.update(max_epochs=100, patience=20)
    return sampled

def load_or_train_tabnet():
    """Return a fitted TabNet wrapper, reusing a saved artifact when possible.

    The last ``tabnet_opt_*.joblib`` file (sorted by name) in ``MODEL_DIR`` is
    loaded unless ``FORCE_RETRAIN`` is set. If no file exists, or loading
    fails, hyperparameters are tuned with Optuna, a final model is fitted on
    ``X_train_enc`` / ``y_train`` and the result is stored via ``save_model``.

    Returns:
        The loaded or newly trained model.
    """
    tabnet_files = sorted(MODEL_DIR.glob('tabnet_opt_*.joblib'))

    if tabnet_files and not FORCE_RETRAIN:
        latest = tabnet_files[-1]
        try:
            # NOTE(review): joblib.load unpickles the file, which can execute
            # arbitrary code -- only load artifacts from a trusted MODEL_DIR.
            model = joblib.load(latest)
            print(f"[TabNet] Loaded existing model: {latest.name}")
            return model
        except Exception as e:
            # Best effort: an unreadable artifact falls through to retraining.
            print(f"[TabNet] Failed to load model: {e}")

    # Train new model
    print("[TabNet] Training new model with Optuna optimization")
    study, best_params = optimize_model_with_optuna(
        model_name='TabNetRegressor',
        estimator_builder=lambda params: SimpleTabNetWrapper(**params),
        param_space_fn=get_tabnet_param_space,
        X=X_train_enc, y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_trials=5,
        direction='minimize',
        random_state=42, n_jobs=-1
    )

    # The reported best params contain only the values Optuna *suggested*.
    # Replaying them through the search space with a FixedTrial restores the
    # fixed entries (max_epochs, patience), so the final fit uses the same
    # training budget as the tuning trials instead of the wrapper's defaults.
    final_params = get_tabnet_param_space(optuna.trial.FixedTrial(best_params))

    model = SimpleTabNetWrapper(**final_params)
    model.fit(X_train_enc, y_train)

    # Save model and metadata
    save_model(model, 'tabnet_opt', {'best_params': best_params})
    print(f"[TabNet] Best parameters: {best_params}")

    return model

# Obtain the TabNet model: loaded from disk when available, trained otherwise
tabnet_model = load_or_train_tabnet()

# Score the validation split first, then the test split
tabnet_valid_pred, tabnet_test_pred = [
    tabnet_model.predict(split) for split in (X_valid_enc, X_test_enc)
]

[I 2025-10-12 18:32:20,075] A new study created in memory with name: TabNetRegressor_opt


[TabNet] Training new model with Optuna optimization


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-10-12 18:32:58,886] Trial 0 finished with value: 17.90373767061614 and parameters: {'n_d': 16, 'n_a': 8, 'n_steps': 3, 'gamma': 1.692940916619948, 'lambda_sparse': 6.358358856676247e-05, 'lr': 0.00834110643236209}. Best is trial 0 with value: 17.90373767061614.
[I 2025-10-12 18:34:24,286] Trial 1 finished with value: 17.941336017480243 and parameters: {'n_d': 16, 'n_a': 8, 'n_steps': 4, 'gamma': 1.4198051453057903, 'lambda_sparse': 1.9762189340280066e-05, 'lr': 0.002392752876558064}. Best is trial 0 with value: 17.90373767061614.
[I 2025-10-12 18:34:24,286] Trial 1 finished with value: 17.941336017480243 and parameters: {'n_d': 16, 'n_a': 8, 'n_steps': 4, 'gamma': 1.4198051453057903, 'lambda_sparse': 1.9762189340280066e-05, 'lr': 0.002392752876558064}. Best is trial 0 with value: 17.90373767061614.
[I 2025-10-12 18:35:34,386] Trial 2 finished with value: 24.2295931036714 and parameters: {'n_d': 8, 'n_a': 24, 'n_steps': 3, 'gamma': 1.4113875507308893, 'lambda_sparse': 5.98747491



epoch 0  | loss: 2657.33423| val_0_rmse: 51.80996|  0:00:01s
epoch 1  | loss: 2592.91721| val_0_rmse: 51.28325|  0:00:02s
epoch 1  | loss: 2592.91721| val_0_rmse: 51.28325|  0:00:02s
epoch 2  | loss: 2534.17283| val_0_rmse: 50.71539|  0:00:03s
epoch 2  | loss: 2534.17283| val_0_rmse: 50.71539|  0:00:03s
epoch 3  | loss: 2475.51138| val_0_rmse: 50.08335|  0:00:04s
epoch 3  | loss: 2475.51138| val_0_rmse: 50.08335|  0:00:04s
epoch 4  | loss: 2415.30674| val_0_rmse: 49.35671|  0:00:05s
epoch 4  | loss: 2415.30674| val_0_rmse: 49.35671|  0:00:05s
epoch 5  | loss: 2335.52185| val_0_rmse: 48.48561|  0:00:06s
epoch 5  | loss: 2335.52185| val_0_rmse: 48.48561|  0:00:06s
epoch 6  | loss: 2239.76829| val_0_rmse: 47.28928|  0:00:07s
epoch 6  | loss: 2239.76829| val_0_rmse: 47.28928|  0:00:07s
epoch 7  | loss: 2138.7392| val_0_rmse: 46.05031|  0:00:08s
epoch 7  | loss: 2138.7392| val_0_rmse: 46.05031|  0:00:08s
epoch 8  | loss: 2013.08971| val_0_rmse: 44.5799 |  0:00:10s
epoch 8  | loss: 2013.0897



Saved model -> tabnet_opt_20251013T000538Z.joblib; metadata -> tabnet_opt_20251013T000538Z.json
[TabNet] Best parameters: {'n_d': 8, 'n_a': 16, 'n_steps': 4, 'gamma': 1.078137691205107, 'lambda_sparse': 0.00011290133559092664, 'lr': 0.0037381058681917956}


# 5. Tree-based ensemble models

## XGBOOST

In [12]:
# Hyperparameter optimization and training for ensemble models
def build_xgb_estimator(params: Dict) -> XGBRegressor:
    """Build an unfitted, GPU-configured ``XGBRegressor``.

    Args:
        params: Hyperparameters to apply; entries with the same key override
            the defaults defined below.

    Returns:
        A new ``XGBRegressor`` instance (not fitted).
    """
    base_params = {
        'random_state': 42,
        # The GPU is selected through `device`; the tree algorithm stays
        # 'hist'. `tree_method='gpu_hist'` was deprecated in XGBoost 2.0 (the
        # release that introduced `device`) in favour of this combination.
        'device': 'cuda',
        'verbosity': 0,
        'tree_method': 'hist'
    }
    base_params.update(params)
    return XGBRegressor(**base_params)


def xgb_param_space(trial: optuna.trial.Trial) -> Dict:
    """Sample one XGBoost hyperparameter set from ``trial``.

    Covers ensemble size, tree depth, a log-scaled learning rate, row and
    column subsampling fractions, and log-scaled L1/L2 regularisation.

    Returns:
        dict mapping XGBoost parameter names to the sampled values.
    """
    space: Dict = {}

    # Ensemble size and tree complexity
    space['n_estimators'] = trial.suggest_int('n_estimators', 200, 600)
    space['max_depth'] = trial.suggest_int('max_depth', 3, 10)
    space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)

    # Row / column subsampling
    space['subsample'] = trial.suggest_float('subsample', 0.6, 0.9)
    space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.6, 0.8)

    # L1 / L2 regularisation
    space['reg_alpha'] = trial.suggest_float('reg_alpha', 1e-8, 1e-1, log=True)
    space['reg_lambda'] = trial.suggest_float('reg_lambda', 1e-8, 10, log=True)
    return space

# Decide which hyperparameters to fit with and how to label the saved artifact;
# the fit / predict / save steps shared by both paths follow the branch.
if xgb_model is None or FORCE_RETRAIN:
    print("[XGBoost] No preloaded model (or FORCE_RETRAIN=True). Starting Optuna optimization...")
    xgb_study, xgb_best_params = optimize_model_with_optuna(
        model_name='XGBoost',
        estimator_builder=build_xgb_estimator,
        param_space_fn=xgb_param_space,
        X=X_train_enc,
        y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_trials=5,
        direction='minimize',
        random_state=42,
        n_jobs=-1
    )
    print('Best XGBoost params:', xgb_best_params)
    xgb_fit_params = xgb_best_params
    xgb_artifact_name = 'xgboost_opt'
    xgb_artifact_meta = {'best_params': xgb_best_params}
else:
    # Refit even when preloaded to ensure alignment with current data & preprocessing
    print('[XGBoost] Preloaded model found; refitting on current data.')
    # Prefer the best params stored in the loaded metadata, when that exists
    xgb_loaded_meta = globals().get('xgb_model_meta')
    reuse_params = xgb_loaded_meta.get('best_params') if isinstance(xgb_loaded_meta, dict) else None
    if reuse_params is None:
        # Fall back to the preloaded model's own parameters, restricted to the
        # keys of the search space
        reuse_keys = {'n_estimators','max_depth','learning_rate','subsample','colsample_bytree','reg_alpha','reg_lambda'}
        try:
            reuse_params = {k: v for k, v in xgb_model.get_params().items() if k in reuse_keys}
        except Exception as exc:
            # Still best effort, but say so: the refit below would otherwise
            # silently run with default hyperparameters.
            print(f"[XGBoost] Could not read parameters from preloaded model ({exc}); refitting with default hyperparameters.")
            reuse_params = {}
    xgb_fit_params = reuse_params
    xgb_artifact_name = 'xgboost_opt_refit'
    xgb_artifact_meta = {'refit': True, 'best_params': reuse_params}

# Always build a fresh estimator so no internal state carries over, then fit,
# predict on the validation and test splits, and save the artifact.
xgb_model = build_xgb_estimator(xgb_fit_params)
xgb_model.fit(X_train_enc, y_train)
xgb_valid_pred = xgb_model.predict(X_valid_enc)
xgb_test_pred = xgb_model.predict(X_test_enc)
save_model(xgb_model, xgb_artifact_name, xgb_artifact_meta)

[I 2025-10-12 20:05:39,185] A new study created in memory with name: XGBoost_opt


[XGBoost] No preloaded model (or FORCE_RETRAIN=True). Starting Optuna optimization...


  0%|          | 0/5 [00:00<?, ?it/s]



[I 2025-10-12 20:05:54,055] Trial 0 finished with value: 17.677079001125602 and parameters: {'n_estimators': 350, 'max_depth': 10, 'learning_rate': 0.1205712628744377, 'subsample': 0.779597545259111, 'colsample_bytree': 0.6312037280884872, 'reg_alpha': 1.23583827723069e-07, 'reg_lambda': 3.3323645788192616e-08}. Best is trial 0 with value: 17.677079001125602.
[I 2025-10-12 20:06:01,858] Trial 1 finished with value: 17.804445533726852 and parameters: {'n_estimators': 547, 'max_depth': 7, 'learning_rate': 0.11114989443094977, 'subsample': 0.6061753482887408, 'colsample_bytree': 0.7939819704323989, 'reg_alpha': 0.006715811311069936, 'reg_lambda': 8.148018307012941e-07}. Best is trial 0 with value: 17.677079001125602.
[I 2025-10-12 20:06:01,858] Trial 1 finished with value: 17.804445533726852 and parameters: {'n_estimators': 547, 'max_depth': 7, 'learning_rate': 0.11114989443094977, 'subsample': 0.6061753482887408, 'colsample_bytree': 0.7939819704323989, 'reg_alpha': 0.006715811311069936, 

## RANDOM FOREST

In [13]:
def build_rf_estimator(params: Dict) -> RandomForestRegressor:
    """Create an unfitted ``RandomForestRegressor`` from ``params``.

    A fixed ``random_state`` of 42 and ``n_jobs=-1`` are applied first; any
    key present in ``params`` takes precedence over these defaults.
    """
    settings = dict(random_state=42, n_jobs=-1)
    settings.update(params)
    forest = RandomForestRegressor(**settings)
    return forest


def rf_param_space(trial: optuna.trial.Trial) -> Dict:
    """Sample one Random Forest hyperparameter set from ``trial``.

    Returns:
        dict with the number of trees, maximum depth, the two minimum-sample
        thresholds and the fraction of features considered per split.
    """
    # Integer-valued parameters as (name, low, high), sampled in this order
    int_bounds = (
        ('n_estimators', 200, 800),
        ('max_depth', 5, 15),
        ('min_samples_split', 2, 10),
        ('min_samples_leaf', 1, 5),
    )
    space: Dict = {}
    for name, low, high in int_bounds:
        space[name] = trial.suggest_int(name, low, high)

    # Fraction of features per split is the only continuous parameter
    space['max_features'] = trial.suggest_float('max_features', 0.4, 0.9)
    return space

if rf_model is not None and not FORCE_RETRAIN:
    # A preloaded forest exists and retraining was not requested: only predict.
    print("[RandomForest] Using preloaded optimized model. Skipping training.")
    rf_valid_pred = rf_model.predict(X_valid_enc)
    rf_test_pred = rf_model.predict(X_test_enc)
else:
    # Tune with Optuna, fit on the training split with the best parameters,
    # predict on validation and test, then save the fitted model.
    print("[RandomForest] No preloaded model (or FORCE_RETRAIN=True). Starting Optuna optimization...")
    rf_study, rf_best_params = optimize_model_with_optuna(
        model_name='RandomForest',
        estimator_builder=build_rf_estimator,
        param_space_fn=rf_param_space,
        X=X_train_enc, y=y_train,
        scoring='neg_root_mean_squared_error',
        direction='minimize',
        cv=3, n_trials=5,
        random_state=42, n_jobs=-1,
    )
    print('Best Random Forest params:', rf_best_params)

    rf_model = build_rf_estimator(rf_best_params)
    rf_model.fit(X_train_enc, y_train)

    rf_valid_pred = rf_model.predict(X_valid_enc)
    rf_test_pred = rf_model.predict(X_test_enc)

    save_model(rf_model, 'random_forest_opt', {'best_params': rf_best_params})


[I 2025-10-12 20:06:10,510] A new study created in memory with name: RandomForest_opt


[RandomForest] No preloaded model (or FORCE_RETRAIN=True). Starting Optuna optimization...


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-10-12 20:06:17,405] Trial 0 finished with value: 16.85200958243298 and parameters: {'n_estimators': 425, 'max_depth': 15, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.4780093202212183}. Best is trial 0 with value: 16.85200958243298.
[I 2025-10-12 20:06:20,530] Trial 1 finished with value: 17.118219387331564 and parameters: {'n_estimators': 293, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 0.7540362888980228}. Best is trial 0 with value: 16.85200958243298.
[I 2025-10-12 20:06:20,530] Trial 1 finished with value: 17.118219387331564 and parameters: {'n_estimators': 293, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 0.7540362888980228}. Best is trial 0 with value: 16.85200958243298.
[I 2025-10-12 20:06:23,940] Trial 2 finished with value: 16.869692123066528 and parameters: {'n_estimators': 212, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 0.49091248360355033}. B