In [None]:
import polars as pl


# Load the int32 feature table from disk into a polars DataFrame
df = pl.read_parquet("/home/jupyter/data/int32Features.parquet")


In [None]:
# Tally the nulls per column (polars returns a single-row frame)
null_counts = df.null_count()

# Report every column together with its null tally
for column, count in null_counts.row(0, named=True).items():
    print(f"{column}: {count} null values")

In [None]:
import os
import glob
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import random
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import r2_score
import logging

# Setup Logging
# Emit INFO-and-above records through a stream handler, each prefixed with a
# timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)
# Module-level logger used by the rest of this cell
logger = logging.getLogger(__name__)

# Reproducibility Setup
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs so runs are repeatable.

    When CUDA is available, every GPU generator is seeded as well and cuDNN
    is switched to deterministic mode with benchmarking disabled.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False  # safer for reproducibility

# Seed all RNGs before any model or data-loader objects are created
set_seed(42)
# Prefer the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Constants
HIDDEN_DIM = 512  # base width; the model derives its layer sizes from it
DROPOUT_RATE = 0.2  # dropout probability passed to the model
NOISE_STD = 0.001  # std of the Gaussian noise the model adds in training mode
LEARNING_RATE = 1e-5  # AdamW lr and OneCycleLR max_lr
BATCH_SIZE = 256  # DataLoader batch size for training and validation
WEIGHT_DECAY = 1e-5  # AdamW weight decay
EPOCHS = 75  # upper bound on epochs per fold (early stopping may end sooner)
# Rolling-fold layout, expressed in numbers of distinct date_id values
N_FOLDS = 5  # maximum number of folds to build
GAP_SIZE = 30  # dates skipped between the train and validation windows
VAL_SIZE = 100  # dates in each validation window
TRAIN_SIZE = 300  # dates in each training window
GRADIENT_CLIP_VALUE = 0.5  # same value as the max_norm applied in the training loop
# NOTE(review): LR_PATIENCE and MIN_LR do not appear to be read by the
# OneCycleLR setup used for training; presumably left over from a
# ReduceLROnPlateau configuration - confirm before removing.
LR_PATIENCE = 5
MIN_LR = 1e-7

# Paths
PARTITIONED_DIR = "/home/jupyter/data/partitioned/"
all_part_files = sorted(glob.glob(os.path.join(PARTITIONED_DIR, "*.parquet")))
logger.info(f"Found {len(all_part_files)} partitioned files.")
if not all_part_files:
    # Fail with an actionable message instead of a bare IndexError below
    raise FileNotFoundError(f"No .parquet files found in {PARTITIONED_DIR}")

# Exclude columns
# Identifier columns, the sample weight and every responder_* column are kept
# out of the model inputs. Only the parquet schema is read here: the column
# names are all that is needed, so no partition data is loaded.
EXCLUDE_COLS = ["date_id", "time_id", "symbol_id", "weight", "partition_id"]
EXCLUDE_COLS += [
    col for col in pl.read_parquet_schema(all_part_files[0])
    if col.startswith("responder_")
]

class EarlyStopping:
    """Signal a stop once the validation loss stops improving.

    A call counts as an improvement when the loss is lower than the best
    value seen so far by more than ``min_delta``. After ``patience``
    consecutive calls without improvement, ``early_stop`` becomes True.
    """

    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0  # consecutive calls without improvement
        self.best_loss = None  # lowest loss recorded so far
        self.early_stop = False

    def __call__(self, val_loss):
        improved = (
            self.best_loss is None
            or val_loss < self.best_loss - self.min_delta
        )
        if improved:
            self.best_loss, self.counter = val_loss, 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

class FinancialDataset(Dataset):
    """In-memory dataset yielding ``(features, target, weight)`` triples.

    The three inputs are copied into float32 tensors when the dataset is
    built; indexing returns the matching entry of each tensor.
    """

    def __init__(self, features, target, weights):
        self.features, self.target, self.weights = (
            torch.tensor(values, dtype=torch.float32)
            for values in (features, target, weights)
        )

    def __len__(self):
        # The dataset has one sample per feature row
        return len(self.features)

    def __getitem__(self, idx):
        columns = (self.features, self.target, self.weights)
        return tuple(column[idx] for column in columns)

class GaussianNoise(nn.Module):
    """Additive zero-mean Gaussian noise that is active only in training mode."""

    def __init__(self, std=0.05):
        super().__init__()
        self.std = std  # standard deviation of the injected noise

    def forward(self, x):
        # Return the input untouched in eval mode or when noise is disabled
        if not (self.training and self.std > 0):
            return x
        return x + torch.randn_like(x) * self.std

class AutoencoderWithMLP(nn.Module):
    """Denoising autoencoder with a regression head on input plus latent code.

    The input is layer-normalized, perturbed by ``GaussianNoise`` (training
    mode only), encoded down to a ``hidden_dim // 8`` latent vector and
    decoded back to ``input_dim``. The regression head sees the normalized
    input concatenated with the latent vector and emits one value per row.
    """

    def __init__(self, input_dim, hidden_dim=512, dropout_rate=0.3, noise_std=0.05):
        super().__init__()
        self.noise = GaussianNoise(std=noise_std)

        # Widths shrink by powers of two: 256 / 128 / 64 for the default 512
        wide, mid, narrow = (hidden_dim // divisor for divisor in (2, 4, 8))

        # Normalizes the raw input before anything else touches it
        self.input_norm = nn.LayerNorm(input_dim)

        encoder_layers = [
            nn.Linear(input_dim, wide),
            nn.LayerNorm(wide),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(wide, mid),
            nn.LayerNorm(mid),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        bottleneck_layers = [nn.Linear(mid, narrow), nn.LayerNorm(narrow)]
        self.bottleneck = nn.Sequential(*bottleneck_layers)

        decoder_layers = [
            nn.Linear(narrow, mid),
            nn.LayerNorm(mid),
            nn.SiLU(),
            nn.Linear(mid, input_dim),
            nn.LayerNorm(input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

        head_layers = [
            nn.Linear(input_dim + narrow, wide),
            nn.LayerNorm(wide),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(wide, 1),
        ]
        self.mlp = nn.Sequential(*head_layers)

        self._init_weights()

    def _init_weights(self):
        """Give every linear layer fan-in scaled normal weights and zero biases."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight, a=0.1, mode='fan_in', nonlinearity='linear')
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return ``(reconstructed, mlp_output, latent)`` for a batch ``x``."""
        normed = self.input_norm(x)

        latent = self.bottleneck(self.encoder(self.noise(normed)))

        # Skip connection: the decoder output is added to the normalized input
        reconstructed = self.decoder(latent) + normed

        mlp_output = self.mlp(torch.cat([normed, latent], dim=1))

        return reconstructed, mlp_output, latent

import math

def initialize_weights(model):
    """Re-initialize every ``nn.Linear`` in *model* with He-normal weights.

    Weights are drawn from a normal distribution with gain sqrt(2), i.e.
    std = sqrt(2 / fan_in), used here as an approximation for SiLU. Biases
    are set to zero. The model is modified in place; nothing is returned.

    Args:
        model: any ``nn.Module``; all of its ``nn.Linear`` sub-modules
            (including the module itself) are re-initialized.
    """
    for m in model.modules():
        if not isinstance(m, nn.Linear):
            continue
        # kaiming_normal_'s ``a`` argument is the leaky-ReLU negative slope,
        # not a gain: passing sqrt(2) there shrinks the gain to sqrt(2 / 3).
        # Requesting the "relu" gain yields the intended sqrt(2).
        nn.init.kaiming_normal_(m.weight, mode="fan_in", nonlinearity="relu")
        if m.bias is not None:
            nn.init.zeros_(m.bias)


def get_unique_dates(files):
    """Return the sorted distinct ``date_id`` values found across *files*.

    Only the ``date_id`` column is read from each parquet file, so the
    remaining columns are never loaded into memory.

    Args:
        files: iterable of parquet file paths, each with a ``date_id`` column.

    Returns:
        A sorted list of the unique ``date_id`` values over all files.
    """
    all_dates = set()
    for file in files:
        date_ids = pl.read_parquet(file, columns=["date_id"]).get_column("date_id")
        all_dates.update(date_ids.unique().to_list())
    return sorted(all_dates)

def load_and_scale_data_for_dates(
    parquet_file, date_list, feature_scaler, exclude_cols,
    target_col="responder_6", weight_col="weight"
):
    """
    Read one parquet partition and prepare model-ready arrays for a date subset.

    Rows are kept only when their date_id is in date_list and none of the
    feature, target or weight columns is null. Features go through
    feature_scaler, non-finite values are replaced (NaN -> 0, +inf -> 1,
    -inf -> -1) and the result is limited to [-100, 100]. Targets are
    standardized with the mean and standard deviation of the rows returned
    by this call.

    Returns (features, targets, weights), or (None, None, None) when no
    usable rows remain.
    """
    no_data = (None, None, None)

    frame = pl.read_parquet(parquet_file).filter(pl.col("date_id").is_in(date_list))
    if frame.is_empty():
        return no_data

    # Every column that is not explicitly excluded is treated as a feature
    feature_cols = [name for name in frame.columns if name not in exclude_cols]

    # Discard rows with a null in any feature, the target or the weight
    frame = frame.drop_nulls(subset=feature_cols + [target_col, weight_col])
    if frame.is_empty():
        return no_data

    features = frame.select(feature_cols).to_numpy().astype(np.float32)
    targets = frame[target_col].to_numpy().astype(np.float32)
    weights = frame[weight_col].to_numpy().astype(np.float32)

    # Scale, then neutralize anything non-finite the scaling left behind
    features = np.nan_to_num(
        feature_scaler.transform(features), nan=0.0, posinf=1.0, neginf=-1.0
    )
    if (np.abs(features) > 100).any():
        features = np.clip(features, -100, 100)

    # Standardize the targets; the epsilon avoids dividing by a zero std
    spread = targets.std() + 1e-6
    targets = (targets - targets.mean()) / spread

    return features, targets, weights

def create_rolling_folds(dates_list, n_folds, gap_size, val_size, train_size):
    """Slice *dates_list* into consecutive, non-overlapping train/validation folds.

    Each fold takes ``train_size`` dates for training, skips ``gap_size``
    dates, then takes ``val_size`` dates for validation. The next fold starts
    right after the previous validation window. At most ``n_folds`` folds are
    produced; building stops early once a full fold no longer fits.

    Returns a list of ``(train_dates, val_dates)`` tuples.
    """
    fold_span = train_size + gap_size + val_size
    available = len(dates_list)

    folds = []
    cursor = 0
    while len(folds) < n_folds and cursor + fold_span <= available:
        train_stop = cursor + train_size
        val_start = train_stop + gap_size
        val_stop = val_start + val_size

        folds.append((dates_list[cursor:train_stop], dates_list[val_start:val_stop]))
        cursor = val_stop

    return folds

# Build or load the scaler
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a feature_scaler.pkl that was produced by this notebook.
if os.path.exists("feature_scaler.pkl"):
    with open("feature_scaler.pkl", "rb") as f:
        feature_scaler = pickle.load(f)
else:
    # No cached scaler: fit one on the feature columns of the first partition
    # only, then cache it for later runs.
    df_tmp = pl.read_parquet(all_part_files[0])
    numeric_cols = [col for col in df_tmp.columns if col not in EXCLUDE_COLS]
    features = df_tmp.select(numeric_cols).to_numpy()
    feature_scaler = StandardScaler().fit(features)
    with open("feature_scaler.pkl", "wb") as f:
        pickle.dump(feature_scaler, f)
    del df_tmp

# Number of features the scaler was fitted on; passed to the model as input_dim
GLOBAL_INPUT_DIM = feature_scaler.n_features_in_
# Collect every distinct date_id, then carve the rolling train/validation folds
sorted_dates = get_unique_dates(all_part_files)
folds = create_rolling_folds(sorted_dates, N_FOLDS, GAP_SIZE, VAL_SIZE, TRAIN_SIZE)

logger.info(f"Created {len(folds)} folds")

def train_model_with_cv(folds, files, feature_scaler):
    """Train one AutoencoderWithMLP per fold and checkpoint its best epoch.

    For every fold a fresh model is trained for up to EPOCHS epochs. Each
    epoch streams the parquet partitions one at a time, first for the
    training dates and then for the validation dates. The loss is the
    reconstruction MSE plus the sample-weighted squared error of the
    regression head. The weights with the lowest average validation loss are
    saved to ``best_model_fold_{fold_idx}.pt``; training of a fold ends early
    after 7 epochs without improvement.

    Args:
        folds: sequence of ``(train_dates, val_dates)`` tuples.
        files: parquet partition paths scanned on every epoch.
        feature_scaler: fitted scaler forwarded to
            ``load_and_scale_data_for_dates``.

    Returns:
        None. Results are written to disk and reported through the logger.
    """
    # Mixed precision and pinned memory are only useful on CUDA; on CPU they
    # are switched off instead of being pointed at a GPU that is not there.
    on_cuda = device.type == "cuda"

    for fold_idx, (train_dates, val_dates) in enumerate(folds, start=1):
        logger.info(f"Training Fold {fold_idx}/{len(folds)}...")

        # Model and optimizer setup
        model = AutoencoderWithMLP(
            input_dim=GLOBAL_INPUT_DIM,
            hidden_dim=HIDDEN_DIM,
            dropout_rate=DROPOUT_RATE,
            noise_std=NOISE_STD
        )
        initialize_weights(model)
        model.to(device)

        optimizer = optim.AdamW(
            model.parameters(),
            lr=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY,
            betas=(0.9, 0.999)
        )

        # The scheduler is stepped once per epoch (end of the epoch loop), so
        # the one-cycle schedule has to span EPOCHS steps. Sizing it per batch
        # would leave the learning rate stuck in the warm-up phase.
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=LEARNING_RATE,
            total_steps=EPOCHS,
            pct_start=0.1,
            anneal_strategy='cos'
        )
        recon_criterion = nn.MSELoss()
        # reduction="none" keeps one squared error per sample so the per-row
        # weights actually influence the loss.
        supervised_criterion = nn.MSELoss(reduction="none")
        scaler = torch.amp.GradScaler(device.type, enabled=on_cuda)

        early_stopping = EarlyStopping(patience=7)

        best_val_loss = float('inf')

        for epoch in range(1, EPOCHS + 1):
            # ----------------------------
            # Training Phase
            # ----------------------------
            model.train()
            train_loss = 0.0
            train_steps = 0

            for file in files:
                features, targets, weights = load_and_scale_data_for_dates(
                    file, train_dates, feature_scaler, EXCLUDE_COLS
                )
                if features is None:
                    continue

                train_dataset = FinancialDataset(features, targets, weights)
                train_loader = DataLoader(
                    train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                    num_workers=4, pin_memory=on_cuda
                )

                for batch_x, batch_y, batch_w in train_loader:
                    batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

                    optimizer.zero_grad(set_to_none=True)
                    with torch.amp.autocast(device_type=device.type, enabled=on_cuda):
                        reconstructed, mlp_output, _ = model(batch_x)
                        loss_recon = recon_criterion(reconstructed, batch_x)
                        # squeeze(-1) keeps the batch axis even for a batch of one
                        per_sample = supervised_criterion(mlp_output.squeeze(-1), batch_y)
                        # Weighted mean; the epsilon guards against an all-zero weight batch
                        loss_sup = (per_sample * batch_w).sum() / (batch_w.sum() + 1e-6)
                        loss = loss_recon + loss_sup

                    scaler.scale(loss).backward()
                    # Unscale first so clipping sees the true gradient norms
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=GRADIENT_CLIP_VALUE)
                    scaler.step(optimizer)
                    scaler.update()

                    train_loss += loss.item()
                    train_steps += 1

            avg_train_loss = train_loss / train_steps if train_steps > 0 else float('nan')

            # ----------------------------
            # Validation Phase
            # ----------------------------
            model.eval()
            val_loss = 0.0
            val_steps = 0
            all_targets, all_predictions = [], []

            with torch.no_grad():
                for file in files:
                    features, targets, weights = load_and_scale_data_for_dates(
                        file, val_dates, feature_scaler, EXCLUDE_COLS
                    )
                    if features is None:
                        continue

                    val_dataset = FinancialDataset(features, targets, weights)
                    val_loader = DataLoader(
                        val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=4, pin_memory=on_cuda
                    )

                    for batch_x, batch_y, batch_w in val_loader:
                        batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

                        with torch.amp.autocast(device_type=device.type, enabled=on_cuda):
                            reconstructed, mlp_output, _ = model(batch_x)
                            loss_recon = recon_criterion(reconstructed, batch_x)
                            predictions = mlp_output.squeeze(-1)
                            per_sample = supervised_criterion(predictions, batch_y)
                            loss_sup = (per_sample * batch_w).sum() / (batch_w.sum() + 1e-6)
                            loss = loss_recon + loss_sup

                        val_loss += loss.item()
                        val_steps += 1

                        # Collect predictions and targets for R² calculation
                        all_targets.extend(batch_y.cpu().numpy())
                        all_predictions.extend(predictions.float().cpu().numpy())

            avg_val_loss = val_loss / val_steps if val_steps > 0 else float('nan')

            # R² calculation with NaN protection
            all_targets = np.array(all_targets)
            all_predictions = np.array(all_predictions)
            valid_mask = np.isfinite(all_targets) & np.isfinite(all_predictions)
            filtered_targets = all_targets[valid_mask]
            filtered_predictions = all_predictions[valid_mask]
            r2 = r2_score(filtered_targets, filtered_predictions) if len(filtered_targets) > 1 else float('nan')

            logger.info(f"[Fold {fold_idx} | Epoch {epoch}] "
                        f"Train Loss: {avg_train_loss:.5f} | "
                        f"Val Loss: {avg_val_loss:.5f} | "
                        f"R²: {r2:.5f}")

            # ----------------------------
            # Save the best model
            # ----------------------------
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), f"best_model_fold_{fold_idx}.pt")
                logger.info(f"Best model updated for fold {fold_idx}")

            # Step scheduler and early stopping
            # The schedule covers EPOCHS steps; skip the step after the final
            # epoch so it is never advanced past its end.
            if epoch < EPOCHS:
                scheduler.step()
            early_stopping(avg_val_loss)
            if early_stopping.early_stop:
                logger.info(f"Early stopping triggered for Fold {fold_idx}")
                break

# ----------------------------------------------------------------------------
# Main Execution (unchanged from original)
# ----------------------------------------------------------------------------
# NOTE(review): sorted_dates and folds are computed with these same arguments
# right after the scaler is built; this second pass rescans every partition
# and looks redundant - confirm before removing.
sorted_dates = get_unique_dates(all_part_files)
folds = create_rolling_folds(sorted_dates, N_FOLDS, GAP_SIZE, VAL_SIZE, TRAIN_SIZE)
logger.info(f"Created {len(folds)} folds")

# Train and checkpoint one model per fold
train_model_with_cv(folds, all_part_files, feature_scaler)


In [1]:
#current
import os
import glob
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import random
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import r2_score
import logging

# Setup Logging
# Emit INFO-and-above records through a stream handler, each prefixed with a
# timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)
# Module-level logger used by the rest of this cell
logger = logging.getLogger(__name__)

# Reproducibility Setup
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs so runs are repeatable.

    When CUDA is available, every GPU generator is seeded as well and cuDNN
    is switched to deterministic mode with benchmarking disabled.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False  # safer for reproducibility

# Seed all RNGs before any model or data-loader objects are created
set_seed(42)
# Prefer the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Constants
HIDDEN_DIM = 512  # base width; the model derives its layer sizes from it
DROPOUT_RATE = 0.2  # dropout probability passed to the model
NOISE_STD = 0.001  # std of the Gaussian noise the model adds in training mode
LEARNING_RATE = 1e-5  # AdamW lr and OneCycleLR max_lr
BATCH_SIZE = 256  # DataLoader batch size for training and validation
WEIGHT_DECAY = 1e-5  # AdamW weight decay
EPOCHS = 75  # upper bound on epochs per fold (early stopping may end sooner)
# Rolling-fold layout, expressed in numbers of distinct date_id values
N_FOLDS = 5  # maximum number of folds to build
GAP_SIZE = 30  # dates skipped between the train and validation windows
VAL_SIZE = 100  # dates in each validation window
TRAIN_SIZE = 300  # dates in each training window
GRADIENT_CLIP_VALUE = 0.5  # same value as the max_norm applied in the training loop
# NOTE(review): LR_PATIENCE and MIN_LR do not appear to be read by the
# OneCycleLR setup used for training; presumably left over from a
# ReduceLROnPlateau configuration - confirm before removing.
LR_PATIENCE = 5
MIN_LR = 1e-7

# Paths
PARTITIONED_DIR = "/home/jupyter/data/partitioned/"
all_part_files = sorted(glob.glob(os.path.join(PARTITIONED_DIR, "*.parquet")))
logger.info(f"Found {len(all_part_files)} partitioned files.")
if not all_part_files:
    # Fail with an actionable message instead of a bare IndexError below
    raise FileNotFoundError(f"No .parquet files found in {PARTITIONED_DIR}")

# Exclude columns
# Identifier columns, the sample weight and every responder_* column are kept
# out of the model inputs. Only the parquet schema is read here: the column
# names are all that is needed, so no partition data is loaded.
EXCLUDE_COLS = ["date_id", "time_id", "symbol_id", "weight", "partition_id"]
EXCLUDE_COLS += [
    col for col in pl.read_parquet_schema(all_part_files[0])
    if col.startswith("responder_")
]

class EarlyStopping:
    """Signal a stop once the validation loss stops improving.

    A call counts as an improvement when the loss is lower than the best
    value seen so far by more than ``min_delta``. After ``patience``
    consecutive calls without improvement, ``early_stop`` becomes True.
    """

    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0  # consecutive calls without improvement
        self.best_loss = None  # lowest loss recorded so far
        self.early_stop = False

    def __call__(self, val_loss):
        improved = (
            self.best_loss is None
            or val_loss < self.best_loss - self.min_delta
        )
        if improved:
            self.best_loss, self.counter = val_loss, 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

class FinancialDataset(Dataset):
    """In-memory dataset yielding ``(features, target, weight)`` triples.

    The three inputs are copied into float32 tensors when the dataset is
    built; indexing returns the matching entry of each tensor.
    """

    def __init__(self, features, target, weights):
        self.features, self.target, self.weights = (
            torch.tensor(values, dtype=torch.float32)
            for values in (features, target, weights)
        )

    def __len__(self):
        # The dataset has one sample per feature row
        return len(self.features)

    def __getitem__(self, idx):
        columns = (self.features, self.target, self.weights)
        return tuple(column[idx] for column in columns)

class GaussianNoise(nn.Module):
    """Additive zero-mean Gaussian noise that is active only in training mode."""

    def __init__(self, std=0.05):
        super().__init__()
        self.std = std  # standard deviation of the injected noise

    def forward(self, x):
        # Return the input untouched in eval mode or when noise is disabled
        if not (self.training and self.std > 0):
            return x
        return x + torch.randn_like(x) * self.std

class AutoencoderWithMLP(nn.Module):
    """Denoising autoencoder with a regression head on input plus latent code.

    The input is layer-normalized, perturbed by ``GaussianNoise`` (training
    mode only), encoded down to a ``hidden_dim // 8`` latent vector and
    decoded back to ``input_dim``. The regression head sees the normalized
    input concatenated with the latent vector and emits one value per row.
    """

    def __init__(self, input_dim, hidden_dim=512, dropout_rate=0.2, noise_std=0.001):  # Note original values
        super().__init__()
        self.noise = GaussianNoise(std=noise_std)

        # Widths shrink by powers of two: 256 / 128 / 64 for the default 512
        wide, mid, narrow = (hidden_dim // divisor for divisor in (2, 4, 8))

        # Normalizes the raw input before anything else touches it
        self.input_norm = nn.LayerNorm(input_dim)

        encoder_layers = [
            nn.Linear(input_dim, wide),
            nn.LayerNorm(wide),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(wide, mid),
            nn.LayerNorm(mid),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        bottleneck_layers = [nn.Linear(mid, narrow), nn.LayerNorm(narrow)]
        self.bottleneck = nn.Sequential(*bottleneck_layers)

        decoder_layers = [
            nn.Linear(narrow, mid),
            nn.LayerNorm(mid),
            nn.SiLU(),
            nn.Linear(mid, input_dim),
            nn.LayerNorm(input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

        head_layers = [
            nn.Linear(input_dim + narrow, wide),
            nn.LayerNorm(wide),
            nn.SiLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(wide, 1),
        ]
        self.mlp = nn.Sequential(*head_layers)

        self._init_weights()

    def _init_weights(self):
        """Give every linear layer fan-in scaled normal weights and zero biases."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight, a=0.1, mode='fan_in', nonlinearity='linear')
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return ``(reconstructed, mlp_output, latent)`` for a batch ``x``."""
        normed = self.input_norm(x)

        latent = self.bottleneck(self.encoder(self.noise(normed)))

        # Skip connection: the decoder output is added to the normalized input
        reconstructed = self.decoder(latent) + normed

        mlp_output = self.mlp(torch.cat([normed, latent], dim=1))

        return reconstructed, mlp_output, latent

import math

def initialize_weights(model):
    """Re-initialize every ``nn.Linear`` in *model* with He-normal weights.

    Weights are drawn from a normal distribution with gain sqrt(2), i.e.
    std = sqrt(2 / fan_in), used here as an approximation for SiLU. Biases
    are set to zero. The model is modified in place; nothing is returned.

    Args:
        model: any ``nn.Module``; all of its ``nn.Linear`` sub-modules
            (including the module itself) are re-initialized.
    """
    for m in model.modules():
        if not isinstance(m, nn.Linear):
            continue
        # kaiming_normal_'s ``a`` argument is the leaky-ReLU negative slope,
        # not a gain: passing sqrt(2) there shrinks the gain to sqrt(2 / 3).
        # Requesting the "relu" gain yields the intended sqrt(2).
        nn.init.kaiming_normal_(m.weight, mode="fan_in", nonlinearity="relu")
        if m.bias is not None:
            nn.init.zeros_(m.bias)


def get_unique_dates(files):
    """Return the sorted distinct ``date_id`` values found across *files*.

    Only the ``date_id`` column is read from each parquet file, so the
    remaining columns are never loaded into memory.

    Args:
        files: iterable of parquet file paths, each with a ``date_id`` column.

    Returns:
        A sorted list of the unique ``date_id`` values over all files.
    """
    all_dates = set()
    for file in files:
        date_ids = pl.read_parquet(file, columns=["date_id"]).get_column("date_id")
        all_dates.update(date_ids.unique().to_list())
    return sorted(all_dates)

def load_and_scale_data_for_dates(
    parquet_file, date_list, feature_scaler, exclude_cols,
    target_col="responder_6", weight_col="weight"
):
    """
    Read one parquet partition and prepare model-ready arrays for a date subset.

    Rows are kept only when their date_id is in date_list and none of the
    feature, target or weight columns is null. Features go through
    feature_scaler, non-finite values are replaced (NaN -> 0, +inf -> 1,
    -inf -> -1) and the result is limited to [-100, 100]. Targets are
    standardized with the mean and standard deviation of the rows returned
    by this call.

    Returns (features, targets, weights), or (None, None, None) when no
    usable rows remain.
    """
    no_data = (None, None, None)

    frame = pl.read_parquet(parquet_file).filter(pl.col("date_id").is_in(date_list))
    if frame.is_empty():
        return no_data

    # Every column that is not explicitly excluded is treated as a feature
    feature_cols = [name for name in frame.columns if name not in exclude_cols]

    # Discard rows with a null in any feature, the target or the weight
    frame = frame.drop_nulls(subset=feature_cols + [target_col, weight_col])
    if frame.is_empty():
        return no_data

    features = frame.select(feature_cols).to_numpy().astype(np.float32)
    targets = frame[target_col].to_numpy().astype(np.float32)
    weights = frame[weight_col].to_numpy().astype(np.float32)

    # Scale, then neutralize anything non-finite the scaling left behind
    features = np.nan_to_num(
        feature_scaler.transform(features), nan=0.0, posinf=1.0, neginf=-1.0
    )
    if (np.abs(features) > 100).any():
        features = np.clip(features, -100, 100)

    # Standardize the targets; the epsilon avoids dividing by a zero std
    spread = targets.std() + 1e-6
    targets = (targets - targets.mean()) / spread

    return features, targets, weights

def create_rolling_folds(dates_list, n_folds, gap_size, val_size, train_size):
    """Slice *dates_list* into consecutive, non-overlapping train/validation folds.

    Each fold takes ``train_size`` dates for training, skips ``gap_size``
    dates, then takes ``val_size`` dates for validation. The next fold starts
    right after the previous validation window. At most ``n_folds`` folds are
    produced; building stops early once a full fold no longer fits.

    Returns a list of ``(train_dates, val_dates)`` tuples.
    """
    fold_span = train_size + gap_size + val_size
    available = len(dates_list)

    folds = []
    cursor = 0
    while len(folds) < n_folds and cursor + fold_span <= available:
        train_stop = cursor + train_size
        val_start = train_stop + gap_size
        val_stop = val_start + val_size

        folds.append((dates_list[cursor:train_stop], dates_list[val_start:val_stop]))
        cursor = val_stop

    return folds

# Build or load the scaler
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a feature_scaler.pkl that was produced by this notebook.
if os.path.exists("feature_scaler.pkl"):
    with open("feature_scaler.pkl", "rb") as f:
        feature_scaler = pickle.load(f)
else:
    # No cached scaler: fit one on the feature columns of the first partition
    # only, then cache it for later runs.
    df_tmp = pl.read_parquet(all_part_files[0])
    numeric_cols = [col for col in df_tmp.columns if col not in EXCLUDE_COLS]
    features = df_tmp.select(numeric_cols).to_numpy()
    feature_scaler = StandardScaler().fit(features)
    with open("feature_scaler.pkl", "wb") as f:
        pickle.dump(feature_scaler, f)
    del df_tmp

# Number of features the scaler was fitted on; passed to the model as input_dim
GLOBAL_INPUT_DIM = feature_scaler.n_features_in_
# Collect every distinct date_id, then carve the rolling train/validation folds
sorted_dates = get_unique_dates(all_part_files)
folds = create_rolling_folds(sorted_dates, N_FOLDS, GAP_SIZE, VAL_SIZE, TRAIN_SIZE)

logger.info(f"Created {len(folds)} folds")

def train_model_with_cv(folds, files, feature_scaler):
    """Train one AutoencoderWithMLP per fold and checkpoint its best epoch.

    For every fold a fresh model is trained for up to EPOCHS epochs. Each
    epoch streams the parquet partitions one at a time, first for the
    training dates and then for the validation dates. The loss is the
    reconstruction MSE plus the sample-weighted squared error of the
    regression head. The weights with the lowest average validation loss are
    saved to ``best_model_fold_{fold_idx}.pt``; training of a fold ends early
    after 7 epochs without improvement.

    Args:
        folds: sequence of ``(train_dates, val_dates)`` tuples.
        files: parquet partition paths scanned on every epoch.
        feature_scaler: fitted scaler forwarded to
            ``load_and_scale_data_for_dates``.

    Returns:
        None. Results are written to disk and reported through the logger.
    """
    # Mixed precision and pinned memory are only useful on CUDA; on CPU they
    # are switched off instead of being pointed at a GPU that is not there.
    on_cuda = device.type == "cuda"

    for fold_idx, (train_dates, val_dates) in enumerate(folds, start=1):
        logger.info(f"Training Fold {fold_idx}/{len(folds)}...")

        model = AutoencoderWithMLP(
            input_dim=GLOBAL_INPUT_DIM,
            hidden_dim=HIDDEN_DIM,
            dropout_rate=DROPOUT_RATE,
            noise_std=NOISE_STD
        )
        initialize_weights(model)
        model.to(device)

        optimizer = optim.AdamW(
            model.parameters(),
            lr=LEARNING_RATE,  # Keep original 1e-5
            weight_decay=WEIGHT_DECAY,  # Keep original 1e-5
            betas=(0.9, 0.999)
        )

        # The scheduler is stepped once per epoch (end of the epoch loop), so
        # the one-cycle schedule has to span EPOCHS steps. Sizing it per batch
        # would leave the learning rate stuck in the warm-up phase.
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=LEARNING_RATE,
            total_steps=EPOCHS,
            pct_start=0.1,
            anneal_strategy='cos',
            div_factor=25.0  # Starts with lr/25
        )

        recon_criterion = nn.MSELoss()
        # reduction="none" keeps one squared error per sample so the per-row
        # weights actually influence the loss.
        supervised_criterion = nn.MSELoss(reduction="none")
        scaler = torch.amp.GradScaler(device.type, enabled=on_cuda)

        early_stopping = EarlyStopping(patience=7)

        best_val_loss = float('inf')

        for epoch in range(1, EPOCHS + 1):
            # ----------------------------
            # Training Phase
            # ----------------------------
            model.train()
            train_loss = 0.0
            train_steps = 0

            for file in files:
                features, targets, weights = load_and_scale_data_for_dates(
                    file, train_dates, feature_scaler, EXCLUDE_COLS
                )
                if features is None:
                    continue

                train_dataset = FinancialDataset(features, targets, weights)
                train_loader = DataLoader(
                    train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                    num_workers=4, pin_memory=on_cuda
                )

                for batch_x, batch_y, batch_w in train_loader:
                    batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

                    optimizer.zero_grad(set_to_none=True)
                    with torch.amp.autocast(device_type=device.type, enabled=on_cuda):
                        reconstructed, mlp_output, _ = model(batch_x)
                        loss_recon = recon_criterion(reconstructed, batch_x)
                        # squeeze(-1) keeps the batch axis even for a batch of one
                        per_sample = supervised_criterion(mlp_output.squeeze(-1), batch_y)
                        # Weighted mean; the epsilon guards against an all-zero weight batch
                        loss_sup = (per_sample * batch_w).sum() / (batch_w.sum() + 1e-6)
                        loss = loss_recon + loss_sup

                    scaler.scale(loss).backward()
                    # Unscale first so clipping sees the true gradient norms
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=GRADIENT_CLIP_VALUE)
                    scaler.step(optimizer)
                    scaler.update()

                    train_loss += loss.item()
                    train_steps += 1

            avg_train_loss = train_loss / train_steps if train_steps > 0 else float('nan')

            # ----------------------------
            # Validation Phase
            # ----------------------------
            model.eval()
            val_loss = 0.0
            val_steps = 0
            all_targets, all_predictions = [], []

            with torch.no_grad():
                for file in files:
                    features, targets, weights = load_and_scale_data_for_dates(
                        file, val_dates, feature_scaler, EXCLUDE_COLS
                    )
                    if features is None:
                        continue

                    val_dataset = FinancialDataset(features, targets, weights)
                    val_loader = DataLoader(
                        val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=4, pin_memory=on_cuda
                    )

                    for batch_x, batch_y, batch_w in val_loader:
                        batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

                        with torch.amp.autocast(device_type=device.type, enabled=on_cuda):
                            reconstructed, mlp_output, _ = model(batch_x)
                            loss_recon = recon_criterion(reconstructed, batch_x)
                            predictions = mlp_output.squeeze(-1)
                            per_sample = supervised_criterion(predictions, batch_y)
                            loss_sup = (per_sample * batch_w).sum() / (batch_w.sum() + 1e-6)
                            loss = loss_recon + loss_sup

                        val_loss += loss.item()
                        val_steps += 1

                        # Collect predictions and targets for R² calculation
                        all_targets.extend(batch_y.cpu().numpy())
                        all_predictions.extend(predictions.float().cpu().numpy())

            avg_val_loss = val_loss / val_steps if val_steps > 0 else float('nan')

            # R² calculation with NaN protection
            all_targets = np.array(all_targets)
            all_predictions = np.array(all_predictions)
            valid_mask = np.isfinite(all_targets) & np.isfinite(all_predictions)
            filtered_targets = all_targets[valid_mask]
            filtered_predictions = all_predictions[valid_mask]
            r2 = r2_score(filtered_targets, filtered_predictions) if len(filtered_targets) > 1 else float('nan')

            logger.info(f"[Fold {fold_idx} | Epoch {epoch}] "
                        f"Train Loss: {avg_train_loss:.5f} | "
                        f"Val Loss: {avg_val_loss:.5f} | "
                        f"R²: {r2:.5f}")

            # ----------------------------
            # Save the best model
            # ----------------------------
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), f"best_model_fold_{fold_idx}.pt")
                logger.info(f"Best model updated for fold {fold_idx}")

            # Step scheduler and early stopping
            # The schedule covers EPOCHS steps; skip the step after the final
            # epoch so it is never advanced past its end.
            if epoch < EPOCHS:
                scheduler.step()
            early_stopping(avg_val_loss)
            if early_stopping.early_stop:
                logger.info(f"Early stopping triggered for Fold {fold_idx}")
                break

# ----------------------------------------------------------------------------
# Main Execution (unchanged from original)
# ----------------------------------------------------------------------------
# NOTE(review): sorted_dates and folds are computed with these same arguments
# right after the scaler is built; this second pass rescans every partition
# and looks redundant - confirm before removing.
sorted_dates = get_unique_dates(all_part_files)
folds = create_rolling_folds(sorted_dates, N_FOLDS, GAP_SIZE, VAL_SIZE, TRAIN_SIZE)
logger.info(f"Created {len(folds)} folds")

# Train and checkpoint one model per fold
train_model_with_cv(folds, all_part_files, feature_scaler)
#current

2025-01-13 11:56:51,871 [INFO] Using device: cuda
2025-01-13 11:56:51,874 [INFO] Found 9 partitioned files.
2025-01-13 11:57:08,631 [INFO] Created 3 folds
2025-01-13 11:57:22,091 [INFO] Created 3 folds
2025-01-13 11:57:22,092 [INFO] Training Fold 1/3...
In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

or by setting POLARS_ALLOW_FORKING_THREAD=1.

  self.pid = os.fork()
In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3

In [None]:
import os
import glob
import numpy as np
import polars as pl
import torch
import logging
from collections import defaultdict

def analyze_fold_data(fold_idx, train_dates, val_dates, files, feature_scaler, exclude_cols):
    """Log per-fold summary statistics and return the split sizes.

    Every file is loaded twice through ``load_and_scale_data_for_dates`` (once
    for the training dates, once for the validation dates). For each non-empty
    result the per-file feature means/stds, target mean/std, weight mean and
    NaN count are recorded, then averaged (NaN counts are summed) and logged.

    Args:
        fold_idx: fold number, used only in log messages.
        train_dates: date ids of the training window.
        val_dates: date ids of the validation window.
        files: paths handed one by one to ``load_and_scale_data_for_dates``.
        feature_scaler: forwarded unchanged to the loader.
        exclude_cols: forwarded unchanged to the loader.

    Returns:
        Tuple ``(total_train_samples, total_val_samples)``.
    """
    split_dates = {"train": train_dates, "val": val_dates}
    split_stats = {"train": defaultdict(list), "val": defaultdict(list)}
    split_sizes = {"train": 0, "val": 0}

    logger.info(f"\nAnalyzing Fold {fold_idx}:")
    logger.info(f"Train dates range: {min(train_dates)} to {max(train_dates)}")
    logger.info(f"Val dates range: {min(val_dates)} to {max(val_dates)}")

    for file in files:
        # Training data first, then validation data, for each file.
        for split in ("train", "val"):
            features, targets, weights = load_and_scale_data_for_dates(
                file, split_dates[split], feature_scaler, exclude_cols
            )
            if features is None:
                continue

            split_sizes[split] += len(features)
            record = split_stats[split]
            record['feature_means'].append(np.nanmean(features, axis=0))
            record['feature_stds'].append(np.nanstd(features, axis=0))
            record['target_mean'].append(np.nanmean(targets))
            record['target_std'].append(np.nanstd(targets))
            record['weight_mean'].append(np.nanmean(weights))
            record['nan_counts'].append(np.isnan(features).sum())

    total_train_samples = split_sizes["train"]
    total_val_samples = split_sizes["val"]

    # Log statistics
    logger.info(f"Total training samples: {total_train_samples}")
    logger.info(f"Total validation samples: {total_val_samples}")

    def report(title, stats):
        """Log the aggregated statistics of one split, if any file contributed."""
        if not stats['feature_means']:
            return
        logger.info(title)
        logger.info(f"Average feature means: {np.nanmean(stats['feature_means']):.4f}")
        logger.info(f"Average feature stds: {np.nanmean(stats['feature_stds']):.4f}")
        logger.info(f"Target mean: {np.nanmean(stats['target_mean']):.4f}")
        logger.info(f"Target std: {np.nanmean(stats['target_std']):.4f}")
        logger.info(f"Weight mean: {np.nanmean(stats['weight_mean']):.4f}")
        logger.info(f"Total NaN count: {sum(stats['nan_counts'])}")

    report("\nTraining Set Statistics:", split_stats["train"])
    report("\nValidation Set Statistics:", split_stats["val"])

    return total_train_samples, total_val_samples

def enhanced_train_model_with_cv(folds, files, feature_scaler):
    """Train one AutoencoderWithMLP per fold with extra diagnostics and NaN guards.

    For every ``(train_dates, val_dates)`` fold this function:
      1. analyses the fold with ``analyze_fold_data`` and skips it when it has
         fewer than 1000 training or 100 validation samples;
      2. builds a fresh model, Adam optimizer, ReduceLROnPlateau scheduler and
         gradient scaler;
      3. for up to ``EPOCHS`` epochs streams every file through a training pass
         and a validation pass, logging losses, gradient norms and R²;
      4. saves the weights with the lowest validation loss to
         ``best_model_fold_{fold_idx}.pt``.

    The supervised loss is a sample-weighted mean of the per-sample squared
    errors: ``sum(w_i * e_i^2) / (sum(w_i) + 1e-6)``.

    Args:
        folds: sequence of ``(train_dates, val_dates)`` pairs.
        files: paths passed one at a time to ``load_and_scale_data_for_dates``.
        feature_scaler: forwarded unchanged to the data-loading helpers.

    Returns:
        None. Results are reported through the logger and the saved checkpoints.
    """
    # Autocast and gradient scaling only help on CUDA; elsewhere both are
    # disabled so the loop runs in plain float32 without device warnings.
    use_amp = device.type == "cuda"

    def hook_fn(grad):
        """Log NaN or unusually large gradients; the gradient itself is not changed.

        NOTE: hooks fire during backward(), i.e. before ``scaler.unscale_``, so
        when AMP is active the norms seen here still include the loss scale.
        """
        if torch.isnan(grad).any():
            logger.error("NaN detected in gradients!")
            return None
        norm = grad.norm().item()
        if norm > 10:
            logger.warning(f"Large gradient norm detected: {norm}")
        return grad

    for fold_idx, (train_dates, val_dates) in enumerate(folds, start=1):
        logger.info(f"\n{'='*50}")
        logger.info(f"Starting Fold {fold_idx}/{len(folds)}")
        logger.info(f"{'='*50}")

        # Analyze fold data before training
        total_train_samples, total_val_samples = analyze_fold_data(
            fold_idx, train_dates, val_dates, files, feature_scaler, EXCLUDE_COLS
        )

        if total_train_samples < 1000 or total_val_samples < 100:
            logger.warning(f"Insufficient data in fold {fold_idx}. "
                         f"Training samples: {total_train_samples}, "
                         f"Validation samples: {total_val_samples}")
            continue

        # Model and optimizer setup
        model = AutoencoderWithMLP(
            input_dim=GLOBAL_INPUT_DIM,
            hidden_dim=HIDDEN_DIM,
            dropout_rate=DROPOUT_RATE,
            noise_std=NOISE_STD
        )
        initialize_weights(model)
        model.to(device)

        # Attach hooks to monitor gradients
        for param in model.parameters():
            if param.requires_grad:
                param.register_hook(hook_fn)

        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=7, verbose=True)
        recon_criterion = nn.MSELoss()
        # reduction='none' keeps one squared error per sample so that the
        # sample weights below actually influence the loss. With the default
        # mean reduction the weights cancel out of the weighted average.
        supervised_criterion = nn.MSELoss(reduction='none')
        scaler = torch.amp.GradScaler(enabled=use_amp)

        early_stopping = EarlyStopping(patience=7)
        best_val_loss = float('inf')
        nan_epochs = 0  # Counter for consecutive epochs with NaN losses

        for epoch in range(1, EPOCHS + 1):
            # ---------------- Training phase ----------------
            model.train()
            train_loss = 0.0
            train_steps = 0
            epoch_grad_norms = []

            for file in files:
                features, targets, weights = load_and_scale_data_for_dates(
                    file, train_dates, feature_scaler, EXCLUDE_COLS
                )
                if features is None:
                    continue

                # Check for NaN/Inf in input data
                if np.isnan(features).any() or np.isinf(features).any():
                    logger.warning(f"NaN/Inf detected in features from {file}")
                    continue

                train_dataset = FinancialDataset(features, targets, weights)
                train_loader = DataLoader(
                    train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                    num_workers=4, pin_memory=True
                )

                for batch_x, batch_y, batch_w in train_loader:
                    if batch_x.size(0) < 2:
                        # Batch statistics cannot be computed from a single
                        # sample in train mode (assumes the model contains
                        # BatchNorm layers - TODO confirm), so skip it.
                        continue

                    batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

                    optimizer.zero_grad(set_to_none=True)

                    try:
                        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                            reconstructed, mlp_output, _ = model(batch_x)
                            loss_recon = recon_criterion(reconstructed, batch_x)
                            # reshape(-1) flattens the predictions regardless of batch size.
                            per_sample_err = supervised_criterion(mlp_output.reshape(-1), batch_y)
                            loss_sup = (per_sample_err * batch_w).sum() / (batch_w.sum() + 1e-6)
                            loss = loss_recon + loss_sup

                        if torch.isnan(loss) or torch.isinf(loss):
                            logger.error(f"NaN/Inf loss detected in fold {fold_idx}, epoch {epoch}")
                            continue

                        scaler.scale(loss).backward()
                        # Unscale first so clipping operates on the true gradient magnitudes.
                        scaler.unscale_(optimizer)

                        # Monitor gradient norms
                        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                        epoch_grad_norms.append(grad_norm.item())

                        scaler.step(optimizer)
                        scaler.update()

                        train_loss += loss.item()
                        train_steps += 1

                    except RuntimeError as e:
                        logger.error(f"Runtime error in training: {str(e)}")
                        continue

            avg_train_loss = train_loss / train_steps if train_steps > 0 else float('nan')

            # Log gradient statistics
            if epoch_grad_norms:
                logger.info(f"Gradient norms - Mean: {np.mean(epoch_grad_norms):.4f}, "
                          f"Max: {np.max(epoch_grad_norms):.4f}")

            # ---------------- Validation phase ----------------
            model.eval()
            val_loss = 0.0
            val_steps = 0
            # One array per batch; concatenated once after the loop.
            all_targets, all_predictions = [], []

            with torch.no_grad():
                for file in files:
                    features, targets, weights = load_and_scale_data_for_dates(
                        file, val_dates, feature_scaler, EXCLUDE_COLS
                    )
                    if features is None:
                        continue

                    val_dataset = FinancialDataset(features, targets, weights)
                    val_loader = DataLoader(
                        val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=4, pin_memory=True
                    )

                    for batch_x, batch_y, batch_w in val_loader:
                        batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

                        try:
                            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                                reconstructed, mlp_output, _ = model(batch_x)
                                loss_recon = recon_criterion(reconstructed, batch_x)
                                predictions = mlp_output.reshape(-1)
                                per_sample_err = supervised_criterion(predictions, batch_y)
                                loss_sup = (per_sample_err * batch_w).sum() / (batch_w.sum() + 1e-6)
                                loss = loss_recon + loss_sup

                            val_loss += loss.item()
                            val_steps += 1

                            all_targets.append(batch_y.cpu().numpy())
                            all_predictions.append(predictions.float().cpu().numpy())

                        except RuntimeError as e:
                            logger.error(f"Runtime error in validation: {str(e)}")
                            continue

            avg_val_loss = val_loss / val_steps if val_steps > 0 else float('nan')

            # Check for NaN losses
            if np.isnan(avg_train_loss) or np.isnan(avg_val_loss):
                nan_epochs += 1
                logger.warning(f"NaN loss detected in epoch {epoch}. NaN epoch count: {nan_epochs}")
                if nan_epochs >= 3:
                    logger.error(f"Too many NaN epochs in fold {fold_idx}. Stopping this fold.")
                    break
            else:
                nan_epochs = 0  # Reset counter if we get a valid epoch

            # R² calculation with enhanced error handling
            try:
                targets_arr = np.concatenate(all_targets) if all_targets else np.empty(0)
                predictions_arr = np.concatenate(all_predictions) if all_predictions else np.empty(0)
                valid_mask = np.isfinite(targets_arr) & np.isfinite(predictions_arr)
                filtered_targets = targets_arr[valid_mask]
                filtered_predictions = predictions_arr[valid_mask]
                r2 = r2_score(filtered_targets, filtered_predictions) if len(filtered_targets) > 1 else float('nan')

                logger.info(f"[Fold {fold_idx} | Epoch {epoch}] "
                          f"Train Loss: {avg_train_loss:.5f} | "
                          f"Val Loss: {avg_val_loss:.5f} | "
                          f"R²: {r2:.5f}")

            except Exception as e:
                logger.error(f"Error calculating R² score: {str(e)}")
                r2 = float('nan')

            # Model saving with error handling
            if avg_val_loss < best_val_loss and not np.isnan(avg_val_loss):
                best_val_loss = avg_val_loss
                try:
                    torch.save(model.state_dict(), f"best_model_fold_{fold_idx}.pt")
                    logger.info(f"Best model updated for fold {fold_idx}")
                except Exception as e:
                    logger.error(f"Error saving model: {str(e)}")

            # Scheduler and early stopping only see finite losses: a NaN would
            # otherwise count as a "bad" epoch and could become the early
            # stopper's reference value. NaN epochs are handled by nan_epochs.
            if np.isfinite(avg_val_loss):
                scheduler.step(avg_val_loss)
                early_stopping(avg_val_loss)
                if early_stopping.early_stop:
                    logger.info(f"Early stopping triggered for Fold {fold_idx}")
                    break

# Run the enhanced cross-validation training. `folds`, `all_part_files` and
# `feature_scaler` are module-level names that must already exist from the
# cells executed before this one.
enhanced_train_model_with_cv(folds, all_part_files, feature_scaler)

In [None]:
# Best version so far, but it runs an Optuna search, which I don't have time for.
import os
import glob
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import json
import random
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from torch import amp
from torch.optim.lr_scheduler import ReduceLROnPlateau
import logging
from sklearn.metrics import mean_squared_error, r2_score

#############################################
# Setup Logging
#############################################
# Configure the root logger: INFO level, timestamped messages, output to the
# default stream handler. Note that logging.basicConfig does nothing when the
# root logger already has handlers (e.g. an earlier cell already called it in
# the same kernel).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler()
    ]
)
# Module-level logger used by every function in this cell.
logger = logging.getLogger(__name__)

#############################################
# 1) Reproducibility Setup
#############################################
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available the generators of all GPUs are seeded as well.

    Args:
        seed: integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed all RNGs once, before any model or data loader is created.
set_seed(42)
logger.info("Random seed set for reproducibility.")

#############################################
# 2) Basic Setup
#############################################
# Prefer the GPU when one is visible; everything below moves tensors to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Partitioned parquet files, sorted by file name so the processing order is deterministic.
PARTITIONED_DIR = "/home/jupyter/data/partitioned/"
all_part_files = sorted(glob.glob(os.path.join(PARTITIONED_DIR, "XGFeatures_part_*.parquet")))
logger.info(f"Found {len(all_part_files)} partitioned files.")

# Columns that are never used as model features. Columns starting with
# "responder_" are filtered out separately by the loading functions.
EXCLUDE_COLS = ["date_id", "time_id", "symbol_id", "weight", "partition_id"]

# Directories for saving results (model checkpoints and pickled scalers)
os.makedirs("saved_models", exist_ok=True)
os.makedirs("saved_scalers", exist_ok=True)

#############################################
# 3) Identify Unique Dates
#############################################
def get_unique_dates(part_files, num_files=None):
    """Return the sorted distinct ``date_id`` values found in the partition files.

    Args:
        part_files: sequence of parquet file paths.
        num_files: when given, only the first ``num_files`` paths are read;
            ``None`` reads every file.

    Returns:
        Sorted list of the unique ``date_id`` values across the files read.
    """
    files_to_process = part_files if num_files is None else part_files[:num_files]

    all_dates = set()
    for f in files_to_process:
        # Only date_id is needed, so ask the parquet reader for that single
        # column instead of materialising the whole chunk.
        date_ids = pl.read_parquet(f, columns=["date_id"]).get_column("date_id")
        all_dates.update(date_ids.unique().to_list())

    return sorted(all_dates)

# Collect the distinct date ids from at most the first 17 partition files
# (slicing means fewer files are read if fewer than 17 exist).
sorted_dates = get_unique_dates(all_part_files, num_files=17)
logger.info(f"Identified {len(sorted_dates)} unique dates from first 17 chunks.")

#############################################
# 4) Create "Rolling" Folds with a Gap
#############################################
def create_rolling_folds(dates_list, n_folds=5, gap_size=30, val_size=100, train_size=300):
    """Split an ordered date list into consecutive train / gap / validation folds.

    Each fold takes ``train_size`` dates for training, skips ``gap_size``
    dates, then takes up to ``val_size`` dates for validation (the validation
    window is truncated at the end of the list). The next fold starts where
    the previous validation window ended, so folds never overlap.

    Args:
        dates_list: ordered sequence of date ids.
        n_folds: requested number of folds; reduced (with a warning) when the
            date range cannot hold that many.
        gap_size: number of dates skipped between training and validation.
        val_size: maximum number of validation dates per fold.
        train_size: number of training dates per fold.

    Returns:
        List of ``(train_dates, val_dates)`` tuples, possibly shorter than
        ``n_folds``.
    """
    total_dates = len(dates_list)
    dates_per_fold = train_size + gap_size + val_size

    # Cap the number of folds at what the available dates allow.
    max_possible_folds = (total_dates + gap_size) // dates_per_fold
    if max_possible_folds < n_folds:
        logger.warning(f"Requested {n_folds} folds, but only {max_possible_folds} can be created with {total_dates} dates.")
        n_folds = max_possible_folds

    folds = []
    cursor = 0  # index of the first training date of the current fold
    for i in range(n_folds):
        train_stop = cursor + train_size
        if train_stop >= total_dates:
            logger.warning(f"Not enough dates to create fold {i+1}.")
            break

        # The validation window starts after the gap and is clipped to the list end.
        val_begin = train_stop + gap_size
        val_stop = min(val_begin + val_size, total_dates)

        train_dates = dates_list[cursor:train_stop]
        val_dates = dates_list[val_begin:val_stop]

        # The next fold begins right after this validation window.
        cursor = val_stop

        if not (train_dates and val_dates):
            logger.warning(f"Empty train or validation set for fold {i+1}. Skipping.")
            break

        folds.append((train_dates, val_dates))
        logger.info(f"Created Fold {i+1}: Train={len(train_dates)} dates, Val={len(val_dates)} dates")

    logger.info(f"Total folds created: {len(folds)}")
    return folds

# Build the rolling folds for this cell. train_size is not passed, so
# create_rolling_folds uses its default of 300 training dates per fold.
n_folds = 5
folds = create_rolling_folds(sorted_dates, n_folds=n_folds, gap_size=30, val_size=100)
logger.info(f"Created {len(folds)} rolling folds with gap=30, val_size=100.")
# Report the size of each fold's train / validation date windows.
for i, (td, vd) in enumerate(folds, start=1):
    logger.info(f"Fold {i} => Train: {len(td)} dates, Val: {len(vd)} dates")

#############################################
# 5) Data Classes
#############################################
class FinancialDataset(Dataset):
    """In-memory dataset yielding ``(features, target, weight)`` float32 tensors.

    The three inputs are copied into float32 tensors at construction time and
    indexed in lockstep.
    """

    def __init__(self, features, target, weights):
        self.features = self._as_float32(features)
        self.target = self._as_float32(target)
        self.weights = self._as_float32(weights)

    @staticmethod
    def _as_float32(values):
        """Copy ``values`` into a new float32 tensor."""
        return torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        # The dataset length is the number of feature rows.
        return len(self.features)

    def __getitem__(self, idx):
        sample_features = self.features[idx]
        sample_target = self.target[idx]
        sample_weight = self.weights[idx]
        return sample_features, sample_target, sample_weight

class GaussianNoise(nn.Module):
    """Add zero-mean Gaussian noise to the input while the module is training.

    In eval mode, or when ``std`` is not positive, the input is returned
    unchanged (the very same tensor object).
    """

    def __init__(self, std=0.05):
        super().__init__()
        self.std = std

    def forward(self, x):
        noise_enabled = self.training and self.std > 0
        if not noise_enabled:
            return x
        # Noise has the same shape, dtype and device as the input.
        return x + torch.randn_like(x) * self.std

#############################################
# 6) Model Definition
#############################################
class AutoencoderWithMLP(nn.Module):
    """Denoising autoencoder whose latent code also feeds a regression MLP.

    The encoder compresses the (noised) input to ``hidden_dim // 4`` latent
    units, the decoder reconstructs the input from that code, and the MLP
    predicts a single value from the original input concatenated with the
    latent code.
    """

    def __init__(self, input_dim, hidden_dim=512, dropout_rate=0.3, noise_std=0.05):
        super().__init__()
        half_dim = hidden_dim // 2
        latent_dim = hidden_dim // 4

        self.noise = GaussianNoise(std=noise_std)

        # Encoder: three Linear -> SiLU -> BatchNorm -> Dropout stages.
        self.encoder = nn.Sequential(
            *self._stage(input_dim, hidden_dim, dropout_rate),
            *self._stage(hidden_dim, hidden_dim, dropout_rate),
            *self._stage(hidden_dim, half_dim, dropout_rate),
        )
        self.bottleneck = nn.Linear(half_dim, latent_dim)

        # Decoder: two stages without dropout, then a linear map back to the input size.
        self.decoder = nn.Sequential(
            *self._stage(latent_dim, half_dim),
            *self._stage(half_dim, hidden_dim),
            nn.Linear(hidden_dim, input_dim),
        )

        # MLP head on [input, latent]: three stages with dropout, then one output unit.
        self.mlp = nn.Sequential(
            *self._stage(input_dim + latent_dim, hidden_dim, dropout_rate),
            *self._stage(hidden_dim, hidden_dim, dropout_rate),
            *self._stage(hidden_dim, half_dim, dropout_rate),
            nn.Linear(half_dim, 1),
        )

    @staticmethod
    def _stage(in_dim, out_dim, dropout=None):
        """Return the layers of one Linear -> SiLU -> BatchNorm1d (-> Dropout) stage.

        The Dropout layer is appended only when ``dropout`` is not ``None``.
        """
        layers = [nn.Linear(in_dim, out_dim), nn.SiLU(), nn.BatchNorm1d(out_dim)]
        if dropout is not None:
            layers.append(nn.Dropout(dropout))
        return layers

    def forward(self, x):
        """Return ``(reconstructed, mlp_output, latent)`` for the batch ``x``."""
        latent = self.bottleneck(self.encoder(self.noise(x)))
        reconstructed = self.decoder(latent)

        # The regression head sees the clean input, not the noised one.
        mlp_output = self.mlp(torch.cat([x, latent], dim=1))
        return reconstructed, mlp_output, latent

def initialize_weights(model):
    """Re-initialise every ``nn.Linear`` layer inside ``model`` in place.

    Weights are drawn with Kaiming-normal initialisation (``nonlinearity="relu"``)
    and biases, where present, are set to zero. Other layer types are untouched.
    """
    linear_layers = (layer for layer in model.modules() if isinstance(layer, nn.Linear))
    for layer in linear_layers:
        nn.init.kaiming_normal_(layer.weight, nonlinearity="relu")
        if layer.bias is None:
            continue
        nn.init.zeros_(layer.bias)

#############################################
# 7) Feature + Target Standardization
#############################################
def build_scalers_from_training_data(folds, chunk_files, exclude_cols, target_col="responder_6"):
    """Fit feature and target StandardScalers on the training rows of all folds.

    For every fold and every chunk file, the rows whose ``date_id`` is in the
    fold's training dates are fed to the scalers incrementally with
    ``partial_fit``, so only one filtered chunk is held in memory at a time.
    The fitted scalers are pickled to ``saved_scalers/feature_scaler.pkl`` and
    ``saved_scalers/target_scaler.pkl``.

    Args:
        folds: iterable of ``(train_dates, val_dates)`` pairs; only the
            training dates are used.
        chunk_files: parquet file paths to scan.
        exclude_cols: column names never treated as features (columns starting
            with ``responder_`` are excluded as well).
        target_col: name of the target column.

    Returns:
        Tuple ``(feature_scaler, target_scaler)``.

    Raises:
        ValueError: if no training rows are found in any file.
    """
    import pickle  # local import: this cell does not import pickle at the top

    feature_scaler = StandardScaler()
    target_scaler = StandardScaler()
    rows_seen = 0

    logger.info("Aggregating training data across all folds to fit scalers.")

    for fold_i, (train_dates, _) in enumerate(folds, start=1):
        logger.info(f"Processing Fold {fold_i} for scaler fitting.")
        # The date list is the same for every chunk of this fold; build it once.
        train_date_list = list(train_dates)

        for cf in chunk_files:
            # Load and filter training data for the current fold
            df_train = pl.read_parquet(cf).filter(pl.col("date_id").is_in(train_date_list))
            if df_train.is_empty():
                continue

            numeric_cols = [c for c in df_train.columns if (c not in exclude_cols) and (not c.startswith("responder_"))]
            features = df_train.select(numeric_cols).to_numpy().astype(np.float32)
            targets = df_train[target_col].to_numpy().astype(np.float32).reshape(-1, 1)

            # Incremental fitting updates the running mean/variance chunk by
            # chunk instead of stacking every training row into one array.
            feature_scaler.partial_fit(features)
            target_scaler.partial_fit(targets)
            rows_seen += len(features)

    if rows_seen == 0:
        logger.error("No training data found to fit scalers.")
        raise ValueError("Training data is empty.")

    # Save scalers (make sure the output directory exists even when this
    # function is called on its own).
    os.makedirs("saved_scalers", exist_ok=True)
    with open("saved_scalers/feature_scaler.pkl", "wb") as f:
        pickle.dump(feature_scaler, f)
    with open("saved_scalers/target_scaler.pkl", "wb") as f:
        pickle.dump(target_scaler, f)

    logger.info("Feature and target scalers built and saved from aggregated training data.")
    return feature_scaler, target_scaler

#############################################
# 8) Load & Filter by Date with Scaling
#############################################
def load_and_filter_chunk(
    parquet_file,
    date_set,
    exclude_cols,
    feat_scaler,
    targ_scaler,
    target_col="responder_6",
    weight_col="weight"
):
    """Load one parquet chunk, keep the requested dates and scale it.

    Args:
        parquet_file: path of the chunk to read.
        date_set: ``date_id`` values to keep.
        exclude_cols: column names never treated as features (columns starting
            with ``responder_`` are excluded as well).
        feat_scaler: fitted scaler applied to the feature matrix.
        targ_scaler: fitted scaler applied to the target column.
        target_col: name of the target column.
        weight_col: name of the sample-weight column.

    Returns:
        ``(features, target, weights)`` as NumPy arrays, or
        ``(None, None, None)`` when no row of the chunk matches ``date_set``.
    """
    wanted_dates = list(date_set)
    frame = pl.read_parquet(parquet_file).filter(pl.col("date_id").is_in(wanted_dates))
    if frame.is_empty():
        return None, None, None

    # Features: every column that is neither excluded nor a responder.
    feature_cols = [
        name for name in frame.columns
        if not (name in exclude_cols or name.startswith("responder_"))
    ]
    raw_features = frame.select(feature_cols).to_numpy().astype(np.float32)
    scaled_features = feat_scaler.transform(raw_features)

    # Target: scaled as a single column, then flattened back to 1-D.
    target_column = frame[target_col].to_numpy().astype(np.float32).reshape(-1, 1)
    scaled_target = targ_scaler.transform(target_column).flatten()

    # Weights are passed through unscaled.
    sample_weights = frame[weight_col].to_numpy().astype(np.float32)

    return scaled_features, scaled_target, sample_weights

#############################################
# 9) Early Stopping Class
#############################################
class EarlyStopping:
    """Signal when a monitored loss has stopped improving.

    Call the instance once per epoch with the validation loss. A loss counts
    as an improvement when it is lower than the best loss seen so far by more
    than ``min_delta``. After ``patience`` consecutive calls without an
    improvement, ``early_stop`` is set to ``True``.

    A NaN reference value is replaced by the first real loss that follows it,
    so an initial NaN epoch cannot block all later improvements. NaN losses
    themselves still count as "no improvement".

    Attributes (set in ``__init__``):
        patience: number of non-improving calls tolerated before stopping.
        min_delta: minimum decrease that counts as an improvement.
        counter: consecutive non-improving calls so far.
        best_loss: best loss seen so far (``None`` before the first call).
        early_stop: ``True`` once patience has been exhausted.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            # The first observation only establishes the reference value.
            self.best_loss = val_loss
            return

        # NaN is the only value that is not equal to itself.
        best_is_nan = self.best_loss != self.best_loss
        loss_is_nan = val_loss != val_loss

        improved = val_loss < self.best_loss - self.min_delta
        if improved or (best_is_nan and not loss_is_nan):
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

#############################################
# 10) Training Function
#############################################
def train_one_fold(
    model,
    train_date_set,
    val_date_set,
    chunk_files,
    exclude_cols,
    feat_scaler,
    targ_scaler,
    epochs=30,
    lr=1e-3,
    batch_size=512,
    patience=5,
    device=device,
    fold_i=1  # Added to save models per fold
):
    """Train ``model`` on one fold and return its best validation loss.

    The validation rows of all chunk files are loaded once up front. Each epoch
    then streams the chunk files one at a time, training on the rows whose
    date is in ``train_date_set``, and finishes with a validation pass. The
    loss is ``reconstruction MSE + mean(w_i * squared_error_i)``, i.e. the
    supervised term is weighted per sample.

    Whenever the validation loss improves, the weights are written to
    ``saved_models/best_fold_{fold_i}_model.pt``. Training stops early after
    ``patience`` epochs without improvement.

    Args:
        model: network returning ``(reconstructed, mlp_out, latent)``.
        train_date_set: date ids used for training.
        val_date_set: date ids used for validation.
        chunk_files: parquet file paths to stream.
        exclude_cols: columns excluded from the features.
        feat_scaler: fitted feature scaler passed to ``load_and_filter_chunk``.
        targ_scaler: fitted target scaler passed to ``load_and_filter_chunk``.
        epochs: maximum number of epochs.
        lr: Adam learning rate.
        batch_size: batch size of the train and validation loaders.
        patience: early-stopping patience in epochs.
        device: ``torch.device`` to train on.
        fold_i: fold number used in log messages and the checkpoint name.

    Returns:
        The lowest validation loss observed (``inf`` when the fold has no
        validation data).
    """
    model.to(device)
    model.train()

    # Loss functions
    recon_criterion = nn.MSELoss()
    # reduction='none' keeps one squared error per sample so the per-sample
    # weights really weight the loss. With the default mean reduction they
    # would only rescale the whole batch loss by mean(w).
    supervised_criterion = nn.MSELoss(reduction='none')

    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

    # Gradient scaling is only needed for float16 autocast on CUDA.
    scaler = amp.GradScaler(enabled=(device.type == "cuda"))

    # Prepare validation data (loaded once, reused every epoch)
    val_feats_list, val_targs_list, val_wts_list = [], [], []
    for cf in chunk_files:
        vf, vt, vw = load_and_filter_chunk(cf, val_date_set, exclude_cols, feat_scaler, targ_scaler)
        if vf is not None and len(vf) > 0:
            val_feats_list.append(vf)
            val_targs_list.append(vt)
            val_wts_list.append(vw)

    if val_feats_list:
        val_feats = np.concatenate(val_feats_list, axis=0)
        val_targs = np.concatenate(val_targs_list, axis=0)
        val_wts = np.concatenate(val_wts_list, axis=0)
        val_ds = FinancialDataset(val_feats, val_targs, val_wts)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    else:
        logger.warning("No validation data found for this fold.")
        val_loader = None

    early_stopping = EarlyStopping(patience=patience)
    best_val_loss = float("inf")

    for epoch in range(1, epochs + 1):
        model.train()
        train_loss_sum = 0.0
        train_steps = 0

        for cf in chunk_files:
            feats, targs, wts = load_and_filter_chunk(cf, train_date_set, exclude_cols, feat_scaler, targ_scaler)
            if feats is None or len(feats) == 0:
                continue

            train_ds = FinancialDataset(feats, targs, wts)
            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)

            for batch_x, batch_y, batch_w in train_loader:
                if batch_x.size(0) < 2:
                    # BatchNorm layers (as used by AutoencoderWithMLP) cannot
                    # compute batch statistics from one sample in train mode.
                    continue

                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                batch_w = batch_w.to(device)

                optimizer.zero_grad(set_to_none=True)
                with amp.autocast(device_type=device.type):
                    reconstructed, mlp_out, _ = model(batch_x)
                    loss_recon = recon_criterion(reconstructed, batch_x)
                    # reshape(-1) flattens the predictions regardless of batch size.
                    per_sample_err = supervised_criterion(mlp_out.reshape(-1), batch_y)
                    loss_sup = (per_sample_err * batch_w).mean()
                    loss = loss_recon + loss_sup

                scaler.scale(loss).backward()
                # Unscale before clipping so the threshold applies to the true gradients.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()

                train_loss_sum += loss.item()
                train_steps += 1

        train_loss_avg = train_loss_sum / max(1, train_steps)

        # Validation Pass
        model.eval()
        val_loss_sum = 0.0
        val_steps = 0
        weighted_sse = 0.0
        weighted_sst = 0.0

        if val_loader is not None:
            with torch.no_grad():
                for batch_x, batch_y, batch_w in val_loader:
                    batch_x = batch_x.to(device)
                    batch_y = batch_y.to(device)
                    batch_w = batch_w.to(device)

                    with amp.autocast(device_type=device.type):
                        reconstructed, mlp_out, _ = model(batch_x)
                        loss_recon = recon_criterion(reconstructed, batch_x)
                        predictions = mlp_out.reshape(-1)
                        per_sample_err = supervised_criterion(predictions, batch_y)
                        loss_sup = (per_sample_err * batch_w).mean()
                        loss = loss_recon + loss_sup

                    val_loss_sum += loss.item()
                    val_steps += 1

                    # Compute R² components (weighted, around zero rather than the mean)
                    weighted_sse += (batch_w * (batch_y - predictions.float()) ** 2).sum().item()
                    weighted_sst += (batch_w * batch_y ** 2).sum().item()

        if val_steps > 0:
            val_loss_avg = val_loss_sum / val_steps
            r2 = 1 - (weighted_sse / weighted_sst) if weighted_sst != 0 else float('nan')
        else:
            val_loss_avg = float("inf")
            r2 = float('nan')

        # Scheduler step
        scheduler.step(val_loss_avg)

        logger.info(f"[Fold {fold_i} Epoch {epoch}/{epochs}] Train Loss: {train_loss_avg:.5f} | Val Loss: {val_loss_avg:.5f} | R²: {r2:.5f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

        # Save Best Model for this Fold. This runs before the early-stopping
        # check so that a new best epoch is never lost to the `break` below.
        if val_loss_avg < best_val_loss:
            best_val_loss = val_loss_avg
            model_path = f"saved_models/best_fold_{fold_i}_model.pt"
            torch.save(model.state_dict(), model_path)
            logger.info(f"Best model for Fold {fold_i} saved to {model_path}.")

        # Early Stopping
        early_stopping(val_loss_avg)
        if early_stopping.early_stop:
            logger.info("Early stopping triggered.")
            break

    return best_val_loss

#############################################
# 11) Optuna Objective Function
#############################################
def objective_rolling_tscv(trial):
    """Optuna objective: mean validation loss across the rolling CV folds.

    Samples one set of hyperparameters from ``trial``, trains a freshly
    initialised ``AutoencoderWithMLP`` on every ``(train_dates, val_dates)``
    pair in the module-level ``folds``, and averages the losses returned by
    ``train_one_fold``.

    Args:
        trial: Optuna trial used to sample ``hidden_dim``, ``dropout_rate``,
            ``noise_std`` and ``learning_rate``.

    Returns:
        The mean of the per-fold losses (``np.mean`` result).
    """
    # Architecture hyperparameters; sampled in this order, then the learning rate.
    arch_params = {
        "hidden_dim": trial.suggest_int("hidden_dim", 256, 1024, step=256),
        "dropout_rate": trial.suggest_float("dropout_rate", 0.1, 0.5),
        "noise_std": trial.suggest_float("noise_std", 0.0, 0.1),
    }
    trial_lr = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)

    # Settings shared by every fold of this trial. The epoch budget is kept
    # small so that a single trial stays cheap.
    shared_train_args = {
        "chunk_files": all_part_files,
        "exclude_cols": EXCLUDE_COLS,
        "feat_scaler": feature_scaler,
        "targ_scaler": target_scaler,
        "epochs": 3,
        "lr": trial_lr,
        "batch_size": 512,
        "patience": 5,
        "device": device,
    }

    fold_losses = []
    for fold_i, (train_dates, val_dates) in enumerate(folds, start=1):
        logger.info(f"Starting Fold {fold_i} with {len(train_dates)} training dates and {len(val_dates)} validation dates.")

        # Every fold starts from a newly built, re-initialised network.
        fold_model = AutoencoderWithMLP(input_dim=GLOBAL_INPUT_DIM, **arch_params)
        initialize_weights(fold_model)

        val_loss = train_one_fold(
            model=fold_model,
            train_date_set=set(train_dates),
            val_date_set=set(val_dates),
            fold_i=fold_i,
            **shared_train_args,
        )
        fold_losses.append(val_loss)
        logger.info(f"Fold {fold_i} completed with Val Loss: {val_loss:.5f}")

    avg_val_loss = np.mean(fold_losses)
    logger.info(f"Average Validation Loss for Trial: {avg_val_loss:.5f}")
    return avg_val_loss

#############################################
# 12) Evaluation on Separate Test Set
#############################################
def evaluate_on_test_set(best_model_paths, test_loader, device, model_kwargs=None):
    """
    Evaluate each fold's saved model on the separate test set and save the metrics.

    For every checkpoint the model is rebuilt, the state dict is loaded, and two
    metrics are computed over ``test_loader``:

    * ``test_loss``: mean over batches of
      ``MSE(reconstruction, x) + mean(w * (pred - y) ** 2)``.
    * ``r2_score``: ``1 - sum(w * (y - pred) ** 2) / sum(w * y ** 2)``
      (NaN when the denominator is zero or no batch was seen).

    Per-fold results go to ``saved_models/test_evaluation_results.json`` and the
    across-fold averages to ``saved_models/aggregate_test_evaluation.json``.

    Args:
        best_model_paths: Iterable of state-dict checkpoint paths, one per fold.
        test_loader: Iterable yielding ``(features, target, weight)`` batches.
        device: ``torch.device`` on which the models are evaluated.
        model_kwargs: Optional dict of extra ``AutoencoderWithMLP`` constructor
            arguments (e.g. ``hidden_dim``). Must match the architecture the
            checkpoints were trained with; when omitted only ``input_dim`` is
            passed, exactly as before.

    Returns:
        None.
    """
    import os  # local import so the function does not rely on a cell-level one

    recon_criterion = nn.MSELoss()
    # Per-sample losses are needed so that each weight scales its own sample;
    # multiplying an already-averaged MSE by the weights would only rescale
    # the batch loss by the mean weight.
    supervised_criterion = nn.MSELoss(reduction="none")
    extra_model_args = dict(model_kwargs) if model_kwargs else {}

    results = []
    for fold_i, model_path in enumerate(best_model_paths, start=1):
        logger.info(f"Loading Best Model for Fold {fold_i} from {model_path}.")
        model = AutoencoderWithMLP(input_dim=GLOBAL_INPUT_DIM, **extra_model_args)
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()

        total_loss = 0.0
        val_steps = 0
        weighted_sse = 0.0
        weighted_sst = 0.0

        with torch.no_grad():
            for batch_x, batch_y, batch_w in test_loader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                batch_w = batch_w.to(device)

                with amp.autocast(device_type=device.type):
                    reconstructed, mlp_out, _ = model(batch_x)
                    # squeeze(-1) keeps the batch axis even for a 1-sample batch.
                    preds = mlp_out.squeeze(-1)
                    loss_recon = recon_criterion(reconstructed, batch_x)
                    loss_sup = (supervised_criterion(preds, batch_y) * batch_w).mean()
                    loss = loss_recon + loss_sup

                total_loss += loss.item()
                val_steps += 1

                # Compute R² components in full precision.
                preds = preds.float()
                weighted_sse += (batch_w * (batch_y - preds) ** 2).sum().item()
                weighted_sst += (batch_w * batch_y ** 2).sum().item()

        if val_steps > 0:
            avg_loss = total_loss / val_steps
            r2 = 1 - (weighted_sse / weighted_sst) if weighted_sst != 0 else float('nan')
        else:
            avg_loss = float("inf")
            r2 = float('nan')

        results.append({
            "fold": fold_i,
            "model_path": model_path,
            "test_loss": avg_loss,
            "r2_score": r2
        })

        logger.info(f"[Test Evaluation] Fold {fold_i} | Test Loss: {avg_loss:.5f} | R²: {r2:.5f}")

    # Aggregate Results (NaN instead of a NumPy warning when a list is empty)
    test_losses = [res["test_loss"] for res in results]
    valid_r2 = [res["r2_score"] for res in results if not np.isnan(res["r2_score"])]
    avg_test_loss = float(np.mean(test_losses)) if test_losses else float("nan")
    avg_r2 = float(np.mean(valid_r2)) if valid_r2 else float("nan")

    logger.info(f"Average Test Loss across all folds: {avg_test_loss:.5f}")
    logger.info(f"Average Test R² across all folds: {avg_r2:.5f}")

    # Save Test Results
    os.makedirs("saved_models", exist_ok=True)
    with open("saved_models/test_evaluation_results.json", "w") as f:
        json.dump(results, f)
    with open("saved_models/aggregate_test_evaluation.json", "w") as f:
        json.dump({
            "average_test_loss": avg_test_loss,
            "average_test_r2": avg_r2
        }, f)

    logger.info("Test evaluation results saved.")

#############################################
# 13) Main Execution
#############################################
# Script entry point. Stages, in order:
#   (A) fit the feature/target scalers, (B) derive the model input width,
#   (C) Optuna search, (D) retrain every fold with the best parameters,
#   (E) evaluate the saved fold models on the separate test file.
# objective_rolling_tscv reads feature_scaler, target_scaler and
# GLOBAL_INPUT_DIM as module-level globals, so (A) and (B) must run before (C).
if __name__ == "__main__":
    #########################################
    # (A) Build Scalers from All Training Data
    #########################################
    feature_scaler, target_scaler = build_scalers_from_training_data(folds, all_part_files, EXCLUDE_COLS, target_col="responder_6")

    #########################################
    # (B) Determine Input Dimension
    #########################################
    # Assuming all chunks have the same structure
    df_tmp = pl.read_parquet(all_part_files[0])
    # Feature columns = everything not excluded and not a responder_* column.
    numeric_cols = [c for c in df_tmp.columns if (c not in EXCLUDE_COLS) and (not c.startswith("responder_"))]
    GLOBAL_INPUT_DIM = len(numeric_cols)
    logger.info(f"GLOBAL_INPUT_DIM set to {GLOBAL_INPUT_DIM}.")
    del df_tmp

    #########################################
    # (C) Optuna Hyperparameter Tuning
    #########################################
    # The objective returns a mean validation loss, hence direction="minimize".
    study = optuna.create_study(direction="minimize")
    study.optimize(objective_rolling_tscv, n_trials=100, n_jobs=1)  # Set n_jobs=1 to avoid potential issues with DataLoader
    logger.info("Optuna hyperparameter tuning completed.")
    logger.info(f"Best Trial: {study.best_trial.value}")
    logger.info(f"Best Params: {study.best_params}")

    # Save Best Hyperparameters
    with open("saved_models/best_hyperparameters.json", "w") as f:
        json.dump(study.best_params, f)
    logger.info("Best hyperparameters saved to 'saved_models/best_hyperparameters.json'.")

    #########################################
    # (D) Final Training with Best Hyperparameters
    #########################################
    best_params = study.best_params
    final_fold_losses = []
    final_epochs = 75  # Adjust based on available resources

    best_model_paths = []  # To store paths of best models per fold

    for fold_i, (train_dates, val_dates) in enumerate(folds, start=1):
        logger.info(f"Starting Final Training for Fold {fold_i}.")

        # Fresh model per fold, built with the tuned architecture.
        final_model = AutoencoderWithMLP(
            input_dim=GLOBAL_INPUT_DIM,
            hidden_dim=best_params["hidden_dim"],
            dropout_rate=best_params["dropout_rate"],
            noise_std=best_params["noise_std"]
        )
        initialize_weights(final_model)

        val_loss = train_one_fold(
            model=final_model,
            train_date_set=set(train_dates),
            val_date_set=set(val_dates),
            chunk_files=all_part_files,
            exclude_cols=EXCLUDE_COLS,
            feat_scaler=feature_scaler,
            targ_scaler=target_scaler,
            epochs=final_epochs,
            lr=best_params["learning_rate"],
            batch_size=512,
            patience=5,
            device=device,
            fold_i=fold_i
        )
        final_fold_losses.append(val_loss)
        logger.info(f"[FINAL Fold {fold_i}] Validation Loss: {val_loss:.5f}")

        # Save Final Model for Each Fold
        # NOTE(review): this stores whatever weights final_model holds once
        # train_one_fold returns (presumably the last epoch), while
        # train_one_fold separately writes its lowest-val-loss checkpoint to
        # saved_models/best_fold_{fold_i}_model.pt. Confirm which of the two
        # should be listed in best_model_paths.
        final_model_path = f"saved_models/final_fold_{fold_i}_model.pt"
        torch.save(final_model.state_dict(), final_model_path)
        logger.info(f"Final model for Fold {fold_i} saved to {final_model_path}.")
        best_model_paths.append(final_model_path)

    overall_loss = np.mean(final_fold_losses)
    logger.info(f"Final Average Validation Loss over {len(folds)} folds: {overall_loss:.5f}")

    # Save Overall Metrics
    with open("saved_models/final_validation_loss.json", "w") as f:
        json.dump({"average_val_loss": overall_loss}, f)
    logger.info("Final validation loss metrics saved.")

    #########################################
    # (E) Evaluation on Separate Test Set
    #########################################
    test_file = "/home/jupyter/data/jane-street/test.parquet"
    logger.info(f"Loading separate test set from {test_file}.")

    # Load the test set
    # (used below only to collect the distinct date_id values)
    test_df = pl.read_parquet(test_file)

    # Preprocess the test set
    # Assuming the test set has the same structure as the training/validation sets
    test_dates = test_df["date_id"].unique().to_list()
    logger.info(f"Test set contains {len(test_dates)} unique dates.")

    # Since the test set is separate, no need to split further
    # Passing every date in the file means the date filter keeps all rows;
    # the scalers fitted in (A) are reused.
    test_feats, test_targs, test_wts = load_and_filter_chunk(
        parquet_file=test_file,
        date_set=set(test_dates),
        exclude_cols=EXCLUDE_COLS,
        feat_scaler=feature_scaler,
        targ_scaler=target_scaler,
        target_col="responder_6",
        weight_col="weight"
    )

    if test_feats is None or len(test_feats) == 0:
        logger.error("Test set is empty after filtering. Aborting test evaluation.")
    else:
        # Create Test Dataset and DataLoader
        test_dataset = FinancialDataset(test_feats, test_targs, test_wts)
        test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=2, pin_memory=True)

        # Evaluate each best model on the test set
        # NOTE(review): the tuned architecture in best_params is not passed to
        # evaluate_on_test_set; confirm it rebuilds the model with matching
        # layer sizes before loading these checkpoints.
        evaluate_on_test_set(best_model_paths, test_loader, device)


In [None]:
#did not evaluate
import os
import glob
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pickle
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from torch import amp
from torch.optim.lr_scheduler import ReduceLROnPlateau
import logging

#############################################
# Setup Logging
#############################################
# Root logging configuration: INFO and above, one stream handler,
# each record prefixed with its timestamp and level name.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Logger used by the rest of this script, named after the current module.
logger = logging.getLogger(__name__)

#############################################
# 1) Reproducibility Setup
#############################################
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, the generators of all CUDA devices are seeded too.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Same order as always: Python, NumPy, then the torch CPU generator.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed all random number generators with a fixed value.
set_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
logger.info(f"Using device: {device}")

#############################################
# 2) Paths and Constants
#############################################
# Directory holding the partitioned parquet files; they are processed in sorted order.
PARTITIONED_DIR = "/home/jupyter/data/partitioned/"
all_part_files = sorted(glob.glob(os.path.join(PARTITIONED_DIR, "*.parquet")))
logger.info(f"Found {len(all_part_files)} partitioned files.")
if not all_part_files:
    # Fail here with a clear message instead of an IndexError on all_part_files[0].
    raise FileNotFoundError(f"No parquet files found in {PARTITIONED_DIR}")

# Columns that are never model inputs: identifiers, the sample weight and
# every responder_* column found in the first partition.
EXCLUDE_COLS = ["date_id", "time_id", "symbol_id", "weight", "partition_id"]
# Only the column names are needed, so read the parquet schema rather than
# loading the whole partition into memory.
EXCLUDE_COLS += [
    col for col in pl.read_parquet_schema(all_part_files[0])
    if col.startswith("responder_")
]

# Best hyperparameters (from the second script)
HIDDEN_DIM = 512
DROPOUT_RATE = 0.2537
NOISE_STD = 0.0017
LEARNING_RATE = 4.16276e-05
BATCH_SIZE = 512
EPOCHS = 75

#############################################
# 3) Data Classes
#############################################
class FinancialDataset(Dataset):
    """In-memory dataset of (features, target, weight) triples.

    The three inputs are copied into float32 tensors on construction and are
    indexed in parallel.
    """

    @staticmethod
    def _as_float32(values):
        # Always builds a new float32 tensor from the given array-like.
        return torch.tensor(values, dtype=torch.float32)

    def __init__(self, features, target, weights):
        self.features = self._as_float32(features)
        self.target = self._as_float32(target)
        self.weights = self._as_float32(weights)

    def __len__(self):
        # The number of samples is the length of the feature tensor.
        return len(self.features)

    def __getitem__(self, idx):
        # Same index applied to each tensor, returned as a 3-tuple.
        return tuple(
            tensor[idx] for tensor in (self.features, self.target, self.weights)
        )

class GaussianNoise(nn.Module):
    """Adds zero-mean Gaussian noise to its input while in training mode.

    Args:
        std: Standard deviation of the noise (default 0.05). With a value that
            is not greater than zero the layer is an identity.
    """

    def __init__(self, std=0.05):
        super().__init__()
        self.std = std

    def forward(self, x):
        # Identity in eval mode or when the noise level is not positive.
        noise_enabled = self.training and self.std > 0
        if not noise_enabled:
            return x

        noise = torch.randn_like(x) * self.std
        return x + noise

class AutoencoderWithMLP(nn.Module):
    """Denoising autoencoder with a supervised MLP head.

    The encoder and bottleneck compress a noised copy of the input to a latent
    of width ``hidden_dim // 4``; the decoder reconstructs the input from it;
    the MLP predicts one value from the clean input concatenated with the latent.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the widest hidden layer.
        dropout_rate: Dropout probability used in the encoder and the MLP.
        noise_std: Standard deviation passed to ``GaussianNoise``.
    """

    @staticmethod
    def _dense_block(in_dim, out_dim, dropout_rate=None):
        # Linear -> SiLU -> BatchNorm1d, plus Dropout when a rate is given.
        layers = [nn.Linear(in_dim, out_dim), nn.SiLU(), nn.BatchNorm1d(out_dim)]
        if dropout_rate is not None:
            layers.append(nn.Dropout(dropout_rate))
        return layers

    def __init__(self, input_dim, hidden_dim, dropout_rate, noise_std):
        super().__init__()
        half_dim = hidden_dim // 2
        latent_dim = hidden_dim // 4
        block = self._dense_block

        self.noise = GaussianNoise(std=noise_std)

        # Encoder: two dense blocks with dropout, then a linear bottleneck.
        self.encoder = nn.Sequential(
            *block(input_dim, hidden_dim, dropout_rate),
            *block(hidden_dim, half_dim, dropout_rate),
        )
        self.bottleneck = nn.Linear(half_dim, latent_dim)

        # Decoder: mirrors the encoder without dropout, ends at the input width.
        self.decoder = nn.Sequential(
            *block(latent_dim, half_dim),
            *block(half_dim, hidden_dim),
            nn.Linear(hidden_dim, input_dim),
        )

        # Prediction head over [input, latent], producing a single output unit.
        self.mlp = nn.Sequential(
            *block(input_dim + latent_dim, hidden_dim, dropout_rate),
            *block(hidden_dim, half_dim, dropout_rate),
            nn.Linear(half_dim, 1),
        )

    def forward(self, x):
        """Return ``(reconstructed, mlp_output, latent)`` for input ``x``.

        ``x`` is expected to be ``(batch, input_dim)``; the concatenation for
        the MLP happens along dimension 1.
        """
        # Only the encoder path sees the noised input; the MLP gets the clean x.
        latent = self.bottleneck(self.encoder(self.noise(x)))
        reconstructed = self.decoder(latent)
        mlp_output = self.mlp(torch.cat([x, latent], dim=1))
        return reconstructed, mlp_output, latent

def initialize_weights(model):
    """Re-initialise every ``nn.Linear`` inside ``model`` in place.

    Weights are drawn with Kaiming-normal initialisation (``nonlinearity='relu'``)
    and biases, where present, are set to zero. Other module types are left
    untouched.
    """
    linear_layers = (
        module for module in model.modules() if isinstance(module, nn.Linear)
    )
    for layer in linear_layers:
        nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')
        if layer.bias is None:
            continue
        nn.init.zeros_(layer.bias)

#############################################
# 4) Standardization & Scalers
#############################################
def build_scalers(chunk_files, exclude_cols):
    """Fit a ``StandardScaler`` on the feature columns of all parquet chunks.

    The scaler is updated one file at a time with ``partial_fit``, so only a
    single chunk is held in memory instead of a stacked copy of every chunk.
    The fitted scaler is pickled to ``saved_scalers/feature_scaler.pkl``.

    Args:
        chunk_files: Paths of the parquet files to read.
        exclude_cols: Column names that are not features.

    Returns:
        The fitted ``StandardScaler``.

    Raises:
        ValueError: If ``chunk_files`` is empty or no file contains any rows.
    """
    if not chunk_files:
        raise ValueError("build_scalers needs at least one parquet file.")

    excluded = set(exclude_cols)
    feature_scaler = StandardScaler()
    rows_seen = 0
    for file in chunk_files:
        df = pl.read_parquet(file)
        numeric_cols = [col for col in df.columns if col not in excluded]
        features = df.select(numeric_cols).to_numpy()
        if features.shape[0] == 0:
            # partial_fit rejects empty input; an empty chunk adds nothing anyway.
            continue
        feature_scaler.partial_fit(features)
        rows_seen += features.shape[0]

    if rows_seen == 0:
        raise ValueError("No rows found in the given parquet files; cannot fit the scaler.")

    # Save scaler (create the target directory if it is missing)
    os.makedirs("saved_scalers", exist_ok=True)
    with open("saved_scalers/feature_scaler.pkl", "wb") as f:
        pickle.dump(feature_scaler, f)

    return feature_scaler

# Fit the feature scaler over every partition file.
feature_scaler = build_scalers(all_part_files, EXCLUDE_COLS)

# Model input width: count of columns in the first partition that are not excluded.
GLOBAL_INPUT_DIM = sum(
    col not in EXCLUDE_COLS for col in pl.read_parquet(all_part_files[0]).columns
)

#############################################
# 5) Model Training Function
#############################################
def train_model():
    """Train one ``AutoencoderWithMLP`` on all partition files and save it.

    Uses the module-level hyperparameters (``HIDDEN_DIM``, ``DROPOUT_RATE``,
    ``NOISE_STD``, ``LEARNING_RATE``, ``BATCH_SIZE``, ``EPOCHS``). Every epoch
    streams the files in ``all_part_files`` one at a time and minimises

        MSE(reconstruction, x) + mean(weight * (prediction - responder_6) ** 2)

    under mixed precision. The learning rate is halved by ``ReduceLROnPlateau``
    when the mean per-batch training loss stops improving. The final weights
    are written to ``saved_models/final_model.pt``.

    Returns:
        None.
    """
    model = AutoencoderWithMLP(
        input_dim=GLOBAL_INPUT_DIM,
        hidden_dim=HIDDEN_DIM,
        dropout_rate=DROPOUT_RATE,
        noise_std=NOISE_STD
    )
    initialize_weights(model)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    recon_criterion = nn.MSELoss()
    # Per-sample losses, so each weight scales its own sample. Multiplying an
    # already-averaged MSE by the weights only rescales the batch loss by the
    # mean weight.
    supervised_criterion = nn.MSELoss(reduction="none")
    scaler_amp = amp.GradScaler()

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss = 0.0
        num_steps = 0
        for cf in all_part_files:
            df = pl.read_parquet(cf)
            numeric_cols = [col for col in df.columns if col not in EXCLUDE_COLS]
            features = feature_scaler.transform(df.select(numeric_cols).to_numpy())
            targets = df["responder_6"].to_numpy()
            weights = df["weight"].to_numpy()

            dataset = FinancialDataset(features, targets, weights)
            loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

            for batch_x, batch_y, batch_w in loader:
                if batch_x.size(0) < 2:
                    # BatchNorm1d cannot compute batch statistics from a single
                    # sample in training mode; skip a trailing 1-sample batch.
                    continue
                batch_x, batch_y, batch_w = batch_x.to(device), batch_y.to(device), batch_w.to(device)

                optimizer.zero_grad()
                with amp.autocast(device_type=device.type):
                    reconstructed, mlp_output, _ = model(batch_x)
                    loss_recon = recon_criterion(reconstructed, batch_x)
                    # squeeze(-1) drops only the output-unit axis, never the batch axis.
                    per_sample = supervised_criterion(mlp_output.squeeze(-1), batch_y)
                    loss_sup = (per_sample * batch_w).mean()
                    loss = loss_recon + loss_sup

                scaler_amp.scale(loss).backward()
                scaler_amp.step(optimizer)
                scaler_amp.update()
                total_loss += loss.item()
                num_steps += 1

        # Average over optimisation steps; total_loss is a sum of per-batch
        # losses, so dividing by the number of files would not be an average.
        avg_loss = total_loss / num_steps if num_steps else float("inf")
        scheduler.step(avg_loss)
        logger.info(f"Epoch {epoch} | Avg Loss: {avg_loss:.5f}")

    # Make sure the output directory exists so the run is not lost at save time.
    os.makedirs("saved_models", exist_ok=True)
    torch.save(model.state_dict(), "saved_models/final_model.pt")
    logger.info("Model training completed and saved!")

#############################################
# 6) Execute Training
#############################################
# Start training only when this code runs as the main module, not when imported.
if __name__ == "__main__":
    train_model()
