# Training Transformer Model

## Imports

In [1]:
import pathlib, os, sys
import pyspark, pandas as pd, numpy as np
import tqdm

import mlflow
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    accuracy_score, precision_recall_fscore_support, roc_curve
)

import matplotlib.pyplot as plt
import seaborn as sns

"""Transformer models for financial market prediction"""

# filepath: src_clean/training/transformers/config.py
"""Configuration for transformer models"""

class TransformerConfig:
    """Hyperparameters read by FinancialTransformer and the trainer classes.

    Every value is a plain class attribute, so it can be read either from the
    class itself or from an instance created with ``TransformerConfig()``.
    """
    # Model architecture
    MODEL_TYPE = "transformer"
    D_MODEL = 512  # width of the feature embedding and of each encoder layer (d_model)
    N_HEADS = 4  # attention heads per encoder layer (nhead)
    N_LAYERS = 2  # number of stacked encoder layers
    D_FF = 2048  # dim_feedforward of each encoder layer
    DROPOUT = 0.2  # dropout used in the encoder layers and in the output head
    
    # Training
    BATCH_SIZE = 256  # batch size of every DataLoader built by the trainers
    LEARNING_RATE = 1e-4  # learning rate passed to Adam
    MAX_EPOCHS = 10  # upper bound on epochs; early stopping may end training sooner
    EARLY_STOPPING_PATIENCE = 3  # read by RollingWindowTransformerTrainer; SimpleTransformerTrainer uses its own fixed patience
    SEQUENCE_LENGTH = 50  # Time steps to look back

In [2]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

def monitor_gpu_usage(device=0):
    """Print allocated, cached and free memory for one CUDA device.

    Args:
        device: Index of the CUDA device to report on. Defaults to 0, which
            is the device the function always reported on before the
            parameter existed.

    Returns:
        None. The three figures are printed in GB; nothing is printed when
        CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1e9
    allocated = torch.cuda.memory_allocated(device)
    reserved = torch.cuda.memory_reserved(device)
    total = torch.cuda.get_device_properties(device).total_memory

    print(f"GPU Memory Allocated: {allocated / bytes_per_gb:.2f} GB")
    print(f"GPU Memory Cached: {reserved / bytes_per_gb:.2f} GB")
    # "Free" is total minus *allocated* bytes, so memory that is reserved by
    # the caching allocator but not in use by a tensor still counts as free.
    print(f"GPU Memory Free: {(total - allocated) / bytes_per_gb:.2f} GB")

# Test GPU with a simple operation
print("Testing GPU with simple tensor operations...")
if torch.cuda.is_available():
    # Smoke test: build a 1000x1000 random matrix, move it to the GPU and
    # multiply it by itself there.
    test_tensor = torch.randn(1000, 1000).cuda()
    test_result = torch.mm(test_tensor, test_tensor)
    print(f"GPU test successful! Result shape: {test_result.shape}")
    print(f"Tensor is on CUDA: {test_result.is_cuda}")
    monitor_gpu_usage()
    # Drop the scratch tensors and hand their cached blocks back to the driver
    del test_tensor, test_result
    torch.cuda.empty_cache()
else:
    print("CUDA not available")
    
# Report the device name and total memory, then configure the allocator
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    
    # Clear GPU cache
    torch.cuda.empty_cache()
    
    # Set memory fraction (optional)
    # Limits this process to 80% of the device memory; no device index is
    # passed, so PyTorch applies it to the current device.
    torch.cuda.set_per_process_memory_fraction(0.8)
else:
    print("GPU not available, using CPU")

Using device: cuda
Testing GPU with simple tensor operations...
GPU test successful! Result shape: torch.Size([1000, 1000])
Tensor is on CUDA: True
GPU Memory Allocated: 0.02 GB
GPU Memory Cached: 0.02 GB
GPU Memory Free: 12.86 GB
GPU: NVIDIA GeForce RTX 4080 Laptop GPU
Memory: 12.9 GB


In [3]:
class FinancialTransformer(nn.Module):
    """Transformer encoder that turns a window of feature vectors into one logit.

    Each time step is projected to ``config.D_MODEL`` dimensions, a learned
    positional embedding is added, the sequence is passed through
    ``config.N_LAYERS`` encoder layers, and the hidden state of the last time
    step is fed to a small MLP head. The head returns a raw logit (no
    sigmoid), which is what ``nn.BCEWithLogitsLoss`` expects.

    Args:
        config: Object exposing ``D_MODEL``, ``N_HEADS``, ``N_LAYERS``,
            ``D_FF``, ``DROPOUT`` and ``SEQUENCE_LENGTH``.
        n_features: Number of input features per time step.
    """

    def __init__(self, config: "TransformerConfig", n_features=71):
        super().__init__()
        self.config = config
        self.n_features = n_features

        # Input embedding for financial features
        self.feature_embedding = nn.Linear(self.n_features, config.D_MODEL)

        # Learned positional encoding: one row per position, so the model can
        # only handle windows of at most SEQUENCE_LENGTH steps.
        self.pos_encoding = nn.Parameter(
            torch.randn(config.SEQUENCE_LENGTH, config.D_MODEL)
        )

        # Transformer encoder (batch_first, so inputs are (batch, seq, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.D_MODEL,
            nhead=config.N_HEADS,
            dim_feedforward=config.D_FF,
            dropout=config.DROPOUT,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, config.N_LAYERS)

        # Output head
        self.output_head = nn.Sequential(
            nn.Linear(config.D_MODEL, config.D_MODEL // 2),
            nn.ReLU(),
            nn.Dropout(config.DROPOUT),
            nn.Linear(config.D_MODEL // 2, 1)  # Binary classification output
        )

    def forward(self, x):
        """Compute one logit per input window.

        Args:
            x: Tensor of shape ``(batch_size, sequence_length, n_features)``
                with ``sequence_length <= config.SEQUENCE_LENGTH``.

        Returns:
            Tensor of shape ``(batch_size, 1)`` holding raw logits.

        Raises:
            ValueError: If ``x`` is not 3-dimensional or its sequence is
                longer than the positional-encoding table.
        """
        if x.dim() != 3:
            raise ValueError(
                f"Expected input of shape (batch, seq_len, n_features), got {tuple(x.shape)}"
            )

        seq_len = x.shape[1]
        max_len = self.pos_encoding.shape[0]
        if seq_len > max_len:
            # Without this check the addition below fails with an opaque
            # broadcasting error, because slicing silently returns max_len rows.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the {max_len} positions "
                "available in the positional encoding"
            )

        embedded = self.feature_embedding(x)  # (batch_size, seq_len, d_model)
        embedded = embedded + self.pos_encoding[:seq_len, :].unsqueeze(0)
        encoded = self.transformer(embedded)  # (batch_size, seq_len, d_model)
        last_hidden = encoded[:, -1, :]  # (batch_size, d_model)
        return self.output_head(last_hidden)  # (batch_size, 1)
    
class RollingWindowSplit:
    """Generates rolling train/validation windows over a timestamped frame.

    Each split is a training window of ``train_window_months`` followed
    directly by a validation window of ``val_window_months``; consecutive
    splits are shifted by ``step_months``.

    Args:
        train_window_months: Length of each training window, in months.
        val_window_months: Length of each validation window, in months.
        step_months: Shift between consecutive splits, in months. Must be at
            least 1.

    Raises:
        ValueError: If ``step_months`` is smaller than 1.
    """

    def __init__(self, train_window_months=6, val_window_months=1, step_months=1):
        if step_months < 1:
            # A zero or negative step never moves the window past end_date,
            # so split() would loop forever.
            raise ValueError(f"step_months must be >= 1, got {step_months}")
        self.train_window = pd.DateOffset(months=train_window_months)
        self.val_window = pd.DateOffset(months=val_window_months)
        self.step = pd.DateOffset(months=step_months)

    def split(self, df, start_date, end_date):
        """Generate rolling window splits.

        Args:
            df: DataFrame with an ``event_timestamp`` column comparable to
                ``start_date`` / ``end_date``.
            start_date: Start of the first training window.
            end_date: Latest allowed end of a validation window (inclusive).

        Returns:
            List of dicts with keys ``train_idx``, ``val_idx`` (index labels
            of ``df``), ``train_start``, ``train_end``, ``val_start`` and
            ``val_end``. Windows are half-open: ``[start, end)``. Splits
            whose training or validation window holds no rows are omitted.
        """
        timestamps = df['event_timestamp']
        splits = []
        current_date = start_date + self.train_window + self.val_window

        while current_date <= end_date:
            # Window boundaries are derived backwards from the validation end
            train_start = current_date - self.train_window - self.val_window
            train_end = current_date - self.val_window
            val_start = train_end
            val_end = current_date

            train_mask = (timestamps >= train_start) & (timestamps < train_end)
            val_mask = (timestamps >= val_start) & (timestamps < val_end)

            if train_mask.any() and val_mask.any():
                splits.append({
                    # Index the labels directly instead of copying the
                    # selected rows just to read their index.
                    'train_idx': df.index[train_mask.to_numpy()],
                    'val_idx': df.index[val_mask.to_numpy()],
                    'train_start': train_start,
                    'train_end': train_end,
                    'val_start': val_start,
                    'val_end': val_end
                })

            current_date += self.step

        return splits


In [4]:
import subprocess, time
def monitor_gpu_realtime():
    """Print current GPU utilisation and memory use as reported by nvidia-smi.

    Prints one line per GPU listed by ``nvidia-smi``; when more than one GPU
    is listed, each line is prefixed with ``[GPU <n>]``. Best-effort: every
    failure is reported with a printed message and never raises.
    """
    if not torch.cuda.is_available():
        print("CUDA not available")
        return

    query = [
        'nvidia-smi',
        '--query-gpu=utilization.gpu,memory.used,memory.total',
        '--format=csv,noheader,nounits',
    ]
    try:
        # The timeout keeps a hung driver from stalling the training loop
        result = subprocess.run(query, capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError):
        # Missing binary, permission problem or timeout. Anything else
        # (e.g. KeyboardInterrupt) is deliberately left to propagate.
        print("nvidia-smi not available")
        return

    if result.returncode != 0:
        print("Could not get nvidia-smi output")
        return

    # nvidia-smi emits one CSV row per GPU
    rows = [line.split(',') for line in result.stdout.splitlines() if line.strip()]
    if not rows or any(len(fields) != 3 for fields in rows):
        print("Could not get nvidia-smi output")
        return

    for gpu_index, fields in enumerate(rows):
        gpu_util, mem_used, mem_total = (field.strip() for field in fields)
        prefix = f"[GPU {gpu_index}] " if len(rows) > 1 else ""
        print(f"{prefix}GPU Utilization: {gpu_util}%, Memory: {mem_used}MB / {mem_total}MB")

class RollingWindowTransformerTrainer:
    """Trains one FinancialTransformer per rolling-window fold.

    For every fold a fresh model is trained with early stopping on the fold's
    validation window. The model and scaler of the last trained fold are then
    used to score an out-of-time test set.

    Args:
        config: Object exposing the hyperparameters read below
            (``SEQUENCE_LENGTH``, ``BATCH_SIZE``, ``LEARNING_RATE``,
            ``MAX_EPOCHS``, ``EARLY_STOPPING_PATIENCE`` and the model fields).
        feature_list: Optional explicit list of feature columns. When empty,
            every column except the timestamp/target/id columns is used.
    """

    def __init__(self, config: "TransformerConfig", feature_list=None):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.feature_list = feature_list if feature_list is not None else []
        self.final_scaler = None  # Scaler of the most recently trained fold; reused for the test set

    def train_rolling_window(self, df, splits, test_df):
        """Train with rolling window cross-validation.

        Args:
            df: Frame holding the feature columns, ``event_timestamp`` and
                ``target_classification``.
            splits: Split dicts as produced by ``RollingWindowSplit.split``.
            test_df: Out-of-time frame with the same columns as ``df``.

        Returns:
            ``(fold_results, test_metrics)``: a list with one metrics dict per
            trained fold, and the metrics dict of the last trained model on
            ``test_df``.

        Raises:
            ValueError: If ``splits`` is empty or no fold has enough rows to
                build a single sequence.
        """
        if len(splits) == 0:
            raise ValueError("No rolling-window splits were provided.")

        # Prepare features
        excluded_cols = ['event_timestamp', 'target_classification', 'time', 'instrument']
        feature_cols = self.feature_list if self.feature_list else [
            col for col in df.columns if col not in excluded_cols
        ]
        self.n_features = len(feature_cols)

        min_rows = self.config.SEQUENCE_LENGTH
        fold_results = []
        model = None

        for fold, split in enumerate(splits):
            print(f"\n=== Fold {fold + 1}/{len(splits)} ===")
            print(f"Train: {split['train_start']} to {split['train_end']}")
            print(f"Val: {split['val_start']} to {split['val_end']}")

            # Get data for this fold
            train_data = df.loc[split['train_idx']]
            val_data = df.loc[split['val_idx']]

            if len(train_data) < min_rows or len(val_data) < min_rows:
                # Too few rows to form even one window; skip instead of
                # crashing halfway through the cross-validation run.
                print(f"Skipping fold {fold + 1}: fewer than {min_rows} rows in train or validation window")
                continue

            # EXPANDING WINDOW: fit the scaler on everything strictly before
            # the end of the training window. The validation window starts at
            # train_end (inclusive), so using "<=" here would leak its first
            # rows into the scaler statistics.
            historical_data = df[df['event_timestamp'] < split['train_end']]
            fold_scaler = StandardScaler()
            fold_scaler.fit(historical_data[feature_cols])

            # Transform with this fold's scaler
            train_features = fold_scaler.transform(train_data[feature_cols])
            val_features = fold_scaler.transform(val_data[feature_cols])

            # Create datasets
            train_dataset = self._create_dataset(
                train_features,
                train_data['target_classification'].values
            )
            val_dataset = self._create_dataset(
                val_features,
                val_data['target_classification'].values
            )

            # Create data loaders
            train_loader = DataLoader(
                train_dataset,
                batch_size=self.config.BATCH_SIZE,
                shuffle=True,
                pin_memory=torch.cuda.is_available(),
                num_workers=0
            )
            val_loader = DataLoader(
                val_dataset,
                batch_size=self.config.BATCH_SIZE,
                shuffle=False,
                pin_memory=torch.cuda.is_available(),
                num_workers=0
            )

            # Train model for this fold
            model = self._train_fold(train_loader, val_loader, fold)

            # Keep the scaler that belongs to the model we may evaluate on the
            # test set, so model and scaler always come from the same fold.
            self.final_scaler = fold_scaler

            # Evaluate on validation set
            val_metrics = self._evaluate_model(model, val_loader)
            fold_results.append(val_metrics)

            # Clean up GPU memory after each fold
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        if model is None:
            raise ValueError("No fold had enough rows to train a model.")

        # Final evaluation on out-of-time test set
        test_metrics = self._evaluate_on_test(model, test_df, feature_cols)

        return fold_results, test_metrics

    def _evaluate_on_test(self, model, test_df, feature_cols):
        """Evaluate on out-of-time test set.

        Raises:
            ValueError: If no fold has been trained yet (no scaler stored).
        """
        if self.final_scaler is None:
            raise ValueError("No scaler available. Train the model first.")

        # Use the scaler of the last trained fold
        test_features = self.final_scaler.transform(test_df[feature_cols])

        test_dataset = self._create_dataset(
            test_features,
            test_df['target_classification'].values
        )
        test_loader = DataLoader(
            test_dataset,
            batch_size=self.config.BATCH_SIZE,
            shuffle=False
        )

        return self._evaluate_model(model, test_loader)

    def _train_fold(self, train_loader, val_loader, fold):
        """Train model for one fold and return it with its best weights loaded.

        The best weights (lowest mean validation loss) are written to
        ``models/transformers/best_model_fold_<fold>.pth`` and also kept in
        memory; the in-memory copy is what gets restored, so a stale
        checkpoint file from an earlier run can never be picked up.
        """
        model = FinancialTransformer(self.config, n_features=self.n_features).to(self.device)
        criterion = nn.BCEWithLogitsLoss()  # For classification
        optimizer = torch.optim.Adam(model.parameters(), lr=self.config.LEARNING_RATE)
        pathlib.Path("models/transformers").mkdir(parents=True, exist_ok=True)
        checkpoint_path = f'models/transformers/best_model_fold_{fold}.pth'

        # Verify model is on GPU
        print(f"Model parameters on CUDA: {next(model.parameters()).is_cuda}")
        if torch.cuda.is_available():
            monitor_gpu_usage()

        best_val_loss = float('inf')
        best_state = None
        patience = self.config.EARLY_STOPPING_PATIENCE
        patience_counter = 0

        for epoch in range(self.config.MAX_EPOCHS):
            # Training
            model.train()
            train_loss = 0.0

            for batch_x, batch_y in train_loader:
                batch_x = batch_x.to(self.device)
                batch_y = batch_y.to(self.device).unsqueeze(1)

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # Validation. Runs in full precision like the training pass, so
            # the two losses are comparable and early stopping is not driven
            # by reduced-precision noise.
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch_x, batch_y in val_loader:
                    batch_x = batch_x.to(self.device)
                    batch_y = batch_y.to(self.device).unsqueeze(1)
                    outputs = model(batch_x)
                    val_loss += criterion(outputs, batch_y).item()

            avg_train_loss = train_loss / len(train_loader)
            avg_val_loss = val_loss / len(val_loader)

            # Print progress
            if epoch % 5 == 0:
                print(f"Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
                if torch.cuda.is_available():
                    print(f"  GPU Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
                    monitor_gpu_realtime()

            # Early stopping
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                patience_counter = 0
                # Snapshot the best weights in memory and persist them
                best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
                torch.save(best_state, checkpoint_path)
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if best_state is not None:
            model.load_state_dict(best_state)
        else:
            # Only reachable when the validation loss never compared below
            # infinity (e.g. it was NaN every epoch) or MAX_EPOCHS is 0.
            print("Warning: validation loss never improved; keeping the current weights.")
        return model

    def _create_dataset(self, features, targets):
        """Create a TensorDataset of sliding windows for the transformer.

        Args:
            features: 2-D array ``(n_rows, n_features)``; NaNs are replaced by 0.
            targets: 1-D array with at least ``n_rows`` entries.

        Returns:
            ``TensorDataset`` of float32 tensors: sequences of shape
            ``(n_rows - SEQUENCE_LENGTH + 1, SEQUENCE_LENGTH, n_features)`` and,
            for each sequence, the target of its last row.

        Raises:
            ValueError: If ``features`` is not 2-D, has fewer rows than
                ``SEQUENCE_LENGTH``, or ``targets`` is shorter than ``features``.
        """
        seq_len = self.config.SEQUENCE_LENGTH
        print(f"Creating dataset with {len(features)} samples, sequence length {seq_len}")

        features = np.asarray(features)
        targets = np.asarray(targets)
        if features.ndim != 2:
            raise ValueError(f"features must be 2-D (rows, n_features), got shape {features.shape}")
        if len(targets) < len(features):
            raise ValueError(f"targets has {len(targets)} entries but features has {len(features)} rows")

        num_sequences = len(features) - seq_len + 1
        if num_sequences < 1:
            raise ValueError(
                f"Need at least {seq_len} rows to build one sequence, got {len(features)}"
            )

        # Check for NaN values and handle them
        if np.isnan(features).any():
            print("Warning: NaN values found in features. Filling with 0.")
            features = np.nan_to_num(features, nan=0.0)

        # sliding_window_view yields (num_sequences, n_features, seq_len);
        # swap the last two axes and materialise a writable float32 copy.
        windows = np.lib.stride_tricks.sliding_window_view(features, seq_len, axis=0)
        sequences = np.array(windows.transpose(0, 2, 1), dtype=np.float32, order='C')
        # Each window is labelled with the target of its final time step
        sequence_targets = targets[seq_len - 1:len(features)].astype(np.float32)

        print(f"Created {num_sequences} sequences with shape {sequences.shape}")

        return torch.utils.data.TensorDataset(
            torch.from_numpy(sequences),
            torch.from_numpy(sequence_targets)
        )

    def _evaluate_model(self, model, data_loader):
        """Evaluate model and return metrics.

        Returns:
            Dict with ``accuracy``, ``auc``, ``precision``, ``recall``, ``f1``
            (weighted averages), ``confusion_matrix`` and
            ``classification_report``. ``auc`` is NaN when the targets contain
            a single class, because ROC AUC is undefined in that case.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        model.eval()
        prediction_batches = []
        target_batches = []

        with torch.no_grad():
            for batch_x, batch_y in data_loader:
                batch_x = batch_x.to(self.device)
                outputs = model(batch_x)
                prediction_batches.append(torch.sigmoid(outputs).cpu().numpy().ravel())
                target_batches.append(batch_y.numpy().ravel())

        if not prediction_batches:
            raise ValueError("Cannot evaluate on an empty data loader.")

        all_predictions = np.concatenate(prediction_batches)
        all_targets = np.concatenate(target_batches)

        # Calculate metrics
        predictions_binary = (all_predictions > 0.5).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_targets, predictions_binary, average='weighted'
        )

        if np.unique(all_targets).size < 2:
            print("Warning: only one class present in targets; AUC is undefined.")
            auc = float('nan')
        else:
            auc = roc_auc_score(all_targets, all_predictions)

        metrics = {
            'accuracy': accuracy_score(all_targets, predictions_binary),
            'auc': auc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'confusion_matrix': confusion_matrix(all_targets, predictions_binary),
            'classification_report': classification_report(all_targets, predictions_binary)
        }

        return metrics

In [5]:
class SimpleTransformerTrainer:
    """Trains a single FinancialTransformer on one chronological split.

    Args:
        config: Object exposing the hyperparameters read below
            (``SEQUENCE_LENGTH``, ``BATCH_SIZE``, ``LEARNING_RATE``,
            ``MAX_EPOCHS`` and the model fields).
        feature_list: Optional explicit list of feature columns. When empty,
            every column except the timestamp/target/id columns is used.
    """

    def __init__(self, config: "TransformerConfig", feature_list=None):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.feature_list = feature_list if feature_list is not None else []
        self.scaler = None  # Fitted on the training split by train_single_model

    def create_time_splits(self, df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
        """Create time-based train/val/test splits.

        Rows are sorted by ``event_timestamp`` and cut by position: the first
        ``train_ratio`` of rows, the next ``val_ratio``, and everything that
        remains as the test split.

        Args:
            df: Frame with an ``event_timestamp`` column.
            train_ratio: Fraction of rows used for training.
            val_ratio: Fraction of rows used for validation.
            test_ratio: Not used in the computation (the test split is always
                the remainder); kept so existing calls keep working.

        Returns:
            ``(train_df, val_df, test_df)`` as independent copies.

        Raises:
            ValueError: If a ratio is negative or ``train_ratio + val_ratio``
                exceeds 1.
        """
        if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
            raise ValueError(
                f"Invalid split ratios: train_ratio={train_ratio}, val_ratio={val_ratio}"
            )

        df = df.sort_values('event_timestamp').reset_index(drop=True)

        n = len(df)
        train_end = int(n * train_ratio)
        val_end = int(n * (train_ratio + val_ratio))

        train_df = df.iloc[:train_end].copy()
        val_df = df.iloc[train_end:val_end].copy()
        test_df = df.iloc[val_end:].copy()

        print(f"Train: {len(train_df)} samples ({train_df['event_timestamp'].min()} to {train_df['event_timestamp'].max()})")
        print(f"Val: {len(val_df)} samples ({val_df['event_timestamp'].min()} to {val_df['event_timestamp'].max()})")
        print(f"Test: {len(test_df)} samples ({test_df['event_timestamp'].min()} to {test_df['event_timestamp'].max()})")

        return train_df, val_df, test_df

    def train_single_model(self, train_df, val_df, test_df, holdout_df):
        """Train a single model with train/val/test + holdout evaluation.

        The scaler is fitted on ``train_df`` only and reused for every other
        frame.

        Returns:
            Dict with the trained ``model`` and one metrics dict per data set
            (``train_metrics``, ``val_metrics``, ``test_metrics``,
            ``holdout_metrics``).
        """
        # Prepare features
        excluded_cols = ['event_timestamp', 'target_classification', 'time', 'instrument']
        feature_cols = self.feature_list if self.feature_list else [
            col for col in train_df.columns if col not in excluded_cols
        ]
        self.n_features = len(feature_cols)

        # Fit scaler on training data only
        self.scaler = StandardScaler()
        train_features = self.scaler.fit_transform(train_df[feature_cols])
        val_features = self.scaler.transform(val_df[feature_cols])
        test_features = self.scaler.transform(test_df[feature_cols])
        holdout_features = self.scaler.transform(holdout_df[feature_cols])

        # Create datasets
        train_dataset = self._create_dataset(train_features, train_df['target_classification'].values)
        val_dataset = self._create_dataset(val_features, val_df['target_classification'].values)
        test_dataset = self._create_dataset(test_features, test_df['target_classification'].values)
        holdout_dataset = self._create_dataset(holdout_features, holdout_df['target_classification'].values)

        # Create data loaders (only the training loader is shuffled)
        batch_size = self.config.BATCH_SIZE
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        holdout_loader = DataLoader(holdout_dataset, batch_size=batch_size, shuffle=False)

        # Train model
        model = self._train_model(train_loader, val_loader)

        # Evaluate on all sets
        return {
            'model': model,
            'train_metrics': self._evaluate_model(model, train_loader, "Training"),
            'val_metrics': self._evaluate_model(model, val_loader, "Validation"),
            'test_metrics': self._evaluate_model(model, test_loader, "Test"),
            'holdout_metrics': self._evaluate_model(model, holdout_loader, "Out-of-Time Holdout")
        }

    def _train_model(self, train_loader, val_loader):
        """Train the transformer model and return it with its best weights loaded.

        The best weights (lowest mean validation loss) are written to
        ``models/transformers/simpletransformer_best_model.pth`` and also kept
        in memory; the in-memory copy is what gets restored, so a stale
        checkpoint file from an earlier run can never be picked up.
        """
        model = FinancialTransformer(self.config, n_features=self.n_features).to(self.device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.config.LEARNING_RATE)
        # The checkpoint directory may not exist on a fresh checkout
        pathlib.Path("models/transformers").mkdir(parents=True, exist_ok=True)
        checkpoint_path = 'models/transformers/simpletransformer_best_model.pth'

        print(f"Model parameters on CUDA: {next(model.parameters()).is_cuda}")

        best_val_loss = float('inf')
        best_state = None
        # Fixed patience for the single-model run. Note that this is not
        # config.EARLY_STOPPING_PATIENCE.
        patience = 5
        patience_counter = 0

        for epoch in range(self.config.MAX_EPOCHS):
            # Training
            model.train()
            train_loss = 0.0

            for batch_x, batch_y in train_loader:
                batch_x = batch_x.to(self.device)
                batch_y = batch_y.to(self.device).unsqueeze(1)

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # Validation
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch_x, batch_y in val_loader:
                    batch_x = batch_x.to(self.device)
                    batch_y = batch_y.to(self.device).unsqueeze(1)
                    outputs = model(batch_x)
                    val_loss += criterion(outputs, batch_y).item()

            avg_train_loss = train_loss / len(train_loader)
            avg_val_loss = val_loss / len(val_loader)

            # Print progress
            if epoch % 2 == 0:  # Print more frequently
                print(f"Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
                if torch.cuda.is_available():
                    print(f"  GPU Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

            # Early stopping
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                patience_counter = 0
                # Snapshot the best weights in memory and persist them
                best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
                torch.save(best_state, checkpoint_path)
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if best_state is not None:
            model.load_state_dict(best_state)
        else:
            # Only reachable when the validation loss never compared below
            # infinity (e.g. it was NaN every epoch) or MAX_EPOCHS is 0.
            print("Warning: validation loss never improved; keeping the current weights.")
        return model

    def _create_dataset(self, features, targets):
        """Create a TensorDataset of sliding windows.

        Args:
            features: 2-D array ``(n_rows, n_features)``; NaNs are replaced by 0.
            targets: 1-D array with at least ``n_rows`` entries.

        Returns:
            ``TensorDataset`` of float32 tensors: sequences of shape
            ``(n_rows - SEQUENCE_LENGTH + 1, SEQUENCE_LENGTH, n_features)`` and,
            for each sequence, the target of its last row.

        Raises:
            ValueError: If ``features`` is not 2-D, has fewer rows than
                ``SEQUENCE_LENGTH``, or ``targets`` is shorter than ``features``.
        """
        seq_len = self.config.SEQUENCE_LENGTH
        print(f"Creating dataset with {len(features)} samples, sequence length {seq_len}")

        features = np.asarray(features)
        targets = np.asarray(targets)
        if features.ndim != 2:
            raise ValueError(f"features must be 2-D (rows, n_features), got shape {features.shape}")
        if len(targets) < len(features):
            raise ValueError(f"targets has {len(targets)} entries but features has {len(features)} rows")

        num_sequences = len(features) - seq_len + 1
        if num_sequences < 1:
            raise ValueError(
                f"Need at least {seq_len} rows to build one sequence, got {len(features)}"
            )

        if np.isnan(features).any():
            print("Warning: NaN values found in features. Filling with 0.")
            features = np.nan_to_num(features, nan=0.0)

        # sliding_window_view yields (num_sequences, n_features, seq_len);
        # swap the last two axes and materialise a writable float32 copy.
        windows = np.lib.stride_tricks.sliding_window_view(features, seq_len, axis=0)
        sequences = np.array(windows.transpose(0, 2, 1), dtype=np.float32, order='C')
        # Each window is labelled with the target of its final time step
        sequence_targets = targets[seq_len - 1:len(features)].astype(np.float32)

        print(f"Created {num_sequences} sequences with shape {sequences.shape}")

        return torch.utils.data.TensorDataset(
            torch.from_numpy(sequences),
            torch.from_numpy(sequence_targets)
        )

    def _evaluate_model(self, model, data_loader, set_name):
        """Evaluate model, print a short report and return the metrics.

        Args:
            model: Trained model returning logits.
            data_loader: Loader yielding ``(features, targets)`` batches.
            set_name: Label used in the printed report header.

        Returns:
            Dict with ``accuracy``, ``auc``, ``precision``, ``recall``, ``f1``
            (weighted averages), ``confusion_matrix`` and
            ``classification_report``. ``auc`` is NaN when the targets contain
            a single class, because ROC AUC is undefined in that case.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        model.eval()
        prediction_batches = []
        target_batches = []

        with torch.no_grad():
            for batch_x, batch_y in data_loader:
                batch_x = batch_x.to(self.device)
                outputs = model(batch_x)
                prediction_batches.append(torch.sigmoid(outputs).cpu().numpy().ravel())
                target_batches.append(batch_y.numpy().ravel())

        if not prediction_batches:
            raise ValueError("Cannot evaluate on an empty data loader.")

        all_predictions = np.concatenate(prediction_batches)
        all_targets = np.concatenate(target_batches)

        predictions_binary = (all_predictions > 0.5).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_targets, predictions_binary, average='weighted'
        )

        if np.unique(all_targets).size < 2:
            print("Warning: only one class present in targets; AUC is undefined.")
            auc = float('nan')
        else:
            auc = roc_auc_score(all_targets, all_predictions)

        metrics = {
            'accuracy': accuracy_score(all_targets, predictions_binary),
            'auc': auc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'confusion_matrix': confusion_matrix(all_targets, predictions_binary),
            'classification_report': classification_report(all_targets, predictions_binary)
        }

        print(f"\n{set_name} Metrics:")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  AUC: {metrics['auc']:.4f}")
        print(f"  F1: {metrics['f1']:.4f}")
        print(f"  Confusion Matrix:\n{metrics['confusion_matrix']}")
        print(f"  Classification Report:\n{metrics['classification_report']}")

        return metrics

In [6]:
# Data locations, all resolved against the directory the notebook is run from
ROOT = os.getcwd()
DATA = os.path.join(ROOT, 'data_clean', 'gold')
FEATURES = os.path.join(DATA, 'market', 'features')
LABELS = os.path.join(DATA, 'market', 'labels')
NEWS = os.path.join(DATA, 'news', 'signals')
COMBINED = os.path.join(DATA, 'gold_1', 'features', 'combined_data.parquet')  # cached market + news merge

# Rebuild the merged dataset only when the cached parquet file is missing.
# Change the literal `False` to `True` to force a rebuild even if it exists.
run_combined = False or not os.path.exists(COMBINED)

In [7]:
def merge_latest_news(market_df, news_df, news_value_cols, tolerance):
    """Attach to every market row the latest news signal from the look-back window.

    For each market row the most recent news row with
    ``time - tolerance <= signal_time <= time`` is selected. This is a single
    backward as-of join rather than a per-row scan of the news frame.

    Args:
        market_df: Frame with a datetime ``time`` column.
        news_df: Frame with a datetime ``signal_time`` column and the columns
            named in ``news_value_cols``.
        news_value_cols: News columns to copy; each appears as ``news_<col>``.
        tolerance: ``pd.Timedelta`` giving the maximum age of a usable signal.

    Returns:
        A new frame sorted by ``time`` with a fresh RangeIndex, holding the
        market columns, the ``news_<col>`` columns (0.0 where no signal was
        found), ``news_age_minutes`` (NaN where no signal was found) and
        ``news_available`` (1/0).

    Notes:
        Assumes ``market_df`` has no ``signal_time`` or ``news_<col>`` columns
        of its own and no missing ``time`` values (``merge_asof`` rejects null
        keys) — TODO confirm against the feature files.
    """
    left = market_df.sort_values('time').reset_index(drop=True)
    right = (
        news_df[['signal_time', *news_value_cols]]
        .dropna(subset=['signal_time'])  # rows without a timestamp can never match
        .sort_values('signal_time')
        .rename(columns={col: f'news_{col}' for col in news_value_cols})
    )
    # merge_asof requires both keys to have exactly the same dtype
    # (time zone and resolution), so align the news key with the market key.
    if right['signal_time'].dtype != left['time'].dtype:
        right['signal_time'] = right['signal_time'].astype(left['time'].dtype)

    merged = pd.merge_asof(
        left,
        right,
        left_on='time',
        right_on='signal_time',
        direction='backward',
        tolerance=tolerance,
        allow_exact_matches=True,
    )

    # A row matched iff a signal timestamp came through the join; this keeps
    # genuine NaNs inside a matched news row distinct from "no news at all".
    has_news = merged['signal_time'].notna()
    for col in news_value_cols:
        merged[f'news_{col}'] = merged[f'news_{col}'].where(has_news, 0.0)
    merged['news_age_minutes'] = (merged['time'] - merged['signal_time']).dt.total_seconds() / 60
    merged['news_available'] = has_news.astype(int)

    return merged.drop(columns='signal_time')


if run_combined:
    features = pd.read_parquet(os.path.join(FEATURES, 'spx500_features.parquet'))
    labels = pd.read_parquet(os.path.join(LABELS, 'spx500_labels_30min.parquet'))
    news = pd.read_csv(os.path.join(NEWS, 'sp500_trading_signals.csv'))
    features['time'] = pd.to_datetime(features['time'])
    news['signal_time'] = pd.to_datetime(news['signal_time'])
    # Naive news timestamps are interpreted as UTC
    if news['signal_time'].dt.tz is None:
        news['signal_time'] = news['signal_time'].dt.tz_localize('UTC')

    # A news signal is usable for up to six hours after it was published
    tolerance = pd.Timedelta(hours=6)

    news_features = [
        'signal_time', 'avg_sentiment', 'signal_strength',
        'trading_signal', 'article_count', 'quality_score'
    ]
    available_news = [c for c in news_features if c in news.columns]
    news_value_cols = [c for c in available_news if c != 'signal_time']

    combined_df = merge_latest_news(features, news, news_value_cols, tolerance)

    # Create the gold_1 folders if they don't exist, then cache the result at
    # the same path the cache check and the loader use.
    pathlib.Path(COMBINED).parent.mkdir(parents=True, exist_ok=True)
    pathlib.Path(DATA, 'gold_1', 'labels').mkdir(parents=True, exist_ok=True)
    combined_df.to_parquet(COMBINED)

In [8]:
# Reuse the frames left in memory by the merge cell; otherwise read them from disk
if 'combined_df' not in locals():
    combined_df = pd.read_parquet(COMBINED)

if 'labels' not in locals():
    labels = pd.read_parquet(os.path.join(LABELS, 'spx500_labels_30min.parquet'))

config = TransformerConfig()

# Features without the identifier columns, joined to their labels and ordered by time
gold_1_features = combined_df.drop(columns=['instrument', 'time'])
gold_1_full = (
    gold_1_features
    .merge(labels[['event_timestamp', 'target_classification']], on='event_timestamp', how='inner')
    .assign(event_timestamp=lambda frame: pd.to_datetime(frame['event_timestamp']))
    .sort_values('event_timestamp')
    .reset_index(drop=True)
)

# Define split dates
holdout_date = pd.Timestamp('2025-01-01', tz='UTC')
val_split_date = pd.Timestamp('2024-10-01', tz='UTC')

# Everything before the holdout date is available for training; the rest is the out-of-time test set
training_data = gold_1_full[gold_1_full['event_timestamp'] < holdout_date].copy()
test_data = gold_1_full[gold_1_full['event_timestamp'] >= holdout_date].copy()

print(f"Training data: {len(training_data)} samples")
print(f"Test data: {len(test_data)} samples")
print(
    f"Date range - Train: {training_data['event_timestamp'].min()} "
    f"to {training_data['event_timestamp'].max()}"
)
print(
    f"Date range - Test: {test_data['event_timestamp'].min()} "
    f"to {test_data['event_timestamp'].max()}"
)

Training data: 1435407 samples
Test data: 23213 samples
Date range - Train: 2020-10-13 16:15:00+00:00 to 2024-12-31 21:59:00+00:00
Date range - Test: 2025-01-01 23:00:00+00:00 to 2025-01-27 13:58:00+00:00


In [9]:
# Rolling windows are built from pre-holdout rows only, so the holdout period never enters a fold
training_data = gold_1_full.loc[gold_1_full['event_timestamp'] < holdout_date]
rolling_splitter = RollingWindowSplit(train_window_months=6, val_window_months=1)
splits = rolling_splitter.split(
    df=training_data,
    start_date=training_data['event_timestamp'].min(),
    end_date=training_data['event_timestamp'].max(),
)
print(f"Generated {len(splits)} rolling window splits")

Generated 44 rolling window splits


In [10]:
# # Initialize the trainer
# trainer = RollingWindowTransformerTrainer(config)

# # Run training with rolling window cross-validation
# print("Starting transformer training with rolling window cross-validation...")

# try:
#     fold_results, test_metrics = trainer.train_rolling_window(
#         df=training_data,
#         splits=splits,
#         test_df=test_data
#     )
    
#     # Print results
#     print("\n" + "="*50)
#     print("TRAINING COMPLETE")
#     print("="*50)
    
#     # Fold results
#     print("\nCross-validation results:")
#     for i, metrics in enumerate(fold_results):
#         print(f"Fold {i+1}: Accuracy: {metrics['accuracy']:.4f}, AUC: {metrics['auc']:.4f}, F1: {metrics['f1']:.4f}")
    
#     try:
#         # Average CV performance
#         avg_metrics = {
#             'accuracy': np.mean([m['accuracy'] for m in fold_results]),
#             'auc': np.mean([m['auc'] for m in fold_results]),
#             'f1': np.mean([m['f1'] for m in fold_results]),
#             'precision': np.mean([m['precision'] for m in fold_results]),
#             'recall': np.mean([m['recall'] for m in fold_results]),
#             'confusion_matrix': np.mean([m['confusion_matrix'] for m in fold_results]),
#             'classification_report': np.mean([m['classification_report'] for m in fold_results]),
#         }
        
#         print(f"\nAverage CV Performance:")
#         for metric, value in avg_metrics.items():
#             print(f"{metric.capitalize()}: {value:.4f}")
        
#         # Test set performance
#         print(f"\nOut-of-time Test Performance:")
#         for metric, value in test_metrics.items():
#             print(f"{metric.capitalize()}: {value:.4f}")
#     except Exception as e:
#         print(f"Error calculating average metrics: {e}")
        
# except Exception as e:
#     print(f"Training failed with error: {e}")
#     import traceback
#     traceback.print_exc()

In [11]:
# Initialize the simple trainer
trainer = SimpleTransformerTrainer(config)

print("Starting single transformer model training...")
print("="*50)

try:
    # Create train/val/test splits within your training period
    train_df, val_df, test_df = trainer.create_time_splits(training_data)

    # Train single model and evaluate on all sets including holdout
    results = trainer.train_single_model(train_df, val_df, test_df, test_data)
except Exception as e:
    # Report the failure, then re-raise: a failed run must stop the notebook
    # instead of letting later cells run without `results`.
    print(f"Training failed with error: {e}")
    raise

print("\n" + "="*50)
print("TRAINING COMPLETE")
print("="*50)

# (label printed in the summary, key of the metrics dict in `results`)
summary_rows = [
    ("Training", 'train_metrics'),
    ("Validation", 'val_metrics'),
    ("Test", 'test_metrics'),
    ("Out-of-Time Holdout", 'holdout_metrics'),
]

print(f"\nFinal Results Summary:")
for set_label, metrics_key in summary_rows:
    print(f"{set_label} Accuracy: {results[metrics_key]['accuracy']:.4f}")

print(f"\nAUC Scores:")
for set_label, metrics_key in summary_rows:
    print(f"{set_label} AUC: {results[metrics_key]['auc']:.4f}")

Starting single transformer model training...
Train: 1004784 samples (2020-10-13 16:15:00+00:00 to 2023-09-14 11:00:00+00:00)
Val: 215311 samples (2023-09-14 11:01:00+00:00 to 2024-05-10 17:12:00+00:00)
Test: 215312 samples (2024-05-10 17:13:00+00:00 to 2024-12-31 21:59:00+00:00)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Creating dataset with 1004784 samples, sequence length 50
Created 1004735 sequences with shape (1004735, 50, 71)
Creating dataset with 215311 samples, sequence length 50
Created 215262 sequences with shape (215262, 50, 71)
Creating dataset with 215312 samples, sequence length 50
Created 215263 sequences with shape (215263, 50, 71)
Creating dataset with 23213 samples, sequence length 50
Created 23164 sequences with shape (23164, 50, 71)
Model parameters on CUDA: True
Epoch 0: Train Loss: 0.6928, Val Loss: 0.6928
  GPU Memory: 0.13 GB
Epoch 2: Train Loss: 0.6924, Val Loss: 0.6928
  GPU Memory: 0.13 GB
Epoch 4: Train Loss: 0.6923, Val Loss: 0.6928
  GPU Memory: 0.13 GB
Epoch 6: Train Loss: 0.6924, Val Loss: 0.6929
  GPU Memory: 0.13 GB
Early stopping at epoch 6

Training Metrics:
  Accuracy: 0.5171
  AUC: 0.5217
  F1: 0.4958
  Confusion Matrix:
[[154176 343347]
 [141791 365421]]
  Classification Report:
              precision    recall  f1-score   support

         0.0       0.52      0.

In [12]:
# Final status message only; the detailed per-set metrics are printed by
# SimpleTransformerTrainer._evaluate_model while the training cell runs.
print("\nModel training and evaluation complete.")


Model training and evaluation complete.
