# Market Regime Prediction - Training Pipeline

## Setup and Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import numpy as np
import pandas as pd
import json
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from model import MarketRegimeTransformer

## Configuration Parameters

In [None]:
# ---- Data ----
# Root folder that holds the train/ and val/ sub-directories.
DATA_DIR = 'data/training'
# Samples are stored per lookback length (~1 year of trading data).
LOOKBACK_WINDOW = 256

# ---- Optimisation ----
BATCH_SIZE = 160
NUM_EPOCHS = 80
LEARNING_RATE = 0.0001
NUM_WORKERS = 8

# ---- Loss mixing ----
# The classification term keeps a fixed weight; the duration term is ramped
# up over the course of training between the two bounds below.
CLASS_WEIGHT = 1.0
DURATION_WEIGHT_START = 0.05
DURATION_WEIGHT_END = 0.4

# ---- Model architecture ----
N_FEATURES = 7  # OHLCV + RSI + BB
D_MODEL = 256
N_HEAD = 8
NUM_FEATURE_LAYERS = 4
NUM_AGGREGATE_LAYERS = 3
DIM_FEEDFORWARD = 1024
DROPOUT = 0.1
NUM_CLASSES = 3  # bull, flat, bear

# ---- Learning-rate schedule ----
# Number of epochs spent in the linear warm-up phase.
WARMUP_EPOCHS = 10

# ---- Device ----
# Use the GPU when one is visible to PyTorch, otherwise run on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Device Information

In [3]:
# Report the selected device and, on CUDA, its current memory state.
print(f"Using device: {DEVICE}")

if DEVICE.type == 'cuda':
    bytes_per_gb = 1024**3
    allocated_gb = round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1)
    reserved_gb = round(torch.cuda.memory_reserved(0) / bytes_per_gb, 1)
    total_gb = round(torch.cuda.get_device_properties(0).total_memory / bytes_per_gb, 1)

    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print("Memory Usage:")
    print(f"  Allocated: {allocated_gb} GB")
    print(f"  Cached: {reserved_gb} GB")
    print(f"  Total VRAM: {total_gb} GB")

    # Let cuDNN benchmark its kernels and allow TF32 math for matmuls and
    # cuDNN operations.
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

Using device: cuda
GPU: NVIDIA GeForce RTX 3090
Memory Usage:
  Allocated: 0.0 GB
  Cached: 0.0 GB
  Total VRAM: 24.0 GB


## Dataset Definition

In [None]:
class MultiFileMarketRegimeDataset(Dataset):
    """Map-style dataset spread over several ``.npz`` shards.

    Every shard is expected to provide the arrays ``X``, ``y_class`` and
    ``y_duration``. Only the per-shard sample counts are kept permanently;
    shard contents are read on demand and at most ``cache_size`` shards stay
    in a least-recently-used cache.
    """

    # Arrays pulled out of a shard when it is loaded into the cache.
    _ARRAY_KEYS = ('X', 'y_class', 'y_duration')

    def __init__(self, data_dir, subset='train', lookback_window=256):
        """
        Index the shards found in ``<data_dir>/<subset>/n<lookback_window>``.

        Args:
            data_dir: Base directory containing train/val subdirectories
            subset: 'train' or 'val'
            lookback_window: The lookback window size (for directory naming)

        Raises:
            ValueError: If the directory contains no ``.npz`` file.
        """
        self.data_dir = os.path.join(data_dir, subset, f'n{lookback_window}')

        # Sorted so that the global sample order is reproducible.
        self.files = sorted(f for f in os.listdir(self.data_dir) if f.endswith('.npz'))
        if not self.files:
            raise ValueError(f"No .npz files found in {self.data_dir}")

        # cumulative_lengths[i] is the global index of the first sample of
        # shard i; the final entry is the total number of samples.
        self.file_lengths = []
        self.cumulative_lengths = [0]

        print(f"Indexing {len(self.files)} files from {subset} set...")
        for file in self.files:
            filepath = os.path.join(self.data_dir, file)
            with np.load(filepath) as data:
                file_length = len(data['X'])
            self.file_lengths.append(file_length)
            self.cumulative_lengths.append(self.cumulative_lengths[-1] + file_length)

        self.total_samples = self.cumulative_lengths[-1]
        # Array copy of the offsets for fast global -> (shard, row) lookups.
        # This replaces a per-sample list of tuples, whose size grew linearly
        # with the dataset.
        self._offsets = np.asarray(self.cumulative_lengths, dtype=np.int64)
        print(f"Indexed {self.total_samples} samples from {subset} set")

        # LRU cache of loaded shards: file_idx -> {array name: ndarray}.
        # Dict insertion order doubles as the recency order (oldest first).
        self.cache = {}
        self.cache_size = 5  # Keep up to 5 files in memory

    @property
    def index_map(self):
        """``(file_idx, sample_idx)`` for every global index, built on demand.

        Kept for callers that used the former ``index_map`` list attribute.
        """
        return [
            (file_idx, sample_idx)
            for file_idx, file_length in enumerate(self.file_lengths)
            for sample_idx in range(file_length)
        ]

    def _locate(self, idx):
        """Translate a global index (0 <= idx < total) into (file_idx, sample_idx)."""
        # side='right' skips over empty shards that share the same offset.
        file_idx = int(np.searchsorted(self._offsets, idx, side='right')) - 1
        return file_idx, int(idx - self._offsets[file_idx])

    def _load_file(self, file_idx):
        """Return the arrays of shard ``file_idx``, loading it on a cache miss."""
        if file_idx in self.cache:
            # Re-insert so the shard becomes the most recently used entry.
            arrays = self.cache.pop(file_idx)
            self.cache[file_idx] = arrays
            return arrays

        # Make room by dropping the least recently used shard(s).
        while self.cache and len(self.cache) >= self.cache_size:
            del self.cache[next(iter(self.cache))]

        filepath = os.path.join(self.data_dir, self.files[file_idx])
        # Materialise the arrays once and close the archive right away: an
        # open NpzFile re-reads an array from disk on every ``data[key]``
        # access and keeps a file handle alive until it is closed.
        with np.load(filepath) as npz:
            arrays = {key: npz[key] for key in self._ARRAY_KEYS}
        self.cache[file_idx] = arrays
        return arrays

    def __len__(self):
        return self.total_samples

    def __getitem__(self, idx):
        """Return ``(X, y_class, y_duration)`` for the global sample ``idx``.

        ``X`` and ``y_duration`` are cast to float32 and ``y_class`` to int64.
        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` lies outside the dataset.
        """
        position = idx + self.total_samples if idx < 0 else idx
        if not 0 <= position < self.total_samples:
            raise IndexError(f"Index {idx} out of range for dataset with {self.total_samples} samples")

        file_idx, sample_idx = self._locate(position)
        data = self._load_file(file_idx)

        X = data['X'][sample_idx].astype(np.float32)
        # np.int64 is spelled out because the ``np.long`` alias was removed
        # in NumPy 1.24.
        y_class = data['y_class'][sample_idx].astype(np.int64)
        y_duration = data['y_duration'][sample_idx].astype(np.float32)

        return X, y_class, y_duration

## Data Loading Functions

In [None]:
def create_data_loaders(data_dir, batch_size, num_workers, lookback_window):
    """Build the training and validation loaders from the on-disk splits.

    Args:
        data_dir: Base directory containing the 'train' and 'val' folders.
        batch_size: Batch size used by both loaders.
        num_workers: Number of loader worker processes.
        lookback_window: Lookback length; selects the ``n<lookback_window>``
            sub-directory of each split.

    Returns:
        ``(train_loader, val_loader, n_train_samples, n_val_samples)``.

    Raises:
        ValueError: If either split directory is missing.
    """
    window_folder = f'n{lookback_window}'
    split_dirs = [os.path.join(data_dir, split, window_folder) for split in ('train', 'val')]

    # Guard clause: both pre-separated splits have to exist already.
    if not all(os.path.exists(path) for path in split_dirs):
        raise ValueError("Data directories not found. Please run generate_training_data_v4.py first.")

    print("Loading pre-separated train/validation sets from multiple files...")
    train_dataset = MultiFileMarketRegimeDataset(data_dir, subset='train', lookback_window=lookback_window)
    val_dataset = MultiFileMarketRegimeDataset(data_dir, subset='val', lookback_window=lookback_window)

    # Options shared by both loaders; workers are only kept alive between
    # epochs when there are workers at all.
    shared_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=num_workers > 0,
    )
    train_loader = DataLoader(train_dataset, shuffle=True, **shared_options)
    val_loader = DataLoader(val_dataset, shuffle=False, **shared_options)

    return train_loader, val_loader, len(train_dataset), len(val_dataset)

## Load Class Weights

In [None]:
def load_class_weights(data_dir, lookback_window, device):
    """Derive inverse-frequency class weights for the training split.

    Class counts are taken from the ``class_distribution`` entry of
    ``manifest.json`` in the training directory when that entry exists;
    otherwise they are counted from the ``y_class`` array of every ``.npz``
    file in that directory (labels 0, 1, 2 = bull, flat, bear).

    Args:
        data_dir: Base directory containing the 'train' folder.
        lookback_window: Lookback length; selects the ``n<lookback_window>``
            sub-directory.
        device: Device the returned tensor is moved to.

    Returns:
        float32 tensor of three weights (bull, flat, bear), normalised to a
        mean of 1. If the statistics cannot be computed, or a class has no
        samples, a fixed set of default weights is returned instead.
    """
    class_names = ('bull', 'flat', 'bear')
    train_dir = os.path.join(data_dir, 'train', f'n{lookback_window}')

    try:
        class_counts = None
        total = None

        # Preferred source: pre-computed statistics in the manifest.
        manifest_path = os.path.join(train_dir, 'manifest.json')
        if os.path.exists(manifest_path):
            with open(manifest_path, 'r') as f:
                manifest = json.load(f)
            if 'class_distribution' in manifest:
                class_dist = manifest['class_distribution']
                total = sum(class_dist.values())
                class_counts = {name: class_dist[name] for name in class_names}

        # Fallback source: count the labels directly. This also covers a
        # manifest that exists but carries no class distribution.
        if class_counts is None:
            print("Calculating class weights from training data...")
            class_counts = dict.fromkeys(class_names, 0)
            for file in os.listdir(train_dir):
                if not file.endswith('.npz'):
                    continue
                with np.load(os.path.join(train_dir, file)) as data:
                    y_class = data['y_class']
                for label, name in enumerate(class_names):
                    class_counts[name] += int(np.sum(y_class == label))
            total = sum(class_counts.values())

        # An absent class would otherwise produce inf/NaN weights.
        empty_classes = [name for name in class_names if class_counts[name] <= 0]
        if empty_classes:
            raise ValueError(f"no training samples for class(es): {', '.join(empty_classes)}")

        class_pcts = {name: class_counts[name] / total * 100 for name in class_names}

        # Calculate weights inversely proportional to class frequency
        weights = torch.tensor(
            [100.0 / class_pcts[name] for name in class_names],
            dtype=torch.float32,
        ).to(device)

        # Normalize weights
        weights = weights / weights.mean()

        print("Loaded class weights from training data:")
        print(f"  Bull: {weights[0]:.3f} ({class_pcts['bull']:.1f}% of data)")
        print(f"  Flat: {weights[1]:.3f} ({class_pcts['flat']:.1f}% of data)")
        print(f"  Bear: {weights[2]:.3f} ({class_pcts['bear']:.1f}% of data)")

        return weights
    except Exception as e:
        # Deliberately broad: weight estimation is best-effort and training
        # should still be able to start with the defaults below.
        print(f"Error calculating class weights: {e}")
        print("Using default class weights")
        return torch.tensor([0.760, 1.294, 0.991]).to(device)

## Training Functions

In [7]:
def train_epoch(model, train_loader, criterion_class, criterion_duration, optimizer, device, class_weight, duration_weight):
    """Run one optimisation pass over ``train_loader``.

    The model is called on each batch and must return
    ``(class_logits, duration_pred)``. The loss is
    ``class_weight * classification loss + duration_weight * duration loss``;
    gradients are clipped to a total norm of 1.0 before each optimizer step.
    Progress is printed every 100 batches.

    Returns:
        ``(mean loss per batch, classification accuracy in percent)``.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for step, batch in enumerate(train_loader):
        features, labels, durations = batch
        features = features.to(device)
        labels = labels.to(device)
        # Add a trailing axis to the duration targets before they are
        # compared with the model's duration output.
        durations = durations.to(device).unsqueeze(1)

        optimizer.zero_grad()

        class_logits, duration_pred = model(features)

        loss_class = criterion_class(class_logits, labels)
        loss_duration = criterion_duration(duration_pred, durations)
        loss = class_weight * loss_class + duration_weight * loss_duration

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Bookkeeping for the epoch-level metrics.
        running_loss += loss.item()
        predicted = class_logits.max(1)[1]
        n_seen += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        if step % 100 == 0:
            print(f'  Batch [{step}/{len(train_loader)}] Loss: {loss.item():.4f} '
                  f'(Class: {loss_class.item():.4f}, Duration: {loss_duration.item():.4f})')

    accuracy = 100. * n_correct / n_seen
    mean_loss = running_loss / len(train_loader)

    return mean_loss, accuracy


def validate(model, val_loader, criterion_class, criterion_duration, device, class_weight, duration_weight):
    """Evaluate the model on ``val_loader`` without updating it.

    The model is called on each batch and must return
    ``(class_logits, duration_pred)``. Labels 0, 1 and 2 are reported as
    'bull', 'flat' and 'bear'.

    Returns:
        ``(avg_loss, accuracy, class_accuracies, avg_duration_mse)`` where
        ``avg_loss`` is the mean combined loss per batch, ``accuracy`` is the
        overall accuracy in percent, ``class_accuracies`` maps each class
        name to its accuracy in percent (0 when the class never occurs) and
        ``avg_duration_mse`` is the squared duration error averaged per
        sample.
    """
    class_names = ('bull', 'flat', 'bear')
    num_classes = len(class_names)

    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    duration_mse = 0

    # Per-class counters live on the evaluation device and are updated with
    # tensor operations, so no per-sample Python loop (and no per-sample
    # host/device round trip) is needed.
    class_ids = torch.arange(num_classes, device=device)
    class_correct = torch.zeros(num_classes, dtype=torch.long, device=device)
    class_total = torch.zeros(num_classes, dtype=torch.long, device=device)

    with torch.no_grad():
        for data, target_class, target_duration in val_loader:
            data = data.to(device)
            target_class = target_class.to(device)
            target_duration = target_duration.to(device).unsqueeze(1)

            class_logits, duration_pred = model(data)

            loss_class = criterion_class(class_logits, target_class)
            loss_duration = criterion_duration(duration_pred, target_duration)
            loss = class_weight * loss_class + duration_weight * loss_duration

            total_loss += loss.item()
            _, predicted = class_logits.max(1)
            hits = predicted.eq(target_class)
            total += target_class.size(0)
            correct += hits.sum().item()

            # Track duration MSE
            duration_mse += ((duration_pred - target_duration) ** 2).sum().item()

            # Row i, column c is True when sample i carries label c.
            is_class = target_class.unsqueeze(1) == class_ids
            class_total += is_class.sum(0)
            class_correct += (is_class & hits.unsqueeze(1)).sum(0)

    accuracy = 100. * correct / total
    avg_loss = total_loss / len(val_loader)
    avg_duration_mse = duration_mse / total

    class_accuracies = {
        name: 100. * n_hit / n_all if n_all > 0 else 0
        for name, n_hit, n_all in zip(class_names, class_correct.tolist(), class_total.tolist())
    }

    return avg_loss, accuracy, class_accuracies, avg_duration_mse

## Load Data

In [None]:
# Build both loaders once; the sample counts are reported for reference.
print("Loading data...")
train_loader, val_loader, train_size, val_size = create_data_loaders(
    data_dir=DATA_DIR,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    lookback_window=LOOKBACK_WINDOW,
)
print(f"Training samples: {train_size}, Validation samples: {val_size}")
print("NOTE: Validation set contains S&P 500 correlated instruments only")

## Initialize Model

In [None]:
# Per-class loss weights, computed from the training split
class_weights = load_class_weights(DATA_DIR, LOOKBACK_WINDOW, DEVICE)

# Gather the architecture settings in one place, then build the network and
# move it to the training device
model_kwargs = dict(
    lookback_window=LOOKBACK_WINDOW - 1,  # -1 because of pct_change
    n_features=N_FEATURES,
    d_model=D_MODEL,
    nhead=N_HEAD,
    num_feature_layers=NUM_FEATURE_LAYERS,
    num_aggregate_layers=NUM_AGGREGATE_LAYERS,
    dim_feedforward=DIM_FEEDFORWARD,
    dropout=DROPOUT,
    num_classes=NUM_CLASSES,
)
model = MarketRegimeTransformer(**model_kwargs).to(DEVICE)

n_model_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {n_model_params:,}")

## Setup Training

In [None]:
# Losses: class-weighted cross-entropy for the regime label, mean squared
# error for the duration target
criterion_duration = nn.MSELoss()
criterion_class = nn.CrossEntropyLoss(weight=class_weights)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

# Learning-rate schedule: linear warm-up from 10% of the base rate over
# WARMUP_EPOCHS, then cosine annealing for the remaining epochs
cosine_epochs = NUM_EPOCHS - WARMUP_EPOCHS
lr_sched = optim.lr_scheduler
scheduler1 = lr_sched.LinearLR(optimizer, start_factor=0.1, total_iters=WARMUP_EPOCHS)
scheduler2 = lr_sched.CosineAnnealingLR(optimizer, T_max=cosine_epochs)
scheduler = lr_sched.SequentialLR(
    optimizer,
    schedulers=[scheduler1, scheduler2],
    milestones=[WARMUP_EPOCHS],
)

# Checkpoints are written below this folder during training
os.makedirs('checkpoints', exist_ok=True)

## Training Loop

In [None]:
# Per-epoch metrics, appended once per epoch and plotted after training
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': [],
    'val_duration_mse': [],
    'learning_rate': [],
    'duration_weight': []
}

# Best value seen so far for each checkpoint criterion
best_val_accuracy = 0
best_balanced_accuracy = 0
best_duration_mse = float('inf')
best_epoch = 0

for epoch in range(NUM_EPOCHS):
    # Linear ramp of the duration-loss weight. Dividing by NUM_EPOCHS - 1
    # makes the last epoch land exactly on DURATION_WEIGHT_END; dividing by
    # NUM_EPOCHS would stop one step short of it.
    progress = epoch / max(NUM_EPOCHS - 1, 1)
    duration_weight = DURATION_WEIGHT_START + (DURATION_WEIGHT_END - DURATION_WEIGHT_START) * progress

    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print(f"Duration weight: {duration_weight:.3f}")
    print("-" * 50)

    # Train
    train_loss, train_acc = train_epoch(
        model, train_loader, criterion_class, criterion_duration,
        optimizer, DEVICE, CLASS_WEIGHT, duration_weight
    )

    # Validate. The validation loss uses this epoch's duration weight, so
    # loss values from different epochs are not on the same scale.
    val_loss, val_acc, class_accuracies, val_duration_mse = validate(
        model, val_loader, criterion_class, criterion_duration,
        DEVICE, CLASS_WEIGHT, duration_weight
    )

    # Step scheduler (once per epoch)
    scheduler.step()
    current_lr = optimizer.param_groups[0]['lr']

    # Print metrics
    print(f"\nTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")
    print(f"Val Duration MSE: {val_duration_mse:.4f} (RMSE: {np.sqrt(val_duration_mse):.4f})")
    print(f"Class Accuracies - Bull: {class_accuracies['bull']:.2f}%, "
          f"Flat: {class_accuracies['flat']:.2f}%, Bear: {class_accuracies['bear']:.2f}%")
    print(f"Learning Rate: {current_lr:.2e}")

    if DEVICE.type == 'cuda':
        print(f"GPU Memory: {round(torch.cuda.memory_allocated(0)/1024**3,2)} GB allocated")

    # Store history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    history['val_duration_mse'].append(val_duration_mse)
    history['learning_rate'].append(current_lr)
    history['duration_weight'].append(duration_weight)

    # Balanced accuracy: unweighted mean of the three per-class accuracies
    balanced_accuracy = (class_accuracies['bull'] + class_accuracies['flat'] + class_accuracies['bear']) / 3

    # A checkpoint is written when ANY of the three criteria improves; every
    # criterion that triggered is collected so the log names all of them.
    # NOTE(review): all criteria share one checkpoint file, so the saved
    # model is the most recent improvement on any criterion, not necessarily
    # the one with the best accuracy.
    save_reasons = []

    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        save_reasons.append("best overall accuracy")

    if balanced_accuracy > best_balanced_accuracy and val_duration_mse < 15:
        best_balanced_accuracy = balanced_accuracy
        save_reasons.append("best balanced accuracy")

    if val_duration_mse < best_duration_mse and val_acc > 65:
        best_duration_mse = val_duration_mse
        save_reasons.append("best duration MSE")

    save_model = bool(save_reasons)
    save_reason = ", ".join(save_reasons)

    if save_model:
        best_epoch = epoch + 1
        torch.save({
            'epoch': epoch,  # zero-based; readers add 1 for display
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'train_acc': train_acc,
            'val_acc': val_acc,
            'val_duration_mse': val_duration_mse,
            'class_accuracies': class_accuracies,
            'balanced_accuracy': balanced_accuracy,
            'duration_weight': duration_weight,
            'lookback_window': LOOKBACK_WINDOW
        }, 'checkpoints/best_model.pth')
        print(f"  -> New best model saved! ({save_reason})")

print(f"\nTraining complete!")
print(f"Best validation accuracy: {best_val_accuracy:.2f}%")
print(f"Best balanced accuracy: {best_balanced_accuracy:.2f}%")
print(f"Best duration MSE: {best_duration_mse:.4f} (RMSE: {np.sqrt(best_duration_mse):.4f})")
print(f"Best model saved at epoch {best_epoch}")

## Plot Training History

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# The first three panels share one layout, so they are described as data:
# (axis, [(history key, legend label, extra plot options), ...], y label, title)
simple_panels = [
    (
        axes[0, 0],
        [('train_loss', 'Train Loss', {}), ('val_loss', 'Val Loss', {})],
        'Loss',
        'Training and Validation Loss',
    ),
    (
        axes[0, 1],
        [('train_acc', 'Train Accuracy', {}), ('val_acc', 'Val Accuracy', {})],
        'Accuracy (%)',
        'Training and Validation Accuracy',
    ),
    (
        axes[1, 0],
        [('val_duration_mse', 'Val Duration MSE', {'color': 'green'})],
        'MSE',
        'Validation Duration MSE',
    ),
]

for panel, curves, y_label, panel_title in simple_panels:
    for history_key, curve_label, plot_options in curves:
        panel.plot(history[history_key], label=curve_label, **plot_options)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True)

# Last panel: learning rate (left axis, blue) and duration weight (right
# axis, red) drawn on twin y-axes
ax1 = axes[1, 1]
ax2 = ax1.twinx()
ax1.plot(history['learning_rate'], label='Learning Rate', color='blue')
ax2.plot(history['duration_weight'], label='Duration Weight', color='red')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Learning Rate', color='blue')
ax2.set_ylabel('Duration Weight', color='red')
ax1.set_title('Learning Rate and Duration Weight Schedule')
for twin_axis, axis_color in ((ax1, 'blue'), (ax2, 'red')):
    twin_axis.tick_params(axis='y', labelcolor=axis_color)
ax1.grid(True)

plt.tight_layout()
plt.savefig('training_history.png', dpi=150)
plt.show()

## Load Best Model for Evaluation

In [None]:
# Restore the weights stored in the checkpoint written during training
checkpoint_path = 'checkpoints/best_model.pth'
checkpoint = torch.load(checkpoint_path, map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])

# The stored epoch is zero-based, hence the +1 for display
print(f"Loaded best model from epoch {checkpoint['epoch'] + 1}")
print("Best model metrics:")
for metric_line in (
    f"  Validation Accuracy: {checkpoint['val_acc']:.2f}%",
    f"  Validation Loss: {checkpoint['val_loss']:.4f}",
    f"  Duration MSE: {checkpoint['val_duration_mse']:.4f}",
    f"  Balanced Accuracy: {checkpoint['balanced_accuracy']:.2f}%",
):
    print(metric_line)

## Final Validation Performance

In [None]:
# Re-evaluate the restored model, using the duration weight that was active
# when the checkpoint was written
val_loss, val_acc, class_accuracies, val_duration_mse = validate(
    model, val_loader, criterion_class, criterion_duration,
    DEVICE, CLASS_WEIGHT, checkpoint['duration_weight']
)
val_duration_rmse = np.sqrt(val_duration_mse)

print("\nFinal Validation Performance:")
print(f"Overall Accuracy: {val_acc:.2f}%")
print("\nPer-Class Accuracy:")
for class_name in ('bull', 'flat', 'bear'):
    print(f"  {class_name.capitalize()}: {class_accuracies[class_name]:.2f}%")
print("\nDuration Prediction:")
print(f"  MSE: {val_duration_mse:.4f}")
# NOTE(review): 0.545 is used to convert the RMSE into days; where the
# factor comes from is not visible in this notebook - confirm.
print(f"  RMSE: {val_duration_rmse:.4f} (≈ {val_duration_rmse / 0.545:.1f} days error)")

## Save Training Summary

In [None]:
# Save training summary
def _finite_or_none(value):
    """Return ``value`` as a plain float, or None when it is inf/NaN.

    ``json.dump`` would otherwise emit ``Infinity``/``NaN``, which are not
    valid JSON. ``best_duration_mse`` keeps its initial value of infinity
    when no epoch met the duration criterion.
    """
    value = float(value)
    return value if np.isfinite(value) else None


summary = {
    # Settings the run was started with
    'config': {
        'lookback_window': LOOKBACK_WINDOW,
        'n_features': N_FEATURES,
        'batch_size': BATCH_SIZE,
        'num_epochs': NUM_EPOCHS,
        'learning_rate': LEARNING_RATE,
        'model_params': sum(p.numel() for p in model.parameters())
    },
    # Best value reached for each checkpoint criterion during training
    'best_results': {
        'epoch': best_epoch,
        'val_accuracy': best_val_accuracy,
        'balanced_accuracy': best_balanced_accuracy,
        'duration_mse': _finite_or_none(best_duration_mse),
        'duration_rmse': _finite_or_none(np.sqrt(best_duration_mse))
    },
    # Metrics from the most recent validation run
    'final_performance': {
        'overall_accuracy': val_acc,
        'class_accuracies': class_accuracies,
        'duration_mse': _finite_or_none(val_duration_mse)
    }
}

with open('training_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("\nTraining summary saved to training_summary.json")