In [72]:
!pip install pytorch-lightning einops datasets
!pip install lightning

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.loggers.logger import Logger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import math
import numpy as np
import json
from datetime import datetime
import os
from pathlib import Path
import logging
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from typing import Dict, Any, Optional, Union
!pip install datasets
from datasets import load_dataset
from einops import rearrange
import matplotlib.pyplot as plt
import time
import random




In [73]:
class MultiScaleTimeSeriesDataset(Dataset):
    def __init__(self, num_samples=1000, seq_length=128):
        self.data = []
        self.labels = []

        for _ in range(num_samples):
            t = np.linspace(0, 8*np.pi, seq_length)

            # Multiple frequency components
            fast = 0.3 * np.sin(8*t)
            medium = 0.5 * np.sin(2*t)
            slow = np.sin(0.5*t)

            # Add some random regime changes
            jumps = np.zeros_like(t)
            jump_points = np.random.choice(seq_length-20, 2)
            for jp in jump_points:
                jumps[jp:jp+20] += np.random.normal(0, 0.5)

            # Combine all components
            signal = fast + medium + slow + jumps

            # Label is 1 if there's a significant change in next N steps
            future_volatility = np.std(np.diff(signal[-20:]))
            label = 1 if future_volatility > 0.5 else 0

            self.data.append(signal)
            self.labels.append(label)

        # Convert to tensors and ensure proper shape
        self.data = torch.tensor(np.array(self.data), dtype=torch.float32).unsqueeze(-1)  # Shape: [N, seq_len, 1]
        self.labels = torch.tensor(self.labels, dtype=torch.long)

        print(f"Dataset created with data shape: {self.data.shape} and dtype: {self.data.dtype}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [74]:
class HolographicAttention(nn.Module):
    def __init__(self,
                 d_model,
                 num_heads=8,
                 min_scale=1.0,
                 max_scale=10.0,
                 num_scales=4,
                 dropout=0.1):
        super().__init__()
        self.debug_mode = False

        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.dropout = nn.Dropout(p=dropout)

        # Create log-spaced scales
        self.register_buffer(
            'scales',
            torch.exp(torch.linspace(
                math.log(min_scale),
                math.log(max_scale),
                num_scales
            ))
        )

        # Create scale-dependent projections
        self.scale_projections = nn.ModuleList([
            nn.Linear(d_model, d_model)
            for _ in range(len(self.scales))
        ])

        # Standard attention projections
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)

        # Scale mixing with smaller initialization
        self.scale_mixing = nn.Parameter(
            torch.randn(len(self.scales), len(self.scales)) * 0.02
        )

        self.reset_parameters()

    def reset_parameters(self):
        """Initialize parameters ensuring different random states"""
        for i, projection in enumerate(self.scale_projections):
            scale_factor = math.sqrt(self.scales[i])
            with torch.random.fork_rng():
                torch.manual_seed(i)  # Different seed per projection
                nn.init.xavier_uniform_(projection.weight, gain=scale_factor)
                nn.init.zeros_(projection.bias)  # Initialize biases to zero explicitly

            if self.debug_mode:
                print(f"Scale {i} (z={scale:.2f}) initialization gain: {gain:.4f}")

        # Initialize standard projections
        nn.init.xavier_uniform_(self.W_Q.weight, gain=1.0)
        nn.init.xavier_uniform_(self.W_K.weight, gain=1.0)
        nn.init.xavier_uniform_(self.W_V.weight, gain=1.0)
        nn.init.zeros_(self.W_Q.bias)
        nn.init.zeros_(self.W_K.bias)
        nn.init.zeros_(self.W_V.bias)

    def _compute_geometric_mask(self, size, z):
        """Compute geometric mask with proper distance scaling"""
        positions = torch.arange(size, dtype=torch.float, device=z.device)
        distances = torch.abs(positions.unsqueeze(-1) - positions.unsqueeze(0))
        # Scale distances by z and sequence length
        scaled_distances = distances / (z * math.sqrt(size))
        mask = torch.exp(-scaled_distances)

        if self.debug_mode:
            print(f"Geometric mask stats for z={z.item():.2f}:")
            print(f"  Mean: {mask.mean().item():.4f}")
            print(f"  Std: {mask.std().item():.4f}")
            print(f"  Max: {mask.max().item():.4f}")
            print(f"  Min: {mask.min().item():.4f}")

        return mask

    def compute_geometric_attention(self, Q, K, V, scale_idx, mask=None):
        """Compute attention with geometric scaling applied to queries and keys"""
        B, H, L, E = Q.shape
        z = self.scales[scale_idx]

        # Apply scale-dependent projection BEFORE computing attention
        Q = Q * (1.0 / z.sqrt())
        K = K * (1.0 / z.sqrt())

        # Compute attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1))

        # Apply geometric mask with proper scaling
        geometric_mask = self._compute_geometric_mask(L, z)
        scores = scores * geometric_mask.unsqueeze(0).unsqueeze(0)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        if self.debug_mode:
            print(f"Attention scores stats:")
            print(f"  Mean: {scores.mean().item():.4f}")
            print(f"  Std: {scores.std().item():.4f}")

        # Apply softmax and dropout
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # Compute output
        output = torch.matmul(attention_weights, V)

        if self.debug_mode:
            print(f"Output stats:")
            print(f"  Mean: {output.mean().item():.4f}")
            print(f"  Std: {output.std().item():.4f}")

        return output, attention_weights

    def forward(self, x, mask=None):
        """Forward pass with improved scale mixing and debugging"""
        batch_size, seq_len, _ = x.shape

        if self.debug_mode:
            print(f"\nHolographicAttention Forward Pass")
            print(f"Input shape: {x.shape}")

        # Project inputs for each scale
        Q = self.W_Q(x)
        K = self.W_K(x)
        V = self.W_V(x)

        # Reshape for multi-head attention
        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Compute attention at each scale
        scale_outputs = []
        attention_maps = []

        # Compute scale importance dynamically
        scale_importance = F.softmax(self.scale_mixing.mean(dim=1) / 0.1, dim=0)  # temperature of 0.1

        if self.debug_mode:
            print("\nScale importance weights:")
            print(scale_importance.detach().cpu().numpy())

        for i, z in enumerate(self.scales):
            if self.debug_mode:
                print(f"\nProcessing scale {i} (z={z.item():.2f})")

            # Apply scale-specific projection
            Q_z = self.scale_projections[i](x)
            Q_z = Q_z.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

            # Scale the queries and keys before attention
            Q_z = Q_z * (1.0 / z.sqrt())
            K_z = Q_z  # Use same scaling for keys
            V_z = Q_z  # Keep values unscaled

            # Compute attention
            output, attention = self.compute_geometric_attention(Q_z, K_z, V_z, i, mask)

            # Scale the output back up proportional to the scale
            output = output * z.sqrt()

            # Reshape output
            output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)

            # Store scaled outputs and attention maps
            scale_outputs.append(output)
            attention_maps.append(attention)

        # Combine outputs using scale importance
        output = torch.zeros_like(scale_outputs[0])
        for i, out in enumerate(scale_outputs):
            output = output + scale_importance[i] * out

        if self.debug_mode:
            print("\nFinal output stats:")
            print(f"Mean: {output.mean().item():.4f}")
            print(f"Std: {output.std().item():.4f}")

        return output, attention_maps

    def compute_information_metrics(self, attention_maps):
        """Compute attention pattern analysis metrics"""
        metrics = {}

        # Analyze each scale's attention pattern
        for i, attn_map in enumerate(attention_maps):
            # Compute attention entropy
            entropy = -(attn_map * torch.log(attn_map + 1e-10)).sum(dim=-1).mean()
            metrics[f'scale_{i}_entropy'] = entropy.item()

            # Compute attention sparsity
            sparsity = (attn_map < 0.01).float().mean()
            metrics[f'scale_{i}_sparsity'] = sparsity.item()

            # Compute attention concentration
            concentration = attn_map.max(dim=-1)[0].mean()
            metrics[f'scale_{i}_concentration'] = concentration.item()

        return metrics

In [75]:
class TransformerLayer(nn.Module):
    def __init__(self, attention, d_model: int, dropout: float = 0.1):
        super().__init__()
        self.attention = attention
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Improved feed-forward network with activation dropout
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout)
        )

        # Output dropout
        self.dropout = nn.Dropout(dropout)

        # Initialize feed-forward layers with scaled initialization
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize feed-forward parameters with proper scaling"""
        for layer in self.ff:
            if isinstance(layer, nn.Linear):
                # Use smaller initialization for stability
                nn.init.xavier_uniform_(layer.weight, gain=0.1)
                nn.init.zeros_(layer.bias)

    def forward(self, x, mask=None):
        """Forward pass with improved residual connections"""
        # Store original input for main residual
        identity = x

        # Attention block with pre-norm
        normalized = self.norm1(x)
        attn_out, attention_maps = self.attention(normalized, mask=mask)
        attn_out = self.dropout(attn_out)
        x = identity + attn_out

        # Feed-forward block with pre-norm
        identity = x
        normalized = self.norm2(x)
        ff_out = self.ff(normalized)
        x = identity + ff_out

        return x, attention_maps

In [76]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term)
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term)

        # Add batch dimension and register as buffer (won't be trained)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

        # Store model dimension for scaling
        self.d_model = d_model

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        # Add positional encoding scaled by sqrt(d_model)
        x = x + self.pe[:x.size(0), :] * math.sqrt(self.d_model)
        return self.dropout(x)

    def debug_encoding(self, seq_len=100):
        """
        Debug method to visualize positional encodings
        """
        encoding = self.pe[:seq_len, 0, :].detach().cpu().numpy()
        print(f"Positional encoding shape: {encoding.shape}")
        print(f"Mean value: {encoding.mean():.4f}")
        print(f"Std deviation: {encoding.std():.4f}")
        print(f"Min value: {encoding.min():.4f}")
        print(f"Max value: {encoding.max():.4f}")

In [77]:
class HolographicTransformer(pl.LightningModule):
    def __init__(
        self,
        d_model: int = 256,
        num_layers: int = 6,
        num_heads: int = 8,
        dropout: float = 0.1,
        learning_rate: float = 1e-4
    ):
        super().__init__()
        self.save_hyperparameters()

        # Input projection instead of embedding
        self.input_projection = nn.Linear(1, d_model)
        self.pos_encoding = PositionalEncoding(d_model, dropout)

        # Holographic transformer layers
        self.layers = nn.ModuleList([
            TransformerLayer(
                attention=HolographicAttention(
                    d_model=d_model,
                    num_heads=num_heads,
                    dropout=dropout
                ),
                d_model=d_model,
                dropout=dropout
            ) for _ in range(num_layers)
        ])

        # Output head
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        self.learning_rate = learning_rate

    def forward(self, x):
        #print(f"Input shape before any processing: {x.shape}")
        x = x.to(torch.float32)
        if len(x.shape) == 2:
            x = x.unsqueeze(-1)  # Add feature dimension if missing
        #print(f"Input shape after reshaping: {x.shape}")

        # Project input from (batch, seq_len, 1) to (batch, seq_len, d_model)
        x = self.input_projection(x)
        #print(f"Shape after projection: {x.shape}")
        x = self.pos_encoding(x)

        # Process through transformer layers
        attention_maps = []
        for layer in self.layers:
            x, attn = layer(x)
            attention_maps.append(attn)

        # Final normalization
        x = self.norm(x)

        # Global average pooling
        x = x.mean(dim=1)

        # Classification
        logits = self.classifier(x)
        return logits, attention_maps

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits, _ = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=-1) == y).float().mean()

        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits, attention_maps = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=-1) == y).float().mean()

        self.log('val_loss', loss)
        self.log('val_acc', acc)

        if batch_idx == 0:
            self.last_attention_maps = attention_maps

        return {'val_loss': loss, 'val_acc': acc}

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits, attention_maps = self(x)
        predictions = logits.argmax(dim=-1)
        acc = (predictions == y).float().mean()

        # Log metrics
        metrics = {
            'test_acc': acc.item(),
            'test_logits_mean': logits.mean().item(),
            'test_logits_std': logits.std().item()
        }
        self.log_dict(metrics)
        return metrics

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [78]:
class StandardTransformer(pl.LightningModule):
    def __init__(self, d_model=256, num_layers=6, num_heads=8, dropout=0.1, learning_rate=1e-4):
        super().__init__()
        self.save_hyperparameters()


        # Input projection instead of embedding
        self.input_projection = nn.Linear(1, d_model)
        self.pos_encoding = PositionalEncoding(d_model, dropout)

        # Standard transformer layers
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=num_heads,
                dim_feedforward=4*d_model,
                dropout=dropout,
                batch_first=True
            ) for _ in range(num_layers)
        ])

        self.classifier = nn.Linear(d_model, 2)
        self.learning_rate = learning_rate

    def forward(self, x):
        #print(f"Input shape before any processing: {x.shape}")
        x = x.to(torch.float32)
        if len(x.shape) == 2:
            x = x.unsqueeze(-1)  # Add feature dimension if missing
        #print(f"Input shape after reshaping: {x.shape}")
        # Project input from (batch, seq_len, 1) to (batch, seq_len, d_model)
        x = self.input_projection(x)
        #print(f"Shape after projection: {x.shape}")
        x = self.pos_encoding(x)

        for layer in self.layers:
            x = layer(x)

        x = x.mean(dim=1)  # Global average pooling
        return self.classifier(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=-1) == y).float().mean()

        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=-1) == y).float().mean()

        self.log('val_loss', loss)
        self.log('val_acc', acc)
        return {'val_loss': loss, 'val_acc': acc}

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        predictions = logits.argmax(dim=-1)
        acc = (predictions == y).float().mean()

        # Log metrics
        metrics = {
            'test_acc': acc.item(),
            'test_logits_mean': logits.mean().item(),
            'test_logits_std': logits.std().item()
        }
        self.log_dict(metrics)
        return metrics

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [79]:
def run_comparative_experiment(
    model_config: Dict[str, Any],
    train_config: Dict[str, Any],
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    test_loader: torch.utils.data.DataLoader
):
    """Run comparative experiment between holographic and standard attention"""

    print("Initializing models with different seeds...")
    def set_seeds(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True

    # Create models with different seeds
    set_seeds(42)
    holo_model = HolographicTransformer(**model_config)

    set_seeds(43)
    std_model = StandardTransformer(**model_config)

    print("Model comparison:")
    print(f"Holographic model id: {id(holo_model)}")
    print(f"Standard model id: {id(std_model)}")

    # Enable debug mode for one batch
    for layer in holo_model.layers:
        layer.attention.debug_mode = True

    # Run one batch in debug mode
    print("\nRunning debug pass on holographic model...")
    with torch.no_grad():
        test_batch = next(iter(test_loader))
        _ = holo_model(test_batch[0])

    # Disable debug mode for training
    for layer in holo_model.layers:
        layer.attention.debug_mode = False

    print("\nStarting training...")

    # Train holographic model
    print("\nTraining Holographic Model...")
    start_time = time.time()
    holo_trainer = pl.Trainer(
        max_epochs=train_config['max_epochs'],
        accelerator='auto',
        devices=1
    )
    holo_trainer.fit(holo_model, train_loader, val_loader)
    holo_time = time.time() - start_time

    # Train standard model
    print("\nTraining Standard Attention Model...")
    start_time = time.time()
    std_trainer = pl.Trainer(
        max_epochs=train_config['max_epochs'],
        accelerator='auto',
        devices=1
    )
    std_trainer.fit(std_model, train_loader, val_loader)
    std_time = time.time() - start_time

    # Validate model differences
    print("\nValidating model differences...")
    with torch.no_grad():
        test_batch = next(iter(test_loader))
        x, y = test_batch

        holo_logits, _ = holo_model(x)
        std_logits = std_model(x)

        holo_preds = holo_logits.argmax(dim=-1)
        std_preds = std_logits.argmax(dim=-1)

        print("Prediction comparison:")
        print(f"Number of samples: {len(y)}")
        print(f"Number of matching predictions: {(holo_preds == std_preds).sum().item()}")
        print(f"Raw prediction correlation: {(holo_logits * std_logits).mean().item():.4f}")
        print("\nSample predictions:")
        for i in range(min(5, len(y))):
            print(f"Sample {i}: Holo={holo_preds[i].item()}, Std={std_preds[i].item()}, True={y[i].item()}")

    # Test both models
    print("\nTesting Models...")
    holo_test = holo_trainer.test(holo_model, test_loader)
    std_test = std_trainer.test(std_model, test_loader)

    # Memory usage comparison
    print("\nMeasuring Memory Usage...")
    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        _ = holo_model(next(iter(test_loader))[0].to(holo_model.device))
    holo_memory = torch.cuda.max_memory_allocated()

    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        _ = std_model(next(iter(test_loader))[0].to(std_model.device))
    std_memory = torch.cuda.max_memory_allocated()

    # Compile results
    results = {
        'holographic': {
            'test_metrics': holo_trainer.callback_metrics,
            'training_time': holo_time,
            'peak_memory': holo_memory
        },
        'standard': {
            'test_metrics': std_trainer.callback_metrics,
            'training_time': std_time,
            'peak_memory': std_memory
        }
    }

    # Create visualizations
    plot_comparative_results(results)

    return results

# Helper function for plotting
def plot_comparative_results(results):
    """Create visualizations comparing the models"""
    plt.figure(figsize=(15, 5))

    # Plot test accuracy
    if 'test_acc' in results['holographic']['test_metrics']:
        plt.subplot(131)
        values = [
            results['holographic']['test_metrics']['test_acc'],
            results['standard']['test_metrics']['test_acc']
        ]
        plt.bar(['Holographic', 'Standard'], values)
        plt.title('Test Accuracy')
        plt.xticks(rotation=45)

    # Plot training time
    plt.subplot(132)
    values = [
        results['holographic']['training_time'],
        results['standard']['training_time']
    ]
    plt.bar(['Holographic', 'Standard'], values)
    plt.title('Training Time (s)')
    plt.xticks(rotation=45)

    # Plot memory usage
    plt.subplot(133)
    values = [
        results['holographic']['peak_memory'] / 1e6,  # Convert to MB
        results['standard']['peak_memory'] / 1e6
    ]
    plt.bar(['Holographic', 'Standard'], values)
    plt.title('Peak Memory Usage (MB)')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig('comparative_results.png')
    plt.close()

    # Print numerical results with proper handling of N/A values
    print("\nNumerical Results:")
    print("\nTest Accuracy:")
    holo_acc = results['holographic']['test_metrics'].get('test_acc', 'N/A')
    std_acc = results['standard']['test_metrics'].get('test_acc', 'N/A')
    print(f"Holographic: {holo_acc if isinstance(holo_acc, str) else f'{holo_acc:.4f}'}")
    print(f"Standard: {std_acc if isinstance(std_acc, str) else f'{std_acc:.4f}'}")

    print("\nTraining Time:")
    print(f"Holographic: {results['holographic']['training_time']:.2f} seconds")
    print(f"Standard: {results['standard']['training_time']:.2f} seconds")

    print("\nPeak Memory Usage:")
    print(f"Holographic: {results['holographic']['peak_memory']/1e6:.2f} MB")
    print(f"Standard: {results['standard']['peak_memory']/1e6:.2f} MB")

In [80]:
# Model configuration
model_config = {
    'd_model': 256,
    'num_layers': 6,
    'num_heads': 8,
    'dropout': 0.1,
    'learning_rate': 1e-4,
}

# Training configuration
train_config = {
    'max_epochs': 10,
    'batch_size': 32
}

# Create dataloaders
train_loader, val_loader, test_loader = create_dataloaders(
    batch_size=train_config['batch_size']
)



# Run experiment
results = run_comparative_experiment(
    model_config=model_config,
    train_config=train_config,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader
)

Initializing models with different seeds...
Model comparison:
Holographic model id: 139063818777648
Standard model id: 139063818769152

Running debug pass on holographic model...

HolographicAttention Forward Pass
Input shape: torch.Size([32, 128, 256])

Scale importance weights:
[0.24366884 0.25526315 0.24436373 0.25670433]

Processing scale 0 (z=1.00)
Geometric mask stats for z=1.00:
  Mean: 0.1613
  Std: 0.2423
  Max: 1.0000
  Min: 0.0000
Attention scores stats:
  Mean: 4.7662
  Std: 7.5818
Output stats:
  Mean: 0.0087
  Std: 1.0902

Processing scale 1 (z=2.15)
Geometric mask stats for z=2.15:
  Mean: 0.3088
  Std: 0.2776
  Max: 1.0000
  Min: 0.0055
Attention scores stats:
  Mean: 3.9747
  Std: 3.8625
Output stats:
  Mean: 0.0350
  Std: 1.0299

Processing scale 2 (z=4.64)
Geometric mask stats for z=4.64:
  Mean: 0.5133
  Std: 0.2516
  Max: 1.0000
  Min: 0.0891
Attention scores stats:
  Mean: 3.0417
  Std: 1.8449
Output stats:
  Mean: 0.0544
  Std: 0.9579

Processing scale 3 (z=10.00

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name             | Type               | Params | Mode 
----------------------------------------------------------------
0 | input_projection | Linear             | 512    | train
1 | pos_encoding     | PositionalEncoding | 0      | train
2 | layers           | ModuleList         | 5.9 M  | train
3 | norm             | LayerNorm          | 512    | train
4 | classifier       | Linear             | 514    | train
----------------------------------------------------------------
5.9 M     Trainable params
0         Non-trainable params
5.9 M     Total params
23.698    Total estimated model params size (MB)
126       


Starting training...

Training Holographic Model...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name             | Type               | Params | Mode 
----------------------------------------------------------------
0 | input_projection | Linear             | 512    | train
1 | pos_encoding     | PositionalEncoding | 0      | train
2 | layers           | ModuleList         | 4.7 M  | train
3 | classifier       | Linear             | 514    | train
----------------------------------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.958    Total estimated m


Training Standard Attention Model...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Validating model differences...


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Prediction comparison:
Number of samples: 32
Number of matching predictions: 32
Raw prediction correlation: 0.7194

Sample predictions:
Sample 0: Holo=0, Std=0, True=0
Sample 1: Holo=0, Std=0, True=0
Sample 2: Holo=0, Std=0, True=0
Sample 3: Holo=0, Std=0, True=0
Sample 4: Holo=0, Std=0, True=1

Testing Models...


Testing: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]


Measuring Memory Usage...

Numerical Results:

Test Accuracy:
Holographic: 0.8300
Standard: 0.8300

Training Time:
Holographic: 52.91 seconds
Standard: 31.51 seconds

Peak Memory Usage:
Holographic: 805.34 MB
Standard: 805.34 MB
