# Diffusion-TS: Генерация синтетических финансовых данных

Diffusion-TS (ICLR 2024) — это современная архитектура для генерации синтетических временных рядов, основанная на декомпозиции ряда на тренд и сезонность.

## Применения в финансах:

1. **Data Augmentation**: Увеличение обучающей выборки для ML моделей
2. **Scenario Generation**: Генерация сценариев для стресс-тестирования
3. **Privacy-preserving**: Создание синтетических данных без раскрытия реальных
4. **Backtesting**: Генерация дополнительных исторических данных

## Ключевые особенности Diffusion-TS:

- **Decomposition**: Разделение на тренд, сезонность и шум
- **Interpretable Diffusion**: Понятный процесс генерации
- **Conditional Generation**: Генерация с условиями (класс, метки)

**Статья**: Yuan & Qiao, "Diffusion-TS: Interpretable Diffusion for General Time Series Generation" (ICLR 2024)

In [None]:
# Установка зависимостей
!pip install torch numpy pandas matplotlib seaborn scikit-learn tqdm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Optional, Dict, List
from dataclasses import dataclass
from tqdm import tqdm
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Figure appearance: base style first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

## 1. Создание реалистичных финансовых данных

In [None]:
class FinancialDataGenerator:
    """Generator of synthetic financial price series.

    Every method draws from NumPy's global random state, which ``__init__``
    seeds. Results therefore depend on the seed and on the order in which the
    methods are called.
    """

    def __init__(self, seed: int = 42):
        # Seeds the global NumPy RNG (not a private generator).
        np.random.seed(seed)

    def generate_gbm(
        self,
        n_samples: int,
        seq_len: int,
        mu: float = 0.0001,
        sigma: float = 0.02,
        s0: float = 100
    ) -> np.ndarray:
        """
        Simulate geometric Brownian motion price paths.

        Args:
            n_samples: number of independent paths.
            seq_len: number of time steps per path.
            mu: drift per step.
            sigma: volatility per step.
            s0: price the log-returns are accumulated from.

        Returns:
            Array of shape (n_samples, seq_len); the first column is the
            price after the first step, not ``s0`` itself.
        """
        dt = 1.0
        log_returns = np.random.normal(
            (mu - 0.5 * sigma**2) * dt,
            sigma * np.sqrt(dt),
            (n_samples, seq_len)
        )
        return s0 * np.exp(np.cumsum(log_returns, axis=1))

    def generate_with_jumps(
        self,
        n_samples: int,
        seq_len: int,
        jump_intensity: float = 0.02,
        jump_size_mean: float = 0,
        jump_size_std: float = 0.05
    ) -> np.ndarray:
        """
        Simulate GBM with Poisson jumps (Merton jump-diffusion).

        Args:
            n_samples: number of independent paths.
            seq_len: number of time steps per path.
            jump_intensity: expected number of jumps per step.
            jump_size_mean: mean log-size of a single jump.
            jump_size_std: standard deviation of a single jump's log-size.

        Returns:
            Array of shape (n_samples, seq_len) of prices.
        """
        prices = self.generate_gbm(n_samples, seq_len)

        jump_counts = np.random.poisson(jump_intensity, (n_samples, seq_len))
        shocks = np.random.normal(0.0, 1.0, (n_samples, seq_len))

        # The sum of N i.i.d. N(m, s^2) jumps is N(N*m, N*s^2). Scaling one
        # draw by N would give variance N^2*s^2 whenever a step has N >= 2.
        jumps = (
            jump_counts * jump_size_mean
            + np.sqrt(jump_counts) * jump_size_std * shocks
        )

        return prices * np.exp(np.cumsum(jumps, axis=1))

    def generate_regime_switching(
        self,
        n_samples: int,
        seq_len: int,
        regimes: Optional[List[Dict]] = None,
        switch_prob: float = 0.05,
        s0: float = 100
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Simulate prices whose drift and volatility follow a switching regime.

        Each path starts in regime 0. At every step the regime advances to
        the next one in ``regimes`` (wrapping around) with probability
        ``switch_prob``; with two regimes this is a bull/bear toggle.

        Args:
            n_samples: number of independent paths.
            seq_len: number of time steps per path.
            regimes: dicts with keys 'mu' and 'sigma'. A 'prob' key is
                accepted but not read. Defaults to a bull and a bear regime.
            switch_prob: per-step probability of leaving the current regime.
            s0: starting price of every path.

        Returns:
            (prices, regime_labels), both of shape (n_samples, seq_len).

        Raises:
            ValueError: if ``regimes`` is empty.
        """
        if regimes is None:
            regimes = [
                {'mu': 0.001, 'sigma': 0.01, 'prob': 0.6},  # Bull
                {'mu': -0.001, 'sigma': 0.03, 'prob': 0.4}  # Bear
            ]
        if len(regimes) == 0:
            raise ValueError("regimes must contain at least one regime")

        n_regimes = len(regimes)
        prices = np.zeros((n_samples, seq_len))
        regime_labels = np.zeros((n_samples, seq_len), dtype=int)

        # Kept as explicit loops: one uniform and one normal draw per step,
        # in this order, so the random stream is consumed deterministically.
        for i in range(n_samples):
            current_regime = 0
            price = s0

            for t in range(seq_len):
                if np.random.random() < switch_prob:
                    # Cycle instead of `1 - current_regime`, which can never
                    # reach a third regime.
                    current_regime = (current_regime + 1) % n_regimes

                regime = regimes[current_regime]
                price = price * np.exp(
                    np.random.normal(regime['mu'], regime['sigma'])
                )

                prices[i, t] = price
                regime_labels[i, t] = current_regime

        return prices, regime_labels

    def generate_correlated_assets(
        self,
        n_samples: int,
        seq_len: int,
        n_assets: int = 5,
        correlation: float = 0.5
    ) -> np.ndarray:
        """
        Simulate jointly log-normal prices of equally correlated assets.

        Every asset has per-step volatility 0.02 and mean log-return 0.0001,
        and starts from a base price of 100.

        Args:
            n_samples: number of independent scenarios.
            seq_len: number of time steps per scenario.
            n_assets: number of assets per scenario.
            correlation: pairwise correlation between asset returns.

        Returns:
            Array of shape (n_samples, seq_len, n_assets) of prices.
        """
        # Equicorrelation covariance: sigma^2 on the diagonal and
        # correlation * sigma^2 elsewhere.
        cov = np.full((n_assets, n_assets), correlation * 0.02**2)
        np.fill_diagonal(cov, 0.02**2)

        prices = np.zeros((n_samples, seq_len, n_assets))

        for i in range(n_samples):
            returns = np.random.multivariate_normal(
                np.full(n_assets, 0.0001),
                cov,
                seq_len
            )
            prices[i] = 100 * np.exp(np.cumsum(returns, axis=0))

        return prices

# Build the raw datasets. All generator methods draw from the same global
# NumPy random state, so the order of the calls below determines the data.
n_samples, seq_len = 1000, 100
generator = FinancialDataGenerator(seed=42)

# Plain GBM: the full sample count
gbm_data = generator.generate_gbm(n_samples, seq_len)

# GBM with jumps: half the sample count
jump_data = generator.generate_with_jumps(n_samples // 2, seq_len)

# Regime switching: half the sample count, plus per-step regime labels
regime_data, regime_labels = generator.generate_regime_switching(
    n_samples // 2, seq_len
)

for _name, _array in (("GBM", gbm_data), ("Jump", jump_data), ("Regime", regime_data)):
    print(f"{_name} data: {_array.shape}")

In [None]:
# Overview of the simulated datasets: sample paths and return distributions
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax1, ax2, ax3, ax4 = axes.ravel()

# Top-left: ten GBM paths
for path in gbm_data[:10]:
    ax1.plot(path, alpha=0.7)
ax1.set_title('Geometric Brownian Motion')
ax1.set_xlabel('Time')
ax1.set_ylabel('Price')

# Top-right: ten jump-diffusion paths
for path in jump_data[:10]:
    ax2.plot(path, alpha=0.7)
ax2.set_title('Jump Diffusion (Merton Model)')
ax2.set_xlabel('Time')
ax2.set_ylabel('Price')

# Bottom-left: five regime-switching paths (labelled, no legend is drawn)
for number, path in enumerate(regime_data[:5], start=1):
    ax3.plot(path, alpha=0.7, label=f'Sample {number}')
ax3.set_title('Regime Switching')
ax3.set_xlabel('Time')
ax3.set_ylabel('Price')

# Bottom-right: pooled log-return histograms of the GBM and jump datasets
gbm_returns = np.diff(np.log(gbm_data), axis=1).flatten()
jump_returns = np.diff(np.log(jump_data), axis=1).flatten()

for pooled, name in ((gbm_returns, 'GBM'), (jump_returns, 'Jump')):
    ax4.hist(pooled, bins=100, alpha=0.5, label=name, density=True)
ax4.set_title('Return Distributions')
ax4.set_xlabel('Log Return')
ax4.set_ylabel('Density')
ax4.legend()
ax4.set_xlim(-0.1, 0.1)

plt.tight_layout()
plt.show()

## 2. Подготовка данных для обучения

In [None]:
# Stack the three datasets row-wise (GBM first, then jump, then regime)
all_data = np.concatenate((gbm_data, jump_data, regime_data), axis=0)

# Convert prices to log returns; the small epsilon guards against log(0)
log_prices = np.log(all_data + 1e-10)
log_returns = np.diff(log_prices, axis=1)

# Standardise with a single global mean and standard deviation
mean, std = log_returns.mean(), log_returns.std()
normalized_data = (log_returns - mean) / std

print(f"Data shape: {normalized_data.shape}")
print(f"Mean: {normalized_data.mean():.6f}, Std: {normalized_data.std():.6f}")

In [None]:
@dataclass
class DiffusionTSConfig:
    """Hyper-parameters of the Diffusion-TS model.

    Raises:
        ValueError: from ``__post_init__`` when a size is not positive or
            ``hidden_dim`` is not divisible by ``num_heads``.
    """
    seq_len: int = 99  # series length after differencing prices into returns
    hidden_dim: int = 128  # transformer width
    num_heads: int = 4  # attention heads per transformer block
    num_layers: int = 4  # split evenly between encoder and decoder stacks
    diffusion_steps: int = 100  # number of diffusion steps T
    # NOTE(review): the four fields below are not read by the model code in
    # this notebook (it builds a cosine schedule) — confirm whether they
    # should be wired in.
    beta_start: float = 0.0001
    beta_end: float = 0.02
    # Decomposition parameters
    trend_poly_degree: int = 3
    seasonal_periods: Optional[List[int]] = None

    def __post_init__(self):
        # None means "use the defaults"; a fresh list is built per instance.
        if self.seasonal_periods is None:
            self.seasonal_periods = [24, 12]

        # Fail early with a clear message instead of deep inside torch.
        for field_name in ('seq_len', 'hidden_dim', 'num_heads', 'diffusion_steps'):
            if getattr(self, field_name) < 1:
                raise ValueError(f"{field_name} must be a positive integer")
        if self.num_layers < 0:
            raise ValueError("num_layers must be non-negative")
        if self.hidden_dim % self.num_heads != 0:
            raise ValueError("hidden_dim must be divisible by num_heads")

config = DiffusionTSConfig()
print(f"Config: {config}")

In [None]:
from torch.utils.data import Dataset, DataLoader

class TimeSeriesDataset(Dataset):
    """Dataset over a 2-D array of series, one float32 item per row."""

    def __init__(self, data: np.ndarray):
        series = torch.tensor(data, dtype=torch.float32)
        # Trailing channel axis: [N, seq_len] -> [N, seq_len, 1]
        self.data = series[..., None]

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        return self.data[idx]

# Train/test split (80/20).
# normalized_data is still ordered by generator (GBM rows, then jump rows,
# then regime-switching rows), so slicing off the tail would leave only
# regime-switching series in the test set. Shuffle the row order first, with
# a dedicated seeded generator that leaves the global NumPy stream untouched.
n = len(normalized_data)
train_size = int(0.8 * n)

split_order = np.random.default_rng(42).permutation(n)
train_data = normalized_data[split_order[:train_size]]
test_data = normalized_data[split_order[train_size:]]

train_dataset = TimeSeriesDataset(train_data)
test_dataset = TimeSeriesDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")

## 3. Diffusion-TS Architecture

Diffusion-TS использует interpretable decomposition:

$$x = \text{trend}(x) + \text{seasonal}(x) + \text{residual}(x)$$

Диффузия применяется к residual-компоненте, то есть к тому, что остаётся от ряда после вычитания тренда и сезонности.

In [None]:
class TrendDecomposition(nn.Module):
    """Split a series into a moving-average trend and the remainder.

    The moving average is a fixed ``AvgPool1d``; it has no learnable
    parameters.
    """

    def __init__(self, kernel_size: int = 25):
        super().__init__()
        self.kernel_size = kernel_size
        # count_include_pad=False averages over real samples only. With the
        # default (True) the zero padding is counted, which pulls the trend
        # toward 0 at both ends of the series.
        self.avg = nn.AvgPool1d(
            kernel_size,
            stride=1,
            padding=kernel_size // 2,
            count_include_pad=False,
        )

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: [batch, seq_len, channels]
        Returns:
            (trend, residual), both shaped like ``x``, with
            ``trend + residual == x``.
        """
        # AvgPool1d pools over the last axis: [batch, channels, seq_len]
        pooled = self.avg(x.transpose(1, 2))

        # An even kernel_size yields seq_len + 1 outputs; crop to seq_len.
        trend = pooled[:, :, :x.size(1)].transpose(1, 2)

        return trend, x - trend

In [None]:
class SeasonalDecomposition(nn.Module):
    """Seasonal term given by a learned Fourier series over the window.

    The seasonal curve depends only on the learned coefficients, not on the
    input; the input supplies the batch size, channel count and device.
    """

    def __init__(self, seq_len: int, n_harmonics: int = 5):
        super().__init__()
        self.seq_len = seq_len
        self.n_harmonics = n_harmonics

        # One cosine and one sine amplitude per harmonic, initialised near 0
        self.cos_weights = nn.Parameter(torch.randn(n_harmonics) * 0.01)
        self.sin_weights = nn.Parameter(torch.randn(n_harmonics) * 0.01)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: [batch, seq_len, channels]
        Returns:
            (seasonal, residual) where ``residual = x - seasonal``.
        """
        n_batch, n_channels = x.shape[0], x.size(-1)

        # Positions scaled to [0, 1) so harmonic k completes k cycles
        phase = torch.arange(self.seq_len, device=x.device).float() / self.seq_len

        # Accumulate harmonics 1..n_harmonics, cosine term before sine term
        wave = torch.zeros(self.seq_len, device=x.device)
        for harmonic in range(1, self.n_harmonics + 1):
            angle = harmonic * 2 * np.pi * phase
            wave = wave + self.cos_weights[harmonic - 1] * torch.cos(angle)
            wave = wave + self.sin_weights[harmonic - 1] * torch.sin(angle)

        # Broadcast the single curve across batch and channels
        seasonal = wave[None, :, None].expand(n_batch, -1, n_channels)

        return seasonal, x - seasonal

In [None]:
class TimeEmbedding(nn.Module):
    """Sinusoidal embedding of the diffusion step followed by a small MLP."""

    def __init__(self, dim: int):
        super().__init__()
        if dim < 1:
            raise ValueError("dim must be a positive integer")
        self.dim = dim
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Linear(dim * 4, dim)
        )

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """
        Args:
            t: [batch] diffusion steps as floats.
        Returns:
            [batch, dim] embedding.
        """
        half_dim = self.dim // 2
        # half_dim - 1 is 0 for dim in {2, 3}; without the guard the
        # division yields inf and the embedding becomes NaN.
        log_step = np.log(10000) / max(half_dim - 1, 1)
        freqs = torch.exp(torch.arange(half_dim, device=t.device) * -log_step)
        angles = t[:, None] * freqs[None, :]
        emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
        # sin/cos give 2 * half_dim features; for odd dim append one zero
        # column so the width matches the first Linear layer.
        if emb.shape[-1] < self.dim:
            emb = F.pad(emb, (0, self.dim - emb.shape[-1]))
        return self.mlp(emb)

In [None]:
class ResidualBlock(nn.Module):
    """Two-layer per-position MLP block with a skip path and time modulation."""

    def __init__(self, in_dim: int, out_dim: int, time_dim: int):
        super().__init__()

        self.norm1 = nn.LayerNorm(in_dim)
        self.conv1 = nn.Linear(in_dim, out_dim)
        self.norm2 = nn.LayerNorm(out_dim)
        self.conv2 = nn.Linear(out_dim, out_dim)

        # Produces a (scale, shift) pair from the time embedding
        self.time_proj = nn.Linear(time_dim, out_dim * 2)

        # Project the skip path only when the feature width changes
        self.skip = nn.Identity() if in_dim == out_dim else nn.Linear(in_dim, out_dim)

    def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, seq_len, in_dim]
            t_emb: [batch, time_dim]
        Returns:
            [batch, seq_len, out_dim]
        """
        hidden = F.gelu(self.conv1(self.norm1(x)))

        # Scale/shift from the time embedding, broadcast over sequence positions
        scale, shift = torch.chunk(self.time_proj(t_emb), 2, dim=-1)
        hidden = hidden * (1 + scale[:, None]) + shift[:, None]

        hidden = F.gelu(self.conv2(self.norm2(hidden)))

        return self.skip(x) + hidden

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm self-attention block whose feed-forward input is time-modulated."""

    def __init__(self, dim: int, num_heads: int, time_dim: int):
        super().__init__()

        # Attention sub-layer
        self.norm1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

        # Feed-forward sub-layer
        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Linear(dim * 4, dim)
        )

        # Produces a (scale, shift) pair from the time embedding
        self.time_proj = nn.Linear(time_dim, dim * 2)

    def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, seq_len, dim]
            t_emb: [batch, time_dim]
        Returns:
            Tensor shaped like ``x``.
        """
        # Residual self-attention over the normalised sequence
        normed = self.norm1(x)
        attended, _ = self.attn(normed, normed, normed)
        x = x + attended

        # Scale/shift the normalised activations, then residual feed-forward
        scale, shift = self.time_proj(t_emb).chunk(2, dim=-1)
        modulated = self.norm2(x) * (1 + scale[:, None]) + shift[:, None]

        return x + self.ffn(modulated)

In [None]:
class DiffusionTS(nn.Module):
    """Diffusion-TS: Interpretable Diffusion for Time Series Generation.

    Simplified implementation: ``compute_loss`` removes a moving-average
    trend and a learned Fourier seasonal term from the input, diffuses the
    remaining residual and trains a transformer stack to predict the added
    noise. ``sample`` runs the reverse DDPM chain starting from pure noise.

    NOTE(review): ``sample`` returns the output of the reverse chain as is;
    no trend or seasonal component is added back, so samples live in the
    residual space the denoiser was trained on. Confirm this is intended
    before comparing them with un-decomposed data.
    """

    def __init__(self, config: DiffusionTSConfig):
        super().__init__()

        self.config = config

        # Decomposition modules
        self.trend_decomp = TrendDecomposition(kernel_size=25)
        self.seasonal_decomp = SeasonalDecomposition(config.seq_len, n_harmonics=5)

        # Input projection (single channel -> hidden_dim)
        self.input_proj = nn.Linear(1, config.hidden_dim)

        # Diffusion-step embedding
        self.time_embed = TimeEmbedding(config.hidden_dim)

        # Learned positional encoding, one vector per sequence position
        self.pos_embed = nn.Parameter(torch.randn(1, config.seq_len, config.hidden_dim) * 0.02)

        # Encoder blocks
        self.encoder = nn.ModuleList([
            TransformerBlock(config.hidden_dim, config.num_heads, config.hidden_dim)
            for _ in range(config.num_layers // 2)
        ])

        # Decoder blocks (same block type as the encoder)
        self.decoder = nn.ModuleList([
            TransformerBlock(config.hidden_dim, config.num_heads, config.hidden_dim)
            for _ in range(config.num_layers // 2)
        ])

        # Output projection (hidden_dim -> single channel)
        self.output_proj = nn.Sequential(
            nn.LayerNorm(config.hidden_dim),
            nn.Linear(config.hidden_dim, 1)
        )

        # Noise schedule
        self._setup_schedule()

    def _setup_schedule(self):
        """Register the cosine noise schedule as buffers.

        Only ``config.diffusion_steps`` is read here; ``beta_start`` and
        ``beta_end`` are not used.
        """
        T = self.config.diffusion_steps
        s = 0.008  # offset inside the cosine

        steps = torch.arange(T + 1) / T
        alphas_bar = torch.cos((steps + s) / (1 + s) * np.pi / 2) ** 2
        alphas_bar = alphas_bar / alphas_bar[0]

        betas = 1 - alphas_bar[1:] / alphas_bar[:-1]
        # Cap beta so alpha = 1 - beta never reaches 0 at the final steps
        betas = betas.clamp(max=0.999)

        alphas = 1 - betas
        # Recomputed from the clamped betas so all buffers stay consistent
        alphas_cumprod = torch.cumprod(alphas, dim=0)

        self.register_buffer('betas', betas)
        self.register_buffer('alphas', alphas)
        self.register_buffer('alphas_cumprod', alphas_cumprod)
        self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1 - alphas_cumprod))

    def forward_diffusion(
        self,
        x0: torch.Tensor,
        t: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Noise ``x0`` to diffusion step ``t`` in closed form.

        Args:
            x0: [batch, seq_len, 1] clean data.
            t: [batch] integer step indices.

        Returns:
            (xt, noise): the noised data and the Gaussian noise that was mixed in.
        """
        noise = torch.randn_like(x0)

        # Per-sample coefficients reshaped to broadcast over [seq_len, 1]
        sqrt_alpha = self.sqrt_alphas_cumprod[t].view(-1, 1, 1)
        sqrt_one_minus_alpha = self.sqrt_one_minus_alphas_cumprod[t].view(-1, 1, 1)

        xt = sqrt_alpha * x0 + sqrt_one_minus_alpha * noise

        return xt, noise

    def denoise(
        self,
        xt: torch.Tensor,
        t: torch.Tensor
    ) -> torch.Tensor:
        """
        Predict noise from noisy input.

        Args:
            xt: [batch, seq_len, 1] - noisy data
            t: [batch] - diffusion steps

        Returns:
            [batch, seq_len, 1] predicted noise.

        Raises:
            ValueError: if the input is longer than ``config.seq_len``.
        """
        length = xt.size(1)
        if length > self.config.seq_len:
            raise ValueError(
                f"sequence length {length} exceeds configured seq_len "
                f"{self.config.seq_len}"
            )

        # Project to hidden_dim and add the positional encoding for the
        # positions actually present in the input.
        h = self.input_proj(xt) + self.pos_embed[:, :length]

        # Time embedding
        t_emb = self.time_embed(t.float())

        # Encoder
        for block in self.encoder:
            h = block(h, t_emb)

        # Decoder
        for block in self.decoder:
            h = block(h, t_emb)

        return self.output_proj(h)

    def compute_loss(self, x: torch.Tensor) -> torch.Tensor:
        """Return the noise-prediction MSE on the decomposed residual of ``x``.

        Args:
            x: [batch, seq_len, 1] training batch.
        """
        # Strip the trend, then the seasonal term; only the residual is diffused
        _, residual = self.trend_decomp(x)
        _, residual = self.seasonal_decomp(residual)

        # One random diffusion step per sample
        t = torch.randint(0, self.config.diffusion_steps, (x.shape[0],), device=x.device)

        # Forward diffusion on residual
        xt, noise = self.forward_diffusion(residual, t)

        # Predict noise and score it against the noise actually added
        noise_pred = self.denoise(xt, t)

        return F.mse_loss(noise_pred, noise)

    @torch.no_grad()
    def sample(self, n_samples: int, device: torch.device) -> torch.Tensor:
        """
        Generate synthetic samples.

        Args:
            n_samples: number of samples to generate
            device: torch device

        Returns:
            samples: [n_samples, seq_len, 1]
        """
        # Start from noise
        x = torch.randn(n_samples, self.config.seq_len, 1, device=device)

        # Reverse diffusion; a range has a length, so tqdm can show the total
        for t in tqdm(range(self.config.diffusion_steps - 1, -1, -1), desc='Sampling', leave=False):
            t_tensor = torch.full((n_samples,), t, device=device, dtype=torch.long)

            # Predict noise
            noise_pred = self.denoise(x, t_tensor)

            # DDPM update
            alpha = self.alphas[t]
            alpha_bar = self.alphas_cumprod[t]
            beta = self.betas[t]

            mean = (1 / torch.sqrt(alpha)) * (
                x - (beta / torch.sqrt(1 - alpha_bar)) * noise_pred
            )

            if t > 0:
                # Posterior variance of step t given step t-1
                alpha_bar_prev = self.alphas_cumprod[t - 1]
                variance = beta * (1 - alpha_bar_prev) / (1 - alpha_bar)
                x = mean + torch.sqrt(variance) * torch.randn_like(x)
            else:
                # Final step: no noise is added
                x = mean

        return x

# Build the network, move it to the selected device and report its size
model = DiffusionTS(config)
model = model.to(device)
param_count = sum(p.numel() for p in model.parameters())
print(f"Parameters: {param_count:,}")

## 4. Обучение модели

In [None]:
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        model: module exposing ``compute_loss(batch)``.
        loader: iterable of batch tensors (first axis is the batch).
        optimizer: optimiser over ``model``'s parameters.
        device: device each batch is moved to.

    Returns:
        Loss averaged per sample, so a short final batch is not over-weighted.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_seen = 0

    for batch in loader:
        batch = batch.to(device)

        optimizer.zero_grad()
        loss = model.compute_loss(batch)
        loss.backward()
        # Clip the global gradient norm to 1.0 before stepping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # compute_loss is a batch mean; weight it by the batch size
        n_batch = batch.size(0)
        loss_sum += loss.item() * n_batch
        n_seen += n_batch

    if n_seen == 0:
        raise ValueError("loader yielded no samples")
    return loss_sum / n_seen


def validate(model, loader, device):
    """Evaluate ``model`` on ``loader`` without gradients; return the mean loss.

    Args:
        model: module exposing ``compute_loss(batch)``.
        loader: iterable of batch tensors (first axis is the batch).
        device: device each batch is moved to.

    Returns:
        Loss averaged per sample, so a short final batch is not over-weighted.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_seen = 0

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            # compute_loss is a batch mean; weight it by the batch size
            n_batch = batch.size(0)
            loss_sum += model.compute_loss(batch).item() * n_batch
            n_seen += n_batch

    if n_seen == 0:
        raise ValueError("loader yielded no samples")
    return loss_sum / n_seen

In [None]:
# Training
n_epochs = 50

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Tie the cosine period to the epoch count so the two cannot drift apart
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

train_losses = []
val_losses = []
best_val_loss = float('inf')
best_state = None

# NOTE: the loss on test_loader is used for model selection here.
for epoch in tqdm(range(n_epochs), desc="Training"):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = validate(model, test_loader, device)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    scheduler.step()

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict().copy() is shallow: its tensors are the live parameters
        # and keep changing as training continues. Clone them to freeze the
        # snapshot.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}: Train={train_loss:.4f}, Val={val_loss:.4f}")

# best_state stays None if no epoch improved on inf (e.g. every loss was NaN)
if best_state is not None:
    model.load_state_dict(best_state)
print(f"\nBest Val Loss: {best_val_loss:.4f}")

In [None]:
# Loss curves per epoch
plt.figure(figsize=(10, 5))
for curve, curve_name in ((train_losses, 'Train Loss'), (val_losses, 'Val Loss')):
    plt.plot(curve, label=curve_name)
plt.title('Diffusion-TS Training Progress')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

## 5. Генерация синтетических данных

In [None]:
# Draw synthetic series from the trained model
n_synthetic = 200
print(f"Generating {n_synthetic} synthetic samples...")

model.eval()
# [n, seq_len, 1] tensor -> [n, seq_len] NumPy array on the host
synthetic_samples = model.sample(n_synthetic, device).cpu().numpy()
synthetic_samples = synthetic_samples.squeeze(-1)

print(f"Synthetic samples shape: {synthetic_samples.shape}")

In [None]:
# Side-by-side comparison of real (test) and synthetic series
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: the first twenty real series
ax1 = axes[0, 0]
for series in test_data[:20]:
    ax1.plot(series, alpha=0.5)
ax1.set_title('Real Samples (Returns)')
ax1.set_xlabel('Time')
ax1.set_ylabel('Normalized Return')

# Top-right: the first twenty synthetic series
ax2 = axes[0, 1]
for series in synthetic_samples[:20]:
    ax2.plot(series, alpha=0.5)
ax2.set_title('Synthetic Samples (Returns)')
ax2.set_xlabel('Time')
ax2.set_ylabel('Normalized Return')

# Bottom-left: pooled value histograms
ax3 = axes[1, 0]
for pooled, name in ((test_data.flatten(), 'Real'), (synthetic_samples.flatten(), 'Synthetic')):
    ax3.hist(pooled, bins=100, alpha=0.5, label=name, density=True)
ax3.set_title('Distribution Comparison')
ax3.set_xlabel('Value')
ax3.set_ylabel('Density')
ax3.legend()

# Bottom-right axis is filled by the autocorrelation plot below
ax4 = axes[1, 1]

def autocorr(x, lag):
    """Return the Pearson correlation between ``x`` and ``x`` shifted by ``lag``.

    Args:
        x: 1-D array of observations.
        lag: non-negative shift in samples.

    Raises:
        ValueError: if ``lag`` is negative.
    """
    if lag < 0:
        raise ValueError("lag must be non-negative")
    if lag == 0:
        # x[:-0] is an empty slice, so the general formula cannot be used
        return 1.0
    return np.corrcoef(x[:-lag], x[lag:])[0, 1]

# Autocorrelation at lags 1..29, computed on all series flattened end to end
lags = range(1, 30)
real_series = test_data.flatten()
synth_series = synthetic_samples.flatten()
real_acf = [autocorr(real_series, k) for k in lags]
synth_acf = [autocorr(synth_series, k) for k in lags]

for acf_values, fmt, name in ((real_acf, 'b-o', 'Real'), (synth_acf, 'r-s', 'Synthetic')):
    ax4.plot(lags, acf_values, fmt, label=name)
ax4.set_title('Autocorrelation Comparison')
ax4.set_xlabel('Lag')
ax4.set_ylabel('ACF')
ax4.legend()
ax4.axhline(0, color='gray', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

## 6. Оценка качества синтетических данных

In [None]:
def compute_statistics(data: np.ndarray) -> Dict[str, float]:
    """Summarise the pooled values of ``data``.

    The array is flattened first, so every statistic (including the lag-1
    and lag-5 autocorrelations) is computed over all values end to end.
    """
    values = data.flatten()
    as_series = pd.Series(values)
    n_values = len(values)

    return {
        'mean': np.mean(values),
        'std': np.std(values),
        'skewness': as_series.skew(),
        'kurtosis': as_series.kurtosis(),
        'min': np.min(values),
        'max': np.max(values),
        # Autocorrelations fall back to 0 when there are too few values
        'acf_1': autocorr(values, 1) if n_values > 1 else 0,
        'acf_5': autocorr(values, 5) if n_values > 5 else 0,
    }

# Tabulate the statistics of real and synthetic data next to each other
real_stats = compute_statistics(test_data)
synth_stats = compute_statistics(synthetic_samples)

print("=== Statistical Comparison ===")
print(f"{'Metric':<15} {'Real':<15} {'Synthetic':<15} {'Diff %':<10}")
print("-" * 55)

for key, real_val in real_stats.items():
    synth_val = synth_stats[key]
    # The relative difference is reported as 0 when the real value is ~0
    diff_pct = (
        abs(real_val - synth_val) / abs(real_val) * 100
        if abs(real_val) > 1e-10
        else 0
    )
    print(f"{key:<15} {real_val:<15.4f} {synth_val:<15.4f} {diff_pct:<10.1f}%")

In [None]:
# t-SNE visualization
def prepare_for_tsne(data, n_samples=200):
    """Return at most ``n_samples`` rows of ``data``, each flattened to a vector.

    When ``data`` has more rows than ``n_samples``, rows are drawn without
    replacement using NumPy's global random state.
    """
    n_rows = len(data)
    if n_rows > n_samples:
        keep = np.random.choice(n_rows, n_samples, replace=False)
        data = data[keep]
    return data.reshape(len(data), -1)

# Embed real and synthetic series in one t-SNE map and colour by origin
real_flat = prepare_for_tsne(test_data, 200)
synth_flat = prepare_for_tsne(synthetic_samples, 200)

# Stack both sets; label 0 marks real rows and label 1 synthetic rows
combined = np.concatenate([real_flat, synth_flat], axis=0)
labels = np.repeat([0, 1], [len(real_flat), len(synth_flat)])

print("Computing t-SNE...")
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
embedded = tsne.fit_transform(combined)

# Scatter plot of the 2-D embedding
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    embedded[:, 0],
    embedded[:, 1],
    c=labels,
    cmap='coolwarm',
    alpha=0.6,
    s=20,
)
plt.colorbar(scatter, ticks=[0, 1], label='Data Type')
plt.title('t-SNE Visualization: Real vs Synthetic')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')

# Manual legend naming the two groups
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colour, label=name)
    for colour, name in (('blue', 'Real'), ('red', 'Synthetic'))
]
plt.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
# Discriminative Score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def compute_discriminative_score(real: np.ndarray, synthetic: np.ndarray) -> float:
    """
    Cross-validated score of a classifier separating real from synthetic series.

    The closer to 0.5, the better (real and synthetic are harder to tell apart).

    Balanced accuracy is used instead of plain accuracy: when the two sets
    differ in size, always predicting the larger class already scores above
    0.5 in plain accuracy, which would make the 0.5 reference meaningless.

    Args:
        real: real series; the first axis indexes samples.
        synthetic: synthetic series; the first axis indexes samples.

    Returns:
        Mean balanced accuracy over 5 cross-validation folds.
    """
    n_real = len(real)
    n_synth = len(synthetic)

    # One flattened feature row per series; label 0 = real, 1 = synthetic
    X = np.vstack([real.reshape(n_real, -1), synthetic.reshape(n_synth, -1)])
    y = np.array([0] * n_real + [1] * n_synth)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    scores = cross_val_score(clf, X, y, cv=5, scoring='balanced_accuracy')

    return scores.mean()

# Score how well a classifier separates the real test series from the synthetic ones
disc_score = compute_discriminative_score(
    test_data,
    synthetic_samples,
)
print(f"Discriminative Score: {disc_score:.4f}")
print("(0.5 = perfect, classifier cannot distinguish real from synthetic)")

In [None]:
# Maximum Mean Discrepancy (MMD)
def compute_mmd(X: np.ndarray, Y: np.ndarray, kernel='rbf', gamma=1.0) -> float:
    """
    Compute the (biased) squared MMD between two samples with an RBF kernel.

    The closer to 0, the more similar the two distributions.

    The kernel matrices are computed directly with NumPy:
    ``pairwise_distances`` has no 'rbf' metric, and a distance is not a
    kernel value in any case.

    Args:
        X: first sample; the first axis indexes observations.
        Y: second sample; the first axis indexes observations.
        kernel: only 'rbf' is supported.
        gamma: RBF bandwidth, k(a, b) = exp(-gamma * ||a - b||^2).

    Returns:
        mean k(X, X) + mean k(Y, Y) - 2 * mean k(X, Y).

    Raises:
        ValueError: if ``kernel`` is not 'rbf'.
    """
    if kernel != 'rbf':
        raise ValueError(f"unsupported kernel: {kernel!r} (only 'rbf' is implemented)")

    X = np.asarray(X, dtype=float).reshape(len(X), -1)
    Y = np.asarray(Y, dtype=float).reshape(len(Y), -1)

    def _rbf_gram(A, B):
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b; clip tiny negative
        # values caused by floating-point rounding.
        sq_dist = (
            np.sum(A ** 2, axis=1)[:, None]
            + np.sum(B ** 2, axis=1)[None, :]
            - 2.0 * (A @ B.T)
        )
        return np.exp(-gamma * np.maximum(sq_dist, 0.0))

    mmd = _rbf_gram(X, X).mean() + _rbf_gram(Y, Y).mean() - 2 * _rbf_gram(X, Y).mean()
    return float(mmd)

# Scale the RBF bandwidth by the number of features. With gamma=1.0 and
# standardised series of this length, squared distances between different
# series are so large that every off-diagonal kernel value is ~0 and the
# score no longer reflects the data.
mmd_gamma = 1.0 / test_data.shape[1]
mmd_score = compute_mmd(test_data[:200], synthetic_samples[:200], gamma=mmd_gamma)
print(f"MMD Score: {mmd_score:.6f}")
print(f"(Closer to 0 = more similar distributions)")

## 7. Конвертация обратно в цены

In [None]:
def returns_to_prices(
    returns: np.ndarray,
    initial_price: float = 100,
    ret_mean: Optional[float] = None,
    ret_std: Optional[float] = None,
) -> np.ndarray:
    """
    Convert standardised log returns back to price paths.

    Args:
        returns: [n_series, seq_len] standardised log returns.
        initial_price: price the cumulative returns are applied to.
        ret_mean: mean used to undo the standardisation; defaults to the
            module-level ``mean``.
        ret_std: standard deviation used to undo the standardisation;
            defaults to the module-level ``std``.

    Returns:
        [n_series, seq_len] prices; column 0 is the price after the first
        return, not ``initial_price`` itself.
    """
    # Fall back to the notebook-level normalisation statistics
    loc = mean if ret_mean is None else ret_mean
    scale = std if ret_std is None else ret_std

    # Undo the standardisation
    returns_denorm = returns * scale + loc

    # Log returns add up along time; exponentiate to get price multiples
    cum_returns = np.cumsum(returns_denorm, axis=1)

    return initial_price * np.exp(cum_returns)

# Turn the synthetic returns, then the real test returns, into price paths
synthetic_prices, real_prices = (
    returns_to_prices(block) for block in (synthetic_samples, test_data)
)

print(f"Synthetic prices shape: {synthetic_prices.shape}")

In [None]:
# Thirty real and thirty synthetic price paths, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: real prices
for path in real_prices[:30]:
    ax1.plot(path, alpha=0.5)
ax1.set_title('Real Price Paths')
ax1.set_xlabel('Time')
ax1.set_ylabel('Price')

# Right panel: synthetic prices
for path in synthetic_prices[:30]:
    ax2.plot(path, alpha=0.5)
ax2.set_title('Synthetic Price Paths')
ax2.set_xlabel('Time')
ax2.set_ylabel('Price')

plt.tight_layout()
plt.show()

## 8. Применение: Data Augmentation для ML

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

def create_trading_labels(
    prices: np.ndarray,
    threshold: float = 0.01
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build a classification dataset: buy (1) if the last log return exceeds
    ``threshold``, otherwise 0.

    Args:
        prices: [n_series, seq_len] price paths, with seq_len >= 2.
        threshold: minimum final log return for a positive label.

    Returns:
        (features, labels): features are all prices except the last point,
        shape [n_series, seq_len - 1]; labels are ints of shape [n_series].

    Raises:
        ValueError: if ``prices`` is not 2-D with at least two columns.
    """
    prices = np.asarray(prices)
    if prices.ndim != 2 or prices.shape[1] < 2:
        raise ValueError("prices must be 2-D with at least two time steps")

    # Only the final-step log return is needed as the target
    last_return = np.log(prices[:, -1]) - np.log(prices[:, -2])
    labels = (last_return > threshold).astype(int)

    # Features exclude the final price, which determines the label
    features = prices[:, :-1]

    return features, labels

# Build the classification datasets from real and synthetic price paths
X_real, y_real = create_trading_labels(real_prices[:500])
X_synth, y_synth = create_trading_labels(synthetic_prices[:200])

for _name, _features, _labels in (("Real", X_real, y_real), ("Synthetic", X_synth, y_synth)):
    print(f"{_name}: {_features.shape}, labels distribution: {np.bincount(_labels)}")

In [None]:
# Experiment: Train with and without augmentation

def _relative_change_pct(new_value, baseline):
    """Percent change of new_value relative to baseline; NaN if baseline is 0."""
    if baseline == 0:
        return float('nan')
    return (new_value - baseline) / baseline * 100


# Split real data; the held-out part is the test set for both models
X_train, X_test, y_train, y_test = train_test_split(
    X_real, y_real, test_size=0.3, random_state=42
)

# Model 1: Train only on real data
clf_real = LogisticRegression(max_iter=1000, random_state=42)
clf_real.fit(X_train, y_train)
y_pred_real = clf_real.predict(X_test)
acc_real = accuracy_score(y_test, y_pred_real)
f1_real = f1_score(y_test, y_pred_real)

# Model 2: Train on real + synthetic (augmented)
X_augmented = np.vstack([X_train, X_synth])
y_augmented = np.concatenate([y_train, y_synth])

clf_aug = LogisticRegression(max_iter=1000, random_state=42)
clf_aug.fit(X_augmented, y_augmented)
y_pred_aug = clf_aug.predict(X_test)
acc_aug = accuracy_score(y_test, y_pred_aug)
f1_aug = f1_score(y_test, y_pred_aug)

print("=== Data Augmentation Results ===")
print(f"\nTrained on Real Data Only:")
print(f"  Accuracy: {acc_real:.4f}")
print(f"  F1 Score: {f1_real:.4f}")

print(f"\nTrained on Real + Synthetic (Augmented):")
print(f"  Accuracy: {acc_aug:.4f}")
print(f"  F1 Score: {f1_aug:.4f}")

# The baseline F1 is 0 whenever the real-only model predicts no positives,
# so the relative change is guarded against a zero denominator.
print(f"\nImprovement:")
print(f"  Accuracy: {_relative_change_pct(acc_aug, acc_real):+.1f}%")
print(f"  F1 Score: {_relative_change_pct(f1_aug, f1_real):+.1f}%")

## 9. Выводы

### Преимущества Diffusion-TS:

1. **Реалистичные данные**: Сохраняет статистические свойства оригинала
2. **Interpretable**: Декомпозиция на тренд/сезонность/residual
3. **Data Augmentation**: Улучшает качество ML моделей
4. **Privacy**: Можно делиться синтетическими данными вместо реальных

### Ограничения:

1. **Вычислительная сложность**: Генерация требует много шагов
2. **Tail events**: Может недооценивать экстремальные события
3. **Multivariate**: Сложнее для многомерных рядов

### Рекомендации:

- Валидируйте качество синтетических данных статистически
- Используйте discriminative score для оценки
- Комбинируйте с реальными данными для augmentation

In [None]:
# Save model and samples
# Store the normalisation statistics as plain Python floats.
norm_mean, norm_std = float(mean), float(std)

# NOTE(review): 'config' is saved as a DiffusionTSConfig instance, so loading
# this checkpoint presumably needs that class to be importable — confirm
# this is acceptable for consumers of the file.
torch.save({
    'model_state_dict': model.state_dict(),
    'config': config,
    'normalization': {'mean': norm_mean, 'std': norm_std}
}, 'diffusion_ts_model.pt')

np.savez(
    'synthetic_financial_data.npz',
    returns=synthetic_samples,
    prices=synthetic_prices,
    normalization={'mean': norm_mean, 'std': norm_std},
    # The dict above is stored as a pickled object array, which np.load only
    # reads with allow_pickle=True; these scalar copies load without it.
    norm_mean=norm_mean,
    norm_std=norm_std,
)

print("Model and samples saved!")