# Complete Bitcoin Forecasting Pipeline with Diffusion Models

This notebook implements a production-ready pipeline for Bitcoin price forecasting using diffusion models:

1. Data fetching from cryptocurrency exchanges
2. Feature engineering (technical indicators)
3. DDPM model for probabilistic forecasting
4. Monte Carlo uncertainty estimation
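5. Evaluation on a held-out test split (per-horizon RMSE, MAE and MAPE); a full trading backtest with realistic constraints is not implemented in this notebook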

Based on the article: [Diffusion Models vs Cryptocurrency Anarchy](https://marketmaker.cc/en/blog/post/diffusion-models-cryptocurrency-prediction)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import requests
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed the NumPy and PyTorch global RNGs with the same value
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(42)

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1. Data Fetching from Bybit

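We fetch historical OHLCV data from the Bybit exchange using its public API. If the request fails (for example, when there is no network access), the notebook falls back to synthetic random-walk data so that the remaining cells still run.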

In [None]:
class BybitDataFetcher:
    """Fetch historical kline (OHLCV) data from Bybit's public v5 market API.

    A single ``requests.Session`` is reused for all calls made through one
    instance.
    """
    
    BASE_URL = "https://api.bybit.com/v5/market/kline"
    MAX_LIMIT = 1000       # largest page size requested per call
    REQUEST_TIMEOUT = 10   # seconds; prevents a stalled connection from hanging the notebook
    
    # Minutes per candle for the non-numeric interval codes.
    # "M" (monthly) is approximated as 30 days; it is only used to size the request count.
    _NAMED_INTERVAL_MINUTES = {"D": 1440, "W": 7 * 1440, "M": 30 * 1440}
    
    def __init__(self):
        self.session = requests.Session()
    
    @classmethod
    def _interval_minutes(cls, interval):
        """Translate an interval code ("60", "D", "W", ...) into minutes per candle.

        Raises:
            ValueError: if the code is neither a positive integer string nor a
                known named interval.
        """
        code = str(interval)
        if code.isdigit():
            minutes = int(code)
            if minutes <= 0:
                raise ValueError(f"Unsupported interval: {interval!r}")
            return minutes
        try:
            return cls._NAMED_INTERVAL_MINUTES[code.upper()]
        except KeyError:
            raise ValueError(f"Unsupported interval: {interval!r}") from None
    
    def fetch_klines(self, symbol="BTCUSDT", interval="60", limit=1000, 
                     start_time=None, end_time=None):
        """
        Fetch one page of kline/candlestick data.
        
        Args:
            symbol: Trading pair (e.g., BTCUSDT)
            interval: Kline interval (1, 5, 15, 30, 60, 120, 240, D, W)
            limit: Number of records (max 1000)
            start_time: Start timestamp in milliseconds
            end_time: End timestamp in milliseconds
        
        Returns:
            The ``result.list`` payload of the response: a list of rows, each
            row being a list of strings.
        
        Raises:
            requests.HTTPError: on a non-2xx HTTP status.
            RuntimeError: when the API answers with a non-zero ``retCode``.
        """
        params = {
            "category": "linear",
            "symbol": symbol,
            "interval": interval,
            "limit": limit
        }
        
        # Compare against None so that an explicit timestamp of 0 is still sent.
        if start_time is not None:
            params["start"] = start_time
        if end_time is not None:
            params["end"] = end_time
        
        response = self.session.get(
            self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
        )
        # Surface HTTP-level failures before trying to decode the body as JSON.
        response.raise_for_status()
        data = response.json()
        
        if data["retCode"] != 0:
            raise RuntimeError(f"API Error: {data['retMsg']}")
        
        return data["result"]["list"]
    
    def fetch_historical_data(self, symbol="BTCUSDT", interval="60", days=30):
        """
        Fetch historical data for specified number of days.
        
        Pages backwards in time from "now" until enough candles have been
        collected or the API returns an empty page.
        
        Args:
            symbol: Trading pair (e.g., BTCUSDT)
            interval: Kline interval code accepted by ``fetch_klines``
            days: Length of the requested history in days
        
        Returns:
            DataFrame with columns timestamp, open, high, low, close, volume,
            turnover, sorted by ascending timestamp, without duplicate
            timestamps, and holding at most the number of candles that fit
            into ``days``.
        """
        interval_minutes = self._interval_minutes(interval)
        total_candles = int(days * 24 * 60) // interval_minutes
        
        all_data = []
        end_time = int(datetime.now().timestamp() * 1000)
        
        with tqdm(total=total_candles, desc="Fetching data") as pbar:
            while len(all_data) < total_candles:
                # Only ask for what is still missing so the last page does not over-fetch.
                remaining = total_candles - len(all_data)
                batch = self.fetch_klines(
                    symbol=symbol,
                    interval=interval,
                    limit=min(self.MAX_LIMIT, remaining),
                    end_time=end_time
                )
                
                if not batch:
                    break
                
                all_data.extend(batch)
                pbar.update(len(batch))
                
                # Move the cursor just before the oldest candle received.
                # Using min() avoids relying on the row order of the response.
                next_end_time = min(int(row[0]) for row in batch) - 1
                if next_end_time >= end_time:
                    # Cursor did not move backwards; stop instead of looping forever.
                    break
                end_time = next_end_time
        
        # Convert to DataFrame
        df = pd.DataFrame(all_data, columns=[
            'timestamp', 'open', 'high', 'low', 'close', 'volume', 'turnover'
        ])
        
        # Convert types (explicit int64: millisecond timestamps overflow 32-bit ints)
        df['timestamp'] = pd.to_datetime(df['timestamp'].astype('int64'), unit='ms')
        for col in ['open', 'high', 'low', 'close', 'volume', 'turnover']:
            df[col] = df[col].astype(float)
        
        # De-duplicate, order chronologically and keep only the requested window
        df = (
            df.drop_duplicates(subset='timestamp')
            .sort_values('timestamp')
            .tail(total_candles)
            .reset_index(drop=True)
        )
        
        return df

In [None]:
# Fetch 90 days of hourly BTCUSDT candles; on any failure fall back to
# synthetic data so the rest of the notebook can still be executed.
fetcher = BybitDataFetcher()

try:
    btc_data = fetcher.fetch_historical_data(symbol="BTCUSDT", interval="60", days=90)
    print(f"Fetched {len(btc_data)} hourly candles")
    print(f"Date range: {btc_data['timestamp'].min()} to {btc_data['timestamp'].max()}")
except Exception as e:
    # Deliberate best-effort fallback: the error is reported, not re-raised.
    print(f"Could not fetch live data: {e}")
    print("Generating synthetic data for demonstration...")
    
    # Generate synthetic BTC-like data (geometric random walk)
    n_samples = 90 * 24  # 90 days of hourly data
    np.random.seed(42)  # re-seed so the synthetic series is the same on every run
    
    # Start price
    price = 50000
    prices = [price]
    volumes = []
    
    for i in range(n_samples - 1):
        # i.i.d. Gaussian returns (mean 0.0001, std 0.015); the variance is
        # constant, so this does NOT model volatility clustering.
        returns = np.random.normal(0.0001, 0.015)  # ~1.5% hourly vol
        price = price * (1 + returns)
        prices.append(price)
        # Volume draws are interleaved with the return draws; reordering these
        # statements would change the generated series.
        volumes.append(np.random.exponential(1000))
    
    # One extra draw so that volumes has the same length as prices
    volumes.append(np.random.exponential(1000))
    
    # 'open' and 'close' share the same price list; 'high'/'low' are that
    # price nudged up/down by the absolute value of a small Gaussian draw.
    # Unlike the live path, this frame has no 'turnover' column.
    # NOTE(review): freq='H' is reported as a deprecated alias in recent
    # pandas releases (lowercase 'h' preferred) — confirm against the pinned version.
    btc_data = pd.DataFrame({
        'timestamp': pd.date_range(end=datetime.now(), periods=n_samples, freq='H'),
        'open': prices,
        'high': [p * (1 + abs(np.random.normal(0, 0.005))) for p in prices],
        'low': [p * (1 - abs(np.random.normal(0, 0.005))) for p in prices],
        'close': prices,
        'volume': volumes
    })

# Last expression of the cell: rich display of the first rows
btc_data.head()

In [None]:
# Three stacked panels sharing the time axis: price, volume, hourly returns
fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)
price_ax, volume_ax, returns_ax = axes
timestamps = btc_data['timestamp']

# Top panel: closing price
price_ax.plot(timestamps, btc_data['close'], color='blue', linewidth=0.8)
price_ax.set(ylabel='Price (USDT)', title='BTC/USDT Hourly Price')

# Middle panel: traded volume
volume_ax.bar(timestamps, btc_data['volume'], color='gray', alpha=0.7, width=0.03)
volume_ax.set(ylabel='Volume', title='Trading Volume')

# Bottom panel: simple returns (the first row has no predecessor and is dropped)
returns = btc_data['close'].pct_change().dropna()
returns_ax.plot(timestamps[1:], returns.values, color='green', linewidth=0.5)
returns_ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
returns_ax.set(ylabel='Returns', title='Hourly Returns', xlabel='Date')

# Same light grid on every panel
for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Feature Engineering

We compute technical indicators commonly used in cryptocurrency trading.

In [None]:
class FeatureEngineer:
    """Technical-indicator toolkit for OHLCV price frames."""
    
    @staticmethod
    def compute_rsi(prices, period=14):
        """RSI built from simple rolling means of up-moves and down-moves.

        Args:
            prices: pandas Series of prices.
            period: Rolling window length.

        Returns:
            Series of RSI values in [0, 100]; the first ``period - 1`` entries are NaN.
        """
        change = prices.diff()
        # Up-moves keep positive changes, down-moves keep the magnitude of
        # negative changes; everything else (including the leading NaN) becomes 0.
        up_moves = change.where(change > 0, 0)
        down_moves = -change.where(change < 0, 0)
        avg_up = up_moves.rolling(window=period).mean()
        avg_down = down_moves.rolling(window=period).mean()
        # The small epsilon keeps the ratio finite when there were no down-moves.
        strength = avg_up / (avg_down + 1e-10)
        return 100 - (100 / (1 + strength))
    
    @staticmethod
    def compute_macd(prices, fast=12, slow=26, signal=9):
        """MACD line, signal line and histogram.

        Returns:
            Tuple ``(macd_line, signal_line, histogram)`` of Series.
        """
        def _ema(series, span):
            # Recursive (adjust=False) exponential moving average
            return series.ewm(span=span, adjust=False).mean()
        
        macd_line = _ema(prices, fast) - _ema(prices, slow)
        signal_line = _ema(macd_line, signal)
        return macd_line, signal_line, macd_line - signal_line
    
    @staticmethod
    def compute_bollinger_bands(prices, period=20, num_std=2):
        """Bollinger Bands and %B.

        Returns:
            Tuple ``(upper, lower, percent_b)`` of Series.
        """
        window = prices.rolling(window=period)
        middle = window.mean()
        spread = window.std()
        upper = middle + (spread * num_std)
        lower = middle - (spread * num_std)
        # %B: position of the price inside the band (0 = lower, 1 = upper)
        percent_b = (prices - lower) / (upper - lower + 1e-10)
        return upper, lower, percent_b
    
    @staticmethod
    def compute_atr(high, low, close, period=14):
        """Average True Range as a simple rolling mean of the true range."""
        prev_close = close.shift()
        candidates = pd.concat(
            [high - low, abs(high - prev_close), abs(low - prev_close)],
            axis=1,
        )
        # Row-wise maximum of the three range candidates
        true_range = candidates.max(axis=1)
        return true_range.rolling(window=period).mean()
    
    def compute_all_features(self, df):
        """Return a copy of ``df`` extended with every indicator column.

        Expects ``close``, ``high``, ``low`` and ``volume`` columns. The input
        frame itself is not modified.
        """
        features = df.copy()
        close = features['close']
        
        # Simple and log returns
        features['returns'] = close.pct_change()
        features['log_returns'] = np.log(close / close.shift(1))
        
        # Rolling volatility of simple returns (24 rows and 24*7 rows)
        hourly_returns = features['returns']
        features['volatility_24h'] = hourly_returns.rolling(24).std()
        features['volatility_7d'] = hourly_returns.rolling(24*7).std()
        
        # RSI at two look-backs
        features['rsi_14'] = self.compute_rsi(close, 14)
        features['rsi_7'] = self.compute_rsi(close, 7)
        
        # MACD family
        (features['macd'],
         features['macd_signal'],
         features['macd_hist']) = self.compute_macd(close)
        
        # Bollinger Bands
        (features['bb_upper'],
         features['bb_lower'],
         features['bb_pct']) = self.compute_bollinger_bands(close)
        
        # ATR
        features['atr'] = self.compute_atr(features['high'], features['low'], close)
        
        # Volume relative to its 24-row average
        volume = features['volume']
        features['volume_sma'] = volume.rolling(24).mean()
        features['volume_ratio'] = volume / (features['volume_sma'] + 1e-10)
        
        # Price momentum over 1, 24 and 24*7 rows
        features['momentum_1h'] = close / close.shift(1) - 1
        features['momentum_24h'] = close / close.shift(24) - 1
        features['momentum_7d'] = close / close.shift(24*7) - 1
        
        return features

In [None]:
# Build the indicator frame, then discard the warm-up rows that still contain
# NaN (rolling windows and shifts need history before they produce values)
engineer = FeatureEngineer()
btc_features = (
    engineer.compute_all_features(btc_data)
    .dropna()
    .reset_index(drop=True)
)
print(f"Features shape: {btc_features.shape}")
print(f"\nFeatures: {list(btc_features.columns)}")

In [None]:
# 3x2 grid with one engineered feature per panel
fig, axes = plt.subplots(3, 2, figsize=(14, 10))
(rsi_ax, vol_ax), (macd_ax, bb_ax), (volume_ratio_ax, momentum_ax) = axes
feature_time = btc_features['timestamp']

# RSI with the 70 / 30 reference levels
rsi_ax.plot(feature_time, btc_features['rsi_14'])
rsi_ax.axhline(y=70, color='r', linestyle='--', alpha=0.5)
rsi_ax.axhline(y=30, color='g', linestyle='--', alpha=0.5)
rsi_ax.set_title('RSI (14)')
rsi_ax.set_ylabel('RSI')

# Rolling volatility
vol_ax.plot(feature_time, btc_features['volatility_24h'])
vol_ax.set_title('24h Rolling Volatility')
vol_ax.set_ylabel('Volatility')

# MACD line, signal line and histogram
macd_ax.plot(feature_time, btc_features['macd'], label='MACD')
macd_ax.plot(feature_time, btc_features['macd_signal'], label='Signal')
macd_ax.bar(feature_time, btc_features['macd_hist'], alpha=0.3, label='Histogram')
macd_ax.set_title('MACD')
macd_ax.legend()

# Bollinger %B with the band edges at 1 and 0
bb_ax.plot(feature_time, btc_features['bb_pct'])
bb_ax.axhline(y=1, color='r', linestyle='--', alpha=0.5)
bb_ax.axhline(y=0, color='g', linestyle='--', alpha=0.5)
bb_ax.set_title('Bollinger Band %B')

# Volume relative to its 24-row mean (1 = average)
volume_ratio_ax.plot(feature_time, btc_features['volume_ratio'])
volume_ratio_ax.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
volume_ratio_ax.set_title('Volume Ratio')

# 24-row momentum around zero
momentum_ax.plot(feature_time, btc_features['momentum_24h'])
momentum_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
momentum_ax.set_title('24h Momentum')

plt.tight_layout()
plt.show()

## 3. Diffusion Model for Forecasting

We implement a conditional diffusion model that forecasts future prices given historical features.

In [None]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset: ``seq_length`` rows of features -> next ``forecast_horizon`` targets.

    Features are standardised with a ``StandardScaler`` and the target column
    is scaled with a ``MinMaxScaler``; both fitted scalers stay available as
    ``feature_scaler`` / ``target_scaler`` for inverse transforms.
    """
    
    def __init__(self, data, feature_cols, target_col='close', 
                 seq_length=100, forecast_horizon=24, fit_rows=None):
        """
        Args:
            data: DataFrame holding the feature and target columns.
            feature_cols: Column names used as model inputs.
            target_col: Column name to forecast.
            seq_length: Number of history rows per sample.
            forecast_horizon: Number of future target rows per sample.
            fit_rows: If given, the scalers are fitted on the first
                ``fit_rows`` rows only (and then applied to all rows), which
                keeps later validation/test rows out of the scaling
                statistics. ``None`` (default) fits on every row.
        
        Raises:
            ValueError: if ``fit_rows`` is given but not positive.
        """
        if fit_rows is not None and fit_rows <= 0:
            raise ValueError("fit_rows must be a positive number of rows")
        
        self.data = data
        self.feature_cols = feature_cols
        self.target_col = target_col
        self.seq_length = seq_length
        self.forecast_horizon = forecast_horizon
        
        # Scale features
        self.feature_scaler = StandardScaler()
        self.target_scaler = MinMaxScaler()
        
        feature_values = data[feature_cols].values
        target_values = data[[target_col]].values
        
        # Fit on the requested prefix, transform everything
        fit_slice = slice(None) if fit_rows is None else slice(0, fit_rows)
        self.feature_scaler.fit(feature_values[fit_slice])
        self.target_scaler.fit(target_values[fit_slice])
        
        self.features = self.feature_scaler.transform(feature_values)
        self.targets = self.target_scaler.transform(target_values)
    
    def __len__(self):
        # Clamp at 0: a negative value would make len() raise when the frame
        # is shorter than one full window.
        return max(0, len(self.data) - self.seq_length - self.forecast_horizon + 1)
    
    def __getitem__(self, idx):
        """Return ``(history, future)`` tensors for window ``idx``.

        Returns:
            history: float32 tensor [seq_length, n_features]
            future: float32 tensor [forecast_horizon]
        
        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without this, slicing would silently return truncated windows
                and plain iteration over the dataset would never terminate.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError("TimeSeriesDataset index out of range")
        
        split = idx + self.seq_length
        
        # Historical features
        x = self.features[idx:split]
        
        # Future targets
        y = self.targets[split:split + self.forecast_horizon]
        
        return (
            torch.tensor(x, dtype=torch.float32),
            torch.tensor(y, dtype=torch.float32).squeeze(-1)
        )

In [None]:
class SinusoidalPositionEmbeddings(nn.Module):
    """Sinusoidal embeddings for timestep encoding.

    Maps a batch of scalar timesteps to ``dim`` features: ``dim // 2`` sines
    followed by ``dim // 2`` cosines over geometrically spaced frequencies,
    plus one zero column when ``dim`` is odd so the width always equals ``dim``.
    """
    
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
    
    def forward(self, time):
        """
        Args:
            time: Tensor of shape [batch] holding the diffusion timesteps.
        
        Returns:
            Tensor of shape [batch, dim].
        """
        device = time.device
        half_dim = self.dim // 2
        # max(..., 1) avoids a division by zero (and the resulting NaNs) when
        # dim is 2 or 3, i.e. when there is only a single frequency.
        log_step = np.log(10000) / max(half_dim - 1, 1)
        frequencies = torch.exp(torch.arange(half_dim, device=device) * -log_step)
        angles = time[:, None] * frequencies[None, :]
        embeddings = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
        if self.dim % 2 == 1:
            # Odd widths would otherwise come out one column short and break
            # the Linear layer that consumes this embedding.
            embeddings = F.pad(embeddings, (0, 1))
        return embeddings


class ConditionalDiffusionModel(nn.Module):
    """Noise-prediction network conditioned on a window of historical features.

    The noisy forecast vector is concatenated with an embedding of the
    diffusion timestep and with the final hidden state of an LSTM run over the
    history, then passed through a stack of residual MLP blocks.
    """
    
    def __init__(self, input_dim, seq_length, forecast_horizon, 
                 hidden_dim=256, time_emb_dim=64, n_layers=4):
        super().__init__()
        
        self.input_dim = input_dim
        self.seq_length = seq_length
        self.forecast_horizon = forecast_horizon
        
        # NOTE: sub-modules are created in a fixed order and under fixed
        # attribute names so that seeded initialisation and state-dict keys
        # stay stable.
        
        # Timestep -> hidden_dim vector
        self.time_mlp = nn.Sequential(
            SinusoidalPositionEmbeddings(time_emb_dim),
            nn.Linear(time_emb_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )
        
        # History window -> hidden_dim vector (two-layer LSTM)
        self.condition_encoder = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.1,
        )
        
        # Denoiser input: noisy forecast + time embedding + history embedding
        fused_width = forecast_horizon + 2 * hidden_dim
        self.input_proj = nn.Linear(fused_width, hidden_dim)
        
        residual_blocks = []
        for _ in range(n_layers):
            residual_blocks.append(
                nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.LayerNorm(hidden_dim),
                    nn.SiLU(),
                    nn.Dropout(0.1),
                )
            )
        self.layers = nn.ModuleList(residual_blocks)
        
        self.output_proj = nn.Linear(hidden_dim, forecast_horizon)
    
    def forward(self, x_noisy, t, condition):
        """
        Estimate the noise contained in ``x_noisy``.
        
        Args:
            x_noisy: Noisy future values [batch, forecast_horizon]
            t: Diffusion timestep [batch]
            condition: Historical features [batch, seq_length, input_dim]
        
        Returns:
            Predicted noise [batch, forecast_horizon]
        """
        # Timestep embedding: [batch, hidden_dim]
        step_code = self.time_mlp(t)
        
        # Final hidden state of the top LSTM layer: [batch, hidden_dim]
        _, recurrent_state = self.condition_encoder(condition)
        history_code = recurrent_state[0][-1]
        
        # Fuse the three inputs and project to the working width
        fused = torch.cat((x_noisy, step_code, history_code), dim=-1)
        hidden = self.input_proj(fused)
        
        # Residual MLP stack
        for block in self.layers:
            hidden = hidden + block(hidden)
        
        return self.output_proj(hidden)

In [None]:
class CryptoDiffusionPipeline:
    """Complete pipeline for cryptocurrency forecasting with diffusion models.

    Holds the cosine noise schedule, the denoising network and its optimizer,
    and exposes training (``train``), ancestral sampling (``sample``) and
    summary statistics over the samples (``forecast``).
    """
    
    def __init__(self, seq_length=100, forecast_horizon=24, 
                 num_diffusion_steps=1000, device='cuda'):
        """
        Args:
            seq_length: Number of history rows fed to the model.
            forecast_horizon: Number of future steps generated per sample.
            num_diffusion_steps: Length of the diffusion chain.
            device: Torch device (or device string) for the schedule tensors
                and the model.
        """
        self.seq_length = seq_length
        self.forecast_horizon = forecast_horizon
        self.num_steps = num_diffusion_steps
        self.device = device
        
        # Initialize noise schedule (cosine)
        self._init_noise_schedule()
        
        # Both are created by init_model()
        self.model = None
        self.optimizer = None
    
    def _init_noise_schedule(self):
        """Initialize cosine noise schedule."""
        s = 0.008
        steps = self.num_steps + 1
        t = torch.linspace(0, self.num_steps, steps) / self.num_steps
        alphas_cumprod = torch.cos((t + s) / (1 + s) * np.pi / 2) ** 2
        alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
        betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
        betas = torch.clamp(betas, 0.0001, 0.9999)
        
        # The cumulative product is recomputed from the clamped betas so that
        # every derived table is consistent with the values actually used.
        self.betas = betas.to(self.device)
        self.alphas = (1.0 - betas).to(self.device)
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0).to(self.device)
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod).to(self.device)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1 - self.alphas_cumprod).to(self.device)
    
    def _require_model(self):
        """Return the model, or fail with a clear message if init_model() was never called."""
        if self.model is None:
            raise RuntimeError("Model is not initialized; call init_model() first.")
        return self.model
    
    def init_model(self, input_dim, hidden_dim=256):
        """Initialize the diffusion model and its AdamW optimizer (lr=1e-4).

        Args:
            input_dim: Number of feature columns per history row.
            hidden_dim: Hidden width of the denoising network.
        """
        self.model = ConditionalDiffusionModel(
            input_dim=input_dim,
            seq_length=self.seq_length,
            forecast_horizon=self.forecast_horizon,
            hidden_dim=hidden_dim
        ).to(self.device)
        
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
    
    def add_noise(self, x_0, t, noise=None):
        """Add noise to data at timestep t.

        Args:
            x_0: Clean targets [batch, forecast_horizon]
            t: Integer timesteps [batch], used to index the schedule tables
            noise: Optional noise tensor shaped like ``x_0``; drawn from a
                standard normal when omitted.
        
        Returns:
            Tuple ``(x_t, noise)`` with the noised targets and the noise used.
        """
        if noise is None:
            noise = torch.randn_like(x_0)
        
        # [:, None] broadcasts the per-sample coefficient over the horizon axis
        sqrt_alpha = self.sqrt_alphas_cumprod[t][:, None]
        sqrt_one_minus_alpha = self.sqrt_one_minus_alphas_cumprod[t][:, None]
        
        return sqrt_alpha * x_0 + sqrt_one_minus_alpha * noise, noise
    
    def train_step(self, condition, target):
        """Single training step; returns the MSE noise-prediction loss as a float."""
        model = self._require_model()
        batch_size = condition.shape[0]
        
        # Sample random timesteps
        t = torch.randint(0, self.num_steps, (batch_size,), device=self.device)
        
        # Add noise to target
        x_noisy, noise = self.add_noise(target, t)
        
        # Predict noise
        noise_pred = model(x_noisy, t.float(), condition)
        
        # Compute loss
        loss = F.mse_loss(noise_pred, noise)
        
        # Backprop (gradient norm clipped to 1.0)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        self.optimizer.step()
        
        return loss.item()
    
    def train(self, dataloader, epochs=100):
        """Full training loop.

        Args:
            dataloader: Iterable of ``(condition, target)`` batches.
            epochs: Number of passes over ``dataloader``.
        
        Returns:
            List with the mean training loss of every epoch.
        """
        self._require_model().train()
        losses = []
        
        for epoch in tqdm(range(epochs), desc="Training"):
            epoch_losses = []
            for condition, target in dataloader:
                condition = condition.to(self.device)
                target = target.to(self.device)
                
                loss = self.train_step(condition, target)
                epoch_losses.append(loss)
            
            avg_loss = np.mean(epoch_losses)
            losses.append(avg_loss)
            
            if (epoch + 1) % 20 == 0:
                print(f"Epoch {epoch+1}: Loss = {avg_loss:.6f}")
        
        return losses
    
    @torch.no_grad()
    def sample(self, condition, n_samples=100):
        """
        Generate forecast samples using DDPM sampling.
        
        The model is switched to eval mode for the duration of the call and
        its previous train/eval mode is restored afterwards.
        
        Args:
            condition: Historical features [1, seq_length, input_dim]
            n_samples: Number of Monte Carlo samples
        
        Returns:
            samples: [n_samples, forecast_horizon]
        
        Raises:
            RuntimeError: if init_model() has not been called.
        """
        model = self._require_model()
        was_training = model.training
        model.eval()
        
        try:
            # Expand condition for all samples
            condition = condition.expand(n_samples, -1, -1).to(self.device)
            
            # Start from pure noise
            x = torch.randn(n_samples, self.forecast_horizon, device=self.device)
            
            # Iteratively denoise
            for t in reversed(range(self.num_steps)):
                t_tensor = torch.full((n_samples,), t, device=self.device, dtype=torch.float32)
                
                # Predict noise
                noise_pred = model(x, t_tensor, condition)
                
                # Compute coefficients
                alpha = self.alphas[t]
                alpha_cumprod = self.alphas_cumprod[t]
                beta = self.betas[t]
                
                # Denoise
                x = (1 / torch.sqrt(alpha)) * (
                    x - (beta / torch.sqrt(1 - alpha_cumprod)) * noise_pred
                )
                
                # Add noise (except for last step); the step variance is beta_t
                if t > 0:
                    noise = torch.randn_like(x)
                    x = x + torch.sqrt(beta) * noise
        finally:
            # Do not leave a model that was in train mode stuck in eval mode
            model.train(was_training)
        
        return x.cpu().numpy()
    
    def forecast(self, condition, n_samples=100):
        """
        Generate probabilistic forecast with uncertainty.
        
        Args:
            condition: Historical features [1, seq_length, input_dim]
            n_samples: Number of Monte Carlo samples
        
        Returns:
            dict with
                'samples': raw samples [n_samples, forecast_horizon]
                'mean', 'std', 'median': per-step statistics [forecast_horizon]
                'confidence_5', 'confidence_25', 'confidence_75',
                'confidence_95': per-step percentiles [forecast_horizon]
            All values are in the scale of the training targets.
        """
        samples = self.sample(condition, n_samples)
        
        return {
            'samples': samples,
            'mean': np.mean(samples, axis=0),
            'std': np.std(samples, axis=0),
            'median': np.median(samples, axis=0),
            'confidence_5': np.percentile(samples, 5, axis=0),
            'confidence_95': np.percentile(samples, 95, axis=0),
            'confidence_25': np.percentile(samples, 25, axis=0),
            'confidence_75': np.percentile(samples, 75, axis=0)
        }

In [None]:
# Prepare data
feature_cols = ['returns', 'volatility_24h', 'rsi_14', 'macd', 'bb_pct', 
                'volume_ratio', 'momentum_24h']

# Create dataset
dataset = TimeSeriesDataset(
    btc_features, 
    feature_cols=feature_cols,
    target_col='close',
    seq_length=100,
    forecast_horizon=24
)
assert len(dataset) > 0, "Not enough rows for a single (history, forecast) window"

# Split into train/val/test (chronological: earliest windows are used for training)
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
assert train_size > 0, "Training split is empty"

# Avoid look-ahead leakage: the dataset fits its scalers on the full series,
# which lets validation/test statistics influence the training inputs.
# Refit both scalers on the rows reachable from training windows only, then
# re-apply them to every row.
n_train_rows = train_size + dataset.seq_length + dataset.forecast_horizon - 1
feature_values = btc_features[feature_cols].values
target_values = btc_features[['close']].values
dataset.features = dataset.feature_scaler.fit(
    feature_values[:n_train_rows]
).transform(feature_values)
dataset.targets = dataset.target_scaler.fit(
    target_values[:n_train_rows]
).transform(target_values)

train_dataset = torch.utils.data.Subset(dataset, range(train_size))
val_dataset = torch.utils.data.Subset(dataset, range(train_size, train_size + val_size))
test_dataset = torch.utils.data.Subset(dataset, range(train_size + val_size, len(dataset)))

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

# Create dataloaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Build the pipeline and its denoising network
pipeline_config = dict(
    seq_length=100,
    forecast_horizon=24,
    num_diffusion_steps=500,  # Reduced for faster training
    device=device,
)
pipeline = CryptoDiffusionPipeline(**pipeline_config)

# One model input per engineered feature column
pipeline.init_model(input_dim=len(feature_cols), hidden_dim=128)

print(f"Model parameters: {sum(p.numel() for p in pipeline.model.parameters()):,}")

In [None]:
# Fit the denoiser on the training windows
losses = pipeline.train(train_loader, epochs=50)

# Mean loss per epoch on a logarithmic y-axis
loss_fig, loss_ax = plt.subplots(figsize=(10, 4))
loss_ax.plot(losses)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Loss')
loss_ax.set_yscale('log')
loss_ax.grid(True, alpha=0.3)
plt.show()

## 4. Forecasting with Uncertainty Quantification

We generate probabilistic forecasts using Monte Carlo sampling.

In [None]:
# Take the first test window and give its history a leading batch axis
first_window = test_dataset[0]
test_target = first_window[1]
test_condition = first_window[0][None]  # [1, seq_length, input_dim]

# Draw Monte Carlo samples and summarise them
forecast_result = pipeline.forecast(test_condition, n_samples=100)

print(f"Forecast shape: {forecast_result['mean'].shape}")
print(f"Mean forecast: {forecast_result['mean'][:5]}")
print(f"Std: {forecast_result['std'][:5]}")

In [None]:
def _to_price(scaled_values):
    """Undo the target scaling and return a flat array of prices."""
    column = scaled_values.reshape(-1, 1)
    return dataset.target_scaler.inverse_transform(column).flatten()


# Ground truth for the forecast window, back in price units
actual_target = _to_price(test_target.numpy())

# Forecast summary statistics, back in price units
forecast_mean = _to_price(forecast_result['mean'])
forecast_5 = _to_price(forecast_result['confidence_5'])
forecast_95 = _to_price(forecast_result['confidence_95'])
forecast_25 = _to_price(forecast_result['confidence_25'])
forecast_75 = _to_price(forecast_result['confidence_75'])

In [None]:
# Visualize forecast
fig, ax = plt.subplots(figsize=(12, 6))

# Derive the x-axis from the data instead of hard-coding 24, so the cell keeps
# working when forecast_horizon is changed.
horizon = len(actual_target)
hours = np.arange(horizon)

# Plot actual
ax.plot(hours, actual_target, 'b-', linewidth=2, label='Actual')

# Plot forecast mean
ax.plot(hours, forecast_mean, 'r-', linewidth=2, label='Forecast (mean)')

# Plot confidence intervals (5-95 and 25-75 percentile bands of the samples)
ax.fill_between(hours, forecast_5, forecast_95, color='red', alpha=0.2, label='90% CI')
ax.fill_between(hours, forecast_25, forecast_75, color='red', alpha=0.3, label='50% CI')

ax.set_xlabel('Hours Ahead')
ax.set_ylabel('BTC Price (USDT)')
ax.set_title(f'{horizon}-Hour Bitcoin Price Forecast with Uncertainty')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Evaluate on multiple test samples
def evaluate_forecasts(pipeline, dataset, test_indices, n_samples=50):
    """Evaluate forecasts on multiple test samples.

    Args:
        pipeline: Object exposing ``forecast(condition, n_samples)`` that
            returns a dict with at least 'mean' and 'samples'.
        dataset: Indexable dataset yielding ``(condition, target)`` tensors and
            exposing the fitted ``target_scaler``.
        test_indices: Iterable of dataset indices to evaluate.
        n_samples: Monte Carlo samples drawn per forecast.

    Returns:
        Tuple ``(actuals, forecasts, stds)`` of arrays shaped
        [len(test_indices), forecast_horizon]. All three are expressed in the
        original price units; the standard deviation is computed on the
        inverse-transformed samples so it is directly comparable to the
        actuals and forecasts.
    """
    def to_price(scaled):
        # Inverse-scale an array of any shape and hand it back in that shape
        scaled = np.asarray(scaled)
        flat = dataset.target_scaler.inverse_transform(scaled.reshape(-1, 1))
        return flat.reshape(scaled.shape)
    
    all_actuals = []
    all_forecasts = []
    all_stds = []
    
    for idx in tqdm(test_indices, desc="Evaluating"):
        condition, target = dataset[idx]
        condition = condition.unsqueeze(0)  # add the batch axis expected by forecast()
        
        forecast_result = pipeline.forecast(condition, n_samples=n_samples)
        
        # Inverse transform
        all_actuals.append(to_price(target.numpy()))
        all_forecasts.append(to_price(forecast_result['mean']))
        # Spread across samples, per forecast step, in price units
        all_stds.append(to_price(forecast_result['samples']).std(axis=0))
    
    return np.array(all_actuals), np.array(all_forecasts), np.array(all_stds)

# Score the first (up to) 20 test windows
test_indices = range(len(test_dataset))
eval_positions = test_indices[:20]

# Subset positions are translated to indices of the underlying dataset,
# because evaluate_forecasts indexes that dataset directly.
underlying_indices = [test_dataset.indices[pos] for pos in eval_positions]

actuals, forecasts, stds = evaluate_forecasts(
    pipeline,
    test_dataset.dataset,
    underlying_indices,
    n_samples=50,
)

In [None]:
# Compute metrics
def compute_metrics(actuals, forecasts):
    """Per-horizon and overall error metrics for a batch of forecasts.

    Args:
        actuals: Array [n_windows, horizon] of realised values.
        forecasts: Array of the same shape with the point forecasts.

    Returns:
        Dict with 'mse_per_horizon', 'mae_per_horizon', 'mape_per_horizon'
        (arrays of length horizon) and 'overall_mse', 'overall_mae',
        'overall_mape', 'overall_rmse' (scalars averaged over the horizon).
    """
    errors = actuals - forecasts
    
    # Average over windows (axis 0) to get one value per forecast step
    per_horizon = {
        'mse': np.mean(errors ** 2, axis=0),
        'mae': np.mean(np.abs(errors), axis=0),
        # Percentage error; the epsilon guards against a zero denominator
        'mape': np.mean(np.abs(errors / (actuals + 1e-10)) * 100, axis=0),
    }
    
    # Collapse the horizon axis for the headline numbers
    overall = {name: np.mean(values) for name, values in per_horizon.items()}
    
    return {
        'mse_per_horizon': per_horizon['mse'],
        'mae_per_horizon': per_horizon['mae'],
        'mape_per_horizon': per_horizon['mape'],
        'overall_mse': overall['mse'],
        'overall_mae': overall['mae'],
        'overall_mape': overall['mape'],
        'overall_rmse': np.sqrt(overall['mse'])
    }

metrics = compute_metrics(actuals, forecasts)

# Headline numbers: RMSE and MAE in dollars, MAPE in percent
summary_lines = (
    "\n=== Forecast Metrics ===",
    f"Overall RMSE: ${metrics['overall_rmse']:.2f}",
    f"Overall MAE: ${metrics['overall_mae']:.2f}",
    f"Overall MAPE: {metrics['overall_mape']:.2f}%",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# One panel per metric, each plotted against the forecast step
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

hours = np.arange(1, 25)

# (values, line format, y-axis label, title) for each panel, left to right
metric_panels = [
    (np.sqrt(metrics['mse_per_horizon']), 'b-o', 'RMSE ($)', 'RMSE by Forecast Horizon'),
    (metrics['mae_per_horizon'], 'g-o', 'MAE ($)', 'MAE by Forecast Horizon'),
    (metrics['mape_per_horizon'], 'r-o', 'MAPE (%)', 'MAPE by Forecast Horizon'),
]

for panel_ax, (panel_values, line_fmt, y_label, panel_title) in zip(axes, metric_panels):
    panel_ax.plot(hours, panel_values, line_fmt)
    panel_ax.set_xlabel('Forecast Horizon (hours)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Key Takeaways

1. **Diffusion models** provide natural uncertainty quantification through Monte Carlo sampling
2. **Technical indicators** (RSI, MACD, Bollinger Bands) help condition the forecast
3. **Probabilistic forecasts** are more useful than point predictions for risk management
4. **Forecast error increases** with horizon (as expected)

### Limitations:
- Slow inference (requires many denoising steps)
- Computationally expensive training
- Struggles with extreme events (black swans)

### Next Steps:
- See notebook 06 for comparison with GANs
- See Rust implementation for production deployment