# Statistical Arbitrage - Cointegrated Pairs Trading

This notebook demonstrates a statistical arbitrage strategy that:
- Finds cointegrated pairs from a universe of 50 coins
- Trades perpetual futures (allowing both long and short positions)
- Uses z-score for entry/exit signals
- Manages multiple pairs simultaneously

In [None]:
# Import required libraries
from crypto_backtest import run_backtest, load_data, optimize_strategy
from crypto_backtest.features import (
    zscore, cointegration_test, half_life, 
    rolling_corr, rolling_beta
)
import pandas as pd
import numpy as np
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Configuration: everything a reader is likely to tune lives in this cell.
EXCHANGE = 'binance'
TIMEFRAME = '15m'  # 15-minute bars for more signals
START_DATE = '2024-01-01'
END_DATE = '2024-03-31'
INITIAL_CAPITAL = 50000  # Larger capital for multiple pairs

# Number of contracts the rest of the notebook expects in the universe
UNIVERSE_SIZE = 50

# Universe of liquid perpetual contracts on Binance
# NOTE(review): Binance is believed to list the SHIB perpetual as 1000SHIB —
# confirm that 'SHIB/USDT:USDT' resolves in load_data.
UNIVERSE = [
    'BTC/USDT:USDT', 'ETH/USDT:USDT', 'BNB/USDT:USDT', 'XRP/USDT:USDT',
    'ADA/USDT:USDT', 'DOGE/USDT:USDT', 'SOL/USDT:USDT', 'DOT/USDT:USDT',
    'MATIC/USDT:USDT', 'SHIB/USDT:USDT', 'TRX/USDT:USDT', 'AVAX/USDT:USDT',
    'UNI/USDT:USDT', 'ATOM/USDT:USDT', 'LTC/USDT:USDT', 'LINK/USDT:USDT',
    'BCH/USDT:USDT', 'NEAR/USDT:USDT', 'XLM/USDT:USDT', 'ALGO/USDT:USDT',
    'ICP/USDT:USDT', 'FIL/USDT:USDT', 'APT/USDT:USDT', 'ARB/USDT:USDT',
    'OP/USDT:USDT', 'INJ/USDT:USDT', 'SUI/USDT:USDT', 'SEI/USDT:USDT',
    'TIA/USDT:USDT', 'ORDI/USDT:USDT', 'GALA/USDT:USDT', 'SAND/USDT:USDT',
    'MANA/USDT:USDT', 'AXS/USDT:USDT', 'ETC/USDT:USDT', 'HBAR/USDT:USDT',
    'XTZ/USDT:USDT', 'CHZ/USDT:USDT', 'ID/USDT:USDT', 'AR/USDT:USDT',
    'BLUR/USDT:USDT', 'GMT/USDT:USDT', 'CFX/USDT:USDT', 'DYDX/USDT:USDT',
    'OCEAN/USDT:USDT', 'ONE/USDT:USDT', 'AAVE/USDT:USDT', 'COMP/USDT:USDT',
    'SNX/USDT:USDT', 'MKR/USDT:USDT'
][:UNIVERSE_SIZE]  # Cap at UNIVERSE_SIZE symbols

# Slicing only caps the list; these checks catch a short list or a duplicated
# symbol, which would otherwise silently shrink the number of pairs tested.
assert len(UNIVERSE) == UNIVERSE_SIZE, (
    f"expected {UNIVERSE_SIZE} symbols, got {len(UNIVERSE)}"
)
assert len(set(UNIVERSE)) == len(UNIVERSE), "UNIVERSE contains duplicate symbols"

In [None]:
# Fetch OHLCV history for every symbol in the universe
print(f"Loading data for {len(UNIVERSE)} symbols...")
load_args = (UNIVERSE, EXCHANGE, TIMEFRAME, START_DATE, END_DATE)
data = load_data(*load_args)

# Report the frame shape of the first symbol as a quick sanity check
first_symbol = UNIVERSE[0]
print(f"Data loaded. Shape: {data[first_symbol]['ohlcv'].shape}")

In [None]:
# Function to find cointegrated pairs
def find_cointegrated_pairs(data, p_value_threshold=0.05, lookback_days=30,
                            periods_per_day=96, min_half_life=1, max_half_life=30):
    """
    Find cointegrated pairs from the universe.

    Every unordered pair of symbols in ``data`` is tested on its most recent
    ``lookback_days`` of close prices. A pair is kept when its cointegration
    p-value is below ``p_value_threshold``, its hedge ratio is finite, and the
    half-life of its spread lies strictly between ``min_half_life`` and
    ``max_half_life``.

    Parameters:
    - data: mapping of symbol -> mapping whose ``'ohlcv'`` entry exposes a
      ``'close'`` series supporting ``.iloc``
    - p_value_threshold: maximum p-value for a pair to be accepted
    - lookback_days: number of days of history used for the test
    - periods_per_day: bars per day (96 for 15-minute bars)
    - min_half_life / max_half_life: exclusive bounds on the spread half-life,
      in the units returned by ``half_life`` (the original comment says periods)

    Returns:
    - list of dicts with keys ``symbol1``, ``symbol2``, ``p_value``,
      ``hedge_ratio`` and ``half_life``, sorted by ascending p-value

    Raises:
    - ValueError: if the lookback works out to zero or fewer bars
    """
    symbols = list(data.keys())

    # Calculate lookback periods
    lookback = int(lookback_days * periods_per_day)
    if lookback <= 0:
        # iloc[-0:] would silently select the whole history instead of nothing
        raise ValueError("lookback_days * periods_per_day must be at least 1 bar")

    # n choose 2, without materialising every combination just to count them
    n_symbols = len(symbols)
    print(f"Testing {n_symbols * (n_symbols - 1) // 2} pairs for cointegration...")

    pairs = []
    n_failed = 0
    for symbol1, symbol2 in combinations(symbols, 2):
        try:
            # Get recent prices
            price1 = data[symbol1]['ohlcv']['close'].iloc[-lookback:]
            price2 = data[symbol2]['ohlcv']['close'].iloc[-lookback:]

            # Test for cointegration
            p_value = cointegration_test(price1, price2)
            if not p_value < p_value_threshold:
                continue

            # Hedge ratio: latest value of the rolling beta over the window
            hedge_ratio = rolling_beta(price1, price2, lookback).iloc[-1]
            if not np.isfinite(hedge_ratio):
                continue

            # Half-life of the hedged spread
            spread = price1 - hedge_ratio * price2
            hl = half_life(spread)

            if min_half_life < hl < max_half_life:
                pairs.append({
                    'symbol1': symbol1,
                    'symbol2': symbol2,
                    'p_value': p_value,
                    'hedge_ratio': hedge_ratio,
                    'half_life': hl
                })
        except Exception:
            # Best effort: one bad pair must not abort the scan. Catching
            # Exception rather than everything lets Ctrl-C still interrupt.
            n_failed += 1
            continue

    # Sort by p-value
    pairs.sort(key=lambda x: x['p_value'])

    if n_failed:
        print(f"Skipped {n_failed} pairs that raised errors during testing")
    print(f"Found {len(pairs)} cointegrated pairs")
    return pairs

In [None]:
# Scan the full universe, then list the ten pairs with the lowest p-values
cointegrated_pairs = find_cointegrated_pairs(data)

print("\nTop 10 cointegrated pairs:")
for rank, pair in enumerate(cointegrated_pairs[:10], start=1):
    print(f"{rank}. {pair['symbol1']} - {pair['symbol2']}: p-value={pair['p_value']:.4f}, hedge_ratio={pair['hedge_ratio']:.4f}")

In [None]:
# Statistical Arbitrage Strategy
# Pair list carried between strategy calls. ``**params`` gives the strategy a
# fresh dict on every call, so anything written into ``params`` is lost again
# as soon as the call returns; the rebalanced pairs are kept here instead.
_STAT_ARB_STATE = {}


def _select_disjoint_pairs(pairs, max_pairs):
    """Pick up to ``max_pairs`` pairs, best-ranked first, sharing no symbol."""
    chosen, used = [], set()
    for pair in pairs:
        if len(chosen) >= max_pairs:
            break
        legs = {pair['symbol1'], pair['symbol2']}
        if legs & used:
            continue
        chosen.append(pair)
        used |= legs
    return chosen


def _closing_order(symbol, pos):
    """Build the order that flattens ``pos`` units of ``symbol``."""
    return {'symbol': symbol, 'side': 'sell' if pos > 0 else 'buy', 'size': abs(pos)}


def stat_arb_strategy(data, position, timestamp, **params):
    """
    Statistical arbitrage strategy for cointegrated pairs.

    For each traded pair the spread ``price1 - hedge_ratio * price2`` is
    z-scored. When the pair is flat and ``|z|`` exceeds ``entry_z`` the spread
    is sold (z high) or bought (z low). When a position is open and ``|z|``
    drops below ``exit_z`` both legs are closed.

    Arguments:
    - data: mapping of symbol -> mapping with an ``'ohlcv'`` frame holding a
      ``'close'`` series; presumably truncated to the current bar by the
      backtester — TODO confirm
    - position: mapping of symbol -> signed position size (needs ``.get`` and
      ``.items``)
    - timestamp: current bar timestamp (unused)

    Parameters (passed as keyword arguments):
    - lookback: Lookback period for calculations
    - lookback_days: Days of history for the cointegration scan (default 30)
    - entry_z: Z-score threshold for entry
    - exit_z: Z-score threshold for exit
    - max_pairs: Maximum number of pairs to trade
    - position_size: Units of symbol1 per trade; symbol2 is sized at
      ``position_size * hedge_ratio``
    - rebalance_frequency: How often to recalculate pairs (in periods);
      default 1000, a falsy value disables rebalancing
    - _pairs: Initial pair list used until the first rebalance

    Returns:
    - list of order dicts with keys ``symbol``, ``side`` and ``size``

    Notes:
    - Positions are tracked per symbol, so only pairs that share no symbol
      with a better-ranked pair are traded.
    - Open legs in symbols that no longer belong to a traded pair are closed.
    - State is kept at module level and resets when the bar count goes
      backwards or a different ``_pairs`` object is supplied, which is taken
      to mean that a new backtest run has started.
    """
    orders = []
    if not data:
        return orders

    # Get current period: number of bars visible for the first symbol
    current_period = len(next(iter(data.values()))['ohlcv'])

    # Detect the start of a new run and fall back to the caller's seed pairs
    seed_pairs = params.get('_pairs')
    state = _STAT_ARB_STATE
    new_run = (
        'pairs' not in state
        or state['seed'] is not seed_pairs
        or current_period < state['last_period']
    )
    if new_run:
        state['seed'] = seed_pairs
        state['pairs'] = list(seed_pairs or [])
    state['last_period'] = current_period

    # Recalculate pairs periodically and keep them for the following calls
    rebalance_every = params.get('rebalance_frequency', 1000)
    if rebalance_every and current_period % rebalance_every == 0:
        state['pairs'] = find_cointegrated_pairs(
            data, lookback_days=params.get('lookback_days', 30)
        )

    # Get pairs to trade
    pairs_to_trade = _select_disjoint_pairs(state['pairs'], params['max_pairs'])
    lookback = params['lookback']

    for pair in pairs_to_trade:
        symbol1 = pair['symbol1']
        symbol2 = pair['symbol2']

        # Get prices
        price1 = data[symbol1]['ohlcv']['close']
        price2 = data[symbol2]['ohlcv']['close']

        if len(price1) < lookback or len(price2) < lookback:
            continue

        # Calculate spread and z-score; skip the pair when either is undefined
        hedge_ratio = rolling_beta(price1, price2, lookback).iloc[-1]
        if not np.isfinite(hedge_ratio):
            continue
        spread = price1 - hedge_ratio * price2
        z = zscore(spread, lookback).iloc[-1]
        if not np.isfinite(z):
            continue

        # Current positions
        pos1 = position.get(symbol1, 0)
        pos2 = position.get(symbol2, 0)

        if pos1 == 0 and pos2 == 0:
            # Enter only with a positive hedge ratio: a negative one would
            # yield a negative order size on the second leg.
            if abs(z) > params['entry_z'] and hedge_ratio > 0:
                leg2_size = params['position_size'] * hedge_ratio
                if z > params['entry_z']:
                    # Spread too high: short symbol1, long symbol2
                    orders.append({'symbol': symbol1, 'side': 'sell', 'size': params['position_size']})
                    orders.append({'symbol': symbol2, 'side': 'buy', 'size': leg2_size})
                else:
                    # Spread too low: long symbol1, short symbol2
                    orders.append({'symbol': symbol1, 'side': 'buy', 'size': params['position_size']})
                    orders.append({'symbol': symbol2, 'side': 'sell', 'size': leg2_size})

        elif abs(z) < params['exit_z']:
            # Exit trade: close each leg according to its own sign, so a
            # missing or mismatched leg never produces a wrong-way order.
            for symbol, pos in ((symbol1, pos1), (symbol2, pos2)):
                if pos != 0:
                    orders.append(_closing_order(symbol, pos))

    # Flatten legs left behind by pairs that were dropped at a rebalance.
    # Restricted to symbols present in ``data`` in case ``position`` also
    # carries non-tradable entries.
    active_symbols = {
        symbol
        for pair in pairs_to_trade
        for symbol in (pair['symbol1'], pair['symbol2'])
    }
    for symbol, pos in position.items():
        if pos != 0 and symbol in data and symbol not in active_symbols:
            orders.append(_closing_order(symbol, pos))

    return orders

In [None]:
# Pre-calculate initial pairs from the opening window only.
# find_cointegrated_pairs looks at the *last* `lookback_days` of whatever it is
# given, so passing the full sample would select pairs from the final month and
# then trade them from the first bar (look-ahead bias).
PAIR_LOOKBACK_DAYS = 30
BARS_PER_DAY = 96  # 15-minute bars
warmup_bars = PAIR_LOOKBACK_DAYS * BARS_PER_DAY

# assumes data[symbol]['ohlcv'] is a DataFrame-like object supporting .iloc — TODO confirm
warmup_data = {
    symbol: {'ohlcv': fields['ohlcv'].iloc[:warmup_bars]}
    for symbol, fields in data.items()
}
initial_pairs = find_cointegrated_pairs(warmup_data, lookback_days=PAIR_LOOKBACK_DAYS)

# NOTE(review): trades taken inside the warm-up window still use pairs fitted
# on that same window; exclude it when judging out-of-sample performance.

# Strategy parameters
params = {
    'lookback': BARS_PER_DAY * 2,  # 2 days of 15-minute bars
    'lookback_days': PAIR_LOOKBACK_DAYS,  # For cointegration test
    'entry_z': 2.0,
    'exit_z': 0.5,
    'max_pairs': 5,  # Trade top 5 pairs
    'position_size': 0.1,  # Units of symbol1 per trade; symbol2 is scaled by the hedge ratio
    'rebalance_frequency': BARS_PER_DAY * 7,  # Rebalance weekly
    '_pairs': initial_pairs  # Store initial pairs
}

In [None]:
# Baseline backtest: collect the engine settings first, then launch the run
print("Running backtest...")
backtest_kwargs = dict(
    data=data,
    strategy=stat_arb_strategy,
    initial_capital=INITIAL_CAPITAL,
    params=params,
    commission=0.0002,  # Lower commission for market makers
    slippage_model='linear',
    slippage_bps=5,  # 5 bps slippage
)
results = run_backtest(**backtest_kwargs)

In [None]:
# Headline performance statistics
print(results.summary())

# Break the trade log down by symbol (skipped when nothing was traded)
if len(results.trades) > 0:
    trades_df = results.trades
    symbol_counts = trades_df['symbol'].value_counts()
    print(
        f"\nTotal trades: {len(trades_df)}",
        f"Unique symbols traded: {trades_df['symbol'].nunique()}",
        "\nTrades by symbol:",
        symbol_counts.head(10),
        sep="\n",
    )

In [None]:
# Plot the baseline backtest via the results object's own plot() method
results.plot()

## Parameter Optimization

Let's optimize the z-score thresholds, the number of pairs traded, and the position size, using the Sharpe ratio as the objective.

In [None]:
# Search ranges, given as (low, high), for the parameters being tuned
param_space = dict(
    entry_z=(1.5, 3.0),
    exit_z=(0.0, 1.0),
    max_pairs=(3, 10),
    position_size=(0.05, 0.2),
)

# Parameters held constant throughout the search
fixed_params = dict(
    lookback=96 * 2,
    lookback_days=30,
    rebalance_frequency=96 * 7,
    _pairs=initial_pairs,
)


def _strategy_with_fixed_params(d, p, t, **kw):
    """Call stat_arb_strategy with the sampled parameters layered over the fixed ones."""
    return stat_arb_strategy(d, p, t, **{**fixed_params, **kw})


# NOTE(review): unlike the baseline run, no slippage settings are passed here —
# confirm that optimize_strategy applies comparable trading costs.
# NOTE(review): the search is scored on the same data the baseline was run on,
# so the best Sharpe ratio is an in-sample figure.
print("Running parameter optimization...")
opt_results = optimize_strategy(
    data=data,
    strategy=_strategy_with_fixed_params,
    param_space=param_space,
    metric='sharpe_ratio',
    n_trials=30,
    initial_capital=INITIAL_CAPITAL,
    commission=0.0002,
)

print(f"\nBest parameters: {opt_results['best_params']}")
print(f"Best Sharpe ratio: {opt_results['best_value']:.2f}")

In [None]:
# Summarise and plot the run stored under 'final_results' by the optimizer
final_results = opt_results['final_results']
print(final_results.summary())
final_results.plot()

## Analysis

This statistical arbitrage strategy:
1. Identifies cointegrated pairs from a universe of 50 perpetual futures
2. Trades mean reversion using z-score signals
3. Manages multiple pairs simultaneously
4. Rebalances the pair universe periodically

Key considerations for production:
- Monitor cointegration stability
- Implement proper risk limits per pair
- Consider funding rates for perpetual contracts
- Add stop losses for diverging pairs
- Optimize execution to minimize slippage