# Spread Analysis for Event-Based Trading

This notebook focuses on analyzing bid-ask spread events for Squid_Ink. We'll use only the first 20,000 timestamps (in-sample data) for our analysis.

In [None]:
import sys
import os

# Import our backtester package
sys.path.append(os.path.abspath('../../'))
from backtester import get_price_data, get_vwap, relative_entropy_binned
print("Using backtester package")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm

## 1. Load Data

First, let's load the Squid_Ink price data and limit it to the first 20,000 timestamps (in-sample data).

In [None]:
# Load data directly using backtester package
print("Loading price data...")
prices = get_price_data('SQUID_INK', 1)
print(f"Loaded {len(prices)} price data points")

# Limit to the first 20,000 rows (in-sample data).
# NOTE(review): this slices by row position; it equals "first 20,000 timestamps"
# only if there is exactly one row per timestamp — confirm against the loader.
# .copy() makes the in-sample frame independent of `prices`, so later cells can
# add columns to it without SettingWithCopyWarning or view/copy ambiguity.
in_sample_prices = prices.iloc[:20000].copy()
print(f"Limited to {len(in_sample_prices)} in-sample data points")

# Get VWAP (read from the 'vwap' column of the price frame)
print("Getting VWAP for SQUID_INK...")
squid_vwap = in_sample_prices['vwap']
print(f"Got VWAP with {len(squid_vwap)} data points")
print(f"VWAP range: {squid_vwap.min()} to {squid_vwap.max()}")

# Calculate log returns; the first row has no predecessor, so it is dropped
# and `log_ret` is one element shorter than `squid_vwap`.
log_ret = np.log(squid_vwap).diff().dropna()
print(f"Calculated log returns with {len(log_ret)} data points")

## 2. Calculate Spread Metrics

Let's calculate bid-ask spread metrics from the order book data.

In [None]:
# Compute bid-ask spread metrics from the top-of-book quotes.
# The derived columns are built as standalone Series and attached with
# DataFrame.assign, which returns a new frame. This avoids chained assignment
# on a slice of `prices` and keeps the cell idempotent on re-run.
window = 50  # 50-period window for the rolling mean / std

# Absolute spread and mid price
spread = in_sample_prices['ask_price_1'] - in_sample_prices['bid_price_1']
mid_price = (in_sample_prices['ask_price_1'] + in_sample_prices['bid_price_1']) / 2

# Rolling mean and volatility of the spread (NaN for the first window-1 rows)
spread_ma = spread.rolling(window=window).mean()
spread_std = spread.rolling(window=window).std()

# Z-score of the spread. A rolling std of exactly zero (constant spread over the
# whole window) is masked to NaN so the division cannot produce +/-inf, which
# would otherwise be classified as an extreme event downstream.
spread_zscore = (spread - spread_ma) / spread_std.replace(0, np.nan)

in_sample_prices = in_sample_prices.assign(
    spread=spread,
    mid_price=mid_price,
    relative_spread=spread / mid_price * 100,  # in percentage of mid price
    spread_ma=spread_ma,
    spread_std=spread_std,
    spread_zscore=spread_zscore,
)

# Display the first few rows
in_sample_prices[['spread', 'relative_spread', 'spread_ma', 'spread_std', 'spread_zscore']].head(10)

## 3. Visualize Spread Metrics

Let's visualize the spread metrics over time and their distributions.

In [None]:
# Three stacked panels over the in-sample index:
# absolute spread (with rolling mean), relative spread, and spread z-score.
fig, (ax_abs, ax_rel, ax_z) = plt.subplots(3, 1, figsize=(15, 15))

# Panel 1: absolute spread and its moving average
ax_abs.plot(in_sample_prices['spread'], label='Spread')
ax_abs.plot(in_sample_prices['spread_ma'], label='Spread MA', color='red')
ax_abs.set_title('Absolute Spread Over Time')

# Panel 2: spread as a percentage of mid price
ax_rel.plot(in_sample_prices['relative_spread'], label='Relative Spread (%)')
ax_rel.set_title('Relative Spread Over Time (%)')

# Panel 3: z-score with +/-2 reference lines
ax_z.plot(in_sample_prices['spread_zscore'], label='Spread Z-Score')
for level, level_label in ((2, 'Z=2'), (-2, 'Z=-2')):
    ax_z.axhline(y=level, color='red', linestyle='--', label=level_label)
ax_z.set_title('Spread Z-Score Over Time')

# Shared cosmetics
for ax in (ax_abs, ax_rel, ax_z):
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid: three histograms of the spread metrics plus a spread-vs-VWAP scatter.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Histograms (NaNs from the rolling warm-up are dropped before binning)
histogram_panels = [
    (axes[0, 0], 'spread', 'Absolute Spread Distribution'),
    (axes[0, 1], 'relative_spread', 'Relative Spread Distribution (%)'),
    (axes[1, 0], 'spread_zscore', 'Spread Z-Score Distribution'),
]
for ax, column, title in histogram_panels:
    ax.hist(in_sample_prices[column].dropna(), bins=50)
    ax.set_title(title)
    ax.grid(True)

# Scatter of spread against VWAP
ax_scatter = axes[1, 1]
ax_scatter.scatter(in_sample_prices['vwap'], in_sample_prices['spread'], alpha=0.5)
ax_scatter.set_title('Spread vs. VWAP')
ax_scatter.set_xlabel('VWAP')
ax_scatter.set_ylabel('Spread')
ax_scatter.grid(True)

plt.tight_layout()
plt.show()

## 4. Define Spread Events

Let's define spread events based on extreme values of the spread z-score.

In [None]:
# Z-score cut-offs (in rolling standard deviations) that define each event tier.
spread_thresholds = dict(
    moderate=1.5,
    strong=2.0,
    extreme=2.5,
)

# Echo the configured tiers
for name in spread_thresholds:
    threshold = spread_thresholds[name]
    print(f"{name.capitalize()} spread threshold: {threshold:.1f} standard deviations")

In [None]:
# Build the event table: start from the spread and its z-score, then add one
# 0/1 indicator column per (tier, direction) pair.
spread_events = in_sample_prices[['spread', 'spread_zscore']].copy()

zscore = spread_events['spread_zscore']
for name, threshold in spread_thresholds.items():
    # Widening: z-score above +threshold. Narrowing: z-score below -threshold.
    # NaN z-scores (rolling warm-up) compare False and therefore map to 0.
    spread_events[f'{name}_widening'] = zscore.gt(threshold).astype(int)
    spread_events[f'{name}_narrowing'] = zscore.lt(-threshold).astype(int)

# Display the first few rows
spread_events.head(10)

## 5. Analyze Spread Events

Let's analyze the frequency and characteristics of spread events.

In [None]:
# Tally each event indicator column and express it as a share of all rows.
event_columns = [
    column for column in spread_events.columns
    if column.endswith(('_widening', '_narrowing'))
]

# Indicators are 0/1, so the column sum is the event count
spread_counts = {column: spread_events[column].sum() for column in event_columns}

# Share of all rows in the event table (rows with NaN z-score are included in the denominator)
total_points = len(spread_events)
spread_percentages = {
    column: count / total_points * 100
    for column, count in spread_counts.items()
}

# One row per event type
counts_df = pd.DataFrame({
    'Count': spread_counts,
    'Percentage (%)': spread_percentages,
})

counts_df

## 6. Visualize Spread Events

Let's visualize the spread events on the VWAP chart.

In [None]:
# Two stacked panels: VWAP on top, spread below, both annotated with the
# extreme widening / narrowing events.
fig, (ax_vwap, ax_spread) = plt.subplots(2, 1, figsize=(15, 10))

# Index labels at which each extreme event fires
extreme_widening = spread_events.loc[spread_events['extreme_widening'].eq(1)].index
extreme_narrowing = spread_events.loc[spread_events['extreme_narrowing'].eq(1)].index

# (event index, marker, colour, legend label) shared by both panels
event_markers = [
    (extreme_widening, '^', 'red', 'Extreme Widening'),
    (extreme_narrowing, 'v', 'green', 'Extreme Narrowing'),
]

# Top panel: VWAP with event markers
ax_vwap.plot(squid_vwap, label='VWAP', alpha=0.7)
for event_index, marker, color, label in event_markers:
    ax_vwap.scatter(event_index, squid_vwap.loc[event_index],
                    marker=marker, s=100, color=color, label=label)
ax_vwap.set_title('Squid_Ink VWAP with Spread Events')

# Bottom panel: spread with the same event markers
ax_spread.plot(spread_events['spread'], label='Spread', alpha=0.7)
for event_index, marker, color, label in event_markers:
    ax_spread.scatter(event_index, spread_events.loc[event_index, 'spread'],
                      marker=marker, s=100, color=color, label=label)
ax_spread.set_title('Spread with Events')

for ax in (ax_vwap, ax_spread):
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

## 7. Analyze Post-Event Returns

Let's analyze the returns following spread events.

In [None]:
# Define function to calculate post-event returns
def calculate_post_event_returns(events, returns, event_col, horizons=(1, 5, 10, 20)):
    """Average cumulative return over the rows following each flagged event.

    Parameters:
    - events: DataFrame whose `event_col` column equals 1 on event rows
    - returns: Series of per-period returns; events are matched to it by index label
    - event_col: name of the indicator column in `events`
    - horizons: iterable of holding periods, measured in rows of `returns`

    Returns:
    - dict mapping each horizon to the mean, across events, of the sum of the
      `horizon` returns strictly after the event row. The value is NaN when no
      event has a full window available (or when there are no events at all).

    Events whose label is absent from `returns.index` are skipped, as are
    events too close to the end of `returns` for a complete window.
    """
    # Get event timestamps
    event_times = events[events[event_col] == 1].index

    if len(event_times) == 0:
        return {h: np.nan for h in horizons}

    # Resolve each event label to its integer position in `returns` once,
    # instead of repeating the lookup for every horizon.
    event_positions = []
    for time in event_times:
        try:
            idx = returns.index.get_loc(time)
        except KeyError:
            # Label not present in `returns` (e.g. a row removed by dropna)
            continue
        # get_loc yields a slice or boolean mask for duplicated labels; such
        # ambiguous matches are skipped rather than guessed at.
        if isinstance(idx, (int, np.integer)):
            event_positions.append(int(idx))

    n_returns = len(returns)
    post_returns = {}

    # Calculate post-event returns for each horizon
    for horizon in horizons:
        # Window covers positions idx+1 .. idx+horizon and must fit inside `returns`
        horizon_returns = [
            returns.iloc[idx + 1:idx + horizon + 1].sum()
            for idx in event_positions
            if idx + horizon < n_returns
        ]
        post_returns[horizon] = np.mean(horizon_returns) if horizon_returns else np.nan

    return post_returns

In [None]:
# Average post-event return for every event indicator column and horizon.
horizons = [1, 5, 10, 20, 50]

event_columns = [
    column for column in spread_events.columns
    if column.endswith(('_widening', '_narrowing'))
]
post_returns = {
    column: calculate_post_event_returns(spread_events, log_ret, column, horizons)
    for column in event_columns
}

# Dict of dicts -> DataFrame: one column per event type, one row per horizon
post_returns_df = pd.DataFrame(post_returns)

# Display the results
post_returns_df

In [None]:
# One panel per spread direction; each line is an event tier plotted across horizons.
fig, (ax_widening, ax_narrowing) = plt.subplots(2, 1, figsize=(15, 10))

direction_panels = [
    (ax_widening, 'widening', 'Post-Event Returns for Spread Widening Events'),
    (ax_narrowing, 'narrowing', 'Post-Event Returns for Spread Narrowing Events'),
]

for ax, direction, title in direction_panels:
    matching_columns = [c for c in post_returns_df.columns if direction in c]
    for col in matching_columns:
        ax.plot(post_returns_df.index, post_returns_df[col], marker='o', label=col)
    # Zero line separates positive from negative average returns
    ax.axhline(y=0, color='r', linestyle='--')
    ax.set_title(title)
    ax.set_xlabel('Time Horizon')
    ax.set_ylabel('Average Return')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

## 8. Develop a Trading Strategy

Based on our analysis of spread events, let's develop a simple trading strategy.

In [None]:
# Define a simple trading strategy based on spread events
def spread_trading_strategy(prices, log_ret, spread_events, event_type, horizon):
    """Hold a fixed-direction position for `horizon` rows after each spread event.

    Parameters:
    - prices: DataFrame with price data (only its index is used here)
    - log_ret: Series with log returns, aligned to `prices` by index label
    - spread_events: DataFrame with 0/1 spread event indicator columns
    - event_type: Name of the indicator column to trade (e.g., 'extreme_widening')
    - horizon: Holding period after event, in rows

    Returns:
    - positions: Series with trading positions (1 for long, -1 for short, 0 for no position)
    - returns: Series with strategy returns (NaN rows dropped)

    Notes:
    - Event types containing 'widening' go short, those containing 'narrowing'
      go long; any other event type leaves every position at 0.
    - Positions start on the row after the event and are shifted by one more
      row before being multiplied with `log_ret`, so the first return earned
      is two rows after the event.
    - When holding windows overlap, the later event overwrites the earlier one.
    """
    # Start flat everywhere
    positions = pd.Series(0, index=prices.index)
    n_rows = len(positions)

    # Index labels where the chosen event fires
    is_event = spread_events[event_type] == 1
    event_times = spread_events[is_event].index

    # Trade direction depends only on the event type, not on the individual event:
    # widening -> short (market makers are pulling liquidity),
    # narrowing -> long (market makers are adding liquidity).
    if 'widening' in event_type:
        direction = -1
    elif 'narrowing' in event_type:
        direction = 1
    else:
        direction = None

    for time in event_times:
        try:
            # Integer position of the event row
            idx = prices.index.get_loc(time)

            if direction is None:
                continue

            # Hold from the row after the event, clipped to the end of the data
            start_idx = idx + 1
            end_idx = min(idx + horizon + 1, n_rows)
            positions.iloc[start_idx:end_idx] = direction

        except Exception as e:
            print(f"Error processing event at {time}: {e}")
            continue

    # Shift positions by 1 to avoid look-ahead bias, then apply to the returns
    lagged_positions = positions.shift(1)
    strategy_returns = lagged_positions * log_ret

    return positions, strategy_returns.dropna()

In [None]:
# Test the strategy with different parameters
strategy_results = {}

# Test different event types and horizons
for event_type in ['moderate_widening', 'moderate_narrowing',
                   'strong_widening', 'strong_narrowing',
                   'extreme_widening', 'extreme_narrowing']:
    for horizon in [5, 10, 20]:
        strategy_name = f"{event_type}_h{horizon}"

        # Run the strategy
        positions, returns = spread_trading_strategy(
            in_sample_prices, log_ret, spread_events, event_type, horizon)

        # Calculate performance metrics
        total_return = returns.sum()

        # NOTE(review): sqrt(252) annualizes *daily* returns, but `returns` has one
        # entry per row of the price data. The factor rescales every strategy
        # equally, so the ranking is unaffected, but the level should not be read
        # as an annual Sharpe ratio — confirm the intended periods-per-year.
        sharpe_ratio = returns.mean() / returns.std() * np.sqrt(252)

        # Win rate over bars with non-zero P&L only. `returns` also holds a zero
        # for every bar on which the strategy is flat; counting those as
        # "not a win" would dilute the rate towards 0 for rarely-traded strategies.
        # The result is NaN when the strategy never earns a non-zero return.
        traded_returns = returns[returns != 0]
        win_rate = (traded_returns > 0).mean()

        # Store results
        strategy_results[strategy_name] = {
            'Total Return': total_return,
            'Sharpe Ratio': sharpe_ratio,
            'Win Rate': win_rate,
            'Returns': returns
        }

# Display performance metrics (one row per strategy; the raw return series is left out)
metrics_df = pd.DataFrame({
    name: {
        'Total Return': results['Total Return'],
        'Sharpe Ratio': results['Sharpe Ratio'],
        'Win Rate': results['Win Rate']
    } for name, results in strategy_results.items()
}).T

metrics_df.sort_values('Sharpe Ratio', ascending=False).head(10)

In [None]:
# Cumulative return curves for the three strategies with the highest Sharpe ratio.
fig, ax = plt.subplots(figsize=(15, 7))

# Rank by Sharpe ratio and keep the top three strategy names
ranked_metrics = metrics_df.sort_values('Sharpe Ratio', ascending=False)
top_strategies = ranked_metrics.head(3).index

for strategy_name in top_strategies:
    cumulative_returns = strategy_results[strategy_name]['Returns'].cumsum()
    ax.plot(cumulative_returns, label=strategy_name)

ax.set_title('Cumulative Returns of Top Spread Strategies')
ax.legend()
ax.grid(True)
plt.show()

## 9. Conclusion

In this notebook, we've analyzed bid-ask spread events in the Squid_Ink data and developed trading strategies based on these events. We've used only the first 20,000 timestamps (in-sample data) for our analysis.

Key findings:
1. Spread events (widening and narrowing) occur with varying frequencies and magnitudes
2. There are patterns in post-event returns that can be exploited for trading
3. **TODO:** The best strategy appears to be _[to be filled in after running the notebook]_

In future analyses, we could explore combining spread events with price spike events and volume imbalance events to develop more robust trading strategies.