In [5]:
#Correlation arbitrage
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ffn
from datetime import datetime, timedelta
from scipy.stats import pearsonr
import vectorbt as vbt
from tapy import Indicators
import plotly.graph_objects as go

In [6]:
CL = yf.download('CL=F', start = '2016-01-01')
BZ = yf.download('BZ=F', start = '2016-01-01')
CL.columns = CL.columns.get_level_values(0)
BZ.columns = BZ.columns.get_level_values(0)
# Standardize column names (removes MultiIndex if present)
CL.columns = CL.columns.get_level_values(0)
BZ.columns = BZ.columns.get_level_values(0)

# Combine the 'Close' prices of both assets into a single DataFrame.
# This ensures alignment and drops any rows where data is missing for either asset.
data = pd.concat([CL['Close'], BZ['Close']], axis=1, keys=['CL', 'BZ']).dropna()

# Calculate daily percentage returns for correlation and window performance.
cl_returns = data['CL'].pct_change()
bz_returns = data['BZ'].pct_change()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [9]:

BZ_cum_rets = (1 + bz_returns).cumprod() - 1
CL_cum_rets = (1 + cl_returns).cumprod() - 1

# Set the rolling window size
rolling_window = 30  # Example: 30 days

In [10]:
# Define a custom function to calculate rolling Pearson correlation.
# It handles potential NaN values and ensures the output Series has the correct index.
# Efficiently calculate rolling correlation
data['Rolling_Corr'] = cl_returns.rolling(window=rolling_window).corr(bz_returns)

# Forward fill any initial NaNs in Rolling_Corr that result from the `reindex` operation,
# as the `rolling_pearsonr_custom` only fills nans when there is insufficient data within the window
data['Rolling_Corr'] = data['Rolling_Corr'].fillna(method='ffill')

# Calculate the percentage return of each asset over the `rolling_window` period.
# This is used to identify the "lagging" asset.
data['CL_Window_Return'] = data['CL'].pct_change(periods=rolling_window)
data['BZ_Window_Return'] = data['BZ'].pct_change(periods=rolling_window)

# Drop any rows that now have NaNs due to window calculations at the start of the dataset.
data = data.dropna()
data

  data['Rolling_Corr'] = data['Rolling_Corr'].fillna(method='ffill')


Unnamed: 0_level_0,CL,BZ,Rolling_Corr,CL_Window_Return,BZ_Window_Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-02-17,30.660000,34.500000,0.889689,-0.165941,-0.073079
2016-02-18,30.770000,34.279999,0.888266,-0.144565,-0.058759
2016-02-19,29.639999,33.009998,0.886106,-0.127465,-0.035641
2016-02-22,31.480000,34.689999,0.889981,-0.053802,0.027852
2016-02-23,31.870001,33.270000,0.871261,-0.038902,-0.008346
...,...,...,...,...,...
2025-07-21,67.199997,69.209999,0.990353,0.040570,0.041222
2025-07-22,66.209999,68.589996,0.990014,0.014091,0.023120
2025-07-23,65.250000,68.510002,0.986888,0.004155,0.024525
2025-07-24,66.029999,69.180000,0.985667,-0.031108,-0.008456


In [11]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=data.index,  # x-axis is the index (time periods)
    y=CL_cum_rets,  # y-axis is the AC values
    marker=dict(color='aqua'),
    name="CL"))

fig.add_trace(go.Scatter(
    x=data.index,  # x-axis is the index (time periods)
    y=BZ_cum_rets,  # y-axis is the AC values
    marker=dict(color='darkgreen'),
    name="BZ"))

fig.add_trace(go.Scatter(
    x=data.index,  # x-axis is the index (time periods)
    y=rolling_corr,  # y-axis is the AC values
    marker=dict(color='black'),
    name="R_corr"))

NameError: name 'rolling_corr' is not defined

In [None]:
# --- 3. Vectorized Strategy Logic ---
print("Generating trading signals...")

# Define strategy parameters
ENTRY_CORR_THRESHOLD_LOW = 0.6
EXIT_CORR_THRESHOLD_HIGH = 0.9

# Use .shift(1) to base today's entry decision on yesterday's data (no lookahead bias)
prev_corr = data['Rolling_Corr'].shift(1)
cl_prev_window_ret = data['CL_Window_Return'].shift(1)
bz_prev_window_ret = data['BZ_Window_Return'].shift(1)

# Define entry conditions using boolean logic
is_low_corr = (prev_corr < ENTRY_CORR_THRESHOLD_LOW)
cl_is_lagging = (cl_prev_window_ret < bz_prev_window_ret)
bz_is_lagging = (bz_prev_window_ret < cl_prev_window_ret)

# Generate entry signals directly (NO need for empty_like or loops)
entries_CL = is_low_corr & cl_is_lagging
entries_BZ = is_low_corr & bz_is_lagging

# Define exit condition (reversion to high correlation)
is_high_corr = data['Rolling_Corr'] > EXIT_CORR_THRESHOLD_HIGH
exits_CL = is_high_corr
exits_BZ = is_high_corr

# Combine signals into DataFrames for vectorbt
entries = pd.DataFrame({'CL': entries_CL, 'BZ': entries_BZ})
exits = pd.DataFrame({'CL': exits_CL, 'BZ': exits_BZ})

# --- 4. Backtesting with vectorbt ---
print("\nRunning backtest with vectorbt...")

pf = vbt.Portfolio.from_signals(
    data[['CL', 'BZ']],
    entries=entries,
    exits=exits,
    freq='1D',
    init_cash=100000
)

# --- 5. Display Results ---
print("\n--- Portfolio Performance ---")
print(pf.stats())


In [None]:
# --- 6. Plotting ---

# For a multi-asset portfolio, you must specify which column to plot
# against when visualizing orders.

print("\nGenerating portfolio plot for CL...")
fig_cl = pf.plot(column='CL')
fig_cl.show()

print("\nGenerating portfolio plot for BZ...")
fig_bz = pf.plot(column='BZ')
fig_bz.show()