In [68]:
# imports
import os
import glob
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from tqdm import tqdm
import os
import glob
import numpy as np
import pandas as pd
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
from statsmodels.tsa.stattools import adfuller

In [69]:
# globals
DATA_DIR = ""   # <- Directory where parquet files are stored
WINDOW = 4320 # rolling window length (4320 = 3 days of 1-min bars set by default)
PVAL_THRESHOLD = 0.05 # cointegration significance level

In [None]:
import pandas as pd
import glob
import os

def load_and_align_fixed_start(folder, start="2024-09-01 22:00:00", 
                               target_len=525600, freq="1min"):
    """
    Load crypto parquet files, align all to start at a fixed timestamp,
    keep only those with full target_len observations.
    Returns dict {symbol: DataFrame} with exactly target_len rows.
    """
    start = pd.Timestamp(start)
    target_index = pd.date_range(start=start, periods=target_len, freq=freq)
    data = {}

    for f in glob.glob(os.path.join(folder, "*.parquet")):
        sym = os.path.basename(f).replace("_1m.parquet", "")
        df = pd.read_parquet(f).sort_index()

        # Restrict to target timeline
        df = df.reindex(target_index)

        # Drop if not complete
        if df["close"].notna().sum() < target_len:
            print(f"Skipping {sym}: only {df["close"].notna().sum()} valid points")
            continue
        print(f"Keeping {sym}, {len(df)} points")
        # Forward-fill small gaps if needed
        # df = df.ffill()
        df["close"] = np.log(df["close"])  # log-prices
        df["open"] = np.log(df["open"])
        df["high"] = np.log(df["high"])
        df["low"] = np.log(df["low"])
        df["log_return"] = df["close"].diff()
        data[sym] = df

    return data

crypto_data = load_and_align_fixed_start("", start="2024-09-01 22:00:00")
print(len(crypto_data))  # number of cryptos kept


Keeping AAVE, 525600 points
Keeping ADA, 525600 points
Keeping APT, 525600 points
Keeping ARB, 525600 points
Keeping ATOM, 525600 points
Keeping AVAX, 525600 points
Keeping BCH, 525600 points
Keeping BNB, 525600 points
Keeping BTC, 525600 points
Keeping DOGE, 525600 points
Keeping DOT, 525600 points
Keeping ENA, 525600 points
Keeping ETC, 525600 points
Keeping ETH, 525600 points
Keeping HBAR, 525600 points
Keeping LINK, 525600 points
Keeping LTC, 525600 points
Keeping NEAR, 525600 points
Skipping ONDO: only 206400 valid points
Skipping POL: only 509040 valid points
Keeping SOL, 525600 points
Keeping SUI, 525600 points
Skipping SYRUP: only 170340 valid points
Keeping TON, 525600 points
Keeping TRX, 525600 points
Keeping UNI, 525600 points
Keeping WLD, 525600 points
Keeping XLM, 525600 points
Keeping XRP, 525600 points
26


In [71]:
# first round selection based on correlation (whole period)
# correlation matrix
def compute_correlation_matrix(crypto_data):
    """
    Compute correlation matrix of closing prices for all cryptos.
    Returns a DataFrame with symbols as both index and columns.
    """
    symbols = list(crypto_data.keys())
    close_prices = pd.DataFrame({sym: crypto_data[sym]["close"] for sym in symbols})
    corr_matrix = close_prices.corr()
    return corr_matrix
correlation_matrix = compute_correlation_matrix(crypto_data)
print(correlation_matrix)

          AAVE       ADA       APT       ARB      ATOM      AVAX       BCH  \
AAVE  1.000000  0.724250 -0.059267  0.280073  0.408515  0.324061  0.732150   
ADA   0.724250  1.000000  0.054949  0.273225  0.613604  0.395559  0.570653   
APT  -0.059267  0.054949  1.000000  0.864795  0.757655  0.871707  0.070465   
ARB   0.280073  0.273225  0.864795  1.000000  0.848723  0.964136  0.354453   
ATOM  0.408515  0.613604  0.757655  0.848723  1.000000  0.915912  0.438328   
AVAX  0.324061  0.395559  0.871707  0.964136  0.915912  1.000000  0.372970   
BCH   0.732150  0.570653  0.070465  0.354453  0.438328  0.372970  1.000000   
BNB   0.816593  0.673416 -0.128915  0.236374  0.336754  0.259385  0.836926   
BTC   0.815698  0.801420 -0.312959 -0.081907  0.228093  0.030855  0.721445   
DOGE  0.650759  0.862172  0.444808  0.578287  0.802669  0.692628  0.573414   
DOT   0.346792  0.563036  0.793314  0.863794  0.965500  0.924411  0.289770   
ENA   0.647865  0.721372  0.519281  0.738480  0.795536  0.792374

In [72]:
# select those with correlation > 0.85
high_corr_pairs = []
threshold = 0.85
symbols = list(crypto_data.keys())
for i in range(len(symbols)):
    for j in range(i+1, len(symbols)):
        sym1, sym2 = symbols[i], symbols[j]
        corr = correlation_matrix.loc[sym1, sym2]
        if abs(corr) > threshold:
            high_corr_pairs.append((sym1, sym2))
            print(f"High correlation: {sym1} & {sym2} = {corr:.2f}")

High correlation: ADA & DOGE = 0.86
High correlation: ADA & HBAR = 0.93
High correlation: ADA & LINK = 0.87
High correlation: ADA & LTC = 0.88
High correlation: ADA & SUI = 0.86
High correlation: ADA & XLM = 0.96
High correlation: ADA & XRP = 0.87
High correlation: APT & ARB = 0.86
High correlation: APT & AVAX = 0.87
High correlation: APT & NEAR = 0.95
High correlation: APT & TON = 0.90
High correlation: APT & WLD = 0.94
High correlation: ARB & AVAX = 0.96
High correlation: ARB & DOT = 0.86
High correlation: ARB & ETC = 0.91
High correlation: ARB & NEAR = 0.94
High correlation: ARB & TON = 0.88
High correlation: ARB & WLD = 0.92
High correlation: ATOM & AVAX = 0.92
High correlation: ATOM & DOT = 0.97
High correlation: ATOM & ETC = 0.91
High correlation: AVAX & DOT = 0.92
High correlation: AVAX & ETC = 0.93
High correlation: AVAX & NEAR = 0.92
High correlation: AVAX & WLD = 0.91
High correlation: BTC & SUI = 0.85
High correlation: BTC & TRX = 0.92
High correlation: BTC & XLM = 0.87
High

In [73]:
len(high_corr_pairs)  # number of high correlation pairs found

46

In [74]:
def rolling_cointegration(y, x, window=4320, adf_pval=0.05):
    """
    Rolling Engle–Granger cointegration test with beta estimation.
    Returns DataFrame with beta, ADF p-value, and spread.
    """
    results = []
    for start in range(0, len(y) - window, window//3):
        end = start + window
        y_win, x_win = y.iloc[start:end], x.iloc[start:end]
        corr = y_win.corr(x_win)
        # Step 1: OLS regression y ~ x
        model = OLS(y_win, add_constant(x_win)).fit()
        alpha, beta = model.params

        # Step 2: ADF on residuals
        residuals = y_win - model.predict(add_constant(x_win))
        adf_p = adfuller(residuals)[1]

        results.append({
            "start": y.index[start],
            "end": y.index[end],
            "alpha": alpha,
            "beta": beta,
            "adf_p": adf_p,
            "cointegrated": adf_p <= adf_pval,
            "correlation": corr
        })

    return pd.DataFrame(results)


In [None]:
def prepare_all_pairs(crypto_data, list_of_pairs, window=4320, adf_pval=0.05):
    """
    Iterate through crypto pairs, compute rolling cointegration + correlation,
    and generate RL features.
    Returns: dict { (sym1, sym2): features_df }
    """
    pair_df = {}
    i = 1
    for sym1, sym2 in list_of_pairs:
        print(f"Processing pair: {sym1}, {sym2}. {i} of {len(list_of_pairs)}")
        i += 1
        y_ohlc = crypto_data[sym1]
        x_ohlc = crypto_data[sym2]

        # align close prices for cointegration
        df_close = pd.concat([y_ohlc["close"], x_ohlc["close"]], axis=1, join="inner").dropna()
        y_aligned, x_aligned = df_close.iloc[:, 0], df_close.iloc[:, 1]

        # Rolling-window Engle–Granger test
        coint_df = rolling_cointegration(y_aligned, x_aligned, window=window,
                                        adf_pval=adf_pval)
        pair_df[(sym1, sym2)] = coint_df

    return pair_df

In [76]:
coint_df = prepare_all_pairs(crypto_data, high_corr_pairs, window=WINDOW,
                             adf_pval=PVAL_THRESHOLD)

Processing pair: ADA, DOGE. 1 of 46
Processing pair: ADA, HBAR. 2 of 46
Processing pair: ADA, LINK. 3 of 46
Processing pair: ADA, LTC. 4 of 46
Processing pair: ADA, SUI. 5 of 46
Processing pair: ADA, XLM. 6 of 46
Processing pair: ADA, XRP. 7 of 46
Processing pair: APT, ARB. 8 of 46
Processing pair: APT, AVAX. 9 of 46
Processing pair: APT, NEAR. 10 of 46
Processing pair: APT, TON. 11 of 46
Processing pair: APT, WLD. 12 of 46
Processing pair: ARB, AVAX. 13 of 46
Processing pair: ARB, DOT. 14 of 46
Processing pair: ARB, ETC. 15 of 46
Processing pair: ARB, NEAR. 16 of 46
Processing pair: ARB, TON. 17 of 46
Processing pair: ARB, WLD. 18 of 46
Processing pair: ATOM, AVAX. 19 of 46
Processing pair: ATOM, DOT. 20 of 46
Processing pair: ATOM, ETC. 21 of 46
Processing pair: AVAX, DOT. 22 of 46
Processing pair: AVAX, ETC. 23 of 46
Processing pair: AVAX, NEAR. 24 of 46
Processing pair: AVAX, WLD. 25 of 46
Processing pair: BTC, SUI. 26 of 46
Processing pair: BTC, TRX. 27 of 46
Processing pair: BTC,

In [None]:
coint_df[("ADA", "DOGE")]  # example for one pair


Unnamed: 0,start,end,beta,adf_p,cointegrated,correlation
0,2024-09-01 22:00:00,2024-09-04 22:00:00,0.621066,0.542324,False,0.509772
1,2024-09-02 22:00:00,2024-09-05 22:00:00,1.129587,0.034548,True,0.858768
2,2024-09-03 22:00:00,2024-09-06 22:00:00,0.593175,0.076234,False,0.809096
3,2024-09-04 22:00:00,2024-09-07 22:00:00,0.425993,0.148105,False,0.734872
4,2024-09-05 22:00:00,2024-09-08 22:00:00,0.517710,0.823804,False,0.504750
...,...,...,...,...,...,...
357,2025-08-24 22:00:00,2025-08-27 22:00:00,0.835671,0.357909,False,0.965981
358,2025-08-25 22:00:00,2025-08-28 22:00:00,0.524518,0.508401,False,0.858548
359,2025-08-26 22:00:00,2025-08-29 22:00:00,1.036413,0.259675,False,0.862308
360,2025-08-27 22:00:00,2025-08-30 22:00:00,1.106676,0.227912,False,0.945504


In [80]:
#save dictionary into csv
for (sym1, sym2), df in coint_df.items():
    filename = f"{sym1}_{sym2}_window_cointegration.csv"
    df.to_csv(filename, index=False)
    print(f"Saved features for pair {sym1}-{sym2} to {filename}")

Saved features for pair ADA-DOGE to ADA_DOGE_window_cointegration.csv
Saved features for pair ADA-HBAR to ADA_HBAR_window_cointegration.csv
Saved features for pair ADA-LINK to ADA_LINK_window_cointegration.csv
Saved features for pair ADA-LTC to ADA_LTC_window_cointegration.csv
Saved features for pair ADA-SUI to ADA_SUI_window_cointegration.csv
Saved features for pair ADA-XLM to ADA_XLM_window_cointegration.csv
Saved features for pair ADA-XRP to ADA_XRP_window_cointegration.csv
Saved features for pair APT-ARB to APT_ARB_window_cointegration.csv
Saved features for pair APT-AVAX to APT_AVAX_window_cointegration.csv
Saved features for pair APT-NEAR to APT_NEAR_window_cointegration.csv
Saved features for pair APT-TON to APT_TON_window_cointegration.csv
Saved features for pair APT-WLD to APT_WLD_window_cointegration.csv
Saved features for pair ARB-AVAX to ARB_AVAX_window_cointegration.csv
Saved features for pair ARB-DOT to ARB_DOT_window_cointegration.csv
Saved features for pair ARB-ETC to A

In [None]:
# selection of top-5 pairs per window based on correlation
top_pairs_per_window = {}
for pair, df in coint_df.items():
    for _, row in df.iterrows():
        window_key = (row["start"], row["end"])
        if window_key not in top_pairs_per_window:
            top_pairs_per_window[window_key] = []
        if row["cointegrated"]:
            top_pairs_per_window[window_key].append((pair, row["correlation"], row["beta"], row["alpha"], row["adf_p"]))
# keep top 5 pairs by correlation for each window
for window_key, pairs in top_pairs_per_window.items():
    # sort by absolute correlation
    pairs.sort(key=lambda x: abs(x[1]), reverse=True)  
    top_pairs_per_window[window_key] = pairs[:5]


In [None]:
# calculate how many times each pair appears in top-5 across all windows
from collections import Counter
pair_counter = Counter()
for pairs in top_pairs_per_window.values():
    for pair_info in pairs:
        pair = pair_info[0]
        pair_counter[pair] += 1


In [None]:
len(pair_counter)
# each pair occurs at least once in top-5 across all windows

46

In [None]:
pair_counter.most_common() 

[(('AVAX', 'DOT'), 67),
 (('DOT', 'ETC'), 65),
 (('HBAR', 'XLM'), 63),
 (('ARB', 'ETC'), 62),
 (('ATOM', 'DOT'), 62),
 (('ATOM', 'ETC'), 60),
 (('ARB', 'WLD'), 59),
 (('XLM', 'XRP'), 58),
 (('NEAR', 'WLD'), 58),
 (('ADA', 'HBAR'), 55),
 (('ARB', 'DOT'), 55),
 (('ADA', 'DOGE'), 54),
 (('APT', 'ARB'), 53),
 (('ARB', 'NEAR'), 53),
 (('ADA', 'LINK'), 50),
 (('ATOM', 'AVAX'), 48),
 (('AVAX', 'NEAR'), 46),
 (('ETC', 'UNI'), 46),
 (('ADA', 'XLM'), 46),
 (('APT', 'NEAR'), 44),
 (('AVAX', 'ETC'), 41),
 (('ARB', 'AVAX'), 40),
 (('AVAX', 'WLD'), 38),
 (('ENA', 'LINK'), 37),
 (('APT', 'WLD'), 37),
 (('ADA', 'XRP'), 36),
 (('ETC', 'SOL'), 33),
 (('HBAR', 'XRP'), 33),
 (('DOGE', 'ENA'), 31),
 (('APT', 'AVAX'), 30),
 (('TON', 'WLD'), 30),
 (('ENA', 'ETC'), 29),
 (('LINK', 'LTC'), 28),
 (('BTC', 'XRP'), 26),
 (('ADA', 'LTC'), 25),
 (('BTC', 'XLM'), 24),
 (('ENA', 'UNI'), 23),
 (('BTC', 'SUI'), 21),
 (('HBAR', 'LTC'), 20),
 (('ARB', 'TON'), 19),
 (('ADA', 'SUI'), 18),
 (('LTC', 'XLM'), 17),
 (('TRX', '

In [120]:
# calulcate how many unique crypto assets are involved in top pairs
unique_assets = set()
for pair in pair_counter.keys():
    unique_assets.update(pair)
len(unique_assets)  # number of unique assets in top pairs

22

In [None]:
# creation a DataFrame with all 1-min timestamps and prices for all symbols in cointegrated pairs, and window-specific spreads

#Get all unique symbols from cointegrated pairs
all_symbols = set()
for pairs in top_pairs_per_window.values():
    for pair_info in pairs:
        sym1, sym2 = pair_info[0]
        all_symbols.add(sym1)
        all_symbols.add(sym2)
all_symbols = sorted(list(all_symbols))

full_df = pd.DataFrame({"timestamp": crypto_data["BTC"].index})  # index of any crypto is in fact timestamps - take any

# price columns for each symbol
for sym in all_symbols:
    df = crypto_data[sym].copy()
    if "timestamp" not in df.columns:
        df = df.reset_index().rename(columns={"index": "timestamp"})
    full_df = full_df.merge(df[["timestamp", "close"]], on="timestamp", how="left")
    full_df = full_df.rename(columns={"close": f"{sym}_close"})

# for each window and pair, assign beta, alpha, and calculate spread for that window
for (start, end), pairs in top_pairs_per_window.items():
    mask = (full_df["timestamp"] >= start) & (full_df["timestamp"] <= end)
    for pair_info in pairs:
        sym1, sym2 = pair_info[0]
        corr, beta, alpha, adf_p = pair_info[1:]
        spread_col = f"{sym1}_{sym2}_spread"
        if spread_col not in full_df.columns:
            full_df[spread_col] = np.nan
        y = full_df.loc[mask, f"{sym1}_close"]
        x = full_df.loc[mask, f"{sym2}_close"]
        full_df.loc[mask, spread_col] = y - (alpha + beta * x)
        full_df.loc[mask, f"{sym1}_{sym2}_beta"] = beta
        full_df.loc[mask, f"{sym1}_{sym2}_alpha"] = alpha
        full_df.loc[mask, f"{sym1}_{sym2}_adf_p"] = adf_p
        full_df.loc[mask, f"{sym1}_{sym2}_corr"] = corr

full_df.head()
# 46 (pairs) spread features*5 + 22 (assets) price features + 1 timestamp = 253 columns total

  full_df.loc[mask, f"{sym1}_{sym2}_adf_p"] = adf_p
  full_df.loc[mask, f"{sym1}_{sym2}_corr"] = corr
  full_df[spread_col] = np.nan
  full_df.loc[mask, f"{sym1}_{sym2}_beta"] = beta
  full_df.loc[mask, f"{sym1}_{sym2}_alpha"] = alpha
  full_df.loc[mask, f"{sym1}_{sym2}_adf_p"] = adf_p
  full_df.loc[mask, f"{sym1}_{sym2}_corr"] = corr
  full_df[spread_col] = np.nan
  full_df.loc[mask, f"{sym1}_{sym2}_beta"] = beta
  full_df.loc[mask, f"{sym1}_{sym2}_alpha"] = alpha
  full_df.loc[mask, f"{sym1}_{sym2}_adf_p"] = adf_p
  full_df.loc[mask, f"{sym1}_{sym2}_corr"] = corr
  full_df[spread_col] = np.nan
  full_df.loc[mask, f"{sym1}_{sym2}_beta"] = beta
  full_df.loc[mask, f"{sym1}_{sym2}_alpha"] = alpha
  full_df.loc[mask, f"{sym1}_{sym2}_adf_p"] = adf_p
  full_df.loc[mask, f"{sym1}_{sym2}_corr"] = corr
  full_df[spread_col] = np.nan
  full_df.loc[mask, f"{sym1}_{sym2}_beta"] = beta
  full_df.loc[mask, f"{sym1}_{sym2}_alpha"] = alpha
  full_df.loc[mask, f"{sym1}_{sym2}_adf_p"] = adf_p
  full_d

Unnamed: 0,timestamp,ADA_close,APT_close,ARB_close,ATOM_close,AVAX_close,BTC_close,DOGE_close,DOT_close,ENA_close,...,ADA_XRP_spread,ADA_XRP_beta,ADA_XRP_alpha,ADA_XRP_adf_p,ADA_XRP_corr,ENA_UNI_spread,ENA_UNI_beta,ENA_UNI_alpha,ENA_UNI_adf_p,ENA_UNI_corr
0,2024-09-01 22:00:00,-1.085301,1.850028,-0.687961,1.49223,3.091042,10.975311,-2.319833,1.429114,-1.465338,...,,,,,,,,,,
1,2024-09-01 22:01:00,-1.087969,1.848455,-0.689753,1.490204,3.087856,10.973769,-2.322278,1.426476,-1.465338,...,,,,,,,,,,
2,2024-09-01 22:02:00,-1.088563,1.846879,-0.691149,1.489528,3.08603,10.97269,-2.323707,1.425996,-1.474033,...,,,,,,,,,,
3,2024-09-01 22:03:00,-1.089752,1.8453,-0.692148,1.489302,3.08603,10.971744,-2.324115,1.425034,-1.474033,...,,,,,,,,,,
4,2024-09-01 22:04:00,-1.090049,1.843719,-0.692348,1.489077,3.085116,10.971383,-2.324933,1.423831,-1.474033,...,,,,,,,,,,


In [124]:
full_df.to_csv("historical_pairs_with_spreads.csv", index=False)