# 04 — Multi-Algorithm Comparison

Runs KMeans and DBSCAN alongside OPTICS on the combined universe.
Identifies consensus pairs and runs permutation tests.

In [1]:
import sys, os

project_root = os.path.abspath(os.path.join('..', '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import pickle
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, OPTICS

from config import DEFAULT_CONFIG
from signals.detection import compute_co_cluster_freq
from validation.pair_validation import feature_shuffle_permutation_test
from screener.universe import load_cached_universe

%matplotlib inline

## 1. Load Artifacts

In [2]:
data_dir = os.path.join('..', 'data', 'combined')

# Only the third element of the cached universe (the sector map) is used here.
_, _, sector_map = load_cached_universe('combined')


def _load_artifact(filename):
    """Unpickle and return the artifact called `filename` stored under data_dir."""
    with open(os.path.join(data_dir, filename), 'rb') as fh:
        return pickle.load(fh)


ts_df = _load_artifact('ts_df.pkl')
optics_freq = _load_artifact('pair_co_cluster_freq.pkl')
optics_ch = _load_artifact('cluster_history.pkl')

print(f"ts_df: {ts_df.shape[0]} rows")
print(f"OPTICS pairs: {len(optics_freq)}")

ts_df: 201498 rows
OPTICS pairs: 7836


## 2. Run KMeans & DBSCAN

In [3]:
# Feature columns handed to every clustering run (copied into a fresh list).
features = [*DEFAULT_CONFIG.features.features_to_cluster]
# larger universe needs more clusters
n_clusters_kmeans = 8

def run_alt_clustering(ts_df, features, algo='kmeans', n_clusters=None,
                       eps=1.5, min_samples=3):
    """Cluster every timestamp's cross-section with KMeans or DBSCAN.

    For each unique value of the 'Datetime' index level, the rows for that
    timestamp are restricted to `features`, rows with missing values are
    dropped, and the remainder is standardised, projected with PCA (keeping
    enough components to explain 90% of the variance) and clustered.
    Timestamps with fewer than 5 usable rows, or where no non-noise cluster
    is found, are skipped.

    Parameters
    ----------
    ts_df : pandas.DataFrame
        Frame whose index has a 'Datetime' level. The index left after
        selecting one timestamp is emitted as 'Ticker' (presumably ticker
        symbols -- confirm against the producer of ts_df).
    features : list of str
        Column names used as clustering inputs.
    algo : {'kmeans', 'dbscan'}
        Which algorithm to run.
    n_clusters : int, optional
        KMeans cluster count. When None, the notebook-level
        `n_clusters_kmeans` is used. Always capped at ``len(snapshot) - 1``.
    eps, min_samples : float, int
        DBSCAN parameters.

    Returns
    -------
    pandas.DataFrame
        One row per (ticker, timestamp) with columns 'Ticker', 'Datetime'
        and 'Cluster_ID'. DBSCAN noise points keep the label -1. The columns
        are present even when no timestamp produced a clustering.

    Raises
    ------
    ValueError
        If `algo` is not one of the supported names.
    KeyError
        If any requested feature is not a column of `ts_df`.
    """
    supported = ('kmeans', 'dbscan')
    if algo not in supported:
        # Fail fast: previously an unknown name left `model` unbound inside the loop.
        raise ValueError(f"Unsupported algo {algo!r}; expected one of {supported}")

    missing = [col for col in features if col not in ts_df.columns]
    if missing:
        # Previously this surfaced as a per-timestamp KeyError that was
        # swallowed, silently yielding an empty result.
        raise KeyError(f"Features missing from ts_df: {missing}")

    columns = ['Ticker', 'Datetime', 'Cluster_ID']
    timestamps = ts_df.index.get_level_values('Datetime').unique()
    results = []
    for ts in timestamps:
        try:
            cross_section = ts_df.xs(ts, level='Datetime')
        except KeyError:
            continue
        snapshot = cross_section[features].dropna()
        if len(snapshot) < 5:
            continue

        X_scaled = StandardScaler().fit_transform(snapshot.values)
        X_pca = PCA(n_components=0.90).fit_transform(X_scaled)

        if algo == 'kmeans':
            k = n_clusters_kmeans if n_clusters is None else n_clusters
            # KMeans cannot have more clusters than samples; keep k below the row count.
            model = KMeans(n_clusters=min(k, len(snapshot) - 1),
                           n_init=10, random_state=42)
        else:
            model = DBSCAN(eps=eps, min_samples=min_samples)
        labels = model.fit(X_pca).labels_

        # Count real clusters only; -1 is the noise label.
        n_c = len(set(labels)) - (1 if -1 in labels else 0)
        if n_c < 1:
            continue
        results.extend(
            {'Ticker': ticker, 'Datetime': ts, 'Cluster_ID': int(label)}
            for ticker, label in zip(snapshot.index.tolist(), labels)
        )
    return pd.DataFrame(results, columns=columns)

# Start from the OPTICS frequencies loaded from disk, then add each alternative algorithm.
algo_freqs = {'optics': optics_freq}
for algo_name in ('kmeans', 'dbscan'):
    print(f"Running {algo_name}...")
    cluster_hist = run_alt_clustering(ts_df, features, algo=algo_name)
    if cluster_hist.empty:
        print("  No valid clusters")
        continue
    pair_freq, _ = compute_co_cluster_freq(cluster_hist)
    # Share of assignments carrying the noise label -1.
    noise_rate = (cluster_hist['Cluster_ID'] == -1).mean()
    n_snapshots = cluster_hist['Datetime'].nunique()
    print(f"  {n_snapshots} snapshots, {len(pair_freq)} pairs, noise={noise_rate:.1%}")
    algo_freqs[algo_name] = pair_freq

Running kmeans...


  1419 snapshots, 10011 pairs, noise=0.0%
Running dbscan...


  1419 snapshots, 10011 pairs, noise=19.8%


## 3. Consensus Pairs

In [4]:
top_n = 20
algo_top_sets = {}
# Maps each "A-B" label back to its (ticker_a, ticker_b) tuple. Splitting the
# label on '-' is unsafe because tickers may themselves contain a hyphen
# (e.g. PBR-A), which would corrupt the sector lookup below.
pair_members = {}
for algo_name, freq in algo_freqs.items():
    # Highest co-clustering frequency first; keep the top_n pairs per algorithm.
    top_pairs = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:top_n]
    pair_strs = set()
    for (a, b), _ in top_pairs:
        label = f"{a}-{b}"
        pair_strs.add(label)
        pair_members[label] = (a, b)
    algo_top_sets[algo_name] = pair_strs
    print(f"{algo_name} top-{top_n}: {sorted(pair_strs)[:5]}...")

# A consensus pair must appear in every algorithm's top-N list.
# `consensus` stays a set of "A-B" strings, as before.
consensus = set.intersection(*algo_top_sets.values()) if len(algo_top_sets) >= 2 else set()
print(f"\nConsensus (in all {len(algo_top_sets)} algos' top-{top_n}): {len(consensus)}")
for pair in sorted(consensus):
    # Tag with sector info, using the original tickers rather than re-parsing the label.
    ticker_a, ticker_b = pair_members[pair]
    sa = sector_map.get(ticker_a, '?')
    sb = sector_map.get(ticker_b, '?')
    ptype = 'INTRA' if sa == sb else 'CROSS'
    print(f"  {pair:25s} {sa} / {sb}  [{ptype}]")

optics top-20: ['AAL-DAL', 'AAL-UAL', 'AMAT-LRCX', 'BP-XOM', 'CIFR-HUT']...
kmeans top-20: ['AAL-DAL', 'BP-SHEL', 'CNQ-CVE', 'CNQ-SU', 'COP-DVN']...
dbscan top-20: ['CNQ-COP', 'CNQ-SHEL', 'CNQ-XOM', 'COP-CVX', 'COP-DVN']...

Consensus (in all 3 algos' top-20): 3
  COP-DVN                   Energy / Energy  [INTRA]
  CVX-XOM                   Energy / Energy  [INTRA]
  SU-XOM                    Energy / Energy  [INTRA]


## 4. Permutation Test

In [5]:
# Collect the OPTICS hyper-parameters from the shared config.
clustering_cfg = DEFAULT_CONFIG.clustering
optics_params = dict(
    min_samples=clustering_cfg.min_samples,
    xi=clustering_cfg.xi,
    min_cluster_size=clustering_cfg.min_cluster_size,
)
# Number of distinct timestamps in the OPTICS cluster history.
total_windows = optics_ch['Datetime'].nunique()

print("Running permutation test (30 permutations, 80 sampled timestamps)...")
perm = feature_shuffle_permutation_test(
    ts_df, features, optics_params, optics_freq, total_windows,
    n_permutations=30, n_sample_timestamps=80,
)
print(f"Fraction significant (Z>1.96): {perm['fraction_significant']:.1%}")

# Keep only pairs whose z-score clears the 1.96 threshold.
sig_pairs = {}
for pair_key, zscore in perm['pair_zscores'].items():
    if zscore > 1.96:
        sig_pairs[pair_key] = zscore
top_sig = sorted(sig_pairs.items(), key=lambda kv: kv[1], reverse=True)[:10]
print("\nTop 10 significant pairs:")
for (a, b), z in top_sig:
    sa, sb = sector_map.get(a, '?'), sector_map.get(b, '?')
    if sa == sb:
        ptype = 'INTRA'
    else:
        ptype = 'CROSS'
    print(f"  {a}-{b}: Z={z:.2f}  [{ptype}]")

Running permutation test (30 permutations, 80 sampled timestamps)...


Fraction significant (Z>1.96): 32.4%

Top 10 significant pairs:
  PBR-PBR-A: Z=38.59  [INTRA]
  AAL-DAL: Z=38.19  [INTRA]
  AAL-UAL: Z=36.83  [INTRA]
  DAL-UAL: Z=34.03  [INTRA]
  HAL-SLB: Z=32.08  [INTRA]
  KMI-WMB: Z=30.65  [INTRA]
  DVN-OXY: Z=27.04  [INTRA]
  CVX-XOM: Z=26.86  [INTRA]
  SU-XOM: Z=25.86  [INTRA]
  AMAT-LRCX: Z=25.13  [INTRA]


## 5. Save

In [6]:
# Persist both results under data_dir, alongside the loaded artifacts.
for filename, payload in (
    ('consensus_pairs.pkl', consensus),
    ('permutation_results.pkl', perm),
):
    with open(os.path.join(data_dir, filename), 'wb') as out_file:
        pickle.dump(payload, out_file)
print("Saved.")

Saved.
