# Validation Testing File

In [None]:
#IMPORTS
import yfinance as yf
import pandas as pd
import pandas_ta_classic as ta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import OPTICS
from statsmodels.tsa.stattools import coint, grangercausalitytests
import warnings
import itertools
import matplotlib.gridspec as gridspec
import statsmodels.api as sm
import pykalman as KalmanFilter
warnings.filterwarnings("ignore")

In [None]:
# Ticker universe downloaded by fetch_data(); the group labels are descriptive only.
stocks = [
    # S&P 500 index, used further down as the market series for beta
    "^GSPC",
    # Megacap Leaders & Generalists
    "NVDA", "TSM", "AVGO", "AMD", "INTC", "MU", "TXN", "QCOM", "ADI", "MCHP",
    
    # Equipment & Manufacturing
    "ASML", "AMAT", "LRCX", "KLAC", "TER", "ENTG", "NVMI", "TOELY",
    
    # Specialized
    "ON", "NXPI", "STM", "LSCC", "MPWR", "QRVO", "SWKS", "ALAB", "CRDO",
    
    # Intellectual Property & Design Software
    "ARM", "SNPS", "CDNS", "CEVA",
    
    # Memory & Storage
    "WDC", "STX",  # "MU" already appears in the megacap group above
    
    # Emerging & Mid-Cap
    "GFS", "MRVL", "MTSI", "POWI", "SMTC", "VICR", "CAMT"
]

def fetch_data(stocks, period="252d", interval="1h"):
    """Download close prices for ``stocks`` and return them as one wide frame.

    Parameters
    ----------
    stocks : list of str
        Ticker symbols passed to ``yf.download``.
    period : str, default "252d"
        Look-back period forwarded to ``yf.download``.
    interval : str, default "1h"
        Bar size forwarded to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        One column per ticker that returned at least one close price,
        forward-filled along the time axis. Empty when nothing usable
        came back.
    """
    data = yf.download(
        tickers=stocks,
        period=period,
        interval=interval,
        group_by='ticker',
        auto_adjust=True,
        threads=True,
    )

    price_series_list = []
    skipped = []
    for s in stocks:
        try:
            series = data[s]['Close']
        except (KeyError, TypeError):
            # Ticker (or its Close field) is absent from the download result.
            skipped.append(s)
            continue
        if series.isna().all():
            # A column with no prices at all would make every row contain a
            # NaN, so a later row-wise dropna() would discard the whole panel.
            skipped.append(s)
            continue
        # rename() returns a new Series instead of mutating the download frame.
        price_series_list.append(series.rename(s))

    if skipped:
        # Report instead of silently ignoring; print is used because this
        # file suppresses warnings globally.
        print(f"fetch_data: no close prices for {len(skipped)} ticker(s): {', '.join(skipped)}")

    if not price_series_list:
        return pd.DataFrame()
    return pd.concat(price_series_list, axis=1).ffill()

# Download the price panel once; the feature-engineering code below reads it as `df`.
df = fetch_data(stocks)

In [None]:
# ============================================================================
# FEATURE ENGINEERING FOR TRANSIENT REGIME DETECTION
# ============================================================================

# 1. Reduce the price table to its 'Close' field when the columns are two-level
_has_field_level = isinstance(df.columns, pd.MultiIndex)
if _has_field_level and 'Close' in df.columns.get_level_values(0):
    df = df['Close']
elif _has_field_level and 'Close' in df.columns.get_level_values(1):
    df = df.xs('Close', axis=1, level=1)

# 2. Per-bar simple returns; rows with a NaN in any column are discarded
_raw_returns = df.pct_change()
returns_df = _raw_returns.dropna()
# The S&P 500 column serves as the market series for the beta calculations
market_returns = returns_df['^GSPC']

# ============================================================================
# CRITICAL CHANGE: Multi-Timeframe Feature Engineering
# ============================================================================

# SHORT-TERM WINDOW (transient regime detection), counted in hourly bars
window_short = 50

# MEDIUM-TERM WINDOW (context / stability baseline), counted in hourly bars
window_medium = 147

_rule = "=" * 80
print(_rule)
print("FEATURE ENGINEERING - MULTI-TIMEFRAME APPROACH")
print(_rule)
print(f"Short-term window: {window_short} hours (~1 week)")
print(f"Medium-term window: {window_medium} hours (~3 weeks)")
print(f"Optimizing for transient events: 10-50 hour duration\n")


# ============================================================================
# 3A. SHORT-TERM FEATURES (Primary clustering features)
# ============================================================================

print("Calculating SHORT-TERM features (primary regime indicators)...")

# Scale factor applied to the per-bar standard deviation.
# NOTE(review): 252 * 7 presumably means 252 trading days x 7 hourly bars - confirm.
_annualization = np.sqrt(252 * 7)

# One rolling view of all return columns, reused by the short-term features
_short_roll = returns_df.rolling(window=window_short)

# Feature A: short-window volatility
rolling_vol_short = _short_roll.std() * _annualization

# Feature B: short-window beta to the S&P 500 (rolling covariance / rolling variance)
rolling_cov_mkt_short = _short_roll.cov(market_returns)
rolling_mkt_var_short = market_returns.rolling(window=window_short).var()
rolling_beta_spx_short = rolling_cov_mkt_short.div(rolling_mkt_var_short, axis=0)

# Feature C: short-window beta to the sector, where the sector series is the
# equal-weighted mean return of every column except ^GSPC
sector_returns = returns_df.drop(columns=['^GSPC'], errors='ignore').mean(axis=1)
rolling_cov_sector_short = _short_roll.cov(sector_returns)
rolling_sector_var_short = sector_returns.rolling(window=window_short).var()
rolling_beta_sector_short = rolling_cov_sector_short.div(rolling_sector_var_short, axis=0)


# ============================================================================
# 3B. MEDIUM-TERM FEATURES (Context/stability indicators)
# ============================================================================

print("Calculating MEDIUM-TERM features (context indicators)...")

# Same three features over the longer window; they act as the baseline that
# the short-window values are compared against further down
_medium_roll = returns_df.rolling(window=window_medium)

rolling_vol_medium = _medium_roll.std() * _annualization

rolling_cov_mkt_medium = _medium_roll.cov(market_returns)
rolling_mkt_var_medium = market_returns.rolling(window=window_medium).var()
rolling_beta_spx_medium = rolling_cov_mkt_medium.div(rolling_mkt_var_medium, axis=0)

rolling_cov_sector_medium = _medium_roll.cov(sector_returns)
rolling_sector_var_medium = sector_returns.rolling(window=window_medium).var()
rolling_beta_sector_medium = rolling_cov_sector_medium.div(rolling_sector_var_medium, axis=0)


# ============================================================================
# 3C. INSTANTANEOUS FEATURES (Momentum/Overbought indicators)
# ============================================================================

print("Calculating INSTANTANEOUS features (momentum indicators)...")

# Feature D: RSI (Momentum/Overextended) - Keep at 14 (standard)
def calculate_rsi(data, window=14):
    """Return the simple-moving-average RSI of a price Series or DataFrame.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Prices, ordered in time along the index.
    window : int, default 14
        Number of consecutive price changes averaged for gains and losses.

    Returns
    -------
    Same type as ``data``
        RSI on a 0-100 scale. Entries are NaN until ``window`` real price
        changes are available. A window with no price change at all is
        reported as 50 (neutral) rather than NaN.
    """
    delta = data.diff()
    # clip() keeps the NaN that diff() produces for the first row, so that row
    # is not counted as a zero change and the first RSI value is based on
    # `window` genuine changes.
    avg_gain = delta.clip(lower=0).rolling(window=window).mean()
    avg_loss = (-delta).clip(lower=0).rolling(window=window).mean()

    # avg_loss == 0 with avg_gain > 0 yields rs = inf and therefore RSI = 100.
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    # 0 / 0 (completely flat window) is undefined; treat it as neutral.
    flat_window = (avg_gain == 0) & (avg_loss == 0)
    return rsi.mask(flat_window, 50.0)

# Feature D: RSI evaluated column by column on the price table
rsi_df = df.apply(calculate_rsi, axis=0)

# Feature E / F inputs: percentage price change over 5 and 10 bars
momentum_5h, momentum_10h = (df.pct_change(periods=_bars) for _bars in (5, 10))

# Feature F: 5-bar return minus 10-bar return.
# NOTE(review): this is the difference between two overlapping look-backs, not
# the change of the 5-bar momentum over time - confirm this is the intended
# "acceleration" measure.
momentum_acceleration = momentum_5h.sub(momentum_10h)


# ============================================================================
# 3D. REGIME CHANGE INDICATORS (New!)
# ============================================================================

print("Calculating REGIME CHANGE indicators...")

# Each indicator compares a short-window feature with its medium-window
# counterpart, so a large value means recent behaviour departs from baseline.

# Relative gap between short- and medium-window volatility
vol_regime_shift = rolling_vol_short.sub(rolling_vol_medium).div(rolling_vol_medium)

# Absolute gap between short- and medium-window betas (market, then sector)
beta_spx_regime_shift = rolling_beta_spx_short.sub(rolling_beta_spx_medium)
beta_sector_regime_shift = rolling_beta_sector_short.sub(rolling_beta_sector_medium)


# ============================================================================
# 4. Assemble the Master Time-Series DataFrame (ts_df)
# ============================================================================

print("\nAssembling time-series dataframe...")

ts_data_list = []

# Output column name -> wide (timestamp x ticker) frame that supplies it.
# The insertion order fixes the column order of ts_df.
_feature_frames = {
    # Price & Returns (baseline)
    'Price': df,
    'Returns': returns_df,
    # SHORT-TERM FEATURES (primary clustering features)
    'Vol_Short': rolling_vol_short,
    'Beta_SPX_Short': rolling_beta_spx_short,
    'Beta_Sector_Short': rolling_beta_sector_short,
    # MEDIUM-TERM FEATURES (context)
    'Vol_Medium': rolling_vol_medium,
    'Beta_SPX_Medium': rolling_beta_spx_medium,
    'Beta_Sector_Medium': rolling_beta_sector_medium,
    # INSTANTANEOUS FEATURES
    'RSI': rsi_df,
    'Momentum_5H': momentum_5h,
    'Momentum_10H': momentum_10h,
    'Momentum_Accel': momentum_acceleration,
    # REGIME CHANGE INDICATORS
    'Vol_Regime_Shift': vol_regime_shift,
    'Beta_SPX_Regime_Shift': beta_spx_regime_shift,
    'Beta_Sector_Regime_Shift': beta_sector_regime_shift,
}

for ticker in stocks:
    # The index itself is not a clustering candidate; tickers without a
    # price column have nothing to contribute.
    if ticker == '^GSPC' or ticker not in df.columns:
        continue

    # Align every feature for this ticker on the full price index
    temp_df = pd.DataFrame(
        {name: frame[ticker] for name, frame in _feature_frames.items()},
        index=df.index,
    )
    temp_df['Ticker'] = ticker
    ts_data_list.append(temp_df)

if ts_data_list:
    # Name the time axis explicitly: set_index(['Datetime', ...]) would raise
    # KeyError whenever the price index carries another name (or none).
    ts_df = (
        pd.concat(ts_data_list)
        .rename_axis('Datetime')
        .reset_index()
        .set_index(['Datetime', 'Ticker'])
    )

    # Drop NaNs created by rolling windows
    initial_rows = len(ts_df)
    ts_df = ts_df.dropna()
    dropped_rows = initial_rows - len(ts_df)
    # Guard the percentage against an empty price panel (0 / 0)
    _dropped_share = dropped_rows / initial_rows if initial_rows else 0.0

    _timestamps = ts_df.index.get_level_values('Datetime')

    print(f"\n{'='*80}")
    print("TIME-SERIES DATAFRAME CREATED SUCCESSFULLY")
    print(f"{'='*80}")
    print(f"Total rows: {len(ts_df):,}")
    print(f"Rows dropped (NaN): {dropped_rows:,} ({_dropped_share:.1%})")
    print(f"Date range: {_timestamps.min()} to {_timestamps.max()}")
    print(f"Unique tickers: {ts_df.index.get_level_values('Ticker').nunique()}")
    print(f"\nFeature columns: {len([c for c in ts_df.columns if c not in ['Price', 'Returns', 'Ticker']])}")
    print("\nSample data:")
    print(ts_df.head())


# ============================================================================
# 5. OPTIONAL: Static Fundamental DataFrame (currently disabled)
# ============================================================================

# NOTE: For transient regime detection, fundamentals are less relevant.
# Transient coupling is driven by events/news, not fundamental similarity.
# Consider removing this section unless it will be used for filtering.

# Tell the reader of the run log that the fundamentals step was skipped.
print(f"\n{'='*80}")
print("SKIPPING STATIC FUNDAMENTALS (Not relevant for transient detection)")
print(f"{'='*80}")
print("Transient coupling is driven by events/market dynamics, not fundamental profiles.")
print("If you want to filter pairs by fundamentals later, re-enable this section.\n")

# The triple-quoted block below is an unassigned string literal holding the
# disabled fundamentals code; nothing inside it runs. Remove the surrounding
# quotes to re-enable it.
"""
fundamental_list = []
print("Fetching Static Fundamentals...")

for ticker in stocks:
    if ticker == '^GSPC': continue
    try:
        t = yf.Ticker(ticker)
        info = t.info
        
        fundamental_list.append({
            'Ticker': ticker,
            'Sector': info.get('sector', 'Unknown'),
            'Industry': info.get('industry', 'Unknown'),
            'Market_Cap': info.get('marketCap', np.nan),
        })
    except Exception as e:
        print(f"Could not fetch data for {ticker}: {e}")
        continue

static_df = pd.DataFrame(fundamental_list).set_index('Ticker')
print("Static DataFrame (static_df) Created Successfully!")
"""

# End-of-cell banner
print("="*80)
print("FEATURE ENGINEERING COMPLETE - Ready for clustering")
print("="*80)

# Clustering

In [None]:
# ============================================================================
# OPTICS CLUSTERING FOR TRANSIENT REGIME DETECTION
# ============================================================================

if 'ts_df' not in locals():
    raise ValueError("Please run the Feature Engineering cell to create 'ts_df' first.")

# Keep the first row for any repeated (Datetime, Ticker) index entry
ts_df = ts_df[~ts_df.index.duplicated(keep='first')]

# Number of tickers available per timestamp; snapshots with fewer than five
# tickers are not clustered
density = ts_df.groupby(level='Datetime').size()
valid_timestamps = density[density >= 5].index

print(f"{'='*80}")
print(f"TRANSIENT REGIME DETECTION - OPTICS CLUSTERING")
print(f"{'='*80}")
print(f"Data Density: {len(valid_timestamps)} valid hourly timestamps")
print(f"Date Range: {valid_timestamps.min()} to {valid_timestamps.max()}")

# ============================================================================
# CLUSTERING FEATURES (must be column names that exist in ts_df)
# ============================================================================
# The short-window columns are used: ts_df has no 'Rolling_Vol', 'Beta_SPX'
# or 'Beta_Sector' columns, only their *_Short / *_Medium variants.
features_to_cluster = [
    'Returns',
    'Vol_Short',
    'Beta_SPX_Short',
    'Beta_Sector_Short',
    'RSI',
    'Momentum_5H',
]

# Fail here with a precise message instead of letting every snapshot fail
# with a KeyError inside the clustering loop's error handler.
_missing_features = [col for col in features_to_cluster if col not in ts_df.columns]
if _missing_features:
    raise KeyError(f"ts_df is missing clustering feature column(s): {_missing_features}")

print(f"\nUsing features: {features_to_cluster}")

# ============================================================================
# CLUSTERING LOOP - one OPTICS fit per timestamp
# ============================================================================

print(f"\n{'='*80}")
print("PHASE 1: Running OPTICS Clustering (Hourly Snapshots)")
print(f"{'='*80}")

cluster_results = []       # labelled snapshots that passed the quality filters
cluster_quality_log = []   # one diagnostics record per attempted timestamp

for i, ts in enumerate(valid_timestamps):
    try:
        # Cross-section of all tickers at this timestamp, complete rows only
        snapshot = ts_df.xs(ts, level='Datetime')[features_to_cluster].dropna()
        if len(snapshot) < 5:
            continue

        # Standardize, then keep the principal components explaining 90% of variance
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(snapshot)

        pca = PCA(n_components=0.90)
        X_pca = pca.fit_transform(X_scaled)

        # Density-based clustering in the reduced space
        optics = OPTICS(min_samples=3, metric='euclidean', xi=0.05, min_cluster_size=3)
        optics.fit(X_pca)
        labels = optics.labels_

        # Label -1 marks noise points and is not counted as a cluster
        total_stocks = len(labels)
        noise_count = (labels == -1).sum()
        noise_pct = noise_count / total_stocks
        unique_clusters = len(set(labels)) - (1 if -1 in labels else 0)

        # Relaxed quality filters: the first matching rule supplies the reason
        if unique_clusters < 1:
            skip_reason = "No clusters found (all noise)"
        elif noise_pct > 0.75:
            skip_reason = f"Too noisy ({noise_pct:.1%} noise)"
        elif unique_clusters > total_stocks * 0.75:
            skip_reason = f"Over-fragmented ({unique_clusters} clusters for {total_stocks} stocks)"
        else:
            skip_reason = None
        is_valid = skip_reason is None

        quality_metrics = {
            'Datetime': ts,
            'Total_Stocks': total_stocks,
            'Unique_Clusters': unique_clusters,
            'Noise_Count': noise_count,
            'Noise_Pct': noise_pct,
            'PCA_Components': X_pca.shape[1],
            'Variance_Explained': pca.explained_variance_ratio_.sum(),
            'Is_Valid': is_valid,
            'Skip_Reason': skip_reason,
        }
        cluster_quality_log.append(quality_metrics)

        # Keep the labelled snapshot only when it passed every filter
        if is_valid:
            labelled = snapshot.assign(
                Cluster_ID=labels,
                Datetime=ts,
                Num_Clusters=unique_clusters,
                Noise_Pct=noise_pct,
            )
            cluster_results.append(labelled.reset_index())

        # Progress line every 100 timestamps
        processed = i + 1
        if processed % 100 == 0:
            valid_so_far = len(cluster_results)
            print(f"  Processed {processed}/{len(valid_timestamps)} timestamps... ({valid_so_far} valid so far)")

    except Exception as e:
        # Record the failure (message truncated to 50 characters) and move on
        cluster_quality_log.append({
            'Datetime': ts,
            'Is_Valid': False,
            'Skip_Reason': f"Error: {str(e)[:50]}"
        })
        continue

if not cluster_results:
    raise ValueError("No valid clusters found. Check your data quality and OPTICS parameters.")

# Long table: one row per (timestamp, ticker) from every accepted snapshot
cluster_history = pd.concat(cluster_results, ignore_index=True)

print(f"\n{'='*80}")
print("PHASE 1 COMPLETE: Cluster Quality Summary")
print(f"{'='*80}")

# ============================================================================
# CLUSTER QUALITY ANALYSIS
# ============================================================================

# One row per attempted timestamp, as logged by the clustering loop
df_quality = pd.DataFrame(cluster_quality_log)
_valid_mask = df_quality['Is_Valid']

total_timestamps = len(df_quality)
valid_timestamps_count = _valid_mask.sum()
invalid_timestamps_count = total_timestamps - valid_timestamps_count

print(f"\nTimestamp Analysis:")
print(f"  Total timestamps processed: {total_timestamps}")
print(f"  Valid clustering windows: {valid_timestamps_count} ({valid_timestamps_count/total_timestamps:.1%})")
print(f"  Invalid/skipped windows: {invalid_timestamps_count} ({invalid_timestamps_count/total_timestamps:.1%})")

# Break the rejected timestamps down by the recorded reason
if invalid_timestamps_count > 0:
    print(f"\nSkip Reasons:")
    skip_summary = df_quality.loc[~_valid_mask, 'Skip_Reason'].value_counts()
    for reason, count in skip_summary.items():
        print(f"  - {reason}: {count} ({count/invalid_timestamps_count:.1%})")

# Averages over the accepted timestamps only
valid_quality = df_quality[_valid_mask]
if len(valid_quality):
    print(f"\nValid Cluster Statistics:")
    print(f"  Avg clusters per timestamp: {valid_quality['Unique_Clusters'].mean():.1f}")
    print(f"  Avg noise percentage: {valid_quality['Noise_Pct'].mean():.1%}")
    print(f"  Avg PCA variance retained: {valid_quality['Variance_Explained'].mean():.1%}")

_history_times = cluster_history['Datetime']
print(f"\nCluster History Generated:")
print(f"  Total rows: {len(cluster_history)}")
print(f"  Unique timestamps: {_history_times.nunique()}")
print(f"  Date range: {_history_times.min()} to {_history_times.max()}")


# ============================================================================
# PHASE 2: Cluster Stability Analysis
# ============================================================================

print(f"\n{'='*80}")
print("PHASE 2: Analyzing Cluster Stability")
print(f"{'='*80}")

# (ticker_a, ticker_b) with ticker_a < ticker_b -> number of timestamps at
# which both tickers carried the same non-noise cluster label
pair_co_cluster_freq = {}

# groupby splits the table in a single pass; the previous boolean mask
# rescanned all of cluster_history once per timestamp. sort=False keeps the
# original first-appearance order of timestamps and labels.
for _, snapshot in cluster_history.groupby('Datetime', sort=False):
    for cluster_id, cluster_rows in snapshot.groupby('Cluster_ID', sort=False):
        if cluster_id == -1:
            # -1 is the noise label, not a cluster
            continue

        members = sorted(cluster_rows['Ticker'])
        for pair_key in itertools.combinations(members, 2):
            pair_co_cluster_freq[pair_key] = pair_co_cluster_freq.get(pair_key, 0) + 1

# Frequency = share of accepted timestamps in which the pair was co-clustered
total_valid_windows = cluster_history['Datetime'].nunique()

pair_stability_data = [
    {
        'Ticker_1': pair[0],
        'Ticker_2': pair[1],
        'Pair': f"{pair[0]}-{pair[1]}",
        'Co_Cluster_Count': count,
        'Co_Cluster_Frequency': count / total_valid_windows,
        'Is_Stable': count / total_valid_windows > 0.30,
    }
    for pair, count in pair_co_cluster_freq.items()
]

# Explicit columns keep the sort below valid even when no pair was observed
_pair_columns = [
    'Ticker_1', 'Ticker_2', 'Pair',
    'Co_Cluster_Count', 'Co_Cluster_Frequency', 'Is_Stable',
]
df_pair_stability = pd.DataFrame(
    pair_stability_data, columns=_pair_columns
).sort_values('Co_Cluster_Frequency', ascending=False)

print(f"\nPair Clustering Analysis:")
print(f"  Total unique pairs observed: {len(df_pair_stability)}")
print(f"  Pairs clustered together >50% of time: {(df_pair_stability['Co_Cluster_Frequency'] > 0.50).sum()}")
print(f"  Pairs clustered together >30% of time: {(df_pair_stability['Co_Cluster_Frequency'] > 0.30).sum()}")
print(f"  Pairs clustered together <10% of time: {(df_pair_stability['Co_Cluster_Frequency'] < 0.10).sum()}")

print(f"\n{'='*80}")
print("TOP 15 MOST FREQUENTLY CO-CLUSTERED PAIRS")
print(f"{'='*80}")
print(df_pair_stability[['Pair', 'Co_Cluster_Count', 'Co_Cluster_Frequency']].head(15).to_string(index=False))

print(f"\n{'='*80}")
print("TOP 15 MOST TRANSIENT PAIRS (Rare Co-Clustering)")
print(f"{'='*80}")
# "Transient" here means co-clustered in more than 5% but fewer than 20% of
# the accepted timestamps, listed from rarest upward
transient_pairs = df_pair_stability[
    (df_pair_stability['Co_Cluster_Frequency'] > 0.05) &
    (df_pair_stability['Co_Cluster_Frequency'] < 0.20)
].sort_values('Co_Cluster_Frequency', ascending=True)
print(transient_pairs[['Pair', 'Co_Cluster_Count', 'Co_Cluster_Frequency']].head(15).to_string(index=False))


# ============================================================================
# PHASE 3: Temporal Analysis
# ============================================================================

print(f"\n{'='*80}")
print("PHASE 3: Temporal Analysis - When Do Regimes Shift?")
print(f"{'='*80}")

# Calendar date of every clustered observation
cluster_history['Date'] = pd.to_datetime(cluster_history['Datetime']).dt.date

# Per day: number of distinct non-noise Cluster_ID values and number of rows.
# NOTE(review): Cluster_ID values from different timestamps of the same day
# are pooled here; if labels are assigned independently per snapshot this
# counts distinct label values rather than distinct regimes - confirm intent.
daily_cluster_stats = cluster_history.groupby('Date').agg({
    'Cluster_ID': lambda ids: ids[ids != -1].nunique(),
    'Ticker': 'count'
}).rename(columns={'Cluster_ID': 'Num_Clusters', 'Ticker': 'Total_Obs'})

print(f"\nDaily Clustering Variability:")
print(f"  Days with high differentiation (>4 clusters): {(daily_cluster_stats['Num_Clusters'] > 4).sum()}")
# ASCII "<=" replaces a mis-encoded "less than or equal" sign in this message
print(f"  Days with low differentiation (<=2 clusters): {(daily_cluster_stats['Num_Clusters'] <= 2).sum()}")

# Flag days whose cluster count is more than 1.5 standard deviations from the mean
mean_clusters = daily_cluster_stats['Num_Clusters'].mean()
std_clusters = daily_cluster_stats['Num_Clusters'].std()
regime_shift_days = daily_cluster_stats[
    (daily_cluster_stats['Num_Clusters'] - mean_clusters).abs() > 1.5 * std_clusters
]

if len(regime_shift_days) > 0:
    print(f"\nPotential Regime Shift Days (unusual cluster patterns):")
    print(f"  {len(regime_shift_days)} days detected")
    print(f"\nTop 5 Most Unusual Days:")
    # Only the flagged days with the highest cluster counts are listed
    top_shifts = regime_shift_days.nlargest(5, 'Num_Clusters')
    for date, row in top_shifts.iterrows():
        print(f"  {date}: {row['Num_Clusters']:.0f} clusters (avg: {mean_clusters:.1f})")

print(f"\n{'='*80}")
print("CLUSTERING PHASE COMPLETE")
print(f"{'='*80}")
print(f"\nData structures created:")
print(f"  - cluster_history: {len(cluster_history)} rows")
print(f"  - df_quality: {len(df_quality)} rows")
print(f"  - df_pair_stability: {len(df_pair_stability)} rows")
print(f"\nReady for pair testing phase")

# Validation Testing Here