In [15]:
import sys
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from typing import List, Tuple
import random
from scipy import stats as scipy_stats

# Add project root to path
try:
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
except NameError:
    project_root = os.path.abspath("../../")

if project_root not in sys.path:
    sys.path.append(project_root)

from utils.data.curating_stooq import curate_stooq_dir_hourly

# NOTE: notebooks cache imports; reload so edits to jump_detection.py show up without restarting kernel
import importlib
import utils.data.jump_detection as jump_detection
jump_detection = importlib.reload(jump_detection)

detect_jumps_many = jump_detection.detect_jumps_many
compute_jump_score = jump_detection.compute_jump_score

from model.wavelet.wavelet import WaveletModel

In [16]:
def plot_profiles(X_windows, jumps_subset, output_dir, name):
    """
    Plots average temporal profiles along PCA directions.
    X-axis is time relative to jump (centered at 0).
    """
    directions = ["D1_reflexivity", "D2_mean_reversion", "D3_trend"]
    center = X_windows.shape[1] // 2
    t_axis = np.arange(-center, center + 1)
    
    for dim in directions:
        if dim not in jumps_subset.columns: continue
        
        scores = jumps_subset[dim].values
        sorted_idx = np.argsort(scores)
        X_sorted = X_windows[sorted_idx]
        n = len(scores)
        
        # Quantiles to visualize (Low, Mid, High)
        quantiles = [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
        
        fig = go.Figure()
        colors = px.colors.sequential.Viridis
        
        for i in range(len(quantiles)-1):
            q_s, q_e = quantiles[i], quantiles[i+1]
            idx_s, idx_e = int(q_s*n), int(q_e*n)
            if idx_e <= idx_s: continue
            
            avg = np.mean(X_sorted[idx_s:idx_e], axis=0)
            
            color_idx = int(i / (len(quantiles)-1) * (len(colors)-1))
            color = colors[color_idx]
            
            fig.add_trace(go.Scatter(
                x=t_axis, 
                y=avg, 
                mode='lines', 
                name=f"Q {q_s}-{q_e}",
                line=dict(color=color, width=2)
            ))
            
        fig.update_layout(
            title=f"Average Profiles along {dim} - {name}<br>(X-axis: Time relative to Jump)",
            xaxis_title="Time (steps)",
            yaxis_title="Normalized Return x(t)",
            template="plotly_white",
            hovermode="x unified"
        )
        fig.add_vline(x=0, line_dash="dash", line_color="red", annotation_text="Jump")
        
        out_path = os.path.join(output_dir, f"{name}_profile_{dim}.html")
        fig.write_html(out_path)
        print(f"    Saved profile plot to {out_path}")
        
        # Show in notebook
        fig.show()

In [20]:
def run_analysis_for_subset(
    dfs_subset: dict, 
    subset_name: str, 
    output_dir: str, 
    window_steps: int = 12
):
    print(f"\n=== Running Analysis for {subset_name} ({len(dfs_subset)} stocks) ===")

    def infer_bar_timedelta(index: pd.DatetimeIndex):
        if not isinstance(index, pd.DatetimeIndex) or len(index) < 2:
            return None
        idx = index.sort_values()
        deltas = idx.to_series().diff().dropna()
        deltas = deltas[deltas > pd.Timedelta(0)]
        if deltas.empty:
            return None
        return deltas.median()

    # 1. Filter Trading Hours (remove first/last 60 mins of each day)
    # For hourly data, this corresponds to trimming 1 bar at the start/end.
    print("  Filtering trading hours...")
    filtered_dfs = {}
    total_days = 0

    # infer trim bars from the first ticker (assumes common frequency)
    sample_df = next(iter(dfs_subset.values()))
    bar_td = infer_bar_timedelta(sample_df.index)
    trim_time = pd.Timedelta(minutes=60)
    if bar_td is None or bar_td >= pd.Timedelta(days=1):
        trim_bars = 0
    else:
        trim_bars = max(1, int(round(trim_time / bar_td)))

    for ticker, df in dfs_subset.items():
        days = []
        for _, day_df in df.groupby(df.index.date):
            day_df = day_df.sort_index()

            if trim_bars > 0:
                # Need enough bars to trim both ends and still keep at least 1 observation
                if len(day_df) <= (2 * trim_bars + 1):
                    continue
                day_df = day_df.iloc[trim_bars:-trim_bars]

            if not day_df.empty:
                days.append(day_df)

        if days:
            filtered_dfs[ticker] = pd.concat(days)
            total_days += len(days)

    if not filtered_dfs:
        print("  No data remaining after filtering.")
        return

    # 2. Detect Jumps
    print("  Detecting jumps...")
    threshold = 2.0
    jumps_df = detect_jumps_many(filtered_dfs, threshold=threshold)
    print(f"  Detected {len(jumps_df)} total jumps.")

    # Diagnostics: if no/few jumps, plot the full jump-score series to understand why
    if len(jumps_df) == 0:
        sample_ticker, sample_df = max(filtered_dfs.items(), key=lambda kv: len(kv[1]))
        print(f"  No jumps found. Plotting jump-score diagnostics for: {sample_ticker} (n={len(sample_df)})")
        scores = compute_jump_score(sample_df, price_col="close")

        fig_hist = px.histogram(
            scores.reset_index(), x="score", nbins=120,
            title=f"Jump score distribution x(t) - {sample_ticker}",
        )
        fig_hist.add_vline(x=threshold, line_dash="dash", line_color="red")
        fig_hist.add_vline(x=-threshold, line_dash="dash", line_color="red")
        fig_hist.update_layout(template="plotly_white")
        fig_hist.show()

        fig_ts = go.Figure()
        fig_ts.add_trace(go.Scatter(x=scores.index, y=scores["score"], mode="lines", name="x(t)", line=dict(width=1)))
        fig_ts.add_hline(y=threshold, line_dash="dash", line_color="red")
        fig_ts.add_hline(y=-threshold, line_dash="dash", line_color="red")
        fig_ts.update_layout(
            title=f"Jump score x(t) over time - {sample_ticker}",
            xaxis_title="Time",
            yaxis_title="x(t)",
            template="plotly_white",
            hovermode="x unified",
        )
        fig_ts.show()
        return

    if len(jumps_df) < 50:
        print("  Not enough jumps for robust PCA (need > 50).")
        return

    # 3. Extract Windows
    print("  Extracting windows...")
    windows = []
    valid_indices = []
    
    for idx, row in jumps_df.iterrows():
        ticker, ts = row["ticker"], row["timestamp"]
        if ticker not in filtered_dfs: continue
        df = filtered_dfs[ticker]
        
        if ts not in df.index: continue
        loc = df.index.get_loc(ts)
        
        if loc - window_steps < 0 or loc + window_steps + 1 > len(df): continue
        
        subset = df.iloc[loc - window_steps : loc + window_steps + 1]
        
        # Normalization
        norm = row["f"] * row["sigma"]
        if norm == 0: norm = 1e-4
            
        r_window = subset["close"].pct_change().fillna(0.0).values
        x_profile = r_window / norm
        
        # Align Jump Direction (Paper convention: Jump > 0)
        # Center is at index `window_steps`
        jump_sign = np.sign(x_profile[window_steps])
        if jump_sign == 0: jump_sign = 1
        
        windows.append(x_profile * jump_sign)
        valid_indices.append(idx)
        
    X_windows = np.array(windows)
    jumps_subset = jumps_df.loc[valid_indices].copy()
    print(f"  Extracted {len(X_windows)} valid windows.")

    # 4. Wavelet PCA
    print("  Running Wavelet Kernel PCA...")
    wm = WaveletModel(n_layers=0, n_neurons=0, n_outputs=0, J=3, n_components=3)
    embedding = wm.fit_transform(X_windows)
    
    d1 = embedding[:, 0]
    d2 = embedding[:, 1]
    d3 = embedding[:, 2]
    
    # Orient D1 (Reflexivity): Positive should correlate with Post-Jump Activity
    center = window_steps
    act_post = np.sum(np.abs(X_windows[:, center+1:]), axis=1)
    act_pre = np.sum(np.abs(X_windows[:, :center]), axis=1)
    asymmetry = (act_post - act_pre) / (act_post + act_pre + 1e-6)
    
    corr = np.corrcoef(d1, asymmetry)[0, 1]
    if corr < 0:
        print(f"  Flipping D1 sign (correlation was {corr:.2f})")
        d1 *= -1
        corr = -corr  # Update correlation after flipping
        
    jumps_subset["D1_reflexivity"] = d1
    jumps_subset["asymmetry"] = asymmetry
    print(f"  D1-Asymmetry correlation: {corr:.3f}")
    
    # Handcrafted Features
    # D2 (Mean Reversion): Pre-Jump - Post-Jump
    # D3 (Trend): Pre-Jump + Post-Jump
    # t = +/- 1 step from center
    jumps_subset["D2_mean_reversion"] = d2
    jumps_subset["D3_trend"] = d3

    # 5. Generate Plots
    print("  Generating plots...")
    
    # Scatter Plot: D1 vs Asymmetry (to verify separation)
    slope, intercept, r_value, p_value, std_err = scipy_stats.linregress(
        jumps_subset["asymmetry"], jumps_subset["D1_reflexivity"]
    )
    
    fig_asym = px.scatter(
        jumps_subset, x="asymmetry", y="D1_reflexivity", 
        color="asymmetry",
        title=f"D1 (Reflexivity) vs Asymmetry ({subset_name}, N={len(X_windows)})<br>Correlation: {corr:.3f}, R²: {r_value**2:.3f}",
        color_continuous_scale="RdBu", opacity=0.6,
        hover_data=["ticker", "timestamp"],
        labels={"asymmetry": "Asymmetry (Post-Pre)/(Post+Pre)", "D1_reflexivity": "D1 (Reflexivity)"}
    )
    
    # Add regression line
    asym_range = np.linspace(jumps_subset["asymmetry"].min(), jumps_subset["asymmetry"].max(), 100)
    reg_line = slope * asym_range + intercept
    fig_asym.add_trace(go.Scatter(
        x=asym_range, y=reg_line, mode='lines', name=f'Regression (R²={r_value**2:.3f})',
        line=dict(color='red', width=2, dash='dash')
    ))
    
    fig_asym.add_vline(x=0, line_dash="dash", line_color="gray", annotation_text="Symmetric")
    fig_asym.add_hline(y=0, line_dash="dash", line_color="gray")
    
    out_path_asym = os.path.join(output_dir, f"{subset_name}_D1_asymmetry.html")
    fig_asym.write_html(out_path_asym)
    print(f"    Saved D1-Asymmetry plot to {out_path_asym}")
    fig_asym.show()
    
    # Scatter Plot: D1 vs D2 (Mean-Reversion) - Fig 5 equivalent
    fig_mr = px.scatter(
        jumps_subset, x="D1_reflexivity", y="D2_mean_reversion", color="D2_mean_reversion",
        title=f"Reflexivity vs Mean-Reversion ({subset_name}, N={len(X_windows)})", 
        color_continuous_scale="RdBu", opacity=0.5,
        hover_data=["ticker", "timestamp"]
    )
    fig_mr.add_vline(x=0, line_dash="dash"); fig_mr.add_hline(y=0, line_dash="dash")
    
    out_path_scatter = os.path.join(output_dir, f"{subset_name}_fig5_mr.html")
    fig_mr.write_html(out_path_scatter)
    print(f"    Saved scatter plot to {out_path_scatter}")
    fig_mr.show()
    
    # Scatter Plot: D1 vs D3 (Trend) - Fig 6 equivalent
    fig_tr = px.scatter(
        jumps_subset, x="D1_reflexivity", y="D3_trend", color="D3_trend",
        title=f"Reflexivity vs Trend ({subset_name}, N={len(X_windows)})", 
        color_continuous_scale="Viridis", opacity=0.5,
        hover_data=["ticker", "timestamp"]
    )
    fig_tr.add_vline(x=0, line_dash="dash"); fig_tr.add_hline(y=0, line_dash="dash")
    
    out_path_scatter_tr = os.path.join(output_dir, f"{subset_name}_fig6_tr.html")
    fig_tr.write_html(out_path_scatter_tr)
    print(f"    Saved scatter plot to {out_path_scatter_tr}")
    fig_tr.show()
    
    # Profile Plots (all three directions)
    plot_profiles(X_windows, jumps_subset, output_dir, subset_name)

In [21]:
def main():
    # Poland data directory
    data_dir = "/home/janis/4A/timeseries/data/stooq/poland/hourly/ncstocks/"
    print(f"Loading Poland data from {data_dir}...")
    all_dfs = curate_stooq_dir_hourly(data_dir, pattern="*.txt", recursive=True)
    print(f"Loaded {len(all_dfs)} tickers before length filter.")

    # Filter to stocks with enough data to ensure meaningful jumps
    valid_tickers = [t for t, d in all_dfs.items() if len(d) > 500]
    print(f"Found {len(valid_tickers)} valid tickers.")

    if not valid_tickers:
        print("No valid data found.")
        return

    # Sort by data length to pick the best ones
    valid_tickers.sort(key=lambda t: len(all_dfs[t]), reverse=True)

    # Select Subsets
    # 1. Small Subset: Top 5 most liquid/long stocks
    tickers_5 = valid_tickers[:5]
    dfs_5 = {t: all_dfs[t] for t in tickers_5}

    # 2. Large Subset: Top 30 (or all if less)
    limit_30 = max(80, len(valid_tickers))
    tickers_30 = valid_tickers[:limit_30]
    dfs_30 = {t: all_dfs[t] for t in tickers_30}

    # Output Directory
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        base_dir = os.getcwd()
    output_dir = os.path.join(base_dir, "poland_outputs_hourly")
    os.makedirs(output_dir, exist_ok=True)

    # Run Comparisons
    run_analysis_for_subset(dfs_5, "5_Stocks_Poland_hourly", output_dir)

    if len(tickers_30) > len(tickers_5):
        run_analysis_for_subset(dfs_30, f"{len(tickers_30)}_Stocks_Poland_hourly", output_dir)
    else:
        print("Skipping large subset analysis (not enough stocks for distinct comparison).")

In [None]:
if __name__ == "__main__":
    main()

Loading Poland data from /home/janis/4A/timeseries/data/stooq/poland/hourly/ncstocks/...
Loaded 288 tickers before length filter.
Found 178 valid tickers.

=== Running Analysis for 5_Stocks_Poland_hourly (5 stocks) ===
  Filtering trading hours...
  Detecting jumps...
  Detected 183 total jumps.
  Extracting windows...
  Extracted 180 valid windows.
  Running Wavelet Kernel PCA...
  Flipping D1 sign (correlation was -0.20)
  D1-Asymmetry correlation: 0.197
  Generating plots...
    Saved D1-Asymmetry plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/5_Stocks_Poland_hourly_D1_asymmetry.html


    Saved scatter plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/5_Stocks_Poland_hourly_fig5_mr.html


    Saved scatter plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/5_Stocks_Poland_hourly_fig6_tr.html


    Saved profile plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/5_Stocks_Poland_hourly_profile_D1_reflexivity.html


    Saved profile plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/5_Stocks_Poland_hourly_profile_D2_mean_reversion.html


    Saved profile plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/5_Stocks_Poland_hourly_profile_D3_trend.html



=== Running Analysis for 178_Stocks_Poland_hourly (178 stocks) ===
  Filtering trading hours...
  Detecting jumps...
  Detected 1733 total jumps.
  Extracting windows...
  Extracted 1641 valid windows.
  Running Wavelet Kernel PCA...
  Flipping D1 sign (correlation was -0.31)
  D1-Asymmetry correlation: 0.313
  Generating plots...
    Saved D1-Asymmetry plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/178_Stocks_Poland_hourly_D1_asymmetry.html


    Saved scatter plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/178_Stocks_Poland_hourly_fig5_mr.html


    Saved scatter plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/178_Stocks_Poland_hourly_fig6_tr.html


    Saved profile plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/178_Stocks_Poland_hourly_profile_D1_reflexivity.html


    Saved profile plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/178_Stocks_Poland_hourly_profile_D2_mean_reversion.html


    Saved profile plot to /home/janis/4A/timeseries/notebooks/jump/poland_outputs_hourly/178_Stocks_Poland_hourly_profile_D3_trend.html


: 