# Diagnostic v3 — Val-Tuned Validation of Conditional Signal

**Purpose:** Verify whether the conditional lift reported in v2 is real or an artifact
of implicit test-set selection.

### What was wrong with v2?
In v2, per-`n_bins` hyperparameters (alpha, tau) were correctly tuned on VAL.
However, the **selection of which `n_bins` is best** was made by inspecting
`backoff_delta` on TEST. This is a subtle form of test-set optimism:
when you report "best delta = +0.002 at n_bins=25" after looking at all 4 test deltas,
you are implicitly picking the luckiest n_bins on test.

### Protocol in this notebook
1. ALL hyperparameters (alpha, tau) tuned on VALIDATION only.
2. **`n_bins` selection** also made on VALIDATION only.
3. TEST evaluated **exactly once** with frozen parameters.
4. Bootstrap confidence intervals on test delta_LL.
5. Multi-step (k>1) with frozen params from step 2.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os, warnings

warnings.filterwarnings('ignore')

SEED = 42
np.random.seed(SEED)
EPS = 1e-12  # only for log(p + EPS), not for MI or smoothing

print("Imports ready.")

Imports ready.


## Data Loading
Same dataset and split as v2 / `TransitionProbMatrix_NEWDATA.ipynb`.

In [2]:
train_df  = pd.read_csv("dataset/train_diagnostic.csv")
labels_df = pd.read_csv("dataset/label_diagnostic.csv")

# Compute forward percent change from Price
train_df["Percent_change_forward"] = (
    train_df["Price"].shift(-1) / train_df["Price"] - 1
) * 100.0

# Drop last row (forward return undefined)
train_df = train_df.iloc[:-1].copy()
labels_df = labels_df.iloc[:-1].copy()

# States: 0-based
s_curr_all = (train_df["Backward_Bin"].values.astype(np.int64) - 1)
y_all      = (labels_df["Forward_Bin"].values.astype(np.int64) - 1)

# Raw percent changes (for rebinning)
pct_backward_all = train_df["Percent_change_backward"].values.astype(np.float64)
pct_forward_all  = train_df["Percent_change_forward"].values.astype(np.float64)

n_samples = len(s_curr_all)
n_states_orig = int(max(s_curr_all.max(), y_all.max()) + 1)

# Temporal split: 70 / 15 / 15  (identical to v2)
T = n_samples
train_end = int(0.7 * T)   # 1657
val_end   = int(0.85 * T)  # 2012

idx_train = np.arange(0,         train_end)
idx_val   = np.arange(train_end, val_end)
idx_test  = np.arange(val_end,   T)

pct_fwd_train = pct_forward_all[idx_train]
pct_bwd_train = pct_backward_all[idx_train]

print(f"n_samples={n_samples}, n_states_orig={n_states_orig}")
print(f"Train: {len(idx_train)} [{idx_train[0]}..{idx_train[-1]}]")
print(f"Val:   {len(idx_val)} [{idx_val[0]}..{idx_val[-1]}]")
print(f"Test:  {len(idx_test)} [{idx_test[0]}..{idx_test[-1]}]")

n_samples=2368, n_states_orig=55
Train: 1657 [0..1656]
Val:   355 [1657..2011]
Test:  356 [2012..2367]


## Helper Functions
Copied from v2 — no changes.

In [3]:
def compute_marginal(y, n_classes):
    """Compute marginal distribution from integer labels."""
    counts = np.bincount(y, minlength=n_classes).astype(np.float64)
    return counts / counts.sum()


def compute_joint_counts(s, y, n_x, n_y):
    """Compute raw joint count matrix C[x, y]."""
    C = np.zeros((n_x, n_y), dtype=np.float64)
    for si, yi in zip(s, y):
        C[si, yi] += 1
    return C


def compute_conditional_additive(s, y, n_x, n_y, alpha, marginal_fallback=None):
    """P(y|x) = (C[x,y] + alpha) / (C[x,.] + alpha * n_y).
    If alpha=0 and a row has zero counts, use marginal_fallback.
    All returned probabilities are strictly > 0 when alpha > 0.
    """
    C = compute_joint_counts(s, y, n_x, n_y)
    if alpha == 0:
        row_sums = C.sum(axis=1)
        P = np.zeros_like(C)
        for i in range(n_x):
            if row_sums[i] > 0:
                P[i] = C[i] / row_sums[i]
            elif marginal_fallback is not None:
                P[i] = marginal_fallback
            else:
                P[i] = 1.0 / n_y  # uniform fallback
        return P
    C_alpha = C + alpha
    return C_alpha / C_alpha.sum(axis=1, keepdims=True)


def build_backoff_matrix(s_train, y_train, n_x, n_y, alpha, tau, marginal):
    """Build backoff transition matrix A[x, y] = lam_x * P_cond(y|x) + (1-lam_x) * P_marg(y)
    where lam_x = count(x) / (count(x) + tau).
    Returns: A (n_x, n_y), lam (n_x,), P_cond (n_x, n_y)
    """
    C = compute_joint_counts(s_train, y_train, n_x, n_y)
    C_alpha = C + alpha
    P_cond = C_alpha / C_alpha.sum(axis=1, keepdims=True)
    state_counts = C.sum(axis=1)
    lam = state_counts / (state_counts + tau)
    A = np.zeros((n_x, n_y), dtype=np.float64)
    for i in range(n_x):
        A[i] = lam[i] * P_cond[i] + (1 - lam[i]) * marginal
    return A, lam, P_cond


def predict_backoff(A, s_eval):
    """Return per-sample predicted distributions from backoff matrix."""
    return A[s_eval]


def mean_log_likelihood(pred_dist, y_true):
    N = len(y_true)
    probs = pred_dist[np.arange(N), y_true]
    return np.log(probs + EPS).mean()


def accuracy_score(pred_dist, y_true):
    return (pred_dist.argmax(axis=1) == y_true).mean()


def severity_score(pred_dist, y_true, n_classes):
    bins = np.arange(n_classes, dtype=np.float64)
    expected = (pred_dist * bins[np.newaxis, :]).sum(axis=1)
    return np.abs(expected - y_true.astype(np.float64)).mean()


def rebin_quantile(pct_values, pct_train_for_edges, n_bins):
    """Rebin using quantile edges fit on the given training data."""
    edges = np.quantile(pct_train_for_edges, np.linspace(0, 1, n_bins + 1))
    edges[0] = -np.inf
    edges[-1] = np.inf
    edges = np.unique(edges)
    actual_n = len(edges) - 1
    bins = np.clip(np.digitize(pct_values, edges) - 1, 0, actual_n - 1)
    return bins, edges, actual_n


def prepare_bins(n_bins):
    """Prepare rebinned states and labels for a given n_bins.
    For n_bins=55: original CSV bins.
    Otherwise: quantile-based, SEPARATE edges for fwd/bwd, fit on TRAIN only.
    """
    if n_bins == 55:
        return s_curr_all.copy(), y_all.copy(), 55, "original_fixed"
    y_new, _, n_y = rebin_quantile(pct_forward_all, pct_fwd_train, n_bins)
    s_new, _, n_s = rebin_quantile(pct_backward_all, pct_bwd_train, n_bins)
    actual_n = max(n_y, n_s)
    return s_new, y_new, actual_n, "quantile"


print("Helpers defined.")

Helpers defined.


## A+B) Hyperparameter Tuning on VALIDATION Only

For each `n_bins` in {25, 35, 40, 55}:
1. **Additive baseline**: sweep alpha, pick best by VAL LL.
2. **Backoff baseline**: sweep (alpha, tau) jointly, pick best by VAL LL.
3. Record VAL LL for both, plus marginal VAL LL.

Then select the **overall best (model, n_bins)** configuration by VAL LL.

**TEST is not touched in this cell.**

In [4]:
N_BINS_LIST = [25, 35, 40, 55]
ALPHA_GRID = [1e-6, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 5.0, 10.0]
TAU_GRID   = [10, 50, 100, 200, 500, 1000]

tuning_rows = []  # store all (n_bins, model, params, val_LL)

for n_bins in N_BINS_LIST:
    s_all, y_all_rb, n_st, method = prepare_bins(n_bins)
    s_tr = s_all[idx_train]; s_va = s_all[idx_val]
    y_tr = y_all_rb[idx_train]; y_va = y_all_rb[idx_val]

    marginal = compute_marginal(y_tr, n_st)

    # Marginal VAL LL
    pred_marg_val = np.tile(marginal, (len(y_va), 1))
    ll_marg_val = mean_log_likelihood(pred_marg_val, y_va)

    tuning_rows.append({
        "n_bins": n_bins, "n_st": n_st, "method": method,
        "model": "marginal", "alpha": None, "tau": None,
        "val_LL": ll_marg_val, "delta_val": 0.0,
    })

    # --- Additive: sweep alpha on VAL ---
    best_add = {"val_LL": -np.inf}
    for alpha in ALPHA_GRID:
        P_cond = compute_conditional_additive(s_tr, y_tr, n_st, n_st, alpha, marginal)
        ll_val = mean_log_likelihood(P_cond[s_va], y_va)
        row = {
            "n_bins": n_bins, "n_st": n_st, "method": method,
            "model": "additive", "alpha": alpha, "tau": None,
            "val_LL": ll_val, "delta_val": ll_val - ll_marg_val,
        }
        tuning_rows.append(row)
        if ll_val > best_add["val_LL"]:
            best_add = row.copy()

    # --- Backoff: sweep (alpha, tau) jointly on VAL ---
    best_bk = {"val_LL": -np.inf}
    for alpha in ALPHA_GRID:
        for tau in TAU_GRID:
            A, _, _ = build_backoff_matrix(s_tr, y_tr, n_st, n_st, alpha, tau, marginal)
            pred_val = A[s_va]
            ll_val = mean_log_likelihood(pred_val, y_va)
            row = {
                "n_bins": n_bins, "n_st": n_st, "method": method,
                "model": "backoff", "alpha": alpha, "tau": tau,
                "val_LL": ll_val, "delta_val": ll_val - ll_marg_val,
            }
            tuning_rows.append(row)
            if ll_val > best_bk["val_LL"]:
                best_bk = row.copy()

    print(f"n_bins={n_bins} ({method}):  marginal_val={ll_marg_val:.6f}")
    print(f"  Best additive: alpha={best_add['alpha']}, val_LL={best_add['val_LL']:.6f}, "
          f"delta_val={best_add['delta_val']:+.6f}")
    print(f"  Best backoff:  alpha={best_bk['alpha']}, tau={best_bk['tau']}, "
          f"val_LL={best_bk['val_LL']:.6f}, delta_val={best_bk['delta_val']:+.6f}")

df_tuning = pd.DataFrame(tuning_rows)

# ---------------------------------------------------------------
# Select OVERALL BEST configuration by VAL LL (across all n_bins)
# ---------------------------------------------------------------
# Best additive across all n_bins
df_add = df_tuning[df_tuning["model"] == "additive"]
best_add_overall = df_add.loc[df_add["val_LL"].idxmax()]

# Best backoff across all n_bins
df_bk = df_tuning[df_tuning["model"] == "backoff"]
best_bk_overall = df_bk.loc[df_bk["val_LL"].idxmax()]

# Best marginal across all n_bins (for reference)
df_marg = df_tuning[df_tuning["model"] == "marginal"]
best_marg_overall = df_marg.loc[df_marg["val_LL"].idxmax()]

print("\n" + "=" * 90)
print("VAL-SELECTED BEST CONFIGURATIONS (no test data used)")
print("=" * 90)
print(f"Marginal:  n_bins={int(best_marg_overall['n_bins'])}, "
      f"val_LL={best_marg_overall['val_LL']:.6f}")
print(f"Additive:  n_bins={int(best_add_overall['n_bins'])}, "
      f"alpha={best_add_overall['alpha']}, "
      f"val_LL={best_add_overall['val_LL']:.6f}, "
      f"delta_val={best_add_overall['delta_val']:+.6f}")
print(f"Backoff:   n_bins={int(best_bk_overall['n_bins'])}, "
      f"alpha={best_bk_overall['alpha']}, tau={best_bk_overall['tau']}, "
      f"val_LL={best_bk_overall['val_LL']:.6f}, "
      f"delta_val={best_bk_overall['delta_val']:+.6f}")
print("=" * 90)

n_bins=25 (quantile):  marginal_val=-3.218704
  Best additive: alpha=10.0, val_LL=-3.231259, delta_val=-0.012555
  Best backoff:  alpha=10.0, tau=1000, val_LL=-3.219013, delta_val=-0.000310
n_bins=35 (quantile):  marginal_val=-3.554669
  Best additive: alpha=10.0, val_LL=-3.557619, delta_val=-0.002949
  Best backoff:  alpha=0.1, tau=1000, val_LL=-3.554212, delta_val=+0.000458
n_bins=40 (quantile):  marginal_val=-3.688670
  Best additive: alpha=10.0, val_LL=-3.690263, delta_val=-0.001593
  Best backoff:  alpha=0.1, tau=1000, val_LL=-3.688277, delta_val=+0.000394
n_bins=55 (original_fixed):  marginal_val=-3.682208
  Best additive: alpha=1.0, val_LL=-3.882289, delta_val=-0.200081
  Best backoff:  alpha=1e-06, tau=500, val_LL=-3.678258, delta_val=+0.003950

VAL-SELECTED BEST CONFIGURATIONS (no test data used)
Marginal:  n_bins=25, val_LL=-3.218704
Additive:  n_bins=25, alpha=10.0, val_LL=-3.231259, delta_val=-0.012555
Backoff:   n_bins=25, alpha=10.0, tau=1000.0, val_LL=-3.219013, delta_va

## C) Test Evaluation — Frozen Parameters

Evaluate the val-selected configurations on TEST exactly once.
Also report ALL n_bins with their val-best params for transparency.

In [5]:
# ---------------------------------------------------------------
# Evaluate ALL n_bins with their val-best params on TEST
# (for full transparency, not just the globally best n_bins)
# ---------------------------------------------------------------
test_results = []

for n_bins in N_BINS_LIST:
    s_all, y_all_rb, n_st, method = prepare_bins(n_bins)
    s_tr = s_all[idx_train]; s_va = s_all[idx_val]; s_te = s_all[idx_test]
    y_tr = y_all_rb[idx_train]; y_va = y_all_rb[idx_val]; y_te = y_all_rb[idx_test]

    marginal = compute_marginal(y_tr, n_st)

    # Marginal
    pred_marg_val  = np.tile(marginal, (len(y_va), 1))
    pred_marg_test = np.tile(marginal, (len(y_te), 1))
    ll_marg_val  = mean_log_likelihood(pred_marg_val, y_va)
    ll_marg_test = mean_log_likelihood(pred_marg_test, y_te)

    test_results.append({
        "n_bins": n_bins, "model": "marginal",
        "alpha": "-", "tau": "-",
        "val_LL": ll_marg_val, "test_LL": ll_marg_test,
        "delta_test": 0.0,
        "test_acc": accuracy_score(pred_marg_test, y_te),
        "test_sev": severity_score(pred_marg_test, y_te, n_st),
    })

    # Best additive for this n_bins (from val tuning)
    sub_add = df_tuning[(df_tuning["model"] == "additive") & (df_tuning["n_bins"] == n_bins)]
    best_a = sub_add.loc[sub_add["val_LL"].idxmax()]
    alpha_a = best_a["alpha"]
    P_cond_a = compute_conditional_additive(s_tr, y_tr, n_st, n_st, alpha_a, marginal)
    pred_add_val  = P_cond_a[s_va]
    pred_add_test = P_cond_a[s_te]
    ll_add_val  = mean_log_likelihood(pred_add_val, y_va)
    ll_add_test = mean_log_likelihood(pred_add_test, y_te)

    test_results.append({
        "n_bins": n_bins, "model": "additive",
        "alpha": alpha_a, "tau": "-",
        "val_LL": ll_add_val, "test_LL": ll_add_test,
        "delta_test": ll_add_test - ll_marg_test,
        "test_acc": accuracy_score(pred_add_test, y_te),
        "test_sev": severity_score(pred_add_test, y_te, n_st),
    })

    # Best backoff for this n_bins (from val tuning)
    sub_bk = df_tuning[(df_tuning["model"] == "backoff") & (df_tuning["n_bins"] == n_bins)]
    best_b = sub_bk.loc[sub_bk["val_LL"].idxmax()]
    alpha_b = best_b["alpha"]
    tau_b   = best_b["tau"]
    A_bk, _, _ = build_backoff_matrix(s_tr, y_tr, n_st, n_st, alpha_b, tau_b, marginal)
    pred_bk_val  = A_bk[s_va]
    pred_bk_test = A_bk[s_te]
    ll_bk_val  = mean_log_likelihood(pred_bk_val, y_va)
    ll_bk_test = mean_log_likelihood(pred_bk_test, y_te)

    test_results.append({
        "n_bins": n_bins, "model": "backoff",
        "alpha": alpha_b, "tau": tau_b,
        "val_LL": ll_bk_val, "test_LL": ll_bk_test,
        "delta_test": ll_bk_test - ll_marg_test,
        "test_acc": accuracy_score(pred_bk_test, y_te),
        "test_sev": severity_score(pred_bk_test, y_te, n_st),
    })

df_test = pd.DataFrame(test_results)

print("=" * 120)
print("k=1 RESULTS: ALL n_bins, val-tuned params, evaluated on TEST")
print("=" * 120)
print(df_test.to_string(index=False, float_format=lambda x: f"{x:.6f}"))
print("=" * 120)

# Highlight the val-selected best
print(f"\nVal-selected best additive: n_bins={int(best_add_overall['n_bins'])}, alpha={best_add_overall['alpha']}")
print(f"Val-selected best backoff:  n_bins={int(best_bk_overall['n_bins'])}, alpha={best_bk_overall['alpha']}, tau={best_bk_overall['tau']}")

# Extract the test delta for the val-selected-best configs
# Additive
sel_add = df_test[(df_test["model"] == "additive") &
                  (df_test["n_bins"] == int(best_add_overall["n_bins"]))]
delta_add_test = sel_add["delta_test"].values[0]
print(f"\n  Val-best additive -> test delta = {delta_add_test:+.6f} nats")

# Backoff
sel_bk = df_test[(df_test["model"] == "backoff") &
                 (df_test["n_bins"] == int(best_bk_overall["n_bins"]))]
delta_bk_test = sel_bk["delta_test"].values[0]
print(f"  Val-best backoff  -> test delta = {delta_bk_test:+.6f} nats")

k=1 RESULTS: ALL n_bins, val-tuned params, evaluated on TEST
 n_bins    model     alpha         tau    val_LL   test_LL  delta_test  test_acc  test_sev
     25 marginal         -           - -3.218704 -3.218758    0.000000  0.036517  5.924157
     25 additive 10.000000           - -3.231259 -3.216606    0.002152  0.050562  5.936565
     25  backoff 10.000000 1000.000000 -3.219013 -3.217956    0.000802  0.067416  5.924923
     35 marginal         -           - -3.554669 -3.554692    0.000000  0.030899  8.311798
     35 additive 10.000000           - -3.557619 -3.562570   -0.007878  0.025281  8.324209
     35  backoff  0.100000 1000.000000 -3.554212 -3.555950   -0.001259  0.022472  8.316177
     40 marginal         -           - -3.688670 -3.688565    0.000000  0.028090  9.522104
     40 additive 10.000000           - -3.690263 -3.695031   -0.006467  0.022472  9.522161
     40  backoff  0.100000 1000.000000 -3.688277 -3.689787   -0.001223  0.019663  9.521996
     55 marginal         -   

## D) Bootstrap Confidence Intervals on Test delta_LL

1000 bootstrap resamples over test indices.
Reports 95% CI for delta_LL = conditional_LL - marginal_LL.

In [6]:
N_BOOT = 1000
rng = np.random.RandomState(SEED)

boot_results = []

for n_bins in N_BINS_LIST:
    s_all, y_all_rb, n_st, method = prepare_bins(n_bins)
    s_tr = s_all[idx_train]; s_te = s_all[idx_test]
    y_tr = y_all_rb[idx_train]; y_te = y_all_rb[idx_test]
    N_te = len(y_te)

    marginal = compute_marginal(y_tr, n_st)

    # Val-best additive for this n_bins
    sub_add = df_tuning[(df_tuning["model"] == "additive") & (df_tuning["n_bins"] == n_bins)]
    alpha_a = sub_add.loc[sub_add["val_LL"].idxmax(), "alpha"]
    P_cond_a = compute_conditional_additive(s_tr, y_tr, n_st, n_st, alpha_a, marginal)

    # Val-best backoff for this n_bins
    sub_bk = df_tuning[(df_tuning["model"] == "backoff") & (df_tuning["n_bins"] == n_bins)]
    best_b = sub_bk.loc[sub_bk["val_LL"].idxmax()]
    alpha_b, tau_b = best_b["alpha"], best_b["tau"]
    A_bk, _, _ = build_backoff_matrix(s_tr, y_tr, n_st, n_st, alpha_b, tau_b, marginal)

    # Per-sample log-likelihoods on test
    ll_marg_per = np.log(marginal[y_te] + EPS)
    ll_add_per  = np.log(P_cond_a[s_te, y_te] + EPS)
    ll_bk_per   = np.log(A_bk[s_te, y_te] + EPS)

    # Per-sample deltas
    delta_add_per = ll_add_per - ll_marg_per
    delta_bk_per  = ll_bk_per  - ll_marg_per

    # Bootstrap
    boot_add = np.zeros(N_BOOT)
    boot_bk  = np.zeros(N_BOOT)
    for b in range(N_BOOT):
        idx_b = rng.randint(0, N_te, size=N_te)
        boot_add[b] = delta_add_per[idx_b].mean()
        boot_bk[b]  = delta_bk_per[idx_b].mean()

    ci_add = np.percentile(boot_add, [2.5, 97.5])
    ci_bk  = np.percentile(boot_bk, [2.5, 97.5])

    boot_results.append({
        "n_bins": n_bins,
        "add_delta_mean": delta_add_per.mean(),
        "add_CI_lo": ci_add[0], "add_CI_hi": ci_add[1],
        "add_CI_excludes_0": "YES" if (ci_add[0] > 0 or ci_add[1] < 0) else "no",
        "bk_delta_mean": delta_bk_per.mean(),
        "bk_CI_lo": ci_bk[0], "bk_CI_hi": ci_bk[1],
        "bk_CI_excludes_0": "YES" if (ci_bk[0] > 0 or ci_bk[1] < 0) else "no",
    })
    print(f"n_bins={n_bins}:")
    print(f"  Additive delta: {delta_add_per.mean():+.6f}  95% CI: [{ci_add[0]:+.6f}, {ci_add[1]:+.6f}]  "
          f"excludes 0? {'YES' if (ci_add[0] > 0 or ci_add[1] < 0) else 'no'}")
    print(f"  Backoff  delta: {delta_bk_per.mean():+.6f}  95% CI: [{ci_bk[0]:+.6f}, {ci_bk[1]:+.6f}]  "
          f"excludes 0? {'YES' if (ci_bk[0] > 0 or ci_bk[1] < 0) else 'no'}")

df_boot = pd.DataFrame(boot_results)
print("\n" + "=" * 110)
print("BOOTSTRAP 95% CI FOR TEST delta_LL (k=1)")
print("=" * 110)
print(df_boot.to_string(index=False, float_format=lambda x: f"{x:.6f}"))
print("=" * 110)

n_bins=25:
  Additive delta: +0.002152  95% CI: [-0.013127, +0.017896]  excludes 0? no
  Backoff  delta: +0.000802  95% CI: [-0.000194, +0.001874]  excludes 0? no
n_bins=35:
  Additive delta: -0.007878  95% CI: [-0.018710, +0.002723]  excludes 0? no
  Backoff  delta: -0.001259  95% CI: [-0.005194, +0.002618]  excludes 0? no
n_bins=40:
  Additive delta: -0.006467  95% CI: [-0.016744, +0.002877]  excludes 0? no
  Backoff  delta: -0.001223  95% CI: [-0.005252, +0.002545]  excludes 0? no
n_bins=55:
  Additive delta: -0.198814  95% CI: [-0.260549, -0.138552]  excludes 0? YES
  Backoff  delta: +0.001940  95% CI: [-0.005578, +0.009597]  excludes 0? no

BOOTSTRAP 95% CI FOR TEST delta_LL (k=1)
 n_bins  add_delta_mean  add_CI_lo  add_CI_hi add_CI_excludes_0  bk_delta_mean  bk_CI_lo  bk_CI_hi bk_CI_excludes_0
     25        0.002152  -0.013127   0.017896                no       0.000802 -0.000194  0.001874               no
     35       -0.007878  -0.018710   0.002723                no      -0.0

## E) Multi-Step Evaluation ($k \in \{1,2,3,5,10\}$)

Using val-frozen hyperparameters. Reports both VAL and TEST LL.

In [7]:
K_LIST = [1, 2, 3, 5, 10]
multistep_results = []

for n_bins in N_BINS_LIST:
    s_all, y_all_rb, n_st, method = prepare_bins(n_bins)
    s_tr = s_all[idx_train]; y_tr = y_all_rb[idx_train]

    marginal = compute_marginal(y_tr, n_st)

    # Val-best backoff for this n_bins
    sub_bk = df_tuning[(df_tuning["model"] == "backoff") & (df_tuning["n_bins"] == n_bins)]
    best_b = sub_bk.loc[sub_bk["val_LL"].idxmax()]
    alpha_b, tau_b = best_b["alpha"], best_b["tau"]

    A_bk, _, _ = build_backoff_matrix(s_tr, y_tr, n_st, n_st, alpha_b, tau_b, marginal)

    for k in K_LIST:
        Ak = np.linalg.matrix_power(A_bk, k)

        # y_all_rb[t] = Forward_Bin at t = state at t+1.
        # So "true state at absolute time t+k" = y_all_rb[t + k - 1]
        # Given anchor at time t (where s_all[t] is current state),
        # the realized state k steps ahead = y_all_rb[t + k - 1].
        # But the transition A maps from s_t to s_{t+1} = y_all_rb[t],
        # so A^k maps s_t to s_{t+k}. The realized s_{t+k} = y_all_rb[t+k-1].
        # Wait — let's be precise:
        #   s_all[t] = Backward_Bin[t] = state of return (P_{t-1} -> P_t)
        #   y_all_rb[t] = Forward_Bin[t] = state of return (P_t -> P_{t+1})
        #   Because Forward_Bin[t] == Backward_Bin[t+1] (verified in v2),
        #   y_all_rb[t] == s_all[t+1].
        #   So A maps s_all[t] -> y_all_rb[t] = s_all[t+1].
        #   A^k maps s_all[t] -> s_all[t+k] = y_all_rb[t+k-1].
        #   True label for k-step = y_all_rb[t + k - 1].

        for split_name, idx_split in [("val", idx_val), ("test", idx_test)]:
            # For k-step: anchor at t, target at t+k-1 in y_all_rb
            # We need t + k - 1 < T  (i.e., y_all_rb[t+k-1] exists)
            if k == 1:
                # A^1 maps s_t -> y_all_rb[t] (same-row target)
                valid = idx_split  # all valid since y_all_rb[t] always exists
                s_anchor = s_all[valid]
                y_target = y_all_rb[valid]
            else:
                # A^k maps s_t -> y_all_rb[t + k - 1]
                valid_mask = (idx_split + k - 1) < T
                valid = idx_split[valid_mask]
                s_anchor = s_all[valid]
                y_target = y_all_rb[valid + k - 1]

            n_valid = len(s_anchor)
            if n_valid < 10:
                continue

            # Marginal
            pred_marg = np.tile(marginal, (n_valid, 1))
            ll_marg = mean_log_likelihood(pred_marg, y_target)

            # Backoff k-step
            pred_bk = Ak[s_anchor]
            ll_bk = mean_log_likelihood(pred_bk, y_target)

            multistep_results.append({
                "n_bins": n_bins, "k": k, "split": split_name,
                "n_valid": n_valid,
                "marginal_LL": ll_marg, "backoff_LL": ll_bk,
                "delta_LL": ll_bk - ll_marg,
                "alpha": alpha_b, "tau": tau_b,
            })

df_ms = pd.DataFrame(multistep_results)

print("=" * 120)
print("MULTI-STEP EVALUATION (val-tuned params, reported on both VAL and TEST)")
print("=" * 120)
display_cols = ["n_bins", "k", "split", "n_valid", "marginal_LL", "backoff_LL", "delta_LL", "alpha", "tau"]
print(df_ms[display_cols].to_string(index=False, float_format=lambda x: f"{x:.6f}"))
print("=" * 120)

MULTI-STEP EVALUATION (val-tuned params, reported on both VAL and TEST)
 n_bins  k split  n_valid  marginal_LL  backoff_LL  delta_LL     alpha         tau
     25  1   val      355    -3.218704   -3.219013 -0.000310 10.000000 1000.000000
     25  1  test      356    -3.218758   -3.217956  0.000802 10.000000 1000.000000
     25  2   val      355    -3.218704   -3.218710 -0.000006 10.000000 1000.000000
     25  2  test      355    -3.218746   -3.218750 -0.000004 10.000000 1000.000000
     25  3   val      355    -3.218704   -3.218714 -0.000010 10.000000 1000.000000
     25  3  test      354    -3.218734   -3.218740 -0.000006 10.000000 1000.000000
     25  5   val      355    -3.218704   -3.218714 -0.000010 10.000000 1000.000000
     25  5  test      352    -3.218752   -3.218757 -0.000006 10.000000 1000.000000
     25 10   val      355    -3.218704   -3.218714 -0.000010 10.000000 1000.000000
     25 10  test      347    -3.218776   -3.218780 -0.000004 10.000000 1000.000000
     35  1   va

## F) Final Summary Table + Conclusion

In [8]:
# ---------------------------------------------------------------
# Build single summary table
# ---------------------------------------------------------------
summary_rows = []

for n_bins in N_BINS_LIST:
    # k=1 test results with bootstrap CI
    boot_row = df_boot[df_boot["n_bins"] == n_bins].iloc[0]

    for model_name in ["marginal", "additive", "backoff"]:
        test_row = df_test[(df_test["n_bins"] == n_bins) & (df_test["model"] == model_name)].iloc[0]
        ms_test_k1 = df_ms[(df_ms["n_bins"] == n_bins) & (df_ms["k"] == 1) & (df_ms["split"] == "test")]
        ms_val_k1  = df_ms[(df_ms["n_bins"] == n_bins) & (df_ms["k"] == 1) & (df_ms["split"] == "val")]

        ci_lo, ci_hi = None, None
        if model_name == "additive":
            ci_lo = boot_row["add_CI_lo"]
            ci_hi = boot_row["add_CI_hi"]
        elif model_name == "backoff":
            ci_lo = boot_row["bk_CI_lo"]
            ci_hi = boot_row["bk_CI_hi"]

        summary_rows.append({
            "model": model_name,
            "n_bins": n_bins,
            "k": 1,
            "alpha": test_row["alpha"],
            "tau": test_row["tau"],
            "val_LL": test_row["val_LL"],
            "test_LL": test_row["test_LL"],
            "delta_test": test_row["delta_test"],
            "CI_lo": ci_lo if ci_lo is not None else "-",
            "CI_hi": ci_hi if ci_hi is not None else "-",
        })

# Add multi-step for backoff (test only, k>1)
for _, ms_row in df_ms[(df_ms["split"] == "test") & (df_ms["k"] > 1)].iterrows():
    # Find corresponding val row
    ms_val = df_ms[(df_ms["n_bins"] == ms_row["n_bins"]) &
                   (df_ms["k"] == ms_row["k"]) &
                   (df_ms["split"] == "val")]
    val_ll = ms_val["backoff_LL"].values[0] if len(ms_val) > 0 else None
    summary_rows.append({
        "model": "backoff",
        "n_bins": int(ms_row["n_bins"]),
        "k": int(ms_row["k"]),
        "alpha": ms_row["alpha"],
        "tau": ms_row["tau"],
        "val_LL": val_ll,
        "test_LL": ms_row["backoff_LL"],
        "delta_test": ms_row["delta_LL"],
        "CI_lo": "-", "CI_hi": "-",
    })

df_summary = pd.DataFrame(summary_rows)

# Save
os.makedirs("results/diagnostics_v3", exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
df_tuning.to_csv(f"results/diagnostics_v3/tuning_grid_{ts}.csv", index=False)
df_test.to_csv(f"results/diagnostics_v3/test_k1_{ts}.csv", index=False)
df_boot.to_csv(f"results/diagnostics_v3/bootstrap_ci_{ts}.csv", index=False)
df_ms.to_csv(f"results/diagnostics_v3/multistep_{ts}.csv", index=False)
df_summary.to_csv(f"results/diagnostics_v3/summary_{ts}.csv", index=False)

print("=" * 130)
print("SUMMARY TABLE (val-tuned hyperparameters, test evaluated once)")
print("=" * 130)
print(df_summary.to_string(index=False, float_format=lambda x: f"{x:.6f}"))
print("=" * 130)

# ---------------------------------------------------------------
# FINAL CONCLUSION
# ---------------------------------------------------------------
print("\n" + "=" * 90)
print("FINAL CONCLUSION")
print("=" * 90)

# v2 audit
print("\n[v2 Audit] In DiagnosticExperiment_v2.ipynb:")
print("  - alpha and tau were correctly tuned on VALIDATION per n_bins.")
print("  - However, the 'best n_bins' was selected by inspecting TEST deltas,")
print("    which is a form of implicit test-set selection.")
print("  - v3 corrects this by selecting n_bins on VAL only.")

# Check: does any conditional baseline beat marginal on test, with CI excluding 0?
print("\n[k=1 Results]")
any_sig_beat = False
any_beat = False
for _, brow in df_boot.iterrows():
    nb = int(brow["n_bins"])
    for model_name, d_mean, ci_lo, ci_hi, excl in [
        ("additive", brow["add_delta_mean"], brow["add_CI_lo"], brow["add_CI_hi"], brow["add_CI_excludes_0"]),
        ("backoff",  brow["bk_delta_mean"],  brow["bk_CI_lo"],  brow["bk_CI_hi"],  brow["bk_CI_excludes_0"]),
    ]:
        if d_mean > 0:
            any_beat = True
            sig = " (CI excludes 0)" if excl == "YES" else " (CI includes 0)"
            print(f"  n_bins={nb}, {model_name}: delta={d_mean:+.6f} nats, "
                  f"95% CI=[{ci_lo:+.6f}, {ci_hi:+.6f}]{sig}")
            if excl == "YES":
                any_sig_beat = True

if not any_beat:
    print("  No conditional baseline beats marginal on test at any n_bins.")

# Multi-step check
print("\n[Multi-step]")
ms_test = df_ms[df_ms["split"] == "test"]
any_ms_beat = False
for _, mrow in ms_test[ms_test["delta_LL"] > 0].iterrows():
    nb = int(mrow["n_bins"]); k = int(mrow["k"])
    print(f"  n_bins={nb}, k={k}: delta={mrow['delta_LL']:+.6f} nats")
    any_ms_beat = True
if not any_ms_beat:
    print("  No positive delta at any k.")

# Verdict
print("\n" + "=" * 90)
print("VERDICT")
print("=" * 90)

if any_sig_beat:
    # Find the significant one
    print("  YES — At least one conditional baseline beats marginal on test")
    print("  with 95% bootstrap CI excluding zero.")
    print("  The effect is real under val-tuned hyperparameters.")
    # But is it practically meaningful?
    best_d = max(
        df_boot[df_boot["add_CI_excludes_0"] == "YES"]["add_delta_mean"].max() if len(df_boot[df_boot["add_CI_excludes_0"] == "YES"]) > 0 else -np.inf,
        df_boot[df_boot["bk_CI_excludes_0"] == "YES"]["bk_delta_mean"].max() if len(df_boot[df_boot["bk_CI_excludes_0"] == "YES"]) > 0 else -np.inf,
    )
    if best_d < 0.01:
        print(f"  However, the effect is TINY ({best_d:.6f} nats/sample).")
        print("  Practically: this is unlikely to be exploitable by a neural model.")
    else:
        print(f"  Effect size: {best_d:.6f} nats/sample — potentially exploitable.")
elif any_beat:
    print("  INCONCLUSIVE — Some conditional baselines show positive test delta,")
    print("  but 95% bootstrap CI includes zero for all of them.")
    print("  The observed lift is within sampling noise.")
    print("  Cannot reject the null hypothesis that current state is uninformative.")
else:
    print("  NO — No conditional baseline beats marginal on test")
    print("  under val-tuned hyperparameters.")
    print("  The current-state bin carries no exploitable information about the next-state bin.")

print(f"\nAll results saved to results/diagnostics_v3/ (timestamp: {ts})")
print("=" * 90)

SUMMARY TABLE (val-tuned hyperparameters, test evaluated once)
   model  n_bins  k     alpha         tau    val_LL   test_LL  delta_test     CI_lo     CI_hi
marginal      25  1         -           - -3.218704 -3.218758    0.000000         -         -
additive      25  1 10.000000           - -3.231259 -3.216606    0.002152 -0.013127  0.017896
 backoff      25  1 10.000000 1000.000000 -3.219013 -3.217956    0.000802 -0.000194  0.001874
marginal      35  1         -           - -3.554669 -3.554692    0.000000         -         -
additive      35  1 10.000000           - -3.557619 -3.562570   -0.007878 -0.018710  0.002723
 backoff      35  1  0.100000 1000.000000 -3.554212 -3.555950   -0.001259 -0.005194  0.002618
marginal      40  1         -           - -3.688670 -3.688565    0.000000         -         -
additive      40  1 10.000000           - -3.690263 -3.695031   -0.006467 -0.016744  0.002877
 backoff      40  1  0.100000 1000.000000 -3.688277 -3.689787   -0.001223 -0.005252  0.0025