In [1]:
from __future__ import annotations
import os, ast, random, inspect
from pathlib import Path
from typing import Dict, List

import numpy as np, pandas as pd, torch
import torch.utils.data as torchdata
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt, seaborn as sns
from transformers import AutoTokenizer, AutoModel, BertModel, AutoConfig

from IsoScore import IsoScore
from dadapy import Data
from skdim.id import MLE, MOM, TLE, CorrInt, FisherS, lPCA

2025-12-17 10:10:19.537434: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from __future__ import annotations
import os, gc, ast, random, inspect
from pathlib import Path
from typing import Dict, List, Tuple, Callable

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel, GPT2TokenizerFast

# ===== Optional deps (gracefully skipped if not installed) =====
# DADApy supplies the TwoNN and GRIDE intrinsic-dimension estimators.
HAS_DADAPY = False
try:
    from dadapy import Data  # DADApy ID estimators (TwoNN, GRIDE)
    HAS_DADAPY = True
except Exception:
    # Best-effort: any import-time failure just disables the dadapy metrics.
    pass

# scikit-dimension supplies the remaining ID estimators used by the registry.
HAS_SKDIM = False
try:
    from skdim.id import MOM, TLE, CorrInt, FisherS, lPCA, MLE, DANCo, ESS, MiND_ML, MADA, KNN
    HAS_SKDIM = True
except Exception:
    # Best-effort: any import-time failure just disables the skdim metrics.
    pass

# IsoScore: use the library if available, else a simple monotone fallback.
# Python imports are case-sensitive and the first notebook cell imports the
# package as `IsoScore`, so that spelling is tried first; the lowercase
# spelling is kept as a second attempt for backward compatibility.
_HAS_ISOSCORE = False
try:
    from IsoScore import IsoScore
    _HAS_ISOSCORE = True
except Exception:
    try:
        from isoscore import IsoScore
        _HAS_ISOSCORE = True
    except Exception:
        _HAS_ISOSCORE = False

if not _HAS_ISOSCORE:
    class _IsoScoreFallback:
        """Stand-in exposing the same ``IsoScore.IsoScore(X)`` call shape as the library."""

        @staticmethod
        def IsoScore(X: np.ndarray) -> float:
            """Return mean/max of the covariance eigenvalues of X, clipped to [0, 1].

            X is treated as (n_points, n_dims): rows are observations.
            Returns 0.0 when the spectrum is empty or non-positive.
            """
            C = np.cov(X.T, ddof=0)
            ev = np.linalg.eigvalsh(C)  # ascending order, so ev[-1] is the largest
            if ev.size == 0 or ev.mean() <= 0 or ev[-1] <= 0:
                return 0.0
            return float(np.clip(ev.mean() / (ev[-1] + 1e-9), 0.0, 1.0))

    IsoScore = _IsoScoreFallback()

# =============================== CONFIG ===============================
# Input corpus (one row per sentence) and the preferred name of the column
# holding the per-token positions (list[int]).
CSV_PATH = "en_ewt-ud-train_sentences.csv"
INDEX_COL = "index"

# Model id to embed with, and how the sub-word pieces of a word are pooled.
BASELINE = "gpt2"
WORD_REP_MODE = "last"  # GPT-2-like models accept {"last","mean"}; others also "first"

# When True, the first word of each sentence (word_id == 0) is dropped.
EXCLUDE_FIRST_WORD = False

# Sampling / bootstrap budget.
RAW_MAX_PER_CLASS = 184_870            # per-class cap applied before embedding
N_BOOTSTRAP_FAST = 50                  # replicates for metrics in FAST_ONCE
N_BOOTSTRAP_HEAVY = 200                # replicates for metrics in HEAVY_ONCE
FAST_BS_MAX_SAMP_PER_CLASS = 184_870   # points per replicate (fast metrics)
HEAVY_BS_MAX_SAMP_PER_CLASS = 5_000    # points per replicate (heavy metrics)

# Index values above INDEX_MAX_CLASS are bucketed into it (i.e. it means "10+").
INDEX_MAX_CLASS = 10
# When True, tokens whose index value is 0 are kept as class "0".
INCLUDE_ZERO_CLASS = True

RAND_SEED = 42

# Output locations are created eagerly so later savefig/to_csv calls cannot fail on them.
PLOT_DIR = Path("results_INDEX")
PLOT_DIR.mkdir(exist_ok=True, parents=True)
CSV_DIR = Path("tables_INDEX", "index_bootstrap")
CSV_DIR.mkdir(exist_ok=True, parents=True)

# Number of sentences per forward pass.
BATCH_SIZE = 1
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Seed every RNG family the notebook touches.
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)
torch.manual_seed(RAND_SEED)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
if device == "cuda":
    torch.backends.cudnn.benchmark = True

sns.set_style("darkgrid")
plt.rcParams["figure.dpi"] = 120

# Small constant guarding divisions and logarithms in the spectral metrics.
EPS = 1e-12

# =============================== HELPERS ===============================
def _to_list(x):
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x

def _center(X: np.ndarray) -> np.ndarray:
    return X - X.mean(0, keepdims=True)

def _eigvals_from_X(X: np.ndarray) -> np.ndarray:
    """Return the squared singular values of the centred matrix, largest first.

    The input is cast to float32 and centred before the SVD; the result is
    float64. Any exception raised along the way yields an empty float64 array
    instead of propagating.
    """
    centred = _center(X.astype(np.float32, copy=False))
    try:
        singular_values = np.linalg.svd(centred, full_matrices=False)[1]
        ascending = np.sort(np.square(singular_values).astype(np.float64))
        return ascending[::-1]
    except Exception:
        return np.array([], dtype=np.float64)

def _jitter_unique(X: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Add tiny noise if there are duplicate rows (helps NN-based estimators)."""
    try:
        if np.unique(X, axis=0).shape[0] < X.shape[0]:
            X = X + np.random.normal(scale=eps, size=X.shape).astype(X.dtype)
    except Exception:
        pass
    return X

def _num_hidden_layers(model) -> int:
    n = getattr(model.config, "num_hidden_layers", None)
    if n is None: n = getattr(model.config, "n_layer", None)
    if n is None: raise ValueError("Cannot determine number of hidden layers")
    return int(n)

def _hidden_size(model) -> int:
    d = getattr(model.config, "hidden_size", None)
    if d is None: d = getattr(model.config, "n_embd", None)
    if d is None: raise ValueError("Cannot determine hidden size")
    return int(d)

def _is_gpt_like(model) -> bool:
    mt = str(getattr(model.config, "model_type", "")).lower()
    name = str(getattr(getattr(model, "name_or_path", ""), "lower", lambda: "")())
    return ("gpt2" in mt) or ("gpt2" in name)

# ====================== Robust GPT‑2 / general loader ======================
def _load_tokenizer_and_model(baseline: str):
    """
    Robustly load tokenizer+model.
    - For GPT‑2, use GPT2TokenizerFast and try `baseline`, then
      'openai-community/gpt2', then 'gpt2', to avoid the 'vocab_file NoneType' bug.
    - For others, fall back to AutoTokenizer + AutoModel.

    Returns:
        (tokenizer, model, model_id) where model_id is the id that actually loaded.

    Raises:
        RuntimeError: if every GPT‑2 candidate fails; the last underlying
            exception is chained as ``__cause__`` so its traceback is kept.
    """
    if "gpt2" in baseline.lower():
        # dict.fromkeys de-duplicates while preserving first-seen order.
        candidates = list(dict.fromkeys([baseline, "openai-community/gpt2", "gpt2"]))
        last_err = None
        for mid in candidates:
            try:
                tokzr = GPT2TokenizerFast.from_pretrained(mid, add_prefix_space=True)
                model = AutoModel.from_pretrained(mid, output_hidden_states=True)
                return tokzr, model, mid
            except Exception as e:
                # Remember the failure and move on to the next candidate id.
                last_err = e
        raise RuntimeError(
            f"Failed to load GPT‑2 tokenizer/model. Tried {candidates}. Last error: {last_err}"
        ) from last_err

    tokzr = AutoTokenizer.from_pretrained(baseline, use_fast=True, add_prefix_space=True)
    model = AutoModel.from_pretrained(baseline, output_hidden_states=True)
    return tokzr, model, baseline

# ========= Metric single-call functions =========
# --- Isotropy family ---
def _iso_once(X: np.ndarray) -> float:
    """Score X with the module-level ``IsoScore`` object and return a float.

    NOTE(review): X is passed through exactly as received. Confirm the
    orientation (points x dims vs. dims x points) the installed IsoScore
    implementation expects.
    """
    score = IsoScore.IsoScore(X)
    return float(score)

def _sf_once(X: np.ndarray) -> float:
    """Spectral flatness: geometric mean over arithmetic mean of the spectrum.

    The spectrum comes from ``_eigvals_from_X``; EPS guards the logarithm and
    the division. Returns NaN when the spectrum is empty.
    """
    spectrum = _eigvals_from_X(X)
    if not spectrum.size:
        return np.nan
    log_spectrum = np.log(spectrum + EPS)
    geometric_mean = np.exp(np.mean(log_spectrum))
    arithmetic_mean = float(spectrum.mean() + EPS)
    return float(geometric_mean / arithmetic_mean)

def _rand_once(X: np.ndarray, K: int = 2000) -> float:
    """Mean absolute cosine similarity over randomly drawn pairs of distinct rows.

    Args:
        X: (n_points, n_dims) array.
        K: upper bound on the number of pairs; capped at n*(n-1)/2.

    Returns:
        Mean of |cos(x_i, x_j)| over the sampled pairs (higher means more
        anisotropic), or NaN when fewer than two rows are available.

    The generator is re-seeded with RAND_SEED on every call, so the sampled
    index pairs are deterministic for a given n.
    """
    n = X.shape[0]
    if n < 2:
        return np.nan
    rng = np.random.default_rng(RAND_SEED)
    K_eff = min(K, (n * (n - 1)) // 2)
    i = rng.integers(0, n, size=K_eff)
    # Add a non-zero offset in [1, n-1] modulo n so that j can never equal i.
    # A single "re-draw on collision" pass can collide again and would leave
    # self-pairs (|cos| = 1) that bias the mean upwards, badly so for small n.
    j = (i + rng.integers(1, n, size=K_eff)) % n
    A, B = X[i], X[j]
    num = np.sum(A * B, axis=1)
    # 1e-9 keeps the ratio finite for zero-norm rows.
    den = np.linalg.norm(A, axis=1) * np.linalg.norm(B, axis=1) + 1e-9
    return float(np.mean(np.abs(num / den)))  # higher ⇒ more anisotropic

def _vmf_kappa_once(X: np.ndarray) -> float:
    if X.shape[0] < 2: return np.nan
    Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-9)
    R = np.linalg.norm(Xn.mean(axis=0))
    d = Xn.shape[1]
    if R < 1e-9: return 0.0
    return float(max(R * (d - R**2) / (1.0 - R**2 + 1e-9), 0.0))  # higher ⇒ more anisotropic

# --- Linear ID family ---
def _pcaXX_once(X: np.ndarray, var_ratio: float) -> float:
    """Number of leading components needed to reach `var_ratio` of the total spectrum.

    Uses the descending spectrum from ``_eigvals_from_X``; returns NaN when
    that spectrum is empty.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    cumulative = np.cumsum(spectrum)
    threshold = cumulative[-1] * var_ratio
    # searchsorted gives the first position whose running total reaches the
    # threshold; +1 converts that 0-based position into a component count.
    n_components = np.searchsorted(cumulative, threshold) + 1
    return float(n_components)

def _pca95_once(X: np.ndarray) -> float:
    """Component count at a 0.95 variance ratio (see ``_pcaXX_once``)."""
    return _pcaXX_once(X, var_ratio=0.95)
def _pca99_once(X: np.ndarray) -> float:
    """Component count at a 0.99 variance ratio (see ``_pcaXX_once``)."""
    return _pcaXX_once(X, var_ratio=0.99)

def _erank_once(X: np.ndarray) -> float:
    """Effective rank: exp of the Shannon entropy of the normalised spectrum.

    Returns NaN when ``_eigvals_from_X`` yields an empty spectrum.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    weights = spectrum / (spectrum.sum() + EPS)
    entropy = -np.sum(weights * np.log(weights + EPS))
    return float(np.exp(entropy))

def _pr_once(X: np.ndarray) -> float:
    """Participation ratio: (sum of spectrum)^2 / (sum of squared spectrum + EPS).

    Returns NaN when ``_eigvals_from_X`` yields an empty spectrum.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    total = spectrum.sum()
    total_of_squares = np.square(spectrum).sum()
    return float((total ** 2) / (total_of_squares + EPS))

def _stable_rank_once(X: np.ndarray) -> float:
    """Stable rank: sum of the spectrum divided by its largest value (+ EPS).

    Returns NaN when ``_eigvals_from_X`` yields an empty spectrum.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    largest = spectrum.max()
    return float(spectrum.sum() / (largest + EPS))

# --- Non-linear family ---
def _dadapy_twonn_once(X: np.ndarray) -> float:
    """Intrinsic dimension from dadapy's ``compute_id_2NN`` on de-duplicated X.

    Returns NaN when dadapy could not be imported.
    """
    if not HAS_DADAPY:
        return np.nan
    dataset = Data(coordinates=_jitter_unique(X))
    # The call returns three values; only the first (the ID estimate) is used.
    estimate, _unused_a, _unused_b = dataset.compute_id_2NN()
    return float(estimate)

def _dadapy_gride_once(X: np.ndarray) -> float:
    """Last entry of dadapy's GRIDE ID-scaling curve on de-duplicated X.

    Distances are computed with maxk=64 and the scaling is requested with
    range_max=64. Returns NaN when dadapy could not be imported.
    NOTE(review): the last entry is presumably the estimate at the largest
    neighbourhood scale. Confirm against the dadapy documentation.
    """
    if not HAS_DADAPY:
        return np.nan
    dataset = Data(coordinates=_jitter_unique(X))
    dataset.compute_distances(maxk=64)
    id_curve, _unused_a, _unused_b = dataset.return_id_scaling_gride(range_max=64)
    return float(id_curve[-1])

def _skdim_factory(name: str):
    """Return a zero-argument builder for the skdim estimator registered as `name`.

    Returns None when skdim is unavailable or `name` is not registered.
    The three lPCA variants are built with specific constructor arguments;
    every other estimator is built with its defaults.
    """
    if not HAS_SKDIM:
        return None

    estimator_classes = {
        "mom": MOM, "tle": TLE, "corrint": CorrInt, "fishers": FisherS,
        "lpca": lPCA, "lpca99": lPCA, "lpca95": lPCA,
        "mle": MLE, "danco": DANCo, "mind_ml": MiND_ML, "ess": ESS,
        "mada": MADA, "knn": KNN,
    }
    estimator_cls = estimator_classes.get(name)
    if estimator_cls is None:
        return None

    # Constructor arguments for the names that do not use the defaults.
    special_kwargs = {
        "lpca": {"ver": "FO"},
        "lpca99": {"ver": "ratio", "alphaRatio": 0.99},
        "lpca95": {"ver": "ratio", "alphaRatio": 0.95},
    }
    ctor_kwargs = special_kwargs.get(name, {})

    def _builder():
        return estimator_cls(**ctor_kwargs)

    return _builder

def _skdim_once_builder(name: str) -> Callable[[np.ndarray], float] | None:
    """Wrap the skdim estimator `name` as a single-call ``X -> float`` metric.

    Returns None when ``_skdim_factory`` has no builder for `name`. The
    returned callable builds a fresh estimator per call, fits it on the
    de-duplicated input and reads ``dimension_`` (NaN if that attribute is absent).
    """
    make_estimator = _skdim_factory(name)
    if make_estimator is None:
        return None

    def _once(X: np.ndarray) -> float:
        estimator = make_estimator()
        estimator.fit(_jitter_unique(X))
        return float(getattr(estimator, "dimension_", np.nan))

    return _once

# Registries
# Each registry maps a metric key to a single-call function X -> float.
# The driver uses FAST-registry membership to choose the bootstrap replicate
# count and the per-class sample cap for a metric.
FAST_ONCE: Dict[str, Callable[[np.ndarray], float]] = {
    # Isotropy
    "iso": _iso_once, "sf": _sf_once, "rand": _rand_once, "vmf_kappa": _vmf_kappa_once,
    # Linear ID
    "erank": _erank_once, "pr": _pr_once, "stable_rank": _stable_rank_once,
    "pca95": _pca95_once, "pca99": _pca99_once,
}
# Values built through _skdim_once_builder are None when skdim is unavailable;
# the driver skips metrics whose callable is None.
HEAVY_ONCE: Dict[str, Callable[[np.ndarray], float] | None] = {
    # Non-linear (plus skdim linear variants)
    "twonn": _dadapy_twonn_once, "gride": _dadapy_gride_once,
    "mom": _skdim_once_builder("mom"), "tle": _skdim_once_builder("tle"),
    "corrint": _skdim_once_builder("corrint"), "fishers": _skdim_once_builder("fishers"),
    "lpca": _skdim_once_builder("lpca"), "lpca95": _skdim_once_builder("lpca95"),
    "lpca99": _skdim_once_builder("lpca99"), "mle": _skdim_once_builder("mle"),
    "danco": _skdim_once_builder("danco"), "mind_ml": _skdim_once_builder("mind_ml"),
    "ess": _skdim_once_builder("ess"), "mada": _skdim_once_builder("mada"),
    "knn": _skdim_once_builder("knn"),
}
# Human-readable names used for plot axis labels and titles.
LABELS = {
    # Isotropy
    "iso":"IsoScore","sf":"Spectral Flatness","rand":"RandCos |μ|","vmf_kappa":"vMF κ",
    # Linear ID
    "erank":"Effective Rank","pr":"Participation Ratio","stable_rank":"Stable Rank",
    "pca95":"lPCA 0.95","pca99":"lPCA 0.99","lpca":"lPCA FO","lpca95":"lPCA 0.95 (skdim)","lpca99":"lPCA 0.99 (skdim)",
    # Non-linear
    "twonn":"TwoNN ID","gride":"GRIDE","mom":"MOM","tle":"TLE","corrint":"CorrInt",
    "fishers":"FisherS","mle":"MLE","danco":"DANCo","mind_ml":"MiND_ML","ess":"ESS","mada":"MADA","knn":"KNN",
}
# Full sweep (every fast metric plus every available heavy metric), kept for reference:
#ALL_METRICS = list(FAST_ONCE.keys()) + [k for k,v in HEAVY_ONCE.items() if v is not None]
# Metrics the driver actually computes; currently restricted to GRIDE.
ALL_METRICS = ["gride"]

# =============================== DATA: use existing index column ===============================
def _pick_index_col(df: pd.DataFrame) -> str:
    """Return the first candidate column name that exists in `df`.

    INDEX_COL is tried first, followed by a fixed list of common alternatives.

    Raises:
        ValueError: if none of the candidates is a column of `df`. The message
            is built from the candidate list itself so it always names every
            column that was actually tried.
    """
    candidates = [INDEX_COL, "token_index", "positions", "position", "idx", "INDEX", "Index"]
    for cand in candidates:
        if cand in df.columns:
            return cand
    raise ValueError(
        f"No index column found. Tried: {', '.join(map(str, candidates))}."
    )

def load_index_from_column(csv_path: str,
                           index_max: int = INDEX_MAX_CLASS,
                           include_zero: bool = INCLUDE_ZERO_CLASS):
    """
    Read the sentence-level CSV and explode it into one row per token.

    Expected CSV columns:
      - sentence_id
      - tokens : stringified list[str], one row per sentence
      - an index column (see `_pick_index_col`): stringified list[int]

    Per token, the index value is turned into a string class label:
      - values that cannot be converted to int are skipped;
      - negative values are skipped;
      - 0 is skipped unless `include_zero` is True;
      - values above `index_max` are bucketed into `index_max` ("index_max+").
    If EXCLUDE_FIRST_WORD is True, tokens with word_id == 0 are dropped.
    When the token list and the index list differ in length, the extra
    entries of the longer one are ignored.

    Returns:
        (df_sent, df_tok): df_sent has one row per sentence_id with its token
        list; df_tok has columns sentence_id, word_id, index_class, word.

    Raises:
        ValueError: if no token row could be built.
    """
    corpus = pd.read_csv(csv_path)
    idx_col = _pick_index_col(corpus)

    sentences = corpus[["sentence_id", "tokens", idx_col]].copy()
    sentences["sentence_id"] = sentences["sentence_id"].astype(str)
    sentences["tokens"] = sentences["tokens"].apply(_to_list)
    sentences[idx_col] = sentences[idx_col].apply(_to_list)

    records = []
    triples = zip(sentences["sentence_id"], sentences["tokens"], sentences[idx_col])
    for sid, words, positions in triples:
        # zip stops at the shorter list, i.e. only aligned (word, index) pairs are used.
        for wid, (word, raw_index) in enumerate(zip(words, positions)):
            # --- optionally skip first word of each sentence (word_id == 0) ---
            if wid == 0 and EXCLUDE_FIRST_WORD:
                continue
            try:
                k = int(raw_index)
            except Exception:
                continue
            if k < 0 or (k == 0 and not include_zero):
                continue
            k = min(k, index_max)  # bucket as index_max+
            records.append((sid, wid, str(k), word))

    df_tok = pd.DataFrame(records, columns=["sentence_id", "word_id", "index_class", "word"])

    # Safety: enforce first-word exclusion even if the loop logic changes later.
    if EXCLUDE_FIRST_WORD and not df_tok.empty:
        keep = df_tok.word_id != 0
        df_tok = df_tok[keep].reset_index(drop=True)

    if df_tok.empty:
        raise ValueError("No token rows constructed—check that your 'index' column contains integer lists.")

    df_sent = sentences[["sentence_id", "tokens"]].drop_duplicates("sentence_id")
    return df_sent, df_tok

def sample_raw(df_tok: pd.DataFrame, per_class_cap: int = RAW_MAX_PER_CLASS) -> pd.DataFrame:
    """Keep at most `per_class_cap` rows per index_class, sampled without replacement.

    Sampling is seeded with RAND_SEED for every class; the per-class samples
    are concatenated in group order with a fresh 0..n-1 index.
    """
    sampled_groups = [
        group.sample(min(len(group), per_class_cap), random_state=RAND_SEED, replace=False)
        for _label, group in df_tok.groupby("index_class", sort=False)
    ]
    return pd.concat(sampled_groups, ignore_index=True)

def make_index_palette(classes: List[str]) -> Dict[str, Tuple[float, float, float]]:
    """Map each class label to a viridis colour, ordered by the label's integer value."""
    ordered = sorted(map(int, classes))
    colours = sns.color_palette("viridis", len(ordered))
    return {str(value): colour for value, colour in zip(ordered, colours)}

# =============================== EMBEDDING (BERT & GPT‑2) ===============================
def embed_subset(df_sent: pd.DataFrame,
                 subset_df: pd.DataFrame,
                 baseline: str = BASELINE,
                 word_rep_mode: str = WORD_REP_MODE,
                 batch_size: int = BATCH_SIZE) -> Tuple[np.ndarray, np.ndarray]:
    """
    Embed the selected word occurrences and return one vector per layer for each.

    Args:
        df_sent: one row per sentence with columns 'sentence_id' and 'tokens'
            (the word list handed to the tokenizer).
        subset_df: one row per word occurrence with columns 'sentence_id' and
            'word_id' (position of the word inside its sentence).
        baseline: model id passed to `_load_tokenizer_and_model`.
        word_rep_mode: how the sub-word pieces of a word are pooled: "first",
            "last" or "mean". Unsupported values fall back to "last" for
            GPT-2-like models and to "first" otherwise.
        batch_size: number of sentences per forward pass.

    Returns:
        reps:   float16 array of shape (L, N, D) with L = hidden layers + 1
                (embedding output included), N = len(subset_df), D = hidden size.
        filled: bool array of shape (N,); False where no sub-word piece mapped
                to the requested word (that row of `reps` stays all-zero).

    Neither input frame is modified.
    """
    # Normalise ids on private copies so the caller's frames are left untouched.
    df_sent = df_sent.assign(sentence_id=df_sent["sentence_id"].astype(str))
    subset_df = subset_df.assign(sentence_id=subset_df["sentence_id"].astype(str))

    # sid -> list[(global_idx, word_id)]; global_idx is the row position in subset_df
    by_sid: Dict[str, List[Tuple[int,int]]] = {}
    for gidx, (sid, wid) in enumerate(subset_df[["sentence_id","word_id"]].itertuples(index=False)):
        by_sid.setdefault(str(sid), []).append((gidx, int(wid)))

    sids = list(by_sid.keys())
    df_sel = (df_sent[df_sent.sentence_id.isin(sids)]
              .drop_duplicates("sentence_id")
              .set_index("sentence_id")
              .loc[sids])

    # Robust loader (GPT‑2 safe)
    tokzr, model, model_id_used = _load_tokenizer_and_model(baseline)

    # truncation=True keeps over-long sentences within the tokenizer's maximum
    # length instead of failing in the forward pass; words that fall beyond
    # the cut simply end up with filled == False.
    enc_kwargs = dict(is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)
    # add_prefix_space is forwarded only if the tokenizer's __call__ declares it explicitly
    if "add_prefix_space" in inspect.signature(tokzr.__call__).parameters:
        enc_kwargs["add_prefix_space"] = True
    # ensure pad token (reuse EOS when the tokenizer defines none)
    if tokzr.pad_token is None and getattr(tokzr, "eos_token", None) is not None:
        tokzr.pad_token = tokzr.eos_token

    if getattr(model.config, "pad_token_id", None) is None and tokzr.pad_token_id is not None:
        model.config.pad_token_id = tokzr.pad_token_id
    if device == "cuda":
        model.half()
    model = model.eval().to(device)

    L = _num_hidden_layers(model) + 1   # include embedding layer
    D = _hidden_size(model)
    N = len(subset_df)

    reps   = np.zeros((L, N, D), np.float16)
    filled = np.zeros(N, dtype=bool)

    # Choose/validate rep mode depending on model family
    gpt_like = _is_gpt_like(model)
    if gpt_like:
        rep_mode = word_rep_mode if word_rep_mode in {"last","mean"} else "last"
    else:
        rep_mode = word_rep_mode if word_rep_mode in {"first","last","mean"} else "first"

    with torch.no_grad(), torch.cuda.amp.autocast(device == "cuda"):
        for start in tqdm(range(0, len(sids), batch_size), desc=f"{model_id_used} (embed subset)"):
            batch_ids    = sids[start : start + batch_size]
            batch_tokens = df_sel.loc[batch_ids, "tokens"].tolist()

            enc_be = tokzr(batch_tokens, **enc_kwargs)
            enc_t  = {k: v.to(device) for k, v in enc_be.items()}
            out = model(**enc_t)
            h = torch.stack(out.hidden_states).detach().cpu().numpy().astype(np.float32)  # (L,B,T,D)

            for b, sid in enumerate(batch_ids):
                # word_id -> positions of its sub-word pieces in the encoded sequence
                mp: Dict[int, List[int]] = {}
                for tidx, wid in enumerate(enc_be.word_ids(b)):
                    if wid is not None:
                        mp.setdefault(int(wid), []).append(int(tidx))

                for gidx, wid in by_sid.get(sid, []):
                    toks = mp.get(wid)
                    if not toks:
                        # No piece maps to this word: leave the row zero / unfilled.
                        continue
                    if rep_mode == "first":
                        vec = h[:, b, toks[0], :]
                    elif rep_mode == "last":
                        vec = h[:, b, toks[-1], :]
                    else:  # "mean"
                        vec = h[:, b, toks, :].mean(axis=1)
                    reps[:, gidx, :] = vec.astype(np.float16, copy=False)
                    filled[gidx] = True

            # Release per-batch tensors eagerly to keep peak memory low.
            del enc_be, enc_t, out, h
            if device == "cuda": torch.cuda.empty_cache()

    missing = int((~filled).sum())
    if missing:
        print(f"⚠ Missing vectors for {missing} of {N} tokens")
    del model; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    return reps, filled

# =============================== BOOTSTRAP CORE ===============================
def _bs_layer_loop(rep_sub: np.ndarray, M: int, n_reps: int, compute_once: Callable[[np.ndarray], float]):
    """Bootstrap a single-call metric independently for every layer.

    Args:
        rep_sub: array of shape (L, N, D).
        M: number of points drawn (with replacement) per replicate.
        n_reps: number of bootstrap replicates.
        compute_once: metric taking an (M, D) float32 array and returning a float.

    Returns:
        (mean, lo, hi): three float32 arrays of length L holding the NaN-aware
        mean and the 2.5th / 97.5th percentiles across replicates.

    The same resampled indices are shared by all layers within a replicate.
    A metric call that raises is recorded as NaN. The generator is seeded
    with RAND_SEED on every call.
    """
    n_layers, n_points, _dim = rep_sub.shape
    rng = np.random.default_rng(RAND_SEED)
    draws = np.full((n_reps, n_layers), np.nan, np.float32)

    for rep in range(n_reps):
        picked = rng.integers(0, n_points, size=M)
        for layer in range(n_layers):
            sample = rep_sub[layer, picked].astype(np.float32, copy=False)
            try:
                value = float(compute_once(sample))
            except Exception:
                value = np.nan
            draws[rep, layer] = value

    mean = np.nanmean(draws, axis=0).astype(np.float32)
    lower = np.nanpercentile(draws, 2.5, axis=0).astype(np.float32)
    upper = np.nanpercentile(draws, 97.5, axis=0).astype(np.float32)
    return mean, lower, upper

# =============================== SAVE / PLOT ===============================
def save_metric_csv_all_classes(metric: str,
                                class_to_stats: Dict[str, Dict[str, np.ndarray]],
                                layers: np.ndarray,
                                baseline: str,
                                subset_name: str = "raw"):
    """Write one long-format CSV (one row per class x layer) for a metric.

    Args:
        metric: metric key, stored in the 'metric' column and in the file name.
        class_to_stats: class label -> {"mean", "lo", "hi", "n"}; "lo"/"hi" are optional.
        layers: layer ids aligned with the entries of each "mean" array.
        baseline: model id, stored unchanged in the 'model' column.
        subset_name: tag stored in the 'subset' column and in the file name.

    The file is written to CSV_DIR as index_<subset>_<metric>_<model>.csv.
    Any "/" in the model id is replaced by "_" in the file name so that
    namespaced ids (e.g. "org/model") do not point into a missing directory.
    Non-finite values are written as NaN.
    """
    columns = ["subset", "model", "feature", "class", "metric", "layer",
               "mean", "ci_low", "ci_high", "n_tokens", "word_rep_mode", "source_csv"]
    rows = []
    for c, stats in class_to_stats.items():
        mu, lo, hi = stats["mean"], stats.get("lo"), stats.get("hi")
        for l, val in enumerate(mu):
            rows.append({
                "subset": subset_name, "model": baseline, "feature": "index",
                "class": c, "metric": metric, "layer": int(layers[l]),
                "mean": float(val) if np.isfinite(val) else np.nan,
                "ci_low": float(lo[l]) if isinstance(lo, np.ndarray) and np.isfinite(lo[l]) else np.nan,
                "ci_high": float(hi[l]) if isinstance(hi, np.ndarray) and np.isfinite(hi[l]) else np.nan,
                "n_tokens": int(stats.get("n", 0)), "word_rep_mode": WORD_REP_MODE,
                "source_csv": Path(CSV_PATH).name,
            })
    # Explicit columns keep the header stable even when no class produced rows.
    df = pd.DataFrame(rows, columns=columns)
    safe_model = str(baseline).replace("/", "_")
    out = CSV_DIR / f"index_{subset_name}_{metric}_{safe_model}.csv"
    df.to_csv(out, index=False)

def plot_metric_with_ci(class_to_stats: Dict[str, Dict[str, np.ndarray]],
                        layers: np.ndarray, metric: str, title: str, out_path: Path,
                        palette: Dict[str, Tuple[float, float, float]] | None = None):
    """Plot the per-layer mean of a metric for every class, with a shaded CI band.

    Classes are drawn in ascending integer order. A class whose mean is None
    or entirely NaN is skipped; the band is drawn only when both bounds are
    arrays and the lower bound is not entirely NaN. The figure is saved to
    `out_path` at 220 dpi and then closed.
    """
    fig, ax = plt.subplots(figsize=(9, 5))
    use_palette = isinstance(palette, dict)

    for label in sorted(class_to_stats.keys(), key=int):
        stats = class_to_stats[label]
        mean, lower, upper = stats["mean"], stats.get("lo"), stats.get("hi")
        if mean is None or np.all(np.isnan(mean)):
            continue
        colour = palette.get(label) if use_palette else None
        ax.plot(layers, mean, label=label, lw=1.8, color=colour)
        has_band = (
            isinstance(lower, np.ndarray)
            and isinstance(upper, np.ndarray)
            and not np.all(np.isnan(lower))
        )
        if has_band:
            ax.fill_between(layers, lower, upper, alpha=0.15, color=colour)

    ax.set_xlabel("Layer")
    ax.set_ylabel(LABELS.get(metric, metric.upper()))
    ax.set_title(title)
    ax.legend(ncol=5, fontsize="small", title=f"Index ({INDEX_MAX_CLASS} = {INDEX_MAX_CLASS}+)", frameon=False)
    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)

# =============================== DRIVER ===============================
def run_index_from_col_pipeline():
    """End-to-end driver: load index classes, embed once, bootstrap every metric.

    For each metric in ALL_METRICS a CSV (under CSV_DIR) and a plot (under
    PLOT_DIR) are written as soon as that metric is finished. Classes with
    fewer than three embedded tokens are skipped.
    """
    # 1) Load token lists + index classes from existing column
    df_sent, idx_df = load_index_from_column(
        CSV_PATH, index_max=INDEX_MAX_CLASS, include_zero=INCLUDE_ZERO_CLASS
    )
    classes = sorted(idx_df.index_class.unique(), key=lambda s: int(s))
    palette = make_index_palette(classes)
    print(f"✓ corpus ready — {len(idx_df):,} tokens across index classes {classes}")

    # 2) Optional per-class cap
    raw_df = sample_raw(idx_df, RAW_MAX_PER_CLASS)
    print("Sample sizes per index (raw cap):")
    counts = raw_df.index_class.value_counts()
    counts = counts.reindex(sorted(counts.index, key=lambda x: int(x)))
    print(counts.to_dict())

    # 3) Embed once
    reps, filled = embed_subset(df_sent, raw_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)
    raw_df = raw_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    # Keep `reps` aligned with the filtered frame: without this, class indices
    # computed on the filtered rows would address the wrong columns of `reps`
    # whenever at least one token could not be embedded.
    if not filled.all():
        reps = reps[:, filled]
    cls_arr = raw_df.index_class.values
    L = reps.shape[0]; layers = np.arange(L)
    print(f"✓ embedded {len(raw_df):,} tokens  • layers={L}")

    # Model ids may contain "/" (e.g. "org/model"); keep file names flat.
    safe_model = str(BASELINE).replace("/", "_")

    # 4) Metric loop
    for metric in ALL_METRICS:
        print(f"\n→ Computing metric: {metric} …")
        compute_once = FAST_ONCE.get(metric) or HEAVY_ONCE.get(metric)
        if compute_once is None:
            print(f"  (skipping {metric}: estimator unavailable)")
            continue

        # Fast metrics get fewer replicates but a larger per-replicate sample.
        n_bs = N_BOOTSTRAP_FAST if metric in FAST_ONCE else N_BOOTSTRAP_HEAVY
        Mcap = FAST_BS_MAX_SAMP_PER_CLASS if metric in FAST_ONCE else HEAVY_BS_MAX_SAMP_PER_CLASS

        class_results: Dict[str, Dict[str, np.ndarray]] = {}
        for c in classes:
            idx = np.where(cls_arr == c)[0]
            if idx.size < 3:
                continue
            sub = reps[:, idx]  # (L, n_c, D)
            Nc = sub.shape[1]
            M = min(Mcap, Nc)
            mu, lo, hi = _bs_layer_loop(sub, M, n_bs, compute_once)
            class_results[c] = {"mean": mu, "lo": lo, "hi": hi, "n": int(Nc)}

        save_metric_csv_all_classes(metric, class_results, layers, BASELINE, subset_name="raw")
        plot_metric_with_ci(class_results, layers, metric,
                            title=f"{LABELS.get(metric, metric.upper())} • {BASELINE}",
                            out_path=PLOT_DIR / f"index_raw_{metric}_{safe_model}.png",
                            palette=palette)
        print(f"  ✓ saved: CSV= {CSV_DIR}/index_raw_{metric}_{safe_model}.csv  "
              f"plot= {PLOT_DIR}/index_raw_{metric}_{safe_model}.png")

        del class_results; gc.collect()
        if device == "cuda": torch.cuda.empty_cache()

    del reps; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    print("\n✓ done (incremental outputs produced per metric).")

# Run the whole pipeline only when this code is executed as the main program,
# not when it is imported from another module.
if __name__ == "__main__":
    run_index_from_col_pipeline()


✓ corpus ready — 194,916 tokens across index classes ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
Sample sizes per index (raw cap):
{'1': 10067, '2': 9647, '3': 9800, '4': 9932, '5': 9948, '6': 9972, '7': 9559, '8': 9142, '9': 8704, '10': 108145}


openai-community/gpt2 (embed subset): 100%|█| 10067/10067 [01:32<00:00, 108.76it


✓ embedded 194,916 tokens  • layers=13

→ Computing metric: gride …
  ✓ saved: CSV= tables_INDEX/index_bootstrap/index_raw_gride_gpt2.csv  plot= results_INDEX/index_raw_gride_gpt2.png

✓ done (incremental outputs produced per metric).


In [5]:
## from __future__ import annotations

import os, gc, ast, random, inspect
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, GPT2TokenizerFast

import plotly.graph_objects as go
import plotly.colors as pc

# =============================== CONFIG ===============================
# Input corpus: sentence_id, tokens (list[str]), index (list[int]) or similar.
CSV_PATH = "en_ewt-ud-train_sentences.csv"
# Model id, e.g. "bert-base-uncased" or "gpt2".
BASELINE = "gpt2"
# Sub-word pooling. BERT: {"first","last","mean"}; GPT-2: {"last","mean"}.
WORD_REP_MODE = "last"

# Column holding the token positions per sentence (list[int]).
INDEX_COL = "index"
# Index values above this are bucketed into it (10 means "10+").
INDEX_MAX_CLASS = 10
# When True, tokens whose index value is 0 are kept as class "0".
INCLUDE_ZERO_CLASS = True

# Plot subsampling per class (keeps the browser responsive).
# None keeps everything; try 2000–5000 if needed.
PCA_MAX_PER_CLASS = None

# Output: the directory is created eagerly.
OUT_DIR = Path("pca3d_index")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# "/" in the model id is replaced so the HTML file stays directly inside OUT_DIR.
HTML_OUT = OUT_DIR / (BASELINE.replace('/', '_') + "_index_pca3d_layers.html")

# Throughput / device
BATCH_SIZE = 2
RAND_SEED = 42
os.environ["TOKENIZERS_PARALLELISM"] = "true"
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)
torch.manual_seed(RAND_SEED)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
if device == "cuda":
    torch.backends.cudnn.benchmark = True

# --------- Plotly font sizes (larger text) ----------
PLOT_FONT = 20  # global default
PLOT_TITLE_FONT = 24
PLOT_AXIS_TITLE_FONT = 18
PLOT_AXIS_TICK_FONT = 20
PLOT_LEGEND_FONT = 20
PLOT_SLIDER_FONT = 20
PLOT_HOVER_FONT = 30


# =============================== HELPERS ===============================
def _to_list(x):
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x

def _num_hidden_layers(model) -> int:
    n = getattr(model.config, "num_hidden_layers", None)
    if n is None: n = getattr(model.config, "n_layer", None)  # GPT-2
    if n is None: raise ValueError("Cannot determine num_hidden_layers")
    return int(n)

def _hidden_size(model) -> int:
    d = getattr(model.config, "hidden_size", None)
    if d is None: d = getattr(model.config, "n_embd", None)  # GPT-2
    if d is None: raise ValueError("Cannot determine hidden size")
    return int(d)

# --------- Load index classes from CSV ----------
def _pick_index_col(df: pd.DataFrame) -> str:
    """Return the first candidate column name present in `df`.

    INDEX_COL is tried first, followed by a fixed list of alternatives.

    Raises:
        ValueError: if none of the candidates is a column of `df`.
    """
    candidates = (INDEX_COL, "token_index", "positions", "position", "idx", "INDEX", "Index")
    found = [name for name in candidates if name in df.columns]
    if found:
        return found[0]
    raise ValueError(f"No index column found. Tried: {INDEX_COL}, token_index, positions, position, idx, INDEX, Index.")

def load_index_from_column(csv_path: str,
                           index_max: int = INDEX_MAX_CLASS,
                           include_zero: bool = INCLUDE_ZERO_CLASS):
    """
    Read the sentence-level CSV and build one row per token with its index class.

    Per token, the index value becomes a string class label:
      - values that cannot be converted to int are skipped;
      - negative values are skipped;
      - 0 is skipped unless `include_zero` is True (useful for 0-based positions);
      - values above `index_max` are bucketed into `index_max` ("index_max+").
    When the token list and the index list differ in length, the extra
    entries of the longer one are ignored.

    Returns:
        (df_sent, df_tok): df_sent has one row per sentence_id with its token
        list; df_tok has columns sentence_id, word_id, index_class, word.

    Raises:
        ValueError: if no token row could be built.
    """
    corpus = pd.read_csv(csv_path)
    idx_col = _pick_index_col(corpus)

    sentences = corpus[["sentence_id", "tokens", idx_col]].copy()
    sentences["sentence_id"] = sentences["sentence_id"].astype(str)
    sentences["tokens"] = sentences["tokens"].apply(_to_list)
    sentences[idx_col] = sentences[idx_col].apply(_to_list)

    records = []
    triples = zip(sentences["sentence_id"], sentences["tokens"], sentences[idx_col])
    for sid, words, positions in triples:
        # zip stops at the shorter list, i.e. only aligned (word, index) pairs are used.
        for wid, (word, raw_index) in enumerate(zip(words, positions)):
            try:
                k = int(raw_index)
            except Exception:
                continue
            if k < 0 or (k == 0 and not include_zero):
                continue
            k = min(k, index_max)  # bucket as index_max+
            records.append((sid, wid, str(k), word))

    df_tok = pd.DataFrame(records, columns=["sentence_id", "word_id", "index_class", "word"])
    if df_tok.empty:
        raise ValueError("No token rows constructed—check that your 'index' column contains integer lists.")

    df_sent = sentences[["sentence_id", "tokens"]].drop_duplicates("sentence_id")
    return df_sent, df_tok

def sample_per_class(df_tok: pd.DataFrame, per_class_cap: int | None) -> pd.DataFrame:
    """Optional per-class subsample for plotting."""
    if per_class_cap is None:
        return df_tok.reset_index(drop=True)
    picks = []
    for c, sub in df_tok.groupby("index_class", sort=False):
        n = min(len(sub), per_class_cap)
        picks.append(sub.sample(n, random_state=RAND_SEED, replace=False))
    return pd.concat(picks, ignore_index=True)

# --------- Robust model/tokenizer loader (BERT + GPT-2) ----------
def _load_tok_and_model(model_id: str):
    """Load a tokenizer/model pair, trying alternative ids for GPT‑2.

    `model_id` is tried first. When it is "gpt2" or "gpt-2" (case-insensitive),
    "openai-community/gpt2" and "gpt2" are tried afterwards. The tokenizer is
    set to right padding and reuses EOS as PAD when it has no pad token. The
    model is put in eval mode, moved to `device`, and converted to half
    precision on CUDA.

    Returns:
        (tokenizer, model, model_id) where model_id is the id that actually loaded.

    Raises:
        RuntimeError: if every candidate fails; the message lists each attempt
            and the last underlying exception is chained.
    """
    candidate_ids = [model_id]
    # Helpful alt namespace for GPT‑2
    if model_id.lower() in {"gpt2", "gpt-2"}:
        candidate_ids.extend(
            alt for alt in ("openai-community/gpt2", "gpt2") if alt != model_id
        )

    attempts = []
    last_err = None
    for mid in candidate_ids:
        try:
            is_gpt2 = "gpt2" in mid.lower()
            if not is_gpt2:
                tok = AutoTokenizer.from_pretrained(mid, use_fast=True, add_prefix_space=True)
            else:
                tok = GPT2TokenizerFast.from_pretrained(mid, add_prefix_space=True)

            # Right padding; PAD=EOS for GPT‑2
            if getattr(tok, "padding_side", None) != "right":
                tok.padding_side = "right"
            needs_pad = tok.pad_token is None
            if needs_pad and getattr(tok, "eos_token", None) is not None:
                tok.pad_token = tok.eos_token

            mdl = AutoModel.from_pretrained(mid, output_hidden_states=True)
            model_has_no_pad = getattr(mdl.config, "pad_token_id", None) is None
            if model_has_no_pad and tok.pad_token_id is not None:
                mdl.config.pad_token_id = tok.pad_token_id

            mdl = mdl.eval().to(device)
            if device == "cuda":
                mdl.half()
            return tok, mdl, mid
        except Exception as e:
            # Record the failure and fall through to the next candidate id.
            attempts.append((mid, repr(e)))
            last_err = e

    report = "\n".join(f" - {m}: {err}" for m, err in attempts)
    raise RuntimeError("Failed to load tokenizer/model. Attempts:\n" + report) from last_err

# --------- Embed selected tokens ----------
def embed_subset(df_sent: pd.DataFrame,
                 subset_df: pd.DataFrame,
                 baseline: str = BASELINE,
                 word_rep_mode: str = WORD_REP_MODE,
                 batch_size: int = BATCH_SIZE) -> Tuple[np.ndarray, List[str], np.ndarray]:
    """
    Embed the words selected in `subset_df` and collect one vector per word per layer.

    Args:
      df_sent       : one row per sentence; columns `sentence_id` and `tokens` are read.
                      `tokens` is fed to the tokenizer with is_split_into_words=True.
      subset_df     : one row per selected word; columns `sentence_id` and `word_id`
                      are read. Row order defines the global word order.
      baseline      : model id handed to `_load_tok_and_model`.
      word_rep_mode : how sub-token vectors are reduced to one word vector:
                      'first', 'last' or 'mean'.
      batch_size    : number of sentences per forward pass.

    Neither input frame is modified (ids are normalised to str on local copies).

    Return:
      reps   : (L, N_filled, D) float16, with L = _num_hidden_layers(model) + 1
               and D = _hidden_size(model)
      words  : list[str] length N_filled (for hover)
      filled : boolean mask (N,) over the rows of `subset_df` — True where the
               token got embedded. Rows whose word has no sub-token in the
               (possibly truncated) encoding stay False and are dropped from
               `reps` / `words`.

    Raises:
      ValueError   : unknown `word_rep_mode` (checked before the model is loaded).
      RuntimeError : the tokenizer cannot report word ids (not a fast tokenizer).
    """
    # Fail fast: no point loading a model for an invalid mode.
    if word_rep_mode not in {"first", "last", "mean"}:
        raise ValueError("WORD_REP_MODE must be one of {'first','last','mean'}.")

    # Normalise ids to str WITHOUT writing back into the caller's frames.
    sent_ids   = df_sent["sentence_id"].astype(str)
    subset_ids = subset_df["sentence_id"].astype(str)

    # sid -> list[(global_idx, word_id)]
    by_sid: Dict[str, List[Tuple[int, int]]] = {}
    for gidx, (sid, wid) in enumerate(zip(subset_ids, subset_df["word_id"])):
        by_sid.setdefault(str(sid), []).append((gidx, int(wid)))

    sids = list(by_sid.keys())
    keep = sent_ids.isin(sids).to_numpy()
    # .assign builds a new frame, so the str ids never touch df_sent itself.
    df_sel = (df_sent.loc[keep]
              .assign(sentence_id=sent_ids.to_numpy()[keep])
              .drop_duplicates("sentence_id")
              .set_index("sentence_id")
              .loc[sids])

    tokzr, model, resolved = _load_tok_and_model(baseline)
    enc_kwargs = dict(is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)
    if "add_prefix_space" in inspect.signature(tokzr.__call__).parameters:
        enc_kwargs["add_prefix_space"] = True

    L = _num_hidden_layers(model) + 1
    D = _hidden_size(model)
    N = len(subset_df)

    reps   = np.zeros((L, N, D), np.float16)
    words  = [""] * N
    filled = np.zeros(N, dtype=bool)

    with torch.no_grad(), torch.cuda.amp.autocast(device == "cuda"):
        for start in tqdm(range(0, len(sids), batch_size), desc=f"{resolved} (embed subset)"):
            batch_ids    = sids[start : start + batch_size]
            batch_tokens = df_sel.loc[batch_ids, "tokens"].tolist()

            enc_be = tokzr(batch_tokens, **enc_kwargs)
            enc_t  = {k: v.to(device) for k, v in enc_be.items()}
            out    = model(**enc_t)
            h      = torch.stack(out.hidden_states).detach().cpu().numpy().astype(np.float32)  # (L,B,T,D)

            for b, sid in enumerate(batch_ids):
                # map word_id -> token positions
                mp: Dict[int, List[int]] = {}
                wids = enc_be.word_ids(b)
                if wids is None:
                    raise RuntimeError("Fast tokenizer required (word_ids() unavailable).")
                for tidx, wid in enumerate(wids):
                    if wid is not None:
                        mp.setdefault(int(wid), []).append(int(tidx))

                toks_for_sent = df_sel.loc[sid, "tokens"]
                for gidx, wid in by_sid.get(sid, []):
                    toks = mp.get(wid)
                    if not toks:
                        # word has no sub-token in this encoding -> stays unfilled
                        continue
                    if word_rep_mode == "first":
                        vec = h[:, b, toks[0], :]
                    elif word_rep_mode == "last":
                        vec = h[:, b, toks[-1], :]
                    else:  # "mean" (mode was validated on entry)
                        vec = h[:, b, toks, :].mean(axis=1)
                    reps[:, gidx, :] = vec.astype(np.float16, copy=False)
                    words[gidx] = str(toks_for_sent[wid])
                    filled[gidx] = True

            del enc_be, enc_t, out, h
            if device == "cuda":
                torch.cuda.empty_cache()

    if (~filled).any():
        # drop unfilled rows to keep arrays consistent
        reps  = reps[:, filled]
        words = [w for w, f in zip(words, filled) if f]
    return reps, words, filled

# --------- PCA (3D) without extra deps ----------
def _pca3d_layer(X: np.ndarray, n_components: int = 3) -> np.ndarray:
    """
    PCA to 3D via SVD of centered X; returns (n, 3).
    """
    X = X.astype(np.float32, copy=False)
    Xc = X - X.mean(0, keepdims=True)
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    Y = (U[:, :n_components] * S[:n_components]).astype(np.float32, copy=False)
    return Y

# --------- Distinct color for each class ----------
def _make_distinct_palette(classes: List[str], scale: str = "Turbo") -> Dict[str, str]:
    """
    Return {class -> color} as rgb() strings. We sample a Plotly colorscale evenly
    so every class has a distinct color (works well even for many classes).
    """
    n = len(classes)
    if n <= 1:
        return {classes[0]: "rgb(0,0,0)"} if n == 1 else {}
    colorscale = pc.get_colorscale(scale)  # e.g., "Turbo", "Viridis"
    ts = np.linspace(0.0, 1.0, n)
    sampled = pc.sample_colorscale(colorscale, ts.tolist())
    return {cls: col for cls, col in zip(classes, sampled)}

# --------- Plotly: PCA‑3D per layer, traces per class ----------
def pca3d_by_index_and_plot(reps: np.ndarray,
                            words: List[str],
                            classes_arr: np.ndarray,
                            all_classes: List[str],
                            model_tag: str,
                            html_out: Path):
    """
    Build one trace per (layer, class); slider toggles layers.

    Args:
      reps        : (L, N, D) array of per-layer token vectors.
      words       : N hover labels, aligned with axis 1 of `reps`.
      classes_arr : N class labels, compared with `==` against `all_classes`.
      all_classes : classes to draw, in legend order; each gets its own color.
      model_tag   : text used in the figure title.
      html_out    : destination of the interactive HTML file; its parent
                    directory is created if missing.

    Side effects: prints the palette and progress, writes `html_out`.
    """
    L, N, _ = reps.shape
    print(f"PCA plotting on {N:,} tokens across {L} layers...")

    # PCA per layer
    Y_layers = [_pca3d_layer(reps[l]) for l in range(L)]

    # Colors
    cmap = _make_distinct_palette(all_classes, scale="Turbo")
    print("\nPalette (class -> color):")
    for k in all_classes:
        print(f"  {k:>3} -> {cmap[k]}")

    # Class masks and hover labels do not depend on the layer: compute them
    # once here instead of once per (layer, class) pair.
    words_arr = np.array(words)
    class_views = []
    for c in all_classes:
        mask = (classes_arr == c)
        has_points = bool(np.any(mask))
        hovertxt = [f"{w} | idx={c}" for w in words_arr[mask]] if has_points else []
        class_views.append((c, mask, has_points, hovertxt))

    traces = []
    # add traces in layer-major order: for each layer, add one trace per class
    for l in range(L):
        Y = Y_layers[l]
        show_legend = (l == 0)  # only show legend for layer 0
        for c, mask, has_points, hovertxt in class_views:
            x = Y[mask, 0] if has_points else []
            y = Y[mask, 1] if has_points else []
            z = Y[mask, 2] if has_points else []

            traces.append(
                go.Scatter3d(
                    x=x, y=y, z=z,
                    mode="markers",
                    marker=dict(size=2, opacity=0.75, color=cmap[c]),
                    name=str(c),
                    hovertext=hovertxt,
                    # IMPORTANT: no Python %-format on strings containing %{...}
                    hovertemplate="<b>%{hovertext}</b><br>"
                                  "x=%{x:.3f}<br>y=%{y:.3f}<br>z=%{z:.3f}<extra></extra>",
                    visible=(l == 0),
                    showlegend=show_legend
                )
            )

    # Slider steps: step l shows only the |classes| consecutive traces of layer l
    n_per_layer = len(all_classes)
    steps = []
    for l in range(L):
        vis = [False] * (L * n_per_layer)
        start = l * n_per_layer
        vis[start:start + n_per_layer] = [True] * n_per_layer

        steps.append(dict(
            method="update",
            args=[
                {"visible": vis},
                {"title": {
                    "text": f"{model_tag} • PCA 3D by index • Layer {l} (drag to rotate)",
                    "font": {"size": PLOT_TITLE_FONT},
                }},
            ],
            label=str(l),
        ))

    sliders = [dict(
        active=0,
        steps=steps,
        currentvalue={"prefix": "Layer: ", "font": {"size": PLOT_SLIDER_FONT}},
        font={"size": PLOT_SLIDER_FONT},   # step label font
        pad={"t": 10}
    )]

    layout = go.Layout(
        title={
            "text": f"{model_tag} • PCA 3D by index • Layer 0 (drag to rotate)",
            "font": {"size": PLOT_TITLE_FONT},
        },
        font={"size": PLOT_FONT},
        hoverlabel={"font": {"size": PLOT_HOVER_FONT}},

        scene=dict(
            xaxis=dict(
                title=dict(text="PC1", font=dict(size=PLOT_AXIS_TITLE_FONT)),
                tickfont=dict(size=PLOT_AXIS_TICK_FONT),
            ),
            yaxis=dict(
                title=dict(text="PC2", font=dict(size=PLOT_AXIS_TITLE_FONT)),
                tickfont=dict(size=PLOT_AXIS_TICK_FONT),
            ),
            zaxis=dict(
                title=dict(text="PC3", font=dict(size=PLOT_AXIS_TITLE_FONT)),
                tickfont=dict(size=PLOT_AXIS_TICK_FONT),
            ),
            aspectmode="data",
        ),

        margin=dict(l=0, r=0, b=0, t=60),
        sliders=sliders,

        showlegend=True,
        legend=dict(
            title=dict(text="index class", font=dict(size=PLOT_LEGEND_FONT)),
            font=dict(size=PLOT_LEGEND_FONT),
        )
    )

    fig = go.Figure(data=traces, layout=layout)
    # Make sure the target folder exists so write_html cannot fail on a fresh checkout.
    Path(html_out).parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(str(html_out), include_plotlyjs="cdn")
    print("\n✓ Saved interactive HTML to:", html_out)

# =============================== DRIVER ===============================
def run_pca3d_index():
    """
    End-to-end driver: load index classes, subsample, embed, and write the
    interactive PCA-3D HTML.

    Everything is configured through module-level constants (CSV_PATH,
    INDEX_MAX_CLASS, INCLUDE_ZERO_CLASS, PCA_MAX_PER_CLASS, BASELINE,
    WORD_REP_MODE, BATCH_SIZE, HTML_OUT). Returns nothing.
    """
    # Step 1 — sentences plus the per-token index classes
    df_sent, idx_df = load_index_from_column(
        CSV_PATH,
        index_max=INDEX_MAX_CLASS,
        include_zero=INCLUDE_ZERO_CLASS,
    )

    # Step 2 — cap the number of tokens per class (no-op when the cap is None)
    plot_df = sample_per_class(idx_df, PCA_MAX_PER_CLASS)
    classes = sorted(plot_df.index_class.unique(), key=int)  # numeric order
    print(f"✓ plotting subset — {len(plot_df):,} tokens across index classes {classes}")

    # Step 3 — a single embedding pass over the subset
    reps, words, filled = embed_subset(df_sent, plot_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)

    # Keep only the rows that were actually embedded so labels line up with reps
    kept_df = plot_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    cls_arr = kept_df.index_class.values.astype(str)

    # Step 4 — per-layer PCA to 3D and the Plotly figure
    reps_f32 = reps.astype(np.float32, copy=False)
    pca3d_by_index_and_plot(
        reps_f32,
        words,
        cls_arr,
        classes,
        model_tag=BASELINE,
        html_out=HTML_OUT,
    )

    # Release the large arrays and any cached GPU memory
    del reps, reps_f32
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

# Entry point: run the whole PCA-3D pipeline when this code executes as the
# main program (nothing runs if the code is merely imported).
if __name__ == "__main__":
    run_pca3d_index()


✓ plotting subset — 194,916 tokens across index classes ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']


openai-community/gpt2 (embed subset): 100%|█| 5034/5034 [01:06<00:00, 75.69it/s]


PCA plotting on 194,916 tokens across 13 layers...

Palette (class -> color):
    1 -> rgb(48, 18, 59)
    2 -> rgb(68, 96, 208)
    3 -> rgb(54, 167, 248)
    4 -> rgb(33, 226, 181)
    5 -> rgb(112, 252, 97)
    6 -> rgb(199, 236, 54)
    7 -> rgb(247, 184, 54)
    8 -> rgb(244, 105, 24)
    9 -> rgb(199, 42, 4)
   10 -> rgb(122, 4, 2)

✓ Saved interactive HTML to: pca3d_index/gpt2_index_pca3d_layers.html
