In [1]:
from __future__ import annotations
import os, ast, random, inspect
from pathlib import Path
from typing import Dict, List

import numpy as np, pandas as pd, torch
import torch.utils.data as torchdata
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt, seaborn as sns
from transformers import AutoTokenizer, AutoModel, BertModel, AutoConfig

from IsoScore import IsoScore
from dadapy import Data
from skdim.id import MLE, MOM, TLE, CorrInt, FisherS, lPCA

2025-12-07 21:50:51.915113: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from __future__ import annotations
import os, gc, ast, random, inspect
from pathlib import Path
from typing import Dict, List, Tuple, Callable
from __future__ import annotations
import os, ast, random, inspect
from pathlib import Path
from typing import Dict, List

import numpy as np, pandas as pd, torch
import torch.utils.data as torchdata
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt, seaborn as sns
from transformers import AutoTokenizer, AutoModel, BertModel, AutoConfig

from IsoScore import IsoScore
from dadapy import Data
from skdim.id import MLE, MOM, TLE, CorrInt, FisherS, lPCA
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel

# ===== Optional deps (gracefully skipped if not installed) =====
# DADApy provides the TwoNN / GRIDE intrinsic-dimension estimators.
try:
    from dadapy import Data
except Exception:
    HAS_DADAPY = False
else:
    HAS_DADAPY = True

# scikit-dimension provides the remaining intrinsic-dimension estimators.
try:
    from skdim.id import (
        MOM, TLE, CorrInt, FisherS, lPCA,
        MLE, DANCo, ESS, MiND_ML, MADA, KNN
    )
except Exception:
    HAS_SKDIM = False
else:
    HAS_SKDIM = True

# IsoScore: use the library if available, else a simple monotone fallback.
# This file's own import block loads the library as `from IsoScore import IsoScore`
# (capitalised module name), so that spelling is tried first; the lowercase
# spelling is kept as a second attempt. Trying only `isoscore` fails on
# case-sensitive filesystems and silently swaps in the fallback below even
# when the real library is installed.
try:
    try:
        from IsoScore import IsoScore
    except ImportError:
        from isoscore import IsoScore
    _HAS_ISOSCORE = True
except Exception:
    _HAS_ISOSCORE = False

    class _IsoScoreFallback:
        """Stand-in offering the same `IsoScore.IsoScore(X)` call shape as the library."""

        @staticmethod
        def IsoScore(X: np.ndarray) -> float:
            """Return mean/max covariance eigenvalue of X, clipped to [0, 1].

            Columns of X are treated as variables (the covariance is taken of
            ``X.T``). Returns 0.0 when the spectrum has no positive mass.
            """
            C = np.cov(X.T, ddof=0)
            ev = np.linalg.eigvalsh(C)  # ascending, so ev[-1] is the largest
            if ev.mean() <= 0 or ev[-1] <= 0:
                return 0.0
            # mean / max eigenvalue in [0,1]; higher ≈ more isotropic
            return float(np.clip(ev.mean() / (ev[-1] + 1e-9), 0.0, 1.0))

    IsoScore = _IsoScoreFallback()

# =============================== CONFIG ===============================
CSV_PATH      = "en_ewt-ud-train_sentences.csv"
HEAD_DIST_COL = "head_dist"             # existing column holding the per-token distances
BASELINE      = "bert-base-uncased"     # set to "gpt2" for GPT-2 family
WORD_REP_MODE = "first"                 # BERT: {"first","last","mean"}; GPT-2: {"last","mean"}

# Sampling / bootstrap
RAW_MAX_PER_CLASS              = 10**12     # effectively no cap for fast metrics
N_BOOTSTRAP_FAST               = 50
N_BOOTSTRAP_HEAVY              = 200
FAST_BS_MAX_SAMP_PER_CLASS     = 10**12     # M = N (classic bootstrap)
HEAVY_BS_MAX_SAMP_PER_CLASS    = 5000       # practical for TwoNN/GRIDE/skdim
MIN_N_FOR_HEAVY_WARN           = 1000       # warn if a class has fewer than this

# Head-distance classes
HEAD_DIST_CLAMP       = 6                  # keep classes within [-6,6]
INCLUDE_ZERO_CLASS    = False              # set True if you also want "0" class

# Misc
RAND_SEED = 42
PLOT_DIR  = Path("results_HEADDIST")
PLOT_DIR.mkdir(exist_ok=True, parents=True)
CSV_DIR   = Path("tables_HEADDIST") / "headdist_bootstrap"
CSV_DIR.mkdir(exist_ok=True, parents=True)
BATCH_SIZE = 1

# Repro & device
os.environ["TOKENIZERS_PARALLELISM"] = "true"
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)
torch.manual_seed(RAND_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.backends.cudnn.benchmark = True

# Plot defaults and the small constant used to stabilise logs / divisions.
sns.set_style("darkgrid")
plt.rcParams["figure.dpi"] = 120
EPS = 1e-12

# =============================== HELPERS ===============================
def _to_list(x):
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x

def _center(X: np.ndarray) -> np.ndarray:
    return X - X.mean(0, keepdims=True)

def _eigvals_from_X(X: np.ndarray) -> np.ndarray:
    """Squared singular values of the column-centered X, largest first.

    These match the covariance eigenvalues up to a constant factor (no
    division by the sample count is applied). X is cast to float32 before
    centering; the result is float64. If the SVD step raises, an empty
    float64 array is returned instead.
    """
    centered = _center(X.astype(np.float32, copy=False))
    try:
        singular = np.linalg.svd(centered, full_matrices=False)[1]
        ascending = np.sort((singular**2).astype(np.float64))
    except Exception:
        return np.array([], dtype=np.float64)
    return ascending[::-1]

def _jitter_unique(X: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Add tiny noise if there are duplicate rows (helps NN-based estimators)."""
    try:
        if np.unique(X, axis=0).shape[0] < X.shape[0]:
            X = X + np.random.normal(scale=eps, size=X.shape).astype(X.dtype)
    except Exception:
        pass
    return X

def _num_hidden_layers(model) -> int:
    n = getattr(model.config, "num_hidden_layers", None)
    if n is None: n = getattr(model.config, "n_layer", None)  # GPT-2
    if n is None: raise ValueError("Cannot determine number of hidden layers from model.config")
    return int(n)

def _hidden_size(model) -> int:
    d = getattr(model.config, "hidden_size", None)
    if d is None: d = getattr(model.config, "n_embd", None)  # GPT-2
    if d is None: raise ValueError("Cannot determine hidden size from model.config")
    return int(d)

# ========= Metric single-call functions =========
# --- Isotropy (fast) ---
def _iso_once(X: np.ndarray) -> float:
    """Score X with the module-level `IsoScore` object and return a plain float."""
    score = IsoScore.IsoScore(X)
    return float(score)

def _sf_once(X: np.ndarray) -> float:
    """Spectral flatness: geometric mean over arithmetic mean of the spectrum.

    The spectrum comes from `_eigvals_from_X`; EPS guards the log and the
    division. Returns NaN when no spectrum is available.
    Higher => flatter spectrum => more isotropic.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    log_mean = np.mean(np.log(spectrum + EPS))
    geometric = np.exp(log_mean)
    arithmetic = float(spectrum.mean() + EPS)
    return float(geometric / arithmetic)

def _vmf_kappa_once(X: np.ndarray) -> float:
    if X.shape[0] < 2: return np.nan
    Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-9)
    R = np.linalg.norm(Xn.mean(axis=0))
    d = Xn.shape[1]
    if R < 1e-9: return 0.0
    return float(max(R * (d - R**2) / (1.0 - R**2 + 1e-9), 0.0))  # higher => more anisotropic

def _spect_once(X: np.ndarray) -> float:
    # spectral ratio: lambda_max / mean(lambda) (higher => more anisotropic)
    ev = np.linalg.eigvalsh(np.cov(X.T, ddof=0))
    if ev.size == 0: return np.nan
    return float(ev[-1] / (ev.mean() + 1e-9))

def _rand_once(X: np.ndarray, K: int = 2000) -> float:
    """Mean absolute cosine between randomly paired distinct rows of X.

    Up to K pairs are drawn (capped at the number of unordered pairs) using a
    generator seeded with RAND_SEED, so repeated calls on the same-sized input
    use the same index pairs. Returns NaN for fewer than two rows.
    Higher => more anisotropic.
    """
    n = X.shape[0]
    if n < 2:
        return np.nan
    rng = np.random.default_rng(RAND_SEED)
    K_eff = min(K, (n * (n - 1)) // 2)
    i = rng.integers(0, n, size=K_eff)
    # A non-zero offset modulo n makes j uniform over the rows other than i.
    # Redrawing collisions once can collide again, letting self-pairs with
    # |cos| = 1 inflate the average.
    j = (i + rng.integers(1, n, size=K_eff)) % n
    A, B = X[i], X[j]
    num = np.einsum("ij,ij->i", A, B)  # row-wise dot products
    den = (np.linalg.norm(A, axis=1) * np.linalg.norm(B, axis=1) + 1e-9)
    return float(np.mean(np.abs(num / den)))

# --- Linear ID (fast) ---
def _pcaXX_once(X: np.ndarray, var_ratio: float) -> float:
    """Number of leading components whose cumulative spectrum reaches `var_ratio` of the total.

    The spectrum comes from `_eigvals_from_X` (descending). Returns NaN when
    no spectrum is available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    cumulative = np.cumsum(spectrum)
    target = cumulative[-1] * var_ratio
    # searchsorted yields a 0-based position; +1 turns it into a count.
    n_components = np.searchsorted(cumulative, target) + 1
    return float(n_components)

def _pca95_once(X: np.ndarray) -> float:
    """`_pcaXX_once` with the threshold fixed at 0.95."""
    return _pcaXX_once(X, var_ratio=0.95)

def _pca99_once(X: np.ndarray) -> float:
    """`_pcaXX_once` with the threshold fixed at 0.99."""
    return _pcaXX_once(X, var_ratio=0.99)

def _erank_once(X: np.ndarray) -> float:
    """Effective rank: exp of the Shannon entropy of the normalised spectrum.

    The spectrum comes from `_eigvals_from_X`; EPS guards the normalisation
    and the log. Returns NaN when no spectrum is available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    weights = spectrum / (spectrum.sum() + EPS)
    entropy = -(weights * np.log(weights + EPS)).sum()
    return float(np.exp(entropy))

def _pr_once(X: np.ndarray) -> float:
    """Participation ratio: (sum of spectrum)^2 / sum of squared spectrum.

    The spectrum comes from `_eigvals_from_X`. Returns NaN when no spectrum
    is available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    total = spectrum.sum()
    total_of_squares = (spectrum**2).sum()
    return float((total**2) / (total_of_squares + EPS))

def _stable_rank_once(X: np.ndarray) -> float:
    """Stable rank: sum of the spectrum divided by its largest value.

    The spectrum comes from `_eigvals_from_X`. Returns NaN when no spectrum
    is available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    largest = spectrum.max()
    return float(spectrum.sum() / (largest + EPS))

# --- Non-linear (heavy) ---
def _dadapy_twonn_once(X: np.ndarray) -> float:
    """TwoNN intrinsic-dimension estimate via DADApy; NaN when DADApy is absent.

    Duplicate rows are jittered first (see `_jitter_unique`). Only the first
    of the three values returned by `compute_id_2NN` is used.
    """
    if not HAS_DADAPY:
        return np.nan
    dataset = Data(coordinates=_jitter_unique(X))
    twonn_id, _unused_a, _unused_b = dataset.compute_id_2NN()
    return float(twonn_id)

def _dadapy_gride_once(X: np.ndarray) -> float:
    """GRIDE intrinsic-dimension estimate via DADApy; NaN when DADApy is absent.

    Distances are computed with maxk=64 and the scaling analysis is run with
    range_max=64; the last entry of the returned id sequence is reported.
    """
    if not HAS_DADAPY:
        return np.nan
    dataset = Data(coordinates=_jitter_unique(X))
    dataset.compute_distances(maxk=64)
    id_by_scale, _unused_a, _unused_b = dataset.return_id_scaling_gride(range_max=64)
    return float(id_by_scale[-1])

def _skdim_factory(name: str):
    """Return a zero-argument builder for the scikit-dimension estimator `name`.

    Returns None when scikit-dimension is unavailable or `name` is unknown.
    The three lPCA variants differ only in their constructor arguments; every
    other estimator is built with its defaults.
    """
    if not HAS_SKDIM:
        return None
    estimators = {
        "mom": MOM, "tle": TLE, "corrint": CorrInt, "fishers": FisherS,
        "lpca": lPCA, "lpca99": lPCA, "lpca95": lPCA,
        "mle": MLE, "danco": DANCo, "ess": ESS, "mind_ml": MiND_ML,
        "mada": MADA, "knn": KNN,
    }
    if name not in estimators:
        return None
    estimator_cls = estimators[name]
    ctor_kwargs = {
        "lpca":   {"ver": "FO"},
        "lpca99": {"ver": "ratio", "alphaRatio": 0.99},
        "lpca95": {"ver": "ratio", "alphaRatio": 0.95},
    }.get(name, {})

    def _builder():
        return estimator_cls(**ctor_kwargs)

    return _builder

def _skdim_once_builder(name: str) -> Callable[[np.ndarray], float] | None:
    """Wrap the scikit-dimension estimator `name` as an `X -> float` callable.

    Returns None when `_skdim_factory` cannot provide the estimator. The
    callable fits a fresh estimator on the (jittered) input each time and
    reports its `dimension_` attribute, or NaN if that attribute is missing.
    """
    make_estimator = _skdim_factory(name)
    if make_estimator is None:
        return None

    def _once(X: np.ndarray) -> float:
        estimator = make_estimator()
        estimator.fit(_jitter_unique(X))
        return float(getattr(estimator, "dimension_", np.nan))

    return _once

# =============================== DATA: use existing head_dist column ===============================
def load_head_dist_from_column(csv_path: str,
                               head_dist_col: str = HEAD_DIST_COL,
                               clamp: int = HEAD_DIST_CLAMP,
                               include_zero: bool = INCLUDE_ZERO_CLASS):
    """
    Read the sentence-level CSV and expand it to one row per token.

    Expected CSV columns:
      - sentence_id (read as str)
      - tokens      (stringified list, one row per sentence)
      - head_dist   (stringified list of signed distances, one per token)

    Returns (df_sent, df_tok):
      - df_sent: unique (sentence_id, tokens) rows
      - df_tok:  columns sentence_id, word_id, head_dist_class, word, where
                 head_dist_class is the distance limited to [-clamp, clamp]
                 and stored as a string.

    Entries that cannot be converted to int are skipped, as are zero
    distances unless `include_zero` is set. When the two lists differ in
    length only the common prefix is used.
    """
    wanted = ["sentence_id", "tokens", head_dist_col]
    df = pd.read_csv(csv_path, usecols=wanted, dtype={"sentence_id": str})
    df.tokens = df.tokens.apply(_to_list)
    df[head_dist_col] = df[head_dist_col].apply(_to_list)

    records = []
    for sid, toks, dists in df[["sentence_id", "tokens", head_dist_col]].itertuples(index=False):
        for wid, (tok, raw) in enumerate(zip(toks, dists)):
            try:
                dist = int(raw)
            except Exception:
                continue
            if dist == 0 and not include_zero:
                continue
            dist = min(max(dist, -clamp), clamp)
            records.append((sid, wid, str(dist), tok))

    df_tok = pd.DataFrame(records, columns=["sentence_id", "word_id", "head_dist_class", "word"])
    df_sent = df[["sentence_id", "tokens"]].drop_duplicates("sentence_id")
    return df_sent, df_tok

def sample_raw(df_tok: pd.DataFrame, per_class_cap: int = RAW_MAX_PER_CLASS) -> pd.DataFrame:
    """Draw at most `per_class_cap` rows per `head_dist_class`, without replacement.

    Each class is sampled with `random_state=RAND_SEED`; the per-class samples
    are concatenated in first-appearance order with a fresh index.
    """
    sampled = [
        group.sample(min(len(group), per_class_cap), random_state=RAND_SEED, replace=False)
        for _, group in df_tok.groupby("head_dist_class", sort=False)
    ]
    return pd.concat(sampled, ignore_index=True)

def make_dist_palette(classes: List[str]) -> Dict[str, Tuple[float, float, float]]:
    """Map each class label to a "coolwarm" colour, ordered by integer value."""
    ordered = sorted(int(c) for c in classes)
    colours = sns.color_palette("coolwarm", len(ordered))
    return {str(value): colour for value, colour in zip(ordered, colours)}

# =============================== EMBEDDING (BERT & GPT‑2) ===============================
def embed_subset(df_sent: pd.DataFrame,
                 subset_df: pd.DataFrame,
                 baseline: str = BASELINE,
                 word_rep_mode: str = WORD_REP_MODE,
                 batch_size: int = BATCH_SIZE) -> Tuple[np.ndarray, np.ndarray]:
    """
    Embed the words listed in `subset_df` with `baseline`, at every layer.

    Parameters
      df_sent:       one row per sentence with columns sentence_id, tokens.
      subset_df:     one row per word with columns sentence_id, word_id.
      baseline:      model id handed to AutoTokenizer / AutoModel.
      word_rep_mode: which sub-token(s) represent a word: "first", "last" or "mean".
      batch_size:    sentences per forward pass.

    Returns (reps, filled):
      reps:   float16 array (layers + 1, len(subset_df), hidden); index 0 is the
              embedding output. Rows for words without a vector stay zero.
      filled: bool array (len(subset_df),), True where a vector was written.

    Raises ValueError for an unknown `word_rep_mode` (checked before the model
    is loaded). The input frames are not modified.
    """
    if word_rep_mode not in ("first", "last", "mean"):
        raise ValueError("WORD_REP_MODE must be one of {'first','last','mean'} (for GPT-2 use 'last' or 'mean').")

    # Use string-typed views of the ids instead of writing the cast back into
    # the caller's frames.
    sent_ids   = df_sent["sentence_id"].astype(str)
    subset_ids = subset_df["sentence_id"].astype(str)

    # sid -> list[(global_idx, word_id)]
    by_sid: Dict[str, List[Tuple[int, int]]] = {}
    for gidx, (sid, wid) in enumerate(zip(subset_ids, subset_df["word_id"])):
        by_sid.setdefault(sid, []).append((gidx, int(wid)))

    sids = list(by_sid.keys())
    sent_view = df_sent.assign(sentence_id=sent_ids)
    df_sel = (sent_view[sent_view.sentence_id.isin(sids)]
              .drop_duplicates("sentence_id")
              .set_index("sentence_id")
              .loc[sids])

    tokzr = AutoTokenizer.from_pretrained(baseline, use_fast=True)
    # Byte-level BPE tokenizers (GPT-2 family) must be constructed with
    # add_prefix_space=True to accept pre-split words. The flag is a
    # constructor option, not a __call__ argument, so reload when it is off.
    if getattr(tokzr, "add_prefix_space", None) is False:
        tokzr = AutoTokenizer.from_pretrained(baseline, use_fast=True, add_prefix_space=True)
    if tokzr.pad_token is None and getattr(tokzr, "eos_token", None) is not None:
        tokzr.pad_token = tokzr.eos_token
    # truncation=True keeps over-long sentences from crashing the forward pass;
    # words cut off by it simply stay unfilled.
    enc_kwargs = dict(is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)

    model = AutoModel.from_pretrained(baseline, output_hidden_states=True).eval().to(device)
    if getattr(model.config, "pad_token_id", None) is None and tokzr.pad_token_id is not None:
        model.config.pad_token_id = tokzr.pad_token_id
    if device == "cuda":
        model.half()

    L = _num_hidden_layers(model) + 1   # include embeddings
    D = _hidden_size(model)
    N = len(subset_df)

    reps   = np.zeros((L, N, D), np.float16)
    filled = np.zeros(N, dtype=bool)

    try:
        with torch.no_grad(), torch.cuda.amp.autocast(device == "cuda"):
            for start in tqdm(range(0, len(sids), batch_size), desc=f"{baseline} (embed subset)"):
                batch_ids    = sids[start : start + batch_size]
                batch_tokens = df_sel.loc[batch_ids, "tokens"].tolist()

                enc_be = tokzr(batch_tokens, **enc_kwargs)
                enc_t  = {k: v.to(device) for k, v in enc_be.items()}
                out = model(**enc_t)
                h = torch.stack(out.hidden_states).detach().cpu().numpy().astype(np.float32)  # (L,B,T,D)

                for b, sid in enumerate(batch_ids):
                    # word index -> positions of its sub-tokens in the encoded sequence
                    mp = {}
                    for tidx, wid in enumerate(enc_be.word_ids(b)):
                        if wid is not None:
                            mp.setdefault(int(wid), []).append(int(tidx))

                    for gidx, wid in by_sid.get(sid, []):
                        toks = mp.get(wid)
                        if not toks:
                            continue
                        if word_rep_mode == "first":
                            vec = h[:, b, toks[0], :]
                        elif word_rep_mode == "last":
                            vec = h[:, b, toks[-1], :]
                        else:  # "mean" (validated above)
                            vec = h[:, b, toks, :].mean(axis=1)
                        reps[:, gidx, :] = vec.astype(np.float16, copy=False)
                        filled[gidx] = True

                del enc_be, enc_t, out, h
                if device == "cuda":
                    torch.cuda.empty_cache()
    finally:
        # Release the model even when the loop is interrupted.
        del model
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

    missing = int((~filled).sum())
    if missing:
        print(f"⚠ Missing vectors for {missing} of {N} tokens")
    return reps, filled

# =============================== BOOTSTRAP CORE ===============================
def _bs_layer_loop(rep_sub: np.ndarray, M: int, n_reps: int, compute_once: Callable[[np.ndarray], float]):
    """Bootstrap `compute_once` per layer and summarise the replicates.

    rep_sub is indexed as (layer, point, feature). Each replicate draws M
    point indices with replacement (generator seeded with RAND_SEED) and
    reuses them for every layer. A metric call that raises is recorded as NaN.

    Returns (mean, 2.5th percentile, 97.5th percentile), each a float32 array
    with one entry per layer, computed ignoring NaNs.
    """
    n_layers, n_points, _ = rep_sub.shape
    rng = np.random.default_rng(RAND_SEED)
    draws = np.full((n_reps, n_layers), np.nan, np.float32)
    for rep in range(n_reps):
        picked = rng.integers(0, n_points, size=M)
        for layer in range(n_layers):
            sample = rep_sub[layer, picked].astype(np.float32, copy=False)
            try:
                value = float(compute_once(sample))
            except Exception:
                value = np.nan
            draws[rep, layer] = value
    mean = np.nanmean(draws, axis=0).astype(np.float32)
    lower = np.nanpercentile(draws, 2.5, axis=0).astype(np.float32)
    upper = np.nanpercentile(draws, 97.5, axis=0).astype(np.float32)
    return mean, lower, upper

# ---- Metric registries ----
# Cheap spectral / isotropy metrics, keyed by the short name used in file names.
FAST_ONCE: Dict[str, Callable[[np.ndarray], float]] = dict(
    # Isotropy
    iso=_iso_once,
    sf=_sf_once,
    vmf_kappa=_vmf_kappa_once,      # anisotropy↑
    spect=_spect_once,              # anisotropy↑
    rand=_rand_once,                # anisotropy↑
    # Linear ID (spectral)
    pca95=_pca95_once,
    pca99=_pca99_once,
    erank=_erank_once,
    pr=_pr_once,
    stable_rank=_stable_rank_once,
)

# Costlier estimators; an entry is None when its backing library is missing.
HEAVY_ONCE: Dict[str, Callable[[np.ndarray], float] | None] = {
    # DADApy
    "twonn": _dadapy_twonn_once,
    "gride": _dadapy_gride_once,
    # scikit-dimension
    **{
        skdim_name: _skdim_once_builder(skdim_name)
        for skdim_name in (
            "mom", "tle", "corrint", "fishers",
            "lpca", "lpca95", "lpca99",
            "mle", "danco", "ess", "mind_ml", "mada", "knn",
        )
    },
}

# Human-readable axis / title labels per metric key.
LABELS = {
    # Isotropy
    "iso": "IsoScore",
    "sf": "Spectral Flatness",
    "vmf_kappa": "vMF κ (anisotropy↑)",
    "spect": "Spectral Ratio (λ_max/μ, anisotropy↑)",
    "rand": "RandCos |μ| (anisotropy↑)",
    # Linear ID
    "pca95": "lPCA 0.95",
    "pca99": "lPCA 0.99",
    "erank": "Effective Rank",
    "pr": "Participation Ratio",
    "stable_rank": "Stable Rank",
    # Non-linear
    "twonn": "TwoNN ID",
    "gride": "GRIDE",
    "mom": "MOM",
    "tle": "TLE",
    "corrint": "CorrInt",
    "fishers": "FisherS",
    "lpca": "lPCA FO",
    "lpca95": "lPCA 0.95 (skdim)",
    "lpca99": "lPCA 0.99 (skdim)",
    "mle": "MLE",
    "danco": "DANCo",
    "ess": "ESS",
    "mind_ml": "MiND_ML",
    "mada": "MADA",
    "knn": "KNN",
}

# Metrics actually computed by the driver. Only a subset is enabled here;
# to run every registered metric use: ALL_METRICS = list(LABELS.keys())
ALL_METRICS = ["gride", "iso", "lpca99"]

# =============================== SAVE / PLOT ===============================
def save_metric_csv_all_classes(metric: str,
                                class_to_stats: Dict[str, Dict[str, np.ndarray]],
                                layers: np.ndarray,
                                baseline: str,
                                subset_name: str = "raw"):
    """
    Write one long-format CSV (one row per class and layer) for `metric`.

    class_to_stats maps a class label to a dict with "mean" and, optionally,
    "lo", "hi" (per-layer arrays) and "n" (token count). Non-finite values are
    written as NaN. The file goes to
    CSV_DIR / "headdist_{subset_name}_{metric}_{baseline}.csv"; missing parent
    directories are created, so a `baseline` containing "/" works. The header
    row is written even when there are no data rows.
    """
    columns = [
        "subset", "model", "feature", "class", "metric", "layer",
        "mean", "ci_low", "ci_high", "n_tokens", "word_rep_mode", "source_csv",
    ]

    def _finite_at(arr, pos):
        # Missing bounds (not an ndarray) and non-finite entries both become NaN.
        if not isinstance(arr, np.ndarray):
            return np.nan
        return float(arr[pos]) if np.isfinite(arr[pos]) else np.nan

    rows = []
    for c, stats in class_to_stats.items():
        mu, lo, hi = stats["mean"], stats.get("lo"), stats.get("hi")
        for l, val in enumerate(mu):
            rows.append({
                "subset": subset_name, "model": baseline, "feature": "head_dist",
                "class": c, "metric": metric, "layer": int(layers[l]),
                "mean": float(val) if np.isfinite(val) else np.nan,
                "ci_low": _finite_at(lo, l),
                "ci_high": _finite_at(hi, l),
                "n_tokens": int(stats.get("n", 0)),
                "word_rep_mode": WORD_REP_MODE,
                "source_csv": Path(CSV_PATH).name,
            })
    # Explicit columns keep the header stable when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    out = CSV_DIR / f"headdist_{subset_name}_{metric}_{baseline}.csv"
    out.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out, index=False)

def plot_metric_with_ci(class_to_stats: Dict[str, Dict[str, np.ndarray]],
                        layers: np.ndarray, metric: str, title: str, out_path: Path,
                        palette: Dict[str, Tuple[float, float, float]] | None = None):
    """
    Plot the per-layer mean of `metric` for every class, with a shaded band
    between "lo" and "hi" when both are available, and save it to `out_path`.

    Classes are drawn in integer order; a class whose mean is missing or all
    NaN is skipped. `palette` optionally maps class label -> colour. The
    parent directory of `out_path` is created if needed and the figure is
    always closed, even if drawing or saving fails.
    """
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(9, 5))
    try:
        n_drawn = 0
        for c in sorted(class_to_stats.keys(), key=lambda s: int(s)):
            stats = class_to_stats[c]
            mu, lo, hi = stats["mean"], stats.get("lo"), stats.get("hi")
            if mu is None or np.all(np.isnan(mu)):
                continue
            color = palette.get(c) if isinstance(palette, dict) else None
            ax.plot(layers, mu, label=c, lw=1.8, color=color)
            n_drawn += 1
            if isinstance(lo, np.ndarray) and isinstance(hi, np.ndarray) and not np.all(np.isnan(lo)):
                ax.fill_between(layers, lo, hi, alpha=0.15, color=color)
        ax.set_xlabel("Layer")
        ax.set_ylabel(LABELS.get(metric, metric.upper()))
        ax.set_title(title)
        # A legend with no labelled lines only triggers a warning, so skip it.
        if n_drawn:
            ax.legend(ncol=4, fontsize="small",
                      title=f"Head distance (−{HEAD_DIST_CLAMP} … {HEAD_DIST_CLAMP})",
                      frameon=False)
        fig.tight_layout()
        fig.savefig(out_path, dpi=220)
    finally:
        plt.close(fig)

# =============================== DRIVER ===============================
def run_headdist_from_col_pipeline():
    """
    End-to-end driver: load head-distance classes, embed every selected token
    once, then for each metric in ALL_METRICS bootstrap it per class and per
    layer and write one CSV and one plot.

    Reads its settings from the module-level configuration constants and
    returns nothing; results go to CSV_DIR and PLOT_DIR.
    """
    # 1) Load token lists + distance classes from existing column
    df_sent, hd_df = load_head_dist_from_column(
        CSV_PATH, head_dist_col=HEAD_DIST_COL,
        clamp=HEAD_DIST_CLAMP, include_zero=INCLUDE_ZERO_CLASS
    )
    classes = sorted(hd_df.head_dist_class.unique(), key=lambda s: int(s))
    palette = make_dist_palette(classes)
    print(f"✓ corpus ready — {len(hd_df):,} tokens across head-dist classes {classes}")

    # 2) Optional per-class cap (currently unlimited for fast metrics)
    raw_df = sample_raw(hd_df, RAW_MAX_PER_CLASS)
    print("Sample sizes per head distance (raw cap):")
    counts = raw_df.head_dist_class.value_counts()
    counts = counts.reindex(sorted(counts.index, key=lambda x: int(x)))
    print(counts.to_dict())
    # Warn if heavy estimators may be unreliable for small classes
    too_small = {c: int(n) for c, n in counts.items() if n < MIN_N_FOR_HEAVY_WARN}
    if too_small:
        print(f"⚠ Some classes have < {MIN_N_FOR_HEAVY_WARN} tokens (heavy ID may be noisy): {too_small}")

    # 3) Embed once
    reps, filled = embed_subset(df_sent, raw_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)
    raw_df = raw_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    # Drop the same unfilled tokens from `reps`; otherwise the positions in
    # `cls_arr` no longer line up with the columns of `reps` and the all-zero
    # placeholder rows leak into the metrics. Skipped when nothing is missing
    # to avoid copying the full array.
    if not filled.all():
        reps = reps[:, filled]
    cls_arr = raw_df.head_dist_class.values
    L = reps.shape[0]; layers = np.arange(L)
    print(f"✓ embedded {len(raw_df):,} tokens  • layers={L}")

    # 4) Metric loop
    for metric in ALL_METRICS:
        print(f"\n→ Computing metric: {metric} …")
        compute_once = FAST_ONCE.get(metric) or HEAVY_ONCE.get(metric)
        if compute_once is None:
            print(f"  (skipping {metric}: estimator unavailable)")
            continue
        is_fast = metric in FAST_ONCE
        n_bs = N_BOOTSTRAP_FAST if is_fast else N_BOOTSTRAP_HEAVY
        Mcap = FAST_BS_MAX_SAMP_PER_CLASS if is_fast else HEAVY_BS_MAX_SAMP_PER_CLASS

        class_results: Dict[str, Dict[str, np.ndarray]] = {}
        for c in classes:
            idx = np.where(cls_arr == c)[0]
            if idx.size < 3:  # too few points for any of the metrics
                continue
            sub = reps[:, idx]  # (L, n_c, D)
            Nc = sub.shape[1]
            M = min(Mcap, Nc)
            mu, lo, hi = _bs_layer_loop(sub, M, n_bs, compute_once)
            class_results[c] = {"mean": mu, "lo": lo, "hi": hi, "n": int(Nc)}

        save_metric_csv_all_classes(metric, class_results, layers, BASELINE, subset_name="raw")
        plot_metric_with_ci(class_results, layers, metric,
                            title=f"{LABELS.get(metric, metric.upper())} • {BASELINE}",
                            out_path=PLOT_DIR / f"headdist_raw_{metric}_{BASELINE}.png",
                            palette=palette)
        print(f"  ✓ saved: CSV= {CSV_DIR}/headdist_raw_{metric}_{BASELINE}.csv  "
              f"plot= {PLOT_DIR}/headdist_raw_{metric}_{BASELINE}.png")

        del class_results; gc.collect()
        if device == "cuda": torch.cuda.empty_cache()

    del reps; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    print("\n✓ done (incremental outputs produced per metric).")

if __name__ == "__main__":
    run_headdist_from_col_pipeline()


✓ corpus ready — 184,870 tokens across head-dist classes ['-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5', '6']
Sample sizes per head distance (raw cap):
{'-6': 19585, '-5': 4545, '-4': 6684, '-3': 10487, '-2': 15690, '-1': 13038, '1': 57036, '2': 28588, '3': 13592, '4': 6439, '5': 3249, '6': 5937}


bert-base-uncased (embed subset):   8%|▎   | 780/10067 [00:07<01:23, 110.96it/s]


KeyboardInterrupt: 

In [1]:
from __future__ import annotations
import os, gc, ast, random, inspect
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Plotly (interactive)
import plotly.graph_objects as go

# =============================== CONFIG ===============================
CSV_PATH      = "en_ewt-ud-train_sentences.csv"  # needs: sentence_id, tokens (list[str]), head_dist (list[int])
BASELINE      = "gpt2"                           # model id handed to _load_tok_and_model
WORD_REP_MODE = "last"                           # BERT: {"first","last","mean"}; GPT-2: {"last","mean"}

HEAD_DIST_COL       = "head_dist"  # list[int], token-aligned per sentence
HEAD_DIST_CLAMP     = 6            # bucket distances to [-6,6]
INCLUDE_ZERO_CLASS  = True         # include the "0" class or not

# Plot subsampling per class (for browser smoothness)
PCA_MAX_PER_CLASS   = None   # None keeps every token; 2k-5k per class keeps things fast

# Output
OUT_DIR = Path("pca3d_head_dist")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Throughput / device
BATCH_SIZE = 2
RAND_SEED  = 42
os.environ["TOKENIZERS_PARALLELISM"] = "true"
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)
torch.manual_seed(RAND_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.backends.cudnn.benchmark = True

# =============================== HELPERS ===============================
def _to_list(x):
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x

def _num_hidden_layers(model) -> int:
    n = getattr(model.config, "num_hidden_layers", None)
    if n is None: n = getattr(model.config, "n_layer", None)  # GPT-2
    if n is None: raise ValueError("Cannot determine num_hidden_layers")
    return int(n)

def _hidden_size(model) -> int:
    d = getattr(model.config, "hidden_size", None)
    if d is None: d = getattr(model.config, "n_embd", None)  # GPT-2
    if d is None: raise ValueError("Cannot determine hidden size")
    return int(d)

def _require_csv(path_like) -> Path:
    """Fail early with a clear message if CSV_PATH is None or missing."""
    if path_like is None:
        raise TypeError("CSV_PATH is None — please set CSV_PATH to your dataset CSV filename.")
    p = Path(path_like)
    if not p.exists():
        raise FileNotFoundError(f"CSV_PATH does not exist: {p.resolve()}")
    return p

def load_head_dist_from_column(csv_path: str,
                               head_dist_col: str = HEAD_DIST_COL,
                               clamp: int = HEAD_DIST_CLAMP,
                               include_zero: bool = INCLUDE_ZERO_CLASS):
    """
    Read sentence-level rows ('tokens' and the head-distance column, both
    stringified lists) and expand them to one row per token.

    Returns (df_sent, df_tok):
      - df_sent: unique (sentence_id, tokens) rows
      - df_tok:  columns sentence_id, word_id, head_dist_class, word, where
                 head_dist_class is the distance limited to [-clamp, clamp]
                 and stored as a string.

    Entries that cannot be converted to int are skipped, as are zero
    distances unless `include_zero` is set; when the two lists differ in
    length only the common prefix is used.

    Raises TypeError / FileNotFoundError from `_require_csv` for an unusable
    path and ValueError when no token row could be built.
    """
    csv_file = _require_csv(csv_path)
    wanted = ["sentence_id", "tokens", head_dist_col]
    df = pd.read_csv(csv_file, usecols=wanted, dtype={"sentence_id": str})
    df.tokens = df.tokens.apply(_to_list)
    df[head_dist_col] = df[head_dist_col].apply(_to_list)

    records = []
    for sid, toks, dists in df[["sentence_id", "tokens", head_dist_col]].itertuples(index=False):
        for wid, (tok, raw) in enumerate(zip(toks, dists)):
            try:
                dist = int(raw)
            except Exception:
                continue
            if dist == 0 and not include_zero:
                continue
            dist = min(max(dist, -clamp), clamp)
            records.append((sid, wid, str(dist), tok))

    df_tok = pd.DataFrame(records, columns=["sentence_id", "word_id", "head_dist_class", "word"])
    df_sent = df[["sentence_id", "tokens"]].drop_duplicates("sentence_id")
    if df_tok.empty:
        raise ValueError("No token rows constructed—check your head_dist column.")
    return df_sent, df_tok

def sample_per_class(df_tok: pd.DataFrame, per_class_cap: int | None) -> pd.DataFrame:
    """Optionally subsample each `head_dist_class` for plotting.

    With `per_class_cap=None` every row is kept and only the index is reset.
    Otherwise at most `per_class_cap` rows per class are drawn without
    replacement (random_state=RAND_SEED) and concatenated with a fresh index.
    """
    if per_class_cap is None:
        return df_tok.reset_index(drop=True)
    sampled = [
        group.sample(min(len(group), per_class_cap), random_state=RAND_SEED, replace=False)
        for _, group in df_tok.groupby("head_dist_class", sort=False)
    ]
    return pd.concat(sampled, ignore_index=True)

def _load_tok_and_model(model_id: str):
    """
    Load a fast tokenizer and model for `model_id`, trying fallback ids for GPT-2.

    The tokenizer is forced to right-padding and, when it has no PAD token but
    does have an EOS token, EOS is reused as PAD (also mirrored into
    `model.config.pad_token_id` if unset). The model is put in eval mode,
    moved to `device`, and converted to half precision on CUDA.

    Returns (tokenizer, model, resolved_model_id: str), where the resolved id
    is the candidate that actually loaded.
    Raises RuntimeError listing every attempt if no candidate loads; the last
    underlying exception is chained as its cause.
    """
    tried = []
    candidates = [model_id]
    # Robust GPT-2 fallback aliases
    if model_id.lower() in {"gpt2", "gpt-2"}:
        candidates += ["openai-community/gpt2", "gpt2"]
    # Drop repeated ids (order preserved) so a failing id is not loaded and
    # reported twice.
    candidates = list(dict.fromkeys(candidates))

    last_error = None
    for mid in candidates:
        try:
            tok = AutoTokenizer.from_pretrained(mid, use_fast=True, add_prefix_space=True)
            if getattr(tok, "padding_side", None) != "right":
                tok.padding_side = "right"
            if tok.pad_token is None and getattr(tok, "eos_token", None) is not None:
                tok.pad_token = tok.eos_token

            mdl = AutoModel.from_pretrained(mid, output_hidden_states=True)
            if getattr(mdl.config, "pad_token_id", None) is None and tok.pad_token_id is not None:
                mdl.config.pad_token_id = tok.pad_token_id

            mdl = mdl.eval().to(device)
            if device == "cuda":
                mdl.half()
            return tok, mdl, mid  # <- resolved id (string) for filenames
        except Exception as e:
            # Remember the failure and move on to the next candidate id.
            tried.append((mid, repr(e)))
            last_error = e
            continue
    raise RuntimeError(
        "Could not load tokenizer/model. Attempts:\n" +
        "\n".join(f" - {m}: {err}" for m, err in tried)
    ) from last_error

def embed_subset(df_sent: pd.DataFrame,
                 subset_df: pd.DataFrame,
                 baseline: str = BASELINE,
                 word_rep_mode: str = WORD_REP_MODE,
                 batch_size: int = BATCH_SIZE) -> Tuple[np.ndarray, np.ndarray, str]:
    """
    Embed the words listed in `subset_df` with `baseline`, at every layer.

    Parameters
      df_sent:       one row per sentence with columns sentence_id, tokens.
      subset_df:     one row per word with columns sentence_id, word_id.
      baseline:      model id handed to `_load_tok_and_model`.
      word_rep_mode: which sub-token(s) represent a word: "first", "last" or "mean".
      batch_size:    sentences per forward pass.

    Return (reps (L,N,D), filled mask (N,), resolved_model_id:str).
    `reps` is float16 with layer 0 being the embedding output; rows for words
    without a vector (e.g. cut off by truncation) stay zero and are False in
    `filled`.

    Raises ValueError for an unknown `word_rep_mode` (checked before the model
    is loaded) and RuntimeError if the tokenizer cannot report word ids.
    The input frames are not modified.
    """
    if word_rep_mode not in ("first", "last", "mean"):
        raise ValueError("WORD_REP_MODE must be in {'first','last','mean'}.")

    # Use string-typed views of the ids instead of writing the cast back into
    # the caller's frames.
    sent_ids   = df_sent["sentence_id"].astype(str)
    subset_ids = subset_df["sentence_id"].astype(str)

    # sid -> list[(global_idx, word_id)]
    by_sid: Dict[str, List[Tuple[int, int]]] = {}
    for gidx, (sid, wid) in enumerate(zip(subset_ids, subset_df["word_id"])):
        by_sid.setdefault(sid, []).append((gidx, int(wid)))

    sids = list(by_sid.keys())
    sent_view = df_sent.assign(sentence_id=sent_ids)
    df_sel = (sent_view[sent_view.sentence_id.isin(sids)]
              .drop_duplicates("sentence_id")
              .set_index("sentence_id")
              .loc[sids])

    # The loader already builds the tokenizer with add_prefix_space=True, so
    # nothing further is passed at call time.
    tokzr, model, resolved_id = _load_tok_and_model(baseline)
    enc_kwargs = dict(is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)

    L = _num_hidden_layers(model) + 1   # include embeddings
    D = _hidden_size(model)
    N = len(subset_df)

    reps   = np.zeros((L, N, D), np.float16)
    filled = np.zeros(N, dtype=bool)

    try:
        with torch.no_grad(), torch.cuda.amp.autocast(device == "cuda"):
            for start in tqdm(range(0, len(sids), batch_size), desc=f"{resolved_id} (embed subset)"):
                batch_ids    = sids[start : start + batch_size]
                batch_tokens = df_sel.loc[batch_ids, "tokens"].tolist()

                enc_be = tokzr(batch_tokens, **enc_kwargs)
                enc_t  = {k: v.to(device) for k, v in enc_be.items()}
                out = model(**enc_t)
                h = torch.stack(out.hidden_states).detach().cpu().numpy().astype(np.float32)  # (L,B,T,D)

                for b, sid in enumerate(batch_ids):
                    # word index -> positions of its sub-tokens in the encoded sequence
                    mp = {}
                    wids = enc_be.word_ids(b)
                    if wids is None:
                        raise RuntimeError("Fast tokenizer required (word_ids unavailable).")
                    for tidx, wid in enumerate(wids):
                        if wid is not None:
                            mp.setdefault(int(wid), []).append(int(tidx))

                    for gidx, wid in by_sid.get(sid, []):
                        toks = mp.get(wid)
                        if not toks:
                            continue
                        if word_rep_mode == "first":
                            vec = h[:, b, toks[0], :]
                        elif word_rep_mode == "last":
                            vec = h[:, b, toks[-1], :]
                        else:  # "mean" (validated above)
                            vec = h[:, b, toks, :].mean(axis=1)
                        reps[:, gidx, :] = vec.astype(np.float16, copy=False)
                        filled[gidx] = True

                del enc_be, enc_t, out, h
                if device == "cuda":
                    torch.cuda.empty_cache()
    finally:
        # Release the model even when the loop is interrupted.
        del model
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()

    return reps, filled, resolved_id

# =============================== PCA 3D PER LAYER (Plotly) ===============================
def _pca3d_layer(X: np.ndarray, n_components: int = 3) -> np.ndarray:
    """Lightweight PCA to 3D without external deps. Returns Y in R^{n x 3}."""
    X = X.astype(np.float32, copy=False)
    Xc = X - X.mean(0, keepdims=True)
    # economy SVD: Xc = U S V^T → take first 3 comps
    U, S, _ = np.linalg.svd(Xc, full_matrices=False)
    return (U[:, :n_components] * S[:n_components]).astype(np.float32, copy=False)

def distinct_hsl_palette(classes: List[str]) -> Dict[str, str]:
    """
    Map every class label to a CSS 'hsl(h, 65%, 50%)' string.

    Labels are ordered by their integer value and hues are spread evenly
    around the 360-degree colour wheel in that order.
    """
    ranked = list(classes)
    ranked.sort(key=int)
    slots = max(len(ranked), 1)  # guard the division for an empty input
    return {
        label: f"hsl({int(round(360.0 * pos / slots)) % 360}, 65%, 50%)"
        for pos, label in enumerate(ranked)
    }

def pca3d_by_head_dist_and_plot(reps: np.ndarray,
                                words: List[str],
                                classes_arr: np.ndarray,
                                all_classes: List[str],
                                model_tag: str,
                                html_out: Path):
    """
    Build an interactive per-layer 3D PCA scatter coloured by head-distance class.

    Args:
        reps: array of shape (L, N, D) — one (N, D) matrix per layer.
        words: N hover labels, aligned with axis 1 of `reps`.
        classes_arr: N class labels (str), aligned with axis 1 of `reps`.
        all_classes: class labels to draw; one trace per (layer, class).
        model_tag: text used in the figure title.
        html_out: destination of the standalone HTML file.

    Raises:
        TypeError: if `html_out` is None.
        ValueError: if `words` or `classes_arr` do not have N entries.

    One Scatter3d trace is created per class per layer and a slider toggles
    which layer's traces are visible. The HTML file is written before the
    figure is displayed, so an interrupted render does not lose the output.
    """
    if html_out is None:
        raise TypeError("html_out is None — provide a valid Path for the HTML file.")

    L, N, D = reps.shape
    classes_arr = np.asarray(classes_arr)
    # Labels index into the PCA output by position; a length mismatch would
    # silently attach the wrong class/word to each point.
    if len(words) != N or classes_arr.shape[0] != N:
        raise ValueError(
            f"reps has {N} tokens but got {len(words)} words and "
            f"{classes_arr.shape[0]} class labels; filter them with the same mask."
        )
    print(f"PCA plotting on {N:,} tokens across {L} layers...")

    # PCA→3D per layer on ALL selected points (keeps axes consistent within the layer)
    Y_layers: List[np.ndarray] = [_pca3d_layer(reps[l]) for l in range(L)]  # each (N,3)

    # Distinct colors for each class
    cmap = distinct_hsl_palette(all_classes)

    # Row indices and hover text depend only on the class, not on the layer:
    # compute them once instead of once per layer.
    per_class = []
    for c in all_classes:
        idx = np.flatnonzero(classes_arr == c)
        hover = [f"{words[i]} | hd={c}" for i in idx]
        per_class.append((c, idx, hover))

    # Build traces: (layer, class) pairs
    traces = []
    for l in range(L):
        Y = Y_layers[l]
        for c, idx, hover in per_class:
            traces.append(
                go.Scatter3d(
                    x=Y[idx, 0], y=Y[idx, 1], z=Y[idx, 2],  # empty arrays for absent classes
                    mode="markers",
                    marker=dict(size=2, opacity=0.75, color=cmap[c]),
                    name=str(c),
                    hovertext=hover,
                    hovertemplate="<b>%{hovertext}</b><br>"
                                  "x=%{x:.3f}<br>y=%{y:.3f}<br>z=%{z:.3f}"
                                  "<extra></extra>",
                    visible=(l == 0),
                    # Keep legend entries enabled on every layer; traces with
                    # visible=False are omitted from the legend, so only the
                    # active layer's classes are listed after a slider move.
                    showlegend=True,
                )
            )

    # Slider steps (one per layer)
    traces_per_layer = len(all_classes)
    steps = []
    for l in range(L):
        vis = [False] * (L * traces_per_layer)
        for k in range(l * traces_per_layer, (l + 1) * traces_per_layer):
            vis[k] = True
        steps.append(dict(
            method="update",
            args=[{"visible": vis},
                  {"title": f"{model_tag} • PCA 3D by head_dist • Layer {l} (drag to rotate)"}],
            label=str(l),
        ))

    sliders = [dict(
        active=0,
        steps=steps,
        currentvalue={"prefix": "Layer: "},
        pad={"t": 10}
    )]

    layout = go.Layout(
        title=f"{model_tag} • PCA 3D by head_dist • Layer 0 (drag to rotate)",
        scene=dict(xaxis_title="PC1", yaxis_title="PC2", zaxis_title="PC3", aspectmode="data"),
        margin=dict(l=0, r=0, b=0, t=40),
        sliders=sliders,
        showlegend=True,
        legend=dict(title="head_dist class")
    )

    fig = go.Figure(data=traces, layout=layout)
    # Persist first: rendering a large figure inline can be slow or interrupted.
    Path(html_out).parent.mkdir(parents=True, exist_ok=True)
    # Convert Path → str explicitly to avoid any pathlike/None confusion
    fig.write_html(str(html_out), include_plotlyjs="cdn")
    print("✓ Saved interactive HTML to:", html_out)
    fig.show()

# =============================== DRIVER ===============================
def run_pca3d_head_dist():
    """Load head-distance tokens, embed them once, and write the per-layer 3D PCA plot.

    Reads its settings from the notebook-level configuration constants and
    writes one HTML file under OUT_DIR named after the resolved model id.
    """
    # 1) Load token lists + head_dist classes
    df_sent, hd_df = load_head_dist_from_column(
        CSV_PATH, head_dist_col=HEAD_DIST_COL,
        clamp=HEAD_DIST_CLAMP, include_zero=INCLUDE_ZERO_CLASS
    )
    # 2) (Optional) subsample per class for plotting
    raw_df = sample_per_class(hd_df, PCA_MAX_PER_CLASS)
    classes = sorted(raw_df.head_dist_class.unique(), key=lambda s: int(s))
    print(f"✓ plotting subset — {len(raw_df):,} tokens across head-dist classes {classes}")

    # 3) Embed once
    reps, filled, resolved_id = embed_subset(df_sent, raw_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)
    raw_df = raw_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    # Drop the same rows from the embeddings so that position i in `reps`
    # still corresponds to row i of `raw_df`. Skipped when nothing is missing
    # to avoid copying the whole tensor.
    if not filled.all():
        reps = reps[:, filled]

    # Prepare labels and words
    cls_arr = raw_df.head_dist_class.values.astype(str)
    words   = raw_df.word.astype(str).tolist()

    # 4) PCA→3D per layer + Plotly (HTML path built AFTER model id is resolved)
    html_out = OUT_DIR / f"{resolved_id.replace('/', '_')}_head_dist_pca3d_layers.html"
    pca3d_by_head_dist_and_plot(
        reps,  # _pca3d_layer casts each layer to float32, so no full-tensor copy is needed here
        words,
        cls_arr,
        classes,
        model_tag=resolved_id,
        html_out=html_out
    )

    # Cleanup
    del reps; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()

# Entry point: run the 3D-PCA driver when this code executes as the main module.
if __name__ == "__main__":
    run_pca3d_head_dist()


✓ plotting subset — 194,916 tokens across head-dist classes ['-6', '-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4', '5', '6']


2025-12-07 21:46:51.438609: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  with torch.no_grad(), torch.cuda.amp.autocast(device == "cuda"):
openai-community/gpt2 (embed subset): 100%|█| 5034/5034 [01:03<00:00, 78.89it/s]


PCA plotting on 194,916 tokens across 13 layers...


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f6ccf186380>>
Traceback (most recent call last):
  File "/home/ldomenichelli/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [2]:
from __future__ import annotations
import os, gc, ast, random, inspect
from pathlib import Path
from typing import Dict, List, Tuple, Callable

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel, GPT2TokenizerFast

# =============================== CONFIG ===============================
# --- Input data and model ---
CSV_PATH = "en_ewt-ud-train_sentences.csv"   # <--- set to your actual file path
HEAD_DIST_COL = "head_dist"                  # list[int] column with signed head distances
BASELINE = "gpt2"                            # "gpt2" or "openai-community/gpt2" or any HF id
WORD_REP_MODE = "last"                       # GPT-2: {"last","mean"}; BERT: {"first","last","mean"}

# Remove the **first** token in every sentence (0-based index == 0)
EXCLUDE_FIRST_TOKEN = True

# --- Sampling / bootstrap ---
RAW_MAX_PER_CLASS = 10 ** 12             # no cap for fast metrics
N_BOOTSTRAP_FAST = 50
N_BOOTSTRAP_HEAVY = 20
FAST_BS_MAX_SAMP_PER_CLASS = 10 ** 12    # M = N (classic bootstrap)
HEAVY_BS_MAX_SAMP_PER_CLASS = 5000       # practical for TwoNN/GRIDE/skdim
MIN_N_FOR_HEAVY_WARN = 1000

# --- Head-distance classes ---
HEAD_DIST_CLAMP = 6           # clamp to [-6, 6]
INCLUDE_ZERO_CLASS = False    # include "0" class or not

# --- Output locations (created on the spot) ---
RAND_SEED = 42
PLOT_DIR = Path("results_HEADDIST_no_index")
PLOT_DIR.mkdir(exist_ok=True, parents=True)
CSV_DIR = Path("tables_HEADDIST_no_index") / "headdist_bootstrap"
CSV_DIR.mkdir(exist_ok=True, parents=True)
BATCH_SIZE = 1

# --- Reproducibility & device ---
os.environ["TOKENIZERS_PARALLELISM"] = "true"
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)
torch.manual_seed(RAND_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.backends.cudnn.benchmark = True

# --- Plot styling and numeric floor ---
sns.set_style("darkgrid")
plt.rcParams["figure.dpi"] = 120
EPS = 1e-12

# =============================== HELPERS ===============================
def _to_list(x):
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x

def _center(X: np.ndarray) -> np.ndarray:
    return X - X.mean(0, keepdims=True)

def _eigvals_from_X(X: np.ndarray) -> np.ndarray:
    """
    Squared singular values of the column-centered X, largest first (float64).

    These equal the covariance eigenvalues up to a constant factor.
    If the decomposition fails, an empty float64 array is returned.
    """
    centered = _center(X.astype(np.float32, copy=False))
    try:
        singular_values = np.linalg.svd(centered, full_matrices=False)[1]
        spectrum = np.sort((singular_values ** 2).astype(np.float64))
        return spectrum[::-1]
    except Exception:
        return np.array([], dtype=np.float64)

def _jitter_unique(X: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Add tiny noise if there are duplicate rows (helps NN-based estimators)."""
    try:
        if np.unique(X, axis=0).shape[0] < X.shape[0]:
            X = X + np.random.normal(scale=eps, size=X.shape).astype(X.dtype)
    except Exception:
        pass
    return X

def _num_hidden_layers(model) -> int:
    n = getattr(model.config, "num_hidden_layers", None)
    if n is None: n = getattr(model.config, "n_layer", None)  # GPT-2
    if n is None: raise ValueError("Cannot determine number of hidden layers from model.config")
    return int(n)

def _hidden_size(model) -> int:
    d = getattr(model.config, "hidden_size", None)
    if d is None: d = getattr(model.config, "n_embd", None)  # GPT-2
    if d is None: raise ValueError("Cannot determine hidden size from model.config")
    return int(d)

# =============================== ROBUST LOADER (fixes NoneType path errors) ===============================
def _load_tok_and_model(baseline: str):
    """
    Load a tokenizer and a model configured to return hidden states.

    Candidate ids are tried in order; the first one that loads is used:
      - the user-provided `baseline`
      - only when `baseline` is the base GPT-2 id ("gpt2" or
        "openai-community/gpt2", which this notebook treats as
        interchangeable): the other spelling of that id

    Other ids that merely contain "gpt2" (e.g. a larger GPT-2 variant) get
    no fallback, so a failed load can never silently substitute a different
    checkpoint for the requested one.

    For ids containing "gpt2" the tokenizer is GPT2TokenizerFast with
    add_prefix_space=True; otherwise AutoTokenizer with use_fast=True.

    We also set:
      - right padding
      - pad_token = eos_token when missing (decoder-only models)
      - model.config.pad_token_id from the tokenizer when missing

    Returns:
        (tokenizer, model, model_id) — model_id is the candidate that loaded.

    Raises:
        RuntimeError: if every candidate fails; chained to the last error.
    """
    base_gpt2_ids = ["openai-community/gpt2", "gpt2"]
    cands: List[str] = [baseline]
    if baseline.lower() in base_gpt2_ids:
        cands.extend(mid for mid in base_gpt2_ids if mid != baseline)

    last_err = None
    for mid in cands:
        try:
            if "gpt2" in mid.lower():
                tokzr = GPT2TokenizerFast.from_pretrained(mid, add_prefix_space=True)
            else:
                tokzr = AutoTokenizer.from_pretrained(mid, use_fast=True)
            # right padding
            if getattr(tokzr, "padding_side", None) != "right":
                tokzr.padding_side = "right"
            # pad token for GPT-like models
            if tokzr.pad_token is None and getattr(tokzr, "eos_token", None) is not None:
                tokzr.pad_token = tokzr.eos_token

            model = AutoModel.from_pretrained(mid, output_hidden_states=True)
            if getattr(model.config, "pad_token_id", None) is None and tokzr.pad_token_id is not None:
                model.config.pad_token_id = tokzr.pad_token_id

            return tokzr, model, mid
        except Exception as e:
            # Remember the failure and move on to the next candidate id.
            last_err = e
            continue
    raise RuntimeError(
        f"Failed to load tokenizer/model for '{baseline}'. Tried {cands}. Last error: {last_err}"
    ) from last_err

# =============================== DATA: use existing head_dist column ===============================
def load_head_dist_from_column(csv_path: str,
                               head_dist_col: str = HEAD_DIST_COL,
                               clamp: int = HEAD_DIST_CLAMP,
                               include_zero: bool = INCLUDE_ZERO_CLASS):
    """
    Read the sentence-level CSV and expand it to one row per kept token.

    Columns read from the CSV:
      - sentence_id (read as str)
      - tokens      (stringified list, one row per sentence)
      - `head_dist_col` (stringified list of signed distances per token)

    Returns:
      (df_sent, df_tok)
      - df_sent: sentence_id + tokens, de-duplicated on sentence_id
      - df_tok : columns sentence_id, word_id, head_dist_class, word, where
                 head_dist_class is the distance clamped to [-clamp, clamp]
                 and stored as a string

    Tokens are skipped when their distance cannot be converted to int, when
    the distance is 0 and `include_zero` is False, or when they are the first
    token of a sentence and EXCLUDE_FIRST_TOKEN is True.

    Raises:
      ValueError: if no token row survives the filters.
    """
    wanted_cols = ["sentence_id", "tokens", head_dist_col]
    df = pd.read_csv(csv_path, usecols=wanted_cols, dtype={"sentence_id": str})
    df["tokens"] = df["tokens"].apply(_to_list)
    df[head_dist_col] = df[head_dist_col].apply(_to_list)

    # Starting the scan at 1 is how the first token (index 0) gets dropped.
    first_wid = 1 if EXCLUDE_FIRST_TOKEN else 0

    records = []
    for sid, toks, dists in df[wanted_cols].itertuples(index=False):
        n_words = min(len(toks), len(dists))  # tolerate length mismatches
        for wid in range(first_wid, n_words):
            try:
                dist = int(dists[wid])
            except Exception:
                continue
            if not include_zero and dist == 0:
                continue
            dist = -clamp if dist < -clamp else dist
            dist = clamp if dist > clamp else dist
            records.append((sid, wid, str(dist), toks[wid]))

    df_tok = pd.DataFrame(records, columns=["sentence_id", "word_id", "head_dist_class", "word"])

    # Belt-and-braces: re-apply the first-token filter on the finished table.
    if EXCLUDE_FIRST_TOKEN and not df_tok.empty:
        keep = df_tok.word_id != 0
        df_tok = df_tok[keep].reset_index(drop=True)

    df_sent = df[["sentence_id", "tokens"]].drop_duplicates("sentence_id")
    if df_tok.empty:
        raise ValueError("No token rows constructed—recheck your CSV columns and EXCLUDE_FIRST_TOKEN setting.")
    return df_sent, df_tok

def sample_raw(df_tok: pd.DataFrame, per_class_cap: int = RAW_MAX_PER_CLASS) -> pd.DataFrame:
    """Draw up to `per_class_cap` rows (without replacement, seeded by RAND_SEED) from each head_dist_class."""
    picks = [
        group.sample(min(len(group), per_class_cap), random_state=RAND_SEED, replace=False)
        for _, group in df_tok.groupby("head_dist_class", sort=False)
    ]
    return pd.concat(picks, ignore_index=True)

def make_dist_palette(classes: List[str]) -> Dict[str, Tuple[float, float, float]]:
    """Assign each class a colour from seaborn's "coolwarm" palette, ordered by integer class value."""
    ordered = sorted(int(c) for c in classes)
    colours = sns.color_palette("coolwarm", len(ordered))
    return {str(value): colour for value, colour in zip(ordered, colours)}

# =============================== EMBEDDING (GPT‑2 & others) ===============================
def embed_subset(df_sent: pd.DataFrame,
                 subset_df: pd.DataFrame,
                 baseline: str = BASELINE,
                 word_rep_mode: str = WORD_REP_MODE,
                 batch_size: int = BATCH_SIZE) -> Tuple[np.ndarray, np.ndarray]:
    """
    Embed the tokens listed in `subset_df` and collect one vector per layer.

    Args:
        df_sent: sentence table with columns `sentence_id` and `tokens`.
        subset_df: token table with columns `sentence_id` and `word_id`;
            row position i becomes index i on axis 1 of the result.
        baseline: model id handed to `_load_tok_and_model`.
        word_rep_mode: how sub-word pieces of one word are combined —
            "first" piece, "last" piece, or the "mean" over pieces.
        batch_size: number of sentences per forward pass.

    Returns:
        (reps, filled)
        - reps: float16 array of shape (L, N, D) with L = hidden layers + 1
          (embedding output included), N = len(subset_df), D = hidden size.
        - filled: bool array of length N; False where no sub-word piece was
          found for the requested word (that row of `reps` stays zero).

    Raises:
        ValueError: if `word_rep_mode` is not one of the three modes
            (checked before the model is loaded).
        RuntimeError: if the tokenizer cannot report word ids.

    Neither input frame is modified.
    """
    if word_rep_mode not in ("first", "last", "mean"):
        raise ValueError("WORD_REP_MODE must be one of {'first','last','mean'} (for GPT‑2 use 'last' or 'mean').")

    # Compare ids as strings, on copies, so the caller's frames keep their dtypes.
    sent_tbl = df_sent.assign(sentence_id=df_sent["sentence_id"].astype(str))
    subset_sids = subset_df["sentence_id"].astype(str)

    # sid -> list[(global_idx, word_id)]
    by_sid: Dict[str, List[Tuple[int, int]]] = {}
    for gidx, (sid, wid) in enumerate(zip(subset_sids, subset_df["word_id"])):
        by_sid.setdefault(sid, []).append((gidx, int(wid)))

    sids = list(by_sid.keys())
    df_sel = (sent_tbl[sent_tbl.sentence_id.isin(sids)]
              .drop_duplicates("sentence_id")
              .set_index("sentence_id")
              .loc[sids])

    tokzr, model, model_id = _load_tok_and_model(baseline)
    model = model.eval().to(device)
    if device == "cuda":
        model.half()

    enc_kwargs = dict(is_split_into_words=True, return_tensors="pt", padding=True)
    # Use add_prefix_space when supported (GPT‑2-friendly)
    if "add_prefix_space" in inspect.signature(tokzr.__call__).parameters:
        enc_kwargs["add_prefix_space"] = True

    L = _num_hidden_layers(model) + 1   # include embedding layer
    D = _hidden_size(model)
    N = len(subset_df)

    reps   = np.zeros((L, N, D), np.float16)
    filled = np.zeros(N, dtype=bool)

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast;
    # it is a no-op here when running on CPU (enabled=False).
    with torch.no_grad(), torch.autocast(device_type=device, enabled=(device == "cuda")):
        for start in tqdm(range(0, len(sids), batch_size), desc=f"{model_id} (embed subset)"):
            batch_ids    = sids[start : start + batch_size]
            batch_tokens = df_sel.loc[batch_ids, "tokens"].tolist()

            enc_be = tokzr(batch_tokens, **enc_kwargs)
            enc_t  = {k: v.to(device) for k, v in enc_be.items()}
            out = model(**enc_t)
            h = torch.stack(out.hidden_states).detach().cpu().numpy().astype(np.float32)  # (L,B,T,D)

            for b, sid in enumerate(batch_ids):
                # word_id -> token positions for this item
                mp: Dict[int, List[int]] = {}
                wids = enc_be.word_ids(b)  # requires a *fast* tokenizer
                if wids is None:
                    raise RuntimeError("Fast tokenizer required (word_ids unavailable).")
                for tidx, wid in enumerate(wids):
                    if wid is not None:
                        mp.setdefault(int(wid), []).append(int(tidx))

                for gidx, wid in by_sid.get(sid, []):
                    toks = mp.get(wid)
                    if not toks:
                        continue
                    if word_rep_mode == "first":
                        vec = h[:, b, toks[0], :]
                    elif word_rep_mode == "last":
                        vec = h[:, b, toks[-1], :]
                    else:  # "mean" — the mode was validated on entry
                        vec = h[:, b, toks, :].mean(axis=1)
                    reps[:, gidx, :] = vec.astype(np.float16, copy=False)
                    filled[gidx] = True

            del enc_be, enc_t, out, h
            if device == "cuda": torch.cuda.empty_cache()

    missing = int((~filled).sum())
    if missing:
        print(f"⚠ Missing vectors for {missing} of {N} tokens")
    del model; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    return reps, filled

# =============================== BOOTSTRAP CORE ===============================
def _bs_layer_loop(rep_sub: np.ndarray, M: int, n_reps: int, compute_once: Callable[[np.ndarray], float]):
    """
    Bootstrap a scalar metric independently for every layer.

    Args:
        rep_sub: array of shape (L, N, D).
        M: number of rows drawn (with replacement) per replicate.
        n_reps: number of bootstrap replicates.
        compute_once: maps an (M, D) float32 matrix to one number.

    Returns:
        (mean, low, high) — three float32 arrays of length L holding the
        NaN-ignoring mean and the 2.5th / 97.5th percentiles over replicates.
        A replicate whose metric raises contributes NaN.

    The same row indices are reused for all layers within a replicate, and
    the generator is seeded with RAND_SEED on every call.
    """
    n_layers, n_points, _ = rep_sub.shape
    rng = np.random.default_rng(RAND_SEED)
    draws = np.full((n_reps, n_layers), np.nan, np.float32)
    for rep in range(n_reps):
        picked = rng.integers(0, n_points, size=M)
        for layer in range(n_layers):
            sample = rep_sub[layer, picked].astype(np.float32, copy=False)
            try:
                value = float(compute_once(sample))
            except Exception:
                value = np.nan
            draws[rep, layer] = value
    center = np.nanmean(draws, axis=0).astype(np.float32)
    lower, upper = (
        np.nanpercentile(draws, q, axis=0).astype(np.float32) for q in (2.5, 97.5)
    )
    return center, lower, upper

# =============================== METRICS ===============================
# --- Isotropy (fast) ---
def _iso_once(X: np.ndarray) -> float:
    """
    IsoScore of the point cloud X (one sample per row), with a spectral fallback.

    The package is imported under the name this notebook's import cell uses
    (`IsoScore`); the lower-case spelling is tried second for environments
    that expose it that way. If neither import works, or the IsoScore call
    itself raises, a monotone proxy is returned instead:
    mean eigenvalue / largest eigenvalue, clipped to [0, 1].

    Returns NaN when the proxy is needed but no eigenvalues are available.
    """
    try:
        from IsoScore import IsoScore
    except ImportError:
        try:
            from isoscore import IsoScore
        except ImportError:
            IsoScore = None

    if IsoScore is not None:
        try:
            # NOTE(review): IsoScore's documented example passes an array of
            # shape (dims, n_points), hence the transpose — confirm against
            # the installed package version.
            return float(IsoScore.IsoScore(X.T))
        except Exception:
            pass  # deliberate best-effort: fall through to the proxy

    lam = _eigvals_from_X(X)
    if lam.size == 0: return np.nan
    return float(np.clip(lam.mean() / (lam.max() + 1e-9), 0.0, 1.0))

def _sf_once(X: np.ndarray) -> float:
    """Spectral flatness: geometric mean over arithmetic mean of the eigenvalue spectrum.

    Higher means a flatter spectrum (more isotropic). NaN if no eigenvalues are available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    log_mean = np.mean(np.log(spectrum + EPS))
    geometric = np.exp(log_mean)
    arithmetic = float(spectrum.mean() + EPS)
    return float(geometric / arithmetic)

def _vmf_kappa_once(X: np.ndarray) -> float:
    if X.shape[0] < 2: return np.nan
    Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-9)
    R = np.linalg.norm(Xn.mean(axis=0))
    d = Xn.shape[1]
    if R < 1e-9: return 0.0
    return float(max(R * (d - R**2) / (1.0 - R**2 + 1e-9), 0.0))  # higher => more anisotropic

def _spect_once(X: np.ndarray) -> float:
    ev = np.linalg.eigvalsh(np.cov(X.T, ddof=0))
    if ev.size == 0: return np.nan
    return float(ev[-1] / (ev.mean() + 1e-9))

def _rand_once(X: np.ndarray, K: int = 2000) -> float:
    """
    Mean absolute cosine similarity over randomly drawn pairs of distinct rows.

    Args:
        X: matrix with one vector per row.
        K: maximum number of pairs; capped at n*(n-1)/2.

    Returns:
        The mean |cos| over the sampled pairs, or NaN for fewer than two rows.

    Pairs are drawn as (i, (i + offset) mod n) with offset in [1, n-1], so a
    row is never paired with itself (a self-pair has cosine 1 and would bias
    the mean upward). The generator is seeded with RAND_SEED on every call.
    """
    n = X.shape[0]
    if n < 2: return np.nan
    rng = np.random.default_rng(RAND_SEED)
    K_eff = min(K, (n*(n-1))//2)
    i = rng.integers(0, n, size=K_eff)
    offset = rng.integers(1, n, size=K_eff)  # upper bound is exclusive
    j = (i + offset) % n
    A, B = X[i], X[j]
    num = np.einsum("ij,ij->i", A, B)
    den = (np.linalg.norm(A, axis=1)*np.linalg.norm(B, axis=1) + 1e-9)
    return float(np.mean(np.abs(num/den)))

# --- Linear ID (fast) ---
def _pcaXX_once(X: np.ndarray, var_ratio: float) -> float:
    """Position (index + 1) at which the cumulative eigenvalue sum first reaches `var_ratio` of the total.

    NaN if no eigenvalues are available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    cumulative = np.cumsum(spectrum)
    target = cumulative[-1] * var_ratio
    n_components = np.searchsorted(cumulative, target) + 1
    return float(n_components)

def _pca95_once(X: np.ndarray) -> float:
    """`_pcaXX_once` at a 0.95 variance ratio."""
    return _pcaXX_once(X, 0.95)


def _pca99_once(X: np.ndarray) -> float:
    """`_pcaXX_once` at a 0.99 variance ratio."""
    return _pcaXX_once(X, 0.99)

def _erank_once(X: np.ndarray) -> float:
    """Effective rank: exp of the Shannon entropy of the normalised eigenvalue spectrum.

    NaN if no eigenvalues are available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    weights = spectrum / (spectrum.sum() + EPS)
    entropy = -(weights * np.log(weights + EPS)).sum()
    return float(np.exp(entropy))

def _pr_once(X: np.ndarray) -> float:
    """Participation ratio: (sum of eigenvalues)^2 / sum of squared eigenvalues.

    NaN if no eigenvalues are available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    total = spectrum.sum()
    total_of_squares = (spectrum**2).sum()
    return float((total**2) / (total_of_squares + EPS))

def _stable_rank_once(X: np.ndarray) -> float:
    """Stable rank: sum of eigenvalues divided by the largest eigenvalue.

    NaN if no eigenvalues are available.
    """
    spectrum = _eigvals_from_X(X)
    if spectrum.size == 0:
        return np.nan
    total, largest = spectrum.sum(), spectrum.max()
    return float(total / (largest + EPS))

# --- Non-linear (heavy) ---
# Optional dependency probe: HAS_DADAPY records whether `dadapy` could be imported.
try:
    from dadapy import Data  # DADApy ID estimators (TwoNN, GRIDE)
except Exception:
    HAS_DADAPY = False
else:
    HAS_DADAPY = True

def _dadapy_twonn_once(X: np.ndarray) -> float:
    """Intrinsic-dimension estimate from DADApy's `compute_id_2NN` (first returned value).

    Duplicate rows are jittered first. NaN when DADApy is not installed.
    """
    if not HAS_DADAPY:
        return np.nan
    dataset = Data(coordinates=_jitter_unique(X))
    estimate, _second, _third = dataset.compute_id_2NN()
    return float(estimate)

def _dadapy_gride_once(X: np.ndarray) -> float:
    """Last entry of the id sequence returned by DADApy's `return_id_scaling_gride(range_max=64)`.

    Duplicate rows are jittered first and distances are computed with maxk=64.
    NaN when DADApy is not installed.
    """
    if not HAS_DADAPY:
        return np.nan
    dataset = Data(coordinates=_jitter_unique(X))
    dataset.compute_distances(maxk=64)
    id_sequence, _second, _third = dataset.return_id_scaling_gride(range_max=64)
    return float(id_sequence[-1])

# Metric key -> estimator evaluated on one bootstrap sample.
# Keys found in FAST_ONCE use the "fast" bootstrap settings in the driver.
FAST_ONCE: Dict[str, Callable[[np.ndarray], float]] = {
    # Isotropy
    "iso": _iso_once,
    "sf": _sf_once,
    "vmf_kappa": _vmf_kappa_once,
    "spect": _spect_once,
    "rand": _rand_once,
    # Linear ID
    "pca95": _pca95_once,
    "pca99": _pca99_once,
    "erank": _erank_once,
    "pr": _pr_once,
    "stable_rank": _stable_rank_once,
}

# Estimators that use the "heavy" bootstrap settings in the driver.
HEAVY_ONCE: Dict[str, Callable[[np.ndarray], float] | None] = {
    "twonn": _dadapy_twonn_once,
    "gride": _dadapy_gride_once,
}

# Metric key -> human-readable axis / title label.
LABELS = {
    # Isotropy
    "iso": "IsoScore",
    "sf": "Spectral Flatness",
    "vmf_kappa": "vMF κ (anisotropy↑)",
    "spect": "Spectral Ratio (λ_max/μ, anisotropy↑)",
    "rand": "RandCos |μ| (anisotropy↑)",
    # Linear ID
    "pca95": "lPCA 0.95",
    "pca99": "lPCA 0.99",
    "erank": "Effective Rank",
    "pr": "Participation Ratio",
    "stable_rank": "Stable Rank",
    # Non-linear
    "twonn": "TwoNN ID",
    "gride": "GRIDE",
}

# Metrics the driver actually computes (tweak as needed)
ALL_METRICS = ["pca99"]

# =============================== SAVE / PLOT ===============================
def save_metric_csv_all_classes(metric: str,
                                class_to_stats: Dict[str, Dict[str, np.ndarray]],
                                layers: np.ndarray,
                                baseline: str,
                                subset_name: str = "raw"):
    """
    Write one long-format CSV (one row per class and layer) for a metric.

    Args:
        metric: metric key, stored in the `metric` column and the file name.
        class_to_stats: class label -> dict with "mean" and optionally
            "lo", "hi" (per-layer arrays) and "n" (token count).
        layers: layer ids, indexed in step with the "mean" array.
        baseline: model id; stored verbatim in the `model` column.
        subset_name: tag stored in the `subset` column and the file name.

    The file is written to CSV_DIR as
    headdist_<subset_name>_<metric>_<baseline>.csv, with any "/" in
    `baseline` replaced by "_" so hub ids such as "org/name" do not turn
    into a path to a non-existent sub-directory.
    Non-finite or missing values are written as NaN.
    """
    rows = []
    for c, stats in class_to_stats.items():
        mu, lo, hi = stats["mean"], stats.get("lo"), stats.get("hi")
        for l, val in enumerate(mu):
            rows.append({
                "subset": subset_name, "model": baseline, "feature": "head_dist",
                "class": c, "metric": metric, "layer": int(layers[l]),
                "mean": float(val) if np.isfinite(val) else np.nan,
                "ci_low": float(lo[l]) if isinstance(lo, np.ndarray) and np.isfinite(lo[l]) else np.nan,
                "ci_high": float(hi[l]) if isinstance(hi, np.ndarray) and np.isfinite(hi[l]) else np.nan,
                "n_tokens": int(stats.get("n", 0)),
                "word_rep_mode": WORD_REP_MODE,
                "source_csv": Path(CSV_PATH).name,
            })
    df = pd.DataFrame(rows)
    file_tag = str(baseline).replace("/", "_")
    out = CSV_DIR / f"headdist_{subset_name}_{metric}_{file_tag}.csv"
    df.to_csv(out, index=False)

def plot_metric_with_ci(class_to_stats: Dict[str, Dict[str, np.ndarray]],
                        layers: np.ndarray, metric: str, title: str, out_path: Path,
                        palette: Dict[str, Tuple[float, float, float]] | None = None):
    """
    Draw one mean-vs-layer line per class, with a shaded lo–hi band, and save it.

    Classes are drawn in integer order. A class whose mean is missing or all
    NaN is skipped; the band is drawn only when both "lo" and "hi" are arrays
    and "lo" is not all NaN. Colours come from `palette` when it is a dict.
    The figure is saved to `out_path` at 220 dpi and then closed.
    """
    fig, ax = plt.subplots(figsize=(9, 5))
    has_palette = isinstance(palette, dict)

    for cls in sorted(class_to_stats.keys(), key=int):
        entry = class_to_stats[cls]
        center, lower, upper = entry["mean"], entry.get("lo"), entry.get("hi")
        if center is None or np.all(np.isnan(center)):
            continue
        colour = palette.get(cls) if has_palette else None
        ax.plot(layers, center, label=cls, lw=1.8, color=colour)
        band_available = (
            isinstance(lower, np.ndarray)
            and isinstance(upper, np.ndarray)
            and not np.all(np.isnan(lower))
        )
        if band_available:
            ax.fill_between(layers, lower, upper, alpha=0.15, color=colour)

    ax.set_xlabel("Layer")
    ax.set_ylabel(LABELS.get(metric, metric.upper()))
    ax.set_title(title)
    ax.legend(ncol=4, fontsize="small", title="Head distance (−6 … 6)", frameon=False)
    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)

# =============================== DRIVER ===============================
def run_headdist_from_col_pipeline():
    """
    End-to-end run: load tokens, embed them once, then bootstrap every metric
    in ALL_METRICS per head-distance class and layer.

    For each metric one CSV (under CSV_DIR) and one PNG (under PLOT_DIR) are
    produced. All settings come from the notebook-level configuration
    constants; nothing is returned.
    """
    # 1) Load token lists + distance classes from existing column
    df_sent, hd_df = load_head_dist_from_column(
        CSV_PATH, head_dist_col=HEAD_DIST_COL,
        clamp=HEAD_DIST_CLAMP, include_zero=INCLUDE_ZERO_CLASS
    )
    classes = sorted(hd_df.head_dist_class.unique(), key=lambda s: int(s))
    palette = make_dist_palette(classes)
    print(f"✓ corpus ready — {len(hd_df):,} tokens across head-dist classes {classes}")

    # 2) Optional per-class cap (currently unlimited for fast metrics)
    raw_df = sample_raw(hd_df, RAW_MAX_PER_CLASS)
    print("Sample sizes per head distance (raw cap):")
    counts = raw_df.head_dist_class.value_counts()
    counts = counts.reindex(sorted(counts.index, key=lambda x: int(x)))
    print(counts.to_dict())
    too_small = {c:int(n) for c,n in counts.items() if n < MIN_N_FOR_HEAVY_WARN}
    if too_small:
        print(f"⚠ Some classes have < {MIN_N_FOR_HEAVY_WARN} tokens (heavy ID may be noisy): {too_small}")

    # 3) Embed once
    reps, filled = embed_subset(df_sent, raw_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)
    raw_df = raw_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    # Keep embeddings aligned with the filtered token table: indices derived
    # from `cls_arr` below are used to slice `reps`. Skipped when nothing is
    # missing to avoid copying the whole tensor.
    if not filled.all():
        reps = reps[:, filled]
    cls_arr = raw_df.head_dist_class.values
    L = reps.shape[0]; layers = np.arange(L)
    print(f"✓ embedded {len(raw_df):,} tokens  • layers={L}")

    # Hub ids may contain "/", which must not end up inside a file name.
    file_tag = str(BASELINE).replace("/", "_")

    # 4) Metric loop
    for metric in ALL_METRICS:
        print(f"\n→ Computing metric: {metric} …")
        compute_once = FAST_ONCE.get(metric) or HEAVY_ONCE.get(metric)
        if compute_once is None:
            print(f"  (skipping {metric}: estimator unavailable)")
            continue
        n_bs = N_BOOTSTRAP_FAST if metric in FAST_ONCE else N_BOOTSTRAP_HEAVY
        Mcap = FAST_BS_MAX_SAMP_PER_CLASS if metric in FAST_ONCE else HEAVY_BS_MAX_SAMP_PER_CLASS

        class_results: Dict[str, Dict[str, np.ndarray]] = {}
        for c in classes:
            idx = np.where(cls_arr == c)[0]
            if idx.size < 3:
                continue  # too few points for a meaningful estimate
            sub = reps[:, idx]  # (L, n_c, D)
            Nc = sub.shape[1]
            M = min(Mcap, Nc)
            mu, lo, hi = _bs_layer_loop(sub, M, n_bs, compute_once)
            class_results[c] = {"mean": mu, "lo": lo, "hi": hi, "n": int(Nc)}

        save_metric_csv_all_classes(metric, class_results, layers, BASELINE, subset_name="raw")
        plot_metric_with_ci(class_results, layers, metric,
                            title=f"{LABELS.get(metric, metric.upper())} • {BASELINE}",
                            out_path=PLOT_DIR / f"headdist_raw_{metric}_{file_tag}.png",
                            palette=palette)
        print(f"  ✓ saved: CSV= {CSV_DIR}/headdist_raw_{metric}_{file_tag}.csv  "
              f"plot= {PLOT_DIR}/headdist_raw_{metric}_{file_tag}.png")

        del class_results; gc.collect()
        if device == "cuda": torch.cuda.empty_cache()

    del reps; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    print("\n✓ done (incremental outputs produced per metric).")

# Entry point: run the bootstrap pipeline when this code executes as the main module.
if __name__ == "__main__":
    run_headdist_from_col_pipeline()


✓ corpus ready — 175,934 tokens across head-dist classes ['-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5', '6']
Sample sizes per head distance (raw cap):
{'-6': 19585, '-5': 4545, '-4': 6684, '-3': 10487, '-2': 15690, '-1': 13038, '1': 53836, '2': 26199, '3': 12161, '4': 5719, '5': 2883, '6': 5107}


openai-community/gpt2 (embed subset): 100%|█| 10067/10067 [01:31<00:00, 110.37it


✓ embedded 175,934 tokens  • layers=13

→ Computing metric: pca99 …


KeyboardInterrupt: 

## Fine-grained

In [5]:
# --- Input data and model ---
CSV_PATH = "en_ewt-ud-train_sentences.csv"
HEAD_DIST_COL = "head_dist"        # <-- your existing column with per-token distances
BASELINE = "bert-base-uncased"     # set to "gpt2" for GPT-2 family
WORD_REP_MODE = "first"            # BERT: {"first","last","mean"}; GPT-2: {"last","mean"}

# --- POS filter ---
POS_COL = "pos"                    # column name in en_ewt-ud-train_sentences.csv
KEEP_POS_TAG = "NOUN"              # only keep tokens whose POS == "NOUN"

# --- Sampling / bootstrap ---
RAW_MAX_PER_CLASS = 10 ** 12             # no cap for fast metrics
N_BOOTSTRAP_FAST = 50
N_BOOTSTRAP_HEAVY = 200
FAST_BS_MAX_SAMP_PER_CLASS = 10 ** 12    # M = N (classic bootstrap)
HEAVY_BS_MAX_SAMP_PER_CLASS = 5000       # practical for TwoNN/GRIDE/skdim
MIN_N_FOR_HEAVY_WARN = 1000              # warn if a class has fewer than this

# --- Head-distance classes ---
HEAD_DIST_CLAMP = 6                # keep classes within [-6,6]
INCLUDE_ZERO_CLASS = False         # set True if you also want "0" class

# --- Misc / output locations (created on the spot) ---
RAND_SEED = 42
PLOT_DIR = Path("results_HEADDIST")
PLOT_DIR.mkdir(exist_ok=True, parents=True)
CSV_DIR = Path("tables_HEADDIST") / "headdist_bootstrap"
CSV_DIR.mkdir(exist_ok=True, parents=True)
BATCH_SIZE = 1

# --- Reproducibility & device ---
os.environ["TOKENIZERS_PARALLELISM"] = "true"
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)
torch.manual_seed(RAND_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.backends.cudnn.benchmark = True

# --- Plot styling and numeric floor ---
sns.set_style("darkgrid")
plt.rcParams["figure.dpi"] = 120
EPS = 1e-12


In [6]:
# =============================== HELPERS ===============================
def _to_list(x):
    # Turn a string like "['a', 'b']" into a real Python list
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x


In [7]:
# =============================== DATA: use existing head_dist column ===============================
def load_head_dist_from_column(csv_path: str,
                               head_dist_col: str = HEAD_DIST_COL,
                               clamp: int = HEAD_DIST_CLAMP,
                               include_zero: bool = INCLUDE_ZERO_CLASS,
                               pos_col: str = POS_COL,
                               keep_pos_tag: str = KEEP_POS_TAG):
    """Build token-level head-distance rows from a sentence-level CSV.

    The CSV is expected to hold one row per sentence with the columns:
      - sentence_id      (str)
      - tokens           (stringified list[str])
      - <head_dist_col>  (stringified list[int]) — signed distance per token
      - <pos_col>        (stringified list[str]) — POS tag per token

    Filtering applied to every token:
      * only tokens whose POS tag equals ``keep_pos_tag`` are kept;
      * tokens whose distance cannot be converted to ``int`` are skipped;
      * distance 0 is dropped unless ``include_zero`` is True;
      * distances are clamped to [-clamp, clamp], so the outermost classes also
        contain every longer distance.

    If the three lists of a sentence differ in length, only the first
    ``min(len)`` positions are used. Sentences whose list columns did not parse
    into lists (for example empty CSV cells, which pandas reads as NaN) are
    skipped and reported in a printed warning instead of raising ``TypeError``.

    Args:
        csv_path: path of the sentence-level CSV.
        head_dist_col: name of the distance column.
        clamp: absolute bound applied to the distances.
        include_zero: keep tokens with distance 0.
        pos_col: name of the POS column.
        keep_pos_tag: POS tag a token must carry to be kept.

    Returns:
        (df_sent, df_tok): ``df_sent`` has one row per unique ``sentence_id``
        with its ``tokens``; ``df_tok`` has the columns ``sentence_id``,
        ``word_id``, ``head_dist_class`` (distance as a string) and ``word``.
    """
    need = ["sentence_id", "tokens", head_dist_col, pos_col]
    df = pd.read_csv(csv_path, usecols=need, dtype={"sentence_id": str})

    # Convert stringified lists to actual Python lists
    for col in ("tokens", head_dist_col, pos_col):
        df[col] = df[col].apply(_to_list)

    rows = []
    n_bad_rows = 0
    for sid, toks, dists, poss in df[need].itertuples(index=False):
        # A missing/unparsed cell is not a sequence; skip the sentence rather
        # than crash in len() or iterate over the characters of a raw string.
        if not all(isinstance(seq, (list, tuple)) for seq in (toks, dists, poss)):
            n_bad_rows += 1
            continue

        # be safe if lengths differ
        n_tok = min(len(toks), len(dists), len(poss))
        for wid in range(n_tok):
            # POS filter
            if poss[wid] != keep_pos_tag:
                continue

            try:
                dist = int(dists[wid])
            except (TypeError, ValueError, OverflowError):
                # non-numeric / NaN / infinite distance: nothing to classify
                continue

            if dist == 0 and not include_zero:
                continue
            if dist < -clamp:
                dist = -clamp
            if dist >  clamp:
                dist =  clamp

            rows.append((sid, wid, str(dist), toks[wid]))

    if n_bad_rows:
        print(f"⚠ skipped {n_bad_rows:,} sentence rows whose list columns could not be parsed")

    df_tok  = pd.DataFrame(
        rows,
        columns=["sentence_id","word_id","head_dist_class","word"]
    )
    df_sent = df[["sentence_id","tokens"]].drop_duplicates("sentence_id")

    print(f"✓ kept {len(df_tok):,} tokens with POS == {keep_pos_tag!r}")
    return df_sent, df_tok


In [8]:
def sample_raw(df_tok: pd.DataFrame, per_class_cap: int = RAW_MAX_PER_CLASS) -> pd.DataFrame:
    """Draw at most ``per_class_cap`` token rows from every head-distance class.

    Rows are sampled without replacement using ``RAND_SEED``, so the result is
    reproducible; within each class the rows come back in sampled (shuffled)
    order. Classes appear in order of first occurrence in ``df_tok``.

    Args:
        df_tok: token table with a ``head_dist_class`` column.
        per_class_cap: maximum number of rows kept per class.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When ``df_tok`` has no rows
        an empty frame with the same columns is returned (``pd.concat`` would
        otherwise raise "No objects to concatenate").
    """
    picks = [
        sub.sample(min(len(sub), per_class_cap), random_state=RAND_SEED, replace=False)
        for _, sub in df_tok.groupby("head_dist_class", sort=False)
    ]
    if not picks:
        return df_tok.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks, ignore_index=True)

def make_dist_palette(classes: List[str]) -> Dict[str, Tuple[float, float, float]]:
    """Assign every head-distance class label a colour from the "coolwarm" palette.

    Labels are ordered by their integer value, so the smallest distance gets
    the first palette colour and the largest gets the last one. The returned
    dict is keyed by the label as a string.
    """
    ordered = [int(label) for label in classes]
    ordered.sort()
    colours = sns.color_palette("coolwarm", len(ordered))
    return dict(zip(map(str, ordered), colours))


In [9]:
# Metric keys evaluated by run_headdist_from_col_pipeline. Each key is looked up
# in FAST_ONCE, then HEAVY_ONCE; keys found in neither are skipped with a message.
ALL_METRICS=[   "lpca99"]

In [10]:
def run_headdist_from_col_pipeline():
    """Run the head-distance analysis end to end with the module-level config.

    Steps:
      1. Load sentences and per-token head-distance classes from ``CSV_PATH``.
      2. Apply the per-class cap ``RAW_MAX_PER_CLASS`` and print class sizes.
      3. Embed the selected tokens once with ``embed_subset``.
      4. For every metric in ``ALL_METRICS``, bootstrap it per class and layer
         via ``_bs_layer_loop``, then hand the results to
         ``save_metric_csv_all_classes`` and ``plot_metric_with_ci``.

    The POS settings are passed explicitly at call time, like the other config
    values, so editing ``POS_COL`` / ``KEEP_POS_TAG`` takes effect without
    re-running the cell that defines ``load_head_dist_from_column``.

    Returns:
        None. Progress is printed; files are produced by the helper functions.

    Raises:
        ValueError: if the embedded representations and the filtered token
            table differ in length, because class labels would then be applied
            to the wrong vectors.
    """
    # 1) Load token lists + distance classes from existing column
    df_sent, hd_df = load_head_dist_from_column(
        CSV_PATH, head_dist_col=HEAD_DIST_COL,
        clamp=HEAD_DIST_CLAMP, include_zero=INCLUDE_ZERO_CLASS,
        pos_col=POS_COL, keep_pos_tag=KEEP_POS_TAG,
    )
    classes = sorted(hd_df.head_dist_class.unique(), key=lambda s: int(s))
    palette = make_dist_palette(classes)
    print(f"✓ corpus ready — {len(hd_df):,} tokens across head-dist classes {classes}")

    # 2) Optional per-class cap (currently unlimited for fast metrics)
    raw_df = sample_raw(hd_df, RAW_MAX_PER_CLASS)
    print("Sample sizes per head distance (raw cap):")
    counts = raw_df.head_dist_class.value_counts()
    counts = counts.reindex(sorted(counts.index, key=lambda x: int(x)))
    print(counts.to_dict())
    # Warn if heavy estimators may be unreliable for small classes
    too_small = {c:int(n) for c,n in counts.items() if n < MIN_N_FOR_HEAVY_WARN}
    if too_small:
        print(f"⚠ Some classes have < {MIN_N_FOR_HEAVY_WARN} tokens (heavy ID may be noisy): {too_small}")

    # 3) Embed once
    # NOTE(review): assumes embed_subset returns exactly two values and that
    # `filled` selects the rows of raw_df that received an embedding
    # (presumably a mask or index) — confirm against its definition.
    reps, filled = embed_subset(df_sent, raw_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)
    raw_df = raw_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    cls_arr = raw_df.head_dist_class.values
    # reps[:, idx] below uses positions taken from cls_arr, so axis 1 of reps
    # must line up one-to-one with the filtered token table.
    if reps.shape[1] != len(cls_arr):
        raise ValueError(
            f"embedding/token mismatch: reps has {reps.shape[1]} tokens "
            f"but the filtered token table has {len(cls_arr)}"
        )
    n_layers = reps.shape[0]; layers = np.arange(n_layers)
    print(f"✓ embedded {len(raw_df):,} tokens  • layers={n_layers}")

    # 4) Metric loop
    for metric in ALL_METRICS:
        print(f"\n→ Computing metric: {metric} …")
        compute_once = FAST_ONCE.get(metric) or HEAVY_ONCE.get(metric)
        if compute_once is None:
            print(f"  (skipping {metric}: estimator unavailable)")
            continue
        # Fast metrics: fewer repetitions, full-size resamples; others: more
        # repetitions on a capped resample size.
        n_bs = N_BOOTSTRAP_FAST if metric in FAST_ONCE else N_BOOTSTRAP_HEAVY
        Mcap = FAST_BS_MAX_SAMP_PER_CLASS if metric in FAST_ONCE else HEAVY_BS_MAX_SAMP_PER_CLASS

        class_results: Dict[str, Dict[str, np.ndarray]] = {}
        for c in classes:
            idx = np.where(cls_arr == c)[0]
            if idx.size < 3:
                # too few tokens in this class to bootstrap
                continue
            sub = reps[:, idx]  # (L, n_c, D)
            Nc = sub.shape[1]
            M = min(Mcap, Nc)
            mu, lo, hi = _bs_layer_loop(sub, M, n_bs, compute_once)
            class_results[c] = {"mean": mu, "lo": lo, "hi": hi, "n": int(Nc)}

        save_metric_csv_all_classes(metric, class_results, layers, BASELINE, subset_name="raw")
        plot_metric_with_ci(class_results, layers, metric,
                            title=f"{LABELS.get(metric, metric.upper())} • {BASELINE}",
                            out_path=PLOT_DIR / f"headdist_raw_{metric}_{BASELINE}.png",
                            palette=palette)
        print(f"  ✓ saved: CSV= {CSV_DIR}/headdist_raw_{metric}_{BASELINE}.csv  "
              f"plot= {PLOT_DIR}/headdist_raw_{metric}_{BASELINE}.png")

        # Release per-metric results before the next (possibly heavy) metric.
        del class_results; gc.collect()
        if device == "cuda": torch.cuda.empty_cache()

    del reps; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    print("\n✓ done (incremental outputs produced per metric).")

# Entry point: runs the whole pipeline when this cell (or the file as a script)
# is executed directly.
if __name__ == "__main__":
    run_headdist_from_col_pipeline()


✓ kept 32,299 tokens with POS == 'NOUN'
✓ corpus ready — 32,299 tokens across head-dist classes ['-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5', '6']
Sample sizes per head distance (raw cap):
{'-6': 4000, '-5': 1882, '-4': 3138, '-3': 5790, '-2': 6275, '-1': 1213, '1': 4959, '2': 1706, '3': 1115, '4': 612, '5': 394, '6': 1215}
⚠ Some classes have < 1000 tokens (heavy ID may be noisy): {'4': 612, '5': 394}


bert-base-uncased (embed subset):   5%|▎    | 465/9078 [00:03<01:08, 126.01it/s]


KeyboardInterrupt: 

In [6]:
# ----- Configuration for the head-distance analysis (GPT-2 run) -----
# Input corpus, model choice and word-pooling mode.
CSV_PATH      = "en_ewt-ud-train_sentences.csv"
HEAD_DIST_COL = "head_dist"             # CSV column holding the per-token signed head distances
BASELINE      = "gpt2"     # model name handed to embed_subset ("gpt2" selects the GPT-2 family)
WORD_REP_MODE = "last"                 # BERT: {"first","last","mean"}; GPT-2: {"last","mean"}

POS_COL      = "pos"                    # column name in en_ewt-ud-train_sentences.csv
KEEP_POS_TAG = "NOUN"                   # only keep tokens whose POS == "NOUN"

# Sampling / bootstrap
RAW_MAX_PER_CLASS              = int(1e12)  # per-class cap used by sample_raw; this large value means "no cap"
N_BOOTSTRAP_FAST               = 50         # bootstrap repetitions for metrics found in FAST_ONCE
N_BOOTSTRAP_HEAVY              = 200        # bootstrap repetitions for all other metrics
FAST_BS_MAX_SAMP_PER_CLASS     = int(1e12)  # M = N (classic bootstrap)
HEAVY_BS_MAX_SAMP_PER_CLASS    = 5000       # practical for TwoNN/GRIDE/skdim
MIN_N_FOR_HEAVY_WARN           = 1000       # warn if a class has fewer than this

# Head-distance classes
HEAD_DIST_CLAMP       = 6                  # distances beyond +/-6 are clamped to +/-6 (not dropped), so the outer classes also hold longer distances
INCLUDE_ZERO_CLASS    = False              # set True if you also want "0" class

# Misc
RAND_SEED = 42
# Output directories are created as a side effect of running this cell.
PLOT_DIR  = Path("results_HEADDIST"); PLOT_DIR.mkdir(exist_ok=True, parents=True)
CSV_DIR   = Path("tables_HEADDIST") / "headdist_bootstrap"; CSV_DIR.mkdir(exist_ok=True, parents=True)
BATCH_SIZE = 1                             # batch size handed to embed_subset

# Repro & device
os.environ["TOKENIZERS_PARALLELISM"] = "true"   # environment flag read by the Hugging Face tokenizers library
# Seed Python, NumPy and PyTorch RNGs with the same value.
random.seed(RAND_SEED); np.random.seed(RAND_SEED); torch.manual_seed(RAND_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): cudnn.benchmark=True lets cuDNN auto-tune its algorithm choice; PyTorch's
# reproducibility notes recommend False for deterministic selection — confirm this is
# intended under a "Repro" heading.
if device == "cuda": torch.backends.cudnn.benchmark = True

# Plot styling shared by every figure produced afterwards.
sns.set_style("darkgrid")
plt.rcParams["figure.dpi"] = 120
EPS = 1e-12                                # small constant; not referenced in the cells visible here



# =============================== HELPERS ===============================
def _to_list(x):
    # Turn a string like "['a', 'b']" into a real Python list
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x



# =============================== DATA: use existing head_dist column ===============================


def load_head_dist_from_column(csv_path: str,
                               head_dist_col: str = HEAD_DIST_COL,
                               clamp: int = HEAD_DIST_CLAMP,
                               include_zero: bool = INCLUDE_ZERO_CLASS,
                               pos_col: str = POS_COL,
                               keep_pos_tag: str = KEEP_POS_TAG):
    """Build token-level head-distance rows from a sentence-level CSV.

    The CSV is expected to hold one row per sentence with the columns:
      - sentence_id      (str)
      - tokens           (stringified list[str])
      - <head_dist_col>  (stringified list[int]) — signed distance per token
      - <pos_col>        (stringified list[str]) — POS tag per token

    Filtering applied to every token:
      * only tokens whose POS tag equals ``keep_pos_tag`` are kept;
      * tokens whose distance cannot be converted to ``int`` are skipped;
      * distance 0 is dropped unless ``include_zero`` is True;
      * distances are clamped to [-clamp, clamp], so the outermost classes also
        contain every longer distance.

    If the three lists of a sentence differ in length, only the first
    ``min(len)`` positions are used. Sentences whose list columns did not parse
    into lists (for example empty CSV cells, which pandas reads as NaN) are
    skipped and reported in a printed warning instead of raising ``TypeError``.

    Args:
        csv_path: path of the sentence-level CSV.
        head_dist_col: name of the distance column.
        clamp: absolute bound applied to the distances.
        include_zero: keep tokens with distance 0.
        pos_col: name of the POS column.
        keep_pos_tag: POS tag a token must carry to be kept.

    Returns:
        (df_sent, df_tok): ``df_sent`` has one row per unique ``sentence_id``
        with its ``tokens``; ``df_tok`` has the columns ``sentence_id``,
        ``word_id``, ``head_dist_class`` (distance as a string) and ``word``.
    """
    need = ["sentence_id", "tokens", head_dist_col, pos_col]
    df = pd.read_csv(csv_path, usecols=need, dtype={"sentence_id": str})

    # Convert stringified lists to actual Python lists
    for col in ("tokens", head_dist_col, pos_col):
        df[col] = df[col].apply(_to_list)

    rows = []
    n_bad_rows = 0
    for sid, toks, dists, poss in df[need].itertuples(index=False):
        # A missing/unparsed cell is not a sequence; skip the sentence rather
        # than crash in len() or iterate over the characters of a raw string.
        if not all(isinstance(seq, (list, tuple)) for seq in (toks, dists, poss)):
            n_bad_rows += 1
            continue

        # be safe if lengths differ
        n_tok = min(len(toks), len(dists), len(poss))
        for wid in range(n_tok):
            # POS filter
            if poss[wid] != keep_pos_tag:
                continue

            try:
                dist = int(dists[wid])
            except (TypeError, ValueError, OverflowError):
                # non-numeric / NaN / infinite distance: nothing to classify
                continue

            if dist == 0 and not include_zero:
                continue
            if dist < -clamp:
                dist = -clamp
            if dist >  clamp:
                dist =  clamp

            rows.append((sid, wid, str(dist), toks[wid]))

    if n_bad_rows:
        print(f"⚠ skipped {n_bad_rows:,} sentence rows whose list columns could not be parsed")

    df_tok  = pd.DataFrame(
        rows,
        columns=["sentence_id","word_id","head_dist_class","word"]
    )
    df_sent = df[["sentence_id","tokens"]].drop_duplicates("sentence_id")

    print(f"✓ kept {len(df_tok):,} tokens with POS == {keep_pos_tag!r}")
    return df_sent, df_tok



def sample_raw(df_tok: pd.DataFrame, per_class_cap: int = RAW_MAX_PER_CLASS) -> pd.DataFrame:
    """Draw at most ``per_class_cap`` token rows from every head-distance class.

    Rows are sampled without replacement using ``RAND_SEED``, so the result is
    reproducible; within each class the rows come back in sampled (shuffled)
    order. Classes appear in order of first occurrence in ``df_tok``.

    Args:
        df_tok: token table with a ``head_dist_class`` column.
        per_class_cap: maximum number of rows kept per class.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When ``df_tok`` has no rows
        an empty frame with the same columns is returned (``pd.concat`` would
        otherwise raise "No objects to concatenate").
    """
    picks = [
        sub.sample(min(len(sub), per_class_cap), random_state=RAND_SEED, replace=False)
        for _, sub in df_tok.groupby("head_dist_class", sort=False)
    ]
    if not picks:
        return df_tok.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks, ignore_index=True)

def make_dist_palette(classes: List[str]) -> Dict[str, Tuple[float, float, float]]:
    """Assign every head-distance class label a colour from the "coolwarm" palette.

    Labels are ordered by their integer value, so the smallest distance gets
    the first palette colour and the largest gets the last one. The returned
    dict is keyed by the label as a string.
    """
    ordered = [int(label) for label in classes]
    ordered.sort()
    colours = sns.color_palette("coolwarm", len(ordered))
    return dict(zip(map(str, ordered), colours))


# Metric keys evaluated by run_headdist_from_col_pipeline. Each key is looked up
# in FAST_ONCE, then HEAVY_ONCE; keys found in neither are skipped with a message.
ALL_METRICS=[ "pca99"]
def run_headdist_from_col_pipeline():
    """Run the head-distance analysis end to end with the module-level config.

    Steps:
      1. Load sentences and per-token head-distance classes from ``CSV_PATH``.
      2. Apply the per-class cap ``RAW_MAX_PER_CLASS`` and print class sizes.
      3. Embed the selected tokens once with ``embed_subset``.
      4. For every metric in ``ALL_METRICS``, bootstrap it per class and layer
         via ``_bs_layer_loop``, then hand the results to
         ``save_metric_csv_all_classes`` and ``plot_metric_with_ci``.

    The POS settings are passed explicitly at call time, like the other config
    values, so the call does not depend on defaults frozen when
    ``load_head_dist_from_column`` was defined.

    Returns:
        None. Progress is printed; files are produced by the helper functions.

    Raises:
        ValueError: if the embedded representations and the filtered token
            table differ in length, because class labels would then be applied
            to the wrong vectors.
    """
    # 1) Load token lists + distance classes from existing column
    df_sent, hd_df = load_head_dist_from_column(
        CSV_PATH, head_dist_col=HEAD_DIST_COL,
        clamp=HEAD_DIST_CLAMP, include_zero=INCLUDE_ZERO_CLASS,
        pos_col=POS_COL, keep_pos_tag=KEEP_POS_TAG,
    )
    classes = sorted(hd_df.head_dist_class.unique(), key=lambda s: int(s))
    palette = make_dist_palette(classes)
    print(f"✓ corpus ready — {len(hd_df):,} tokens across head-dist classes {classes}")

    # 2) Optional per-class cap (currently unlimited for fast metrics)
    raw_df = sample_raw(hd_df, RAW_MAX_PER_CLASS)
    print("Sample sizes per head distance (raw cap):")
    counts = raw_df.head_dist_class.value_counts()
    counts = counts.reindex(sorted(counts.index, key=lambda x: int(x)))
    print(counts.to_dict())
    # Warn if heavy estimators may be unreliable for small classes
    too_small = {c:int(n) for c,n in counts.items() if n < MIN_N_FOR_HEAVY_WARN}
    if too_small:
        print(f"⚠ Some classes have < {MIN_N_FOR_HEAVY_WARN} tokens (heavy ID may be noisy): {too_small}")

    # 3) Embed once
    # NOTE(review): assumes embed_subset returns exactly two values and that
    # `filled` selects the rows of raw_df that received an embedding
    # (presumably a mask or index). A recorded run of this cell ended with
    # "too many values to unpack (expected 2)" right after embedding — confirm
    # the signature of the embed_subset currently defined in the kernel.
    reps, filled = embed_subset(df_sent, raw_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)
    raw_df = raw_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    cls_arr = raw_df.head_dist_class.values
    # reps[:, idx] below uses positions taken from cls_arr, so axis 1 of reps
    # must line up one-to-one with the filtered token table.
    if reps.shape[1] != len(cls_arr):
        raise ValueError(
            f"embedding/token mismatch: reps has {reps.shape[1]} tokens "
            f"but the filtered token table has {len(cls_arr)}"
        )
    n_layers = reps.shape[0]; layers = np.arange(n_layers)
    print(f"✓ embedded {len(raw_df):,} tokens  • layers={n_layers}")

    # 4) Metric loop
    for metric in ALL_METRICS:
        print(f"\n→ Computing metric: {metric} …")
        compute_once = FAST_ONCE.get(metric) or HEAVY_ONCE.get(metric)
        if compute_once is None:
            print(f"  (skipping {metric}: estimator unavailable)")
            continue
        # Fast metrics: fewer repetitions, full-size resamples; others: more
        # repetitions on a capped resample size.
        n_bs = N_BOOTSTRAP_FAST if metric in FAST_ONCE else N_BOOTSTRAP_HEAVY
        Mcap = FAST_BS_MAX_SAMP_PER_CLASS if metric in FAST_ONCE else HEAVY_BS_MAX_SAMP_PER_CLASS

        class_results: Dict[str, Dict[str, np.ndarray]] = {}
        for c in classes:
            idx = np.where(cls_arr == c)[0]
            if idx.size < 3:
                # too few tokens in this class to bootstrap
                continue
            sub = reps[:, idx]  # (L, n_c, D)
            Nc = sub.shape[1]
            M = min(Mcap, Nc)
            mu, lo, hi = _bs_layer_loop(sub, M, n_bs, compute_once)
            class_results[c] = {"mean": mu, "lo": lo, "hi": hi, "n": int(Nc)}

        save_metric_csv_all_classes(metric, class_results, layers, BASELINE, subset_name="raw")
        plot_metric_with_ci(class_results, layers, metric,
                            title=f"{LABELS.get(metric, metric.upper())} • {BASELINE}",
                            out_path=PLOT_DIR / f"headdist_raw_{metric}_{BASELINE}.png",
                            palette=palette)
        print(f"  ✓ saved: CSV= {CSV_DIR}/headdist_raw_{metric}_{BASELINE}.csv  "
              f"plot= {PLOT_DIR}/headdist_raw_{metric}_{BASELINE}.png")

        # Release per-metric results before the next (possibly heavy) metric.
        del class_results; gc.collect()
        if device == "cuda": torch.cuda.empty_cache()

    del reps; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    print("\n✓ done (incremental outputs produced per metric).")

# Entry point: runs the whole pipeline when this cell (or the file as a script)
# is executed directly.
if __name__ == "__main__":
    run_headdist_from_col_pipeline()


✓ kept 32,299 tokens with POS == 'NOUN'
✓ corpus ready — 32,299 tokens across head-dist classes ['-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5', '6']
Sample sizes per head distance (raw cap):
{'-6': 4000, '-5': 1882, '-4': 3138, '-3': 5790, '-2': 6275, '-1': 1213, '1': 4959, '2': 1706, '3': 1115, '4': 612, '5': 394, '6': 1215}
⚠ Some classes have < 1000 tokens (heavy ID may be noisy): {'4': 612, '5': 394}


openai-community/gpt2 (embed subset): 100%|█| 9078/9078 [01:16<00:00, 118.75it/s


ValueError: too many values to unpack (expected 2)

In [4]:
# ----- Configuration for the head-distance analysis (GPT-2 run) -----
# Input corpus, model choice and word-pooling mode.
CSV_PATH      = "en_ewt-ud-train_sentences.csv"
HEAD_DIST_COL = "head_dist"             # CSV column holding the per-token signed head distances
BASELINE      = "gpt2"                  # model name handed to embed_subset ("gpt2" selects the GPT-2 family)
WORD_REP_MODE = "last"                  # BERT: {"first","last","mean"}; GPT-2: {"last","mean"}

POS_COL      = "pos"                    # column name in en_ewt-ud-train_sentences.csv
KEEP_POS_TAG = "NOUN"                   # only keep tokens whose POS == "NOUN"

# Sampling / bootstrap
RAW_MAX_PER_CLASS              = int(1e12)  # per-class cap used by sample_raw; this large value means "no cap"
N_BOOTSTRAP_FAST               = 50         # bootstrap repetitions for metrics found in FAST_ONCE
N_BOOTSTRAP_HEAVY              = 200        # bootstrap repetitions for all other metrics
FAST_BS_MAX_SAMP_PER_CLASS     = int(1e12)  # M = N (classic bootstrap)
HEAVY_BS_MAX_SAMP_PER_CLASS    = 5000       # practical for TwoNN/GRIDE/skdim
MIN_N_FOR_HEAVY_WARN           = 1000       # warn if a class has fewer than this

# Head-distance classes
HEAD_DIST_CLAMP       = 6                  # distances beyond +/-6 are clamped to +/-6 (not dropped), so the outer classes also hold longer distances
INCLUDE_ZERO_CLASS    = False              # set True if you also want "0" class

# Misc
RAND_SEED = 42
# Output directories are created as a side effect of running this cell.
PLOT_DIR  = Path("results_HEADDIST"); PLOT_DIR.mkdir(exist_ok=True, parents=True)
CSV_DIR   = Path("tables_HEADDIST") / "headdist_bootstrap"; CSV_DIR.mkdir(exist_ok=True, parents=True)
BATCH_SIZE = 1                             # batch size handed to embed_subset

# Repro & device
os.environ["TOKENIZERS_PARALLELISM"] = "true"   # environment flag read by the Hugging Face tokenizers library
# Seed Python, NumPy and PyTorch RNGs with the same value.
random.seed(RAND_SEED); np.random.seed(RAND_SEED); torch.manual_seed(RAND_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): cudnn.benchmark=True lets cuDNN auto-tune its algorithm choice; PyTorch's
# reproducibility notes recommend False for deterministic selection — confirm this is
# intended under a "Repro" heading.
if device == "cuda": torch.backends.cudnn.benchmark = True

# Plot styling shared by every figure produced afterwards.
sns.set_style("darkgrid")
plt.rcParams["figure.dpi"] = 120
EPS = 1e-12                                # small constant; not referenced in the cells visible here

# =============================== HELPERS ===============================
def _to_list(x):
    # Turn a string like "['a', 'b']" into a real Python list
    return ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x

# =============================== DATA: use existing head_dist column ===============================
def load_head_dist_from_column(csv_path: str,
                               head_dist_col: str = HEAD_DIST_COL,
                               clamp: int = HEAD_DIST_CLAMP,
                               include_zero: bool = INCLUDE_ZERO_CLASS,
                               pos_col: str = POS_COL,
                               keep_pos_tag: str = KEEP_POS_TAG):
    """Build token-level head-distance rows from a sentence-level CSV.

    The CSV is expected to hold one row per sentence with the columns:
      - sentence_id      (str)
      - tokens           (stringified list[str])
      - <head_dist_col>  (stringified list[int]) — signed distance per token
      - <pos_col>        (stringified list[str]) — POS tag per token

    Filtering applied to every token:
      * the token at sentence position 0 is always excluded;
      * only tokens whose POS tag equals ``keep_pos_tag`` are kept;
      * tokens whose distance cannot be converted to ``int`` are skipped;
      * distance 0 is dropped unless ``include_zero`` is True;
      * distances are clamped to [-clamp, clamp], so the outermost classes also
        contain every longer distance.

    If the three lists of a sentence differ in length, only the first
    ``min(len)`` positions are used. Sentences whose list columns did not parse
    into lists (for example empty CSV cells, which pandas reads as NaN) are
    skipped and reported in a printed warning instead of raising ``TypeError``.

    Args:
        csv_path: path of the sentence-level CSV.
        head_dist_col: name of the distance column.
        clamp: absolute bound applied to the distances.
        include_zero: keep tokens with distance 0.
        pos_col: name of the POS column.
        keep_pos_tag: POS tag a token must carry to be kept.

    Returns:
        (df_sent, df_tok): ``df_sent`` has one row per unique ``sentence_id``
        with its ``tokens``; ``df_tok`` has the columns ``sentence_id``,
        ``word_id`` (always > 0), ``head_dist_class`` (distance as a string)
        and ``word``.
    """
    need = ["sentence_id", "tokens", head_dist_col, pos_col]
    df = pd.read_csv(csv_path, usecols=need, dtype={"sentence_id": str})

    # Convert stringified lists to actual Python lists
    for col in ("tokens", head_dist_col, pos_col):
        df[col] = df[col].apply(_to_list)

    rows = []
    n_bad_rows = 0
    for sid, toks, dists, poss in df[need].itertuples(index=False):
        # A missing/unparsed cell is not a sequence; skip the sentence rather
        # than crash in len() or iterate over the characters of a raw string.
        if not all(isinstance(seq, (list, tuple)) for seq in (toks, dists, poss)):
            n_bad_rows += 1
            continue

        # be safe if lengths differ
        n_tok = min(len(toks), len(dists), len(poss))
        # Start at 1: the first token of each sentence is ignored.
        for wid in range(1, n_tok):
            # POS filter
            if poss[wid] != keep_pos_tag:
                continue

            try:
                dist = int(dists[wid])
            except (TypeError, ValueError, OverflowError):
                # non-numeric / NaN / infinite distance: nothing to classify
                continue

            if dist == 0 and not include_zero:
                continue
            if dist < -clamp:
                dist = -clamp
            if dist >  clamp:
                dist =  clamp

            rows.append((sid, wid, str(dist), toks[wid]))

    if n_bad_rows:
        print(f"⚠ skipped {n_bad_rows:,} sentence rows whose list columns could not be parsed")

    df_tok  = pd.DataFrame(
        rows,
        columns=["sentence_id","word_id","head_dist_class","word"]
    )
    df_sent = df[["sentence_id","tokens"]].drop_duplicates("sentence_id")

    print(f"✓ kept {len(df_tok):,} tokens with POS == {keep_pos_tag!r} and word_id > 0")
    return df_sent, df_tok

def sample_raw(df_tok: pd.DataFrame, per_class_cap: int = RAW_MAX_PER_CLASS) -> pd.DataFrame:
    """Draw at most ``per_class_cap`` token rows from every head-distance class.

    Rows are sampled without replacement using ``RAND_SEED``, so the result is
    reproducible; within each class the rows come back in sampled (shuffled)
    order. Classes appear in order of first occurrence in ``df_tok``.

    Args:
        df_tok: token table with a ``head_dist_class`` column.
        per_class_cap: maximum number of rows kept per class.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When ``df_tok`` has no rows
        an empty frame with the same columns is returned (``pd.concat`` would
        otherwise raise "No objects to concatenate").
    """
    picks = [
        sub.sample(min(len(sub), per_class_cap), random_state=RAND_SEED, replace=False)
        for _, sub in df_tok.groupby("head_dist_class", sort=False)
    ]
    if not picks:
        return df_tok.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks, ignore_index=True)

def make_dist_palette(classes: List[str]) -> Dict[str, Tuple[float, float, float]]:
    """Assign every head-distance class label a colour from the "coolwarm" palette.

    Labels are ordered by their integer value, so the smallest distance gets
    the first palette colour and the largest gets the last one. The returned
    dict is keyed by the label as a string.
    """
    ordered = [int(label) for label in classes]
    ordered.sort()
    colours = sns.color_palette("coolwarm", len(ordered))
    return dict(zip(map(str, ordered), colours))

# Metric keys evaluated by run_headdist_from_col_pipeline. Each key is looked up
# in FAST_ONCE, then HEAVY_ONCE; keys found in neither are skipped with a message.
ALL_METRICS = [ "pca99"]

def run_headdist_from_col_pipeline():
    """Run the head-distance analysis end to end with the module-level config.

    Steps:
      1. Load sentences and per-token head-distance classes from ``CSV_PATH``.
      2. Apply the per-class cap ``RAW_MAX_PER_CLASS`` and print class sizes.
      3. Embed the selected tokens once with ``embed_subset``.
      4. For every metric in ``ALL_METRICS``, bootstrap it per class and layer
         via ``_bs_layer_loop``, then hand the results to
         ``save_metric_csv_all_classes`` and ``plot_metric_with_ci``.

    The POS settings are passed explicitly at call time, like the other config
    values, so the call does not depend on defaults frozen when
    ``load_head_dist_from_column`` was defined.

    Returns:
        None. Progress is printed; files are produced by the helper functions.

    Raises:
        ValueError: if the embedded representations and the filtered token
            table differ in length, because class labels would then be applied
            to the wrong vectors.
    """
    # 1) Load token lists + distance classes from existing column
    df_sent, hd_df = load_head_dist_from_column(
        CSV_PATH, head_dist_col=HEAD_DIST_COL,
        clamp=HEAD_DIST_CLAMP, include_zero=INCLUDE_ZERO_CLASS,
        pos_col=POS_COL, keep_pos_tag=KEEP_POS_TAG,
    )
    classes = sorted(hd_df.head_dist_class.unique(), key=lambda s: int(s))
    palette = make_dist_palette(classes)
    print(f"✓ corpus ready — {len(hd_df):,} tokens across head-dist classes {classes}")

    # 2) Optional per-class cap (currently unlimited for fast metrics)
    raw_df = sample_raw(hd_df, RAW_MAX_PER_CLASS)
    print("Sample sizes per head distance (raw cap):")
    counts = raw_df.head_dist_class.value_counts()
    counts = counts.reindex(sorted(counts.index, key=lambda x: int(x)))
    print(counts.to_dict())
    # Warn if heavy estimators may be unreliable for small classes
    too_small = {c:int(n) for c,n in counts.items() if n < MIN_N_FOR_HEAVY_WARN}
    if too_small:
        print(f"⚠ Some classes have < {MIN_N_FOR_HEAVY_WARN} tokens (heavy ID may be noisy): {too_small}")

    # 3) Embed once
    # NOTE(review): assumes embed_subset returns exactly two values and that
    # `filled` selects the rows of raw_df that received an embedding
    # (presumably a mask or index) — confirm against its definition.
    reps, filled = embed_subset(df_sent, raw_df, BASELINE, WORD_REP_MODE, BATCH_SIZE)
    raw_df = raw_df.reset_index(drop=True).loc[filled].reset_index(drop=True)
    cls_arr = raw_df.head_dist_class.values
    # reps[:, idx] below uses positions taken from cls_arr, so axis 1 of reps
    # must line up one-to-one with the filtered token table.
    if reps.shape[1] != len(cls_arr):
        raise ValueError(
            f"embedding/token mismatch: reps has {reps.shape[1]} tokens "
            f"but the filtered token table has {len(cls_arr)}"
        )
    n_layers = reps.shape[0]; layers = np.arange(n_layers)
    print(f"✓ embedded {len(raw_df):,} tokens  • layers={n_layers}")

    # 4) Metric loop
    for metric in ALL_METRICS:
        print(f"\n→ Computing metric: {metric} …")
        compute_once = FAST_ONCE.get(metric) or HEAVY_ONCE.get(metric)
        if compute_once is None:
            print(f"  (skipping {metric}: estimator unavailable)")
            continue
        # Fast metrics: fewer repetitions, full-size resamples; others: more
        # repetitions on a capped resample size.
        n_bs = N_BOOTSTRAP_FAST if metric in FAST_ONCE else N_BOOTSTRAP_HEAVY
        Mcap = FAST_BS_MAX_SAMP_PER_CLASS if metric in FAST_ONCE else HEAVY_BS_MAX_SAMP_PER_CLASS

        class_results: Dict[str, Dict[str, np.ndarray]] = {}
        for c in classes:
            idx = np.where(cls_arr == c)[0]
            if idx.size < 3:
                # too few tokens in this class to bootstrap
                continue
            sub = reps[:, idx]  # (L, n_c, D)
            Nc = sub.shape[1]
            M = min(Mcap, Nc)
            mu, lo, hi = _bs_layer_loop(sub, M, n_bs, compute_once)
            class_results[c] = {"mean": mu, "lo": lo, "hi": hi, "n": int(Nc)}

        save_metric_csv_all_classes(metric, class_results, layers, BASELINE, subset_name="raw")
        plot_metric_with_ci(class_results, layers, metric,
                            title=f"{LABELS.get(metric, metric.upper())} • {BASELINE}",
                            out_path=PLOT_DIR / f"headdist_raw_{metric}_{BASELINE}.png",
                            palette=palette)
        print(f"  ✓ saved: CSV= {CSV_DIR}/headdist_raw_{metric}_{BASELINE}.csv  "
              f"plot= {PLOT_DIR}/headdist_raw_{metric}_{BASELINE}.png")

        # Release per-metric results before the next (possibly heavy) metric.
        del class_results; gc.collect()
        if device == "cuda": torch.cuda.empty_cache()

    del reps; gc.collect()
    if device == "cuda": torch.cuda.empty_cache()
    print("\n✓ done (incremental outputs produced per metric).")

# Entry point: runs the whole pipeline when this cell (or the file as a script)
# is executed directly.
if __name__ == "__main__":
    run_headdist_from_col_pipeline()


✓ kept 31,997 tokens with POS == 'NOUN' and word_id > 0
✓ corpus ready — 31,997 tokens across head-dist classes ['-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5', '6']
Sample sizes per head distance (raw cap):
{'-6': 4000, '-5': 1882, '-4': 3138, '-3': 5790, '-2': 6275, '-1': 1213, '1': 4846, '2': 1650, '3': 1073, '4': 585, '5': 378, '6': 1167}
⚠ Some classes have < 1000 tokens (heavy ID may be noisy): {'4': 585, '5': 378}


openai-community/gpt2 (embed subset): 100%|█| 9052/9052 [01:15<00:00, 120.59it/s


✓ embedded 31,997 tokens  • layers=13

→ Computing metric: pca99 …
  ✓ saved: CSV= tables_HEADDIST/headdist_bootstrap/headdist_raw_pca99_gpt2.csv  plot= results_HEADDIST/headdist_raw_pca99_gpt2.png

✓ done (incremental outputs produced per metric).
