# Skillbook History Analysis

Interactive exploration of skillbook evolution across the variance experiment:
7 budget levels × 5 runs × 25 traces = 875 snapshots.

Uses `scripts/analysis/skillbook_history.py` for data loading and analysis.

In [None]:
# Standard-library modules used to locate the repository and extend sys.path.
import sys
from pathlib import Path

# Repository root: the parent directory when the working directory is named
# "results", otherwise the working directory itself.
ROOT = Path.cwd().parent if Path.cwd().name == "results" else Path.cwd()
# Put <ROOT>/scripts at the front of sys.path. This is done before the
# `analysis.skillbook_history` import below, whose module lives under scripts/.
sys.path.insert(0, str(ROOT / "scripts"))

import numpy as np
import matplotlib.pyplot as plt

# Matplotlib defaults applied to figures created after this point.
plt.rcParams.update({
    "figure.dpi": 120,
    "figure.facecolor": "white",
    "axes.grid": True,
    "grid.alpha": 0.3,
    "font.size": 10,
})

# Project helpers and constants; not every name is used by the cells visible
# in this notebook.
from analysis.skillbook_history import (
    load_experiment,
    skill_counts, section_counts, toon_token_counts, next_id_counts,
    tokens_per_skill, avg_content_lengths,
    delta_counts_by_trace, skill_survival, skill_lifespans, churn_per_trace,
    section_first_appearance, section_sizes_over_time,
    section_to_topic,
    load_final_embeddings, cross_run_embedding_overlap, cluster_final_skills,
    budget_comparison_table,
    load_compression_metrics, compression_distribution,
    plot_growth_curves, plot_cross_budget_overlay, plot_delta_bars,
    plot_survival_curve, plot_section_timeline,
    DeltaType, BUDGET_COLORS, BUDGET_ORDER, NUM_TRACES,
    generate_report,
)

# Printed only if every import above succeeded.
print("Imports OK")

In [None]:
# Point at the variance experiment under the results directory and load it.
EXPERIMENT_DIR = ROOT / "results" / "variance_experiment"
exp = load_experiment(EXPERIMENT_DIR)

# One summary line per budget group: number of runs, total snapshots, and the
# mean ± std of the final skill count over the runs that have metrics.
print(f"Loaded {len(exp.budget_groups)} budget groups:")
for group in exp.budget_groups:
    snapshot_total = 0
    last_counts = []
    for run in group.runs:
        snapshot_total += len(run.snapshots)
        if run.metrics:
            last_counts.append(skill_counts(run)[-1])
    summary = f"final skills = {np.mean(last_counts):.1f} ± {np.std(last_counts):.1f}"
    print(f"  {group.budget_label:>13s}: {len(group.runs)} runs, {snapshot_total} snapshots, {summary}")

## 1. Growth Curves

How do skill counts, TOON tokens, and section counts evolve over the 25-trace sequence?

In [None]:
# Build the skill-count growth figure with the shared plotting helper and show
# it. The two string arguments are presumably the y-axis label and the figure
# title — confirm against plot_growth_curves.
fig = plot_growth_curves(exp, skill_counts, "Skills", "Skill Growth per Budget")
plt.show()

In [None]:
# Cross-budget overlay: one panel per metric, one mean ± std band per budget
# group, all drawn against the trace index.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, fn, ylabel, title in [
    (axes[0], skill_counts, "Skills", "Skills"),
    (axes[1], toon_token_counts, "TOON Tokens", "TOON Tokens"),
    (axes[2], section_counts, "Sections", "Sections"),
]:
    for group in exp.budget_groups:
        # Per-run series for this metric; runs that produced no data are dropped.
        series = [np.asarray(fn(r), dtype=float) for r in group.runs]
        series = [s for s in series if s.size]
        if not series:
            continue
        # Runs can differ in length (e.g. an incomplete run). Averaging a
        # ragged list raises in NumPy, so aggregate over the common prefix so
        # that every point is a statistic over the same set of runs.
        n = min(len(s) for s in series)
        stacked = np.vstack([s[:n] for s in series])
        mean = stacked.mean(axis=0)
        std = stacked.std(axis=0)
        trace_idx = np.arange(n)
        color = BUDGET_COLORS[group.budget_label]
        ax.plot(trace_idx, mean, color=color, linewidth=2, label=group.budget_label)
        ax.fill_between(trace_idx, mean - std, mean + std, color=color, alpha=0.1)
    ax.set_xlabel("Trace")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend(fontsize=7)

fig.suptitle("Cross-Budget Overlay", fontsize=14, fontweight="bold", y=1.02)
fig.tight_layout()
plt.show()

## 2. Skill Lifecycle

ADD/UPDATE/REMOVE event frequency, survival curves, churn analysis, and lifespan distributions.

In [None]:
# Delta bars for one representative case: the first run of the budget-3000
# group. Nothing is drawn when that group is missing or has no runs.
rep = exp.get_budget("budget-3000")
has_runs = bool(rep and rep.runs)
if has_runs:
    fig = plot_delta_bars(rep.runs[0])
    plt.show()

In [None]:
# Build the survival-curve figure from the loaded experiment with the shared
# plotting helper, then display it.
fig = plot_survival_curve(exp)
plt.show()

In [None]:
# Churn analysis. Left: skills ever created (final next_id) against skills
# still present at the end, one dot per run. Right: mean ± std churn rate per
# budget group.
def _churn_fraction(run):
    """Return the share of created skills that did not survive (0 if none created)."""
    created = run.next_ids[-1]
    surviving = run.metrics[-1].skill_count
    return (created - surviving) / created if created > 0 else 0


fig, (scatter_ax, rate_ax) = plt.subplots(1, 2, figsize=(14, 5))

# --- Left panel: one dot per run that has metrics ---------------------------
for group in exp.budget_groups:
    dot_color = BUDGET_COLORS[group.budget_label]
    for run in (r for r in group.runs if r.metrics):
        # Runs without next_id data are placed at x = 0.
        created = run.next_ids[-1] if run.next_ids else 0
        surviving = run.metrics[-1].skill_count
        scatter_ax.scatter(created, surviving, color=dot_color, s=40, alpha=0.7)

# Diagonal reference: points on this line lost no skills.
upper = max(scatter_ax.get_xlim()[1], scatter_ax.get_ylim()[1])
scatter_ax.plot([0, upper], [0, upper], "k--", alpha=0.3, label="No churn")
scatter_ax.set_title("Churn: Skills Created vs Surviving")
scatter_ax.set_xlabel("next_id (total skills ever created)")
scatter_ax.set_ylabel("Final skill count (surviving)")
scatter_ax.legend(fontsize=8)

# --- Right panel: churn rate per budget --------------------------------------
bar_labels, mean_rates, rate_stds, bar_colors = [], [], [], []
for group in exp.budget_groups:
    # Only runs with both next_id history and metrics contribute a rate.
    fractions = [_churn_fraction(r) for r in group.runs if r.next_ids and r.metrics]
    bar_labels.append(group.budget_label)
    mean_rates.append(np.mean(fractions) if fractions else 0)
    rate_stds.append(np.std(fractions) if fractions else 0)
    bar_colors.append(BUDGET_COLORS[group.budget_label])

positions = range(len(bar_labels))
rate_ax.bar(positions, mean_rates, yerr=rate_stds, color=bar_colors, capsize=4)
rate_ax.set_xticks(positions)
rate_ax.set_xticklabels(bar_labels, rotation=45, ha="right", fontsize=8)
rate_ax.set_ylabel("Churn rate (removed / created)")
rate_ax.set_title("Skill Churn Rate by Budget")

fig.tight_layout()
plt.show()

In [None]:
# Lifespan histograms: one panel per budget group on a fixed 2 × 4 grid that
# shares the y-axis; panels without a group are hidden.
fig, axes = plt.subplots(2, 4, figsize=(18, 8), squeeze=False, sharey=True)
panels = axes.ravel()
n_cols = axes.shape[1]

for idx, group in enumerate(exp.budget_groups):
    ax = panels[idx]
    # Pool the lifespans of every run in the group into one sample.
    pooled = [span for run in group.runs for span in skill_lifespans(run)]
    if pooled:
        ax.hist(pooled, bins=range(0, NUM_TRACES + 1), color=BUDGET_COLORS[group.budget_label], alpha=0.7)
    ax.set_title(group.budget_label, fontsize=10)
    ax.set_xlabel("Lifespan (traces)")
    # Only the first column carries the y-axis label.
    if idx % n_cols == 0:
        ax.set_ylabel("Count")

# Hide the grid cells that were not assigned a budget group.
for unused in panels[len(exp.budget_groups):]:
    unused.set_visible(False)

fig.suptitle("Skill Lifespan Distributions (removed skills only)", fontsize=14, fontweight="bold", y=1.01)
fig.tight_layout()
plt.show()

## 3. Section Evolution

When do sections first appear? How do their sizes change over time?

In [None]:
# Section appearance timelines: for the first run of each budget group, draw a
# horizontal bar per section from its first appearance to the last trace at
# which it still holds at least one skill.
fig, axes = plt.subplots(2, 4, figsize=(20, 10), squeeze=False, sharex=True)
panels = axes.ravel()

for idx, group in enumerate(exp.budget_groups):
    ax = panels[idx]
    if not group.runs:
        continue
    run = group.runs[0]
    first_seen = section_first_appearance(run)
    size_series = section_sizes_over_time(run)
    # Sections in order of first appearance (earliest at the bottom row).
    ordered = sorted(first_seen, key=first_seen.__getitem__)

    for row, sec in enumerate(ordered):
        start = first_seen[sec]
        counts = size_series.get(sec, [])
        # Scan backwards for the last non-empty trace; fall back to `start`
        # when the section never has a positive size.
        end = next((t for t in reversed(range(len(counts))) if counts[t] > 0), start)
        ax.barh(row, end - start + 1, left=start, height=0.6,
                color=BUDGET_COLORS[group.budget_label], alpha=0.7)

    ax.set_yticks(range(len(ordered)))
    ax.set_yticklabels(ordered, fontsize=6)
    ax.set_xlabel("Trace")
    ax.set_title(f"{group.budget_label} (run 1)", fontsize=10)

# Hide the grid cells that were not assigned a budget group.
for unused in panels[len(exp.budget_groups):]:
    unused.set_visible(False)

fig.suptitle("Section Appearance Timelines", fontsize=14, fontweight="bold", y=1.01)
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of section sizes over traces for the first run of budget-3000:
# one row per section (sorted by name), one column per trace.
rep = exp.get_budget("budget-3000")
if rep and rep.runs:
    per_section = section_sizes_over_time(rep.runs[0])
    names = sorted(per_section)
    grid = np.array([per_section[name] for name in names])
    n_rows = len(names)

    # Figure height grows with the number of sections, with a floor of 4.
    fig, ax = plt.subplots(figsize=(14, max(4, n_rows * 0.4)))
    heat = ax.imshow(grid, aspect="auto", cmap="YlOrRd", interpolation="nearest")
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(names, fontsize=8)
    ax.set_xlabel("Trace")
    ax.set_title("Section Sizes — budget-3000/run_1")
    fig.colorbar(heat, ax=ax, label="Skills in section")
    fig.tight_layout()
    plt.show()

## 4. Cross-Run Convergence

Do independent runs converge on similar skill/section structures?

In [None]:
# Cross-run embedding overlap: for every budget group print the overall
# nearest-neighbour cosine statistics followed by one line per run pair.
print("Loading final embeddings and computing pairwise NN similarity...")
print()

for group in exp.budget_groups:
    overlap = cross_run_embedding_overlap(group)
    print(f"{group.budget_label}:")
    overall_mean = overlap["overall_mean_nn"]
    overall_std = overlap["overall_std_nn"]
    print(f"  Overall mean NN cosine: {overall_mean:.3f} ± {overall_std:.3f}")
    for pair in overlap["pairs"]:
        left, right, score = pair["run_i"], pair["run_j"], pair["mean_nn"]
        print(f"    run_{left} ↔ run_{right}: {score:.3f}")
    print()

In [None]:
# Core skill clustering per budget: bar height is each cluster's run coverage;
# clusters at or above the threshold are drawn in the budget colour, the rest
# in grey.
# Coverage level drawn as the "Core threshold" line. Presumably the same
# cut-off cluster_final_skills uses for `core_clusters` — confirm.
CORE_COVERAGE = 0.6

fig, axes = plt.subplots(2, 4, figsize=(18, 8), squeeze=False, sharey=True)

for idx, group in enumerate(exp.budget_groups):
    ax = axes[idx // 4][idx % 4]
    result = cluster_final_skills(group, n_clusters=10)
    if "error" in result:
        # Show the helper's error message in place of the chart.
        ax.text(0.5, 0.5, result["error"], ha="center", va="center", transform=ax.transAxes)
        ax.set_title(group.budget_label)
        continue

    coverage = [c["run_coverage"] for c in result["clusters"]]
    highlight = BUDGET_COLORS[group.budget_label]
    colors_bar = [highlight if cov >= CORE_COVERAGE else "#d1d5db" for cov in coverage]

    ax.bar(range(len(coverage)), coverage, color=colors_bar)
    ax.axhline(CORE_COVERAGE, color="red", linestyle="--", alpha=0.5, label="Core threshold")
    ax.set_xlabel("Cluster")
    if idx % 4 == 0:
        ax.set_ylabel("Run coverage")
    ax.set_title(f"{group.budget_label} ({result['core_clusters']} core)", fontsize=10)
    ax.set_ylim(0, 1.1)
    # The threshold line carries a label, so draw the legend that displays it.
    ax.legend(fontsize=7)

# Hide the grid cells that were not assigned a budget group.
for idx in range(len(exp.budget_groups), 8):
    axes[idx // 4][idx % 4].set_visible(False)

fig.suptitle("Skill Cluster Run Coverage (KMeans k=10)", fontsize=14, fontweight="bold", y=1.01)
fig.tight_layout()
plt.show()

## 5. Conciseness

Token efficiency: TOON tokens per skill and average content length over time.

In [None]:
# Conciseness over time: TOON tokens per skill and average content length,
# one mean ± std band per budget group.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, fn, ylabel, title in [
    (axes[0], tokens_per_skill, "Tokens/Skill", "TOON Tokens per Skill"),
    (axes[1], avg_content_lengths, "Avg Content Length (chars)", "Content Length per Skill"),
]:
    for group in exp.budget_groups:
        # Per-run series for this metric; runs that produced no data are dropped.
        series = [np.asarray(fn(r), dtype=float) for r in group.runs]
        series = [s for s in series if s.size]
        if not series:
            continue
        # Runs can differ in length (e.g. an incomplete run). Averaging a
        # ragged list raises in NumPy, so aggregate over the common prefix so
        # that every point is a statistic over the same set of runs.
        n = min(len(s) for s in series)
        stacked = np.vstack([s[:n] for s in series])
        mean = stacked.mean(axis=0)
        std = stacked.std(axis=0)
        trace_idx = np.arange(n)
        color = BUDGET_COLORS[group.budget_label]
        ax.plot(trace_idx, mean, color=color, linewidth=2, label=group.budget_label)
        ax.fill_between(trace_idx, mean - std, mean + std, color=color, alpha=0.1)
    ax.set_xlabel("Trace")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend(fontsize=7)

fig.tight_layout()
plt.show()

## 6. Cross-Budget Summary

Final metrics across all budgets.

In [None]:
import pandas as pd


def _mean_std(values, decimals):
    """Format *values* as "mean ± std" with the given number of decimals."""
    return f"{np.mean(values):.{decimals}f} ± {np.std(values):.{decimals}f}"


# One table row per budget group, built from the final value of each metric.
# Metric columns use runs that have metrics; next_id uses runs that have
# next_id history.
rows = []
for group in exp.budget_groups:
    with_metrics = [r for r in group.runs if r.metrics]
    rows.append({
        "Budget": group.budget_label,
        "Skills": _mean_std([skill_counts(r)[-1] for r in with_metrics], 1),
        "Sections": _mean_std([section_counts(r)[-1] for r in with_metrics], 1),
        "TOON Tokens": _mean_std([toon_token_counts(r)[-1] for r in with_metrics], 0),
        "next_id": _mean_std([r.next_ids[-1] for r in group.runs if r.next_ids], 0),
        "Tok/Skill": _mean_std([tokens_per_skill(r)[-1] for r in with_metrics], 1),
    })

df = pd.DataFrame(rows)
df

In [None]:
# Saturation plot: budget value vs final metrics (mean ± std over runs), one
# panel per metric, with the budget on a log x-axis.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Groups without a numeric budget (budget_value is None) cannot be placed on a
# log axis by value, so they are drawn at this x-position.
NO_BUDGET_X = 20000

for ax, fn, ylabel in [
    (axes[0], skill_counts, "Final Skills"),
    (axes[1], toon_token_counts, "Final TOON Tokens"),
    (axes[2], section_counts, "Final Sections"),
]:
    budget_vals = []
    means = []
    stds = []
    colors = []
    no_budget_points = []  # (x, y) of each group drawn at NO_BUDGET_X
    for group in exp.budget_groups:
        vals = [fn(r)[-1] for r in group.runs if r.metrics]
        unlimited = group.budget_value is None
        bv = NO_BUDGET_X if unlimited else group.budget_value
        mean = np.mean(vals)
        budget_vals.append(bv)
        means.append(mean)
        stds.append(np.std(vals))
        colors.append(BUDGET_COLORS[group.budget_label])
        if unlimited:
            no_budget_points.append((bv, mean))

    ax.errorbar(budget_vals, means, yerr=stds, fmt="o-", capsize=4)
    # Overdraw each point in its budget colour on top of the connecting line.
    for bv, m, c in zip(budget_vals, means, colors):
        ax.scatter([bv], [m], color=c, s=60, zorder=5)
    ax.set_xscale("log")
    ax.set_xlabel("Token Budget")
    ax.set_ylabel(ylabel)
    ax.set_title(ylabel)
    # Label the no-budget point at its own coordinates. The label is attached
    # to the group that was actually placed at NO_BUDGET_X rather than to
    # whichever group happens to be last, and is skipped if there is none.
    for point in no_budget_points:
        ax.annotate("no-budget", point, textcoords="offset points",
                    xytext=(0, 10), ha="center", fontsize=8)

fig.suptitle("Budget Saturation", fontsize=14, fontweight="bold", y=1.02)
fig.tight_layout()
plt.show()

## 8. Generate Report

Write `SKILLBOOK_HISTORY_ANALYSIS.md` with all computed values and save figures to `results/figures/`. The cell that does this follows the Section 7 compression analysis below.

## 7. Opus Compression: Individual Runs vs Consensus

How do Opus-compressed skillbooks compare across budgets?
All 35 individual runs are compressed, plus 7 consensus compressions (one per budget); the table and plots below compare raw and compressed sizes.

In [None]:
import pandas as pd

# Load compression metrics and their per-budget distribution summary.
comp_metrics = load_compression_metrics(EXPERIMENT_DIR)
comp_dist = compression_distribution(EXPERIMENT_DIR)


def _pm(mean, std, spec, suffix=""):
    """Render "mean ± std", formatting both numbers with *spec* plus *suffix*."""
    return f"{mean:{spec}}{suffix} ± {std:{spec}}{suffix}"


# Summary table: individual runs (raw → compressed) + consensus.
# Budgets with no distribution data are left out; missing consensus values
# are shown as empty strings.
rows = []
for budget in BUDGET_ORDER:
    d = comp_dist.get(budget)
    if d:
        consensus = comp_metrics.get(f"consensus_{budget}", {})
        rows.append({
            "Budget": budget,
            "Raw Skills": _pm(d["raw_skills_mean"], d["raw_skills_std"], ".1f"),
            "Raw MD Tokens": _pm(d["raw_md_tokens_mean"], d["raw_md_tokens_std"], ".0f"),
            "Opus Skills": _pm(d["skills_mean"], d["skills_std"], ".1f"),
            "Opus MD Tokens": _pm(d["md_tokens_tiktoken_mean"], d["md_tokens_tiktoken_std"], ".0f"),
            "Compression %": _pm(d["compression_pct_mean"], d["compression_pct_std"], ".1f", "%"),
            "Consensus Skills": consensus.get("skills", ""),
            "Consensus Tokens": consensus.get("md_tokens_tiktoken", ""),
        })

df_comp = pd.DataFrame(rows)
df_comp

In [None]:
# Compression distribution: 3 panels — skills, tokens, compression %
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Plot only budgets that have distribution data. The summary table skips
# budgets missing from comp_dist; indexing comp_dist[b] for them here would
# raise KeyError instead.
budgets = [b for b in BUDGET_ORDER if comp_dist.get(b)]
# "no-budget" has no numeric value in its label; it is placed at 20000 on the
# log axis. Other labels are parsed as "<prefix>-<tokens>".
budget_vals = [20000 if b == "no-budget" else int(b.split("-")[1]) for b in budgets]
consensus = [comp_metrics.get(f"consensus_{b}", {}) for b in budgets]
# Missing consensus values are plotted as NaN so matplotlib leaves a gap
# rather than drawing a point at 0 that would read as a real measurement.
MISSING = float("nan")


def _dist(key):
    """Return the comp_dist entry *key* for every plotted budget, in order."""
    return [comp_dist[b][key] for b in budgets]


def _plot_raw_vs_opus(ax, raw_stem, opus_stem, ylabel, title):
    """Draw raw, individually compressed and consensus series on *ax*.

    *raw_stem* and *opus_stem* are the comp_dist key prefixes whose "_mean"
    and "_std" entries are plotted; *opus_stem* is also the key read from each
    consensus metrics dict.
    """
    ax.errorbar(budget_vals, _dist(f"{raw_stem}_mean"), yerr=_dist(f"{raw_stem}_std"),
                fmt="o-", capsize=4, label="Raw (uncompressed)", color="#9ca3af")
    ax.errorbar(budget_vals, _dist(f"{opus_stem}_mean"), yerr=_dist(f"{opus_stem}_std"),
                fmt="o-", capsize=4, label="Opus (individual)", color="#2563eb")
    ax.plot(budget_vals, [c.get(opus_stem, MISSING) for c in consensus], "s--",
            color="#dc2626", label="Opus (consensus)", markersize=7)
    ax.set_xscale("log")
    ax.set_xlabel("Token Budget")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend(fontsize=7)


# Panel 1: Skills (raw → opus → consensus)
_plot_raw_vs_opus(axes[0], "raw_skills", "skills", "Skills", "Skill Counts")

# Panel 2: MD Tokens (raw → opus → consensus)
_plot_raw_vs_opus(axes[1], "raw_md_tokens", "md_tokens_tiktoken",
                  "Tiktoken Tokens", "MD Token Counts")

# Panel 3: Compression % (individual runs + consensus)
ax = axes[2]

# Consensus compression % = compressed / raw tokens, from the values stored in
# the consensus metrics; undefined (NaN) when either value is unavailable.
cons_comp = []
for c in consensus:
    raw = c.get("raw_md_tokens", 0)
    opus = c.get("md_tokens_tiktoken")
    cons_comp.append(opus / raw * 100 if raw > 0 and opus is not None else MISSING)

ax.errorbar(budget_vals, _dist("compression_pct_mean"), yerr=_dist("compression_pct_std"),
            fmt="o-", capsize=4, label="Individual runs (mean ± std)", color="#2563eb")
ax.plot(budget_vals, cons_comp, "s--", color="#dc2626", label="Consensus", markersize=7)
# Fixed reference line; the 45 is hard-coded, not computed from the data.
ax.axhline(45, color="#9ca3af", linestyle="--", alpha=0.5, label="~45% average")
ax.set_xscale("log")
ax.set_xlabel("Token Budget")
ax.set_ylabel("Compression %")
ax.set_title("Opus Compression Ratio (compressed/raw)")
ax.set_ylim(0, 70)
ax.legend(fontsize=7)

fig.suptitle("Opus Compression: Individual Runs vs Consensus", fontsize=14, fontweight="bold", y=1.02)
fig.tight_layout()
plt.show()

In [None]:
# Generate the report into the results directory, then print the path the
# helper returned and the figures directory beneath the results directory.
results_dir = ROOT / "results"
out = generate_report(exp, output_dir=results_dir)
print(f"Report written to: {out}")
print(f"Figures saved to: {results_dir / 'figures'}")