# 03 — Semantic Novelty

This notebook:
1) Loads LSA embeddings from `data/lsa/chunk_embeddings.npy`
2) Loads aligned metadata from `data/lsa/chunk_index.jsonl`
3) Computes novelty metrics per chunk:
   - novelty vs cumulative history centroid
   - novelty vs trailing window centroid
   - novelty vs previous chunk
4) Aggregates novelty into per-document summary + novelty curve outputs

Outputs:
- `data/lsa/novelty_per_chunk.jsonl`
- `data/lsa/novelty_summary_per_doc.jsonl`
- `data/lsa/novelty_curves_per_doc.jsonl`

Interpretation:
- Low novelty over time suggests low semantic information rate (high predictability)
- High novelty spikes often indicate topic shifts or introduction of new ideas


## Imports + paths

In [None]:
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional

import numpy as np

from _paths import set_repo_root
# Anchor every path on the value returned by the project helper `set_repo_root`
# (presumably the repository root as a Path — confirm in `_paths`).
ROOT = set_repo_root()

# Inputs: the embedding matrix and its row-aligned metadata index.
EMB_IN = ROOT / "data" / "lsa" / "chunk_embeddings.npy"
INDEX_IN = ROOT / "data" / "lsa" / "chunk_index.jsonl"

# Outputs are written next to the inputs; create the directory if it is missing.
OUT_DIR = ROOT / "data" / "lsa"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One file per granularity: per-chunk scores, per-document summary stats,
# and per-document curves for plotting.
NOVELTY_CHUNKS_OUT = OUT_DIR / "novelty_per_chunk.jsonl"
NOVELTY_DOC_SUMMARY_OUT = OUT_DIR / "novelty_summary_per_doc.jsonl"
NOVELTY_CURVES_OUT = OUT_DIR / "novelty_curves_per_doc.jsonl"

# Show the resolved input locations so a wrong repo root is easy to spot.
print("Embeddings:", EMB_IN.resolve())
print("Index:", INDEX_IN.resolve())


## Load embeddings + aligned metadata

In [None]:
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """
    Parse a JSON Lines file into a list of objects.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped, every other line is decoded with
    ``json.loads``. The file is read as UTF-8.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

# Load the embedding matrix and the metadata rows that describe it.
# Row i of Z and meta[i] are expected to refer to the same chunk.
Z = np.load(EMB_IN)  # shape [n_chunks, d], should already be L2-normalized
meta = read_jsonl(INDEX_IN)

# Fail fast if the two files are out of sync: everything below indexes
# Z and meta with the same row number.
if len(meta) != Z.shape[0]:
    raise ValueError(f"Mismatch: {len(meta)} metadata rows vs {Z.shape[0]} embeddings")

# Quick sanity output (note: meta[0] assumes at least one chunk was loaded).
print("Loaded:", Z.shape[0], "chunks with dim", Z.shape[1])
print("Example meta:", meta[0])


### Helper: group chunks by document (in order)

In [None]:
# Map each document to the row positions of its chunks, and remember the
# title / chunk_type taken from the first row seen for that document.
doc_to_indices: Dict[str, List[int]] = {}
doc_info: Dict[str, Dict[str, Any]] = {}

for row_pos, row in enumerate(meta):
    key = row["doc_id"]
    info = {"title": row["title"], "chunk_type": row["chunk_type"]}
    doc_to_indices.setdefault(key, []).append(row_pos)
    doc_info.setdefault(key, info)

# Upstream should already be ordered, but enforce reading order per document
# by sorting each position list on the chunk's own chunk_index.
for positions in doc_to_indices.values():
    positions.sort(key=lambda pos: meta[pos]["chunk_index"])

print("Documents:", len(doc_to_indices))


## Novelty definitions

Because embeddings are unit-normalized, cosine similarity is just dot product.

We compute three novelty signals:

1) **Cumulative novelty**: novelty of chunk t vs centroid(0..t-1)
2) **Window novelty**: novelty vs centroid(t-w..t-1)
3) **Delta novelty**: novelty vs chunk t-1

All three signals are computed as 1 − cosine similarity, so their values lie in [0, 2] (typically within [0, 1.2]). Lower = more predictable/redundant.

In [None]:
def l2_normalize(v: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """
    Scale ``v`` to unit Euclidean length.

    When the norm of ``v`` is below ``eps`` the input is returned unchanged
    (the same object), which avoids dividing by a near-zero norm.
    """
    norm = np.linalg.norm(v)
    return v if norm < eps else v / norm

def novelty_vs_vector(z_t: np.ndarray, ref: np.ndarray) -> float:
    """
    Novelty of ``z_t`` relative to a reference vector.

    The reference is L2-normalized first; ``z_t`` is used as given. The
    result is ``1 - dot(z_t, normalized_ref)``, which equals one minus the
    cosine similarity when ``z_t`` itself has unit length.
    """
    unit_ref = l2_normalize(ref)
    similarity = np.dot(z_t, unit_ref)
    return float(1.0 - similarity)

def compute_doc_novelty(
    Z: np.ndarray,
    indices: List[int],
    window: int = 5,
) -> List[Dict[str, float]]:
    """
    Compute per-chunk novelty scores for one document.

    Args:
      Z: embedding matrix; ``Z[i]`` is the embedding of chunk row ``i``.
      indices: rows of ``Z`` belonging to this document, in reading order.
      window: how many preceding chunks form the trailing-window centroid.
        Must be >= 1.

    Returns list of dicts aligned to each chunk in this doc:
      - novelty_cum: vs centroid of all previous chunks
      - novelty_win: vs centroid of trailing window
      - novelty_prev: vs previous chunk
    The first chunk has no history, so all three values are NaN for it.

    Raises:
      ValueError: if ``window`` is smaller than 1. (A window of 0 used to
        select the whole history via ``[-0:]``, and a negative window
        dropped the oldest vectors, both silently.)
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    from collections import deque

    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    out: List[Dict[str, float]] = []
    prev_vec: Optional[np.ndarray] = None

    # Running sum of all previous vectors: centroid is cum_sum / t.
    cum_sum = np.zeros(Z.shape[1], dtype=np.float64)

    # Only the last `window` vectors are ever read, so keep just those;
    # the deque discards older entries automatically.
    recent: deque = deque(maxlen=window)

    for t, idx in enumerate(indices):
        z = Z[idx].astype(np.float64)

        if t == 0:
            # No history yet: novelty is undefined for the first chunk.
            novelty_cum = float("nan")
            novelty_win = float("nan")
            novelty_prev = float("nan")
        else:
            # cumulative centroid is average of the t previous vectors
            novelty_cum = novelty_vs_vector(z, cum_sum / t)

            # window centroid: average of up to `window` most recent vectors
            win_centroid = np.mean(list(recent), axis=0)
            novelty_win = novelty_vs_vector(z, win_centroid)

            # previous chunk novelty
            novelty_prev = novelty_vs_vector(z, prev_vec)

        out.append({
            "novelty_cum": novelty_cum,
            "novelty_win": novelty_win,
            "novelty_prev": novelty_prev,
        })

        # Update state AFTER computing novelty so a chunk never sees itself.
        cum_sum += z
        recent.append(z)
        prev_vec = z

    return out


## Compute novelty for all docs

In [None]:
# Number of preceding chunks averaged for the novelty_win signal.
WINDOW = 5

# One output row per chunk: the chunk's metadata merged with its novelty
# scores (score keys win if a metadata key has the same name).
novelty_rows: List[Dict[str, Any]] = []

for positions in doc_to_indices.values():
    scores_per_chunk = compute_doc_novelty(Z, positions, window=WINDOW)
    novelty_rows.extend(
        {**meta[pos], **scores}
        for pos, scores in zip(positions, scores_per_chunk)
    )

print("Novelty rows:", len(novelty_rows))
print("Example novelty row:", {k: novelty_rows[0][k] for k in ["title","chunk_index","novelty_cum","novelty_win","novelty_prev"]})


## Summaries per document

We compute stable summary stats excluding the first chunk (which is NaN by definition).

In [None]:
def finite(vals: List[float]) -> np.ndarray:
    """Return ``vals`` as a float64 array with NaN and +/-inf entries removed."""
    as_float = np.array(vals, dtype=np.float64)
    keep = np.isfinite(as_float)
    return as_float[keep]

def summarize(values: np.ndarray) -> Dict[str, float]:
    """
    Summary statistics for a 1-D array of scores.

    Returns a dict with keys ``mean``, ``median``, ``p10`` and ``p90``
    (10th / 90th percentile) as plain floats. For an empty array every
    statistic is NaN.
    """
    stat_names = ("mean", "median", "p10", "p90")
    if values.size == 0:
        return dict.fromkeys(stat_names, float("nan"))

    average = float(np.mean(values))
    middle = float(np.median(values))
    low_tail = float(np.percentile(values, 10))
    high_tail = float(np.percentile(values, 90))
    return dict(zip(stat_names, (average, middle, low_tail, high_tail)))

def _finite_or_none(value: float) -> Optional[float]:
    """Return ``value`` as a plain float, or None when it is NaN/inf (so it serialises as JSON null)."""
    return float(value) if np.isfinite(value) else None


# Group the per-chunk rows by document ONCE. Filtering novelty_rows inside
# the per-document loop would rescan every row for every document.
rows_by_doc: Dict[str, List[Dict[str, Any]]] = {}
for r in novelty_rows:
    rows_by_doc.setdefault(r["doc_id"], []).append(r)

doc_summaries: List[Dict[str, Any]] = []
doc_curves: List[Dict[str, Any]] = []

# Iterate in doc_to_indices order so output order matches the grouping cell.
for doc_id in doc_to_indices:
    rows = sorted(rows_by_doc.get(doc_id, []), key=lambda r: r["chunk_index"])

    # Drop non-finite values (the first chunk of each doc is NaN) before summarising.
    nov_cum = finite([r["novelty_cum"] for r in rows])
    nov_win = finite([r["novelty_win"] for r in rows])
    nov_prev = finite([r["novelty_prev"] for r in rows])

    title = doc_info[doc_id]["title"]
    chunk_type = doc_info[doc_id]["chunk_type"]
    n_chunks = len(rows)

    doc_summaries.append({
        "doc_id": doc_id,
        "title": title,
        "chunk_type": chunk_type,
        "n_chunks": n_chunks,
        "window": WINDOW,
        "novelty_cum": summarize(nov_cum),
        "novelty_win": summarize(nov_win),
        "novelty_prev": summarize(nov_prev),
    })

    # Save full curve values for plotting later; undefined points become None.
    doc_curves.append({
        "doc_id": doc_id,
        "title": title,
        "chunk_type": chunk_type,
        "window": WINDOW,
        "curve": [
            {
                "chunk_index": int(r["chunk_index"]),
                "novelty_cum": _finite_or_none(r["novelty_cum"]),
                "novelty_win": _finite_or_none(r["novelty_win"]),
                "novelty_prev": _finite_or_none(r["novelty_prev"]),
            }
            for r in rows
        ],
    })

print("Doc summaries:", len(doc_summaries))
if doc_summaries:  # nothing to preview when there are no documents
    print("Example summary:", doc_summaries[0])


## Save outputs (JSONL)

In [None]:
def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
    """
    Write ``rows`` to ``path`` as UTF-8 JSON Lines (one object per line).

    Non-finite floats (NaN, +inf, -inf) anywhere inside a row are written as
    ``null``. ``json.dumps`` would otherwise emit the bare tokens ``NaN`` /
    ``Infinity``, which are not valid JSON and are rejected by strict
    parsers. Rows are not modified in place; an existing file is overwritten.
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    import math

    def _sanitize(value: Any) -> Any:
        # np.float64 subclasses float, so numpy scalars from the metrics are covered.
        if isinstance(value, float):
            return value if math.isfinite(value) else None
        if isinstance(value, dict):
            return {k: _sanitize(v) for k, v in value.items()}
        if isinstance(value, (list, tuple)):
            return [_sanitize(v) for v in value]
        return value

    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(_sanitize(r), ensure_ascii=False) + "\n")

# Destination -> payload, in the order the files are written and reported.
_outputs = (
    (NOVELTY_CHUNKS_OUT, novelty_rows),
    (NOVELTY_DOC_SUMMARY_OUT, doc_summaries),
    (NOVELTY_CURVES_OUT, doc_curves),
)

for _dest, _payload in _outputs:
    write_jsonl(_dest, _payload)

# Report each file with its size on disk as a quick sanity check.
print("Wrote:")
for _dest, _ in _outputs:
    print("-", _dest, f"({_dest.stat().st_size} bytes)")


## Fun diagnostic: most/least novel chunks per document

This is where you get "semantic entropy collapse" examples.

In [None]:
# OPTIONAL: the index jsonl deliberately carries no chunk text, so previews
# need the cleaned chunks file re-loaded here.
CHUNKS_TEXT_IN = ROOT / "data" / "texts_clean" / "chunks.jsonl"
chunks_with_text = read_jsonl(CHUNKS_TEXT_IN)

# chunk_id -> text; if a chunk_id repeats, the last occurrence wins.
text_lookup = {}
for _rec in chunks_with_text:
    text_lookup[_rec["chunk_id"]] = _rec["text"]

def preview_chunk(chunk_id: str, n: int = 240) -> str:
    """
    One-line preview of a chunk's text.

    Looks the chunk up in ``text_lookup`` (empty string if unknown), replaces
    newlines with spaces, and cuts the result to ``n`` characters, appending
    "..." when the text was longer than ``n``.
    """
    flat = text_lookup.get(chunk_id, "").replace("\n", " ")
    suffix = "..." if len(flat) > n else ""
    return flat[:n] + suffix

# Bucket the per-chunk rows by their document.
by_doc: Dict[str, List[Dict[str, Any]]] = {}
for row in novelty_rows:
    by_doc.setdefault(row["doc_id"], []).append(row)

# Only print the first few documents that have enough scored chunks.
DOCS_TO_SHOW = 3
shown = 0

for doc_id, doc_rows in by_doc.items():
    title = doc_info[doc_id]["title"]

    # Keep chunks with a defined window novelty (the first chunk is NaN).
    scored = [r for r in doc_rows if np.isfinite(r["novelty_win"])]
    if len(scored) < 4:
        continue

    # Ascending by window novelty: head = most predictable, tail = most novel.
    ranked = sorted(scored, key=lambda r: r["novelty_win"])
    low, high = ranked[:2], ranked[-2:]

    print("\n" + "="*80)
    print("DOC:", title, "| chunks:", len(doc_to_indices[doc_id]), "| window:", WINDOW)

    print("\nLowest novelty (most predictable):")
    for r in low:
        print(f"  win={r['novelty_win']:.3f}  idx={r['chunk_index']}  {preview_chunk(r['chunk_id'])}")

    print("\nHighest novelty (most new meaning):")
    for r in reversed(high):  # most novel chunk first
        print(f"  win={r['novelty_win']:.3f}  idx={r['chunk_index']}  {preview_chunk(r['chunk_id'])}")

    shown += 1
    if shown >= DOCS_TO_SHOW:
        break


### "Boredom risk" heuristic label (optional)

Not a diagnosis: just a materials-side label for quick comparison.

In [None]:
def label_bandwidth(mean_novelty: float) -> str:
    """
    Map a mean novelty score to a coarse bandwidth label.

    Non-finite input gives "unknown". Otherwise the score falls into the
    first band whose (exclusive) upper bound it is below; anything at or
    above the last bound is "high_bandwidth". The thresholds are purely
    heuristic and should be tuned against the observed data distribution.
    Lower mean novelty -> lower bandwidth -> higher boredom risk.
    """
    if not np.isfinite(mean_novelty):
        return "unknown"

    bands = (
        (0.10, "very_low_bandwidth"),
        (0.18, "low_bandwidth"),
        (0.28, "medium_bandwidth"),
    )
    for upper_bound, label in bands:
        if mean_novelty < upper_bound:
            return label
    return "high_bandwidth"

# Copy each document summary and attach a label derived from its mean
# window novelty; the original doc_summaries entries are left untouched.
labeled = [
    {**summary, "bandwidth_label": label_bandwidth(summary["novelty_win"]["mean"])}
    for summary in doc_summaries
]

# Labeled summaries go to their own file rather than replacing the unlabeled one.
LABELED_OUT = OUT_DIR / "novelty_summary_per_doc_labeled.jsonl"
write_jsonl(LABELED_OUT, labeled)

print("Wrote labeled summary:", LABELED_OUT, f"({LABELED_OUT.stat().st_size} bytes)")
print("Example labeled row:", labeled[0]["title"], labeled[0]["novelty_win"]["mean"], labeled[0]["bandwidth_label"])


## Next notebooks

**04_redundancy_metrics.ipynb**
- compression ratios (gzip)
- character n-gram entropy
- lexical repetition baselines

**05_contextual_diversity.ipynb**
- contextual diversity of words/ideas across passages

Then:
- a Streamlit app that produces a one-page "Boredom Report"
  combining novelty + redundancy + diversity.
