# 02 — Train LSA / LSI Embeddings

This notebook:
1) Loads standardized chunks from `data/texts_clean/chunks.jsonl`
2) Builds a TF-IDF matrix
3) Fits TruncatedSVD to learn an LSA/LSI semantic space
4) Produces an embedding vector for every chunk
5) Saves the model + embeddings for downstream novelty metrics

Outputs:
- `data/lsa/model.joblib`
- `data/lsa/chunk_embeddings.npy`
- `data/lsa/chunk_index.jsonl`
- `data/lsa/nearest_neighbors.jsonl` (optional)



## Imports + paths

In [None]:
from __future__ import annotations

import json
import re
import unicodedata
from pathlib import Path
from typing import List, Dict, Any, Tuple

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import joblib

from _paths import set_repo_root
# Repository root as reported by the project-local `_paths` helper.
ROOT = set_repo_root()

# Input: chunk records, one JSON object per line.
CHUNKS_IN = ROOT.joinpath("data", "texts_clean", "chunks.jsonl")

# Output directory for every LSA artifact; created up front so later cells can write freely.
OUT_DIR = ROOT.joinpath("data", "lsa")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Individual artifact paths inside OUT_DIR.
MODEL_OUT = OUT_DIR.joinpath("model.joblib")
EMB_OUT = OUT_DIR.joinpath("chunk_embeddings.npy")
INDEX_OUT = OUT_DIR.joinpath("chunk_index.jsonl")
NN_OUT = OUT_DIR.joinpath("nearest_neighbors.jsonl")

print("Input:", CHUNKS_IN.resolve())
print("Output dir:", OUT_DIR.resolve())


## Load chunks


In [None]:
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file into a list of parsed rows.

    Blank (whitespace-only) lines are skipped. Rows are returned in file order.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        One parsed JSON value per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The message
            is prefixed with ``<path>:<line number>`` so the offending line can
            be located in the file.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        # 1-based physical line numbers (blank lines included) for error messages.
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, so existing handlers keep working;
                # only the message gains file/line context.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return rows

chunks = read_jsonl(CHUNKS_IN)
print("Loaded chunks:", len(chunks))

# Fail early with a clear message: everything downstream (the example print
# below, TF-IDF, SVD) assumes at least one chunk.
if not chunks:
    raise ValueError(f"No chunks found in {CHUNKS_IN}; nothing to embed.")

# Minimal validation: every row must carry the fields used later in this notebook.
required = {"chunk_id", "doc_id", "title", "chunk_index", "chunk_type", "text"}
missing = [i for i, r in enumerate(chunks) if not required.issubset(r.keys())]
if missing:
    # Row indices count parsed (non-blank) rows, not physical file lines.
    absent = sorted(required.difference(chunks[missing[0]].keys()))
    raise ValueError(
        f"Some rows are missing required keys. Example bad row index: {missing[0]} "
        f"(missing keys: {absent}; {len(missing)} bad row(s) in total)"
    )

# Sort deterministically: by title, then doc_id, then chunk_index, then chunk_id.
# This order defines the row order of the embedding matrix and the saved index.
chunks = sorted(chunks, key=lambda r: (r["title"], r["doc_id"], r["chunk_index"], r["chunk_id"]))
texts = [r["text"] for r in chunks]
print("Example chunk:\n", texts[0][:400])


## Text normalization for vectorization

We keep this light: lowercasing, collapsing whitespace, and replacing punctuation (other than apostrophes and hyphens) with spaces. Don't over-clean (LSA wants distributional signal).

In [None]:
# Runs of any whitespace (collapsed to a single space).
RE_WS = re.compile(r"\s+")
# Anything that is not an ASCII letter/digit, apostrophe, hyphen, or space.
RE_NONWORD_SPACE = re.compile(r"[^a-zA-Z0-9'\- ]+")

def normalize_for_tfidf(s: str) -> str:
    """Lightly normalize a chunk of text before TF-IDF vectorization.

    Steps, in order:
      1. Unicode-fold: NFKD-decompose and drop combining marks, so accented
         letters become their base letter ("café" -> "cafe") and compatibility
         characters such as NBSP become their plain equivalents.
      2. Lowercase and strip.
      3. Replace every run of characters other than ASCII letters, digits,
         apostrophes, hyphens, and spaces with a single space.
      4. Collapse whitespace and strip again.

    Step 1 must happen before step 3: the ASCII-only regex would otherwise
    replace accented letters with spaces and split words ("café" -> "caf"),
    which also prevents the vectorizer's ``strip_accents="unicode"`` setting
    from ever seeing them.

    Args:
        s: Raw chunk text.

    Returns:
        The normalized text; may be empty if nothing survives.
    """
    # Fold accents / compatibility forms to base characters first.
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower().strip()
    s = s.replace("\u00a0", " ")  # NBSP (already folded by NFKD; kept as an explicit safeguard)
    s = RE_WS.sub(" ", s)
    # Keep apostrophes and hyphens; drop other punctuation to reduce feature sparsity
    s = RE_NONWORD_SPACE.sub(" ", s)
    s = RE_WS.sub(" ", s).strip()
    return s

# Normalize every chunk text, keeping the same order as `texts` / `chunks`.
texts_norm = list(map(normalize_for_tfidf, texts))
print("Normalized example:\n", texts_norm[0][:300])


## Configure TF-IDF + SVD

These defaults are good for small-to-medium corpora. Tune later.


In [None]:
# --- TF-IDF settings ---
TFIDF_MIN_DF = 2        # min_df: ignore very rare terms (noise)
TFIDF_MAX_DF = 0.9      # max_df: ignore extremely common terms (near stopwords)
NGRAM_RANGE = (1, 2)    # ngram_range: (1,2) helps capture short phrases without going wild
MAX_FEATURES = 50_000   # cap for stability

# --- SVD settings ---
N_COMPONENTS = 200      # 100–400 typical; depends on corpus size
RANDOM_STATE = 42       # seed passed to the SVD for reproducible results

# Echo the effective configuration so it is recorded in the cell output.
print(
    "TF-IDF:",
    {
        "min_df": TFIDF_MIN_DF,
        "max_df": TFIDF_MAX_DF,
        "ngram_range": NGRAM_RANGE,
        "max_features": MAX_FEATURES,
    },
)
print("SVD:", {"n_components": N_COMPONENTS})


## Fit TF-IDF


In [None]:
# Collect the vectorizer options in one place, then build the vectorizer from them.
tfidf_kwargs = {
    "min_df": TFIDF_MIN_DF,
    "max_df": TFIDF_MAX_DF,
    "ngram_range": NGRAM_RANGE,
    "max_features": MAX_FEATURES,
    "strip_accents": "unicode",
}
vectorizer = TfidfVectorizer(**tfidf_kwargs)

# Learn the vocabulary / IDF weights and vectorize the corpus in one pass.
# Row i of X_tfidf corresponds to texts_norm[i].
X_tfidf = vectorizer.fit_transform(texts_norm)
print("TF-IDF shape:", X_tfidf.shape)

# Quick feature sanity check
feat_names = vectorizer.get_feature_names_out()
print("Num features:", len(feat_names))
print("Sample features:", feat_names[:20])


## Fit LSA (TruncatedSVD) and create embeddings

Classic LSA uses SVD on TF-IDF (or term counts). We normalize embeddings to unit length for cosine similarity.

In [None]:
# Choose the number of LSA dimensions. The TF-IDF matrix has rank at most
# min(n_chunks, n_features), so requesting more components than that cannot add
# information; the original n_features - 1 cap is kept as well.
n_chunks, n_features = X_tfidf.shape
n_components = min(N_COMPONENTS, n_features - 1, n_chunks)
if n_components < 1:
    raise ValueError(
        f"Cannot fit SVD: TF-IDF matrix has shape {X_tfidf.shape}; "
        "need at least 1 chunk and 2 features. Relax min_df/max_df or add more text."
    )
if n_components < N_COMPONENTS:
    print(f"Note: reducing n_components from {N_COMPONENTS} to {n_components} to fit the corpus size.")

# Chunks whose terms were all filtered out (min_df / max_df / max_features) have
# an all-zero TF-IDF row. They carry no signal for the semantic space, so
# surface the count instead of letting them pass silently.
n_empty_rows = int((X_tfidf.getnnz(axis=1) == 0).sum())
if n_empty_rows:
    print(f"Warning: {n_empty_rows} chunk(s) have no in-vocabulary terms and carry no TF-IDF signal.")

svd = TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)

# Project to the LSA space, then normalize rows to unit length
# (so dot product ~ cosine similarity). Shape: [n_chunks, n_components]
Z = normalize(svd.fit_transform(X_tfidf), norm="l2")

explained = float(np.sum(svd.explained_variance_ratio_))
print("Embeddings shape:", Z.shape)
print("Explained variance ratio (sum):", round(explained, 4))


## Save model + embeddings + index

In [None]:
# Guard against hidden notebook state: the index written below is only useful if
# row i of Z is the embedding of chunks[i]. Re-running the load cell without
# re-running the SVD cell would break that silently.
if len(chunks) != Z.shape[0]:
    raise ValueError(
        f"chunks ({len(chunks)}) and embeddings ({Z.shape[0]}) are out of sync; "
        "re-run the notebook from the top."
    )

# Save model as a dict so you can load everything cleanly
model = {
    "vectorizer": vectorizer,
    "svd": svd,
    "settings": {
        "tfidf_min_df": TFIDF_MIN_DF,
        "tfidf_max_df": TFIDF_MAX_DF,
        "ngram_range": NGRAM_RANGE,
        "max_features": MAX_FEATURES,
        # Actual embedding width, which may be smaller than N_COMPONENTS.
        "n_components": int(Z.shape[1]),
        "random_state": RANDOM_STATE,
    },
}

joblib.dump(model, MODEL_OUT)
np.save(EMB_OUT, Z)

# Save aligned metadata index (one JSON per row, same row order as embeddings)
with INDEX_OUT.open("w", encoding="utf-8") as f:
    for r in chunks:
        meta = {
            "chunk_id": r["chunk_id"],
            "doc_id": r["doc_id"],
            "title": r["title"],
            "chunk_index": r["chunk_index"],
            "chunk_type": r["chunk_type"],
            # Optional fields: written as null when the chunk record lacks them.
            "n_chars": r.get("n_chars", None),
            "n_tokens_approx": r.get("n_tokens_approx", None),
        }
        f.write(json.dumps(meta, ensure_ascii=False) + "\n")

print("Saved:")
print("-", MODEL_OUT, f"({MODEL_OUT.stat().st_size} bytes)")
print("-", EMB_OUT, f"({EMB_OUT.stat().st_size} bytes)")
print("-", INDEX_OUT, f"({INDEX_OUT.stat().st_size} bytes)")


### Quick qualitative check: nearest neighbors (optional but fun)

This helps validate that the semantic space is doing something reasonable.

In [None]:
def top_neighbors(Z: np.ndarray, idx: int, k: int = 5) -> List[Tuple[int, float]]:
    """Return the ``k`` rows of ``Z`` most similar to row ``idx``.

    Similarity is the dot product ``Z @ Z[idx]``, which equals cosine similarity
    when the rows of ``Z`` are unit-normalized.

    The query row itself is never returned, even when ``k`` is larger than the
    number of other rows (in which case fewer than ``k`` results come back).
    Ties are broken by ascending row index.

    Args:
        Z: 2-D array of row vectors.
        idx: Index of the query row (negative indices are allowed).
        k: Maximum number of neighbours to return; values <= 0 yield ``[]``.

    Returns:
        ``(row_index, similarity)`` pairs, most similar first.

    Raises:
        IndexError: If ``idx`` is out of range for ``Z``.
    """
    # astype(float) also guarantees a fresh array that is independent of Z.
    sims = (Z @ Z[idx]).astype(float)
    # Z[idx] above has already validated idx, so the modulo only resolves
    # negative indices to their positive position.
    self_pos = idx % Z.shape[0]
    # Stable sort gives a deterministic order for equal similarities.
    order = np.argsort(-sims, kind="stable")
    # Exclude the query explicitly rather than with a sentinel similarity:
    # a sentinel such as -1.0 is itself a valid cosine value and lets the query
    # leak into the result when k >= number of rows.
    order = order[order != self_pos][: max(k, 0)]
    return [(int(i), float(sims[i])) for i in order]

# Pick a few example indices: the first chunk encountered for each of the
# first five distinct documents (in the sorted chunk order).
example_indices = []
seen_docs = set()
for i, r in enumerate(chunks):
    doc = r["doc_id"]
    if doc in seen_docs:
        continue
    seen_docs.add(doc)
    example_indices.append(i)
    if len(example_indices) >= 5:
        break

# For each example, print a short preview of the query text and its neighbours.
for idx in example_indices:
    r = chunks[idx]
    full_text = r["text"]
    suffix = "..." if len(full_text)>200 else ""
    preview = full_text[:200].replace("\n", " ") + suffix
    print("\nQUERY:", r["title"], "chunk", r["chunk_index"])
    print("TEXT:", preview)
    for j, s in top_neighbors(Z, idx, k=5):
        rr = chunks[j]
        print("  NN:", f"{s:.3f}", "|", rr["title"], "chunk", rr["chunk_index"])


### Save neighbors to JSONL (optional)

In [None]:
# Toggle for writing the neighbour file, and how many neighbours to keep per chunk.
SAVE_NEIGHBORS = True
K = 5

if SAVE_NEIGHBORS:
    with NN_OUT.open("w", encoding="utf-8") as f:
        # One JSON line per chunk, in the same row order as the embeddings.
        for i, r in enumerate(chunks):
            neighbor_records = []
            for j, s in top_neighbors(Z, i, k=K):
                nb = chunks[j]
                neighbor_records.append(
                    {
                        "neighbor_chunk_id": nb["chunk_id"],
                        "neighbor_title": nb["title"],
                        "neighbor_chunk_index": nb["chunk_index"],
                        "cosine_sim": s,
                    }
                )
            out = {
                "chunk_id": r["chunk_id"],
                "title": r["title"],
                "chunk_index": r["chunk_index"],
                "neighbors": neighbor_records,
            }
            f.write(json.dumps(out, ensure_ascii=False) + "\n")

    print("Wrote neighbors:", NN_OUT, f"({NN_OUT.stat().st_size} bytes)")


## Next notebook: 03_semantic_novelty.ipynb

We now have normalized LSA embeddings `chunk_embeddings.npy` aligned with `chunk_index.jsonl`.

Next we’ll compute novelty curves, e.g.:

- per document:
  - novelty of chunk t = 1 - cos( embedding(t), centroid(0..t-1) )
  - or novelty relative to a trailing window (e.g. last 5 chunks)

Then we can compare:
- decodable vs authentic sets
- curriculum A vs curriculum B
- "semantic bandwidth" over time
