# 06 — Build the Boredom Report Table (Per Document)

This notebook joins outputs from:
- 03 semantic novelty
- 04 redundancy metrics
- 05 contextual diversity

…into a single per-document report table.

Outputs:
- `data/report/boredom_report_per_doc.jsonl`
- `data/report/boredom_report_per_doc.csv`

Important:
This report describes properties of **materials**, not students.
It does not diagnose boredom.


## Imports + paths

In [None]:
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Any, List, Optional

import numpy as np
import pandas as pd

from _paths import set_repo_root
ROOT = set_repo_root()

# Input files produced by the earlier notebooks (03 / 04 / 05).
# The labeled novelty summary is preferred; the unlabeled one is the fallback
# used when the labeled file yields no records.
NOVELTY_IN = ROOT.joinpath("data", "lsa", "novelty_summary_per_doc_labeled.jsonl")
NOVELTY_FALLBACK_IN = ROOT.joinpath("data", "lsa", "novelty_summary_per_doc.jsonl")

REDUNDANCY_IN = ROOT.joinpath("data", "redundancy", "redundancy_summary_per_doc.jsonl")
DIVERSITY_IN = ROOT.joinpath("data", "diversity", "doc_contextual_diversity_summary.jsonl")

# Report output directory (created on demand, including missing parents).
OUT_DIR = ROOT.joinpath("data", "report")
OUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_JSONL = OUT_DIR.joinpath("boredom_report_per_doc.jsonl")
OUT_CSV = OUT_DIR.joinpath("boredom_report_per_doc.csv")

print("Output dir:", OUT_DIR.resolve())


## Read JSONL helper

In [None]:
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """
    Load a JSON Lines file into a list of parsed records.

    A path that does not exist yields an empty list. Lines that are blank
    after stripping whitespace are skipped; every other line is parsed with
    json.loads, so a malformed line raises json.JSONDecodeError.
    """
    if not path.exists():
        return []
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

# Load the novelty summary, preferring the labeled file.
nov = read_jsonl(NOVELTY_IN)
if not nov:
    # read_jsonl returns [] both for a missing file and for a file without
    # records, so report which of the two actually happened.
    reason = "not found" if not NOVELTY_IN.exists() else "empty"
    print(f"Labeled novelty file {reason}; using fallback.")
    nov = read_jsonl(NOVELTY_FALLBACK_IN)

red = read_jsonl(REDUNDANCY_IN)
div = read_jsonl(DIVERSITY_IN)

print("Loaded:")
print(" novelty:", len(nov))
print(" redundancy:", len(red))
print(" diversity:", len(div))

# Surface empty inputs here: an empty list becomes a column-less DataFrame,
# which otherwise only shows up later as a "missing doc_id" error.
for _label, _rows, _src in (
    ("novelty", nov, NOVELTY_FALLBACK_IN),
    ("redundancy", red, REDUNDANCY_IN),
    ("diversity", div, DIVERSITY_IN),
):
    if not _rows:
        print(f"WARNING: no {_label} records loaded (last tried: {_src})")


## Normalize nested dict columns into flat columns

Novelty and redundancy summaries have nested stat dicts; flatten them.

In [None]:
def flatten_stats(prefix: str, obj: Any, out: Dict[str, Any]) -> None:
    """
    Write `obj` into `out`, expanding summary-stat dicts into separate keys.

    When `obj` is a dict containing at least the keys mean/median/p10/p90,
    every item of it (including any extra keys) is stored as
    `{prefix}_{key}`. Anything else is stored unchanged under `prefix`.
    `out` is mutated in place; nothing is returned.
    """
    required = {"mean", "median", "p10", "p90"}
    is_stat_dict = isinstance(obj, dict) and required.issubset(obj.keys())
    if not is_stat_dict:
        out[prefix] = obj
        return
    out.update({f"{prefix}_{stat}": value for stat, value in obj.items()})

def flatten_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a flat copy of `row` suitable for a DataFrame row.

    - Non-dict values are copied as-is.
    - Dict values holding at least mean/median/p10/p90 are expanded into
      `{key}_{stat}` entries, one per item of the nested dict.
    - Any other dict is serialized to a JSON string (non-ASCII preserved)
      so its content is not lost.
    """
    stat_keys = {"mean", "median", "p10", "p90"}
    flat: Dict[str, Any] = {}
    for name, value in row.items():
        if not isinstance(value, dict):
            flat[name] = value
            continue
        if stat_keys.issubset(value.keys()):
            flat.update({f"{name}_{stat}": inner for stat, inner in value.items()})
        else:
            flat[name] = json.dumps(value, ensure_ascii=False)
    return flat

# One DataFrame per input, each built from the flattened records.
nov_df = pd.DataFrame(list(map(flatten_row, nov)))
red_df = pd.DataFrame(list(map(flatten_row, red)))
div_df = pd.DataFrame(list(map(flatten_row, div)))

# Peek at the first 20 column names of each table.
for _frame in (nov_df, red_df, div_df):
    print(_frame.columns.tolist()[:20])


## Join into a single per-doc table

We join on `doc_id`. Titles should match; we'll keep the leftmost.

In [None]:
# Ensure doc_id exists in every input table before joining on it.
for name, frame in [("nov", nov_df), ("red", red_df), ("div", div_df)]:
    if "doc_id" not in frame.columns:
        raise ValueError(f"{name} table missing doc_id")

# Left joins: the novelty table defines which documents appear in the report.
# Columns that collide with the left side get a "_red" / "_div" suffix.
df = nov_df.merge(red_df, on=["doc_id"], how="left", suffixes=("", "_red"))
df = df.merge(div_df, on=["doc_id"], how="left", suffixes=("", "_div"))

# A left join only adds rows when the right-hand table repeats a doc_id.
if len(df) != len(nov_df):
    print(
        f"WARNING: join changed row count ({len(nov_df)} -> {len(df)}); "
        "redundancy/diversity tables may contain duplicate doc_id values."
    )

# Normalize title/chunk_type columns if duplicates were created by either
# merge: keep the leftmost value, fill gaps from the suffixed copy, then drop
# the copy so no "_red" / "_div" duplicates leak into the report.
for base in ("title", "chunk_type"):
    for suffix in ("_red", "_div"):
        dup = f"{base}{suffix}"
        if dup in df.columns and base in df.columns:
            df[base] = df[base].fillna(df[dup])
            df.drop(columns=[dup], inplace=True)

print("Joined docs:", len(df))
df.head(3)


## Build composite scores (robust + interpretable)

We'll define "bandwidth" positively (higher = better):

- **Semantic bandwidth**: mean novelty (window) from 03
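- **Redundancy penalty**: `redundancy_gzip_mean` (higher = more redundant, penalized) together with `char_ngram_entropy_norm_mean` (higher = less predictable, rewarded) from 04
- **Context variety**: `mean_pairwise_distance` (higher = more variety) and inverted `centroid_sim_mean` (higher similarity = less variety) from 05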

We'll rank-normalize (percentiles) so scales don't dominate.

In [None]:
def percentile_rank(series: pd.Series) -> pd.Series:
    """
    Percentile-rank the non-missing values of `series`.

    Non-missing entries get `rank(pct=True)` computed among themselves;
    missing entries stay NaN. The result shares the input's index.
    """
    present = series.notna()
    ranked = pd.Series(np.nan, index=series.index)
    ranked[present] = series[present].rank(pct=True)
    return ranked

# --- Pick key columns (fallbacks included) ---
# Novelty: prefer novelty_win_mean, otherwise fall back to novelty_cum_mean.
# nov_col is always a column *name*; whether that column actually exists is
# checked where it is used, so a missing novelty column yields NaN ranks
# instead of a KeyError.
nov_col = "novelty_win_mean" if "novelty_win_mean" in df.columns else "novelty_cum_mean"

# Redundancy: higher redundancy_gzip_mean = more redundant (bad)
red_gz_col = "redundancy_gzip_mean" if "redundancy_gzip_mean" in df.columns else None
# Entropy norm: lower entropy = more predictable (bad), so we want higher entropy as better
ent_col = "char_ngram_entropy_norm_mean" if "char_ngram_entropy_norm_mean" in df.columns else None

# Semantic variety: mean_pairwise_distance higher = more variety (good)
mpd_col = "mean_pairwise_distance" if "mean_pairwise_distance" in df.columns else None
# Centroid similarity higher = tighter cluster (bad), so invert for variety
cent_col = "centroid_sim_mean" if "centroid_sim_mean" in df.columns else None


def _rank_or_nan(col, invert=False):
    """Percentile-rank df[col] (optionally inverted); NaN if the column is unavailable."""
    if col is None or col not in df.columns:
        return np.nan
    ranks = percentile_rank(df[col])
    return 1.0 - ranks if invert else ranks


# Create ranked components
df["rank_semantic_novelty"] = _rank_or_nan(nov_col)
df["rank_entropy_good"] = _rank_or_nan(ent_col)
df["rank_redundancy_bad"] = _rank_or_nan(red_gz_col)  # higher = worse
df["rank_semantic_variety"] = _rank_or_nan(mpd_col)  # higher = better
# invert (higher centroid sim -> lower variety)
df["rank_centroid_variety"] = _rank_or_nan(cent_col, invert=True)

# Combine variety from mpd + centroid-based if both exist; with skipna the
# mean uses whichever part is available and is NaN only when both are missing.
var_parts = ["rank_semantic_variety", "rank_centroid_variety"]
df["rank_variety_combined"] = df[var_parts].mean(axis=1, skipna=True)

# Composite bandwidth score: encourage novelty + variety + entropy, penalize redundancy
# We subtract redundancy_bad with a small weight (to avoid double-counting with entropy).
# Any NaN component makes the whole score NaN for that row.
# NOTE(review): with these weights the score spans roughly [-0.1, 0.9] rather
# than [0, 1] (positive weights sum to 0.9 and redundancy is subtracted), so
# boredom_risk_proxy spans roughly [0.1, 1.1] — confirm this is intended.
df["bandwidth_score"] = (
    0.40 * df["rank_semantic_novelty"] +
    0.30 * df["rank_variety_combined"] +
    0.20 * df["rank_entropy_good"] -
    0.10 * df["rank_redundancy_bad"]
)

# Convenience: "boredom risk" as inverse of bandwidth score (materials-side only)
df["boredom_risk_proxy"] = 1.0 - df["bandwidth_score"]

# Map a bandwidth score onto a human-readable bucket label
def bucket(x: float) -> str:
    """
    Return the bucket label for bandwidth score `x`.

    Non-finite scores (NaN, +/-inf) map to "unknown". Otherwise the first
    upper bound that `x` is strictly below decides the label; scores at or
    above 0.65 are "high_bandwidth".
    """
    if not np.isfinite(x):
        return "unknown"
    upper_bounds = (
        (0.25, "very_low_bandwidth"),
        (0.45, "low_bandwidth"),
        (0.65, "medium_bandwidth"),
    )
    for upper, label in upper_bounds:
        if x < upper:
            return label
    return "high_bandwidth"

df["bandwidth_bucket"] = df["bandwidth_score"].apply(bucket)

print("Composite computed. Non-NaN bandwidth_score:", df["bandwidth_score"].notna().sum())
# Preview only the columns that are actually present: "title" and the chosen
# novelty column are optional inputs, and indexing a missing column would
# raise KeyError on what is only a display line.
preview_cols = [
    c for c in ("title", nov_col, "bandwidth_score", "bandwidth_bucket")
    if c in df.columns
]
df[preview_cols].head(10)


## Save report table (CSV + JSONL)

In [None]:
# Sort by bandwidth_score descending
df_sorted = df.sort_values(by="bandwidth_score", ascending=False)

# Save CSV
df_sorted.to_csv(OUT_CSV, index=False)


def _json_safe(value):
    """Convert one cell value into something json.dumps emits as valid JSON."""
    # numpy scalars (integer, floating, bool_, ...) -> plain Python scalars
    if isinstance(value, np.generic):
        value = value.item()
    # json.dumps would write NaN / Infinity / -Infinity, which are not valid
    # JSON and break strict parsers; emit null for non-finite floats instead.
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


# Save JSONL (one JSON object per line, same row order as the CSV)
with OUT_JSONL.open("w", encoding="utf-8") as f:
    for _, row in df_sorted.iterrows():
        rec = {k: _json_safe(v) for k, v in row.to_dict().items()}
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Wrote:")
print("-", OUT_CSV, f"({OUT_CSV.stat().st_size} bytes)")
print("-", OUT_JSONL, f"({OUT_JSONL.stat().st_size} bytes)")


## Fun printouts: "most over-constrained" vs "highest bandwidth"

In [None]:
def show_top(df: pd.DataFrame, n: int = 8, ascending: bool = False, label: str = ""):
    """
    Print the first `n` rows of `df` ordered by bandwidth_score.

    `ascending=True` lists the lowest scores first. Output is framed by
    80-character rules with `label` as the heading; each row shows its
    score, bucket and title (an empty string when there is no title column).
    """
    rule = "=" * 80
    selected = df.sort_values("bandwidth_score", ascending=ascending).head(n)

    print("\n" + rule)
    print(label)
    for _, rec in selected.iterrows():
        score = rec['bandwidth_score']
        bucket_name = rec['bandwidth_bucket']
        title = rec.get('title', '')
        print(f"  score={score:.3f}  bucket={bucket_name:<18}  title={title}")
    print(rule + "\n")

# Print both ends of the ranking: lowest-bandwidth documents first, then highest.
for _lowest_first, _heading in (
    (True, "Most over-constrained (lowest bandwidth) — materials-side proxy"),
    (False, "Highest bandwidth (most semantic variety / novelty) — materials-side proxy"),
):
    show_top(df, n=10, ascending=_lowest_first, label=_heading)


## Quick sanity: correlate components

In [None]:
# Pairwise correlations between the ranked components and the composite score.
cols = [
    "rank_semantic_novelty",
    "rank_variety_combined",
    "rank_entropy_good",
    "rank_redundancy_bad",
    "bandwidth_score",
]
corr = df.loc[:, cols].corr(numeric_only=True)
corr


## Next: Streamlit app

Next we’ll build `app/streamlit_app.py` that:
- lets you upload text sets
- runs notebooks’ logic (or loads precomputed report files)
- renders a one-page “Boredom Report” with:
  - bandwidth bucket + score
  - novelty curve (if available)
  - redundancy highlights (“most templated chunk”)
  - context diversity summary

If you want the app to be *fast*, we’ll start by reading the precomputed
`data/report/boredom_report_per_doc.csv` and adding optional drill-down views.
