# Corpus Quality Checks

This notebook inspects the processed corpus to verify text coverage, sidecar availability, and metadata consistency.

In [None]:
from __future__ import annotations

import json
from collections import Counter
from pathlib import Path
from typing import Any

import matplotlib.pyplot as plt
import pandas as pd

# Every path below hangs off the current working directory, so the notebook
# has to be started from the project root for them to resolve correctly.
PROJECT_ROOT = Path.cwd().resolve()
PROCESSED_DIR = PROJECT_ROOT / "data/processed"
RAW_DIR = PROJECT_ROOT / "data/raw/arxiv_2025"
# Destination for the artefacts written by the final cell.
DATA_QUALITY_DIR = PROJECT_ROOT / "evaluation/results/data_quality"

# Input files read by the cells below.
MANIFEST_PATH = PROCESSED_DIR / "manifest.json"
METADATA_JSONL = RAW_DIR / "metadata.jsonl"
SELECTION_LOG = RAW_DIR / "selection_log.jsonl"

# Notebook-wide plotting defaults: 8x4 figures with a grid on every axes.
plt.rcParams.update({"figure.figsize": (8, 4), "axes.grid": True})

In [None]:
def load_manifest(path: Path) -> pd.DataFrame:
    """Read the processed-corpus manifest and return one row per document.

    The manifest's ``"documents"`` list is expanded into a DataFrame.  Each
    document's ``text_path`` (required) and ``pages_path`` (optional) are
    joined onto ``PROCESSED_DIR``; a falsy or absent ``pages_path`` becomes
    ``None``.  List items that are not dicts are skipped.  When at least one
    document is present a boolean ``has_pages_sidecar`` column records
    whether the sidecar file exists on disk.

    Raises:
        FileNotFoundError: *path* does not exist.
        TypeError: the ``"documents"`` field is not a list.
    """
    if not path.exists():
        raise FileNotFoundError(f"Processed manifest missing at {path}")

    with path.open(encoding="utf-8") as manifest_file:
        payload = json.load(manifest_file)

    documents = payload.get("documents", [])
    if not isinstance(documents, list):
        raise TypeError("Manifest documents field must be a list")

    def _resolve(doc: dict[str, Any]) -> dict[str, Any]:
        # Work on a shallow copy so the parsed manifest is left untouched.
        row = dict(doc)
        row["text_path"] = PROCESSED_DIR / row["text_path"]
        sidecar = row.get("pages_path")
        row["pages_path"] = (PROCESSED_DIR / sidecar) if sidecar else None
        return row

    def _sidecar_present(candidate: Any) -> bool:
        return bool(candidate and Path(candidate).exists())

    frame = pd.DataFrame(
        [_resolve(doc) for doc in documents if isinstance(doc, dict)]
    )
    if frame.empty:
        # No rows means no "pages_path" column to inspect.
        return frame

    frame["has_pages_sidecar"] = frame["pages_path"].apply(_sidecar_present)
    return frame


def load_metadata(jsonl_path: Path) -> pd.DataFrame:
    """Parse a JSON Lines file into a DataFrame with one row per JSON object.

    Blank lines are ignored, and lines whose JSON value is not an object
    (for example a list or a number) are dropped.

    Raises:
        FileNotFoundError: *jsonl_path* does not exist.
        json.JSONDecodeError: a non-blank line is not valid JSON.
    """
    if not jsonl_path.exists():
        raise FileNotFoundError(f"Metadata JSONL missing at {jsonl_path}")

    with jsonl_path.open("r", encoding="utf-8") as stream:
        stripped_lines = (raw.strip() for raw in stream)
        parsed = [json.loads(text) for text in stripped_lines if text]

    return pd.DataFrame([obj for obj in parsed if isinstance(obj, dict)])


def estimate_tokens(text_path: Path) -> tuple[int, int]:
    """Return ``(character_count, word_count)`` for the file at *text_path*.

    The file is decoded as UTF-8 with undecodable bytes dropped, and words
    are whitespace-separated runs, so the second value is only a rough
    stand-in for a real token count.
    """
    content = text_path.read_text(encoding="utf-8", errors="ignore")
    return len(content), len(content.split())


def ensure_output_dir(path: Path) -> None:
    """Create *path* as a directory, along with any missing parents.

    An already-existing directory is left as it is.
    """
    path.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the manifest and print its headline numbers next to the manifest's own
# self-reported fields so the two can be compared by eye.
manifest_df = load_manifest(MANIFEST_PATH)
display(manifest_df.head())
print(f"Total documents in manifest: {len(manifest_df)}")
if not manifest_df.empty:
    total_bytes = manifest_df["bytes"].sum()
    print(f"Total text bytes: {total_bytes:,}")
    # load_manifest only returns the per-document rows, so the top-level
    # fields have to be read from the raw JSON again.
    with MANIFEST_PATH.open(encoding="utf-8") as manifest_file:
        manifest_meta = json.load(manifest_file)
    print(f"Manifest generated_at: {manifest_meta.get('generated_at', 'unknown')}")
    print(f"Manifest total_documents: {manifest_meta.get('total_documents', 'unknown')}")

In [None]:
# Load the raw arXiv metadata and check whether any paper appears more than
# once (for example under several versions).
metadata_df = load_metadata(METADATA_JSONL)
print(f"Metadata rows: {len(metadata_df)}")
if not metadata_df.empty:
    # Strip only a trailing version suffix such as "v2".  Splitting on the
    # first "v" would truncate ids that contain the letter earlier on, e.g.
    # the old-style "solv-int/9901001v1" would collapse to "sol".
    base_ids = metadata_df["arxiv_id"].str.replace(r"v\d+$", "", regex=True)
    # Occurrences per version-less id, most frequent first.
    duplicates = base_ids.value_counts()
    repeated_ids = duplicates[duplicates > 1]
    print(f"Distinct base arXiv ids: {len(duplicates)}")
    if repeated_ids.empty:
        print("No duplicated arXiv ids found.")
    else:
        print(f"arXiv ids appearing more than once: {len(repeated_ids)}")
        display(repeated_ids)

In [None]:
# Compute per-document size metrics and summarise their distribution.
if manifest_df.empty:
    raise RuntimeError("Manifest is empty; aborting QA metrics.")

# NOTE: despite its name, byte_lengths holds character counts (the first
# value returned by estimate_tokens); the name is kept for later cells.
byte_lengths: list[int] = []
token_counts: list[int] = []
sidecar_pages: list[float] = []
missing_text_files = 0
for row in manifest_df.itertuples(index=False):
    text_path = Path(row.text_path)
    if not text_path.exists():
        # Count skipped documents so gaps in text coverage are visible.
        missing_text_files += 1
        continue
    chars, tokens = estimate_tokens(text_path)
    byte_lengths.append(chars)
    token_counts.append(tokens)
    pages_path = row.pages_path
    # The manifest may name a sidecar that is not on disk.  Treat that the
    # same as "no sidecar" rather than crashing before the missing-sidecar
    # count below is printed.
    if pages_path and Path(pages_path).exists():
        with Path(pages_path).open("r", encoding="utf-8") as handle:
            # One line per page is assumed here; the sidecar format is not
            # defined in this notebook (TODO confirm).
            page_counter = sum(1 for _ in handle)
    else:
        page_counter = float("nan")
    sidecar_pages.append(page_counter)

# One row per document whose text file was found.
summary = pd.DataFrame(
    {
        "tokens": token_counts,
        "chars": byte_lengths,
        "sidecar_pages": sidecar_pages,
    }
)
summary_stats = summary.describe(percentiles=[0.05, 0.5, 0.95])
display(summary_stats)

print("Documents missing text files:", missing_text_files)
print("Documents missing sidecars:", (~manifest_df["has_pages_sidecar"]).sum())

In [None]:
# Side-by-side histograms: token counts on the left, sidecar page counts on
# the right (documents without a sidecar are left out of the second plot).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
token_ax, page_ax = axes

token_ax.hist(summary["tokens"], bins=30, color="#1f77b4")
token_ax.set(
    title="Token count distribution",
    xlabel="Tokens per document",
    ylabel="Frequency",
)

page_ax.hist(summary["sidecar_pages"].dropna(), bins=30, color="#ff7f0e")
page_ax.set(
    title="Pages per sidecar",
    xlabel="Pages",
    ylabel="Frequency",
)

plt.tight_layout()
plt.show()

In [None]:
# Persist the summary statistics.  Creating the "plots" sub-directory with
# parents=True also creates DATA_QUALITY_DIR itself, which the CSV needs.
ensure_output_dir(DATA_QUALITY_DIR / "plots")
output_csv = DATA_QUALITY_DIR / "qa_summary_stats.csv"
summary_stats.to_csv(output_csv, index=True)
print(f"Wrote summary stats to {output_csv}")

# Tally selection-log entries per batch.  The log is optional; a missing file
# is reported the same way as an empty one.
selection_entries: list[dict[str, Any]] = []
if SELECTION_LOG.exists():
    with SELECTION_LOG.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            log_entry = json.loads(line)
            # Keep JSON objects only, matching load_metadata; any other JSON
            # value would make the .get() call below raise AttributeError.
            if isinstance(log_entry, dict):
                selection_entries.append(log_entry)
if selection_entries:
    batches = Counter(entry.get("selection_batch", "unknown") for entry in selection_entries)
    print("Selection batches recorded:", batches)
else:
    print("No selection log entries found; ensure selection_log.jsonl is populated in future runs.")