# 01 — Ingest & Clean

This notebook:
1) Ingests reading materials from:
   - pasted text
   - `.txt` files
   - `.docx` files  
2) Cleans common artifacts (repeated headers/footers, page-number lines, extra whitespace, typographic quotes and dashes)
3) Chunks text into analysis units (paragraphs or pseudo-pages)
4) Saves standardized JSONL outputs for downstream metrics.

Outputs:
- `data/texts_clean/documents.jsonl`
- `data/texts_clean/chunks.jsonl`


## Imports + paths

In [None]:
from __future__ import annotations

import os
import re
import json
import glob
import hashlib
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Iterable, Tuple

from pathlib import Path
import os

# Locate the repository root: the nearest directory at or above the current
# working directory that contains a `.git` entry. If no such directory exists,
# stay in the starting directory instead of drifting up to the filesystem root
# (which would make the notebook chdir to "/" and look for data there).
_START_DIR = Path.cwd()
ROOT = next(
    (d for d in (_START_DIR, *_START_DIR.parents) if (d / ".git").exists()),
    _START_DIR,
)

# Make the chosen root the working directory.
os.chdir(ROOT)

DATA_RAW = ROOT / "data" / "texts_raw"
DATA_CLEAN = ROOT / "data" / "texts_clean"

print("Repo root:", ROOT)
print("Raw:", DATA_RAW, "exists?", DATA_RAW.exists())
print("Clean:", DATA_CLEAN, "exists?", DATA_CLEAN.exists())

### Optional docx support

In [None]:
# python-docx is optional: record whether it could be imported so that the
# .docx reader and folder ingestion can check the flag before using it.
try:
    import docx  # python-docx
except Exception:
    DOCX_AVAILABLE = False
    print("python-docx not available. .docx ingestion will be skipped.")
else:
    DOCX_AVAILABLE = True


## Data model

In [None]:
@dataclass
class Document:
    """
    One ingested source text, holding both the raw and the cleaned version.
    """
    doc_id: str  # short identifier for the document
    title: str  # display title of the document
    source_path: str  # where the text came from
    raw_text: str  # text as ingested, before cleaning
    clean_text: str  # text after cleaning

@dataclass
class Chunk:
    """
    One analysis unit (a paragraph or a pseudo-page) cut from a document.
    """
    chunk_id: str  # short identifier for the chunk
    doc_id: str  # identifier of the document this chunk belongs to
    title: str  # title of the parent document
    chunk_index: int  # position of the chunk within its document
    chunk_type: str  # "paragraph" or "page"
    text: str  # the chunk's text
    n_chars: int  # character count of the chunk text
    n_tokens_approx: int  # approximate token count of the chunk text


## Utility: stable IDs

In [None]:
def stable_id(*parts: str) -> str:
    """
    Derive a short deterministic identifier from the given string parts.

    Every part is UTF-8 encoded (characters that cannot be encoded are
    dropped) and followed by a "|" byte; the SHA-1 digest of the resulting
    byte string is truncated to its first 12 hex characters.
    """
    payload = b"".join(
        piece.encode("utf-8", errors="ignore") + b"|" for piece in parts
    )
    return hashlib.sha1(payload).hexdigest()[:12]


## Ingestion: read txt/docx + optional pasted text

In [None]:
def read_txt(path: Path) -> str:
    """
    Read a text file as UTF-8 and return its contents.

    Decoding uses "utf-8-sig" so that a leading byte-order mark (as written by
    some editors) is stripped instead of ending up as a stray U+FEFF at the
    start of the text. Bytes that are not valid UTF-8 are silently dropped.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")

def read_docx(path: Path) -> str:
    """
    Extract the paragraph text of a .docx file.

    Paragraphs that are empty or whitespace-only are skipped; the remaining
    ones are joined with a blank line so paragraph boundaries survive.

    Raises RuntimeError when python-docx is not installed.
    """
    if not DOCX_AVAILABLE:
        raise RuntimeError("python-docx not installed")

    document = docx.Document(str(path))
    kept = []
    for para in document.paragraphs:
        content = para.text
        if content is not None and content.strip():
            kept.append(content)
    return "\n\n".join(kept)

def ingest_from_folder(folder: Path) -> List[Tuple[str, str, str]]:
    """
    Read every supported file directly inside `folder`.

    Returns a list of (title, source_path, raw_text) tuples, where the title
    is the file name without its extension. All .txt files come first (sorted
    by path), followed by all .docx files (sorted by path) when python-docx is
    available. A missing folder is reported and yields an empty list.
    """
    if not folder.exists():
        print(f"Folder does not exist: {folder}")
        return []

    collected: List[Tuple[str, str, str]] = []

    # TXT
    for txt_path in sorted(folder.glob("*.txt")):
        collected.append((txt_path.stem, str(txt_path), read_txt(txt_path)))

    # DOCX (only when the optional dependency was imported)
    if not DOCX_AVAILABLE:
        return collected
    for docx_path in sorted(folder.glob("*.docx")):
        collected.append((docx_path.stem, str(docx_path), read_docx(docx_path)))

    return collected

# Optional: paste text here for quick experiments
PASTED_TITLE = ""  # e.g., "Decodable_Set_A"
PASTED_TEXT = ""   # paste text here

raw_items = ingest_from_folder(DATA_RAW)

# Whitespace-only pasted text counts as "nothing pasted".
if PASTED_TEXT.strip():
    pasted_title = PASTED_TITLE if PASTED_TITLE else "pasted_text"
    raw_items.append((pasted_title, "pasted://", PASTED_TEXT))

# Report the total and list the first few items as a quick check.
print(f"Ingested {len(raw_items)} item(s).")
for item_title, item_path, _ in raw_items[:5]:
    print(f"- {item_title} ( {item_path} )")


## Cleaning functions (curriculum-friendly)

In [None]:
# Common junk patterns:
RE_MULTISPACE = re.compile(r"[ \t]+")
RE_MULTI_NL = re.compile(r"\n{3,}")
RE_SOFT_HYPHEN = re.compile("\u00ad")  # discretionary hyphen
RE_NONBREAKING_SPACE = re.compile("\u00a0")

# Typographic punctuation -> ASCII equivalents, applied in a single pass.
# Both single quotes are mapped so that a quoted span is converted
# symmetrically (previously only the right single quote was converted).
_PUNCT_TRANSLATION = str.maketrans({
    "\u201c": '"',  # left double quote
    "\u201d": '"',  # right double quote
    "\u2018": "'",  # left single quote
    "\u2019": "'",  # right single quote / apostrophe
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
})

# Header/footer removal heuristics (applied by drop_headers_footers):
# - lines that are mostly digits (page numbers)
# - lines that repeat frequently
def normalize_text(s: str) -> str:
    """
    Normalize line endings, invisible characters, punctuation and whitespace.

    Steps, in order:
    - convert CRLF / CR line endings to LF
    - replace non-breaking spaces with plain spaces, remove soft hyphens
    - map curly quotes and en/em dashes to their ASCII equivalents
    - collapse runs of spaces/tabs to one space
    - collapse three or more consecutive newlines to a single blank line
    - strip leading/trailing whitespace

    Returns the normalized string.
    """
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = RE_NONBREAKING_SPACE.sub(" ", s)
    s = RE_SOFT_HYPHEN.sub("", s)

    # Normalize quotes/dashes
    s = s.translate(_PUNCT_TRANSLATION)

    # Collapse spaces, normalize newlines
    s = RE_MULTISPACE.sub(" ", s)
    s = RE_MULTI_NL.sub("\n\n", s)
    return s.strip()

def split_lines(s: str) -> List[str]:
    """
    Split `s` on "\\n" and strip surrounding whitespace from every line.

    Blank lines are kept (as empty strings) so line positions are preserved.
    """
    return list(map(str.strip, s.split("\n")))

def detect_repeated_lines(lines: List[str], min_len: int = 8, min_count: int = 3) -> set:
    """
    Return the set of (stripped) lines that look like repeated headers/footers.

    A line qualifies when its stripped form is at least `min_len` characters
    long and occurs at least `min_count` times in `lines`. Shorter lines are
    never counted, which keeps the heuristic conservative.
    """
    occurrences: Dict[str, int] = {}
    for raw_line in lines:
        candidate = raw_line.strip()
        if len(candidate) >= min_len:
            occurrences[candidate] = occurrences.get(candidate, 0) + 1

    return {text for text, seen in occurrences.items() if seen >= min_count}

def drop_headers_footers(s: str) -> str:
    """
    Remove page-number lines and frequently repeated lines from `s`.

    A non-blank line is dropped when it is:
    - a bare number of one to three digits,
    - "page N" / "pg N" (case-insensitive, one to three digits), or
    - one of the lines reported by detect_repeated_lines.

    Blank lines are kept, then runs of three or more newlines are collapsed
    to one blank line and the result is stripped.
    """
    lines = split_lines(s)
    boilerplate = detect_repeated_lines(lines)

    def is_noise(line: str) -> bool:
        # Pure page numbers or "Page 3" style
        if re.fullmatch(r"\d{1,3}", line):
            return True
        if re.fullmatch(r"(page|pg)\s*\d{1,3}", line.lower()):
            return True
        # Repeated header/footer lines
        return line in boilerplate

    # Blank lines always survive; non-blank lines survive unless they are noise.
    kept = [line for line in lines if not (line and is_noise(line))]

    joined = RE_MULTI_NL.sub("\n\n", "\n".join(kept))
    return joined.strip()

def clean_document_text(raw: str) -> str:
    """
    Run the full cleaning pipeline on one raw document.

    The text is normalized, stripped of headers/footers, and normalized again,
    because removing lines can leave new runs of blank lines behind.
    """
    normalized = normalize_text(raw)
    without_boilerplate = drop_headers_footers(normalized)
    return normalize_text(without_boilerplate)


## Chunking: paragraphs or pseudo-pages

This gives you two options:

- **paragraph chunks** (best default)
- **pseudo-pages** (approximate "page" by character budget; useful for novelty curves)


In [None]:
def approx_token_count(text: str) -> int:
    """
    Approximate the token count as the number of whitespace-separated words.

    The result is never below 1, even for empty or whitespace-only text.
    """
    # quick approximation good enough for reporting
    n_words = len(text.split())
    return n_words if n_words > 1 else 1

def chunk_by_paragraph(text: str) -> List[str]:
    """
    Split `text` into paragraphs on blank lines ("\\n\\n").

    Each paragraph is stripped; empty paragraphs are discarded.
    """
    stripped = (piece.strip() for piece in text.split("\n\n"))
    return [piece for piece in stripped if piece]

def chunk_by_pseudopage(text: str, target_chars: int = 1200, min_chars: int = 500) -> List[str]:
    """
    Greedily pack paragraphs into roughly page-sized chunks.

    Paragraphs (as produced by chunk_by_paragraph) are appended to the current
    page while the running size stays within `target_chars`, or while the page
    is still shorter than `min_chars`. Otherwise the current page is closed and
    the paragraph starts a new one. Paragraphs are never split, so a page can
    exceed `target_chars`.

    Returns the non-empty pages, each with its paragraphs joined by a blank line.
    """
    pages = []
    pending = []
    pending_len = 0

    for para in chunk_by_paragraph(text):
        fits = pending_len + len(para) + 2 <= target_chars
        too_short = pending_len < min_chars
        if not (fits or too_short):
            # Close the current page and start a new one with this paragraph.
            pages.append("\n\n".join(pending).strip())
            pending = [para]
            pending_len = len(para)
            continue
        pending.append(para)
        pending_len += len(para) + 2  # +2 accounts for the "\n\n" separator

    # Flush whatever is left as the final page.
    if pending:
        pages.append("\n\n".join(pending).strip())

    return [page for page in pages if page]


### Build documents + chunks


In [None]:
documents: List[Document] = []
chunks: List[Chunk] = []

CHUNK_MODE = "paragraph"  # "paragraph" or "page"
PSEUDOPAGE_TARGET_CHARS = 1200

for title, source_path, raw_text in raw_items:
    clean_text = clean_document_text(raw_text)

    # The document ID combines title, source and the first 200 cleaned characters.
    doc_id = stable_id(title, source_path, clean_text[:200])
    document = Document(
        doc_id=doc_id,
        title=title,
        source_path=source_path,
        raw_text=raw_text,
        clean_text=clean_text,
    )
    documents.append(document)

    # Cut the cleaned text into analysis units according to CHUNK_MODE.
    if CHUNK_MODE == "paragraph":
        chunk_type = "paragraph"
        unit_texts = chunk_by_paragraph(clean_text)
    elif CHUNK_MODE == "page":
        chunk_type = "page"
        unit_texts = chunk_by_pseudopage(clean_text, target_chars=PSEUDOPAGE_TARGET_CHARS)
    else:
        raise ValueError("CHUNK_MODE must be 'paragraph' or 'page'")

    # The chunk ID combines the document ID, the chunk's position and the
    # first 200 characters of its text.
    for i, unit in enumerate(unit_texts):
        chunk = Chunk(
            chunk_id=stable_id(doc_id, str(i), unit[:200]),
            doc_id=doc_id,
            title=title,
            chunk_index=i,
            chunk_type=chunk_type,
            text=unit,
            n_chars=len(unit),
            n_tokens_approx=approx_token_count(unit),
        )
        chunks.append(chunk)

print(f"Documents: {len(documents)}")
print(f"Chunks: {len(chunks)} (mode={CHUNK_MODE})")


### Quick sanity checks


In [None]:
def preview(doc: Document, n_chars: int = 800):
    """
    Print a document's title, ID and source, followed by the first `n_chars`
    characters of its cleaned text and a separator line.
    """
    print(f"TITLE: {doc.title}")
    print(f"DOC_ID: {doc.doc_id}")
    print(f"SOURCE: {doc.source_path}")
    print("\nCLEAN PREVIEW:\n")
    excerpt = doc.clean_text[:n_chars]
    print(excerpt)
    print("\n---\n")

# Show the first document, if any were ingested.
if documents:
    preview(documents[0])

# Chunk stats
if chunks:
    lens = [c.n_tokens_approx for c in chunks]
    ordered = sorted(lens)
    print("Chunk token approx:")
    print(" min:", ordered[0])
    # Upper median: the element at index len // 2 of the sorted counts
    print(" p50:", ordered[len(ordered) // 2])
    print(" max:", ordered[-1])


## Save JSONL outputs


In [None]:
def write_jsonl(path: Path, rows: Iterable[dict]) -> None:
    """
    Write `rows` to `path` as JSON Lines (one JSON object per line, UTF-8).

    The parent directory is created if it does not exist yet, so writing into
    a fresh output folder does not fail. An existing file is overwritten.
    Non-ASCII characters are written as-is rather than as \\u escapes.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)

# Output files inside the cleaned-text folder. These names were used below
# without ever being defined, which raised NameError.
DOCS_OUT = DATA_CLEAN / "documents.jsonl"
CHUNKS_OUT = DATA_CLEAN / "chunks.jsonl"

# Make sure the output folder exists before writing into it.
DATA_CLEAN.mkdir(parents=True, exist_ok=True)

write_jsonl(DOCS_OUT, (asdict(d) for d in documents))
write_jsonl(CHUNKS_OUT, (asdict(c) for c in chunks))

print("Wrote:")
print("-", DOCS_OUT, f"({DOCS_OUT.stat().st_size} bytes)")
print("-", CHUNKS_OUT, f"({CHUNKS_OUT.stat().st_size} bytes)")


## Next notebook

Move on to **02_train_lsa.ipynb**, which will:
- load `chunks.jsonl`
- train LSA/LSI representations
- save embeddings for downstream novelty / redundancy / diversity metrics
