# Canon index and metadata

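Build a stable `meta.csv` index for the canon corpus, then run a quick token-length audit of the corpus texts with the LaTa tokenizer (`bowphs/LaTa`) to see how many reach the 512-token limit.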


In [None]:
from pathlib import Path
import os
import sys


def in_colab() -> bool:
    """Report whether this notebook is executing inside Google Colab.

    Detection relies solely on the presence of the ``COLAB_GPU`` or
    ``COLAB_RELEASE_TAG`` environment variables; their values are ignored.
    """
    colab_markers = ("COLAB_GPU", "COLAB_RELEASE_TAG")
    return any(marker in os.environ for marker in colab_markers)


if in_colab():
    # Colab: the default canon and runs locations are on Google Drive, so the
    # drive is mounted before any path is resolved.
    from google.colab import drive

    drive.mount("/content/drive")
    REPO_ROOT = Path(os.environ.get("REPO_ROOT", "/content/localLatin"))
    CANON_ROOT = Path(os.environ.get("CANON_ROOT", "/content/drive/MyDrive/localLatin_data/canon"))
    RUNS_ROOT = Path(os.environ.get("RUNS_ROOT", "/content/drive/MyDrive/localLatin_runs/ff1_lata_postact"))
else:
    def find_repo_root(start: Path) -> Path:
        """Return the nearest directory at or above *start* holding canon/ and src/.

        Args:
            start: Directory from which the upward search begins.

        Returns:
            The first of ``start`` and its parents that contains both a
            ``canon`` and a ``src`` directory.

        Raises:
            FileNotFoundError: If neither ``start`` nor any of its parents
                contains both directories.
        """
        for candidate in (start, *start.parents):
            # Require real directories: a stray file named "canon" or "src"
            # must not be mistaken for the repository layout.
            if (candidate / "canon").is_dir() and (candidate / "src").is_dir():
                return candidate
        raise FileNotFoundError(
            "Could not locate repo root containing canon/ and src/ "
            f"(searched {start} and its parents); "
            "set the REPO_ROOT environment variable to override"
        )

    # An unset or empty REPO_ROOT falls back to searching upward from the
    # current working directory.
    _repo_root_override = os.environ.get("REPO_ROOT")
    REPO_ROOT = Path(_repo_root_override) if _repo_root_override else find_repo_root(Path.cwd())
    CANON_ROOT = Path(os.environ.get("CANON_ROOT", str(REPO_ROOT / "canon")))
    RUNS_ROOT = Path(os.environ.get("RUNS_ROOT", str(REPO_ROOT / "runs" / "ff1_lata_postact")))

# Make the project's src/ directory importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
_src_dir = str(REPO_ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Echo the resolved locations so the run output records which paths were used.
print(f"REPO_ROOT: {REPO_ROOT}")
print(f"CANON_ROOT: {CANON_ROOT}")
print(f"RUNS_ROOT: {RUNS_ROOT}")


In [None]:
import pandas as pd

from canon_retrieval import build_meta, meta_stats

# Ensure the run directory exists before pointing the CSV path into it.
RUNS_ROOT.mkdir(parents=True, exist_ok=True)
META_CSV = str(RUNS_ROOT.joinpath("meta.csv"))

# build_meta receives the canon directory and the CSV destination; its result
# is summarised by meta_stats (both helpers come from canon_retrieval).
meta = build_meta(str(CANON_ROOT), META_CSV)
stats = meta_stats(meta)

print(f"Meta saved to: {META_CSV}")
print(stats)
# Trailing expression: the notebook renders the first rows of the table.
meta.head()


FileNotFoundError: Could not locate repo root containing canon/ and src/

In [None]:
from transformers import AutoTokenizer

from canon_retrieval import load_texts, token_lengths

# Tokenizer checkpoint to load, and the length cap handed to token_lengths.
MODEL_NAME = "bowphs/LaTa"
MAX_LENGTH = 512

# Pass the values of the meta table's "path" column to load_texts.
texts = load_texts(meta["path"].tolist())
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): token_lengths lives in canon_retrieval and is not visible
# here; presumably it returns one token count per text, capped at max_length
# (e.g. a NumPy array) — confirm.
lengths = token_lengths(tokenizer, texts, max_length=MAX_LENGTH)

print(f"Token length stats (max_length={MAX_LENGTH})")
print(f"  mean={lengths.mean():.2f}, max={lengths.max()}, min={lengths.min()}")
# Counts entries whose reported length reaches MAX_LENGTH. NOTE(review): if
# lengths are capped at MAX_LENGTH, a text of exactly MAX_LENGTH tokens is
# counted as truncated as well — confirm this is the intended heuristic.
print(f"  truncated={(lengths >= MAX_LENGTH).sum()}")
