# arXiv Sanity (Updated)

This notebook reflects the current backend contracts:
- Triage input is the title + abstract only.
- Summarization tries the full text first and falls back to slices when the token budget is exceeded.


In [None]:
from pathlib import Path
import json

from eegfm_digest.triage import triage_paper, load_schema
from eegfm_digest.summarize import summarize_paper
from eegfm_digest.pdf import extract_text, slice_paper_text
from eegfm_digest.llm_gemini import GeminiClient, LLMConfig, load_api_key


In [None]:
# Prompt templates (triage, summarize, repair), read as UTF-8 text.
# All paths are relative to the kernel's working directory.
TRIAGE_PROMPT, SUMMARIZE_PROMPT, REPAIR_PROMPT = (
    Path(prompt_file).read_text(encoding='utf-8')
    for prompt_file in (
        'prompts/triage.md',
        'prompts/summarize.md',
        'prompts/repair_json.md',
    )
)

# Schemas for the triage and summary outputs, loaded via load_schema.
TRIAGE_SCHEMA, SUMMARY_SCHEMA = (
    load_schema(Path(schema_file))
    for schema_file in ('schemas/triage.json', 'schemas/summary.json')
)


In [None]:
# Hand-written example paper record used as input by the cells below.
# Note that no PDF link is set (links['pdf'] is None).
paper = dict(
    arxiv_id='2501.00001v1',
    arxiv_id_base='2501.00001',
    version=1,
    title='Example EEG Foundation Model Paper',
    summary='We propose a self-supervised EEG pretraining framework for transfer across tasks.',
    authors=['Author A', 'Author B'],
    categories=['cs.LG', 'q-bio.NC'],
    published='2025-01-10T00:00:00Z',
    updated='2025-01-10T00:00:00Z',
    links=dict(abs='https://arxiv.org/abs/2501.00001', pdf=None),
)
paper


In [None]:
# Build the LLM client used for triage, then triage the example paper.
triage_config = LLMConfig(
    api_key=load_api_key(),
    model='gemini-3-flash-preview',
    temperature=0.2,
    max_output_tokens=1024,
)
triage_client = GeminiClient(triage_config)

triage = triage_paper(
    paper=paper,
    llm=triage_client,
    prompt_template=TRIAGE_PROMPT,
    repair_template=REPAIR_PROMPT,
    schema=TRIAGE_SCHEMA,
)

# Display the triage result as the cell output.
triage


## Summary payload mode check

If the `fulltext` prompt exceeds the input token budget, summarization automatically falls back to `fulltext_slices`.


In [None]:
# Extract text from the sample PDF, then slice it for the summary step.
pdf_path = Path('data/EEGFormer_eegfm.pdf')
text_path = Path('outputs/_sanity/eegformer.txt')

# Ensure the output directory exists so the cell survives a fresh checkout.
# NOTE(review): extract_text may already create parent directories; this call
# is idempotent either way.
text_path.parent.mkdir(parents=True, exist_ok=True)

meta = extract_text(pdf_path, text_path)
raw_text = text_path.read_text(encoding='utf-8')
slices = slice_paper_text(raw_text)

# Display the extraction metadata alongside the length of each slice.
meta, {name: len(text) for name, text in slices.items()}


In [None]:
# Build the LLM client used for summarization (larger output budget than triage).
summary_config = LLMConfig(
    api_key=load_api_key(),
    model='gemini-3-flash-preview',
    temperature=0.2,
    max_output_tokens=2048,
)
summary_client = GeminiClient(summary_config)

# Pass the extraction metadata along as a JSON string with sorted keys.
extraction_notes = json.dumps(meta, sort_keys=True)

# Both the raw full text and the slices are supplied to summarize_paper,
# together with the input-token budget.
summary = summarize_paper(
    paper=paper,
    triage=triage,
    raw_fulltext=raw_text,
    fulltext_slices=slices,
    used_fulltext=True,
    notes=extraction_notes,
    llm=summary_client,
    prompt_template=SUMMARIZE_PROMPT,
    repair_template=REPAIR_PROMPT,
    schema=SUMMARY_SCHEMA,
    max_input_tokens=120000,
)

# Display the summary result as the cell output.
summary
