# arXiv Sanity (Updated)

This notebook supports two quick checks:
- arXiv query-only result-count sanity check (no LLM calls)
- backend contract checks for triage and summarization (these call the configured Gemini model)


In [3]:
from pathlib import Path
import json
import os
import sys

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory, at or above ``start``, that looks like the repo root.

    A directory qualifies when it contains both a ``pyproject.toml`` entry and a
    ``src`` entry. ``start`` itself is checked first, then each of its parents
    from nearest to farthest.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first qualifying directory.

    Raises:
        RuntimeError: If neither ``start`` nor any of its parents qualifies.
    """
    def has_repo_markers(folder: Path) -> bool:
        return (folder / 'pyproject.toml').exists() and (folder / 'src').exists()

    # `None` is the sentinel for "nothing on the path matched".
    found = next(filter(has_repo_markers, (start, *start.parents)), None)
    if found is None:
        raise RuntimeError('Could not find repo root. Open notebook from this repo or set CWD accordingly.')
    return found

# Locate the repository by searching upward from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
SRC_DIR = REPO_ROOT / 'src'
# Prepend src/ to sys.path, guarded so re-running the cell does not add a duplicate entry.
# NOTE(review): presumably this is what lets the eegfm_digest imports that follow resolve
# from this checkout — confirm the package lives under src/.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from eegfm_digest.arxiv import fetch_query, in_month, category_match, dedupe_latest
from eegfm_digest.keywords import QUERY_A, QUERY_B
from eegfm_digest.triage import triage_paper, load_schema
from eegfm_digest.summarize import summarize_paper
from eegfm_digest.pdf import extract_text, slice_paper_text
from eegfm_digest.llm_gemini import GeminiClient, LLMConfig, load_api_key

REPO_ROOT


PosixPath('/Users/ismaelrobles-razzaq/2_cs_projects/eeg-fm-paper-roundup')

In [4]:
# Prompt templates, read as UTF-8 text from <REPO_ROOT>/prompts.
TRIAGE_PROMPT = (REPO_ROOT / 'prompts' / 'triage.md').read_text(encoding='utf-8')
SUMMARIZE_PROMPT = (REPO_ROOT / 'prompts' / 'summarize.md').read_text(encoding='utf-8')
REPAIR_PROMPT = (REPO_ROOT / 'prompts' / 'repair_json.md').read_text(encoding='utf-8')
# Schemas loaded with load_schema from <REPO_ROOT>/schemas.
TRIAGE_SCHEMA = load_schema(REPO_ROOT / 'schemas' / 'triage.json')
SUMMARY_SCHEMA = load_schema(REPO_ROOT / 'schemas' / 'summary.json')
# Model names: the environment variable wins when set; otherwise the literal default is used.
TRIAGE_MODEL = os.environ.get('GEMINI_MODEL_TRIAGE', 'gemini-3-flash-preview')
SUMMARY_MODEL = os.environ.get('GEMINI_MODEL_SUMMARY', 'gemini-3-flash-preview')


## arXiv query-only sanity check

Run a single arXiv `search_query` string and inspect how many results remain at each stage (raw, optional month filter, category match, deduped).


In [9]:
# The exact arXiv search_query string to test.
ARXIV_QUERY = QUERY_A
# None (or any falsy value) skips month filtering; otherwise the value is passed to in_month().
# NOTE(review): the month format in_month expects is not visible in this notebook — confirm before setting.
MONTH = None
MAX_RESULTS = 10000  # forwarded to fetch_query as max_results
RATE_LIMIT_SECONDS = 10.0  # forwarded to fetch_query as rate_limit_seconds


In [7]:
# Fetch the configured query once; every count below is derived from this result.
rows = fetch_query(
    query=ARXIV_QUERY, max_results=MAX_RESULTS, rate_limit_seconds=RATE_LIMIT_SECONDS
)
raw_count = len(rows)

# Filter chain: optional month filter -> category filter -> dedupe.
month_rows = [row for row in rows if in_month(row['published'], MONTH)] if MONTH else rows
category_rows = [row for row in month_rows if category_match(row['categories'])]
deduped_rows = dedupe_latest(category_rows)

print(f'query={ARXIV_QUERY}')
print(f'raw_count={raw_count}')
if MONTH:
    # Only reported when a month filter was actually applied.
    print(f'in_month_count={len(month_rows)}')
print(f'category_match_count={len(category_rows)}')
print(f'deduped_count={len(deduped_rows)}')

# Cell output: a compact preview of the first ten deduped records.
[
    {field: row[field] for field in ('arxiv_id_base', 'published', 'title')}
    for row in deduped_rows[:10]
]


query=all:(eeg OR electroencephalograph* OR brainwave*) AND all:("foundation model" OR pretrain OR pretrained OR "self-supervised" OR "self supervised")
raw_count=253
category_match_count=233
deduped_count=233


[{'arxiv_id_base': '1806.09532',
  'published': '2018-06-20T11:34:36Z',
  'title': 'Cross-paradigm pretraining of convolutional networks improves intracranial EEG decoding'},
 {'arxiv_id_base': '1811.07516',
  'published': '2018-11-19T06:07:33Z',
  'title': 'Unsupervised Learning in Reservoir Computing for EEG-based Emotion Recognition'},
 {'arxiv_id_base': '1911.05419',
  'published': '2019-11-13T12:17:31Z',
  'title': 'Self-supervised representation learning from electroencephalography signals'},
 {'arxiv_id_base': '2005.09687',
  'published': '2020-05-19T18:10:35Z',
  'title': 'Deep learning approaches for neural decoding: from CNNs to LSTMs and spikes to fMRI'},
 {'arxiv_id_base': '2007.04871',
  'published': '2020-06-30T20:32:37Z',
  'title': 'Subject-Aware Contrastive Learning for Biosignals'},
 {'arxiv_id_base': '2007.13018',
  'published': '2020-07-25T21:59:17Z',
  'title': 'Federated Self-Supervised Learning of Multi-Sensor Representations for Embedded Intelligence'},
 {'arxiv

### Query A vs Query B counts

Runs both canonical queries (`QUERY_A` and `QUERY_B`) and shows per-query counts plus the deduplicated count of their union.


In [8]:
def count_view(rows, month):
    """Apply the month -> category -> dedupe filter chain and report the size of each stage.

    Args:
        rows: Records to filter; each is indexed by 'published' and 'categories'.
        month: Value handed to ``in_month``; a falsy value skips the month filter,
            in which case 'in_month_count' equals 'raw_count'.

    Returns:
        A ``(counts, deduped_rows)`` tuple: ``counts`` is a dict with the keys
        'raw_count', 'in_month_count', 'category_match_count' and 'deduped_count';
        ``deduped_rows`` is the result of ``dedupe_latest`` on the category-matched rows.
    """
    if month:
        in_month_rows = [row for row in rows if in_month(row['published'], month)]
    else:
        in_month_rows = rows

    matched_rows = [row for row in in_month_rows if category_match(row['categories'])]
    unique_rows = dedupe_latest(matched_rows)

    counts = {
        'raw_count': len(rows),
        'in_month_count': len(in_month_rows),
        'category_match_count': len(matched_rows),
        'deduped_count': len(unique_rows),
    }
    return counts, unique_rows

# Fetch each canonical query with the shared limits from the config cell.
rows_a = fetch_query(
    query=QUERY_A,
    max_results=MAX_RESULTS,
    rate_limit_seconds=RATE_LIMIT_SECONDS,
)
rows_b = fetch_query(
    query=QUERY_B,
    max_results=MAX_RESULTS,
    rate_limit_seconds=RATE_LIMIT_SECONDS,
)

# Per-query stage counts, using the same MONTH setting for both.
counts_a, deduped_a = count_view(rows_a, MONTH)
counts_b, deduped_b = count_view(rows_b, MONTH)

# Union of both deduped sets, deduped again so records matched by both queries count once.
combined_deduped = dedupe_latest(deduped_a + deduped_b)

print('QUERY_A counts:', counts_a)
print('QUERY_B counts:', counts_b)
print('combined_deduped_count:', len(combined_deduped))


QUERY_A counts: {'raw_count': 253, 'in_month_count': 253, 'category_match_count': 233, 'deduped_count': 233}
QUERY_B counts: {'raw_count': 548, 'in_month_count': 548, 'category_match_count': 470, 'deduped_count': 470}
combined_deduped_count: 594


In [None]:
# Hand-written example record (placeholder values) passed as the `paper` argument to
# triage_paper and summarize_paper in later cells.
# NOTE(review): the key set presumably mirrors the records produced by fetch_query — confirm
# against eegfm_digest.arxiv.
paper = {
    'arxiv_id': '2501.00001v1',
    'arxiv_id_base': '2501.00001',
    'version': 1,
    'title': 'Example EEG Foundation Model Paper',
    'summary': 'We propose a self-supervised EEG pretraining framework for transfer across tasks.',
    'authors': ['Author A', 'Author B'],
    'categories': ['cs.LG', 'q-bio.NC'],
    'published': '2025-01-10T00:00:00Z',
    'updated': '2025-01-10T00:00:00Z',
    'links': {'abs': 'https://arxiv.org/abs/2501.00001', 'pdf': None},
}
paper


In [None]:
# Client settings for the triage call; the model name comes from TRIAGE_MODEL.
triage_config = LLMConfig(
    api_key=load_api_key(),
    model=TRIAGE_MODEL,
    temperature=0.2,
    max_output_tokens=1024,
)
triage_client = GeminiClient(triage_config)

# Run triage on the example paper with the triage prompt, repair prompt and schema.
triage = triage_paper(
    paper=paper,
    llm=triage_client,
    prompt_template=TRIAGE_PROMPT,
    repair_template=REPAIR_PROMPT,
    schema=TRIAGE_SCHEMA,
)
triage


## Summary payload mode check

If `fulltext` prompt tokens are too large, summarization automatically falls back to `fulltext_slices`.


In [None]:
# Sample PDF is expected under <REPO_ROOT>/data; extracted text goes under <REPO_ROOT>/outputs/_sanity.
pdf_path = REPO_ROOT / 'data' / 'EEGFormer_eegfm.pdf'
out_dir = REPO_ROOT / 'outputs' / '_sanity'
out_dir.mkdir(parents=True, exist_ok=True)
text_path = out_dir / 'eegformer.txt'
# Stop here with a readable message if the sample PDF is absent.
assert pdf_path.exists(), f'Missing PDF at {pdf_path}'
# extract_text receives the PDF path and the text path; the text is then read back from text_path.
# NOTE(review): presumably extract_text writes text_path and returns extraction metadata — confirm.
meta = extract_text(pdf_path, text_path)
raw_text = text_path.read_text(encoding='utf-8')
slices = slice_paper_text(raw_text)
# Cell output: the extraction result plus the length of each slice value.
meta, {k: len(v) for k, v in slices.items()}


In [None]:
# Client settings for the summary call; the model name comes from SUMMARY_MODEL.
summary_config = LLMConfig(
    api_key=load_api_key(),
    model=SUMMARY_MODEL,
    temperature=0.2,
    max_output_tokens=2048,
)
summary_client = GeminiClient(summary_config)

# Summarize the example paper using the triage result and the extracted PDF text.
# Both the raw full text and its slices are supplied; `notes` carries the extraction
# metadata serialized as JSON with sorted keys.
summary = summarize_paper(
    paper=paper,
    triage=triage,
    raw_fulltext=raw_text,
    fulltext_slices=slices,
    used_fulltext=True,
    notes=json.dumps(meta, sort_keys=True),
    llm=summary_client,
    prompt_template=SUMMARIZE_PROMPT,
    repair_template=REPAIR_PROMPT,
    schema=SUMMARY_SCHEMA,
    max_input_tokens=120000,
)
summary
