# Pipeline One-Paper Sanity

End-to-end smoke run against the real Gemini API, covering one month and the path of one accepted paper through the pipeline.


In [None]:
from pathlib import Path
import json
import sqlite3

from eegfm_digest.config import load_config
from eegfm_digest.pipeline import run_month


In [None]:
# Load the project configuration and run the pipeline for one hard-coded month.
cfg = load_config()
month = '2025-01'
# no_pdf and force are off while no_site is on; presumably this keeps PDF
# processing, skips site generation and avoids forced re-runs -- TODO confirm
# against run_month's signature.
run_month(cfg, month, no_pdf=False, no_site=True, force=False)
# Per-month directory under the configured output root; the cells below read
# this month's JSONL files from here.
month_out = cfg.output_dir / month
# Bare expression so the notebook displays the resolved path.
month_out


In [None]:
def read_jsonl(path: Path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the JSONL file to read (decoded as UTF-8).

    Returns:
        One decoded JSON value per non-blank line, in file order. An empty
        list is returned when ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    if not path.exists():
        return rows
    # Iterate the file handle instead of calling str.splitlines(): splitlines()
    # also breaks on U+2028, U+2029 and U+0085, which may legally appear
    # unescaped inside JSON strings and would cut a valid record in half.
    # File iteration only breaks on real newlines and streams the file too.
    with path.open(encoding='utf-8') as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if record:
                rows.append(json.loads(record))
    return rows

# Read the month's three JSONL outputs in order, then display their row counts.
triage_rows, summary_rows, backend_rows = (
    read_jsonl(month_out / filename)
    for filename in ('triage.jsonl', 'papers.jsonl', 'backend_rows.jsonl')
)
tuple(len(rows) for rows in (triage_rows, summary_rows, backend_rows))


In [None]:
# Display the first backend row, or nothing (None) when no rows were loaded.
next(iter(backend_rows), None)


In [None]:
# Print the row count of each table this sanity check inspects in the
# digest SQLite database.
db_path = cfg.data_dir / 'digest.sqlite'
conn = sqlite3.connect(db_path)
try:
    cur = conn.cursor()
    # Table names come from the fixed tuple below, never from external input,
    # so interpolating them into the SQL text is safe here.
    for table in ('papers', 'triage', 'summaries', 'runs'):
        n = cur.execute(f'SELECT COUNT(*) FROM {table}').fetchone()[0]
        print(table, n)
finally:
    # Close the connection even when a query raises (e.g. a table is missing),
    # so a failed cell does not leave the database file held open.
    conn.close()


In [None]:
# Top-level keys every backend row is expected to carry.
required_backend_keys = {
    'arxiv_id',
    'arxiv_id_base',
    'version',
    'title',
    'summary',
    'authors',
    'categories',
    'published',
    'updated',
    'links',
    'triage',
    'paper_summary',
    'pdf',
}
if backend_rows:
    # Only the first row is spot-checked; iterating a dict yields its keys,
    # so it can be passed to issubset() directly.
    first_row = backend_rows[0]
    print(required_backend_keys.issubset(first_row))
    # Show which fields the nested triage and pdf objects expose.
    print(first_row['triage'].keys())
    print(first_row['pdf'].keys())
