## Step 3: get full content for items to pursue

In [1]:
import sys
sys.path.append('../')

import os
import asyncio
import pandas as pd
from tqdm.asyncio import tqdm_asyncio
from limiter import Limiter
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Rate limiters: one per upstream service so each is throttled independently.
# `rate` and `capacity` are set to the same value for both limiters.
# NOTE(review): presumably `rate` is tokens replenished per second — confirm
# against the `limiter` package docs, and confirm that the arXiv rate is
# acceptable under arXiv's published access guidelines.
JINA_RATE = 250
ARXIV_RATE = 100

jina_limiter = Limiter(rate=JINA_RATE, capacity=JINA_RATE, consume=1)
arxiv_limiter = Limiter(rate=ARXIV_RATE, capacity=ARXIV_RATE, consume=1)

# Content store used by the loading cells below (exists / get / save_* calls).
from data import ContentManager
manager = ContentManager(base_path="../data")

INFO:data.content_manager:Loaded content index with 233 entries


In [3]:
df = pd.read_csv("../data/research_items.csv")

# Start from every row the scout marked "pursue" ...
pending_mask = df["scout_decision"] == "pursue"

# ... and, when the bookkeeping column exists, keep only rows whose
# content has not been recorded as loaded yet (value still missing).
if "content_loaded_locally" in df.columns:
    pending_mask = pending_mask & df["content_loaded_locally"].isna()

pursue_df = df[pending_mask]

print(f'{len(pursue_df)} items to load from {len(df)} total items')

2418 items to load from 4202 total items


Define a function that loads a single item, routing arXiv URLs to the PDF downloader and all other URLs to the Jina HTML fetcher.

In [4]:
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(3),
    reraise=True,  # surface the last real exception instead of tenacity's RetryError
)
async def _fetch_and_store(url, title, abstract):
    """Fetch one URL and store it via ``manager``, retrying on any exception.

    Args:
        url: Item URL. URLs containing "arxiv.org" go through the arXiv PDF
            path; everything else goes through the Jina path.
        title: Title forwarded to the manager's save call.
        abstract: Forwarded as ``abstract`` to ``save_arxiv_pdf`` (arXiv only).

    Returns:
        "arxiv" or "jina", naming the path that stored the content.

    Raises:
        Exception: whatever the last attempt raised, once all three attempts
            have failed.
    """
    if "arxiv.org" in url:
        async with arxiv_limiter:
            # Called without await, exactly as before: this runs synchronously
            # inside the coroutine while the limiter slot is held.
            manager.save_arxiv_pdf(url, title=title, abstract=abstract)
        return "arxiv"

    async with jina_limiter:
        await manager.fetch_and_save_jina(url, title=title)
    return "jina"


async def load_single_item(row):
    """Load the content for one research item and report the outcome.

    Items already known to ``manager`` are reported as loaded without any
    network access. Otherwise the content is fetched with retries; the
    retry wrapper lives on ``_fetch_and_store`` because this function turns
    failures into a result dict, and a retry placed here would never see an
    exception to retry on.

    Args:
        row: Mapping with a "url" key and optional "title" and "relevance"
            keys ("relevance" is passed on as the arXiv abstract).

    Returns:
        dict with keys "url", "loaded" (bool), "error" (None, or the final
        error message truncated to 200 characters) and "source"
        (the stored source, "arxiv", "jina", or None on failure).
    """
    url = row["url"]
    title = row.get("title")

    if manager.exists(url):
        return {"url": url, "loaded": True, "error": None, "source": manager.get(url)["source"]}

    try:
        source = await _fetch_and_store(url, title, row.get("relevance"))
    except Exception as e:
        # All retry attempts failed: report instead of raising so one bad
        # item cannot abort the whole gathered batch.
        return {"url": url, "loaded": False, "error": str(e)[:200], "source": None}

    return {"url": url, "loaded": True, "error": None, "source": source}

Run batch

In [5]:
# One plain dict per pending row, keyed by column name.
items = pursue_df.to_dict(orient="records")
print(f"Loading content for {len(items)} items")

Loading content for 2418 items


In [None]:
results = await tqdm_asyncio.gather(*[load_single_item(item) for item in items], desc="Loading content")

Loading content:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:data.content_manager:Saved arXiv: 2512.10573v1
INFO:data.content_manager:Saved arXiv: 2512.10547v1
INFO:data.content_manager:Saved arXiv: 2512.10561v1
INFO:data.content_manager:Saved arXiv: 2512.10563v1
Loading content: 100%|██████████| 4/4 [00:07<00:00,  2.00s/it]


Update df

In [None]:
# Write each load outcome back onto every row that shares its URL.
for outcome in results:
    same_url = df["url"].eq(outcome["url"])
    # A successful load stores the truthy flag; a failed one stores None, so the
    # value stays missing and the `.isna()` filter on this column selects it again.
    df.loc[same_url, "content_loaded_locally"] = outcome["loaded"] or None
    df.loc[same_url, "content_load_error"] = outcome["error"]

In [None]:
# Persist the updated table back to the CSV it was read from (row index omitted).
df.to_csv(path_or_buf="../data/research_items.csv", index=False)