## Step 3: get full content for items to pursue

In [1]:
import sys
sys.path.append('../')

import pandas as pd
from tqdm.asyncio import tqdm_asyncio
from limiter import Limiter
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Rate limiters, one per upstream service. Each is used as an `async with`
# guard around the corresponding fetch in the loader cell below.
# NOTE(review): rate/capacity/consume look like token-bucket settings
# (refill rate, burst size, tokens taken per call) — confirm the units
# against the `limiter` package documentation.
jina_limiter = Limiter(rate=20, capacity=20, consume=1)
arxiv_limiter = Limiter(rate=100, capacity=100, consume=1)

# `data` is a project-local package; presumably it resolves through the
# '../' entry appended to sys.path at the top of this cell.
from data import ContentManager
# Single shared content store used by every loader call; rooted at ../data.
manager = ContentManager(base_path="../data")

INFO:data.content_manager:Loaded content index with 2995 entries


In [2]:
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(3),
    reraise=True,  # surface the real exception (not tenacity.RetryError) once attempts run out
)
async def _fetch_and_save(row):
    """Download and store the content for one row, raising on failure.

    Exceptions are deliberately NOT caught here: tenacity only retries when
    the wrapped coroutine raises, so swallowing errors inside this function
    would disable the retry policy entirely.

    Args:
        row: Mapping with a "url" key and optional "title" / "relevance" keys.

    Returns:
        "arxiv" or "jina", naming the path that stored the content.
    """
    url = row["url"]
    title = row.get("title")

    if "arxiv.org" in url:
        async with arxiv_limiter:
            # NOTE(review): this call is not awaited, so it appears to be
            # synchronous and will hold the event loop while it runs —
            # confirm, and consider offloading it if throughput matters.
            manager.save_arxiv_pdf(url, title=title, abstract=row.get("relevance"))
        return "arxiv"

    async with jina_limiter:
        await manager.fetch_and_save_jina(url, title=title)
    return "jina"


async def load_single_item(row):
    """Ensure the content for one research item is stored locally.

    Items already known to the content manager are reported as loaded without
    any network call. Otherwise the item is fetched (arXiv URLs through the
    arXiv path, everything else through Jina) with up to three attempts and
    randomized exponential backoff between them.

    Args:
        row: Mapping with a "url" key and optional "title" / "relevance" keys.

    Returns:
        dict with keys "url", "loaded" (bool), "error" (None, or the first
        200 characters of the failure message) and "source" (str or None).
        Fetch failures are reported through the dict rather than raised.
    """
    url = row["url"]

    if manager.exists(url):
        return {"url": url, "loaded": True, "error": None, "source": manager.get(url)["source"]}

    try:
        source = await _fetch_and_save(row)
    except Exception as e:
        # Only reached after the retry policy on _fetch_and_save is exhausted.
        return {"url": url, "loaded": False, "error": str(e)[:200], "source": None}

    return {"url": url, "loaded": True, "error": None, "source": source}

In [3]:
# Load the research items table. Later cells read its "url", "title",
# "relevance" and "scout_decision" columns.
df = pd.read_csv("../data/research_items.csv")

Retry in a loop, up to a maximum number of attempts, until no load errors remain

In [4]:
MAX_ATTEMPTS = 7
attempt = 0

while attempt < MAX_ATTEMPTS:
    attempt += 1
    print(f"\n{'='*50}")
    print(f"Attempt {attempt}/{MAX_ATTEMPTS}")
    print(f"{'='*50}")
    
    # 1. FILTER
    if "content_loaded_locally" in df.columns:
        pursue_df = df[
            (df["scout_decision"] == "pursue") & 
            (df["content_loaded_locally"].isna() | df["content_load_error"].notna())
        ]
    else:
        pursue_df = df[df["scout_decision"] == "pursue"]
    
    if len(pursue_df) == 0:
        print("✅ All items loaded successfully!")
        break
    
    print(f'{len(pursue_df)} items to load from {len(df)} total items')
    
    # 2. CONVERT TO DICT
    items = pursue_df.to_dict("records")

    # 3. RUN BATCH
    results = await tqdm_asyncio.gather(*[load_single_item(item) for item in items], desc="Loading content")

    # 4. UPDATE DF
    for result in results:
        mask = df["url"] == result["url"]
        df.loc[mask, "content_loaded_locally"] = result["loaded"] if result["loaded"] else None
        df.loc[mask, "content_load_error"] = result["error"]
    
    # 5. CHECK ERRORS
    errors = set([result['error'] for result in results if result["error"] is not None])
    if not errors:
        print("✅ No errors in this batch!")
        break
    else:
        print(f"⚠️ {len(errors)} unique errors remain, retrying...")
        
else:
    error_count = df[~df['content_load_error'].isna()].shape[0]
    print(f"❌ Max attempts ({MAX_ATTEMPTS}) reached, {error_count} errors remain")


Attempt 1/7
434 items to load from 5595 total items


Loading content:   0%|          | 0/434 [00:00<?, ?it/s]INFO:data.content_manager:Saved arXiv: 2503.17407
INFO:data.content_manager:Saved arXiv: 2512.23171v1
INFO:data.content_manager:Saved arXiv: 2512.20460v2
INFO:data.content_manager:Saved arXiv: 2512.23167v1
INFO:data.content_manager:Saved arXiv: 2512.22238v1
INFO:data.content_manager:Saved arXiv: 2512.23165v1
INFO:data.content_manager:Saved arXiv: 2512.20291v2
INFO:data.content_manager:Saved arXiv: 2512.23145v1
INFO:data.content_manager:Saved arXiv: 2512.22236v1
INFO:data.content_manager:Saved arXiv: 2512.23138v1
INFO:data.content_manager:Saved arXiv: 2512.20156v2
INFO:data.content_manager:Saved arXiv: 2512.23133v1
INFO:data.content_manager:Saved arXiv: 2512.22234v1
INFO:data.content_manager:Saved arXiv: 2512.23132v1
INFO:data.content_manager:Saved arXiv: 2512.22226v1
INFO:data.content_manager:Saved arXiv: 2512.23128v1
INFO:data.content_manager:Saved arXiv: 2512.22225v1
INFO:data.content_manager:Saved arXiv: 2512.23126v1
INFO:data.

MuPDF error: syntax error: cannot find ExtGState resource 'gs0'



INFO:data.content_manager:Saved arXiv: 2512.22247v1
INFO:data.content_manager:Saved arXiv: 2512.23213v1
INFO:data.content_manager:Saved arXiv: 2512.22245v1
INFO:data.content_manager:Saved arXiv: 2512.23206v1
INFO:data.content_manager:Saved arXiv: 2512.20745v2
INFO:data.content_manager:Saved arXiv: 2512.23184v1
INFO:data.content_manager:Saved arXiv: 2512.22240v1
INFO:data.content_manager:Saved arXiv: 2512.23173v1
INFO:data.content_manager:Saved arXiv: 2512.20491v4
Loading content:   0%|          | 1/434 [04:45<34:22:23, 285.78s/it]INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:data.content_manager:Saved via Jina: https://platform.openai.com/docs/guides/tools-connectors-mcp
Loading content:  82%|████████▏ | 357/434 [04:47<00:43,  1.77it/s]  INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:data.content_manager:Saved via Jina: https://newsletter.weaviate.io/p/94-faster-keyword-search-multi-vector-embeddings-nvidia-integrations-and-much-more
I

✅ No errors in this batch!


Save the updated DataFrame back to the CSV once the load loop has finished

In [5]:
# Overwrite the input CSV with the current DataFrame, including the
# content_loaded_locally / content_load_error status columns. index=False
# omits the row index from the file.
df.to_csv("../data/research_items.csv", index=False)

Check disk usage by file type, then delete the downloaded PDFs

In [6]:
%%bash
# Report, per extension, how many regular files exist under ../data and their
# combined size. Extensions with no files are skipped.
for ext in md pdf json csv py ipynb; do
  # -type f: count files only, matching what the PDF clean-up cell deletes.
  count=$(find ../data -type f -name "*.$ext" 2>/dev/null | wc -l | tr -d ' ')
  if [ "$count" -gt 0 ]; then
    # du -c appends a "total" line; keep only its size column.
    # NOTE: if find has to split a very long file list across several du
    # invocations, more than one total line is produced.
    size=$(find ../data -type f -name "*.$ext" -exec du -ch {} + 2>/dev/null | awk '/total$/ {print $1}')
    printf "%-8s : %4d files, %8s\n" ".$ext" "$count" "$size"
  fi
done

.md      : 3389 files,     197M
.pdf     :  314 files,     986M
.json    :    1 files,     3.7M
.csv     :    2 files,      36M
.py      :    3 files,      16K


In [7]:
%%bash
# Destructive: permanently removes every regular file ending in .pdf under
# ../data/contents. Other file types are not touched.
find ../data/contents -name "*.pdf" -type f -delete

In [8]:
%%bash
# Re-run the per-extension report to confirm the PDFs are gone: for each
# extension, print the number of regular files under ../data and their
# combined size. Extensions with no files are skipped.
for ext in md pdf json csv py ipynb; do
  # -type f: count files only, matching what the PDF clean-up cell deletes.
  count=$(find ../data -type f -name "*.$ext" 2>/dev/null | wc -l | tr -d ' ')
  if [ "$count" -gt 0 ]; then
    # du -c appends a "total" line; keep only its size column.
    # NOTE: if find has to split a very long file list across several du
    # invocations, more than one total line is produced.
    size=$(find ../data -type f -name "*.$ext" -exec du -ch {} + 2>/dev/null | awk '/total$/ {print $1}')
    printf "%-8s : %4d files, %8s\n" ".$ext" "$count" "$size"
  fi
done

.md      : 3389 files,     197M
.json    :    1 files,     3.7M
.csv     :    2 files,      36M
.py      :    3 files,      16K
