## Step 3: get full content for items to pursue

In [1]:
import sys
sys.path.append('../')

import os
import asyncio
import pandas as pd
from tqdm.asyncio import tqdm_asyncio
from limiter import Limiter
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Project-local import; relies on '../' having been appended to sys.path above
from data import ContentManager

# Content store rooted at the shared data directory
manager = ContentManager(base_path="../data")

# One limiter per upstream provider; every guarded call consumes a single token.
# NOTE(review): confirm that rate/capacity match each provider's published limits.
arxiv_limiter = Limiter(rate=100, capacity=100, consume=1)
jina_limiter = Limiter(rate=250, capacity=250, consume=1)

INFO:data.content_manager:Loaded content index with 233 entries


In [35]:
df = pd.read_csv("../data/research_items.csv")

# Start from every item the scout marked "pursue" ...
pursue_df = df[df["scout_decision"].eq("pursue")]

# ... and, when load status has been recorded before, keep only the rows
# that still have no locally stored content
if "content_loaded_locally" in df.columns:
    pursue_df = pursue_df[pursue_df["content_loaded_locally"].isna()]

print(f'{len(pursue_df)} items to load from {len(df)} total items')

52 items to load from 4202 total items


Define a function that loads a single item, routing arXiv URLs to the PDF loader and all other URLs to the HTML (Jina) loader

In [36]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3), reraise=True)
async def _fetch_item_content(source, url, title, abstract):
    """Download and store one item's content, retrying on failure.

    Exceptions must escape this helper for tenacity to see them, so nothing
    is caught here. With ``reraise=True`` the last underlying exception
    (rather than a ``RetryError`` wrapper) surfaces after the third attempt.

    Args:
        source: ``"arxiv"`` to store the arXiv PDF, anything else to fetch via Jina.
        url: Item URL.
        title: Item title passed through to the content manager.
        abstract: Text stored alongside arXiv PDFs (unused for Jina fetches).
    """
    if source == "arxiv":
        async with arxiv_limiter:
            # NOTE(review): this call is not awaited, so it presumably runs
            # synchronously and holds the event loop while it downloads — confirm.
            manager.save_arxiv_pdf(url, title=title, abstract=abstract)
    else:
        async with jina_limiter:
            await manager.fetch_and_save_jina(url, title=title)


async def load_single_item(row):
    """Load the content for one research item and report the outcome.

    Items already known to the content manager are reported as loaded without
    any network call. Otherwise the item is fetched (arXiv URLs as PDF, all
    other URLs through Jina) with up to three attempts and randomized
    exponential backoff between attempts.

    Previously the retry decorator wrapped this function directly, but the
    function caught every exception itself, so no retry could ever trigger.

    Args:
        row: Mapping with a ``"url"`` key and optional ``"title"`` and
            ``"relevance"`` keys.

    Returns:
        dict with keys ``url``, ``loaded`` (bool), ``error`` (message truncated
        to 200 characters, or None) and ``source`` (``"arxiv"``, ``"jina"``,
        the stored source for cached items, or None on failure).
    """
    url = row["url"]
    title = row.get("title")

    if manager.exists(url):
        return {"url": url, "loaded": True, "error": None, "source": manager.get(url)["source"]}

    try:
        # Routing stays inside the try so a malformed URL value is reported
        # as a load error instead of aborting the whole batch
        source = "arxiv" if "arxiv.org" in url else "jina"
        await _fetch_item_content(source, url, title, row.get("relevance"))
    except Exception as e:
        # Reached only after all retry attempts have failed
        return {"url": url, "loaded": False, "error": str(e)[:200], "source": None}

    return {"url": url, "loaded": True, "error": None, "source": source}

Run batch

In [37]:
# One plain dict per pending row, keyed by column name
items = pursue_df.to_dict(orient="records")
print(f"Loading content for {len(items)} items")

Loading content for 52 items


In [38]:
results = await tqdm_asyncio.gather(*[load_single_item(item) for item in items], desc="Loading content")

Loading content:   0%|          | 0/52 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
Loading content:   2%|▏         | 1/52 [00:01<01:25,  1.68s/it]INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
Loading content:   8%|▊         | 4/52 [00:02<00:28,  1.71it/s]INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
Loading content:  19%|█▉        | 10/52 [00:03<00:09,  4.55it/s]INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
Loading content:  

In [39]:
# Distinct error messages across the batch (empty set means every item loaded)
{outcome["error"] for outcome in results if outcome["error"] is not None}

set()

Update df

In [42]:
# Write each load outcome back onto every row that shares the item's URL
for result in results:
    mask = df["url"].eq(result["url"])
    # A failed load is recorded as None (missing) rather than False, so an
    # isna() check on this column still treats the item as not yet loaded
    df.loc[mask, "content_loaded_locally"] = result["loaded"] or None
    df.loc[mask, "content_load_error"] = result["error"]

In [43]:
# Rows that carry a recorded load error
df[df["content_load_error"].notna()]

Unnamed: 0,focus_area,provider,url,title,source,published,relevance,date_added,scout_decision,scout_confidence,scout_reasoning,scouted_at,content_loaded_locally,content_load_error


In [44]:
# Persist the updated load status to the same CSV the frame was read from;
# index=False keeps the DataFrame index out of the file
df.to_csv("../data/research_items.csv", index=False)

In [55]:
# Inspect the frame after the update
df

Unnamed: 0,focus_area,provider,url,title,source,published,relevance,date_added,scout_decision,scout_confidence,scout_reasoning,scouted_at,content_loaded_locally,content_load_error
0,dummy1,openai,https://dummy1.com,dummy1,OpenAI Blog,2025-12-01,dummy example 1,2025-12-20,discard,0.95,The title and summary are placeholders (“dummy...,2025-12-26T23:51:12.379244,,
1,dummy2,anthropic,https://dummy2.com,dummy2,arXiv,2025-12-15,dummy example 2,2025-12-21,discard,0.98,The title and summary are placeholders (“dummy...,2025-12-26T23:51:12.379626,,
2,reasoning_agent,openai,https://openai.com/index/gpt-5-2-codex,Introducing GPT-5.2-Codex,OpenAI blog,2025-12-18,Official release of an agentic coding model em...,2025-12-24,pursue,0.93,"This is an official OpenAI release (credible, ...",2025-12-26T23:51:12.380414,True,
3,reasoning_agent,openai,https://openai.com/index/introducing-gpt-5-2/,Introducing GPT-5.2,OpenAI blog,2025-12-11,Details GPT-5.2 “Thinking/Pro” modes and API r...,2025-12-24,pursue,0.93,This is a recent (2025-12-11) primary-source r...,2025-12-26T23:51:12.380718,True,
4,reasoning_agent,openai,https://blog.google/products/gemini/gemini-3/,Introducing Gemini 3: our most intelligent mod...,Google Blog (Gemini/DeepMind),2025-11-18,Announces Gemini 3 with “thinking”/Deep Think ...,2025-12-24,pursue,0.86,"This is a very recent, high-signal primary-sou...",2025-12-26T23:51:12.380975,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4197,arxiv,arxiv,https://arxiv.org/abs/2512.10563v1,NormCode: A Semi-Formal Language for Context-I...,arXiv,2025-12-11,Summary: Multistep workflows that chain large ...,2025-12-27,pursue,0.82,This is directly relevant to agentic workflow ...,2025-12-27T22:33:14.888704,True,
4198,arxiv,arxiv,https://arxiv.org/abs/2512.10561v1,Causal Reasoning Favors Encoders: On The Limit...,arXiv,2025-12-11,Summary: In context learning (ICL) underpins r...,2025-12-27,pursue,0.78,This is directly relevant to our “reasoning re...,2025-12-27T22:33:14.888972,True,
4199,arxiv,arxiv,https://arxiv.org/abs/2512.10551v1,LLM-Auction: Generative Auction towards LLM-Na...,arXiv,2025-12-11,Summary: The rapid advancement of large langua...,2025-12-27,discard,0.85,This is primarily about mechanism design for L...,2025-12-27T22:33:14.889237,,
4200,arxiv,arxiv,https://arxiv.org/abs/2512.10547v1,Unlocking the Address Book: Dissecting the Spa...,arXiv,2025-12-11,Summary: The Key-Value (KV) cache is the prima...,2025-12-27,pursue,0.78,This looks directly relevant to production con...,2025-12-27T22:33:14.889502,True,


Check file sizes, then delete the PDFs

In [50]:
%%bash
# Report the number of files and their combined disk usage under ../data,
# one line per extension that has at least one file.
for ext in md pdf json csv py ipynb; do
  # wc pads its output on some platforms; strip the padding to get a bare integer
  count=$(find ../data -type f -name "*.$ext" 2>/dev/null | wc -l | tr -d ' ')
  if [ "$count" -gt 0 ]; then
    # Sum per-file usage (KiB) in awk instead of grepping du's "total" line:
    # with many files, find may invoke du more than once, and each invocation
    # would print its own total.
    size=$(find ../data -type f -name "*.$ext" -exec du -k {} + 2>/dev/null |
      awk '{ kb += $1 }
           END {
             split("K M G T", unit, " ")
             i = 1
             while (kb >= 1024 && i < 4) { kb /= 1024; i++ }
             printf "%.1f%s", kb, unit[i]
           }')
    printf "%-8s : %4d files, %8s\n" ".$ext" "$count" "$size"
  fi
done

.md      : 2483 files,     148M
.pdf     : 1697 files,     5.9G
.json    :    1 files,     3.2M
.csv     :    2 files,      14M
.py      :    3 files,      16K


In [53]:
%%bash
# Permanently removes every regular file named *.pdf below ../data/contents.
# NOTE(review): confirm nothing else still needs these PDFs before running — the deletion cannot be undone.
find ../data/contents -name "*.pdf" -type f -delete

In [54]:
%%bash
# Report the number of files and their combined disk usage under ../data,
# one line per extension that has at least one file.
for ext in md pdf json csv py ipynb; do
  # wc pads its output on some platforms; strip the padding to get a bare integer
  count=$(find ../data -type f -name "*.$ext" 2>/dev/null | wc -l | tr -d ' ')
  if [ "$count" -gt 0 ]; then
    # Sum per-file usage (KiB) in awk instead of grepping du's "total" line:
    # with many files, find may invoke du more than once, and each invocation
    # would print its own total.
    size=$(find ../data -type f -name "*.$ext" -exec du -k {} + 2>/dev/null |
      awk '{ kb += $1 }
           END {
             split("K M G T", unit, " ")
             i = 1
             while (kb >= 1024 && i < 4) { kb /= 1024; i++ }
             printf "%.1f%s", kb, unit[i]
           }')
    printf "%-8s : %4d files, %8s\n" ".$ext" "$count" "$size"
  fi
done

.md      : 2483 files,     148M
.json    :    1 files,     3.2M
.csv     :    2 files,      14M
.py      :    3 files,      16K
