## Step 3: get full content for items to pursue

In [1]:
import sys
sys.path.append('../')

import pandas as pd
from tqdm.asyncio import tqdm_asyncio
from limiter import Limiter
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Rate limiters: one per upstream service; each is entered with `async with` around a request.
# NOTE(review): rate/capacity/consume are presumably token-bucket settings — confirm the
# units against the `limiter` package documentation before tuning them.
jina_limiter = Limiter(rate=250, capacity=250, consume=1)
arxiv_limiter = Limiter(rate=100, capacity=100, consume=1)

# Project-local package; presumably resolved through the '../' entry appended to sys.path above.
from data import ContentManager
# Content store rooted at ../data; constructing it logs how many index entries were loaded.
manager = ContentManager(base_path="../data")

INFO:data.content_manager:Loaded content index with 2521 entries


In [None]:
# Load the research-item table; later cells filter it, update it, and write it back to this same path.
df = pd.read_csv("../data/research_items.csv")

In [10]:
# Keep rows the scout marked "pursue"; when the table already tracks local
# loading, additionally drop rows whose content has been loaded.
pending_mask = df["scout_decision"] == "pursue"
if "content_loaded_locally" in df.columns:
    pending_mask = pending_mask & df["content_loaded_locally"].isna()
pursue_df = df[pending_mask]

print(f'{len(pursue_df)} items to load from {len(df)} total items')

1 items to load from 4355 total items


Define a function that loads a single item, routing between arXiv PDFs and HTML websites.

In [11]:
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(3),
    reraise=True,  # surface the last real exception instead of tenacity's RetryError
)
async def _fetch_item(url, title, abstract):
    """Fetch and store one item, retrying on failure; return the source label.

    Exceptions are deliberately NOT caught here: tenacity only retries when the
    wrapped coroutine raises, so swallowing errors at this level would disable
    the retry policy entirely.

    Returns:
        "arxiv" when the URL contains "arxiv.org", otherwise "jina".

    Raises:
        Whatever the final attempt raised, once all attempts are used up.
    """
    if "arxiv.org" in url:
        async with arxiv_limiter:
            # NOTE(review): called without `await`, as before. If this is a blocking
            # sync call it stalls the event loop while it runs; if it is a coroutine
            # it never executes — confirm against ContentManager.
            manager.save_arxiv_pdf(url, title=title, abstract=abstract)
        return "arxiv"

    async with jina_limiter:
        await manager.fetch_and_save_jina(url, title=title)
    return "jina"


async def load_single_item(row):
    """Load the content for one research item and report the outcome.

    Args:
        row: Mapping with a "url" key and optional "title" / "relevance" keys
            (one record from the research-items table).

    Returns:
        dict with keys "url", "loaded" (bool), "error" (message truncated to
        200 characters, or None) and "source" (the stored source for already
        present content, "arxiv" or "jina" for fresh loads, None on failure).
    """
    url = row["url"]

    # Already stored locally: report it as loaded without hitting the network.
    if manager.exists(url):
        return {"url": url, "loaded": True, "error": None, "source": manager.get(url)["source"]}

    try:
        # The "relevance" field is passed as the abstract, matching the original call.
        source = await _fetch_item(url, row.get("title"), row.get("relevance"))
    except Exception as e:
        # Reached only after the retry policy is exhausted; keep the batch running
        # and record a bounded error message for the results table.
        return {"url": url, "loaded": False, "error": str(e)[:200], "source": None}

    return {"url": url, "loaded": True, "error": None, "source": source}

Run batch

In [12]:
# One plain dict per pending row, so each can be handed to load_single_item on its own.
items = pursue_df.to_dict(orient="records")
print(f"Loading content for {len(items)} items")

Loading content for 1 items


In [13]:
results = await tqdm_asyncio.gather(*[load_single_item(item) for item in items], desc="Loading content")

Loading content:   0%|          | 0/1 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://r.jina.ai/ "HTTP/1.1 200 OK"
Loading content: 100%|██████████| 1/1 [00:04<00:00,  4.77s/it]


In [14]:
# Distinct error messages across the batch (an empty set means every item loaded).
{outcome["error"] for outcome in results if outcome["error"] is not None}

set()

Update df

In [15]:
# Write each outcome back onto every row sharing its URL. A failed load stores
# None rather than False, so the row still counts as "not yet loaded".
for outcome in results:
    same_url = df["url"] == outcome["url"]
    loaded_flag = outcome["loaded"] or None
    df.loc[same_url, "content_loaded_locally"] = loaded_flag
    df.loc[same_url, "content_load_error"] = outcome["error"]

In [16]:
# Show only the rows that recorded a load error.
df[df["content_load_error"].notna()]

Unnamed: 0,focus_area,provider,url,title,source,published,relevance,date_added,scout_decision,scout_confidence,...,content_loaded_locally,content_load_error,curator_summary,curator_takeaways,curator_tags,applicability_score,novelty_score,priority_score,verdict_reasoning,curated_at


In [17]:
# Persist the updated table, overwriting the CSV it was read from; the DataFrame index is not written.
df.to_csv("../data/research_items.csv", index=False)

Check file size, delete PDFs

In [18]:
%%bash
# Report, per file extension, how many matching files exist under ../data and their combined size.
for ext in md pdf json csv py ipynb; do
  # Count matching files; find's errors are discarded and tr removes any spaces around wc's number.
  count=$(find ../data -name "*.$ext" 2>/dev/null | wc -l | tr -d ' ')
  # Extensions with no files are skipped, so they produce no row.
  if [ $count -gt 0 ]; then
    # du -c appends a "total" line; keep only the size column from it.
    size=$(find ../data -name "*.$ext" -exec du -ch {} + 2>/dev/null | grep total$ | awk '{print $1}')
    printf "%-8s : %4d files, %8s\n" ".$ext" "$count" "$size"
  fi
done

.md      : 2632 files,     155M
.pdf     :   44 files,     146M
.json    :    1 files,     3.3M
.csv     :    2 files,      33M


.py      :    3 files,      16K


In [19]:
%%bash
# Permanently delete every regular file ending in .pdf under ../data/contents.
find ../data/contents -name "*.pdf" -type f -delete

In [20]:
%%bash
# Repeat the per-extension count/size report for ../data, run here after the PDF deletion cell.
for ext in md pdf json csv py ipynb; do
  # Count matching files; find's errors are discarded and tr removes any spaces around wc's number.
  count=$(find ../data -name "*.$ext" 2>/dev/null | wc -l | tr -d ' ')
  # Extensions with no files are skipped, so they produce no row.
  if [ $count -gt 0 ]; then
    # du -c appends a "total" line; keep only the size column from it.
    size=$(find ../data -name "*.$ext" -exec du -ch {} + 2>/dev/null | grep total$ | awk '{print $1}')
    printf "%-8s : %4d files, %8s\n" ".$ext" "$count" "$size"
  fi
done

.md      : 2632 files,     155M


.json    :    1 files,     3.3M
.csv     :    2 files,      32M
.py      :    3 files,      16K


In [None]:
# TODO: re-run this notebook until no items report a load error