## Step 1: Pipeline to run research to collect candidates

In [1]:
# Put the parent directory on the import path so the project packages
# imported further down (components.*, data.*) can be resolved.
# NOTE(review): assumes the notebook is run from a folder one level below the
# repository root — confirm.
import sys
sys.path.append('../')

import pandas as pd
from datetime import datetime

# Configure logging before the project imports so that INFO records emitted by
# project modules appear in the cell outputs with timestamp/name/level.
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Project-local imports; these depend on the sys.path tweak above.
from components.agents.research_agents import run_mixed_research_agents
from components.prompts.research_agents import FOCUS_AREAS
from data.content_manager import ContentManager
from data.dedup import deduplicate_research

# Content manager rooted at ../data; it is handed to the research agents below.
# Constructing it logs how many entries the content index holds.
content_manager = ContentManager(base_path="../data")

2025-12-28 19:39:23,699 - data.content_manager - INFO - Loaded content index with 2484 entries


## 1. Using OpenAI, Anthropic and EXA web agents

In [2]:
# Show the names of the focus areas that are passed to the research agents.
FOCUS_AREAS.keys()

dict_keys(['reasoning_and_planning', 'agents_and_finance', 'agent_infrastructure', 'retrieval_and_embeddings', 'multimodal_and_generation'])

In [3]:
# Print the topic description stored for a single focus area as an example.
print(FOCUS_AREAS['reasoning_and_planning'])

Reasoning LLMs, chain-of-thought, inference-time compute, self-reflection, planning with LLMs, MCTS (Monte Carlo Tree Search) for language models, test-time scaling, hallucination reduction and detection, grounding, factuality


### Run

In [4]:
# Run the research agents for every focus area. Top-level `await` works here
# because the notebook kernel already runs an event loop.
# The result is consumed below as a mapping from "<focus_area> --- <provider>"
# keys to objects exposing an `.items` list.
# NOTE(review): in the recorded run this launched 15 tasks (OpenAI, Anthropic
# and Exa agents) and took several minutes; presumably it calls external APIs,
# so consider caching the result before re-running the whole notebook — confirm.
research_results = await run_mixed_research_agents(
    focus_areas=FOCUS_AREAS,
    content_manager=content_manager
)

2025-12-28 19:39:30,773 - components.agents.research_agents - INFO - Running 15 tasks
Research agents (mixed):   0%|          | 0/15 [00:00<?, ?it/s]2025-12-28 19:39:30,790 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: agents_and_finance
2025-12-28 19:39:31,084 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: agent_infrastructure
2025-12-28 19:39:31,093 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: multimodal_and_generation
2025-12-28 19:39:31,096 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: agent_infrastructure
2025-12-28 19:39:31,098 - components.agents.research_agents - INFO - Running Exa research agent for focus area: reasoning_and_planning
2025-12-28 19:39:31,134 - components.agents.research_agents - INFO - Running Exa research agent for focus area: agent_infrastructure
2025-12-28 19:39:31,135 - compone

### Deduplicate based on URL

In [5]:
# Parse the research results into a flat list of records, one per item.
# Each key of `research_results` has the form "<focus_area> --- <provider>".

# Explicit column order for the DataFrame. Supplying it to the constructor
# keeps the schema intact even when the agents return no items; a column-less
# empty frame would make the URL-based deduplication that follows raise a
# KeyError on 'url'.
record_columns = [
    'focus_area', 'provider', 'url', 'title',
    'source', 'published', 'relevance', 'date_added',
]

# Compute the batch date once: it is loop-invariant, and a single value
# guarantees all rows share the same date even if the cell runs over midnight.
date_added = datetime.now().strftime("%Y-%m-%d")

records = []
for key, result in research_results.items():
    focus_area, llm_provider = key.split(' --- ')
    for item in result.items:
        records.append({
            'focus_area': focus_area,
            'provider': llm_provider,
            'url': item.url,
            'title': item.title,
            'source': item.source,
            'published': item.published,
            'relevance': item.relevance,
            'date_added': date_added
        })

# Create DataFrame
df = pd.DataFrame(records, columns=record_columns)

print(f"There are {len(df)} ResearchItems present in the research results")

There are 449 ResearchItems present in the research results


In [6]:
# Deduplicate based on URL (keep first occurrence)
print(f"df.shape: {df.shape}")
df = df.drop_duplicates(subset='url', keep='first')
print(f"df.shape: {df.shape}")

df.shape: (449, 8)
df.shape: (431, 8)


### Add to existing collection

In [7]:
# Merge the freshly collected web-agent items into the main CSV (written back
# because save=True), then summarise what the merge did.
result = deduplicate_research(new_df=df, main_csv="../data/research_items.csv", save=True)

print(
    f"Added {result['new_added']} new items",
    f"Skipped {result['skipped']} duplicates",
    f"Total in DB: {result['total_after']}",
    sep="\n",
)

2025-12-28 19:47:10,391 - data.dedup - INFO - Loaded 4202 existing items from ../data/research_items.csv
2025-12-28 19:47:10,393 - data.dedup - INFO - New items: 153, Skipped (duplicates): 278
2025-12-28 19:47:10,581 - data.dedup - INFO - Saved 4355 items to ../data/research_items.csv


Added 153 new items
Skipped 278 duplicates
Total in DB: 4355


## 2. Using direct arXiv API

In [9]:
# Project arXiv client plus the category and keyword constants that are passed
# to the search in the next cell.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh kernel surfaces missing dependencies early.
from components.tools import ArxivClient, ARXIV_CATEGORIES, ARXIV_KEYWORDS
arxiv = ArxivClient()

In [10]:
# Query arXiv by category and by keyword; the result exposes an `.items` list
# that is flattened into a DataFrame below.
# NOTE(review): in the recorded run the category query returned exactly 3000
# entries and the keyword query exactly 150, i.e. both hit their caps, so the
# result sets are probably truncated — confirm whether the caps are intended.
arxiv_results = arxiv.search_cats_and_kws_both(
    categories=ARXIV_CATEGORIES,
    keywords=ARXIV_KEYWORDS,
    max_results_cats=3000,  # cap for the category query
    max_results_kws=150,  # cap for the keyword query
    last_n_days=None  # presumably disables the recency filter — verify in ArxivClient
)

2025-12-28 19:48:04,431 - components.tools.arxiv - INFO - Found 3000 entries for submitted categories
2025-12-28 19:48:17,260 - components.tools.arxiv - INFO - Found 150 entries for submitted keywords
2025-12-28 19:48:17,263 - components.tools.arxiv - INFO - Found 3150 total entries


In [11]:
# Sanity check: total number of items returned by the arXiv search.
len(arxiv_results.items)

3150

### Deduplicate based on URL

In [12]:
# Convert the arXiv items to a DataFrame with the same columns as the
# web-agent results; focus_area and provider are both fixed to 'arxiv'.

# Explicit column order for the DataFrame. Supplying it to the constructor
# keeps the schema intact even when the search returns nothing; a column-less
# empty frame would make the URL-based deduplication that follows raise a
# KeyError on 'url'.
record_columns = [
    'focus_area', 'provider', 'url', 'title',
    'source', 'published', 'relevance', 'date_added',
]

# Compute the batch date once: it is loop-invariant, and a single value
# guarantees all rows share the same date even if the cell runs over midnight.
date_added = datetime.now().strftime("%Y-%m-%d")

records = []
for item in arxiv_results.items:
    records.append({
        'focus_area': 'arxiv',
        'provider': 'arxiv',
        'url': item.url,
        'title': item.title,
        'source': item.source,
        'published': item.published,
        'relevance': item.relevance,
        'date_added': date_added
    })

df_arxiv = pd.DataFrame(records, columns=record_columns)
print(f"There are {len(df_arxiv)} ResearchItems present in the research results")

There are 3150 ResearchItems present in the research results


In [13]:
# Deduplicate based on URL (keep first occurrence)
print(f"df.shape: {df_arxiv.shape}")
df_arxiv = df_arxiv.drop_duplicates(subset='url', keep='first')
print(f"df.shape: {df_arxiv.shape}")

df.shape: (3150, 8)
df.shape: (3068, 8)


### Add to existing collection

In [14]:
# Merge the arXiv items into the main CSV (written back because save=True),
# then summarise what the merge did.
result = deduplicate_research(new_df=df_arxiv, main_csv="../data/research_items.csv", save=True)

print(
    f"Added {result['new_added']} new items",
    f"Skipped {result['skipped']} duplicates",
    f"Total in DB: {result['total_after']}",
    sep="\n",
)

2025-12-28 19:48:49,907 - data.dedup - INFO - Loaded 4355 existing items from ../data/research_items.csv
2025-12-28 19:48:49,910 - data.dedup - INFO - New items: 0, Skipped (duplicates): 3068
2025-12-28 19:48:50,100 - data.dedup - INFO - Saved 4355 items to ../data/research_items.csv


Added 0 new items
Skipped 3068 duplicates
Total in DB: 4355


### Check research items CSV

In [15]:
# Reload the collection from the CSV that the merge steps above saved to, so
# the check below reflects what is actually on disk.
recent_df = pd.read_csv("../data/research_items.csv")

In [16]:
# Distribution of stored items per provider: raw counts first, then the same
# breakdown expressed as fractions of the total.
provider_counts = recent_df['provider'].value_counts()
print(
    "Absolute counts:",
    provider_counts,
    "\nNormalized (proportions):",
    recent_df['provider'].value_counts(normalize=True),
    sep="\n",
)

Absolute counts:
provider
arxiv        3126
openai        520
anthropic     421
exa           288
Name: count, dtype: int64

Normalized (proportions):
provider
arxiv        0.717796
openai       0.119403
anthropic    0.096670
exa          0.066131
Name: proportion, dtype: float64
