### Pipeline for running research agents to collect candidate items

In [1]:
import sys
sys.path.append('../')

import pandas as pd
from datetime import datetime

import logging
# Configure the root logger to emit INFO-and-above records, each line carrying
# the timestamp, logger name and level, then grab a logger for this notebook.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

from components.agents.research_agents import run_mixed_research_agents
from components.prompts.research_agents import FOCUS_AREAS
from data.dedup import deduplicate_research

## 1. Using OpenAI and Anthropic web agents

In [2]:
# List the available focus-area names (the keys of the FOCUS_AREAS mapping).
FOCUS_AREAS.keys()

dict_keys(['reasoning_and_planning', 'agents_and_finance', 'agent_infrastructure', 'retrieval_and_embeddings', 'multimodal_and_generation'])

In [3]:
# Inspect the topic description stored for one focus area.
print(FOCUS_AREAS['reasoning_and_planning'])

Reasoning LLMs, chain-of-thought, inference-time compute, self-reflection, planning with LLMs, MCTS (Monte Carlo Tree Search) for language models, test-time scaling, hallucination reduction and detection, grounding, factuality


### Run the research agents

In [4]:
research_results = await run_mixed_research_agents(
    focus_areas=FOCUS_AREAS
)

2025-12-24 16:54:37,164 - components.agents.research_agents - INFO - Running 10 tasks
Research agents (mixed):   0%|          | 0/10 [00:00<?, ?it/s]2025-12-24 16:54:37,183 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: multimodal_and_generation
2025-12-24 16:54:37,539 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: retrieval_and_embeddings
2025-12-24 16:54:37,541 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: multimodal_and_generation
2025-12-24 16:54:37,542 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: reasoning_and_planning
2025-12-24 16:54:37,543 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: agent_infrastructure
2025-12-24 16:54:37,544 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: agents_and_finance
2025-12-2

### Deduplicate based on URL

In [8]:
# Flatten the research results into one record per ResearchItem.
# Each key of `research_results` is split on ' --- ' into a focus area and an
# LLM provider; each value exposes an `.items` iterable of research items.

# Columns of the flat table. Passing them explicitly to the DataFrame
# constructor guarantees they exist (notably 'url', which the deduplication
# step relies on) even when the agents returned no items at all.
RECORD_COLUMNS = [
    'focus_area',
    'provider',
    'url',
    'title',
    'source',
    'published',
    'relevance',
    'date_added',
]

# Stamp every row of this run with the same date. Computing it once avoids
# re-formatting the timestamp for each item and keeps the column consistent
# even if the cell happens to run across midnight.
date_added = datetime.now().strftime("%Y-%m-%d")

records = []
# `agent_result` (rather than `result`) so the name is not reused for the
# unrelated summary dict returned by `deduplicate_research` further down.
for key, agent_result in research_results.items():
    focus_area, llm_provider = key.split(' --- ')
    for item in agent_result.items:
        records.append({
            'focus_area': focus_area,
            'provider': llm_provider,
            'url': item.url,
            'title': item.title,
            'source': item.source,
            'published': item.published,
            'relevance': item.relevance,
            'date_added': date_added,
        })

# Create DataFrame (column order matches the record keys above)
df = pd.DataFrame(records, columns=RECORD_COLUMNS)

print(f"There are {len(df)} ResearchItems present in the research results")

There are 296 ResearchItems present in the research results


In [9]:
# Collapse rows that share a URL, keeping whichever one appeared first;
# the shape is printed before and after so the number of dropped rows is visible.
print(f"df.shape: {df.shape}")
is_repeat_url = df.duplicated(subset='url', keep='first')
df = df[~is_repeat_url]
print(f"df.shape: {df.shape}")

df.shape: (296, 8)
df.shape: (289, 8)


### Add to existing collection

In [10]:
# Merge this run's items into the CSV-backed collection (save=True is passed,
# and the captured log shows the file being rewritten), then report the counts
# returned by the helper.
result = deduplicate_research(new_df=df, main_csv="../data/research_items.csv", save=True)

print(
    f"Added {result['new_added']} new items",
    f"Skipped {result['skipped']} duplicates",
    f"Total in DB: {result['total_after']}",
    sep="\n",
)

2025-12-24 17:30:42,308 - data.dedup - INFO - Loaded 59 existing items from ../data/research_items.csv


2025-12-24 17:30:42,312 - data.dedup - INFO - New items: 277, Skipped (duplicates): 12
2025-12-24 17:30:42,323 - data.dedup - INFO - Saved 336 items to ../data/research_items.csv
2025-12-24 17:30:42,328 - data.dedup - INFO - Loaded 59 existing skipped items from ../data/skipped_items.csv
2025-12-24 17:30:42,330 - data.dedup - INFO - No new skipped items to log (all already in ../data/skipped_items.csv)


Added 277 new items
Skipped 12 duplicates
Total in DB: 336


## 2. Using the arXiv API directly