# Pipeline for running research agents to collect candidate items

In [None]:
import sys
sys.path.append('../')

import pandas as pd
from datetime import datetime

import logging
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with timestamp, logger name and level.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

from components.agents.research_agents import run_mixed_research_agents
from components.prompts.research_agents import FOCUS_AREAS
from components.tools import ArxivClient, ARXIV_CATEGORIES, ARXIV_KEYWORDS
from data.content_manager import ContentManager
from data.dedup import deduplicate_research

# Content manager rooted at ../data, resolved relative to the notebook's working directory.
DATA_DIR = "../data"
content_manager = ContentManager(base_path=DATA_DIR)

## 1. Using OpenAI, Anthropic and EXA web agents

In [2]:
# List the focus-area names that will be handed to the research agents.
FOCUS_AREAS.keys()

dict_keys(['reasoning_and_planning', 'agents_and_finance', 'agent_infrastructure', 'retrieval_and_embeddings', 'multimodal_and_generation'])

In [3]:
# Show the topic description stored for one focus area as an example.
print(FOCUS_AREAS['reasoning_and_planning'])

Reasoning LLMs, chain-of-thought, inference-time compute, self-reflection, planning with LLMs, MCTS (Monte Carlo Tree Search) for language models, test-time scaling, hallucination reduction and detection, grounding, factuality


### Run

In [None]:
# Run the web research agents over every focus area. The result is consumed
# further down as a mapping whose values expose an `.items` list.
# NOTE(review): the log shows OpenAI, Anthropic and Exa agents running per focus
# area, presumably one task per (focus area, provider) pair; confirm in
# run_mixed_research_agents.
# Top-level `await` relies on the notebook kernel's running event loop.
research_results = await run_mixed_research_agents(
    focus_areas=FOCUS_AREAS,
    content_manager=content_manager
)

2025-12-26 23:59:28,901 - components.agents.research_agents - INFO - Running 15 tasks
Research agents (mixed):   0%|          | 0/15 [00:00<?, ?it/s]2025-12-26 23:59:28,919 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: agents_and_finance
2025-12-26 23:59:29,216 - components.agents.research_agents - INFO - Running Exa research agent for focus area: agent_infrastructure
2025-12-26 23:59:29,245 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: multimodal_and_generation
2025-12-26 23:59:29,247 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: agent_infrastructure
2025-12-26 23:59:29,248 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: reasoning_and_planning
2025-12-26 23:59:29,249 - components.agents.research_agents - INFO - Running Exa research agent for focus area: reasoning_and_planning
2025-12-26 23:59:29,250 - co

### Deduplicate based on URL

In [5]:
# Parse the research results into a flat list of records, one per ResearchItem.
# Each key of `research_results` has the form "<focus_area> --- <provider>".

# Stamp the whole batch with a single date: calling datetime.now() per item is
# loop-invariant work and could give mixed dates if the cell runs across midnight.
date_added = datetime.now().strftime("%Y-%m-%d")

# Explicit schema so that an empty result set still yields a frame with the
# expected columns (the URL-based dedup that follows needs 'url' to exist).
record_columns = [
    'focus_area', 'provider', 'url', 'title',
    'source', 'published', 'relevance', 'date_added',
]

records = []
for key, result in research_results.items():
    focus_area, llm_provider = key.split(' --- ')
    for item in result.items:
        records.append({
            'focus_area': focus_area,
            'provider': llm_provider,
            'url': item.url,
            'title': item.title,
            'source': item.source,
            'published': item.published,
            'relevance': item.relevance,
            'date_added': date_added
        })

# Create DataFrame
df = pd.DataFrame(records, columns=record_columns)

print(f"There are {len(df)} ResearchItems present in the research results")

There are 448 ResearchItems present in the research results


In [6]:
# Keep only the first row seen for each URL; shapes are printed before and after.
print(f"df.shape: {df.shape}")
is_repeat_url = df.duplicated(subset='url', keep='first')
df = df[~is_repeat_url]
print(f"df.shape: {df.shape}")

df.shape: (448, 8)
df.shape: (429, 8)


### Add to existing collection

In [7]:
# Merge the deduplicated web-agent batch into the main CSV and persist it (save=True).
result = deduplicate_research(new_df=df, main_csv="../data/research_items.csv", save=True)

# Summarise the merge using the counters returned by deduplicate_research.
added = result['new_added']
skipped = result['skipped']
total = result['total_after']
print(f"Added {added} new items")
print(f"Skipped {skipped} duplicates")
print(f"Total in DB: {total}")

2025-12-27 00:04:10,920 - data.dedup - INFO - Loaded 1356 existing items from ../data/research_items.csv
2025-12-27 00:04:10,922 - data.dedup - INFO - New items: 201, Skipped (duplicates): 228
2025-12-27 00:04:10,955 - data.dedup - INFO - Saved 1557 items to ../data/research_items.csv


Added 201 new items
Skipped 228 duplicates
Total in DB: 1557


## 2. Using direct arXiv API

In [18]:
# Client for the direct arXiv search below. ArxivClient, ARXIV_CATEGORIES and
# ARXIV_KEYWORDS are imported in the notebook's top import cell, so that all
# dependencies are declared in one place.
arxiv = ArxivClient()

In [46]:
# Upper bounds on the number of entries fetched for each of the two query types.
ARXIV_MAX_RESULTS_CATS = 3000
ARXIV_MAX_RESULTS_KWS = 150

# Query arXiv by category and by keyword in one call.
# NOTE(review): last_n_days=None presumably disables the recency filter; confirm in ArxivClient.
arxiv_results = arxiv.search_cats_and_kws_both(
    categories=ARXIV_CATEGORIES,
    keywords=ARXIV_KEYWORDS,
    max_results_cats=ARXIV_MAX_RESULTS_CATS,
    max_results_kws=ARXIV_MAX_RESULTS_KWS,
    last_n_days=None,
)

2025-12-27 13:19:27,149 - components.tools.arxiv - INFO - Found 3000 entries for submitted categories
2025-12-27 13:19:27,430 - components.tools.arxiv - INFO - Found 150 entries for submitted keywords
2025-12-27 13:19:27,432 - components.tools.arxiv - INFO - Found 3150 total entries


In [47]:
# Sanity check: number of items returned by the arXiv search.
len(arxiv_results.items)

3150

### Deduplicate based on URL

In [48]:
# Convert to DataFrame (same as web agents)

# Stamp the whole batch with a single date: calling datetime.now() per item is
# loop-invariant work and could give mixed dates if the cell runs across midnight.
date_added = datetime.now().strftime("%Y-%m-%d")

# Explicit schema so that an empty search result still yields a frame with the
# expected columns (the URL-based dedup that follows needs 'url' to exist).
record_columns = [
    'focus_area', 'provider', 'url', 'title',
    'source', 'published', 'relevance', 'date_added',
]

# arXiv items carry no focus area or LLM provider, so both are tagged 'arxiv'.
records = [
    {
        'focus_area': 'arxiv',
        'provider': 'arxiv',
        'url': item.url,
        'title': item.title,
        'source': item.source,
        'published': item.published,
        'relevance': item.relevance,
        'date_added': date_added
    }
    for item in arxiv_results.items
]

df_arxiv = pd.DataFrame(records, columns=record_columns)
print(f"There are {len(df_arxiv)} ResearchItems present in the research results")

There are 3150 ResearchItems present in the research results


In [49]:
# Keep only the first row seen for each URL in the arXiv batch.
# NOTE(review): the printed label reads "df.shape" although the frame reported
# is df_arxiv; the label is left unchanged here.
print(f"df.shape: {df_arxiv.shape}")
is_repeat_url = df_arxiv.duplicated(subset='url', keep='first')
df_arxiv = df_arxiv[~is_repeat_url]
print(f"df.shape: {df_arxiv.shape}")

df.shape: (3150, 8)
df.shape: (3068, 8)


### Add to existing collection

In [52]:
# Merge the deduplicated arXiv batch into the main CSV and persist it (save=True).
result = deduplicate_research(new_df=df_arxiv, main_csv="../data/research_items.csv", save=True)

# Summarise the merge using the counters returned by deduplicate_research.
added = result['new_added']
skipped = result['skipped']
total = result['total_after']
print(f"Added {added} new items")
print(f"Skipped {skipped} duplicates")
print(f"Total in DB: {total}")

2025-12-27 13:19:51,411 - data.dedup - INFO - Loaded 1557 existing items from ../data/research_items.csv
2025-12-27 13:19:51,418 - data.dedup - INFO - New items: 2645, Skipped (duplicates): 423
2025-12-27 13:19:51,499 - data.dedup - INFO - Saved 4202 items to ../data/research_items.csv


Added 2645 new items
Skipped 423 duplicates
Total in DB: 4202


### Check the research items CSV

In [53]:
# Reload the persisted collection from disk to inspect what the merge steps wrote.
recent_df = pd.read_csv("../data/research_items.csv")
recent_df.head()

Unnamed: 0,focus_area,provider,url,title,source,published,relevance,date_added,scout_decision,scout_confidence,scout_reasoning,scouted_at
0,dummy1,openai,https://dummy1.com,dummy1,OpenAI Blog,2025-12-01,dummy example 1,2025-12-20,discard,0.95,The title and summary are placeholders (“dummy...,2025-12-26T23:51:12.379244
1,dummy2,anthropic,https://dummy2.com,dummy2,arXiv,2025-12-15,dummy example 2,2025-12-21,discard,0.98,The title and summary are placeholders (“dummy...,2025-12-26T23:51:12.379626
2,reasoning_agent,openai,https://openai.com/index/gpt-5-2-codex,Introducing GPT-5.2-Codex,OpenAI blog,2025-12-18,Official release of an agentic coding model em...,2025-12-24,pursue,0.93,"This is an official OpenAI release (credible, ...",2025-12-26T23:51:12.380414
3,reasoning_agent,openai,https://openai.com/index/introducing-gpt-5-2/,Introducing GPT-5.2,OpenAI blog,2025-12-11,Details GPT-5.2 “Thinking/Pro” modes and API r...,2025-12-24,pursue,0.93,This is a recent (2025-12-11) primary-source r...,2025-12-26T23:51:12.380718
4,reasoning_agent,openai,https://blog.google/products/gemini/gemini-3/,Introducing Gemini 3: our most intelligent mod...,Google Blog (Gemini/DeepMind),2025-11-18,Announces Gemini 3 with “thinking”/Deep Think ...,2025-12-24,pursue,0.86,"This is a very recent, high-signal primary-sou...",2025-12-26T23:51:12.380975


In [54]:
# Per-provider breakdown of the stored collection: raw counts, then proportions.
provider_counts = recent_df['provider'].value_counts()
provider_shares = recent_df['provider'].value_counts(normalize=True)

print("Absolute counts:")
print(provider_counts)
print("\nNormalized (proportions):")
print(provider_shares)

Absolute counts:
provider
arxiv        3126
openai        447
anthropic     377
exa           252
Name: count, dtype: int64

Normalized (proportions):
provider
arxiv        0.743931
openai       0.106378
anthropic    0.089719
exa          0.059971
Name: proportion, dtype: float64
