# Pipeline for running research agents and collecting candidate items

In [None]:
import sys
sys.path.append('../')

import pandas as pd
from datetime import datetime

import logging
# Configure the root logger once for the whole notebook: INFO and above,
# each line carrying timestamp, logger name, severity and message.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)

# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

from components.agents.research_agents import run_mixed_research_agents
from components.prompts.research_agents import FOCUS_AREAS
from data.content_saver import ContentSaver
from data.dedup import deduplicate_research

# Saver rooted at ../data (relative to this notebook); it is handed to the
# research agents in the "Run" cell below.
content_saver = ContentSaver(base_path="../data")

## 1. Using OpenAI, Anthropic and Exa web agents

In [2]:
# Show which focus areas are configured (the keys of FOCUS_AREAS).
FOCUS_AREAS.keys()

dict_keys(['reasoning_and_planning', 'agents_and_finance', 'agent_infrastructure', 'retrieval_and_embeddings', 'multimodal_and_generation'])

In [3]:
# Print the topic description stored for one focus area as an example.
print(FOCUS_AREAS['reasoning_and_planning'])

Reasoning LLMs, chain-of-thought, inference-time compute, self-reflection, planning with LLMs, MCTS (Monte Carlo Tree Search) for language models, test-time scaling, hallucination reduction and detection, grounding, factuality


### Run

In [4]:
# Run the research agents over every configured focus area.
# Top-level `await` is used, so this cell must run in a kernel that supports
# awaiting at cell level.
# The result is iterated later as a mapping whose keys have the form
# "<focus_area> --- <provider>" and whose values expose `.items`.
research_results = await run_mixed_research_agents(
    focus_areas=FOCUS_AREAS,
    content_saver=content_saver
)

2025-12-25 18:50:35,479 - components.agents.research_agents - INFO - Running 15 tasks
Research agents (mixed):   0%|          | 0/15 [00:00<?, ?it/s]2025-12-25 18:50:35,501 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: reasoning_and_planning
2025-12-25 18:50:35,508 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: retrieval_and_embeddings
2025-12-25 18:50:35,786 - components.agents.research_agents - INFO - Running Exa research agent for focus area: multimodal_and_generation
2025-12-25 18:50:35,816 - components.agents.research_agents - INFO - Running Exa research agent for focus area: agent_infrastructure
2025-12-25 18:50:35,816 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: agents_and_finance
2025-12-25 18:50:35,819 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: retrieval_and_embeddings
2025-12-25 18:50:35,82

### Deduplicate based on URL

In [5]:
# Flatten the research results into one record per research item.
# Each key of `research_results` is split on ' --- ' into
# (focus_area, provider); each value exposes an `.items` iterable.
RECORD_COLUMNS = [
    'focus_area', 'provider', 'url', 'title',
    'source', 'published', 'relevance', 'date_added',
]

# Read the clock once: every row of this run gets the same date even if the
# loop straddles midnight, and the timestamp is not recomputed per item.
date_added = datetime.now().strftime("%Y-%m-%d")

records = []
for key, result in research_results.items():
    focus_area, llm_provider = key.split(' --- ')
    records.extend(
        {
            'focus_area': focus_area,
            'provider': llm_provider,
            'url': item.url,
            'title': item.title,
            'source': item.source,
            'published': item.published,
            'relevance': item.relevance,
            'date_added': date_added,
        }
        for item in result.items
    )

# Passing `columns` keeps the 8-column schema even when no items came back,
# so code that selects columns by name (e.g. 'url') still sees them.
df = pd.DataFrame(records, columns=RECORD_COLUMNS)

print(f"There are {len(df)} ResearchItems present in the research results")

There are 443 ResearchItems present in the research results


In [6]:
# Keep only the first row seen for each URL; the shape is printed before and
# after so the number of dropped rows is visible in the output.
print("df.shape:", df.shape)
df = df.drop_duplicates(subset=['url'], keep='first')
print("df.shape:", df.shape)

df.shape: (443, 8)
df.shape: (426, 8)


### Add to existing collection

In [7]:
# Merge this run's web-agent items into the CSV collection; the returned
# mapping carries the counters reported below.
result = deduplicate_research(new_df=df, main_csv="../data/research_items.csv", save=True)

# One summary line per counter.
print(
    f"Added {result['new_added']} new items",
    f"Skipped {result['skipped']} duplicates",
    f"Total in DB: {result['total_after']}",
    sep="\n",
)

2025-12-25 18:59:56,301 - data.dedup - INFO - Loaded 584 existing items from ../data/research_items.csv
2025-12-25 18:59:56,304 - data.dedup - INFO - New items: 327, Skipped (duplicates): 99
2025-12-25 18:59:56,321 - data.dedup - INFO - Saved 911 items to ../data/research_items.csv
2025-12-25 18:59:56,324 - data.dedup - INFO - Loaded 59 existing skipped items from ../data/skipped_items.csv
2025-12-25 18:59:56,326 - data.dedup - INFO - Logged 84 new skipped items to ../data/skipped_items.csv


Added 327 new items
Skipped 99 duplicates
Total in DB: 911


## 2. Using direct arXiv API

In [8]:
# Client plus the category/keyword lists used for the direct arXiv search.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh Restart & Run All surfaces missing
# dependencies early.
from components.tools import ArxivClient, ARXIV_CATEGORIES, ARXIV_KEYWORDS
arxiv = ArxivClient()

In [12]:
# Query arXiv by categories and by keywords in a single call; the result
# exposes the combined entries via `.items`.
# NOTE(review): the saved log reports 150 category entries although
# max_results_cats=200 — confirm whether the client caps results or this cell
# was edited after its last run.
# last_n_days=None presumably disables the recency filter — TODO confirm
# against ArxivClient.
arxiv_results = arxiv.search_cats_and_kws_both(
    categories=ARXIV_CATEGORIES,
    keywords=ARXIV_KEYWORDS,
    max_results_cats=200,
    max_results_kws=150,
    last_n_days=None
)

2025-12-25 19:02:58,286 - components.tools.arxiv - INFO - Found 150 entries for submitted categories
2025-12-25 19:03:19,927 - components.tools.arxiv - INFO - Found 150 entries for submitted keywords
2025-12-25 19:03:19,930 - components.tools.arxiv - INFO - Found 300 total entries


In [13]:
# Number of entries returned by the arXiv search (before URL deduplication).
len(arxiv_results.items)

300

### Deduplicate based on URL

In [14]:
# Convert the arXiv results to a DataFrame with the same columns as the
# web-agent records; focus_area and provider are both fixed to 'arxiv'.
RECORD_COLUMNS = [
    'focus_area', 'provider', 'url', 'title',
    'source', 'published', 'relevance', 'date_added',
]

# Read the clock once: every row of this run gets the same date even if the
# conversion straddles midnight, and the timestamp is not recomputed per item.
date_added = datetime.now().strftime("%Y-%m-%d")

records = [
    {
        'focus_area': 'arxiv',
        'provider': 'arxiv',
        'url': item.url,
        'title': item.title,
        'source': item.source,
        'published': item.published,
        'relevance': item.relevance,
        'date_added': date_added,
    }
    for item in arxiv_results.items
]

# Passing `columns` keeps the 8-column schema even when the search returned
# nothing, so code that selects columns by name (e.g. 'url') still sees them.
df_arxiv = pd.DataFrame(records, columns=RECORD_COLUMNS)
print(f"There are {len(df_arxiv)} ResearchItems present in the research results")

There are 300 ResearchItems present in the research results


In [15]:
# Deduplicate based on URL (keep first occurrence).
# The printed label names df_arxiv, the frame actually being reported, so the
# output cannot be mistaken for the separate web-agent frame `df`.
print(f"df_arxiv.shape: {df_arxiv.shape}")
df_arxiv = df_arxiv.drop_duplicates(subset='url', keep='first')
print(f"df_arxiv.shape: {df_arxiv.shape}")

df.shape: (300, 8)
df.shape: (238, 8)


### Add to existing collection

In [16]:
# Merge the arXiv items into the same CSV collection used for the web-agent
# results; the returned mapping carries the counters reported below.
result = deduplicate_research(new_df=df_arxiv, main_csv="../data/research_items.csv", save=True)

# One summary line per counter.
print(
    f"Added {result['new_added']} new items",
    f"Skipped {result['skipped']} duplicates",
    f"Total in DB: {result['total_after']}",
    sep="\n",
)

2025-12-25 19:03:38,996 - data.dedup - INFO - Loaded 911 existing items from ../data/research_items.csv
2025-12-25 19:03:38,998 - data.dedup - INFO - New items: 214, Skipped (duplicates): 24
2025-12-25 19:03:39,023 - data.dedup - INFO - Saved 1125 items to ../data/research_items.csv
2025-12-25 19:03:39,026 - data.dedup - INFO - Loaded 143 existing skipped items from ../data/skipped_items.csv
2025-12-25 19:03:39,029 - data.dedup - INFO - Logged 24 new skipped items to ../data/skipped_items.csv


Added 214 new items
Skipped 24 duplicates
Total in DB: 1125


### Check research items CSV

In [17]:
# Reload the collection from disk to inspect what was actually persisted.
# NOTE(review): the saved preview shows a leading "Unnamed: 0" header — if the
# CSV was written with its index, pass index_col=0 here; confirm against the
# file before changing.
recent_df = pd.read_csv("../data/research_items.csv")
recent_df.head()

Unnamed: 0,focus_area,provider,url,title,source,published,relevance,date_added
0,dummy1,openai,https://dummy1.com,dummy1,OpenAI Blog,2025-12-01,dummy example 1,2025-12-20
1,dummy2,anthropic,https://dummy2.com,dummy2,arXiv,2025-12-15,dummy example 2,2025-12-21
2,reasoning_agent,openai,https://openai.com/index/gpt-5-2-codex,Introducing GPT-5.2-Codex,OpenAI blog,2025-12-18,Official release of an agentic coding model em...,2025-12-24
3,reasoning_agent,openai,https://openai.com/index/introducing-gpt-5-2/,Introducing GPT-5.2,OpenAI blog,2025-12-11,Details GPT-5.2 “Thinking/Pro” modes and API r...,2025-12-24
4,reasoning_agent,openai,https://blog.google/products/gemini/gemini-3/,Introducing Gemini 3: our most intelligent mod...,Google Blog (Gemini/DeepMind),2025-11-18,Announces Gemini 3 with “thinking”/Deep Think ...,2025-12-24


In [18]:
# Break the collection down by provider: raw row counts first, then each
# provider's share of all rows.
provider_counts = recent_df['provider'].value_counts()
print("Absolute counts:", provider_counts, sep="\n")
print("\nNormalized (proportions):", recent_df['provider'].value_counts(normalize=True), sep="\n")

Absolute counts:
provider
arxiv        462
openai       272
anthropic    254
exa          137
Name: count, dtype: int64

Normalized (proportions):
provider
arxiv        0.410667
openai       0.241778
anthropic    0.225778
exa          0.121778
Name: proportion, dtype: float64
