### Pipeline: run research agents to collect candidate items

In [1]:
# Add the parent directory to the import path. Presumably this is what lets the
# `components` and `data` packages imported below resolve; the path is relative
# to the kernel's working directory.
import sys
sys.path.append('../')

import pandas as pd
from datetime import datetime

# Configure logging at INFO level; each message is prefixed with timestamp,
# logger name and level.
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Logger named after the current module (`__name__`).
logger = logging.getLogger(__name__)

# Project-local imports: the research agent runner, the FOCUS_AREAS mapping,
# and the helper that merges new items into the stored collection.
from components.agents.research_agents import run_mixed_research_agents
from components.prompts.research_agents import FOCUS_AREAS
from data.dedup import deduplicate_research

In [2]:
# List the focus-area names defined in FOCUS_AREAS.
FOCUS_AREAS.keys()

dict_keys(['reasoning_and_planning', 'agents_and_finance', 'agent_infrastructure', 'retrieval_and_embeddings', 'multimodal_and_generation'])

In [3]:
# Show the description text stored for the 'reasoning_and_planning' focus area.
print(FOCUS_AREAS['reasoning_and_planning'])

Reasoning LLMs, chain-of-thought, inference-time compute, self-reflection, planning with LLMs, MCTS (Monte Carlo Tree Search) for language models, test-time scaling, hallucination reduction and detection, grounding, factuality


### Test run on a single custom focus-area dict

In [4]:
research_results = await run_mixed_research_agents(
    {"reasoning_agent": "reasoning agents, chain of thought planning with LLMs, test time compute"}
)

2025-12-24 16:10:34,006 - components.agents.research_agents - INFO - Running 2 tasks
Research agents (mixed):   0%|          | 0/2 [00:00<?, ?it/s]2025-12-24 16:10:34,020 - components.agents.research_agents - INFO - Running OpenAI research agent for focus area: reasoning_agent
2025-12-24 16:10:34,256 - components.agents.research_agents - INFO - Running Anthropic research agent for focus area: reasoning_agent
2025-12-24 16:12:17,343 - httpx - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages?beta=true "HTTP/1.1 200 OK"
Research agents (mixed):  50%|█████     | 1/2 [01:43<01:43, 103.81s/it]2025-12-24 16:16:17,176 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
Research agents (mixed): 100%|██████████| 2/2 [05:43<00:00, 171.67s/it]


### Deduplicate based on URL

In [17]:
# Flatten the research results into one record per ResearchItem.
#
# Each key of `research_results` is expected to look like
# "<focus_area> --- <provider>" (that is what the unpacking below relies on);
# each value exposes an `.items` iterable of objects with url/title/source/
# published/relevance attributes.

# Column order of the resulting frame. Passing it explicitly keeps the schema
# (including `url`) intact even when the agents returned no items at all.
record_columns = [
    'focus_area', 'provider', 'url', 'title',
    'source', 'published', 'relevance', 'date_added',
]

# Compute the date stamp once so every row from this run carries the same
# value (the original re-evaluated datetime.now() for each item).
date_added = datetime.now().strftime("%Y-%m-%d")

records = []
# The loop variable is deliberately not called `result`: that name is used by
# the cell that merges into the existing collection and would be clobbered here.
for key, agent_result in research_results.items():
    # Split on the LAST separator only, so a focus-area name that itself
    # contains ' --- ' does not break the two-way unpacking.
    focus_area, llm_provider = key.rsplit(' --- ', 1)
    for item in agent_result.items:
        records.append({
            'focus_area': focus_area,
            'provider': llm_provider,
            'url': item.url,
            'title': item.title,
            'source': item.source,
            'published': item.published,
            'relevance': item.relevance,
            'date_added': date_added,
        })

# Create DataFrame
df = pd.DataFrame(records, columns=record_columns)

print(f"There are {len(df)} ResearchItems present in the research results")

There are 58 ResearchItems present in the research results


In [19]:
# Remove rows whose URL already appeared earlier in the frame; the first
# occurrence of each URL is the one that survives.
print(f"df.shape: {df.shape}")
is_repeat_url = df.duplicated(subset='url', keep='first')
df = df[~is_repeat_url]
print(f"df.shape: {df.shape}")

df.shape: (58, 8)
df.shape: (57, 8)


### Add to existing collection

In [10]:
# Merge this run's items into the stored collection; save=True is passed, and
# the recorded log shows the CSV being written back.
result = deduplicate_research(new_df=df, main_csv="../data/research_items.csv", save=True)

# Report the outcome of the merge, one line per statistic.
print(
    f"Added {result['new_added']} new items",
    f"Skipped {result['skipped']} duplicates",
    f"Total in DB: {result['total_after']}",
    sep="\n",
)

2025-12-24 16:46:14,389 - data.dedup - INFO - Loaded 59 existing items from ../data/research_items.csv
2025-12-24 16:46:14,391 - data.dedup - INFO - New items: 0, Skipped (duplicates): 59
2025-12-24 16:46:14,393 - data.dedup - INFO - Saved 59 items to ../data/research_items.csv
2025-12-24 16:46:14,395 - data.dedup - INFO - Logged 95 skipped items to ../data/skipped_items.csv


Added 0 new items
Skipped 59 duplicates
Total in DB: 59
