# Deep Research System

This notebook implements a deep research system using LangGraph. Given a query, it performs comprehensive research in three stages:
1. Search Planner: uses an LLM to generate multiple search queries
2. Search Agent: runs each query as a web search and summarizes the results
3. Final Report: generates a comprehensive report from the summaries

### Imports and Setup

Import required libraries for the deep research agent, including LangChain for tools and embeddings, LangGraph for stateful graphs, and Pydantic for structured outputs.

In [1]:
import json
import operator
import requests
import os
from typing import Annotated, TypedDict, List, Dict
from langchain_community.vectorstores import FAISS
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
from langchain_core.tools import tool
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from pydantic import BaseModel, Field
from datetime import datetime

### Agent State

Defines DeepResearchState as a typed dictionary to persist state across the LangGraph workflow, including messages, search queries, search results, and final report.

In [2]:
# --- 1. Define Agent State ---
class DeepResearchState(TypedDict):
    """State dictionary passed between the nodes of the deep research graph."""
    # Conversation messages; annotated with operator.add as the reducer for
    # this key (presumably so node updates are concatenated — confirm against
    # the LangGraph version in use).
    messages: Annotated[list[BaseMessage], operator.add]
    # The user's research question, set once in the initial state.
    original_query: str
    # Search queries written by the search_planner node.
    search_queries: List[str]
    # Per-query text written by the search_agent node.
    search_results: Dict[str, str]  # query -> summary
    # Report text written by the generate_final_report node.
    final_report: str

### Tool Definitions

Defines tools for web search (using Serper API) to enable the agent to call them dynamically in the graph.

In [4]:
# Argument schema for web_search_tool: a single free-text search query.
# (Documented with a comment rather than a class docstring so the schema
# generated from this model is left unchanged.)
class SearchQuery(BaseModel):
    query: str = Field(description="Search query for web search")


@tool(args_schema=SearchQuery)
def web_search_tool(query: str):
    """Get real-time Internet information using Serper API"""
    # NOTE: the docstring above is intentionally left unchanged — the @tool
    # decorator presumably uses it as the tool description shown to the LLM.
    #
    # Args:
    #     query: Free-text search query sent to Serper.
    # Returns:
    #     A JSON string. On success, a list of up to 3 objects with the keys
    #     "title", "snippet" and "link"; on any failure, an object of the
    #     form {"error": "<message>"}. This function does not raise.

    # The key must come from the environment; credentials are never embedded
    # in the notebook. The placeholder value is treated the same as "unset".
    api_key = os.environ.get('SERPER_API_KEY')
    if not api_key or api_key == 'YOUR_SERPER_API_KEY':
        return json.dumps(
            {"error": "SERPER_API_KEY environment variable is not set"},
            ensure_ascii=False,
        )

    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": 5,  # Get 5 results
    })
    headers = {
        'X-API-KEY': api_key,
        'Content-Type': 'application/json'
    }

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()

        if 'organic' in data:
            results = []
            for item in data['organic'][:3]:  # Take top 3 results
                result = {
                    'title': item.get('title', ''),
                    'snippet': item.get('snippet', ''),
                    'link': item.get('link', '')
                }
                results.append(result)
            return json.dumps(results, ensure_ascii=False)
        else:
            return json.dumps({"error": "No results found"}, ensure_ascii=False)
    # JSON decoding errors are handled first: in recent `requests` releases the
    # error raised by response.json() is also a RequestException, so the
    # network-error handler would otherwise shadow this one.
    except json.JSONDecodeError as e:
        return json.dumps({"error": f"Failed to parse response: {str(e)}"}, ensure_ascii=False)
    except requests.exceptions.RequestException as e:
        return json.dumps({"error": f"Network error occurred: {str(e)}"}, ensure_ascii=False)
    except Exception as e:
        # Best-effort tool: report anything unexpected as an error payload
        # instead of propagating into the graph.
        return json.dumps({"error": f"An unexpected error occurred: {str(e)}"}, ensure_ascii=False)

### LLM Configuration

Sets up ChatOpenAI with a local Qwen model, binding tools for structured calls.

In [5]:
# Chat model client pointed at an OpenAI-compatible endpoint on localhost
# (port 11434 — presumably a local Ollama server; confirm for your setup).
llm = ChatOpenAI(
    model_name="qwen3:8b",
    base_url='http://localhost:11434/v1',
    openai_api_key="<KEY>",  # Replace with valid key
    temperature=0.3
)

# Variant of the model with web_search_tool bound for tool calling.
# NOTE(review): the node functions visible in this notebook call `llm` and
# `web_search_tool` directly, so this binding appears unused there — confirm.
llm_with_tools = llm.bind_tools([web_search_tool])

### Node Functions

Defines node functions for the workflow, including search planning, web search execution, result summarization, and final report generation.

In [6]:
# Search Planner: Generate multiple search queries from the original query
def search_planner(state: DeepResearchState):
    """
    Generates multiple search queries based on the original query using the LLM.

    Args:
        state: Graph state; only ``state["original_query"]`` is read.

    Returns:
        A partial state update ``{"search_queries": [...]}`` holding between
        1 and 5 unique, non-empty query strings. If nothing usable can be
        extracted from the model output, the original query itself is used so
        the downstream search step still has something to search for.
    """
    import re  # function-local, so this cell does not depend on other cells importing it

    print("---GENERATING SEARCH QUERIES---")
    original_query = state["original_query"]

    planner_prompt = f"""You are a research query planner. Generate 3-5 diverse search queries that would help research the topic: "{original_query}".
    Make the queries specific and focused on different aspects of the topic.
    Return ONLY a JSON array of strings, nothing else.
    
    Example format:
    ["query 1", "query 2", "query 3"]
    """

    response = llm.invoke([HumanMessage(content=planner_prompt)])

    # Message content is not guaranteed to be a plain string; coerce it so the
    # text-based parsing below cannot fail on the type.
    content = response.content
    raw_text = content if isinstance(content, str) else str(content)
    # Drop any <think>...</think> block a reasoning model may emit before the
    # answer; this is a no-op when no such block is present.
    raw_text = re.sub(r"<think>.*?</think>", "", raw_text, flags=re.DOTALL).strip()

    # Preferred path: parse the outermost [...] span as JSON. Slicing to the
    # brackets tolerates code fences or prose wrapped around the array.
    candidates = None
    start, end = raw_text.find("["), raw_text.rfind("]")
    if start != -1 and end > start:
        try:
            candidates = json.loads(raw_text[start:end + 1])
        except json.JSONDecodeError:
            candidates = None

    if not isinstance(candidates, list):
        # Fallback 1: every double-quoted fragment in the text.
        candidates = re.findall(r'"([^"]+)"', raw_text)
        if not candidates:
            # Fallback 2: one query per line, minus bullet/number prefixes.
            candidates = [
                re.sub(r"^\s*(?:[-*]|\d+[.)])\s*", "", line).strip().strip('"')
                for line in raw_text.splitlines()
            ]

    # Keep only non-empty strings, de-duplicated in order (results are later
    # keyed by query, so duplicates would only repeat work).
    search_queries = []
    for candidate in candidates:
        if not isinstance(candidate, str):
            continue
        cleaned = candidate.strip()
        if cleaned and cleaned not in search_queries:
            search_queries.append(cleaned)
    search_queries = search_queries[:5]  # Limit to 5 queries on every path

    if not search_queries:
        search_queries = [original_query]

    print(f"Generated queries: {search_queries}")
    return {"search_queries": search_queries}

# Search Agent: Execute search for each query and summarize results
def search_agent(state: DeepResearchState):
    """
    Runs a web search for every planned query and summarizes each result set.

    Reads ``state["search_queries"]`` and returns ``{"search_results": ...}``,
    a dict mapping each query to an LLM-written summary, a "no information"
    notice, or an error message. A failure on one query is recorded and the
    loop moves on to the next one.
    """
    print("---EXECUTING WEB SEARCHES---")
    planned_queries = state["search_queries"]
    summaries = {}

    for query in planned_queries:
        print(f"Searching for: {query}")
        try:
            # The tool returns a JSON string: a list of hits or an error object.
            raw_payload = web_search_tool.invoke({"query": query})
            hits = json.loads(raw_payload)

            # Error object reported by the tool itself.
            if isinstance(hits, dict) and "error" in hits:
                summaries[query] = f"Error in search: {hits['error']}"
                continue

            # Anything other than a non-empty list has nothing to summarize.
            if not (isinstance(hits, list) and len(hits) > 0):
                summaries[query] = "No relevant information found for this query."
                continue

            # Render each hit as a small text block for the summarizer.
            hit_blocks = []
            for item in hits:
                hit_blocks.append(
                    f"Title: {item.get('title', 'N/A')}\nSnippet: {item.get('snippet', 'N/A')}\nLink: {item.get('link', 'N/A')}\n"
                )
            formatted_results = "\n".join(hit_blocks)

            summarization_prompt = f"""You are a research summarizer. Summarize the following search results for the query "{query}". 
                Provide a comprehensive summary that captures the key points and main ideas from all results.
                
                Search Results:
                {formatted_results}
                
                Summary:"""

            reply = llm.invoke([HumanMessage(content=summarization_prompt)])
            summaries[query] = reply.content
        except Exception as e:
            print(f"Error searching for '{query}': {str(e)}")
            summaries[query] = f"Error occurred while searching: {str(e)}"

    return {"search_results": summaries}

# Final Report Generator: Create comprehensive report from summaries
def generate_final_report(state: DeepResearchState):
    """
    Builds the final markdown report from the per-query summaries.

    Reads ``state["original_query"]`` and ``state["search_results"]`` and
    returns ``{"final_report": <text>}``. With no search results, a fixed
    "no information" report is returned and the LLM is not called.
    """
    print("---GENERATING FINAL REPORT---")
    topic = state["original_query"]
    findings = state["search_results"]

    if findings:
        # One "## query" section per summary, separated by blank lines.
        sections = []
        for query, summary in findings.items():
            sections.append(f"## {query}\n\n{summary}")
        findings_text = "\n\n".join(sections)

        report_prompt = f"""You are a research report writer. Create a comprehensive report on the topic: "{topic}".
    
    Use the following research findings to create your report:
    
{findings_text}
    
    Please structure your report with:
    1. A title with the original query
    2. An introduction explaining the importance of the topic
    3. Detailed sections for each research query with key findings (use the query as a subheading)
    4. A conclusion summarizing the overall findings and their significance
    5. Proper formatting with markdown headers
    6. Keep the report professional and well-organized
    
    Report:"""

        reply = llm.invoke([HumanMessage(content=report_prompt)])
        print("Final report generated successfully.")
        return {"final_report": reply.content}

    # Nothing was collected: return the placeholder report.
    return {
        "final_report": f"# Research Report: {topic}\n\nNo relevant information could be found for this topic."
    }

### Graph Construction

Builds the LangGraph workflow by adding nodes, setting entry points, and defining edges for the deep research process.

In [7]:
workflow = StateGraph(DeepResearchState)

# Register the three nodes, in pipeline order.
for node_name, node_fn in (
    ("search_planner", search_planner),
    ("search_agent", search_agent),
    ("generate_final_report", generate_final_report),
):
    workflow.add_node(node_name, node_fn)

# The planner is the first node to run.
workflow.set_entry_point("search_planner")

# Linear pipeline: planner -> agent -> report -> END.
for source_node, target_node in (
    ("search_planner", "search_agent"),
    ("search_agent", "generate_final_report"),
    ("generate_final_report", END),
):
    workflow.add_edge(source_node, target_node)

# Compile the graph; report a failure and re-raise so the cell still fails.
try:
    graph = workflow.compile()
    print("Graph compiled successfully!")
except Exception as e:
    print(f"Error compiling graph: {e}")
    raise

Graph compiled successfully!


### Test the System

Test the graph with a sample query to verify the deep research workflow.

In [11]:
# Test query
query = "What are the latest advancements in quantum computing?"
initial_state = {
    "messages": [HumanMessage(content=query)],
    "original_query": query,
    "search_queries": [],
    "search_results": {},
    "final_report": ""
}

# Set the Serper API key (replace with your actual key)
# os.environ['SERPER_API_KEY'] = 'your_actual_serper_api_key_here'

print(f"Starting deep research on: {query}")
print("=" * 50)

try:
    # Stream the graph exactly once and remember the last event. Streaming a
    # second time just to read the final state would repeat every LLM call
    # and every web search.
    final_state = None
    for event in graph.stream(initial_state, {"recursion_limit": 25}):
        final_state = event
        for key, value in event.items():
            print(f"--- Output from node '{key}' ---")
            if key == "search_planner":
                print(f"Generated queries: {value.get('search_queries', [])}")
            elif key == "search_agent":
                print(f"Number of search results: {len(value.get('search_results', {}))}")
            elif key == "generate_final_report":
                print("Final report generated")
            print("\n---\n")

    # Display the final report (the last streamed event is the report node's output)
    print("===== FINAL REPORT =====")
    print(final_state['generate_final_report']['final_report'])
except Exception as e:
    print(f"Error during deep research: {e}")
    import traceback
    traceback.print_exc()

Starting deep research on: What are the latest advancements in quantum computing?
---GENERATING SEARCH QUERIES---
Generated queries: ['recent breakthroughs in qubit stability and error correction in quantum computing', 'novel quantum algorithms for optimization and machine learning in 2023', 'latest quantum computing software frameworks and development tools', 'recent studies on quantum supremacy achievements and real-world applications in 2023', 'advances in quantum error correction techniques and scalability challenges in 2023']
--- Output from node 'search_planner' ---
Generated queries: ['recent breakthroughs in qubit stability and error correction in quantum computing', 'novel quantum algorithms for optimization and machine learning in 2023', 'latest quantum computing software frameworks and development tools', 'recent studies on quantum supremacy achievements and real-world applications in 2023', 'advances in quantum error correction techniques and scalability challenges in 2023'