In [7]:
pip install -r requirements.txt



In [8]:
from dotenv import load_dotenv, find_dotenv
!ls -a

import os

# Load the .env file into environment variables
load_dotenv()

# Optional sanity check (prints only variable names, not values)
print("Loaded keys:", [k for k in os.environ.keys() if "API" in k])

.  ..  arxiv_dataset.json  .config  .env  files  requirements.txt  sample_data
Loaded keys: ['OPENAI_API_KEY', 'GOOGLE_API_KEY', 'PINECONE_API_KEY', 'TAVILY_API_KEY', 'LANGCHAIN_API_KEY', 'SERPAPI_KEY', 'LANGSMITH_API_KEY']


In [9]:
import requests
import pandas as pd
import json
import xml.etree.ElementTree as ET

# Namespace for ArXiv's Atom-based XML format.
ARXIV_NAMESPACE = '{http://www.w3.org/2005/Atom}'

def extract_from_arxiv(search_query='q-bio.GN', max_results=100, json_file_path='/content/arxiv_dataset.json'):
    """
    Fetches papers from the ArXiv API based on a search query, saves them as JSON,
    and returns a pandas DataFrame.

    Args:
        search_query (str): The search query for ArXiv (default is 'cat:cs.AI').
        max_results (int): The maximum number of results to retrieve (default is 100).
        json_file_path (str): File path where JSON data will be saved.

    Returns:
        pd.DataFrame: DataFrame containing the extracted paper information.
    """

    # Construct the URL for the API request.
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&max_results={max_results}'

    # Send a GET request to the ArXiv API.
    response = requests.get(url)

    # Parse the XML response.
    root = ET.fromstring(response.content)

    papers = []

    # Loop through each "entry" in the XML, representing a single paper.
    for entry in root.findall(f'{ARXIV_NAMESPACE}entry'):
        title = entry.find(f'{ARXIV_NAMESPACE}title').text.strip()
        summary = entry.find(f'{ARXIV_NAMESPACE}summary').text.strip()

        # Get the authors of the paper.
        author_elements = entry.findall(f'{ARXIV_NAMESPACE}author')
        authors = [author.find(f'{ARXIV_NAMESPACE}name').text for author in author_elements]

        # Get the paper's URL.
        paper_url = entry.find(f'{ARXIV_NAMESPACE}id').text
        arxiv_id = paper_url.split('/')[-1]

        # Check for the PDF link.
        pdf_link = next((link.attrib['href'] for link in entry.findall(f'{ARXIV_NAMESPACE}link')
                         if link.attrib.get('title') == 'pdf'), None)

        papers.append({
            'title': title,
            'summary': summary,
            'authors': authors,
            'arxiv_id': arxiv_id,
            'url': paper_url,
            'pdf_link': pdf_link
        })

    # Convert list into a pandas DataFrame.
    df = pd.DataFrame(papers)

    # Save the DataFrame to a JSON file.
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)
        print(f'Data saved to {json_file_path} ...')

    return df

In [10]:
df = extract_from_arxiv(max_results=20)
df

Data saved to /content/arxiv_dataset.json ...


Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link
0,Keynotes on membrane proteomics,This review article deals with the specificiti...,[Thierry Rabilloud],0807.1039v1,http://arxiv.org/abs/0807.1039v1,http://arxiv.org/pdf/0807.1039v1
1,Introns Restructure tRNA Genes of Archaea,This paper has been withdrawn by the author.,"[Zhumur Ghosh, Smarajit Das, Jayprokas Chakrab...",0507008v2,http://arxiv.org/abs/q-bio/0507008v2,http://arxiv.org/pdf/q-bio/0507008v2
2,Using Periodicity of Nucleotide Sequences,Withdrawn by arXiv administrators due to conte...,[Rick B. Jenison],1301.5273v2,http://arxiv.org/abs/1301.5273v2,http://arxiv.org/pdf/1301.5273v2
3,Phen-Gen: combining phenotype and genotype to ...,"We introduce Phen-Gen, a method which combines...","[Asif Javed, Saloni Agrawal, Pauline C. Ng]",1502.07829v1,http://arxiv.org/abs/1502.07829v1,http://arxiv.org/pdf/1502.07829v1
4,Whole-Genome Sequence of the Trypoxylus dichot...,The draft whole-genome sequence of the Japanes...,[Norichika Ogata],2011.08845v1,http://arxiv.org/abs/2011.08845v1,http://arxiv.org/pdf/2011.08845v1
5,New strategies to improve minimap2 alignment a...,Summary: We present several recent improvement...,[Heng Li],2108.03515v1,http://arxiv.org/abs/2108.03515v1,http://arxiv.org/pdf/2108.03515v1
6,Quantum Gradient Optimized Drug Repurposing Pr...,This paper presents a novel quantum-enhanced p...,"[Don Roosan, Saif Nirzhor, Rubayat Khan, Fahmi...",2506.19097v1,http://arxiv.org/abs/2506.19097v1,http://arxiv.org/pdf/2506.19097v1
7,tRNA-alike in Nanoarchaeum equitans ?,The recent algorithm for five split tRNA-genes...,"[Bibekanand Mallick, Jayprokas Chakrabarti, Zh...",0504034v1,http://arxiv.org/abs/q-bio/0504034v1,http://arxiv.org/pdf/q-bio/0504034v1
8,Identity Elements of Archaeal tRNA,Features unique to a transfer-RNA are recogniz...,"[Bibekanand Mallick, Jayprokas Chakrabarti, Sa...",0506020v2,http://arxiv.org/abs/q-bio/0506020v2,http://arxiv.org/pdf/q-bio/0506020v2
9,Complex Network Approach to Human Promoter Seq...,Based upon the correlation matrix of the human...,"[Huijie Yang, Fangcui Zhao, Binghong Wang]",0508018v1,http://arxiv.org/abs/q-bio/0508018v1,http://arxiv.org/pdf/q-bio/0508018v1


In [11]:
import json
file_name = '/content/arxiv_dataset.json'
with  open(file_name, 'r') as file:
    data = json.load(file)

print(data)

[{'title': 'Keynotes on membrane proteomics', 'summary': 'This review article deals with the specificities of the proteomics analysis\nof membrane proteins.', 'authors': ['Thierry Rabilloud'], 'arxiv_id': '0807.1039v1', 'url': 'http://arxiv.org/abs/0807.1039v1', 'pdf_link': 'http://arxiv.org/pdf/0807.1039v1'}, {'title': 'Introns Restructure tRNA Genes of Archaea', 'summary': 'This paper has been withdrawn by the author.', 'authors': ['Zhumur Ghosh', 'Smarajit Das', 'Jayprokas Chakrabarti', 'Bibekanand Mallick', 'Satyabrata Sahoo'], 'arxiv_id': '0507008v2', 'url': 'http://arxiv.org/abs/q-bio/0507008v2', 'pdf_link': 'http://arxiv.org/pdf/q-bio/0507008v2'}, {'title': 'Using Periodicity of Nucleotide Sequences', 'summary': 'Withdrawn by arXiv administrators due to content entirely plagiarized from\nother authors (not in arXiv).', 'authors': ['Rick B. Jenison'], 'arxiv_id': '1301.5273v2', 'url': 'http://arxiv.org/abs/1301.5273v2', 'pdf_link': 'http://arxiv.org/pdf/1301.5273v2'}, {'title': '

In [12]:
import pandas as pd
df = pd.DataFrame(data)
df.sample(n=5)

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link
6,Quantum Gradient Optimized Drug Repurposing Pr...,This paper presents a novel quantum-enhanced p...,"[Don Roosan, Saif Nirzhor, Rubayat Khan, Fahmi...",2506.19097v1,http://arxiv.org/abs/2506.19097v1,http://arxiv.org/pdf/2506.19097v1
3,Phen-Gen: combining phenotype and genotype to ...,"We introduce Phen-Gen, a method which combines...","[Asif Javed, Saloni Agrawal, Pauline C. Ng]",1502.07829v1,http://arxiv.org/abs/1502.07829v1,http://arxiv.org/pdf/1502.07829v1
13,Organelle proteomics,This unit describes strategies for studying th...,"[Pierre Lescuyer, Mireille Chevallet, Sylvie L...",0904.0636v1,http://arxiv.org/abs/0904.0636v1,http://arxiv.org/pdf/0904.0636v1
10,Impact of Tandem Repeats on the Scaling of Nuc...,Techniques such as detrended fluctuation analy...,"[Radhakrishnan Nagarajan, Meenakshi Upreti]",0510042v1,http://arxiv.org/abs/q-bio/0510042v1,http://arxiv.org/pdf/q-bio/0510042v1
12,Proteomic nonlinear waves in networks of trans...,A chain of connected genes with activation-rep...,[A. S. Carstea],0611042v1,http://arxiv.org/abs/q-bio/0611042v1,http://arxiv.org/pdf/q-bio/0611042v1


In [13]:
import pandas as pd
import requests
import os

def download_pdfs(df, download_folder='files'):
    """
    Downloads PDFs from URLs listed in the DataFrame and saves them to a specified folder.
    The file names are stored in a new column 'pdf_file_name' in the DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing a 'pdf_link' column with URLs to download.
        download_folder (str): Path to the folder where PDFs will be saved (default is 'files').

    Returns:
        pd.DataFrame: The original DataFrame with an additional 'pdf_file_name' column containing
                      the paths of the downloaded PDF files or None if the download failed.
    """

    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    pdf_file_names = []

    # Loop through each row to download PDFs
    for index, row in df.iterrows():
        pdf_link = row['pdf_link']

        try:
            response = requests.get(pdf_link)
            response.raise_for_status()

            file_name = os.path.join(download_folder, pdf_link.split('/')[-1]) + '.pdf'
            pdf_file_names.append(file_name)

            # Save the downloaded PDF
            with open(file_name, 'wb') as f:
                f.write(response.content)

            print(f'PDF downloaded successfully and saved as {file_name}')

        except requests.exceptions.RequestException as e:
            print(f'Failed to download the PDF: {e}')
            pdf_file_names.append(None)

    df['pdf_file_name'] = pdf_file_names

    return df

In [14]:
df = download_pdfs(df)

PDF downloaded successfully and saved as files/0807.1039v1.pdf
Failed to download the PDF: 404 Client Error: Not Found for url: https://arxiv.org/pdf/q-bio/0507008v2
Failed to download the PDF: 404 Client Error: Not Found for url: https://arxiv.org/pdf/1301.5273v2
PDF downloaded successfully and saved as files/1502.07829v1.pdf
PDF downloaded successfully and saved as files/2011.08845v1.pdf
PDF downloaded successfully and saved as files/2108.03515v1.pdf
PDF downloaded successfully and saved as files/2506.19097v1.pdf
PDF downloaded successfully and saved as files/0504034v1.pdf
PDF downloaded successfully and saved as files/0506020v2.pdf
PDF downloaded successfully and saved as files/0508018v1.pdf
PDF downloaded successfully and saved as files/0510042v1.pdf
PDF downloaded successfully and saved as files/0609022v1.pdf
PDF downloaded successfully and saved as files/0611042v1.pdf
PDF downloaded successfully and saved as files/0904.0636v1.pdf
PDF downloaded successfully and saved as files/091

In [15]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_chunk_pdf(pdf_file_name, chunk_size=512):
    """
    Loads a PDF file and splits its content into chunks of a specified size.

    Args:
        file (str): Path to the PDF file to be loaded.
        chunk_size (int): The maximum size of each chunk in characters (default is 512).

    Returns:
        List[Document]: A list of document chunks.
    """

    print(f'Loading and splitting into chunks: {pdf_file_name}')

    # Load the content of the PDF
    loader = PyPDFLoader(pdf_file_name)
    data = loader.load()

    # Split the content into chunks with slight overlap to preserve context
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=64)
    chunks = text_splitter.split_documents(data)

    return chunks

In [16]:
def expand_df(df):
    """
    Expands each row in the DataFrame by splitting PDF documents into chunks.

    Args:
        df (pd.DataFrame): DataFrame containing 'pdf_file_name', 'arxiv_id', 'title', 'summary',
                           'authors', and 'url' columns.

    Returns:
        pd.DataFrame: A new DataFrame where each row represents a chunk of the original document,
                      with additional metadata such as chunk identifiers and relationships to
                      adjacent chunks.
    """

    expanded_rows = []  # List to store expanded rows with chunk information

    # Loop through each row in the DataFrame
    for idx, row in df.iterrows():
        try:
            chunks = load_and_chunk_pdf(row['pdf_file_name'])
        except Exception as e:
            print(f"Error processing file {row['pdf_file_name']}: {e}")
            continue

        # Loop over the chunks and construct a new DataFrame row for each
        for i, chunk in enumerate(chunks):
            prechunk_id = i-1 if i > 0 else ''  # Preceding chunk ID
            postchunk_id = i+1 if i < len(chunks) - 1 else ''  # Following chunk ID

            expanded_rows.append({
                'id': f"{row['arxiv_id']}#{i}",  # Unique chunk identifier
                'title': row['title'],
                'summary': row['summary'],
                'authors': row['authors'],
                'arxiv_id': row['arxiv_id'],
                'url': row['url'],
                'chunk': chunk.page_content,  # Text content of the chunk
                'prechunk_id': '' if i == 0 else f"{row['arxiv_id']}#{prechunk_id}",  # Previous chunk ID
                'postchunk_id': '' if i == len(chunks) - 1 else f"{row['arxiv_id']}#{postchunk_id}"  # Next chunk ID
            })

    # Return a new expanded DataFrame
    return pd.DataFrame(expanded_rows)

In [17]:
expanded_df = expand_df(df)

Loading and splitting into chunks: files/0807.1039v1.pdf
Loading and splitting into chunks: None
Error processing file None: File path None is not a valid file or url
Loading and splitting into chunks: None
Error processing file None: File path None is not a valid file or url
Loading and splitting into chunks: files/1502.07829v1.pdf




Loading and splitting into chunks: files/2011.08845v1.pdf
Loading and splitting into chunks: files/2108.03515v1.pdf
Loading and splitting into chunks: files/2506.19097v1.pdf
Loading and splitting into chunks: files/0504034v1.pdf
Loading and splitting into chunks: files/0506020v2.pdf
Loading and splitting into chunks: files/0508018v1.pdf
Loading and splitting into chunks: files/0510042v1.pdf
Loading and splitting into chunks: files/0609022v1.pdf
Loading and splitting into chunks: files/0611042v1.pdf
Loading and splitting into chunks: files/0904.0636v1.pdf
Loading and splitting into chunks: files/0910.0429v1.pdf
Loading and splitting into chunks: files/1203.2178v1.pdf
Loading and splitting into chunks: files/1203.3929v1.pdf
Loading and splitting into chunks: files/1207.3907v2.pdf
Loading and splitting into chunks: files/1301.5187v1.pdf
Loading and splitting into chunks: files/1310.3197v1.pdf


In [18]:
import os
from getpass import getpass

from semantic_router.encoders import OpenAIEncoder

# Check if 'OPENAI_API_KEY' is set; prompt if not
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY') or getpass('OpenAI API key: ')

# Initialize the OpenAIEncoder with a specific model
encoder = OpenAIEncoder(name='text-embedding-3-small')

In [19]:
encoder('hello hallo hola salut')

[[0.013684329576790333,
  0.006457979325205088,
  0.01937759667634964,
  0.030178703367710114,
  0.015191800892353058,
  0.0020563083235174417,
  -0.0031850826926529408,
  0.05090278014540672,
  -0.015411335974931717,
  -0.04999537020921707,
  0.012872051447629929,
  -0.001522107282653451,
  -0.07101215422153473,
  -0.004562662914395332,
  0.024514708667993546,
  0.012937911786139011,
  -0.023973189294338226,
  -0.003260090248659253,
  0.014233166351914406,
  0.04150669276714325,
  -0.02092897519469261,
  0.03369125723838806,
  0.041155438870191574,
  0.03928207606077194,
  0.007361730094999075,
  -0.009432674385607243,
  -0.010515712201595306,
  -0.01399899646639824,
  0.02890540286898613,
  -0.015572328120470047,
  0.034745026379823685,
  -0.032754577696323395,
  0.02717839553952217,
  -0.02918347902595997,
  -0.007720303721725941,
  0.044111840426921844,
  -0.04935140162706375,
  0.04329224303364754,
  -0.02691495418548584,
  0.022802338004112244,
  -0.010281541384756565,
  0.000187

In [20]:
dims = len(encoder('hello hallo hola salut')[0])
dims

1536

In [21]:
from pinecone import Pinecone, ServerlessSpec

# Check if 'PINECONE_API_KEY' is set; prompt if not
api_key = os.getenv('PINECONE_API_KEY') or getpass('Pinecone API key: ')

# Initialize the Pinecone client
pc = Pinecone(api_key=api_key)

# Define the serverless specification for Pinecone (AWS region 'us-east-1')
spec = ServerlessSpec(
    cloud='aws',
    region='us-east-1'
)


In [22]:
import time
from pinecone import Pinecone, ServerlessSpec

# Initialize client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "langgraph-research-agent"
dims = 1536
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Create index if missing
existing = [i.name for i in pc.list_indexes()]
if index_name not in existing:
    print(f"Creating index '{index_name}'...")
    pc.create_index(index_name, dimension=dims, metric="cosine", spec=spec)
    while not pc.describe_index(index_name).status.get("ready"):
        time.sleep(1)
    print("✅ Index ready.")

# Connect to index
index = pc.Index(index_name)

# Confirm stats
stats = index.describe_index_stats()
print(stats)

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5924}},
 'total_vector_count': 5924,
 'vector_type': 'dense'}


In [23]:
from tqdm.auto import tqdm

data = expanded_df
batch_size = 64  # Set batch size

# Loop through the data in batches, using tqdm for a progress bar
for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i + batch_size)  # Define batch endpoint
    batch = data[i:i_end].to_dict(orient='records')  # Slice data into a batch

    # Extract metadata for each chunk in the batch
    metadata = [{
        'arxiv_id': r['arxiv_id'],
        'title': r['title'],
        'chunk': r['chunk'],
    } for r in batch]

    # Generate unique IDs for each chunk
    ids = [r['id'] for r in batch]

    # Extract the chunk content
    chunks = [r['chunk'] for r in batch]

    # Convert chunks into embeddings
    embeds = encoder(chunks)

    # Upload embeddings, IDs, and metadata to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/20 [00:00<?, ?it/s]

In [24]:
{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5765}},
 'total_vector_count': 5765,
 'vector_type': 'dense'}

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 5765}},
 'total_vector_count': 5765,
 'vector_type': 'dense'}

In [25]:
from langchain_core.tools import tool
import requests
import re

# Compile a regular expression pattern to find the abstract in the HTML response
abstract_pattern = re.compile(
    r'<blockquote class="abstract mathjax">\s*<span class="descriptor">Abstract:</span>\s*(.*?)\s*</blockquote>',
    re.DOTALL
)

@tool('fetch_arxiv')
def fetch_arxiv(arxiv_id: str) -> str:
    '''Fetches the abstract from an ArXiv paper given its ArXiv ID.

    Args:
        arxiv_id (str): The ArXiv paper ID.

    Returns:
        str: The extracted abstract text from the ArXiv paper.
    '''

    res = requests.get(f'https://arxiv.org/abs/{arxiv_id}')

    re_match = abstract_pattern.search(res.text)

    return re_match.group(1) if re_match else 'Abstract not found.'

In [26]:
# Defining the ArXiv paper ID and invoking the tool with that ID.
arxiv_id = '1502.07829v1'
output = fetch_arxiv.invoke(input={'arxiv_id': arxiv_id})
print(output)

We introduce Phen-Gen, a method which combines patient disease symptoms and sequencing data with prior domain knowledge to identify the causative gene(s) for rare disorders.


In [27]:
# --- 1. Imports & setup ---
from langchain.tools import tool
from serpapi import GoogleSearch
import os

# --- 2. Define SerpAPI parameters (use the correct env var) ---
SERPAPI_KEY = os.getenv("SERPAPI_KEY", "").strip()
if not SERPAPI_KEY:
    raise RuntimeError("❌ SERPAPI_KEY not found — check your .env file.")

serpapi_params = {
    "engine": "google",
    "hl": "en",
    "gl": "us",
    "api_key": SERPAPI_KEY,
}

# --- 3. Define the tool ---
@tool('web_search')
def web_search(query: str) -> str:
    '''Search the PUBLIC WEB for NEW or RECENT information (last ~3 years).
        Prefer this when the query mentions 'recent', 'latest', 'today', dates,
        or 'breakthroughs', or when RAG results look old '''
    search = GoogleSearch({
        **serpapi_params,
        "q": query,
        "num": 5
    })
    results = search.get_dict().get("organic_results", [])
    if not results:
        return "No results found."
    return "\n---\n".join(
        f"{r.get('title')}\n{r.get('snippet')}\n{r.get('link')}"
        for r in results
    )

# --- 4. Test the tool ---
output = web_search.invoke(input={"query": "villas in france"})
print(output)


Villas in France | French Villas with Pools Rental 2025
From elegant chateaux to stylish villas with pools, our extensive portfolio ensures your luxury villa holiday in France is nothing short of extraordinary.
https://www.oliverstravels.com/france/
---
Best Luxury Villas in France 2025/2026
From grandiose farmhouses in the rolling hills of Provence to beautiful villas in the south of France, we offer a huge choice of luxury french villas.
https://www.akvillas.com/destinations/france
---
Villas for sale in France
Find villas for sale in France. View details & photos, save properties, refine your search & book viewings on French-Property.com.
https://www.french-property.com/properties-for-sale?property_types_any=villa
---
Villas and vacation rentals France
An incredible array of luxury villas are available in France that take full advantage of the huge range of geographic and cultural variations on offer.
https://www.relaischateaux.com/us/villas/destination/france
---
France Villa Vacat

In [28]:
def format_rag_contexts(matches: list) -> str:
    '''Formats the retrieved context matches into a readable string format.

    Args:
        matches (list): A list of matched documents with metadata.

    Returns:
        str: A formatted string of document titles, chunks, and ArXiv IDs.
    '''
    formatted_results = []

    # Loop through each match and extract its metadata.
    for x in matches:
        text = (
            f"Title: {x['metadata']['title']}\n"
            f"Chunk: {x['metadata']['chunk']}\n"
            f"ArXiv ID: {x['metadata']['arxiv_id']}\n"
        )
        # Append each formatted string to the results list.
        formatted_results.append(text)

    # Join all the individual formatted strings into one large string.
    return '\n---\n'.join(formatted_results)

In [29]:
from langchain_core.tools import tool

@tool
def rag_search_filter(query: str, arxiv_id: str) -> str:
    '''Finds information from the ArXiv database using a natural language query and a specific ArXiv ID.

    Args:
        query (str): The search query in natural language.
        arxiv_id (str): The ArXiv ID of the specific paper to filter by.

    Returns:
        str: A formatted string of relevant document contexts.
    '''

    # Encode the query into a vector representation.
    xq = encoder([query])

    # Perform a search on the Pinecone index, filtering by ArXiv ID.
    xc = index.query(vector=xq, top_k=6, include_metadata=True, filter={'arxiv_id': arxiv_id})

    # Format and return the search results.
    return format_rag_contexts(xc['matches'])


In [30]:
@tool('rag_search')
def rag_search(query: str) -> str:
    '''Finds specialist information on Genomes using a natural language query.

    Args:
        query (str): The search query in natural language.

    Returns:
        str: A formatted string of relevant document contexts.
    '''

    # Encode the query into a vector representation.
    xq = encoder([query])

    # Perform a broader search without filtering by ArXiv ID.
    xc = index.query(vector=xq, top_k=5, include_metadata=True)

    # Format and return the search results.
    return format_rag_contexts(xc['matches'])


In [148]:
from typing import Union, List
from langchain_core.tools import tool  # keep same decorator you used elsewhere

@tool
def final_answer(
    introduction: str,
    research_steps: Union[str, List[str]],
    main_body: str,
    conclusion: str,
    sources: Union[str, List[str]]
) -> str:
    '''Returns a natural language response in the form of a research report.

    Args:
        introduction (str): A short paragraph introducing the user's question and the topic.
        research_steps (str | List[str]): Bullet points or text explaining the steps taken for research.
        main_body (str): The bulk of the answer, 3-4 paragraphs long, providing high-quality information.
        conclusion (str): A short paragraph summarizing the findings.
        sources (str | List[str]): A list or text providing the sources referenced during the research.

    Returns:
        str: A formatted research report string.
    '''

    # If research_steps is a list, join into bullets
    if isinstance(research_steps, list):
        research_steps = '\n'.join(f'- {r}' for r in research_steps if str(r).strip())

    # If sources is a list, join into bullets
    if isinstance(sources, list):
        sources = '\n'.join(f'- {s}' for s in sources if str(s).strip())

    return (
        f"{introduction}\n\n"
        f"Research Steps:\n{research_steps}\n\n"
        f"Main Body:\n{main_body}\n\n"
        f"Conclusion:\n{conclusion}\n\n"
        f"Sources:\n{sources}"
    )

In [149]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Write your raw system prompt exactly as you want it (with normal braces).
_raw_system_prompt = (
    '''You are **The Supervisor** — an AI controller that decides the next best action for a RAG/Research workflow.

You receive a STATE object (read-only) with keys like:
- user_query: str
- messages: list                # conversation turns
- scratchpad: list              # short notes/evidence gathered so far
- tools_used: list[{"tool": str, "query": str, "count": int}]
- last_tool_result: any         # result from the previous tool call
- namespace: str | null         # e.g., 'q-bio.GN'
- constraints: dict | null      # e.g., {"max_tool_calls": 2}
- context_flags: dict | null    # e.g., {"need_citations": true}
- filters: dict | null          # e.g., {"arxiv_id": "1706.03762"}

You can select exactly ONE action per turn:
{"action":"call_tool","tool":"<tool_name>","args":{...}}

====================
DECISION POLICY
====================
1) Decide Intelligently
   - Read user_query and scratchpad. If additional facts are required to answer confidently, pick the tool that adds the most NEW information.
   - Use tools only if they add factual/contextual value. Otherwise proceed to synthesis.

2) Non-Redundancy & Tool Budget
   - If a tool has already been used with the SAME effective query (normalize by lowercasing & trimming), do NOT call it again.
   - No tool may be used more than TWICE total across the entire reasoning sequence. If a tool appears twice in tools_used, do not use it again.
   - Prefer diversity: if two tools can answer similarly, pick the one not yet used.

3) Namespaces & Filters
   - If STATE.filters.arxiv_id exists, prefer a filtered RAG search (e.g., rag_search_filter) over a broad search.
   - If STATE.namespace exists (e.g., "q-bio.GN"), pass it to the tool to scope results.
   - Keep arguments minimal and precise (avoid verbose queries).

4) Sufficiency & Halting
   - If the scratchpad + last_tool_result provide enough evidence to answer the user’s query clearly and accurately, STOP by CALLING the `final_answer` tool.
   - If additional retrieval would be duplicative or low-yield, STOP and CALL `final_answer`.

5) Output Style (Final Answer)
   - Be concise, structured, and factual. Synthesize across sources.
   - If citations/links exist in scratchpad or tool results, include short source attributions inline (e.g., arXiv ID or URL).
   - Avoid speculation; if uncertain, state limits transparently.
   - **When finalizing, you MUST CALL the `final_answer` tool with ALL required fields. Do NOT return raw JSON answers in `content`.**

====================
TOOL SELECTION HEURISTICS
====================
- rag_search(query, namespace?, top_k?): Broad semantic search in Pinecone RAG DB.
- rag_search_filter(query, arxiv_id, namespace?): Targeted RAG search within a specific paper.
- web_search(query): Public web for recent info.
- final_answer(introduction, research_steps, main_body, conclusion, sources): Use when you have sufficient evidence to answer.

====================
ARGUMENT RULES
====================
- Always pass the user’s current intent in plain language (brief query).
- Include namespace if present in STATE.
- Include filters (e.g., arxiv_id) when available.
- Use small, sensible top_k (3–6) unless the query is broad.

====================
SAFETY & ROBUSTNESS
====================
- If last_tool_result indicates no matches: try a different tool or refine the query once; do not loop endlessly.
- Do not invent tools or parameters. Use only provided tools.
- Never leak internal state; only return the JSON action.

====================
RETURN FORMAT (STRICT JSON)
====================
Return ONLY one JSON object on a single line, no prose. Examples:

# Example A: Call a filtered RAG search
{"action":"call_tool","tool":"rag_search_filter","args":{"query":"extract main contribution and methods section","arxiv_id":"1706.03762","namespace":"q-bio.GN"}}

# Example B: Call a broad RAG search
{"action":"call_tool","tool":"rag_search","args":{"query":"single-cell RNA-seq normalization approaches","namespace":"q-bio.GN","top_k":5}}

# Example C: FINALIZE by CALLING the `final_answer` TOOL (do NOT write raw JSON answers)
{"action":"call_tool","tool":"final_answer","args":{
  "introduction":"2–3 sentence intro that frames the user’s question and scope.",
  "research_steps":["Step 1 source …","Step 2 source …","Step 3 source …"],
  "main_body":"3–6 short paragraphs synthesizing findings with attributions.",
  "conclusion":"1–2 sentence takeaway.",
  "sources":["https://example.com/a","https://example.com/b","https://arxiv.org/abs/XXXX.XXXXX"]
}}

====================
THINK, THEN ACT
====================
1) Read STATE.
2) Check tools_used to avoid duplication and budget overuse.
3) Decide the single highest-value next action.
4) Return the JSON action (and nothing else).

Given the user's query and previous tool outputs (in the scratchpad),
choose the next most appropriate tool from this list:
- fetch_arxiv → retrieve relevant academic papers
- rag_search → perform vector search in Pinecone
- web_search → search the web for the most recent information
- rag_search_filter → refine the RAG results
- final_answer → summarize and synthesize all gathered information

Do NOT call the same tool more than twice.
Try to gather information from multiple sources before concluding.
Always end by CALLING `final_answer` once enough context has been gathered.'''
)

# Escape braces so LangChain doesn't treat them as template vars.
system_prompt = _raw_system_prompt.replace("{", "{{").replace("}", "}}")

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("assistant", "scratchpad: {scratchpad}"),
])

In [150]:
from langchain_core.messages import ToolCall, ToolMessage
from langchain_openai import ChatOpenAI
import os

# Initialize the OpenAI language model with specific settings.
llm = ChatOpenAI(
    model='gpt-4o',
    openai_api_key=os.environ['OPENAI_API_KEY'],
    temperature=0
)

# Define the list of tools available to the supervisor.
tools = [
    rag_search_filter,
    rag_search,
    fetch_arxiv,
    web_search,
    final_answer
]

def create_scratchpad(intermediate_steps: list) -> str:
    MAX_CHARS = 4000  # trim to keep TPM under control
    research_steps = []
    for step in intermediate_steps or []:
        if getattr(step, "log", "TBD") != "TBD":
            line = (
                f"Tool: {step.tool}, input: {step.tool_input}\n"
                f"Output: {str(step.log)[:1200]}"  # cap each tool output
            )
            research_steps.append(line)
    text = "\n---\n".join(research_steps)
    return text[:MAX_CHARS]

# Define the supervisor's decision-making pipeline.
supervisor = (
    {
        "input": lambda x: x["input"],
        "chat_history": lambda x: x["chat_history"],
        "scratchpad": lambda x: create_scratchpad(intermediate_steps=x["intermediate_steps"]),
    }
    | prompt.partial(
        routing_rule=(
            "Routing rule: If the user asks for 'recent', 'latest', 'today', dates, or "
            "'breakthroughs', you MUST choose `web_search` over `rag_search` unless the user "
            "explicitly says to use the local index."
        )
    )
    | llm.bind_tools(tools, tool_choice="auto")
)

In [151]:
input = 'What is the ArXiv paper with the ID 1502.07829v1 all about?'

inputs = {
    'input': input,
    'chat_history': [],
    'intermediate_steps': [],
}


out = supervisor.invoke(inputs)


print(out)

content='' additional_kwargs={'tool_calls': [{'id': 'call_8vOaWnD971P5JBpRA55iUZPx', 'function': {'arguments': '{"arxiv_id":"1502.07829v1"}', 'name': 'fetch_arxiv'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 1704, 'total_tokens': 1727, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_f64f290af2', 'id': 'chatcmpl-CSOrpbZ2NxdSJvWbKrcKHjwrQxxud', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--00f8f96c-89bb-46e3-a00f-e01fb5f71cdf-0' tool_calls=[{'name': 'fetch_arxiv', 'args': {'arxiv_id': '1502.07829v1'}, 'id': 'call_8vOaWnD971P5JBpRA55iUZPx', 'type': 'tool_call'}] usage_metadata={'input_tokens': 1704, 'output_tokens': 23, 'total_tokens': 1727, 'input_token_details'

In [152]:

out.tool_calls[0]['name']


'fetch_arxiv'

In [153]:

out.tool_calls[0]['args']

{'arxiv_id': '1502.07829v1'}

In [165]:
def run_supervisor(state: dict) -> dict:
    """Ask the model which tool to run next; allow any tool; robust fallbacks."""
    print("run_supervisor")
    steps = list(state.get("intermediate_steps", []))
    user_input = state.get("input", "")
    chat_history = state.get("chat_history", [])

    out = supervisor.invoke({
        "input": user_input,
        "chat_history": chat_history,
        "intermediate_steps": steps,
    })

    # Extract tool calls robustly (covers SDK variants)
    tool_calls = []
    if hasattr(out, "tool_calls") and out.tool_calls:
        tool_calls = list(out.tool_calls)
    elif hasattr(out, "additional_kwargs"):
        tool_calls = list((out.additional_kwargs or {}).get("tool_calls", []) or [])

    print(f"supervisor content: {getattr(out, 'content', None)}")
    print(f"supervisor tool_calls: {tool_calls}")

    # NEW: obey JSON action emitted in content (if present)
    content = (getattr(out, "content", "") or "").strip()
    if not tool_calls and content.startswith("{"):
        try:
            import json
            obj = json.loads(content)
            if obj.get("action") == "call_tool":
                steps.append(AgentAction(
                    tool=obj.get("tool"),
                    tool_input=obj.get("args", {}) or {},
                    log="TBD"
                ))
                return {"intermediate_steps": steps}
        except Exception:
            pass


    if not tool_calls:
        action = AgentAction(
            tool="final_answer",
            tool_input={
                "introduction": "",
                "research_steps": create_scratchpad(steps),
                "main_body": getattr(out, "content", "") or "",
                "conclusion": "",
                "sources": "",
            },
            log="TBD",
        )
        steps.append(action)
        return {"intermediate_steps": steps}

    # Normal path: run the first selected tool as-is
    first = tool_calls[0]
    tool_name = first.get("name")
    tool_args = first.get("args", {}) or {}

    steps.append(AgentAction(tool=tool_name, tool_input=tool_args, log="TBD"))
    return {"intermediate_steps": steps}


In [166]:
# The router() function determines the next tool to use based on the current state.
def router(state: dict) -> str:
    '''Determines the next tool to use based on the current state.

    Args:
        state (dict): The current state containing 'intermediate_steps'.

    Returns:
        str: The name of the tool to use next.
    '''

    if isinstance(state['intermediate_steps'], list):
        return state['intermediate_steps'][-1].tool
    else:
        print('Router invalid format')
        return 'final_answer'


tool_str_to_func = {
    'rag_search_filter': rag_search_filter,
    'rag_search': rag_search,
    'fetch_arxiv': fetch_arxiv,
    'web_search': web_search,
    'final_answer': final_answer
}

# The run_tool() function executes the appropriate tool based on the current state.
def run_tool(state: dict) -> dict:
    '''Executes the appropriate tool based on the current state.

    Args:
        state (dict): The current state containing the 'intermediate_steps'.

    Returns:
        dict: A new state with updated 'intermediate_steps' including the tool's result.
    '''

    tool_name = state['intermediate_steps'][-1].tool
    tool_args = state['intermediate_steps'][-1].tool_input

    print(f'{tool_name}.invoke(input={tool_args})')

    out = tool_str_to_func[tool_name].invoke(input=tool_args)

    action_out = AgentAction(
        tool=tool_name,
        tool_input=tool_args,
        log=str(out)
    )

    return {'intermediate_steps': [action_out]}

In [167]:
from typing import TypedDict, Annotated, List
from langchain_core.agents import AgentAction
from langchain_core.messages import BaseMessage
import operator

class AgentState(TypedDict):
    '''Represents the state of an agent.'''

    input: str
    chat_history: List[BaseMessage]
    intermediate_steps: Annotated[List[tuple[AgentAction, str]], operator.add]

In [172]:
from langgraph.graph import StateGraph, END

# Initialize the state graph with AgentState to manage the workflow.
graph = StateGraph(AgentState)

graph.add_node('supervisor', run_supervisor)
graph.add_node('rag_search_filter', run_tool)
graph.add_node('rag_search', run_tool)
graph.add_node('fetch_arxiv', run_tool)
graph.add_node('web_search', run_tool)
graph.add_node('final_answer', run_tool)

# Set the entry point to 'supervisor'.
graph.set_entry_point('supervisor')

# Add conditional edges to determine the next step using the router function.
graph.add_conditional_edges(source='supervisor', path=router)

# Add edges from each tool back to 'supervisor', except 'final_answer', which leads to 'END'.
for tool_obj in tools:
    if tool_obj.name != 'final_answer':
        graph.add_edge(tool_obj.name, 'supervisor')

graph.add_edge('final_answer', END)

# Compile the graph to make it executable.
runnable = graph.compile()

In [173]:
def run_pipeline(user_input: str, chat_history=None, max_steps: int = 6):
    state = {"input": user_input, "chat_history": chat_history or [], "intermediate_steps": []}
    for _ in range(max_steps):
        state = run_supervisor(state)
        tool_name = router(state)
        state = run_tool(state)
        # Stop if we just produced the final answer
        if tool_name == "final_answer":
            break
    return state

In [177]:
# Run the full oracle → tool → feedback loop
result = run_pipeline(
    user_input="Summarize recent breakthroughs in single-cell RNA sequencing from q-bio.GN papers.",
    chat_history=[],
    max_steps=10  # Increase if you want it to explore more tools
)

print(result)

run_supervisor
supervisor content: 
supervisor tool_calls: [{'name': 'rag_search', 'args': {'query': 'recent breakthroughs in single-cell RNA sequencing', 'namespace': 'q-bio.GN', 'top_k': 5}, 'id': 'call_JA65T3OE7B5Y4vfxTGkCJZsW', 'type': 'tool_call'}]
rag_search.invoke(input={'query': 'recent breakthroughs in single-cell RNA sequencing', 'namespace': 'q-bio.GN', 'top_k': 5})
run_supervisor
supervisor content: {"action":"call_tool","tool":"web_search","args":{"query":"recent breakthroughs in single-cell RNA sequencing"}}
supervisor tool_calls: []
web_search.invoke(input={'query': 'recent breakthroughs in single-cell RNA sequencing'})
run_supervisor
supervisor content: {"action":"call_tool","tool":"final_answer","args":{"introduction":"Single-cell RNA sequencing (scRNA-seq) is a powerful technique that allows researchers to analyze the gene expression of individual cells, providing insights into cellular heterogeneity and function. Recent advancements in this field have focused on over

In [178]:
def build_report(output: dict) -> str:
    """Builds a formatted research report from the agent's final_answer payload."""
    # Normalize lists into text
    def normalize(value):
        if isinstance(value, list):
            return "\n".join(f"- {v}" for v in value)
        return str(value or "").strip()

    intro = normalize(output.get("introduction", ""))
    steps = normalize(output.get("research_steps", ""))
    body = normalize(output.get("main_body", ""))
    concl = normalize(output.get("conclusion", ""))
    sources = normalize(output.get("sources", ""))

    return f"""
INTRODUCTION
------------
{intro}

RESEARCH STEPS
--------------
{steps}

REPORT
------
{body}

CONCLUSION
----------
{concl}

SOURCES
-------
{sources}
""".strip()

In [179]:
payload = result["intermediate_steps"][-1].tool_input
print(build_report(payload))

INTRODUCTION
------------
Single-cell RNA sequencing (scRNA-seq) is a powerful technique that allows researchers to analyze the gene expression of individual cells, providing insights into cellular heterogeneity and function. Recent advancements in this field have focused on overcoming technical challenges and enhancing the accuracy and efficiency of data analysis.

RESEARCH STEPS
--------------
- Conducted a web search for recent breakthroughs in single-cell RNA sequencing.
- Reviewed recent articles and reviews on advancements in scRNA-seq techniques and applications.

REPORT
------
Recent developments in single-cell RNA sequencing have been significant, particularly in the context of bacterial scRNA-seq. These advancements address technical challenges and introduce methodological innovations that enhance the resolution and accuracy of cellular analysis [ScienceDirect]. Additionally, new technological developments in single-cell sequencing approaches have been highlighted, focusing o