# Lesson 3: Chatbot Example

Build a chatbot that uses arXiv to search for papers and to look up information about the papers it has found.

## Import Libraries

In [2]:
! pip install arxiv
! pip install python-dotenv
! pip install anthropic

Collecting arxiv
  Downloading arxiv-2.2.0-py3-none-any.whl.metadata (6.3 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... done
Downloading arxiv-2.2.0-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.3/81.3 kB 2.6 MB/s eta 0:00:00
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... done
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=e62af96c48d7c513b574dd289aa208d727f99581d8f5827043c6e567c5943d4d
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packag

In [3]:
import arxiv
import json
import os
from typing import List
from dotenv import load_dotenv
import anthropic

In [4]:
PAPER_DIR = "papers" # root directory for saved results; search_papers creates one sub-directory per topic inside it

## Search Papers function

In [31]:
def search_papers(topic: str, max_results: int = 5) -> List[str]:
  """
  Search for papers on arXiv based on a topic and store their information.

  The metadata of every result is merged into
  ``<PAPER_DIR>/<topic directory>/papers_info.json``, keyed by the paper's
  short ID. Entries already present in that file are kept; an entry with the
  same ID is overwritten with the fresh metadata.

  Args:
    topic: The topic to search for papers on.
    max_results: The maximum number of results to return. (default: 5)

  Returns:
    List of paper IDs found in the search.
  """

  # Use arxiv to find the papers
  client = arxiv.Client()

  # Search for the most relevant articles matching the queried topic
  search = arxiv.Search(
      query = "all:" + topic,
      max_results = max_results,
      sort_by = arxiv.SortCriterion.Relevance,
  )

  papers = client.results(search)

  # Turn the topic into ONE directory name directly under PAPER_DIR.
  # Path separators are replaced as well as spaces: a topic such as "AI/ML"
  # would otherwise create nested directories (which extract_info, scanning
  # only the first level of PAPER_DIR, never reads), and "." / ".." would
  # point at PAPER_DIR itself or its parent.
  topic_dir = topic.lower().replace(" ", "_")
  for separator in ("/", "\\"):
    topic_dir = topic_dir.replace(separator, "_")
  if topic_dir in ("", ".", ".."):
    topic_dir = "_"

  # Create directory for this topic
  path = os.path.join(PAPER_DIR, topic_dir)
  os.makedirs(path, exist_ok=True)

  file_path = os.path.join(path, "papers_info.json")

  # Try to load existing papers info; start fresh if the file is missing
  # or does not contain valid JSON
  try:
    with open(file_path, "r") as json_file:
      papers_info = json.load(json_file)
  except (FileNotFoundError, json.JSONDecodeError):
    papers_info = {}

  # Process each paper and add to papers_info
  paper_ids = []
  for paper in papers:
    paper_id = paper.get_short_id()
    paper_ids.append(paper_id)
    papers_info[paper_id] = {
        "title": paper.title,
        "summary": paper.summary,
        "authors": [author.name for author in paper.authors],
        "pdf_url": paper.pdf_url,
        "published": str(paper.published.date())
    }

  # Save updated papers_info to json file
  with open(file_path, "w") as json_file:
    json.dump(papers_info, json_file, indent=2)

  print(f"Results are saved in: {file_path}")

  return paper_ids

In [32]:
# Try the tool: uses the default max_results (5); prints where the results were saved and returns the paper IDs
search_papers("computers")

Results are saved in: papers/computers/papers_info.json


['1312.3300v1', '2207.05241v1', '2012.10468v1', '2009.00041v1', '2009.08005v1']

## Extract Information from Papers
The second tool looks for information about a specific paper across all topic directories inside the `papers` directory.

In [33]:
def extract_info(paper_id: str) -> str:
  """
  Search for information about a specific paper across all topic directories.

  Every first-level sub-directory of PAPER_DIR is checked for a
  ``papers_info.json`` file; the first file that contains ``paper_id`` wins.
  Files that cannot be read or parsed are reported with a printed message and
  skipped.

  Args:
    paper_id: The ID of the paper to search for.

  Returns:
    JSON string containing paper information if found, error message if not found.
  """
  not_found = f"There is no saved information saved about paper {paper_id}."

  # Nothing has been saved yet (search_papers creates PAPER_DIR on first use),
  # so report "not found" instead of letting os.listdir raise.
  if not os.path.isdir(PAPER_DIR):
    return not_found

  for item in os.listdir(PAPER_DIR):
    # isfile() is False for anything that is not <topic dir>/papers_info.json,
    # including stray files sitting directly in PAPER_DIR.
    file_path = os.path.join(PAPER_DIR, item, "papers_info.json")
    if not os.path.isfile(file_path):
      continue

    try:
      with open(file_path, "r") as json_file:
        papers_info = json.load(json_file)
    except (OSError, json.JSONDecodeError) as e:
      print(f"Error reading {file_path}: {str(e)}")
      continue

    # Guard against a file whose top-level JSON value is not an object
    if isinstance(papers_info, dict) and paper_id in papers_info:
      return json.dumps(papers_info[paper_id], indent=2)

  return not_found

In [34]:
# Look up one of the IDs returned by the search above; the saved record comes back as a JSON string
extract_info('1312.3300v1')

'{\n  "title": "Numerical Reproducibility and Parallel Computations: Issues for Interval Algorithms",\n  "summary": "What is called \\"numerical reproducibility\\" is the problem of getting the same result when the scientific computation is run several times, either on the same machine or on different machines, with different types and numbers of processing units, execution environments, computational loads etc. This problem is especially stringent for HPC numerical simulations. In what follows, the focus is on parallel implementations of interval arithmetic using floating-point arithmetic. For interval computations, numerical reproducibility is of course an issue for testing and debugging purposes. However, as long as the computed result encloses the exact and unknown result, the inclusion property, which is the main property of interval arithmetic, is satisfied and getting bit for bit identical results may not be crucial. Still, implementation issues may invalidate the inclusion prop

## Tools List
Each tool in the list is described by a name, a description, and an input schema (a JSON Schema object) that tells the model which arguments to supply.

This list works like a declarative configuration file (e.g. YAML): it only defines a "contract" describing each tool and its arguments; it does not call the functions themselves.

In [35]:
# Contract for the search tool: only `topic` is mandatory.
_search_papers_tool = {
    "name": "search_papers",
    "description": "Search for papers on arXiv based on a topic and store their information.",
    "input_schema": {
        "type": "object",
        "properties": {
            "topic": {
                "type": "string",
                "description": "The topic to search for papers on.",
            },
            "max_results": {
                "type": "integer",
                "description": "The maximum number of results to return. (default: 5)",
            },
        },
        "required": ["topic"],
    },
}

# Contract for the lookup tool: takes a single mandatory paper ID.
_extract_info_tool = {
    "name": "extract_info",
    "description": "Search for information about a specific paper across all topic directories",
    "input_schema": {
        "type": "object",
        "properties": {
            "paper_id": {
                "type": "string",
                "description": "The ID of the paper to search for.",
            },
        },
        "required": ["paper_id"],
    },
}

# Tool definitions handed to the model; the names match the Python functions above.
tools = [_search_papers_tool, _extract_info_tool]

## Tools Mapping


In [36]:
# Dispatch table: tool name (as declared in `tools`) -> Python function implementing it
mapping_tool_function = dict(
    search_papers=search_papers,
    extract_info=extract_info,
)

def execute_tool(tool_name, tool_args):
  """
  Execute a tool based on its name and return its result as a string.

  Args:
    tool_name: Key of the tool in mapping_tool_function.
    tool_args: Dict of keyword arguments passed to the tool function
      (None or an empty dict means "no arguments").

  Returns:
    A string: a fixed notice when the tool returned None, the items joined
    with ", " for a list, indented JSON for a dict, ``str(result)`` for
    anything else, or an error message when the tool name is unknown.
  """
  tool_function = mapping_tool_function.get(tool_name)
  if tool_function is None:
    # Report the problem as a tool result instead of raising a bare KeyError
    return f"Error: unknown tool '{tool_name}'."

  # Execute function to get result
  result = tool_function(**(tool_args or {}))

  # Format result into a string
  if result is None:
    return "The operation completed, but did not return any results."

  if isinstance(result, list):
    # str() each item so lists of non-string values do not break the join
    return ', '.join(str(item) for item in result)

  if isinstance(result, dict):
    return json.dumps(result, indent=2)

  # all other formats
  return str(result)

## Chatbot

In [51]:
from google.colab import userdata

In [52]:
# NOTE(review): load_dotenv() presumably loads variables from a local .env file;
# the key passed to the client below is read through Colab's `userdata`, not
# from the environment — confirm whether this call is still needed.
load_dotenv()
# Shared Anthropic client used by process_query. The API key is looked up by
# name via google.colab's `userdata` instead of being hard-coded in the notebook.
client = anthropic.Anthropic(
    api_key=userdata.get('ANTHROPIC_API_KEY')
)

In [47]:
def process_query(query, model='claude-3-7-sonnet-20250219', max_tokens=2024):
  """
  Send one user query to the model and run tools until it stops asking for them.

  Text blocks in each response are printed. For every tool_use block the
  matching function is run through execute_tool, and the results are sent back
  to the model in the next request. The conversation history is local to this
  call, so nothing is remembered between queries.

  Args:
    query: The user's question or instruction.
    model: Model name passed to the Messages API.
    max_tokens: Maximum number of tokens per model response.
  """
  messages = [{'role': 'user', 'content': query}]

  while True:
    response = client.messages.create(max_tokens=max_tokens,
                                      model=model,
                                      tools=tools,
                                      messages=messages)

    tool_results = []
    for content in response.content:
      if content.type == 'text':
        print(content.text)

      elif content.type == 'tool_use':
        print(f"Calling tool {content.name} with args: {content.input}")
        result = execute_tool(content.name, content.input)
        tool_results.append({"type": "tool_result",
                             "tool_use_id": content.id,
                             "content": result})

    # No tool was requested in this turn: the answer is complete
    if not tool_results:
      break

    # Echo the assistant turn back unchanged (content blocks, not bare
    # strings), followed by ONE user message carrying a result for every
    # tool_use block of that turn.
    messages.append({'role': 'assistant', 'content': response.content})
    messages.append({'role': 'user', 'content': tool_results})

### Chat Loop

In [48]:
def chat_loop():
  """
  Read queries from standard input and answer each one with process_query.

  The loop ends when the user types 'quit' (case-insensitive) or when no more
  input can be read. Blank input is ignored. An error raised while answering
  a query is printed and the loop continues with the next query.
  """
  print("Type your queries or 'quit' to exit.")
  while True:
    # Reading input is handled separately from answering: if input() itself
    # fails (end of input, interrupt, stdin unavailable) retrying would fail
    # the same way forever, so leave the loop instead.
    try:
      query = input("\nQuery: ").strip()
    except (EOFError, KeyboardInterrupt):
      break
    except Exception as e:
      print(f"\nAn error occurred: {str(e)}")
      break

    if query.lower() == 'quit':
      break

    # Nothing to ask; prompt again rather than sending an empty message
    if not query:
      continue

    try:
      process_query(query)
      print("\n")

    except Exception as e:
      print(f"\nAn error occurred: {str(e)}")

In [53]:
# Start the interactive session; type 'quit' to leave the loop
chat_loop()

Type your queries or 'quit' to exit.

Query: hi

An error occurred: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}}

Query: quit


## Concluding remarks
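- No persistent conversation memory: `process_query` starts a fresh `messages` list for every query, so the chatbot does not remember earlier turns and each query is a brand new start. (Paper metadata fetched by `search_papers` is still written to disk under the `papers` directory.)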