In [None]:
import os

import numpy as np
import pandas as pd
from datasets import Dataset, load_from_disk
from dotenv import load_dotenv
from openai import OpenAI
from sentence_transformers import SentenceTransformer, util
from smolagents import AzureOpenAIModel, CodeAgent, Tool, WikipediaSearchTool

load_dotenv()

True

# Creating a basic AI agent using the smolagents library

In the previous notebook we used semantic retrieval to find data in a large set of news articles.
Now we want to use this retriever to build a tool for an LLM to use.

It will be your task to use the smolagents framework to create your first simple AI agent.

For reference, the retriever class is included once more in the cell below. You can either use it inside the tool or rewrite its code for the tool.

In [None]:
class InformationRetrieval:
    """Handles the retrieval of news articles from the corpus.

    Attributes:
        ds (Dataset): The news data corpus.
        embedding_model (SentenceTransformer): Embedding model for the text.
    """

    def __init__(self) -> None:
        """Initialize a new news data retriever.

        Loads the corpus from ``data/financial-news-dataset``, creates the
        embedding model and attaches the FAISS index through ``load_index``.
        """
        self.ds = load_from_disk("data/financial-news-dataset")

        self.embedding_model = SentenceTransformer(
            "sentence-transformers/all-MiniLM-L6-v2", token=False
        )
        self.load_index()

    def load_index(self, index_file: str = "my_index.faiss") -> None:
        """
        Load the FAISS index for the "embeddings" column from a local file.

        Args:
            index_file (str): File name of the index. It is resolved relative
                to the ``data`` directory, so the default loads
                ``data/my_index.faiss``. An absolute path is used as given.
        """
        # Honour the caller's choice of file instead of a hard-coded path;
        # joining with "data" keeps the default pointing at the same index.
        index_path = os.path.join("data", index_file)
        self.ds.load_faiss_index("embeddings", index_path)

    def search(self, query: str, top_k: int = 3) -> list[str]:
        """
        Search the corpus for the most similar articles.

        Args:
            query (str): Search query.
            top_k (int): Number of articles returned.

        Returns:
            list[str]: The "text" field of the nearest examples.
        """
        query_embedding = self.embedding_model.encode(query)

        # Only the retrieved rows are needed; the similarity scores are discarded.
        _, retrieved_examples = self.ds.get_nearest_examples(
            "embeddings", query_embedding, k=top_k
        )
        return retrieved_examples["text"]

## Task 1:

Create a class ```InformationRetrievalTool``` which inherits from ```Tool```.
You can reuse the class/code from earlier and check the [documentation](https://huggingface.co/docs/smolagents/tutorials/tools) for reference.

The child class needs to define:

* An attribute name, which corresponds to the name of the tool itself. The name usually describes what the tool does.
* An attribute description, which is used to populate the agent’s system prompt.
* An inputs attribute, which is a dictionary that maps each argument name to a dictionary with the keys "type" and "description". It contains information that helps the Python interpreter make educated choices about the input.
* An output_type attribute, which specifies the output type. The types for both inputs and output_type should be Pydantic formats.
* A forward method which contains the inference code to be executed.  

In [None]:
class InformationRetrievalTool(Tool):
    """smolagents tool that exposes ``InformationRetrieval.search`` to an agent."""

    # Your code here:
    name = "information_retrieval_tool"
    description = """
    This is a tool that searches a corpus of financial news articles and retrieves the most relevant articles based on a given query.
    It can be used to find information on specific topics, events, or entities mentioned in the news articles.
    """
    inputs = {
        "query": {
            "type": "string",
            "description": "the search query to find relevant news articles",
        },
        "top_k": {
            "type": "integer",
            "description": "the number of top relevant articles to retrieve",
            "default": 3,
            # Marks the argument as optional; it mirrors the default value
            # on the `forward` signature below.
            "nullable": True,
        },
    }
    output_type = "array"
    output_schema = {"type": "array", "items": {"type": "string"}}

    # NOTE: built once when the class body runs and shared by all instances.
    retriever = InformationRetrieval()

    def forward(self, query: str, top_k: int = 3) -> list[str]:
        """Return the most relevant articles for a query.

        Args:
            query (str): Search query.
            top_k (int): Number of articles to return. Falls back to 3 when
                omitted or passed as ``None``.

        Returns:
            list[str]: Whatever ``InformationRetrieval.search`` returns for
            the query.
        """
        if top_k is None:
            top_k = 3
        # Forward the requested count instead of a fixed value.
        return self.retriever.search(query, top_k=top_k)

## Setting up the agent

Now that we have created our tool we can set up the agent.
Below we have configured the connection to the LLM endpoint and the agent, which is very straightforward.

[If you are curious here is the prompt which is used for the agent.](https://github.com/huggingface/smolagents/blob/main/src/smolagents/prompts/code_agent.yaml)

In [None]:
# Azure OpenAI connection settings. The API key is read from the environment
# (load_dotenv() is called in the import cell before this one runs).
api_key = os.getenv("AZURE_KEY")
endpoint = "https://aa-dsa-training-msca.openai.azure.com/"
deployment = "gpt-5-nano"
model_name = "gpt-5-nano"
api_version = "2024-12-01-preview"

# Azure OpenAI addresses a model by its deployment name, so the deployment is
# what gets passed as model_id (it currently equals model_name).
model = AzureOpenAIModel(
    model_id=deployment,
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version=api_version,
)


In [41]:
# Instantiate the retrieval tool and give it to a code-writing agent.
information_retrieval_tool = InformationRetrievalTool()

agent = CodeAgent(tools=[information_retrieval_tool], model=model, stream_outputs=False)

## Query the agent

Now everything is set up and you can use the agent!
Feel free to play around and see how the agent reacts to different queries.

In [None]:
# The value returned by run() is the last expression, so it is shown as the cell output.
agent.run(
    "Search my news corpus for articles about the recent trends in "
    "cryptocurrency and return the top 3 most relevant articles."
)

'Top 3 articles:\n- RPT-Wall St Week Ahead-Investors look to January effect at start of 2016\n- Investors look to January effect at start of 2016\n- Why Bitcoin Matters – TechCrunch'

## Task 2:

smolagents comes with a few select [built-in tools](https://huggingface.co/docs/smolagents/en/reference/default_tools).

Your task is to build an agent using one or more built-in tools and query the agent.
Of course you can still keep the previous information retrieval tool to see how they interact.

Please experiment!

In [None]:
# Your code here:
# WikipediaSearchTool is imported in the import cell at the top of the notebook.
wikipedia_tool = WikipediaSearchTool()

# Rebinds `agent` to a new agent that has both the corpus retriever and the
# built-in Wikipedia tool available.
agent = CodeAgent(
    tools=[information_retrieval_tool, wikipedia_tool],
    model=model,
    stream_outputs=False,
)

agent.run(
    "Search my corpus for entities related to cryptocurrency and then look up their Wikipedia pages."
)

'Bitcoin: https://en.wikipedia.org/wiki/Bitcoin'