<a href="https://colab.research.google.com/github/haziqa5122/Agentic-RAG-with-ApertureDB-and-SmolAgents/blob/main/ApertureDB_Agentic_RAG_with_ApertureDB_and_SmolAgents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install or upgrade the notebook's Python dependencies in one quiet pip run.
# NOTE(review): `smolagents` is listed twice (bare and with the `litellm` extra),
# and only one of the two extras specifiers is quoted — harmless, but could be tidied.
%pip install opendatasets openai unstructured[pdf] gradio langchain-openai aperturedb pandas langchain-community smolagents 'smolagents[litellm]' arxiv --upgrade --quiet

In [None]:
!sudo apt-get update
!apt-get install poppler-utils

In [None]:
import nltk
# Fetch the NLTK 'punkt_tab' resource (presumably needed by the text
# partitioning performed later in the notebook — confirm).
nltk.download('punkt_tab')

In [None]:
import nltk
# Fetch the NLTK 'averaged_perceptron_tagger_eng' resource (presumably needed by
# the text partitioning performed later in the notebook — confirm).
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Install the PyPI package named `tesseract`.
# NOTE(review): the usual Python wrapper for the Tesseract OCR engine is published
# as `pytesseract`; verify that `tesseract` is really the intended package.
!pip install tesseract

In [None]:
!apt install tesseract-ocr

In [None]:
# Create an ApertureDB client configuration with the `adb` CLI and mark it active.
# NOTE(review): `--from-json` presumably prompts for (or reads) a JSON connection
# description — confirm how the JSON is supplied when running non-interactively.
! adb config create --active --from-json

In [None]:
import os
import json
import arxiv
import requests
import pandas as pd
import opendatasets as od
from langchain_core.documents import Document
from unstructured.partition.auto import partition
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.chains import (
    StuffDocumentsChain, LLMChain
)
from langchain.schema import HumanMessage, AIMessage
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.callbacks.manager import (
    trace_as_chain_group,
)
import gradio as gr

In [None]:
# SECURITY: a Kaggle username and API key used to be pasted in this cell in plain
# text. Credentials must never be stored in the notebook; the exposed key should
# be revoked from the Kaggle account settings and a new one generated.
# (As bare names the two lines also raised NameError when the cell was executed.)
#
# Supply the credentials at runtime instead: enter them when the download cell
# below asks for them, or keep them in a local, git-ignored `kaggle.json` file.

In [None]:
# Download the Cornell University arXiv dataset from Kaggle with opendatasets.
# NOTE(review): the ingestion cell further down opens
# "arxiv/arxiv-metadata-oai-snapshot.json" — presumably the folder/file this
# download creates in the working directory; confirm.
dataset = 'https://www.kaggle.com/datasets/Cornell-University/arxiv'
od.download(dataset)

In [None]:
def fetch_paper_details(arxiv_id):
    """Download the PDF of one arXiv paper and partition it into elements.

    Args:
        arxiv_id: arXiv identifier of the paper (e.g. ``"0704.0001"`` or an
            old-style id such as ``"hep-th/9901001"``).

    Returns:
        Whatever ``unstructured.partition.auto.partition`` returns for the
        downloaded PDF (the caller iterates it and reads ``.text`` per element).

    Raises:
        ValueError: If the arXiv API yields no result for ``arxiv_id``.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    # A default for next() avoids leaking a bare StopIteration to the caller.
    paper = next(arxiv.Client().results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for id {arxiv_id!r}")

    # Old-style ids contain "/", which would otherwise be interpreted as a
    # directory separator and make the download fail on a missing folder.
    safe_name = str(arxiv_id).replace("/", "_")
    # download_pdf() returns the path it wrote; use it rather than rebuilding it.
    pdf_path = paper.download_pdf(filename=f"{safe_name}.pdf")
    return partition(pdf_path)

In [None]:
# Chunk documents produced by this cell; consumed later when the vector store is built.
papers = []
text_splitter = RecursiveCharacterTextSplitter(
    # Chunks of up to 5000 characters, with 200 characters shared between neighbours.
    chunk_size=5000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

sample = 5  # Number of metadata records (papers) to ingest from the snapshot.


def _element_text(element):
    """Return the text of one partitioned PDF element as a plain string."""
    text = element.text
    if isinstance(text, str):
        return text
    if isinstance(text, (list, tuple)):
        return "".join(str(part) for part in text)
    return str(text)


# The snapshot is read as one JSON record per line; only the first `sample` lines are used.
with open("arxiv/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as file:
    # `range` is the first zip() argument so no extra line is consumed once the
    # limit is reached; the loop also ends cleanly if the file is shorter than
    # `sample` lines instead of feeding an empty string to json.loads().
    for _, line in zip(range(sample), file):
        if not line.strip():
            continue
        data = json.loads(line)

        # Extract relevant fields
        arxiv_id = data.get("id", "")

        # Download and parse the paper. Elements are joined with a blank line so
        # that the last word of one element is not fused with the first word of
        # the next, and so the splitter has natural paragraph boundaries.
        paper_details = "\n\n".join(
            _element_text(element) for element in fetch_paper_details(arxiv_id)
        )

        # Use LangChain's splitter to divide the paper text into chunks
        chunks = text_splitter.create_documents([paper_details])
        print(f"{arxiv_id}: {len(chunks)} chunk(s)")

        # Create a Document for each chunk; ids are "<arxiv_id>_<1-based index>".
        for idx, chunk in enumerate(chunks, start=1):
            document = Document(
                page_content=chunk.page_content,
                id=f"{arxiv_id}_{idx}",
                metadata={
                    'title': data.get("title", ""),
                    'authors': data.get("authors", ""),
                    'submitter': data.get("submitter", ""),
                    'abstract': data.get("abstract", ""),
                    'paper_content': chunk.page_content,
                },
            )
            papers.append(document)

# Nothing is written to disk here; the results live in the in-memory `papers` list.
print(f"Processing complete. {len(papers)} chunk document(s) collected in `papers`.")

In [None]:
# Placeholder for the OpenAI API key; replace it with a real key before running
# the cells below.
# NOTE(review): avoid committing a real key with the notebook — consider reading
# it from the environment or prompting for it instead.
API_KEY = "<Your-openAI-API-key>"

In [None]:
# Export the key as OPENAI_API_KEY for this process (presumably so that clients
# which are not given an explicit key, such as the LiteLLM model created later,
# can pick it up — confirm).
os.environ["OPENAI_API_KEY"] = API_KEY

In [None]:
from langchain_community.vectorstores import ApertureDB

# Build the ApertureDB-backed vector store from the chunk documents in `papers`,
# using OpenAI embeddings.
# NOTE(review): no embedding model is named here, so OpenAIEmbeddings falls back
# to its library default, which may differ from the EMBEDDING_MODEL constant used
# by get_embedding() — confirm this is intended.
embeddings = OpenAIEmbeddings(api_key  = API_KEY)
vector_db = ApertureDB.from_documents(papers, embeddings)

In [None]:
from openai import OpenAI

In [None]:
# Shared OpenAI SDK client; used by get_embedding() and handed to RetrieverTool.
OPENAI_CLIENT = OpenAI(api_key=API_KEY)

In [None]:
# Model identifiers: EMBEDDING_MODEL is passed to the embeddings endpoint in
# get_embedding(); GENERATION_MODEL is the model id given to the agent's LLM.
EMBEDDING_MODEL = "text-embedding-3-large"
GENERATION_MODEL = "gpt-4o-2024-11-20"

In [None]:
from smolagents import Tool

class RetrieverTool(Tool):
    """Agent tool that queries the notebook's vector store for relevant chunks.

    The tool reads the module-level ``vector_db`` at call time, so that store
    must exist before the tool is invoked.
    """

    name = "retriever"
    description = "Uses semantic search to retrieve documents that could be relevant to answer the query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to the target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, openai_client, **kwargs):
        """Keep a reference to the OpenAI client used by ``get_embedding``.

        Args:
            openai_client: Client object exposing ``embeddings.create(...)``.
            **kwargs: Forwarded unchanged to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        self.embedder = openai_client

    def simple_retriever(self, query: str, n: int = 5) -> str:
        """Retrieve documents for ``query`` and format them as one string.

        Args:
            query: Query text passed to the vector store retriever.
            n: Number of documents to retrieve (the retriever's ``k``).

        Returns:
            A single string starting with ``"\\nRetrieved documents:\\n"``
            followed by one ``===== Document <i> =====`` section (0-based)
            per retrieved document containing its ``page_content``.
        """
        # The retriever is configured with search_type "mmr" rather than the default.
        retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={"k": n})
        results = retriever.invoke(query)

        return "\nRetrieved documents:\n" + "".join(
            f"\n\n===== Document {i} =====\n" + doc.page_content
            for i, doc in enumerate(results)
        )

    def get_embedding(self, text: str):
        """Return the embedding of ``text`` computed with ``EMBEDDING_MODEL``.

        Only the first item of the embeddings response is used.
        """
        response = self.embedder.embeddings.create(input=text, model=EMBEDDING_MODEL)
        return response.data[0].embedding

    def forward(self, query: str) -> str:
        """Tool entry point: return the formatted retrieval result for ``query``.

        Raises:
            TypeError: If ``query`` is not a string.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not isinstance(query, str):
            raise TypeError("Your search query must be a string")

        return self.simple_retriever(query)

# Tool instance built around the shared OpenAI client; it is later passed to the
# agent as its only tool.
retriever_tool = RetrieverTool(OPENAI_CLIENT)

In [None]:
def simple_retriever(query: str, n=5):
    """Query the module-level ``vector_db`` and format the hits as text.

    Args:
        query (str): query to pass to the DB
        n: Number of documents to retrieve

    Returns:
        One string: a "Retrieved documents:" header followed by a numbered
        (0-based) section per retrieved document with its page content.
    """
    search_options = {"k": n}
    mmr_retriever = vector_db.as_retriever(
        search_type="mmr", search_kwargs=search_options
    )

    sections = []
    for position, hit in enumerate(mmr_retriever.invoke(query)):
        header = f"\n\n===== Document {position} =====\n"
        sections.append(header + hit.page_content)

    return "\nRetrieved documents:\n" + "".join(sections)

def get_embedding(text: str):
    """Embed ``text`` with ``EMBEDDING_MODEL`` through the shared OpenAI client.

    Returns the ``embedding`` of the first item in the response's ``data``.
    """
    response = OPENAI_CLIENT.embeddings.create(model=EMBEDDING_MODEL, input=text)
    first_item = response.data[0]
    return first_item.embedding


In [None]:
from smolagents import ToolCallingAgent, LiteLLMModel

# LLM wrapper for the generation model.
# NOTE(review): no API key is passed here; presumably it is taken from the
# OPENAI_API_KEY environment variable set earlier — confirm.
model = LiteLLMModel(model_id=GENERATION_MODEL)

# Tool-calling agent whose only tool is the retriever defined above.
agent = ToolCallingAgent(tools=[retriever_tool], model=model)

In [None]:
# Create an agent executor by passing in the agent and tools
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
agent_executor.invoke({"input": "what is LangChain?"})

In [None]:
# Ask the agent a question and print the value returned by the run.
question = "Why is calculating Higgs Boson decay important?"
agent_output = agent.run(question)
print(agent_output)