<a href="https://colab.research.google.com/github/isamdr86/towards-ai/blob/main/notebooks/13-Adding_Router_Agents_ir.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.10.57 openai==1.37.0 cohere==5.6.2 tiktoken==0.7.0 chromadb==0.5.5 html2text sentence_transformers pydantic llama-index-vector-stores-chroma==0.1.10 kaleido==0.2.1 llama-index-llms-gemini==0.1.11

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%%capture
!pip install openai==1.55.3 httpx==0.27.2 tiktoken==0.7.0 --force-reinstall

In [3]:
import os

from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')
os.environ["GOOGLE_API_KEY"] = userdata.get('google_api_key')

In [4]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
import nest_asyncio

nest_asyncio.apply()

# Load a Model


In [5]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

Settings.llm = OpenAI(temperature=1, model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Load Indexes


In [6]:
# Downloading Vector store from Hugging face hub
from huggingface_hub import hf_hub_download

vectorstore = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore.zip", repo_type="dataset", local_dir=".")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

In [7]:
!unzip vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


In [8]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Create your index
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_index = VectorStoreIndex.from_vector_store(vector_store)

In [9]:
# Query Engine
ai_tutor_knowledge_query_engine = vector_index.as_query_engine(similarity_top_k=3)

res = ai_tutor_knowledge_query_engine.query("How does Retrieval Augmented Generation (RAG) work?")
print(res.response)

Retrieval Augmented Generation (RAG) operates through a workflow that integrates two main components: Retrieval and Generation. 

1. **Retrieval Component**: The purpose of this phase is to extract relevant information from external knowledge sources. It involves:
   - **Indexing**: Organizing documents to enable efficient retrieval, using methods like inverted indexes for sparse retrieval or dense vector encoding for dense retrieval.
   - **Searching**: Using the organized indexes to locate relevant documents based on user queries. This phase often includes rerankers to improve the relevance ranking of the retrieved documents.

2. **Generation Component**: This phase utilizes the information obtained from the retrieval step to produce coherent and contextually relevant responses. It includes:
   - **Prompting Techniques**: Methods such as Chain of Thought and Tree of Thought help guide the process of generating responses by enhancing the quality of the response based on the context an

In [10]:
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("Metadata\t", src.metadata)
    print("-_" * 20)

Node ID	 2aa05360-f43a-4819-bce7-0acf7b897eab
Title	 Searching for Best Practices in Retrieval-Augmented Generation:1 Introduction
Text	 Generative large language models are prone to producing outdated information or fabricating facts, although they were aligned with human preferences by reinforcement learning [1] or lightweight alternatives [2–5]. Retrieval-augmented generation (RAG) techniques address these issues by combining the strengths of pretraining and retrieval-based models, thereby providing a robust framework for enhancing model performance [6]. Furthermore, RAG enables rapid deployment of applications for specific organizations and domains without necessitating updates to the model parameters, as long as query-related documents are provided. Many RAG approaches have been proposed to enhance large language models (LLMs) through query-dependent retrievals [6–8]. A typical RAG workflow usually contains multiple intervening processing steps: query classification (determining w

# Router

Routers are modules that take in a user query and a set of “choices” (defined by metadata), and returns one or more selected choices.

They can be used for the following use cases and more:

- Selecting the right data source among a diverse range of data sources

- Deciding whether to do summarization (e.g. using summary index query engine) or semantic search (e.g. using vector index query engine)

- Deciding whether to “try” out a bunch of choices at once and combine the results (using multi-routing capabilities).


## Lets create a different query engine with Mistral AI information


In [11]:
from pathlib import Path
import requests

wiki_titles = [
    "Mistral AI",
    "Llama (language model)",
    "Claude AI",
    "OpenAI",
    "Gemini AI",
]

data_path = Path("llm_data_wiki")

if not data_path.exists():
    data_path.mkdir()

for title in wiki_titles:
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
    ).json()

    page = next(iter(response["query"]["pages"].values()))

    if "extract" in page:
        wiki_text = page["extract"]

        with open(data_path / "llm_data_wiki.txt", "a") as fp:
            fp.write(f"Title: {title}\n{wiki_text}\n\n")
    else:
        print(f"No extract found for '{title}'")


In [12]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    KeywordExtractor,
)

# Assuming you have prepared a directory for llm data
documents = SimpleDirectoryReader("llm_data_wiki").load_data()

text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

transformations = [
    text_splitter,
    QuestionsAnsweredExtractor(questions=2),
    SummaryExtractor(summaries=["prev", "self"]),
    KeywordExtractor(keywords=10),
    OpenAIEmbedding(model="text-embedding-3-small"),
]

llm_index = VectorStoreIndex.from_documents(documents=documents, transformations=transformations)

llm_query_engine = llm_index.as_query_engine(similarity_top_k=2)

100%|██████████| 50/50 [00:31<00:00,  1.57it/s]
100%|██████████| 50/50 [01:16<00:00,  1.53s/it]
100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


In [13]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import PydanticSingleSelector
from llama_index.core.tools import QueryEngineTool

# initialize tools
ai_tutor_knowledge_tool = QueryEngineTool.from_defaults(
    query_engine=ai_tutor_knowledge_query_engine,
    description="Useful for questions about general generative AI concepts",
)
llm_tool = QueryEngineTool.from_defaults(
    query_engine=llm_query_engine,
    description="Useful for questions about particular LLMs like Mistral, Claude, OpenAI, Gemini",
)

# initialize router query engine (single selection, pydantic)
query_engine = RouterQueryEngine(
    selector=PydanticSingleSelector.from_defaults(),
    query_engine_tools=[
        ai_tutor_knowledge_tool,
        llm_tool,
    ],
)

In [14]:
res = query_engine.query(
    "What is the LLama model?",
)
print(res.response)

The LLaMa model, developed by Meta Platforms, is a large language model designed for a variety of applications. It focuses on enabling advanced natural language understanding and generation tasks. The model has multiple versions, including LLaMa 2 and LLaMa 3, each with increasing capabilities. The LLaMa models are training on extensive datasets and have features that allow for fine-tuning in specific applications, such as chat functions and code generation. Additionally, there are discussions surrounding its usage policies, particularly concerning military applications and concerns about potential misuse for harmful purposes.


In [15]:
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("-_" * 20)

Node ID	 154e68f3-4be3-443b-835b-25c8583ddcb0
Text	 model architecture remains largely unchanged from that of LLaMA-1 models, but 40% more data was used to train the foundational models. The accompanying preprint also mentions a model with 34B parameters that might be released in the future upon satisfying safety targets.
LLaMa 2 includes foundation models and models fine-tuned for chat. In a further departure from the original version of LLaMa, all models are released with weights and may be used for many commercial use cases. However, because LLaMa's license enforces an acceptable use policy that prohibits Llama from being used for some purposes, Meta's use of the term open source to describe Llama has been disputed by the Open Source Initiative (which maintains the The Open Source Definition) and others.
Code Llama is a fine-tune of LLaMa 2 with code specific datasets. 7B, 13B, and 34B versions were released on August 24, 2023, with the 70B releasing on the January 29, 2024. Startin

In [16]:
res = query_engine.query("Explain parameter-efficient finetuning methods")
print(res.response)

Parameter-efficient fine-tuning methods are designed to optimize the process of adapting large language models (LLMs) without the need for extensive computational resources. These methods make slight adjustments to a model's weights and reduce the number of trainable parameters, thus minimizing the computational cost associated with full fine-tuning.

Three main approaches in parameter-efficient fine-tuning (PEFT) are:

1. **Selective**: This approach involves fine-tuning only a subset of the model's parameters. By strategically choosing which parameters to adjust, this method aims to enhance efficiency while maintaining the model's performance on specific tasks.

2. **Reparameterization**: This method utilizes low-rank representations to modify model weights. A notable example is Low Rank Adaptation (LoRA), which approximates the original weight matrices using two smaller matrices derived from rank decomposition. This significantly reduces the number of parameters, as demonstrated by 

In [17]:
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("-_" * 20)

Node ID	 6be88fa3-2f8b-43e7-aba0-d874b39809fc
Text	 # FourierFT: Discrete Fourier Transformation Fine-Tuning[FourierFT](https://huggingface.co/papers/2405.03003) is a parameter-efficient fine-tuning technique that leverages Discrete Fourier Transform to compress the model's tunable weights. This method outperforms LoRA in the GLUE benchmark and common ViT classification tasks using much less parameters.FourierFT currently has the following constraints:- Only `nn.Linear` layers are supported.- Quantized layers are not supported.If these constraints don't work for your use case, consider other methods instead.The abstract from the paper is:> Low-rank adaptation (LoRA) has recently gained much interest in fine-tuning foundation models. It effectively reduces the number of trainable parameters by incorporating low-rank matrices A and B to represent the weight change, i.e., Delta W=BA. Despite LoRA's progress, it faces storage challenges when handling extensive customization adaptations or 

# OpenAI Agent

In [18]:
from llama_index.agent.openai import OpenAIAgent

In [19]:
system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs. Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback. Questions should be understood in this context.

Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.

Use the available tools to gather insights pertinent to the field of AI. Always use two tools at the same time. These tools accept a string (a user query rewritten as a statement) and return informative content regarding the domain of AI.
e.g:
User question: 'How can I fine-tune an LLM?'
Input to the tool: 'Fine-tuning an LLM'

User question: How can quantize an LLM?
Input to the tool: 'Quantization for LLMs'

User question: 'Teach me how to build an AI agent"'
Input to the tool: 'Building an AI Agent'

Only some information returned by the tools might be relevant to the question, so ignore the irrelevant part and answer the question with what you have.

Your responses are exclusively based on the output provided by the tools. Refrain from incorporating information not directly obtained from the tool's responses.

When the conversation deepens or shifts focus within a topic, adapt your input to the tools to reflect these nuances. This means if a user requests further elaboration on a specific aspect of a previously discussed topic, you should reformulate your input to the tool to capture this new angle or more profound layer of inquiry.

Provide comprehensive answers, ideally structured in multiple paragraphs, drawing from the tool's variety of relevant details. The depth and breadth of your responses should align with the scope and specificity of the information retrieved.

Should the tools repository lack information on the queried topic, politely inform the user that the question transcends the bounds of your current knowledge base, citing the absence of relevant content in the tool's documentation.

At the end of your answers, always invite the students to ask deeper questions about the topic if they have any. Make sure to reformulate the question to the tool to capture this new angle or more profound layer of inquiry.

Do not refer to the documentation directly, but use the information provided within it to answer questions.

If code is provided in the information, share it with the students. It's important to provide complete code blocks so they can execute the code when they copy and paste them.

Make sure to format your answers in Markdown format, including code blocks and snippets.

Politely reject questions not related to AI, while being cautious not to reject unfamiliar terms or acronyms too quickly."""

In [20]:
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o")

agent = OpenAIAgent.from_tools(
    llm=llm,
    tools=[ai_tutor_knowledge_tool, llm_tool],
    system_prompt=system_message_openai_agent,
)

In [21]:
response = agent.chat("What is the LLama model?")
print(response.response)

The LLaMa model series, developed by Meta Platforms, represents a significant advancement in AI language models, designed for a variety of applications. Here are some key aspects of the LLaMa models:

1. **Architecture**: LLaMa models are built on a decoder-only Transformer architecture, similar to models like GPT-3. They incorporate unique features such as the SwiGLU activation function, rotary positional embeddings (RoPE), and RMSNorm for normalization, which contribute to their efficiency and performance.

2. **Model Variants and Sizes**: The series includes several versions, with the latest being LLaMa 3, available in sizes like 8B, 70B, and a planned 405B configuration. These models have been fine-tuned on extensive datasets to enhance their performance on various benchmarks.

3. **Training Data**: The training of LLaMa models emphasizes the volume of data, with datasets comprising trillions of tokens sourced from public resources such as CommonCrawl and GitHub. This approach aims

In [22]:
response = agent.chat("Explain parameter-efficient finetuning methods")
print(response.response)

It seems the available resources did not provide specific details on parameter-efficient finetuning methods. However, I can offer a general overview based on common practices in the field.

Parameter-efficient finetuning methods are designed to adapt large pre-trained models to specific tasks without updating all the model's parameters. This approach is particularly useful when computational resources are limited or when the model needs to be adapted to multiple tasks efficiently. Here are some common techniques:

1. **Adapters**: This method involves adding small neural network modules, called adapters, between the layers of a pre-trained model. During finetuning, only the parameters of these adapters are updated, while the original model parameters remain unchanged. This reduces the number of parameters that need to be trained, making the process more efficient.

2. **Low-Rank Adaptation (LoRA)**: LoRA introduces low-rank matrices into the model's architecture. During finetuning, onl

In [23]:
response = agent.chat("Write the recipe for a chocolate cake.")
print(response.response)

I'm here to assist with questions related to AI and related technologies. If you have any questions about AI models, fine-tuning, or other related topics, feel free to ask!


# Code related questions to GPT-4o, the remaining questions to Gemini

In [24]:
from llama_index.agent.openai import OpenAIAgent
from llama_index.llms.openai import OpenAI
from llama_index.llms.gemini import Gemini
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import PydanticSingleSelector
from llama_index.core.tools import QueryEngineTool

# initialize LLMs
gpt_4o_llm = OpenAI(model="gpt-4o")
gemini_llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)

# define query engines
llm_query_engine_code = vector_index.as_query_engine(
    llm=gpt_4o_llm,
    similarity_top_k=3,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)

llm_query_engine_rest = vector_index.as_query_engine(
    llm=gemini_llm,
    similarity_top_k=3,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)

# define tools for LLM
llm_tool_code = QueryEngineTool.from_defaults(
    query_engine=llm_query_engine_code,
    description="Ideal for handling code-related queries, technical implementations, and troubleshooting involving Large Language Models.",
    name="LLMCodeTool",
)

llm_tool_rest = QueryEngineTool.from_defaults(
    query_engine=llm_query_engine_rest,
    description="Best suited for answering conceptual, theoretical, and general questions about Large Language Models.",
    name="LLMGeneralTool",
)

# Initialize OpenAIAgent with the system message and the router query engine
agent = OpenAIAgent.from_tools(
    llm=gpt_4o_llm,  # The base LLM, used only if no other tools apply
    tools=[llm_tool_code, llm_tool_rest],
    system_prompt=system_message_openai_agent,
)

In [25]:
# Test the agent with a code-related question
response = agent.chat("How do I fine-tune the LLama model? Write the code for it.")
for source in response.sources:
    print(source.tool_name)

LLMGeneralTool
LLMCodeTool


In [26]:
# Test the agent with a Non code-related question
response1 = agent.chat("What is the relationship between Llama and Meta?")
for source in response1.sources:
    print(source.tool_name)

LLMGeneralTool
