<a href="https://colab.research.google.com/github/isamdr86/towards-ai/blob/main/notebooks/11-Adding_Hybrid_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.10.57 openai==1.37.0 llama-index-finetuning llama-index-embeddings-huggingface llama-index-embeddings-cohere llama-index-readers-web cohere==5.6.2 tiktoken==0.7.0 chromadb==0.5.5 html2text sentence_transformers pydantic llama-index-vector-stores-chroma==0.1.10 llama-index-llms-gemini==0.1.11

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━

In [2]:
%%capture
!pip install openai==1.55.3 httpx==0.27.2 tiktoken==0.7.0 --force-reinstall --quiet

In [4]:
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

In [5]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.

import nest_asyncio

nest_asyncio.apply()

# Load the Models


In [6]:
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Download knowledge base


In [7]:
from huggingface_hub import hf_hub_download

hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore.zip", repo_type="dataset", local_dir=".")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

'vectorstore.zip'

In [8]:
!unzip -o vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


# Create vector index

In [9]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Load the vector store from the local storage.
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create the index based on the vector store.
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Create keyword index

In [10]:
def retrieve_all_nodes_from_vector_index(vector_index, query="Whatever", similarity_top_k=100000000):
    # Set similarity_top_k to a large number to retrieve all the nodes
    vector_retriever = vector_index.as_retriever(similarity_top_k=similarity_top_k)

    # Retrieve all nodes
    all_nodes = vector_retriever.retrieve(query)
    nodes = [item.node for item in all_nodes]

    return nodes

nodes = retrieve_all_nodes_from_vector_index(vector_index)
print(len(nodes))



5834


In [11]:
from llama_index.core import SimpleKeywordTableIndex

# Define the KeyworddTableIndex using all the nodes.
keyword_index = SimpleKeywordTableIndex(nodes=nodes)

# Hybrid Retriever


In [12]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)
from typing import List

class HybridRetriever(BaseRetriever):
    """Hybrid retriever that performs both semantic search and keyword search."""

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: KeywordTableSimpleRetriever,
        max_retrieve: int = 10,
    ) -> None:
        """Init params."""

        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        self._max_retrieve = max_retrieve
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""

        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)

        resulting_nodes = []
        node_ids_added = set()
        for i in range(min(len(vector_nodes), len(keyword_nodes))):
            vector_node = vector_nodes[i]
            if vector_node.node.node_id not in node_ids_added:
                resulting_nodes += [vector_node]
                node_ids_added.add(vector_node.node.node_id)

            keyword_node = keyword_nodes[i]
            if keyword_node.node.node_id not in node_ids_added:
                resulting_nodes += [keyword_node]
                node_ids_added.add(keyword_node.node.node_id)

        return resulting_nodes

# Test hybrid retriever vs vector retriever

In [13]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

# Create hybrid query engine
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=6)
keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index, num_chunks_per_query=6)
hybrid_retriever = HybridRetriever(vector_retriever, keyword_retriever, max_retrieve=6)
response_synthesizer = get_response_synthesizer(llm=Settings.llm)
hybrid_query_engine = RetrieverQueryEngine(
    retriever=hybrid_retriever,
    response_synthesizer=response_synthesizer,
)

# Test the query engine
answer = hybrid_query_engine.query("How does KOSMOS-2 work?")
print(answer)

KOSMOS-2 is a Transformer-based causal language model designed for multimodal tasks, particularly focusing on integrating language understanding with visual perception. It is trained on a large dataset of grounded image-text pairs known as GRIT. The model enhances its capabilities by converting spatial coordinates from bounding boxes in the dataset into location tokens, which are linked to corresponding text spans, resembling hyperlinks. This allows KOSMOS-2 to effectively connect visual elements with their textual descriptions.

The model can handle various tasks, including multimodal grounding (such as referring expression comprehension and phrase grounding), multimodal referring (referring expression generation), perception-language tasks, and general language understanding and generation. KOSMOS-2's design aims to advance the development of Embodiment AI, contributing to the convergence of language, multimodal perception, action, and world modeling, which are essential for achievin

In [14]:
# Create vector query engine
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=6)
vector_query_engine = RetrieverQueryEngine(
    retriever=vector_retriever,
    response_synthesizer=response_synthesizer,
)

# Test the query engine
answer = vector_query_engine.query("How does KOSMOS-2 work?")
print(answer)

The provided context does not contain information about KOSMOS-2 or its workings. Therefore, I cannot provide an answer to that query.


# Evaluate

Run the following code if you want to generate an evaluation dataset from scratch. You can choose to download an evaluation dataset running the cell after this one.

In [15]:
from llama_index.core.evaluation import generate_question_context_pairs

# Create questions for each segment. These questions will be used to
# assess whether the retriever can accurately identify and return the
# corresponding segment when queried.
rag_eval_dataset = generate_question_context_pairs(
    nodes, llm=Settings.llm, num_questions_per_chunk=1
)

# We can save the evaluation dataset as a json file for later use.
rag_eval_dataset.save_json("./rag_eval_dataset_question_context.json")

  0%|          | 12/5834 [00:10<1:23:23,  1.16it/s]


KeyboardInterrupt: 

You can download a version of the evaluation dataset with the following code cell, so that you don't have to create the eval dataset from scratch with the code above.

In [17]:
from huggingface_hub import hf_hub_download
from llama_index.finetuning.embeddings.common import (
    EmbeddingQAFinetuneDataset,
)

# Download the evaluation dataset
hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="rag_eval_dataset_question_context_subset_50.json", repo_type="dataset", local_dir=".")
rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset_question_context_subset_50.json")

(…)_dataset_question_context_subset_50.json:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

In [18]:
import pandas as pd

#  A simple function to show the evaluation result.
def from_eval_results_to_dataframe(name, eval_results):
    """Convert evaluation results to a pandas dataframe."""
    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame(
        {"Retriever Name": [name], "Hit Rate": [hit_rate], "MRR": [mrr]}
    )

    return metric_df

The Hit rate measures the proportion of retrieved documents that contain the correct answer. Mean Reciprocal Rank (MRR) calculates the average reciprocal ranks of the correct answer’s position in the retrieved documents, focusing on how early the correct answer appears.

In [19]:
from llama_index.core.evaluation import RetrieverEvaluator

# We can evaluate the retievers with different top_k values.
for i in [2, 4, 6, 8, 10]:
    # Evaluate hybrid retriever
    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=i)
    keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index, num_chunks_per_query=i)
    hybrid_retriever = HybridRetriever(vector_retriever, keyword_retriever, max_retrieve=i)
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=hybrid_retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)
    print(from_eval_results_to_dataframe(f"Hybrid retriever top_{i}", eval_results))

    # Evaluate vector retriever
    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=i)
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=vector_retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)
    print(from_eval_results_to_dataframe(f"Vector retriever top_{i}", eval_results))

           Retriever Name  Hit Rate       MRR
0  Hybrid retriever top_2      0.64  0.578333
           Retriever Name  Hit Rate   MRR
0  Vector retriever top_2       0.6  0.57
           Retriever Name  Hit Rate       MRR
0  Hybrid retriever top_4      0.72  0.590357
           Retriever Name  Hit Rate       MRR
0  Vector retriever top_4      0.68  0.591667
           Retriever Name  Hit Rate       MRR
0  Hybrid retriever top_6      0.72  0.590357
           Retriever Name  Hit Rate       MRR
0  Vector retriever top_6      0.68  0.591667
           Retriever Name  Hit Rate       MRR
0  Hybrid retriever top_8       0.8  0.596991
           Retriever Name  Hit Rate       MRR
0  Vector retriever top_8      0.74  0.599881
            Retriever Name  Hit Rate      MRR
0  Hybrid retriever top_10      0.84  0.59964
            Retriever Name  Hit Rate       MRR
0  Vector retriever top_10      0.76  0.602103
