In [1]:
!pip install llama-index
!pip install llama-index-multi-modal-llms-gemini
!pip install llama-index-embeddings-gemini

Collecting llama-index-embeddings-gemini
  Downloading llama_index_embeddings_gemini-0.3.2-py3-none-any.whl.metadata (907 bytes)
Downloading llama_index_embeddings_gemini-0.3.2-py3-none-any.whl (4.0 kB)
Installing collected packages: llama-index-embeddings-gemini
Successfully installed llama-index-embeddings-gemini-0.3.2


In [2]:
from getpass import getpass
# Ask for the key interactively so it is never written into the notebook source.
GOOGLE_API_KEY = getpass(prompt="Enter your Google API:")

Enter your Google API:··········


In [5]:
import os
from llama_index.core import SimpleDirectoryReader, Settings

# Folder that holds the source documents to index.
data_dir = "/content/sample_data/data"  # Adjust this based on where your folder is

# Fail fast with a clear message when the path is missing *or* is not a
# directory: os.path.exists() alone would let a regular file through.
if not os.path.isdir(data_dir):
    raise ValueError(f"Directory {data_dir} does not exist. Check the path.")

# Load the contents of the folder as LlamaIndex documents.
documents = SimpleDirectoryReader(data_dir).load_data()

# Split the documents into nodes with the node parser configured on Settings.
nodes = Settings.node_parser.get_nodes_from_documents(documents)


In [7]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings  # Import Settings properly

# Global LLM used by LlamaIndex components that read it from Settings.
# GOOGLE_API_KEY must already be defined by the earlier prompt cell.
Settings.llm = Gemini(api_key=GOOGLE_API_KEY)

# Global embedding model, authenticated with the same key.
Settings.embed_model = GeminiEmbedding(
    api_key=GOOGLE_API_KEY,
    model_name="models/embedding-001",
)


  Settings.embed_model = GeminiEmbedding(
  Settings.llm = Gemini(api_key=GOOGLE_API_KEY)


In [9]:
from llama_index.core import StorageContext

# Build a storage context with the library's default components; the same
# context object is shared by both indexes created later in the notebook.
storage_context = StorageContext.from_defaults()

# Register the parsed nodes in the context's docstore. `nodes` comes from the
# document-loading cell, so that cell must have been run first.
storage_context.docstore.add_documents(nodes)


In [11]:
from llama_index.core import SimpleKeywordTableIndex, VectorStoreIndex

# Embedding-based index over the parsed nodes, backed by the shared storage context.
vector_index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)

# Keyword-table index over the same nodes and the same storage context.
keyword_index = SimpleKeywordTableIndex(nodes=nodes, storage_context=storage_context)


In [13]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)
from typing import List

class CustomRetriever(BaseRetriever):
    """Hybrid retriever that combines vector-search and keyword-search hits.

    Both wrapped retrievers are run for every query. In ``"AND"`` mode only
    nodes returned by both retrievers are kept; in ``"OR"`` mode nodes
    returned by either retriever are kept.

    Args:
        vector_retriever: Retriever queried for vector-index hits.
        keyword_retriever: Retriever queried for keyword-table hits.
        mode: ``"AND"`` (intersection, the default) or ``"OR"`` (union).

    Raises:
        ValueError: If ``mode`` is neither ``"AND"`` nor ``"OR"``.
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: KeywordTableSimpleRetriever,
        mode: str = "AND",
    ) -> None:
        # Reject a bad mode before storing any state.
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode. Choose 'AND' or 'OR'.")

        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        self._mode = mode
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Run both retrievers and merge their hits according to the mode.

        The result order is deterministic: vector hits come first in the
        order the vector retriever returned them, followed (in ``"OR"`` mode)
        by keyword-only hits in the keyword retriever's order. Each node id
        appears at most once.

        Args:
            query_bundle: The query forwarded unchanged to both retrievers.

        Returns:
            The merged, de-duplicated list of hits.
        """
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)

        keyword_ids = {n.node.node_id for n in keyword_nodes}

        if self._mode == "AND":
            # Intersection: walk the vector hits so their order is kept.
            candidates = [n for n in vector_nodes if n.node.node_id in keyword_ids]
        else:
            # Union: vector hits first, then whatever the keyword side adds.
            candidates = [*vector_nodes, *keyword_nodes]

        # De-duplicate by node id, keeping the first occurrence. Because the
        # vector hits are listed first, a node found by both retrievers keeps
        # the vector retriever's NodeWithScore instead of being overwritten
        # by the keyword retriever's entry.
        # NOTE(review): presumably only the vector hit carries a similarity
        # score - confirm against the keyword retriever's output.
        unique_hits = {}
        for hit in candidates:
            unique_hits.setdefault(hit.node.node_id, hit)

        return list(unique_hits.values())


In [15]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever, KeywordTableSimpleRetriever

# One retriever per index; the vector side is limited to its 2 best matches.
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)
keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index)

# Synthesizer that turns the retrieved nodes into the final answer
# (library defaults).
response_synthesizer = get_response_synthesizer()

# Hybrid retriever built from the two retrievers above; `mode` is left at the
# class default.
custom_retriever = CustomRetriever(
    vector_retriever=vector_retriever,
    keyword_retriever=keyword_retriever,
)

# Query engine that retrieves with the hybrid retriever and answers with the
# synthesizer.
custom_query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=custom_retriever,
)


In [17]:
# Ask a question through the hybrid query engine and print the textual answer.
query = "what does the data context contain?"
response = custom_query_engine.query(query)
print(response)


The data context contains information about data warehouse testing, a certificate awarded on December 15, 2024, and a QR code for verification.

