In [1]:
!pip install chromadb sentence-transformers  langchain-google-genai==2.0.4

Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-google-genai==2.0.4
  Downloading langchain_google_genai-2.0.4-py3-none-any.whl.metadata (3.8 kB)
Collecting langchain-core<0.4,>=0.3.15 (from langchain-google-genai==2.0.4)
  Downloading langchain_core-0.3.80-py3-none-any.whl.metadata (3.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=

In [2]:
import os
import json
import numpy as np
from chromadb.utils import embedding_functions


In [3]:
# LangChain for Gemini Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# ChromaDB Client
import chromadb
from google.colab import userdata


In [4]:
# --- API Key Setup ---
# The Gemini API key is read from a Colab secret; GEMINI_SECRET_NAME is the
# name of that secret as it appears in the Colab "Secrets" panel.
GEMINI_SECRET_NAME = "key_1"

try:
    gemini_api_key = userdata.get(GEMINI_SECRET_NAME)
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as secret_error:
    # userdata.get raises (rather than returning None) when the secret is
    # missing or the notebook has not been granted access to it.
    gemini_api_key = None
    print(f"Could not read Colab secret '{GEMINI_SECRET_NAME}': {secret_error!r}")

if not gemini_api_key:
    print(
        f"Warning: Gemini API key not found in Colab secret '{GEMINI_SECRET_NAME}'. "
        "Gemini embedding will not run."
    )
else:
    print("Setup complete. Environment configured.")


Setup complete. Environment configured.


In [5]:
# 2.1. Sample data: four short documents. Two of them ("Cycling" and "Biking")
# describe the same concept in different words.
documents = [
    # Doc 1: Finance
    "The official university policy states that all faculty must submit expense reports by the 15th of every month.",
    # Doc 2: Cycling
    "Riding a bicycle provides excellent low-impact cardiovascular exercise and is a great way to commute.",
    # Doc 3: Biking
    "I enjoy going cycling on the weekends, especially when the weather is clear and the trails are dry.",
    # Doc 4: Academics
    "Please consult the academic handbook regarding grading policies and attendance requirements for final year students.",
]

# The question every document will be compared against
user_query = "What is the best form of exercise using wheels?"

# LangChain client for the Gemini embedding model, authenticated with the key loaded earlier
gemini_embedder = GoogleGenerativeAIEmbeddings(
    google_api_key=gemini_api_key,
    model="models/text-embedding-004",
)

In [6]:
# Generate embeddings for the documents and the query.
# Only the remote API calls are guarded, so unrelated mistakes in the
# reporting code below are not misreported as API-key problems.
try:
    doc_embeddings_gemini = gemini_embedder.embed_documents(documents)
    query_embedding_gemini = gemini_embedder.embed_query(user_query)
except Exception as e:
    print(f"\nError using Gemini Embedder (Check API key): {e}")
    # Later cells use doc_embeddings_gemini / query_embedding_gemini
    # unconditionally, so stop here instead of failing later with a NameError.
    raise

print(f"Gemini Embedding Dimension: {len(query_embedding_gemini)}")
print(f"Vector for Doc 1 (start): {doc_embeddings_gemini[0][:5]}...")
print(f"Vector for Query (start): {query_embedding_gemini[:5]}...")

Gemini Embedding Dimension: 768
Vector for Doc 1 (start): [0.033828187733888626, -0.016320111230015755, -0.013897339813411236, -0.03159501776099205, 0.015815390273928642]...
Vector for Query (start): [0.010527719743549824, -0.06336859613656998, 0.00984877347946167, -0.033117953687906265, 0.03267448768019676]...


In [7]:

# The SentenceTransformer library is called directly here instead of going
# through ChromaDB's embedding-function wrapper, because the wrapper does not
# expose `embed_documents` / `embed_query` style methods for external calls.
from sentence_transformers import SentenceTransformer

# Load the open-source sentence-embedding model by name
model_name = "all-MiniLM-L6-v2"
hf_model = SentenceTransformer(model_name)

# Encode the corpus row by row into plain Python lists, then the query
doc_embeddings_hf = [row.tolist() for row in hf_model.encode(documents)]
query_embedding_hf = hf_model.encode(user_query).tolist()

# Report the vector size and a short preview of the first document's vector
print(f"\nHF Embedding Dimension: {len(query_embedding_hf)}")
print(f"Vector for Doc 1 (start): {doc_embeddings_hf[0][:5]}...")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


HF Embedding Dimension: 384
Vector for Doc 1 (start): [0.01963353157043457, -0.01359286718070507, -0.053158652037382126, 0.01634487695991993, 0.05234808847308159]...


3. Vector Database: ChromaDB
ChromaDB is a lightweight, open-source vector store perfect for development and RAG demos. We will load the data and vectors.

Initialize Chroma Client (in-memory, perfect for Colab)

In [8]:
from chromadb.api.types import EmbeddingFunction, Embeddable
from typing import List

# Adapter that makes a LangChain GoogleGenerativeAIEmbeddings object usable as a ChromaDB embedding function
class GeminiChromaEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function backed by a LangChain embedder.

    The wrapped object only needs to provide ``embed_documents(texts)``,
    which is the single method this adapter calls.
    """

    def __init__(self, embedder_model):
        """Store the LangChain embedder that will produce the vectors.

        Args:
            embedder_model: Object exposing ``embed_documents``; in this
                notebook a ``GoogleGenerativeAIEmbeddings`` instance.
        """
        self._embedder = embedder_model

    def __call__(self, input: Embeddable) -> List[List[float]]:
        """Embed a batch of texts and return one vector per text.

        Args:
            input: The texts handed over by ChromaDB (a list of strings).

        Returns:
            Whatever ``embed_documents`` returns for ``input``: one list of
            floats per text.
        """
        # NOTE(review): this class defines no separate query path, so texts
        # passed to collection.query() are presumably embedded here via
        # embed_documents as well (not embed_query) - confirm this is intended.
        return self._embedder.embed_documents(input)

    @staticmethod
    def name() -> str:
        """Return the identifier ChromaDB associates with this embedding function.

        Declared static so it can be called on the class as well as on an
        instance.
        """
        return "google_gemini_text_embedding"

# In-memory Chroma client for this session
client = chromadb.Client()

# Adapt the LangChain embedder to the interface ChromaDB expects
chroma_gemini_ef = GeminiChromaEmbeddingFunction(gemini_embedder)

# Look up the collection, creating it on the first run; it embeds through the adapter
collection_gemini = client.get_or_create_collection(
    embedding_function=chroma_gemini_ef,
    name="gemini_docs_collection",
)

# Populate the collection only once, so re-running the cell does not try to add the same ids again
if collection_gemini.count() != 0:
    print("\nChromaDB collection already exists and contains documents. Skipping document addition.")
else:
    collection_gemini.add(
        ids=[f"doc{i+1}" for i in range(len(documents))],
        documents=documents,
        metadatas=[
            {"type": label}
            for label in ("Finance", "Cycling", "Biking", "Academics")
        ],
    )
    print("\nChromaDB collection created and documents embedded using Gemini.")


ChromaDB collection created and documents embedded using Gemini.


4. Semantic Search & Cosine Similarity:
We use Cosine Similarity to find the documents whose meaning (vectors) is closest to the user's query vector. The cosine similarity formula calculates the cosine of the angle between two vectors:
$$\text{Cosine Similarity}(A, B) = \frac{A \cdot B}{\|A\| \, \|B\|}$$
A score of 1.0 means the vectors point in the same direction (same meaning); 0.0 means orthogonal (unrelated).

4.1. Manual Cosine Similarity Calculation:
Let's write a helper function to calculate the score manually (using NumPy).

In [9]:
def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        vec_a: First vector (list, tuple or NumPy array of numbers).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        A Python ``float`` in ``[-1.0, 1.0]``. ``0.0`` is returned when
        either vector has zero magnitude, since the angle is undefined.

    Raises:
        ValueError: If either input is not one-dimensional or the two
            lengths differ.
    """
    # Work in floating point so integer inputs cannot overflow in the dot
    # product; atleast_1d keeps bare scalars usable as length-1 vectors.
    A = np.atleast_1d(np.asarray(vec_a, dtype=float))
    B = np.atleast_1d(np.asarray(vec_b, dtype=float))

    # np.dot would silently do a matrix product on 2-D input, so reject it.
    if A.ndim != 1 or B.ndim != 1:
        raise ValueError(
            f"cosine_similarity expects 1-D vectors, got shapes {A.shape} and {B.shape}"
        )
    if A.shape != B.shape:
        raise ValueError(
            f"cosine_similarity expects equal-length vectors, got {A.shape[0]} and {B.shape[0]}"
        )

    # Magnitudes (Euclidean norms)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)

    if norm_a == 0 or norm_b == 0:
        return 0.0

    similarity = np.dot(A, B) / (norm_a * norm_b)

    # Rounding can push the ratio slightly outside [-1, 1]; clamp it and
    # return a plain float so both return paths have the same type.
    return float(np.clip(similarity, -1.0, 1.0))

# Compare the query against all documents using Gemini Embeddings
print("\n--- Manual Cosine Similarity Scores (Gemini) ---")

# Read the stored metadata once and index it by record id, instead of
# issuing one collection lookup per document inside the loop.
stored_records = collection_gemini.get(include=["metadatas"])
type_by_id = {
    record_id: record_meta["type"]
    for record_id, record_meta in zip(stored_records["ids"], stored_records["metadatas"])
}

for i, doc_vector in enumerate(doc_embeddings_gemini):
    score = cosine_similarity(query_embedding_gemini, doc_vector)
    # Documents were stored under ids "doc1", "doc2", ... in list order.
    print(f"Doc {i+1} ({type_by_id[f'doc{i+1}']}): {score:.4f}")


--- Manual Cosine Similarity Scores (Gemini) ---
Doc 1 (Finance): 0.2788
Doc 2 (Cycling): 0.6211
Doc 3 (Biking): 0.4806
Doc 4 (Academics): 0.2878


4.2. ChromaDB Retrieval:
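ChromaDB handles the calculation automatically using its query method. Note that `query` reports a distance rather than a similarity score, so lower values mean a closer match. Because the collection above was created without specifying a distance metric, these values are not cosine similarities and cannot be compared directly with the scores from section 4.1. We expect the smallest distances for the "Cycling" and "Biking" documents.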

ChromaDB performs the embedding of the query and the similarity search internally.

In [10]:
# Ask ChromaDB for the closest documents to the query. Metadata is requested
# in the same call so no extra lookup per hit is needed afterwards.
results = collection_gemini.query(
    query_texts=[user_query],
    n_results=4,  # Return all four sample documents, closest first
    include=['documents', 'distances', 'metadatas']
)

print("\n--- ChromaDB Semantic Search Results ---")
print(f"Query: {user_query}")
print("-" * 40)

# Each result field holds one list per query text; a single query was sent,
# so index 0 selects its hits.
hits = zip(
    results['ids'][0],
    results['documents'][0],
    results['distances'][0],
    results['metadatas'][0],
)

# Output the results
for i, (doc_id, content, distance, metadata) in enumerate(hits):
    print(f"Rank {i+1}: ID {doc_id} | Type: {metadata['type']} | Distance: {distance:.4f}")
    print(f"   Content: {content[:70]}...")



--- ChromaDB Semantic Search Results ---
Query: What is the best form of exercise using wheels?
----------------------------------------
Rank 1: ID doc2 | Type: Cycling | Distance: 0.4030
   Content: Riding a bicycle provides excellent low-impact cardiovascular exercise...
Rank 2: ID doc3 | Type: Biking | Distance: 0.6020
   Content: I enjoy going cycling on the weekends, especially when the weather is ...
Rank 3: ID doc4 | Type: Academics | Distance: 0.9100
   Content: Please consult the academic handbook regarding grading policies and at...
Rank 4: ID doc1 | Type: Finance | Distance: 0.9677
   Content: The official university policy states that all faculty must submit exp...


5. Summary and Conclusion:
This lab demonstrated that:
- Text can be converted into meaningful numerical vectors (Embeddings).
- Vector Databases efficiently store these embeddings.
- Semantic Search (via Cosine Similarity) correctly identifies conceptual relevance ("exercise using wheels" → "cycling/biking") even if keywords don't match.

Final Takeaway: This RAG pipeline foundation is what gives the LLM the necessary external context (memory) to answer questions accurately and reduce hallucination!