In [1]:
# Define a function that sets custom CSS for the notebook output
# This specific style makes <pre> blocks wrap text instead of overflowing in a single line
def set_css():
    display(HTML('''
    <style>
      pre {
          white-space: pre-wrap;  /* Enable word-wrapping in code/output blocks */
      }
    </style>
    '''))

# Register the CSS-setting function to run automatically before each code cell runs
# This ensures the styling stays applied throughout the notebook session
get_ipython().events.register('pre_run_cell', set_css)

# Large Language Model Embeddings and Retrieval-Augmented Generation

This module focuses on creating a complete Retrieval-Augmented Generation (RAG) system using modern NLP techniques, embedding models, and vector databases. The system allows users to search through text documents semantically and receive AI-generated answers based on relevant retrieved contexts.

## Objective
The main objective of this module is to demonstrate how to:
1. Process and chunk text documents for efficient retrieval
2. Generate high-quality embeddings using pre-trained models
3. Store and query vector embeddings in a vector database (Qdrant)
4. Implement a complete RAG pipeline by connecting retrieval with an LLM
5. Create a user-friendly interface for interacting with the RAG system


In [3]:
import markdown
from IPython.display import display, HTML

def render_markdown(md_text):
    # Convert Markdown to HTML
    html = markdown.markdown(md_text)
    # Display the HTML
    display(HTML(html))

Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 1067839b0, raw_cell="import markdown
from IPython.display import displa.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#W2sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

## Setup and Dependencies

In [4]:
# Install the necessary libraries
!pip install sentence_transformers openai
!pip install plotly
!pip install matplotlib
!pip install -Uqqq rich openai gradio
!pip install qdrant_client
!pip install transformers

# Import basic libraries
import numpy as np
import os, random
from pathlib import Path
from getpass import getpass
from rich.markdown import Markdown
import torch
import sys
import csv
csv.field_size_limit(sys.maxsize)


Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 1067aa690, raw_cell="# Install the necessary libraries
!pip install sen.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#W4sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting plotly
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-1.37.1-py3-none-any.whl.metadata (9.3 kB)
Downloading plotly-6.0.1-py3-none-any.whl (14.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading narwhals-1.37.1-py3-none-any.whl (332 kB)
Installing collected packages: narwhals, plotly
Successfully installed narwhals-1.37.1 plotly-6.0.1
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autoeval-micro-judges 0.1.0 requires rich<14.0.0,>=13.8.1, but yo

131072

## OpenAI-Compatible LLM Client Configuration

In [6]:
# Retrieve API key securely from Colab user data
# from google.colab import userdata
# OPEN_ROUTER_API_KEY = userdata.get('OPEN_ROUTER_API_KEY')
# OPEN_ROUTER_API_KEY = "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"
OPEN_ROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# Initialize OpenRouter client (OpenAI-compatible API)
from openai import OpenAI
open_router_client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=OPEN_ROUTER_API_KEY,
)


Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 1067aac60, raw_cell="# Retrieve API key securely from Colab user data
#.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#W6sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

## Data Loading and Preparation

In [7]:
# Load data from Google Drive
import pandas as pd
url = 'https://drive.google.com/uc?id=1gl7WAkJr6Nyke7YckzXxdL-iM4UjhLGX'
df = pd.read_csv(url)
df = df[:5]  # Using only 5 rows for demonstration
df = df.dropna(axis=1)  # Drop columns with null values

# Prepare data with metadata for traceability
data = []
for row_num, row in df.iterrows():
    content = " ".join([f"{col}: {row[col]}" for col in df.columns])
    data.append({
        "page_content": content,
        "metadata": {
            "source": row["title"],
        }
    })


Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 166146d80, raw_cell="# Load data from Google Drive
import pandas as pd
.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X11sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

## Document Chunking

In [8]:
def simple_recursive_split(docs, chunk_size=1000, chunk_overlap=200, separators=None):
    # Extract the main text and its associated metadata
    text = docs["page_content"]
    metadata = docs["metadata"]

    # Set default separators if none are provided
    if separators is None:
        separators = ["\n\n", "\n", " ", ".", ",", "\uff0c", "\u3001", "\uff0e", "\u3002"]

    # Helper function to recursively split text based on the separators
    def split_with_separators(t):
        # If the text is already within the chunk size, return it directly
        if len(t) <= chunk_size:
            return [t]

        # Attempt splitting by each separator in order
        for sep in separators:
            if sep and sep in t:
                parts = t.split(sep)
                chunks = []
                current = ""

                # Build chunks without exceeding the maximum chunk size
                for part in parts:
                    part += sep  # Reattach the separator to preserve structure
                    if len(current + part) <= chunk_size:
                        current += part
                    else:
                        if current:
                            chunks.append(current.strip())
                        current = part  # Start a new chunk

                # Add the final leftover chunk
                if current:
                    chunks.append(current.strip())

                # Recursively re-split chunks that are still too large
                result = []
                for chunk in chunks:
                    if len(chunk) > chunk_size:
                        result.extend(split_with_separators(chunk))
                    else:
                        result.append(chunk)
                return result

        # Fallback: if no separators are effective, split the text by fixed character lengths
        return [t[i:i + chunk_size] for i in range(0, len(t), chunk_size)]

    # Split the original text
    splits = split_with_separators(text)

    # Add overlap between chunks to preserve context between adjacent segments
    overlapped = []
    for i, chunk in enumerate(splits):
        if i == 0:
            # First chunk, no overlap
            overlapped.append({
                "page_content": chunk,
                "metadata": metadata
            })
        else:
            # For subsequent chunks, add overlap from the end of the previous chunk
            overlap = splits[i - 1][-chunk_overlap:]
            overlapped.append({
                "page_content": f"{overlap} {chunk}",
                "metadata": metadata
            })

    return overlapped

# Apply the chunking function to each document in the dataset
# This flattens all chunks into a single list
texts = [chunk for doc in data for chunk in simple_recursive_split(doc, 2048, 50)]

Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 169986a80, raw_cell="def simple_recursive_split(docs, chunk_size=1000, .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X13sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

In [9]:
print (f'You now have {len(texts)} document(s) in your data')
print (f'There are {len(texts[1]["page_content"])} characters in your document')

Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 166145c10, raw_cell="print (f'You now have {len(texts)} document(s) in .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X14sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

You now have 22 document(s) in your data
There are 1704 characters in your document


## Text Embedding Generation

In [11]:
# Load embedding model from HuggingFace
from transformers import AutoTokenizer, AutoModel
text_tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
text_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# Function to generate embeddings from text
def get_text_embeddings(text):
    inputs = text_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = text_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings[0].detach().numpy()

# Example usage of the function
text = "This is a test sentence."

# Get the embedding vector for the input text
embeddings = get_text_embeddings(text)

# Optionally, get the length of the embedding (number of dimensions)
text_embeddings_size = len(embeddings)

# Print the first 5 values of the embedding vector for inspection
print(embeddings[:5])



Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 313781280, raw_cell="# Load embedding model from HuggingFace
from trans.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X16sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
<All keys matched successfully>


[ 1.2799697   0.40158516 -3.5162659  -0.39813337  1.5919148 ]


In [12]:
# Generate embeddings for all chunks
text_embeded = [get_text_embeddings(document["page_content"]) for document in texts]

Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 3497aeab0, raw_cell="# Generate embeddings for all chunks
text_embeded .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X20sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

## Qdrant VectorDatabase

In [13]:
# Import necessary modules from the Qdrant client library
# Qdrant is a vector database that allows you to store and search high-dimensional vector embeddings efficiently
from qdrant_client import QdrantClient, models

# Create a new Qdrant client instance using in-memory storage
# ":memory:" means the data will be stored temporarily in RAM (not saved to disk)
# Useful for testing or prototyping — everything is wiped when the program ends
client = QdrantClient(":memory:")

# Display the size (number of dimensions) of the text embeddings we generated earlier
# This is important because Qdrant needs to know the exact size of each vector to create a collection
text_embeddings_size

Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 34995f650, raw_cell="# Import necessary modules from the Qdrant client .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X22sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

768

In [14]:
# Check if a collection named "demo_collection" already exists in Qdrant
# A collection is like a table in a database — it stores a set of vectors and associated metadata
if not client.collection_exists("demo_collection"):  # Creating the collection only if it doesn't exist

    # Create a new collection in Qdrant
    client.create_collection(
        collection_name="demo_collection",  # Name of the collection (you can choose any name)

        # Define the configuration for the vectors that will be stored in this collection
        vectors_config=models.VectorParams(
            size=text_embeddings_size,       # The size (number of dimensions) of the vectors, must match your model's output
            distance=models.Distance.COSINE  # Use cosine similarity for comparing vectors (good for text embeddings)
        ),
    )


Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 34eaaf0e0, raw_cell="# Check if a collection named "demo_collection" al.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X23sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

In [15]:

# Import the `uuid4` function to generate unique IDs for each vector
# These IDs help identify and retrieve individual points later
from uuid import uuid4

# Import NumPy to handle vector data formats (embeddings are stored as NumPy arrays)
import numpy as np

# Upload all our text embeddings to the "demo_collection" in Qdrant
client.upload_points(
    collection_name="demo_collection",  # Target collection where we want to store our vectors

    # Create a list of PointStruct objects, one for each text chunk
    points=[
        models.PointStruct(
            id=str(uuid4()),  # Generate a unique ID for each point (as a string)

            # Convert the embedding to a NumPy array, which is the expected format
            vector=np.array(text_embeded[idx]),

            # Attach payload — additional information stored with each vector
            # This allows us to retrieve the original text and its metadata later
            payload={
                "metadata": doc["metadata"],         # Source and row info
                "content": doc["page_content"]       # The full text chunk
            }
        )
        for idx, doc in enumerate(texts)  # Loop through all texts and match them to their embeddings
    ]
)


Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 34995f650, raw_cell="
# Import the `uuid4` function to generate unique .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X24sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

In [16]:
# Import the Google Drive integration module for Google Colab
from google.colab import drive

# Mount your Google Drive to the Colab environment
# This allows you to read from and write to files stored in your Drive
# After running this, a link will appear asking for permission to access your Drive
# Once authorized, your Drive will be available under '/content/drive'
drive.mount('/content/drive')


Error in callback <function set_css at 0x106650b80> (for pre_run_cell), with arguments args (<ExecutionInfo object at 34eb7c620, raw_cell="# Import the Google Drive integration module for G.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/kiran.ramanna/Documents/github/2025stanford/005-module/module_5_large_embeddingv1_1_Tech103.ipynb#X25sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: set_css() takes 0 positional arguments but 1 was given

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Import necessary modules from the Qdrant client
# QdrantClient lets us interact with the Qdrant vector database
# models is used for configuring vectors and managing points
from qdrant_client import QdrantClient, models

# Import the os module to work with the file system (for creating directories, etc.)
import os

# Define the local directory where Qdrant will store its data files
# This path is inside the Colab environment (or Google Drive if mounted)
qdrant_data_dir = '/content/drive/MyDrive/Semantic_Search/qdrant_data'

# Create the directory if it doesn't already exist
# This ensures Qdrant has a place to store persistent data like vectors and collections
os.makedirs(qdrant_data_dir, exist_ok=True)

# Initialize the Qdrant client and set its storage location to the directory we just created
# This will persist your Qdrant data (e.g., vector collections) to disk instead of just in memory
client = QdrantClient(path=qdrant_data_dir)


In [None]:
# Check if a collection named "demo_collection" already exists in Qdrant
# A collection is like a table in a database — it stores a set of vectors and associated metadata
if not client.collection_exists("demo_collection"):  # Creating the collection only if it doesn't exist

    # Create a new collection in Qdrant
    client.create_collection(
        collection_name="demo_collection",  # Name of the collection (you can choose any name)

        # Define the configuration for the vectors that will be stored in this collection
        vectors_config=models.VectorParams(
            size=text_embeddings_size,       # The size (number of dimensions) of the vectors, must match your model's output
            distance=models.Distance.COSINE  # Use cosine similarity for comparing vectors (good for text embeddings)
        ),
    )

# Import the `uuid4` function to generate unique IDs for each vector
# These IDs help identify and retrieve individual points later
from uuid import uuid4

# Import NumPy to handle vector data formats (embeddings are stored as NumPy arrays)
import numpy as np

# Upload all our text embeddings to the "demo_collection" in Qdrant
client.upload_points(
    collection_name="demo_collection",  # Target collection where we want to store our vectors

    # Create a list of PointStruct objects, one for each text chunk
    points=[
        models.PointStruct(
            id=str(uuid4()),  # Generate a unique ID for each point (as a string)

            # Convert the embedding to a NumPy array, which is the expected format
            vector=np.array(text_embeded[idx]),

            # Attach payload — additional information stored with each vector
            # This allows us to retrieve the original text and its metadata later
            payload={
                "metadata": doc["metadata"],         # Source and row info
                "content": doc["page_content"]       # The full text chunk
            }
        )
        for idx, doc in enumerate(texts)  # Loop through all texts and match them to their embeddings
    ]
)


## Run Query

In [None]:
# Create a query vector by embedding a sample search string
# This string represents what you're looking for — in this case, something about "Democrats challenges in Senate"
# The result is a vector in the same format as the ones stored in the Qdrant collection
query = get_text_embeddings('Democrats challenges in Senate')

# Perform a similarity search in Qdrant using the query vector
# This finds the most relevant text chunks (based on vector similarity)
text_hits = client.query_points(
    collection_name="demo_collection",  # The name of the collection where vectors were stored
    query=query,                         # The query vector — what we want to find similar results to
    limit=3,                             # Limit the number of results to 3 most relevant chunks
).points                                 # Extract only the list of matching points (each with vector + payload)


In [None]:
text_hits

[ScoredPoint(id='a9b0db57-6858-4b37-844f-0d90f2aeb7bb', version=0, score=0.6620315493425604, payload={'metadata': {'source': 'Are The Democrats Screwed In The Senate After 2024?'}, 'content': "airly competitive race against Mike Lee this year. Even with an additional senator going into 2023, the 2024 map is still so bad for Democrats that keeping the Senate for years to come will be a fairly tough order. The party’s prospects might rest more upon limiting the damage in 2024 so that it has a chance to regain the Senate in 2026 or 2028. But a bad 2024 could make it very difficult for Democrats to regain the Senate before 2030 or 2032.\nThat bleak picture may shape the next few years of political maneuvering. When Vox’s Dylan Matthews suggested on Twitter that liberal Justices Sonia Sotomayor (age 68) and Elena Kagan (age 62) should retire while Democrats have their Senate majority and be replaced by younger justices, it didn’t go over well. But it’s a perfectly rational suggestion if Dem

## We can start from here now!


In [None]:
!pip install qdrant_client
!pip install openai

Collecting qdrant_client
  Downloading qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading qdrant_client-1.14.2-py3-none-any.whl (327 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.7/327.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, qdrant_client
Successfully installed portalocker-2.10.1 qdrant_client-1.14.2


In [None]:
# Import the Google Drive integration module for Google Colab
from google.colab import drive

# Mount your Google Drive to the Colab environment
# This allows you to read from and write to files stored in your Drive
# After running this, a link will appear asking for permission to access your Drive
# Once authorized, your Drive will be available under '/content/drive'
drive.mount('/content/drive')

In [None]:
# Import the main Qdrant client class to connect and interact with a Qdrant vector database
from qdrant_client import QdrantClient

# Import specific classes used to configure how vectors are stored and compared in a collection
from qdrant_client.http.models import Distance, VectorParams

# Import the `userdata` module from Google Colab.
from google.colab import userdata

# This is used to send requests to OpenRouter, which gives access to various LLMs (large language models)
from openai import OpenAI

# Import HTML and display tools from IPython
# These allow you to inject custom HTML or CSS into the notebook
from IPython.display import HTML, display

# Import necessary classes from the Hugging Face Transformers library
# AutoTokenizer handles breaking text into tokens
# AutoModel loads the pre-trained model used to compute vector embeddings
from transformers import AutoTokenizer, AutoModel

# Import the openai

import openai

**1. Define the Qdrant client first to connect to the vector database.**

In [None]:
# Attempt to initialize the Qdrant client
try:
    # Initialize the Qdrant client and set its storage path
    # This stores and retrieves the vector database in the specified directory on disk
    client = QdrantClient(path='/content/drive/MyDrive/Semantic_Search/qdrant_data')

except RuntimeError as e:
    # Catch the specific error that occurs when the Qdrant client is already running with this path
    if "already accessed by another instance" in str(e):
        print("Qdrant is already initialized with this path in the current session.")
        print("You don't need to create the client again — reuse the existing one.")
    else:
        # Re-raise the error if it's something else
        raise


**2. Define the OpenRouter client to serve as the language model (LLM) for the pipeline.**


In [None]:

# Retrieve the value of a saved environment variable named 'OPEN_ROUTER_API_KEY'.

OPEN_ROUTER_API_KEY = userdata.get('OPEN_ROUTER_API_KEY')



# Initialize the OpenAI-compatible client, but point it to OpenRouter's API instead of OpenAI's
# OpenRouter is a gateway to multiple LLMs like GPT, Claude, Mistral, and others, through one unified API

open_router_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",  # Set the API endpoint to OpenRouter (not OpenAI)
    api_key=OPEN_ROUTER_API_KEY               # Use your OpenRouter API key for authentication
)


**3. Import the same embedding model used during vector database creation to ensure consistency.**

In [None]:


# Load a pre-trained tokenizer and model designed for generating text embeddings
# "nomic-ai/nomic-embed-text-v1.5" is a model specifically trained to turn text into high-quality vector representations
# trust_remote_code=True allows use of any custom logic included with the model
text_tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
text_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# Define a function to convert input text into a fixed-size vector (embedding)
def get_text_embeddings(text):
    # Tokenize the input text and return it as PyTorch tensors
    # padding=True: pad shorter sequences to ensure consistent length
    # truncation=True: cut off text that is too long for the model
    inputs = text_tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Pass the tokenized input through the model to obtain output embeddings
    outputs = text_model(**inputs)

    # outputs.last_hidden_state contains embeddings for each token
    # We take the mean across all tokens to get a single vector for the entire text
    embeddings = outputs.last_hidden_state.mean(dim=1)

    # Convert the result to a NumPy array and remove it from the computation graph
    return embeddings[0].detach().numpy()


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/104k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



**4. Test the retrieval functions to ensure they're returning relevant results.**

In [None]:
query = """Democrats challenges in Senate"""

In [None]:
# get query embedded
query_em = get_text_embeddings(query)

In [None]:
text_hits = client.query_points(
        collection_name="demo_collection",
        query=query_em,
        limit=10,
    ).points



In [None]:
# Extract the original text content from each result returned by the similarity search
# `text_hits` is a list of points returned by Qdrant's query
# Each point has a `payload`, which contains metadata and the original text chunk

contents = [point.payload['content'] for point in text_hits]

In [None]:
contents

["airly competitive race against Mike Lee this year. Even with an additional senator going into 2023, the 2024 map is still so bad for Democrats that keeping the Senate for years to come will be a fairly tough order. The party’s prospects might rest more upon limiting the damage in 2024 so that it has a chance to regain the Senate in 2026 or 2028. But a bad 2024 could make it very difficult for Democrats to regain the Senate before 2030 or 2032.\nThat bleak picture may shape the next few years of political maneuvering. When Vox’s Dylan Matthews suggested on Twitter that liberal Justices Sonia Sotomayor (age 68) and Elena Kagan (age 62) should retire while Democrats have their Senate majority and be replaced by younger justices, it didn’t go over well. But it’s a perfectly rational suggestion if Democrats don’t feel like gambling with their judicial future. (Consider how consequential Ruth Bader Ginsburg’s decision not to retire has been for liberals.) Democrats have a narrow path to Se

In [None]:
# Extract the metadata for each point returned by the similarity search
# Each result (point) has a payload dictionary that includes metadata stored when uploading the vectors

meta = [point.payload['metadata'] for point in text_hits]

In [None]:
meta

[{'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Are The Democrats Screwed In The Senate After 2024?', 'row': 0},
 {'source': 'Ballot Measures: A Preview', 'row': 2},
 {'source': 'Ballot Measures: A Preview', 'row': 2}]

In [None]:
# Loop through each text chunk in the `contents` list
# These are the top-matching results returned by the Qdrant similarity search
for i in contents:
    # Print the actual text content
    print(i)

    # Print a separator line to clearly distinguish between different chunks
    print('###########')


airly competitive race against Mike Lee this year. Even with an additional senator going into 2023, the 2024 map is still so bad for Democrats that keeping the Senate for years to come will be a fairly tough order. The party’s prospects might rest more upon limiting the damage in 2024 so that it has a chance to regain the Senate in 2026 or 2028. But a bad 2024 could make it very difficult for Democrats to regain the Senate before 2030 or 2032.
That bleak picture may shape the next few years of political maneuvering. When Vox’s Dylan Matthews suggested on Twitter that liberal Justices Sonia Sotomayor (age 68) and Elena Kagan (age 62) should retire while Democrats have their Senate majority and be replaced by younger justices, it didn’t go over well. But it’s a perfectly rational suggestion if Democrats don’t feel like gambling with their judicial future. (Consider how consequential Ruth Bader Ginsburg’s decision not to retire has been for liberals.) Democrats have a narrow path to Senat

**5. Create a retriever function to extract relevant chunks from the documents.**

In [None]:
# Define a function to search the Qdrant vector database using a natural language query
def query_qdrant(query, qdrant_client, limit=5):
    # Step 1: Convert the query text into an embedding (vector representation)
    # This embedding will be compared with stored vectors in the collection
    query_em = get_text_embeddings(query)

    # Step 2: Query the Qdrant collection using the embedding
    # This finds the top `limit` most similar text chunks based on vector similarity
    text_hits = qdrant_client.query_points(
        collection_name="demo_collection",  # The name of the Qdrant collection to search
        query=query_em,                     # The embedding of the input query
        limit=limit                         # Number of top results to return
    ).points                                 # Extract the matching points (results)

    # Step 3: Prepare the results in a clean format (text + metadata)
    results = []
    for point in text_hits:
        results.append({
            'content': point.payload['content'],    # The original text content
            'metadata': point.payload['metadata']   # Associated metadata (e.g., title, row number)
        })

    # Return the list of results
    return results


In [None]:
query_qdrant(query, client)

[{'content': "airly competitive race against Mike Lee this year. Even with an additional senator going into 2023, the 2024 map is still so bad for Democrats that keeping the Senate for years to come will be a fairly tough order. The party’s prospects might rest more upon limiting the damage in 2024 so that it has a chance to regain the Senate in 2026 or 2028. But a bad 2024 could make it very difficult for Democrats to regain the Senate before 2030 or 2032.\nThat bleak picture may shape the next few years of political maneuvering. When Vox’s Dylan Matthews suggested on Twitter that liberal Justices Sonia Sotomayor (age 68) and Elena Kagan (age 62) should retire while Democrats have their Senate majority and be replaced by younger justices, it didn’t go over well. But it’s a perfectly rational suggestion if Democrats don’t feel like gambling with their judicial future. (Consider how consequential Ruth Bader Ginsburg’s decision not to retire has been for liberals.) Democrats have a narro

**6. Now, let's integrate everything by combining our Retrieval functiom with the Language Model to complete our RAG (Retrieval-Augmented Generation) pipeline.**

In [None]:
# Define a function that uses a language model to generate an answer based on a user's query
def generate_answer(query):
    # Build the prompt that will be sent to the LLM
    # The prompt includes:
    # - Instructions to clean and format the answer
    # - The user's original query
    # - The context retrieved from Qdrant (via semantic search)
    prompt = f"""
    Based on the following query from a user, please generate a small answer
    focusing on the original query and the response given. The answer should be paragraphs.
    Remove the special characters and (/n), make the output clean and long.
    Please cite source for each part as [1][2].
    Just start with the answer, no need to give any salutations.

    ###########
    query:
    "{query}"

    ########

    context:
    "{query_qdrant(query, client)}"
    #####

    Return in Markdown format.
    """

    # Send the prompt to the LLM using streaming mode
    # This allows the response to be received in real-time, piece by piece
    stream = open_router_client.chat.completions.create(
        model="qwen/qwen3-8b:free",  # Model to use (can be any OpenAI-compatible model)
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        stream=True,  # Enable streaming so we get partial output as it generates
    )

    # Initialize a variable to hold the full response
    output_text = ""

    # Iterate through the streaming response chunks
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            content = chunk.choices[0].delta.content
            output_text += content  # Append new content to the full output
            print(content, end="")  # Print each chunk live as it's received

    # Return both the final answer and the context used (for reference or display)
    return output_text, query_qdrant(query, client)


In [None]:
response,sources = generate_answer(query)

The Democrats face significant challenges in maintaining Senate control after 2024, as the current political map is heavily unfavorable for their interests. The 2024 Senate race is described as exceptionally difficult for the party, with even an additional senator in 2023 not enough to secure long-term dominance. Democrats’ prospects rely on minimizing losses in 2024 to create a viable path for regaining the Senate in 2026 or 2028. However, a poor performance in 2024 could severely hinder their ability to recapture majority control before 2030 or 2032. This precarious situation influences strategic political decisions, including discussions about judicial appointments and retirement timelines for Supreme Court justices, as highlighted by Nate Silver’s analysis [1].  

The 2024 election outcomes will heavily determine the Democrats’ Senate outlook. If Democrats win the presidency but lose the Senate, it would not be an unprecedented scenario. However, the party’s ability to regain Senat

In [None]:
#for markdown layout
render_markdown(response)

## Time to Build a functional Gradio interface to interact with the RAG system.

In [None]:
import gradio as gr

**1. Redefine our RAG function**

In [None]:
# Import OpenAI-compatible library (used here with OpenRouter)
import openai

# Define a function to generate a streamed answer to a user's query using an LLM
# This version includes error handling and uses Python's `yield` to stream results back as they're generated
def generate_answer(query):
    # Step 1: Try to get relevant context from Qdrant (vector search)
    try:
        sources = query_qdrant(query, client)
    except Exception as e:
        # If something goes wrong (e.g., Qdrant is not running), return a fallback message
        sources = [{"error": f"Error retrieving sources: {str(e)}"}]

    # Step 2: Prepare the prompt for the language model
    # Includes the user's question and the context retrieved from the vector database
    prompt = f"""
    Based on the following query from a user, please generate a small answer
    focusing on the original query and the response given. The answer should be paragraphs.
    Remove special characters and (/n); make the output clean and long.
    Please cite source for each part as [1][2]. Just start with the answer — no salutations.

    ###########
    query:
    "{query}"

    ########

    context:
    "{sources}"
    #####

    Return in Markdown format.
    """

    # Step 3: Send the prompt to the OpenRouter-compatible LLM (Qwen model)
    stream = open_router_client.chat.completions.create(
        model="qwen/qwen3-8b:free",  # A free-to-use large language model hosted on OpenRouter
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        stream=True,  # Enable streaming response
    )

    # Step 4: Stream and yield the generated content chunk by chunk
    full_response = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            content = chunk.choices[0].delta.content
            full_response += content

            # Yield lets us return partial results as they're received (for real-time feedback)
            yield full_response


**2. Create a Demo Interface**

In [None]:
# Define example inputs for the UI — users can click these to try predefined queries
examples = [
    ["Democrats in Senate"],
    ["Climate Change Challenges in Europe"],
    ["Philosophy in the world of Minimalism"],
    ["Hate Speech vs Freedom of Speech"],
    ["Articles by Noam Chomsky on US Politics"],
    ["The importance of values and reflection"]
]

# Set up the Gradio interface
# - fn: the function to call when user enters input (must be a generator if using yield)
# - title: the name shown at the top of the web app
# - inputs: defines the input component (in this case, a text box)
# - outputs: defines what kind of output to display (Textbox with 3 lines labeled "Response")
# - examples: preloaded example queries for users to click and run

import gradio as gr

demo = gr.Interface(
    fn=generate_answer,  # The function that will process user input
    title="The Truth Serum",  # Title for the web app
    inputs="text",  # Single text input from the user
    outputs=gr.components.Textbox(lines=3, label="Response"),  # Output display
    examples=examples,  # List of sample queries for users to try
    live=False,  # Optional: set to True if you want real-time feedback as user types
)

# Launch the interface
# - share=True gives you a public link (useful in Colab or for sharing with others)
# - debug=True enables logging for error tracking
demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://b7f9733d761220a24f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://b7f9733d761220a24f.gradio.live




**3. Create a Demo Interface with Sources**

In [None]:
import json

def generate_answer(query):
    # First, get the sources
    try:
        sources = query_qdrant(query, client)
    except Exception as e:
        sources = [{"error": f"Error retrieving sources: {str(e)}"}]

    # Convert the sources list to a string for the prompt
    #sources_str = "\n".join([f"Source {i+1}: {source['content']}" for i, source in enumerate(sources)])

    prompt = f"""
    Based on the following query from a user, please generate a small answer
    focusing on the original query and the response given. The answer should be paragraphs
    remove the special characters and (/n ), make the output clean and long. Please cite source for each part as [1][2]
    Just start with the answer, no need to give any salutations

    ###########
    query:
    "{query}"

    ########

    context:
    "{sources}"
    #####

    Return in Markdown format.
    """

    # Send the prompt to the OpenRouter-compatible LLM (Qwen model)
    stream = open_router_client.chat.completions.create(
        model="qwen/qwen3-8b:free",  # A free-to-use large language model hosted on OpenRouter
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        stream=True,  # Enable streaming response
    )
    # Convert sources to a proper JSON string for the JSON component
    sources_json = json.dumps(sources)

    # For Gradio streaming with multiple outputs
    full_response = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            content = chunk.choices[0].delta.content
            full_response += content
            # Return both the accumulated response and the sources as JSON string
            yield full_response, sources_json

    # In case the stream is empty, yield one final time
    if not full_response:
        yield "No response generated", sources_json

examples = [
    ["Democrats in Senate"],
    ["Climate Change Challenges in Europe"],
    ["Philosophy in the world of Minimalism"],
    ["Hate Speech vs Freedom of Speech"],
    ["Articles by Noam Chomsky on US Politics"],
    ["The importance of values and reflection"]
]

demo = gr.Interface(
    fn=generate_answer,
    title="The Truth Serum",
    inputs="text",
    outputs=[
        gr.components.Textbox(lines=8, label="Response"),
        gr.components.JSON(label="Sources")
    ],
    examples=examples
)

demo.queue()
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f53afd903f889cb305.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f53afd903f889cb305.gradio.live


