### 1. Performance and Load testing

This jupyter notebook is used to test performance and load testing while indexing large amount of dataset document files.

### Import required libraries.

In [2]:
# Import torch and faiss for model and vector indexing.
import torch
import faiss

# Import time for performance measurement.
import time

# Import other required libraries.
import numpy as np
import pandas as pd

# Import SentenceTransformer for embedding generation.
from sentence_transformers import SentenceTransformer

# Ignore warnings for cleaner output.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Import pickle for saving and loading objects.
import pickle

# Import sqlite3 for database operations.
import sqlite3

#### Check device type and versions.

In [3]:
# Check pytorch device
print("PyTorch Device:", "GPU" if torch.cuda.is_available() else "CPU")
print("FAISS version:", faiss.__version__)

PyTorch Device: CPU
FAISS version: 1.7.4


Check FAISS mode.

In [4]:
if faiss.get_num_gpus() > 0:
    print(f"FAISS is GPU-enabled. GPUs detected: {faiss.get_num_gpus()}")
else:
    print("FAISS is running in CPU mode.")

FAISS is running in CPU mode.


*******

## Performance testing.

### 1. Load large amount of texts.

In [103]:
# Generate dummy dataset of 1000 sentences
sentences = [f"This is sample sentence number {i}" for i in range(2000)]

### 2. Load input model

In [8]:
# Load a pre-trained SentenceTransformer model
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)

### 3. Benchmarking embedding performance.

In [15]:
# Benchmark: Encoding
start_time = time.time()
embeddings = model.encode(sentences, batch_size=256, show_progress_bar=True)
elapsed = time.time() - start_time
print(f"\nEncoding 2000 sentences took: {elapsed:.2f} seconds")

Batches:   0%|          | 0/8 [00:00<?, ?it/s]


Encoding 2000 sentences took: 6.14 seconds


### 4. Convert embeddings to numpy "Float32"

In [16]:
# Convert embeddings to numpy float32
embeddings = np.array(embeddings, dtype="float32")

### 5. Create FAISS indexes.

In [18]:
# Benchmark: FAISS Indexing.
d = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(d)  # Create a FAISS index
if faiss.get_num_gpus() > 0:
    index = faiss.index_cpu_to_all_gpus(index)  # Move index to GPU if available
start_time = time.time()
index.add(embeddings)  # Add embeddings to the index
elapsed = time.time() - start_time
print(f"Indexing 2000 embeddings took: {elapsed:.2f} seconds")

Indexing 2000 embeddings took: 0.00 seconds


### 6. Save FAISS index to disk space.

In [19]:
# Save the FAISS index to disk
# Write indexes.
faiss.write_index(index, "my_index.faiss")
print("FAISS index saved to 'faiss_index.bin'")

FAISS index saved to 'faiss_index.bin'


### 7. Load index later.

In [20]:
# Load the FAISS index from disk
index = faiss.read_index("my_index.faiss")
if faiss.get_num_gpus() > 0:
    index = faiss.index_cpu_to_all_gpus(index)  # Move index to GPU if available
print("FAISS index loaded from 'faiss_index.bin'")

FAISS index loaded from 'faiss_index.bin'


### 8. Perform a search query (Evaluation step).

In [22]:
# Perform a sample search
query = model.encode(["This is a query sentence"], convert_to_numpy=True)

# Evaluate search performance.
start_time = time.time()
D, I = index.search(query.astype("float32"), k=5)  # Top-5 results
elapsed = time.time() - start_time

print(f"Search took: {elapsed:.4f} seconds")
print("Top results:", I[0])

Search took: 0.0006 seconds
Top results: [ 330  136 1364 1361 1366]


******

## Scalable Embedding and FAISS Indexing

### 1. Get embedding dimensions.

In [24]:
# Generate dummy dataset of 1000 sentences
sentences = [f"This is sample sentence number {i}" for i in range(2000)]

In [25]:
# Get sentence embedding dimension.
embedding_dim = model.get_sentence_embedding_dimension()

In [26]:
# Create a FAISS index.
index = faiss.IndexFlatL2(embedding_dim)

In [27]:
# If GPU is available, use FAISS GPU
if faiss.get_num_gpus() > 0:
    res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(res, 0, index)

### 2. Encode in batches.

In [28]:
# Batch encoding function
def encode_in_batches(texts, batch_size=128):
    for i in range(0, len(texts), batch_size):
        yield texts[i:i+batch_size]

In [32]:
# Benchmark: Encoding 
start_time = time.time()
batch_size = 256
for batch in encode_in_batches(sentences, batch_size=batch_size):
    embeddings = model.encode(batch, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)
    embeddings = np.array(embeddings, dtype="float32")
    index.add(embeddings)
elapsed = time.time() - start_time
print(f"\nEncoding 2000 sentences took: {elapsed:.2f} seconds")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Encoding 2000000 sentences took: 14.26 seconds


In [33]:
print("Finished indexing all documents")
print("Total vectors in index:", index.ntotal)

Finished indexing all documents
Total vectors in index: 6000


*******

### FAISS + Metadata storage

Store document IDs + Metadata (title, text, etc.) along side FAISS embeddings, this will help to retrieve full documents.

### 1. Get the sample dataframe.

In [55]:
# Example dataframe (replace with your real dataset)
data = pd.DataFrame({
    "id": range(1, 6),
    "title": [f"Title {i}" for i in range(1, 6)],
    "text": [f"This is the text of document {i}" for i in range(1, 6)]
})

In [56]:
data

Unnamed: 0,id,title,text
0,1,Title 1,This is the text of document 1
1,2,Title 2,This is the text of document 2
2,3,Title 3,This is the text of document 3
3,4,Title 4,This is the text of document 4
4,5,Title 5,This is the text of document 5


### 2. Build metadata mapping : Index -> Document info

In [57]:
# Build metadata mapping: index -> document info
metadata_store = {}

### 3. Setup FAISS index.

In [58]:
# Setup FAISS index
embedding_dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dim)

In [59]:
# If GPU available, move FAISS to GPU
if faiss.get_num_gpus() > 0:
    res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(res, 0, index)

### 4. Embedding and Indexing.

In [60]:
# Encode and add in batches
batch_size = 2
counter = 0

In [61]:
for i in range(0, len(data), batch_size):
    batch = data.iloc[i:i+batch_size]
    embeddings = model.encode(batch["text"].tolist(), batch_size=batch_size, convert_to_numpy=True)
    embeddings = np.array(embeddings, dtype="float32")
    index.add(embeddings)
    
    # Store metadata alongside embeddings
    for j, row in batch.iterrows():
        metadata_store[counter] = {
            "id": row["id"],
            "title": row["title"],
            "text": row["text"]
        }
        counter += 1

In [62]:
print("✅ Finished indexing")
print("Total vectors:", index.ntotal)

✅ Finished indexing
Total vectors: 5


### 5. Query.

In [63]:
# Query
query = "sample text document"
query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
D, I = index.search(query_vec, k=3)

### 6. Retrieve full document.

In [64]:
# Retrieve full documents
print("\nTop results:")
for idx, score in zip(I[0], D[0]):
    doc = metadata_store[idx]
    print(f"Score: {score:.4f} | ID: {doc['id']} | Title: {doc['title']} | Text: {doc['text']}")


Top results:
Score: 0.9075 | ID: 1 | Title: Title 1 | Text: This is the text of document 1
Score: 0.9481 | ID: 5 | Title: Title 5 | Text: This is the text of document 5
Score: 0.9796 | ID: 2 | Title: Title 2 | Text: This is the text of document 2


********

### Save & Load FAISS + Metadata

In [65]:
# ===== SAVE =====
def save_faiss_with_metadata(index, metadata_store, faiss_path="my_index.faiss", meta_path="metadata.pkl"):
    # Save FAISS index
    faiss.write_index(index, faiss_path)
    # Save metadata (Python dict) using pickle
    with open(meta_path, "wb") as f:
        pickle.dump(metadata_store, f)
    print(f" Saved FAISS index to {faiss_path} and metadata to {meta_path}")

In [66]:
# ===== LOAD =====
def load_faiss_with_metadata(faiss_path="my_index.faiss", meta_path="metadata.pkl"):
    # Load FAISS index
    index = faiss.read_index(faiss_path)
    # Load metadata
    with open(meta_path, "rb") as f:
        metadata_store = pickle.load(f)
    print(f" Loaded FAISS index and metadata (docs: {len(metadata_store)})")
    return index, metadata_store

In [67]:
# === Example Usage ===
# Suppose `index` and `metadata_store` exist from your embedding step:
save_faiss_with_metadata(index, metadata_store)

 Saved FAISS index to my_index.faiss and metadata to metadata.pkl


In [68]:
# Later (in a new session / notebook):
index, metadata_store = load_faiss_with_metadata()

 Loaded FAISS index and metadata (docs: 5)


In [69]:
# Query after reload
query = "sample text document"
query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
D, I = index.search(query_vec, k=3)

In [71]:
# Retrieve full documents
print("\nTop results:")
for idx, score in zip(I[0], D[0]):
    doc = metadata_store[idx]
    print(f"Score: {score:.4f} | ID: {doc['id']} | Title: {doc['title']} | Text: {doc['text']}")


Top results:
Score: 0.9075 | ID: 1 | Title: Title 1 | Text: This is the text of document 1
Score: 0.9481 | ID: 5 | Title: Title 5 | Text: This is the text of document 5
Score: 0.9796 | ID: 2 | Title: Title 2 | Text: This is the text of document 2


******

# <b>Recommended approach: Hybrid model, Using SQLLite for Metadata storage and FAISS vector database for index storage.</b>

### 1. Setup SQLite Database for Metadata Storage.

In [85]:
# --- IGNORE ---
### 1. Setup SQLite Database for Metadata Storage.
connection = sqlite3.connect('metadata_storage.db')
cursor = connection.cursor()

Create table for documents.

In [86]:
cursor.execute('''
CREATE TABLE IF NOT EXISTS documents (
    doc_id INTEGER PRIMARY KEY,
    title TEXT,
    text TEXT
)
''')

connection.commit()

In [88]:
# Example dataset
documents = [
    {"title": "Doc 1", "text": "This is the first document"},
    {"title": "Doc 2", "text": "This is the second document"},
    {"title": "Doc 3", "text": "Another document for testing"},
    {"title": "Doc 4", "text": "Another document Himanshu for testing4"},
    {"title": "Doc 5", "text": "Another document Engineer for testing 5"}
]

In [89]:
documents

[{'title': 'Doc 1', 'text': 'This is the first document'},
 {'title': 'Doc 2', 'text': 'This is the second document'},
 {'title': 'Doc 3', 'text': 'Another document for testing'},
 {'title': 'Doc 4', 'text': 'Another document Himanshu for testing4'},
 {'title': 'Doc 5', 'text': 'Another document Engineer for testing 5'}]

### 2. Insert Metadata while indexing.

In [90]:
# Load input model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# FAISS index
embedding_dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dim)

# Insert documents and add embeddings
for doc in documents:
    # Save metadata in SQLite
    cursor.execute("INSERT INTO documents (title, text) VALUES (?, ?)", (doc["title"], doc["text"]))
    doc_id = cursor.lastrowid  # SQLite assigns unique ID
    
    # Encode and add to FAISS
    embedding = model.encode([doc["text"]], convert_to_numpy=True).astype("float32")
    index.add(embedding)

# Commit changes to SQLite.
connection.commit()

print("✅ Indexed documents in FAISS + SQLite")


✅ Indexed documents in FAISS + SQLite


### 3. Save and Reload.

In [91]:
# Save FAISS index
faiss.write_index(index, "faiss_index.faiss")
connection.close()

In [100]:
# Later reload:
index = faiss.read_index("faiss_index.faiss")
conn = sqlite3.connect("metadata_storage.db")
cursor = conn.cursor()

### 4. Query with FAISS + Metadata

In [101]:
# Query FAISS
query = "find a test document"
query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
D, I = index.search(query_vec, k=3)

In [98]:
D, I

(array([[10.7447405, 18.093216 , 19.046337 ]], dtype=float32),
 array([[2, 3, 4]]))

In [102]:
# Fetch metadata from SQLite
print("\nSearch results:")
for idx, score in zip(I[0], D[0]):
    # SQLite doc_id is 1-based, FAISS index is 0-based → offset by +1
    cursor.execute("SELECT doc_id, title, text FROM documents WHERE doc_id=?", (idx+1,))
    row = cursor.fetchone()
    if row:
        print(f"Score: {score:.4f} | ID: {row[0]} | Title: {row[1]} | Text: {row[2]}")



Search results:


*****