In [1]:
import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Toy corpus: three short discussion posts, each paired with a numeric id.
data = dict(
    discussion_id=[1, 2, 3],
    discussion_text=[
        'How to implement data indexing in Python?',
        'The FAISS library is useful for vector search.',
        'GPT models can be used for generating embeddings.',
    ],
)

# One row per discussion; columns follow the key order of `data`.
df = pd.DataFrame(data=data)

In [3]:
# Load the SentenceTransformer model (auto-detects GPU if available)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding per discussion text
embeddings = model.encode(df['discussion_text'].tolist(), batch_size=16, show_progress_bar=True)

# FAISS expects a C-contiguous float32 matrix. `ascontiguousarray` enforces
# that explicitly and, unlike `np.array`, does not copy when the encoder
# output already satisfies it.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Embedding dimensionality = number of columns of the (n_texts, d) matrix
d = embeddings.shape[1]

# Create an empty exact-search FAISS index using L2 (Euclidean) distance
index = faiss.IndexFlatL2(d)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Populate the FAISS index. `index.add` appends, so re-running this cell
# would index every vector again and make searches return duplicates.
# Clearing the index first keeps the cell idempotent.
index.reset()
index.add(embeddings)
print(f"Total vectors indexed: {index.ntotal}")

# Example query
query_text = "How can I use FAISS for efficient searches?"

# Generate embedding for the query (encoding a one-element list yields a
# single-row matrix, which is the 2-D shape `index.search` takes)
query_embedding = model.encode([query_text])

# Perform the search for the nearest neighbors
k = 2  # Number of nearest neighbors to return
# Hand FAISS a C-contiguous float32 matrix, the layout it operates on.
distances, indices = index.search(
    np.ascontiguousarray(query_embedding, dtype=np.float32), k
)

Total vectors indexed: 3


In [5]:
# Display the results
print("\nQuery:", query_text)
print("\nTop matching discussions:")
for idx, dist in zip(indices[0], distances[0]):
    # FAISS pads the result with label -1 when fewer than k neighbours
    # exist; `df.iloc[-1]` would then silently show the last row as a match.
    if idx < 0:
        continue
    # Look the row up once instead of once per printed field.
    row = df.iloc[idx]
    print(f"Discussion ID: {row['discussion_id']}, Text: '{row['discussion_text']}', Distance: {dist}")


Query: How can I use FAISS for efficient searches?

Top matching discussions:
Discussion ID: 2, Text: 'The FAISS library is useful for vector search.', Distance: 0.9339967966079712
Discussion ID: 1, Text: 'How to implement data indexing in Python?', Distance: 1.3885002136230469
