In [1]:
# =========================
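# Prerequisites -- define these BEFORE running this cell (otherwise: NameError):
# documents  -> list of text chunks (str)
# embeddings -> numpy array of shape (num_docs, embedding_dim); row i embeds
#               documents[i], computed with the same model used for queries
#               below ('all-MiniLM-L6-v2')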
# =========================

import faiss
from rank_bm25 import BM25Okapi
import nltk
import numpy as np
from sentence_transformers import SentenceTransformer

# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
# releases (3.9+) load it from the pickle-free 'punkt_tab' package, while older
# releases use 'punkt'; fetch both so tokenization works on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# =========================
# Step 1: Tokenize documents for BM25
# =========================
def tokenize(text):
    """Lower-case *text* and split it into word tokens with NLTK.

    The same function is applied to corpus documents and to queries so that
    BM25 matches terms case-insensitively on both sides.
    """
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

# Sparse index: BM25 over the tokenized corpus (one token list per document,
# in the same order as `documents`).
tokenized_docs = list(map(tokenize, documents))
bm25 = BM25Okapi(tokenized_docs)

# =========================
# Step 2: FAISS Dense Index
# =========================
# Normalize embeddings for cosine similarity
# FAISS only accepts C-contiguous float32 matrices, so coerce up front instead
# of failing inside `add` when the embeddings arrive as float64 or as a view.
embeddings_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)

# L2-normalize every row so that inner product == cosine similarity.
# All-zero rows are divided by 1 instead of 0, so they stay zero rather than
# turning into NaN and poisoning the index.
row_norms = np.linalg.norm(embeddings_f32, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
embeddings_norm = embeddings_f32 / row_norms

embedding_dim = embeddings_norm.shape[1]
faiss_index = faiss.IndexFlatIP(embedding_dim)  # exact inner-product search
faiss_index.add(embeddings_norm)

# =========================
# Step 3: Hybrid Search Function
# =========================
# Loaded query encoders, keyed by model name, so the weights are read once.
_QUERY_ENCODERS = {}


def _get_query_encoder(model_name='all-MiniLM-L6-v2'):
    """Return a cached SentenceTransformer instead of reloading it per query."""
    encoder = _QUERY_ENCODERS.get(model_name)
    if encoder is None:
        encoder = SentenceTransformer(model_name)
        _QUERY_ENCODERS[model_name] = encoder
    return encoder


def hybrid_search(query, top_k=5, alpha=0.5):
    """Rank `documents` by a weighted blend of BM25 and dense cosine scores.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to return.
        alpha: Weight of the sparse (BM25) score; the dense score is weighted
            by ``1 - alpha``.

    Returns:
        A list of ``(document, score)`` tuples, best first, with at most
        ``top_k`` entries. Empty when ``top_k <= 0`` or there are no documents.

    Notes:
        BM25 scores are min-max scaled to [0, 1]; dense scores are raw cosine
        similarities (inner product of unit vectors), so they lie in [-1, 1].
        Relies on the module-level ``documents``, ``bm25`` and ``faiss_index``
        all describing the same corpus in the same order.
    """
    n_docs = len(documents)
    if top_k <= 0 or n_docs == 0:
        return []

    # --- Sparse side: one BM25 score per document, scaled to [0, 1]. ---
    bm25_scores = np.asarray(bm25.get_scores(tokenize(query)), dtype=np.float64)
    bm25_low = bm25_scores.min()
    bm25_span = bm25_scores.max() - bm25_low
    # The epsilon keeps the division defined when every document scores the same.
    bm25_scores_norm = (bm25_scores - bm25_low) / (bm25_span + 1e-8)

    # --- Dense side: unit-length query embedding in FAISS's float32 layout. ---
    query_emb = _get_query_encoder().encode([query], convert_to_numpy=True)
    query_emb = np.ascontiguousarray(query_emb, dtype=np.float32)
    query_norm = np.linalg.norm(query_emb, axis=1, keepdims=True)
    query_norm[query_norm == 0] = 1.0  # avoid NaN for a degenerate embedding
    query_emb = query_emb / query_norm

    # FAISS returns scores in RANK order together with the matching document
    # ids, so scores must be scattered back to their document positions via the
    # ids. Every indexed vector is requested so that each document gets its
    # true dense score rather than an implicit 0.
    dense_scores = np.zeros(n_docs, dtype=np.float64)
    n_indexed = faiss_index.ntotal
    if n_indexed > 0:
        D, I = faiss_index.search(query_emb, n_indexed)
        ids = I[0]
        valid = (ids >= 0) & (ids < n_docs)  # FAISS pads missing hits with -1
        dense_scores[ids[valid]] = D[0][valid]

    # --- Blend and rank. ---
    final_scores = alpha * bm25_scores_norm + (1 - alpha) * dense_scores
    ranked_indices = np.argsort(final_scores)[::-1][:top_k]
    return [(documents[i], float(final_scores[i])) for i in ranked_indices]

# =========================
# Step 4: Test Hybrid Search
# =========================
# Smoke test: run one query through the hybrid ranker and show the top hits.
query = "semantic search using embeddings"
results = hybrid_search(query, alpha=0.5, top_k=3)

print("Query:", query)
print("\nTop Results:")
for hit in results:
    # Each hit is a (document, blended score) pair.
    doc, score = hit
    print(f"{score:.4f} -> {doc}")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saurabhjain/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NameError: name 'documents' is not defined