In [None]:
%pip install -q sentence-transformers rank-bm25 scikit-learn numpy

In [None]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [None]:
docs = [
  "Enable two-factor authentication (2FA) in your account settings to add an extra security step.",
  "HbA1c measures long-term glucose; talk to your physician about tests for glycated hemoglobin.",
  "Our PTO policy covers paid time off for vacations and sick leave.",
  "How to fix engine misfires caused by bad spark plugs.",
  "Kubernetes Ingress configuration for path-based routing.",
  "Configure MFA with authenticator apps.",
  "Doctor appointment scheduling policy."
]

In [37]:
#queries = ["How do I set up 2FA?", "What does HbA1c mean?", "sick leave policy?"]
queries = ["How do I set up 2FA"]

In [38]:
tokenized_corpus = [d.lower().split() for d in docs]
bm25 = BM25Okapi(tokenized_corpus)

In [39]:
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')  # fast, general-purpose
doc_emb = model.encode(docs, convert_to_tensor=True, normalize_embeddings=True)

In [40]:
def show_results(query, k=3):
    print(f"\nQUERY: {query}\n" + "-"*60)
    bm25_scores = bm25.get_scores(query.lower().split())
    top_bm25 = np.argsort(-bm25_scores)[:k]
    print("BM25 top-k:")
    for i in top_bm25:
        print(f"  [{i}] {bm25_scores[i]:.3f}  {docs[i]}")
        
    q_emb = model.encode([query], convert_to_tensor=True, normalize_embeddings=True)
    cos = util.cos_sim(q_emb, doc_emb)[0].cpu().numpy()
    top_st = np.argsort(-cos)[:k]
    print("\nSemantic (SentenceTransformer) top-k:")
    for i in top_st:
        print(f"  [{i}] {cos[i]:.3f}  {docs[i]}")
    
    bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min() + 1e-9)
    st_norm   = (cos         -         cos.min()) / (        cos.max() -         cos.min() + 1e-9)
    
    #hybrid = 0.5*bm25_norm + 0.7*st_norm
    alpha = 0.8  # semantic weight
    hybrid = alpha * st_norm + (1 - alpha) * bm25_norm
    
    top_h = np.argsort(-hybrid)[:k]
    print("\nHybrid (BM25 + Semantic) top-k:")
    for i in top_h:
        print(f"  [{i}] {hybrid[i]:.3f}  {docs[i]}")
    
    
    

In [41]:
for q in queries:
    show_results(q)


QUERY: How do I set up 2FA
------------------------------------------------------------
BM25 top-k:
  [3] 1.407  How to fix engine misfires caused by bad spark plugs.
  [0] 0.000  Enable two-factor authentication (2FA) in your account settings to add an extra security step.
  [1] 0.000  HbA1c measures long-term glucose; talk to your physician about tests for glycated hemoglobin.

Semantic (SentenceTransformer) top-k:
  [0] 0.673  Enable two-factor authentication (2FA) in your account settings to add an extra security step.
  [5] 0.363  Configure MFA with authenticator apps.
  [2] 0.044  Our PTO policy covers paid time off for vacations and sick leave.

Hybrid (BM25 + Semantic) top-k:
  [0] 0.800  Enable two-factor authentication (2FA) in your account settings to add an extra security step.
  [5] 0.436  Configure MFA with authenticator apps.
  [3] 0.235  How to fix engine misfires caused by bad spark plugs.
