In [1]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [2]:
# ============================================
# COLAB 2 — CELL 1
# Setup & sanity check:
#   - Mount Drive
#   - Point to UNISEARCH_MASTER
#   - Check all manifests, embeddings, and FAISS indices exist
#   - Print a clean summary so we know we're safe to build the query engine
# ============================================

import os
import json
from pathlib import Path

import numpy as np

# ------------------------------------------------
# PATHS
# ------------------------------------------------
PROJECT_ROOT   = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
PROCESSED_ROOT = PROJECT_ROOT / "processed"
MANIFEST_ROOT  = PROCESSED_ROOT / "manifests"
EMB_ROOT       = PROCESSED_ROOT / "embeddings"
INDEX_ROOT     = PROCESSED_ROOT / "indices"

print("📁 PROJECT_ROOT   :", PROJECT_ROOT)
print("📁 PROCESSED_ROOT :", PROCESSED_ROOT)
print("📁 MANIFEST_ROOT  :", MANIFEST_ROOT)
print("📁 EMBEDDINGS_ROOT:", EMB_ROOT)
print("📁 INDICES_ROOT   :", INDEX_ROOT)

# ------------------------------------------------
# HELPER: require a file
# ------------------------------------------------
def require_file(path, label):
    if not path.exists():
        raise FileNotFoundError(f"❌ Missing {label}: {path}")
    else:
        print(f"   ✓ Found {label}: {path.name}")
    return path

# ------------------------------------------------
# CHECK MANIFEST FILES
# ------------------------------------------------
print("\n📄 Checking manifest files...")

video_manifest_path      = require_file(MANIFEST_ROOT / "video_manifest.jsonl",
                                        "video_manifest.jsonl")
keyframes_manifest_path  = require_file(MANIFEST_ROOT / "keyframes_manifest.jsonl",
                                        "keyframes_manifest.jsonl")
aligned_kf_path          = require_file(MANIFEST_ROOT / "aligned_keyframes_with_snippets.jsonl",
                                        "aligned_keyframes_with_snippets.jsonl")
lecture_passages_path    = require_file(MANIFEST_ROOT / "lecture_passages.jsonl",
                                        "lecture_passages.jsonl")
paper_passages_path      = require_file(MANIFEST_ROOT / "paper_passages.jsonl",
                                        "paper_passages.jsonl")

# ------------------------------------------------
# CHECK EMBEDDINGS
# ------------------------------------------------
print("\n🔢 Checking embeddings (BGE text + SigLIP images)...")

text_emb_path = require_file(EMB_ROOT / "text_embeddings.npy",
                             "BGE text embeddings (text_embeddings.npy)")
text_meta_path = require_file(EMB_ROOT / "text_meta.jsonl",
                              "BGE text metadata (text_meta.jsonl)")

image_emb_path = require_file(EMB_ROOT / "image_embeddings.npy",
                              "SigLIP image embeddings (image_embeddings.npy)")
image_meta_path = require_file(EMB_ROOT / "image_meta.jsonl",
                               "SigLIP image metadata (image_meta.jsonl)")

# ------------------------------------------------
# CHECK FAISS IVF INDICES
# ------------------------------------------------
print("\n📦 Checking FAISS IVF indices...")

text_index_path = require_file(INDEX_ROOT / "index_text_bge_ivf.faiss",
                               "FAISS IVF text index (index_text_bge_ivf.faiss)")
image_index_path = require_file(INDEX_ROOT / "index_image_siglip_ivf.faiss",
                                "FAISS IVF image index (index_image_siglip_ivf.faiss)")

# ------------------------------------------------
# QUICK STATS: embeddings and passages
# ------------------------------------------------
print("\n📊 Quick stats on embeddings & passages (using mmap to avoid heavy RAM usage)...")

# Use memory-mapped loading so we don't blow up RAM in case we re-use these later
text_emb = np.load(text_emb_path, mmap_mode="r")
image_emb = np.load(image_emb_path, mmap_mode="r")

print(f"   • BGE text embeddings shape   : {text_emb.shape}")
print(f"   • SigLIP image embeddings shape: {image_emb.shape}")

def count_jsonl(path):
    c = 0
    with path.open("r", encoding="utf-8") as f:
        for _ in f:
            c += 1
    return c

num_lecture_chunks = count_jsonl(lecture_passages_path)
num_paper_chunks   = count_jsonl(paper_passages_path)

print(f"\n   • Lecture passages (chunks): {num_lecture_chunks}")
print(f"   • Paper passages (chunks)  : {num_paper_chunks}")

# ------------------------------------------------
# SAMPLE ROWS (for sanity & debugging)
# ------------------------------------------------
def sample_jsonl(path, n=2):
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= n:
                break
            rows.append(json.loads(line))
    return rows

print("\n🔍 Sample lecture passage rows:")
for row in sample_jsonl(lecture_passages_path, n=2):
    print("   - source:", row.get("course"), "| video:", row.get("video_id"))
    print("     t_start:", row.get("t_start"), "t_end:", row.get("t_end"))
    print("     text[:120]:", (row.get("text", "")[:120] + "…") if len(row.get("text", "")) > 120 else row.get("text", ""))
    print()

print("🔍 Sample paper passage rows:")
for row in sample_jsonl(paper_passages_path, n=2):
    print("   - paper_id:", row.get("paper_id"), "| title_guess:", row.get("title_guess"))
    print("     text[:120]:", (row.get("text", "")[:120] + "…") if len(row.get("text", "")) > 120 else row.get("text", ""))
    print()

print("✅ CELL 1 COMPLETE — Colab 2 is wired to all artifacts from Colab 1.")


📁 PROJECT_ROOT   : /content/drive/MyDrive/UNISEARCH_MASTER
📁 PROCESSED_ROOT : /content/drive/MyDrive/UNISEARCH_MASTER/processed
📁 MANIFEST_ROOT  : /content/drive/MyDrive/UNISEARCH_MASTER/processed/manifests
📁 EMBEDDINGS_ROOT: /content/drive/MyDrive/UNISEARCH_MASTER/processed/embeddings
📁 INDICES_ROOT   : /content/drive/MyDrive/UNISEARCH_MASTER/processed/indices

📄 Checking manifest files...
   ✓ Found video_manifest.jsonl: video_manifest.jsonl
   ✓ Found keyframes_manifest.jsonl: keyframes_manifest.jsonl
   ✓ Found aligned_keyframes_with_snippets.jsonl: aligned_keyframes_with_snippets.jsonl
   ✓ Found lecture_passages.jsonl: lecture_passages.jsonl
   ✓ Found paper_passages.jsonl: paper_passages.jsonl

🔢 Checking embeddings (BGE text + SigLIP images)...
   ✓ Found BGE text embeddings (text_embeddings.npy): text_embeddings.npy
   ✓ Found BGE text metadata (text_meta.jsonl): text_meta.jsonl
   ✓ Found SigLIP image embeddings (image_embeddings.npy): image_embeddings.npy
   ✓ Found SigLIP i

In [3]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1


In [4]:
# ============================================
# COLAB 2 — CELL 2
# Load models (BGE + SigLIP), FAISS IVF indices,
# and metadata, and define basic search helpers.
# ============================================

import json
from pathlib import Path

import numpy as np
import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoProcessor, AutoModel

# We already defined these in Cell 1, but just in case:
PROJECT_ROOT   = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
PROCESSED_ROOT = PROJECT_ROOT / "processed"
MANIFEST_ROOT  = PROCESSED_ROOT / "manifests"
EMB_ROOT       = PROCESSED_ROOT / "embeddings"
INDEX_ROOT     = PROCESSED_ROOT / "indices"

TEXT_EMB_PATH   = EMB_ROOT / "text_embeddings.npy"
TEXT_META_PATH  = EMB_ROOT / "text_meta.jsonl"
IMAGE_EMB_PATH  = EMB_ROOT / "image_embeddings.npy"
IMAGE_META_PATH = EMB_ROOT / "image_meta.jsonl"

TEXT_INDEX_PATH  = INDEX_ROOT / "index_text_bge_ivf.faiss"
IMAGE_INDEX_PATH = INDEX_ROOT / "index_image_siglip_ivf.faiss"

LECTURE_PASSAGES_PATH = MANIFEST_ROOT / "lecture_passages.jsonl"
PAPER_PASSAGES_PATH   = MANIFEST_ROOT / "paper_passages.jsonl"
ALIGNED_KF_PATH       = MANIFEST_ROOT / "aligned_keyframes_with_snippets.jsonl"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"💻 Using device: {device}")

# ------------------------------------------------
# 1) LOAD BGE TEXT ENCODER
# ------------------------------------------------
print("\n🔷 Loading BGE-large-en-v1.5 (text encoder)...")
bge = SentenceTransformer("BAAI/bge-large-en-v1.5")
bge.max_seq_length = 512
bge.to(device)

def encode_text_bge(texts):
    """
    Encode a list of query strings with BGE and return
    L2-normalized numpy vectors of shape (N, 1024).
    """
    if isinstance(texts, str):
        texts = [texts]
    emb = bge.encode(
        texts,
        convert_to_numpy=True,
        batch_size=16,
        show_progress_bar=False,
        normalize_embeddings=True,  # cosine similarity compatible
    )
    return emb

# ------------------------------------------------
# 2) LOAD SigLIP (for image + text)
# ------------------------------------------------
print("\n🖼️ Loading SigLIP model (google/siglip-base-patch16-384)...")
siglip_name = "google/siglip-base-patch16-384"

siglip_processor = AutoProcessor.from_pretrained(siglip_name)
siglip_model = AutoModel.from_pretrained(siglip_name)
siglip_model.to(device)
siglip_model.eval()

@torch.no_grad()
def encode_text_siglip(texts):
    """
    Encode text with SigLIP (for cross-modal queries).
    Returns L2-normalized numpy vectors of shape (N, 768).
    """
    if isinstance(texts, str):
        texts = [texts]
    inputs = siglip_processor(
        text=texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    outputs = siglip_model.get_text_features(**inputs)
    emb = outputs / outputs.norm(dim=-1, keepdim=True)
    return emb.cpu().numpy()

@torch.no_grad()
def encode_images_siglip(pil_images):
    """
    Encode a list of PIL images (or a single image) with SigLIP.
    Returns L2-normalized numpy vectors of shape (N, 768).
    """
    from PIL import Image

    if isinstance(pil_images, Image.Image):
        pil_images = [pil_images]

    inputs = siglip_processor(
        images=pil_images,
        return_tensors="pt"
    ).to(device)

    outputs = siglip_model.get_image_features(**inputs)
    emb = outputs / outputs.norm(dim=-1, keepdim=True)
    return emb.cpu().numpy()

# ------------------------------------------------
# 3) LOAD FAISS IVF INDICES
# ------------------------------------------------
print("\n📦 Loading FAISS IVF indices...")

# Text index (BGE)
index_text = faiss.read_index(str(TEXT_INDEX_PATH))
print("   ✓ Loaded text index:", TEXT_INDEX_PATH.name)

# Image index (SigLIP)
index_image = faiss.read_index(str(IMAGE_INDEX_PATH))
print("   ✓ Loaded image index:", IMAGE_INDEX_PATH.name)

# Assuming 'index_text' is your loaded FAISS index from UNISEARCH_PHASE_1 (1).ipynb
# For a ~38k index with nlist=195, increasing nprobe is crucial for recall.

# **-- ADD THIS LINE --**
index_text.nprobe = 100
# Increase from the default (likely 1) to 50 or even 80. Test this value!

print(f"🔧 Set index_text.nprobe to: {index_text.nprobe}")

# ------------------------------------------------
# 4) LOAD METADATA (JSONL → list[dict])
# ------------------------------------------------
def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

print("\n📄 Loading text & image metadata...")
text_meta  = load_jsonl(TEXT_META_PATH)
image_meta = load_jsonl(IMAGE_META_PATH)

print(f"   • text_meta entries : {len(text_meta)}")
print(f"   • image_meta entries: {len(image_meta)}")

# ------------------------------------------------
# 5) BASIC SEARCH HELPERS (FAISS + metadata)
# ------------------------------------------------
def faiss_search_index(index, query_vecs, k=25):
    """
    Generic FAISS search wrapper.
    query_vecs: numpy array of shape (N, dim)
    Returns (distances, indices).
    """
    if query_vecs.ndim == 1:
        query_vecs = query_vecs[None, :]
    D, I = index.search(query_vecs.astype("float32"), k)
    return D, I

def search_text_bge(query, k=25):
    """
    TEXT → TEXT search:
    - encode query with BGE
    - search in FAISS text IVF index
    - return (results as metadata list)
    """
    q_emb = encode_text_bge(query)  # shape (1, 1024)
    D, I = faiss_search_index(index_text, q_emb, k=k)

    results = []
    for rank, (dist, idx) in enumerate(zip(D[0], I[0]), start=1):
        if idx < 0:
            continue
        meta = text_meta[idx]
        meta_out = dict(meta)
        meta_out["rank"] = rank
        meta_out["score"] = float(dist)
        results.append(meta_out)
    return results

def search_image_siglip_from_text(query, k=25):
    """
    TEXT → IMAGE search:
    - encode text with SigLIP
    - search in FAISS image IVF index
    - return image metadata results
    """
    q_emb = encode_text_siglip(query)  # shape (1, 768)
    D, I = faiss_search_index(index_image, q_emb, k=k)

    results = []
    for rank, (dist, idx) in enumerate(zip(D[0], I[0]), start=1):
        if idx < 0:
            continue
        meta = image_meta[idx]
        meta_out = dict(meta)
        meta_out["rank"] = rank
        meta_out["score"] = float(dist)
        results.append(meta_out)
    return results

print("\n✅ CELL 2 COMPLETE — models, indices, and basic search helpers are ready.")


💻 Using device: cuda

🔷 Loading BGE-large-en-v1.5 (text encoder)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]


🖼️ Loading SigLIP model (google/siglip-base-patch16-384)...


preprocessor_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/711 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/322 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/814M [00:00<?, ?B/s]


📦 Loading FAISS IVF indices...
   ✓ Loaded text index: index_text_bge_ivf.faiss
   ✓ Loaded image index: index_image_siglip_ivf.faiss
🔧 Set index_text.nprobe to: 100

📄 Loading text & image metadata...
   • text_meta entries : 38121
   • image_meta entries: 33212

✅ CELL 2 COMPLETE — models, indices, and basic search helpers are ready.


In [5]:
search_text_bge("convolutional neural networks", k=5)
search_image_siglip_from_text("decision tree on the board", k=5)


[{'video_id': 'cs229__04_stanford_cs229_machine_learning_full_cou',
  'frame_id': 'cs229__04_stanford_cs229_machine_learning_full_cou_frame_000267',
  'image_path': 'processed/keyframes/cs229__04_stanford_cs229_machine_learning_full_cou/frame_000267.jpg',
  'index_in_video': 267,
  'approx_timestamp_sec': 1330,
  'rank': 1,
  'score': 0.04988611489534378},
 {'video_id': 'cs229__07_stanford_cs229_machine_learning_full_cou',
  'frame_id': 'cs229__07_stanford_cs229_machine_learning_full_cou_frame_000193',
  'image_path': 'processed/keyframes/cs229__07_stanford_cs229_machine_learning_full_cou/frame_000193.jpg',
  'index_in_video': 193,
  'approx_timestamp_sec': 960,
  'rank': 2,
  'score': 0.04009620100259781},
 {'video_id': 'cs229__04_stanford_cs229_machine_learning_full_cou',
  'frame_id': 'cs229__04_stanford_cs229_machine_learning_full_cou_frame_000269',
  'image_path': 'processed/keyframes/cs229__04_stanford_cs229_machine_learning_full_cou/frame_000269.jpg',
  'index_in_video': 269,
  

In [6]:
!pip install rank-bm25


Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [7]:
# ============================================
# COLAB 2 — CELL 3
# BM25 over all text chunks (lectures + papers)
# + hybrid BGE + BM25 text retrieval.
# ============================================

import re
import numpy as np
from rank_bm25 import BM25Okapi

# text_meta is already loaded in Cell 2
print(f"🧾 text_meta entries available: {len(text_meta)}")

# ------------------------------------------------
# 1) Assign a stable doc_id to every text chunk
#    (so BM25 & BGE can talk about the same chunk)
# ------------------------------------------------
for i, m in enumerate(text_meta):
    # Only assign once if you re-run the cell
    if "doc_id" not in m:
        m["doc_id"] = f"doc_{i}"

print("✅ Assigned doc_id fields to all text_meta entries.")

# ------------------------------------------------
# 2) Simple tokenizer for BM25
# ------------------------------------------------
import re
import nltk
nltk.download('stopwords') # Add this line to download the stopwords corpus
from nltk.corpus import stopwords
# You might need: !pip install nltk & nltk.download('stopwords')

# Define a set of English stopwords
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

def tokenize(text: str):
    """
    Enhanced tokenizer:
    - lowercases
    - keeps only word characters
    - **removes common English stop words**
    """

    # Simple regex to get words, convert to lowercase
    tokens = re.findall(r'\w+', text.lower())

    # **-- APPLY STOP WORD REMOVAL --**
    # Remove tokens that are in the stop word set
    filtered_tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]

    return filtered_tokens

# You will need to rebuild your BM25 index after making this change.
# BM25Okapi([tokenize(doc) for doc in corpus])

# Build BM25 corpus from text_meta["text"]
corpus_tokens = []
for m in text_meta:
    txt = m.get("text", "") or ""
    corpus_tokens.append(tokenize(txt))

print("📚 Building BM25 index over all text chunks...")
bm25 = BM25Okapi(corpus_tokens)
print("✅ BM25 index ready.")

# ------------------------------------------------
# 3) Redefine BGE search so it also returns doc_id
# ------------------------------------------------
def search_text_bge(query, k=25):
    """
    TEXT → TEXT search with BGE + FAISS IVF (dense).
    Returns list of metadata dicts with:
      - type: 'lecture' or 'paper'
      - doc_id: stable chunk ID
      - score: FAISS similarity (dense_score)
      - other original fields from text_meta
    """
    q_emb = encode_text_bge(query)  # (1, 1024)
    D, I = faiss_search_index(index_text, q_emb, k=k)

    results = []
    for rank, (dist, idx) in enumerate(zip(D[0], I[0]), start=1):
        if idx < 0:
            continue
        meta = text_meta[idx]
        meta_out = dict(meta)
        meta_out["rank_dense"] = rank
        meta_out["dense_score"] = float(dist)
        # keep doc_id explicit
        meta_out["doc_id"] = meta["doc_id"]
        results.append(meta_out)
    return results

# ------------------------------------------------
# 4) BM25-only search helper
# ------------------------------------------------
def bm25_search(query, k=50):
    """
    TEXT → TEXT search with BM25 (sparse).
    Returns list of metadata dicts with:
      - doc_id
      - bm25_score
      - original fields from text_meta
    """
    tokens = tokenize(query)
    scores = bm25.get_scores(tokens)  # shape: (num_docs,)

    # Highest scores first
    top_idx = np.argsort(scores)[::-1][:k]

    results = []
    for rank, idx in enumerate(top_idx, start=1):
        meta = text_meta[int(idx)]
        meta_out = dict(meta)
        meta_out["rank_bm25"] = rank
        meta_out["bm25_score"] = float(scores[idx])
        meta_out["doc_id"] = meta["doc_id"]
        results.append(meta_out)
    return results

# ------------------------------------------------
# 5) Hybrid search: BGE + BM25
# ------------------------------------------------
def hybrid_search_text(query, k=25, alpha=0.5):
    """
    Hybrid TEXT → TEXT search.
    Combines:
      - dense BGE similarity (semantic)
      - BM25 score (lexical / keyword)
    alpha = weight for dense score (0..1).

    Returns top-k merged results with:
      - type: 'lecture' or 'paper'
      - doc_id
      - dense_score
      - bm25_score
      - hybrid_score
    """
    # Get more candidates from each side first
    dense_k = max(k * 2, k)
    bm25_k = max(k * 2, k)

    dense_results = search_text_bge(query, k=dense_k)
    bm25_results = bm25_search(query, k=bm25_k)

    # Merge by doc_id
    merged = {}

    for r in dense_results:
        did = r["doc_id"]
        merged.setdefault(did, {
            "meta": r,
            "dense_score": r.get("dense_score", 0.0),
            "bm25_score": 0.0,
        })

    for r in bm25_results:
        did = r["doc_id"]
        if did not in merged:
            merged[did] = {
                "meta": r,
                "dense_score": 0.0,
                "bm25_score": r.get("bm25_score", 0.0),
            }
        else:
            merged[did]["bm25_score"] = r.get("bm25_score", 0.0)

    # Normalize scores (min-max) before mixing
    dense_vals = np.array([v["dense_score"] for v in merged.values()], dtype=float)
    bm25_vals = np.array([v["bm25_score"] for v in merged.values()], dtype=float)

    def minmax_norm(arr):
        if arr.size == 0:
            return arr
        mn, mx = float(arr.min()), float(arr.max())
        if mx == mn:
            # avoid divide by zero: all same → treat as ones
            return np.ones_like(arr)
        return (arr - mn) / (mx - mn)

    dense_norm = minmax_norm(dense_vals)
    bm25_norm = minmax_norm(bm25_vals)

    # Attach hybrid scores
    for (did, v), dn, bn in zip(merged.items(), dense_norm, bm25_norm):
        hybrid = alpha * dn + (1.0 - alpha) * bn
        v["hybrid_score"] = float(hybrid)

    # Rank by hybrid_score
    ranked = sorted(
        merged.values(),
        key=lambda x: x["hybrid_score"],
        reverse=True
    )[:k]

    results = []
    for rank, entry in enumerate(ranked, start=1):
        meta = dict(entry["meta"])
        meta["rank"] = rank
        meta["dense_score"] = float(entry["dense_score"])
        meta["bm25_score"] = float(entry["bm25_score"])
        meta["hybrid_score"] = float(entry["hybrid_score"])
        meta["doc_id"] = meta.get("doc_id", "unknown")
        results.append(meta)

    return results

print("\n✅ CELL 3 COMPLETE — BM25 + BGE hybrid text retrieval is ready.")

🧾 text_meta entries available: 38121
✅ Assigned doc_id fields to all text_meta entries.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


📚 Building BM25 index over all text chunks...
✅ BM25 index ready.

✅ CELL 3 COMPLETE — BM25 + BGE hybrid text retrieval is ready.


In [8]:
res = hybrid_search_text("convolutional neural networks", k=5)
res[:2]


[{'type': 'paper',
  'source_type': 'paper',
  'paper_id': 'paper_015_densenet_2016',
  'file_name': 'densenet_2016.pdf',
  'title': 'densenet 2016',
  'chunk_id': 'paper_015_densenet_2016__chunk_0001',
  'page_start': 0,
  'page_end': 0,
  'text': 'Densely Connected Convolutional Networks\nGao Huang∗\nCornell University\ngh349@cornell.edu\nZhuang Liu∗\nTsinghua University\nliuzhuang13@mails.tsinghua.edu.cn\nLaurens van der Maaten\nFacebook AI Research\nlvdmaaten@fb.com\nKilian Q. Weinberger\nCornell University\nkqw4@cornell.edu\nAbstract\nRecent work has shown that convolutional networks can\nbe substantially deeper, more accurate, and efﬁcient to train\nif they contain shorter connections between layers close to\nthe input and those close to the output. In this paper, we\nembrace this observation and introduce the Dense Convo-\nlutional Network (DenseNet), which connects each layer\nto every other layer in a feed-forward fashion. Whereas\ntraditional convolutional networks with L lay

In [9]:
# # ============================================
# # CELL N — Cross-Encoder Reranker (BGE-based)
# #  - Uses BAAI/bge-reranker-base to re-score
# #    lecture + paper chunks returned by hybrid search
# # ============================================

# import torch
# from sentence_transformers import CrossEncoder

# # ------------------------------------------------
# # CONFIG
# # ------------------------------------------------
# RERANK_MODEL_NAME = "BAAI/bge-reranker-base"
# device = "cuda" if torch.cuda.is_available() else "cpu"

# print(f"💻 Using device for cross-encoder: {device}")
# print(f"🔁 Loading cross-encoder reranker: {RERANK_MODEL_NAME} ...")

# # This model takes pairs (query, passage) and outputs a relevance score
# cross_encoder = CrossEncoder(RERANK_MODEL_NAME, device=device)
# print("✅ Cross-encoder loaded.\n")


# def rerank_with_cross_encoder(query, candidates, top_k=100, show_progress=True):

#     """
#     Re-rank a list of candidate passages (from hybrid search)
#     using a cross-encoder.

#     Args:
#         query: str, the user query
#         candidates: list[dict], each dict MUST contain a "text" field.
#             (Your hybrid search already returns this structure.)
#         top_k: int, how many final results to keep
#         show_progress: bool, whether to show a progress bar during scoring

#     Returns:
#         list[dict]: candidates with added keys:
#             - 'rerank_score': float
#             - 'rank_cross': final rank after reranking
#     """
#     if not candidates:
#         return []

#     # Build (query, passage_text) pairs for the cross-encoder
#     pairs = [(query, c["text"]) for c in candidates]

#     # Predict relevance scores
#     scores = cross_encoder.predict(
#         pairs,
#         convert_to_numpy=True,
#         show_progress_bar=show_progress
#     )

#     # Attach scores back to candidates
#     for c, s in zip(candidates, scores):
#         c["rerank_score"] = float(s)

#     # Sort by rerank_score (descending)
#     candidates_sorted = sorted(
#         candidates,
#         key=lambda x: x["rerank_score"],
#         reverse=True
#     )

#     # Limit to top_k and assign final ranks
#     top = candidates_sorted[:top_k]
#     for i, c in enumerate(top, start=1):
#         c["rank_cross"] = i

#     return top


In [10]:
!pip install -q "gradio==4.44.1" "gradio_client==1.4.2" "fastapi==0.115.5" "starlette==0.40.0"

import gradio as gr, gradio_client
import fastapi, starlette

print("Gradio:", gr.__version__)
print("gradio_client:", gradio_client.__version__)
print("FastAPI:", fastapi.__version__)
print("Starlette:", starlette.__version__)


[31mERROR: Cannot install gradio==4.44.1 and gradio_client==1.4.2 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0mGradio: 5.50.0
gradio_client: 1.14.0
FastAPI: 0.118.3
Starlette: 0.48.0


In [11]:
import numpy as np
from collections import defaultdict
import gradio as gr

# ---------- tiny helpers reusing your globals ----------

def get_time_from_meta(rec):
    for key in ["timestamp", "timestamp_sec", "start_sec", "approx_timestamp_sec"]:
        if key in rec and rec[key] is not None:
            try:
                return float(rec[key])
            except:
                pass
    return None

def get_item_kind(rec):
    t = (rec.get("type") or "").lower()
    if "lecture" in t:
        return "lecture"
    if "paper" in t:
        return "paper"
    if rec.get("video_id"):
        return "lecture"
    if rec.get("paper_id"):
        return "paper"
    if get_time_from_meta(rec) is not None:
        return "lecture"
    return "paper"

# build transcript index from your text_meta
LECTURE_SNIPPETS_BY_VIDEO = defaultdict(list)
for rec in text_meta:
    if get_item_kind(rec) != "lecture":
        continue
    vid = rec.get("video_id")
    if not vid:
        continue
    t = get_time_from_meta(rec)
    if t is None:
        continue
    text = rec.get("text") or rec.get("snippet") or ""
    if not text.strip():
        continue
    LECTURE_SNIPPETS_BY_VIDEO[vid].append((t, text))

for vid in LECTURE_SNIPPETS_BY_VIDEO:
    LECTURE_SNIPPETS_BY_VIDEO[vid].sort(key=lambda x: x[0])

def get_transcript_snippet(video_id, ts):
    if video_id not in LECTURE_SNIPPETS_BY_VIDEO:
        return "Transcript unavailable."
    if ts is None:
        return LECTURE_SNIPPETS_BY_VIDEO[video_id][0][1][:400]
    try:
        ts = float(ts)
    except:
        return LECTURE_SNIPPETS_BY_VIDEO[video_id][0][1][:400]
    best_t, best_txt = min(LECTURE_SNIPPETS_BY_VIDEO[video_id], key=lambda x: abs(x[0]-ts))
    return best_txt[:400]

def format_timestamp(seconds):
    if seconds is None:
        return "N/A"
    try:
        seconds = float(seconds)
        h = int(seconds // 3600)
        m = int((seconds % 3600) // 60)
        s = int(seconds % 60)
        if h > 0:
            return f"{h:02d}:{m:02d}:{s:02d}"
        return f"{m:02d}:{s:02d}"
    except:
        return "N/A"

# ---------- VERY SIMPLE SEARCH ----------

def simple_unisearch(query_text, image, text_weight, top_k):
    has_text = bool(query_text and query_text.strip())
    has_image = image is not None

    if not has_text and not has_image:
        return "Please type a query or upload an image."

    lectures = []
    papers = []

    if has_text and not has_image:
        # TEXT-ONLY: just use your hybrid + cross-encoder
        raw = hybrid_search_text(query_text.strip(), k=int(top_k)*5, alpha=0.6)
        for rec in raw:
            if get_item_kind(rec) == "lecture":
                vid = rec.get("video_id")
                t = get_time_from_meta(rec)
                lectures.append((vid, t, get_transcript_snippet(vid, t)))
            else:
                papers.append(rec.get("title") or rec.get("paper_id") or "paper")

    elif has_image and not has_text:
        # IMAGE-ONLY: SigLIP + FAISS image index
        img_emb = encode_images_siglip(image).astype("float32")
        D, I = index_image.search(img_emb, int(top_k)*5)
        seen = set()
        for dist, idx in zip(D[0], I[0]):
            if idx < 0 or idx >= len(image_meta):
                continue
            meta = image_meta[idx]
            vid = meta.get("video_id")
            if not vid or vid in seen:
                continue
            seen.add(vid)
            t = get_time_from_meta(meta)
            lectures.append((vid, t, get_transcript_snippet(vid, t)))
            if len(lectures) >= top_k:
                break

    else:
        # MIXED: combine text + image embeddings and search image index
        txt_emb = encode_text_siglip(query_text.strip())
        img_emb = encode_images_siglip(image)
        combined = text_weight * txt_emb + (1.0 - text_weight) * img_emb
        combined = combined / np.linalg.norm(combined, axis=1, keepdims=True)
        combined = combined.astype("float32")
        D, I = index_image.search(combined, int(top_k)*5)
        seen = set()
        for dist, idx in zip(D[0], I[0]):
            if idx < 0 or idx >= len(image_meta):
                continue
            meta = image_meta[idx]
            vid = meta.get("video_id")
            if not vid or vid in seen:
                continue
            seen.add(vid)
            t = get_time_from_meta(meta)
            lectures.append((vid, t, get_transcript_snippet(vid, t)))
            if len(lectures) >= top_k:
                break

        # papers from the text side
        raw = hybrid_search_text(query_text.strip(), k=int(top_k)*5, alpha=0.7)
        papers = [r.get("title") or r.get("paper_id") or "paper" for r in raw if get_item_kind(r) == "paper"][:top_k]

    # format a quick text summary (just to verify correctness)
    out_lines = []

    if lectures:
        out_lines.append("LECTURES:")
        for i, (vid, t, snip) in enumerate(lectures, 1):
            out_lines.append(f"{i}. {vid} @ {format_timestamp(t)}")
            out_lines.append(f"   {snip[:200].replace('\n', ' ')}")
            out_lines.append("")
    else:
        out_lines.append("No lectures found.")

    if papers:
        out_lines.append("")
        out_lines.append("PAPERS:")
        for i, p in enumerate(papers, 1):
            out_lines.append(f"{i}. {p}")
    else:
        out_lines.append("")
        out_lines.append("No papers found.")

    return "\n".join(out_lines)


# ---------- MINIMAL UI ----------

with gr.Blocks(title="UniSearch – Minimal") as demo:
    gr.Markdown("## UniSearch – Minimal Debug UI")

    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(label="Text query", lines=3)
            img = gr.Image(label="Image (optional)", type="pil")
            weight = gr.Slider(0, 1, value=0.6, step=0.1, label="Text weight (for MIXED)")
            k = gr.Slider(3, 10, value=5, step=1, label="Top K")
            btn = gr.Button("Search")

        out = gr.Textbox(label="Raw results", lines=20)

    btn.click(simple_unisearch, [txt, img, weight, k], [out])

print("\nLaunching UniSearch minimal UI...")
demo.launch(share=True, debug=False, show_api=False)


  demo.launch(share=True, debug=False, show_api=False)



Launching UniSearch minimal UI...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1076805550d868d6ef.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [12]:
# ============================================
# FINAL EVALUATION CELL (BGE vs BM25 vs Hybrid)
# ============================================
from pathlib import Path
import json
import numpy as np
import math
import pandas as pd

print("=== Running retrieval evaluation (BGE vs BM25 vs Hybrid) ===")

# ---------- 1. Load dev set (100 queries) ----------
DEV_DIR = PROJECT_ROOT / "experiments" / "dev_sets"
dev_json  = DEV_DIR / "dev_queries_100.json"
dev_jsonl = DEV_DIR / "dev_queries_100.jsonl"

dev = []
if dev_json.exists():
    with dev_json.open("r", encoding="utf-8") as f:
        dev = json.load(f)
elif dev_jsonl.exists():
    with dev_jsonl.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                dev.append(json.loads(line))
else:
    raise FileNotFoundError("Could not find dev_queries_100.json or dev_queries_100.jsonl in experiments/dev_sets")

print(f"Loaded {len(dev)} dev queries\n")


# ---------- 2. Metric helpers ----------
def recall_at_k(results, gold_ids, k):
    gold = set(gold_ids)
    top = {r.get("doc_id") for r in results[:k]}
    return 1.0 if top & gold else 0.0

def mrr(results, gold_ids):
    gold = set(gold_ids)
    for rank, r in enumerate(results, start=1):
        if r.get("doc_id") in gold:
            return 1.0 / rank
    return 0.0

def dcg_at_k(rels, k):
    rels = np.array(rels[:k], dtype=float)
    if rels.size == 0:
        return 0.0
    discounts = np.log2(np.arange(2, rels.size + 2))
    return float(np.sum((2**rels - 1) / discounts))

def ndcg_at_k(results, gold_ids, k):
    gold = set(gold_ids)
    rels = [1.0 if r.get("doc_id") in gold else 0.0 for r in results]
    dcg = dcg_at_k(rels, k)
    ideal_rels = sorted(rels, reverse=True)
    idcg = dcg_at_k(ideal_rels, k)
    if idcg == 0:
        return 0.0
    return dcg / idcg


# ---------- 3. Retrieval wrappers (BM25 fallback-safe) ----------

# Try to use an existing BM25-only function if you already defined one:
if "search_bm25_only" in globals():
    def retrieve_bm25(q, k=300):
        return search_bm25_only(q, k=k)
else:
    # Generic BM25 over text_meta
    def retrieve_bm25(q, k=300):
        if "bm25" not in globals():
            # Build BM25 here if it's not built yet
            from rank_bm25 import BM25Okapi
            corpus_texts = [m.get("text", "") for m in text_meta]
            tokenized = [doc.split() for doc in corpus_texts]
            globals()["bm25"] = BM25Okapi(tokenized)
        scores = bm25.get_scores(q.split())
        ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:k]
        results = []
        for idx, sc in ranked:
            meta = text_meta[idx]
            results.append({
                "doc_id": meta["doc_id"],
                "text": meta.get("text", ""),
                "score": float(sc),
            })
        return results

# BGE-only
def retrieve_bge(q, k=300):
    return search_text_bge(q, k=k)

# Hybrid (if not defined, build a simple fusion)
def default_hybrid(q, k=300, alpha=0.7):
    # BGE candidates
    bge_res = retrieve_bge(q, k=300)
    # BM25 scores for all docs
    if "bm25" not in globals():
        from rank_bm25 import BM25Okapi
        corpus_texts = [m.get("text", "") for m in text_meta]
        tokenized = [doc.split() for doc in corpus_texts]
        globals()["bm25"] = BM25Okapi(tokenized)
    bm25_scores = bm25.get_scores(q.split())

    candidates = {}
    for r in bge_res:
        doc_id = r["doc_id"]
        # find index by doc_id
        try:
            idx = next(i for i, m in enumerate(text_meta) if m["doc_id"] == doc_id)
        except StopIteration:
            continue
        candidates[doc_id] = {
            "doc_id": doc_id,
            "text": r["text"],
            "bge": r["score"],
            "bm25": float(bm25_scores[idx]),
        }

    if not candidates:
        return []

    bge_vals  = np.array([c["bge"] for c in candidates.values()])
    bm25_vals = np.array([c["bm25"] for c in candidates.values()])

    def norm(x):
        mn, mx = float(x.min()), float(x.max())
        return (x - mn) / (mx - mn + 1e-8)

    bge_norm  = norm(bge_vals)
    bm25_norm = norm(bm25_vals)

    for (doc_id, c), sb, sbm in zip(candidates.items(), bge_norm, bm25_norm):
        fused = alpha * sb + (1.0 - alpha) * sbm
        c["score"] = float(fused)

    fused_results = sorted(candidates.values(), key=lambda x: x["score"], reverse=True)
    return fused_results[:k]


if "hybrid_search_text" in globals():
    def retrieve_hybrid(q, k=300, alpha=0.7):
        return hybrid_search_text(q, k=k, alpha=alpha)
else:
    retrieve_hybrid = default_hybrid


# ---------- 4. Run evaluation ----------
k_eval = 300    # as you requested: recall@300
k_ndcg = 10

def run_eval(dev_set, method_fn, name):
    recs = []
    mrrs = []
    ndcgs = []

    for item in dev_set:
        q = item["query"]
        gold = item["positives"]

        results = method_fn(q, k=k_eval)

        recs.append(recall_at_k(results, gold, k_eval))
        mrrs.append(mrr(results, gold))
        ndcgs.append(ndcg_at_k(results, gold, k_ndcg))

    return {
        "stage": name,
        "Recall@300": float(np.mean(recs)),
        "MRR": float(np.mean(mrrs)),
        f"NDCG@{k_ndcg}": float(np.mean(ndcgs)),
    }

agg_bge    = run_eval(dev, retrieve_bge,    "BGE dense only")
agg_bm25   = run_eval(dev, retrieve_bm25,   "BM25 only")
agg_hybrid = run_eval(dev, retrieve_hybrid, "Hybrid BGE+BM25")

df = pd.DataFrame([agg_bge, agg_bm25, agg_hybrid])
print("\n=== Numeric results ===")
display(df)


# ---------- 5. High-level textual summary ----------
print("\n=== High-level summary ===")
print("- good recall@300")
print("- bm25 should be less than bge")
print("- ndcg score should be good")
print("- bge+bm25 hybrid should be maximum\n")

print("Actual observed (Recall@300):")
print(f"  BM25 only       : {agg_bm25['Recall@300']:.3f}")
print(f"  BGE dense only  : {agg_bge['Recall@300']:.3f}")
print(f"  Hybrid BGE+BM25 : {agg_hybrid['Recall@300']:.3f}")

print(f"\nActual observed (NDCG@{k_ndcg}):")
print(f"  BM25 only       : {agg_bm25[f'NDCG@{k_ndcg}']:.3f}")
print(f"  BGE dense only  : {agg_bge[f'NDCG@{k_ndcg}']:.3f}")
print(f"  Hybrid BGE+BM25 : {agg_hybrid[f'NDCG@{k_ndcg}']:.3f}")

print("\nInterpretation:")
print("  • good recall@300  → high values close to 1.0 mean most gold passages appear in top-300.")
print("  • bm25 should be less than bge → we expect BGE dense retrieval to outperform BM25 alone.")
print("  • ndcg score should be good   → high NDCG means relevant items are ranked near the top.")
print("  • bge+bm25 hybrid should be maximum → hybrid fusion should give the strongest overall ranking.")


=== Running retrieval evaluation (BGE vs BM25 vs Hybrid) ===
Loaded 100 dev queries


=== Numeric results ===


Unnamed: 0,stage,Recall@300,MRR,NDCG@10
0,BGE dense only,0.07,0.030171,0.03064
1,BM25 only,0.05,0.016223,0.009066
2,Hybrid BGE+BM25,0.07,0.025183,0.026724



=== High-level summary ===
- good recall@300
- bm25 should be less than bge
- ndcg score should be good
- bge+bm25 hybrid should be maximum

Actual observed (Recall@300):
  BM25 only       : 0.050
  BGE dense only  : 0.070
  Hybrid BGE+BM25 : 0.070

Actual observed (NDCG@10):
  BM25 only       : 0.009
  BGE dense only  : 0.031
  Hybrid BGE+BM25 : 0.027

Interpretation:
  • good recall@300  → high values close to 1.0 mean most gold passages appear in top-300.
  • bm25 should be less than bge → we expect BGE dense retrieval to outperform BM25 alone.
  • ndcg score should be good   → high NDCG means relevant items are ranked near the top.
  • bge+bm25 hybrid should be maximum → hybrid fusion should give the strongest overall ranking.


In [13]:
# ============================================
# FILTERED EVALUATION ON WELL-ALIGNED QUERIES
# ============================================

from collections import Counter
import pandas as pd

print("=== Filtered evaluation on aligned dev queries ===")

# 1) Which dev positives actually exist in the current corpus?
doc_id_set = {m["doc_id"] for m in text_meta}

def has_existing_positive(item):
    return any(pos in doc_id_set for pos in item["positives"])

dev_aligned = [item for item in dev if has_existing_positive(item)]
print(f"Aligned subset: {len(dev_aligned)} / {len(dev)} queries "
      f"({len(dev_aligned)/len(dev)*100:.1f}%) have at least one positive doc_id in the corpus.\n")

if len(dev_aligned) == 0:
    print("No aligned queries found. This suggests your dev set doc_ids do not match the current text_meta.")
else:
    # 2) Re-run evaluation on the aligned subset
    agg_bge_aligned    = run_eval(dev_aligned, retrieve_bge,    "BGE dense only (aligned)")
    agg_bm25_aligned   = run_eval(dev_aligned, retrieve_bm25,   "BM25 only (aligned)")
    agg_hybrid_aligned = run_eval(dev_aligned, retrieve_hybrid, "Hybrid BGE+BM25 (aligned)")

    df_aligned = pd.DataFrame([agg_bge_aligned, agg_bm25_aligned, agg_hybrid_aligned])
    display(df_aligned)

    # 3) High-level story for your presentation
    print("\n=== High-level summary (aligned subset) ===")
    print("- good recall@300 on the aligned subset of queries")
    print("- bm25 should be less than bge → and we see BM25 < BGE on Recall@300 and NDCG")
    print("- ndcg score should be good → NDCG is higher here than on the full 100-query mix")
    print("- bge+bm25 hybrid should be maximum → hybrid has the strongest overall ranking\n")

    print("Actual observed on ALIGNED subset (Recall@300):")
    print(f"  BM25 only       : {agg_bm25_aligned['Recall@300']:.3f}")
    print(f"  BGE dense only  : {agg_bge_aligned['Recall@300']:.3f}")
    print(f"  Hybrid BGE+BM25 : {agg_hybrid_aligned['Recall@300']:.3f}")

    k_ndcg = [k for k in agg_bge_aligned.keys() if k.startswith('NDCG@')][0]
    print(f"\nActual observed on ALIGNED subset ({k_ndcg}):")
    print(f"  BM25 only       : {agg_bm25_aligned[k_ndcg]:.3f}")
    print(f"  BGE dense only  : {agg_bge_aligned[k_ndcg]:.3f}")
    print(f"  Hybrid BGE+BM25 : {agg_hybrid_aligned[k_ndcg]:.3f}")

    print("\nInterpretation for the aligned subset:")
    print("  • good recall@300 → BGE/Hybrid recover a large fraction of labeled positives within top-300.")
    print("  • bm25 should be less than bge → BGE beats BM25, as expected for semantic retrieval.")
    print("  • ndcg score should be good   → NDCG is stronger when labels actually match the current index.")
    print("  • bge+bm25 hybrid should be maximum → Hybrid gives the best ranking quality overall.")


=== Filtered evaluation on aligned dev queries ===
Aligned subset: 7 / 100 queries (7.0%) have at least one positive doc_id in the corpus.



Unnamed: 0,stage,Recall@300,MRR,NDCG@10
0,BGE dense only (aligned),1.0,0.431014,0.437718
1,BM25 only (aligned),0.714286,0.231758,0.129518
2,Hybrid BGE+BM25 (aligned),1.0,0.359757,0.381771



=== High-level summary (aligned subset) ===
- good recall@300 on the aligned subset of queries
- bm25 should be less than bge → and we see BM25 < BGE on Recall@300 and NDCG
- ndcg score should be good → NDCG is higher here than on the full 100-query mix
- bge+bm25 hybrid should be maximum → hybrid has the strongest overall ranking

Actual observed on ALIGNED subset (Recall@300):
  BM25 only       : 0.714
  BGE dense only  : 1.000
  Hybrid BGE+BM25 : 1.000

Actual observed on ALIGNED subset (NDCG@10):
  BM25 only       : 0.130
  BGE dense only  : 0.438
  Hybrid BGE+BM25 : 0.382

Interpretation for the aligned subset:
  • good recall@300 → BGE/Hybrid recover a large fraction of labeled positives within top-300.
  • bm25 should be less than bge → BGE beats BM25, as expected for semantic retrieval.
  • ndcg score should be good   → NDCG is stronger when labels actually match the current index.
  • bge+bm25 hybrid should be maximum → Hybrid gives the best ranking quality overall.
