In [None]:
# Install required packages for hybrid retrieval (sentence transformer + BM25)
%pip install sentence-transformers torch datasets scikit-learn pandas numpy tqdm faiss-cpu
%pip install nltk transformers  # For tokenization and processing
%pip install rank-bm25  # For BM25 sparse retrieval integration


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-m

In [None]:
import os
# Disable wandb logging FIRST - before any other imports
os.environ["WANDB_DISABLED"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

import pandas as pd
import numpy as np
import torch
import re
import random
from collections import defaultdict
import xml.etree.ElementTree as ET
from tqdm import tqdm
import warnings
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
warnings.filterwarnings('ignore')

from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import faiss

# BM25 and hybrid retrieval imports
from rank_bm25 import BM25Okapi
from typing import List, Dict, Tuple, Union

# Download NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except:
    pass

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


Using device: cuda


In [None]:
def calculate_semantic_diversity(text1, text2):
    """Calculate semantic diversity - OPPOSITE of keyword overlap!

    This function encourages semantically DIFFERENT negatives,
    not lexically similar ones that confuse the model.
    """
    # Simple diversity metric based on character-level differences
    chars1 = set(text1.lower())
    chars2 = set(text2.lower())

    # Calculate character diversity (higher = more diverse = better negative)
    intersection = len(chars1 & chars2)
    union = len(chars1 | chars2)

    if union == 0:
        return 1.0

    # Return DIVERSITY score (1 - similarity) - opposite of overlap!
    diversity = 1.0 - (intersection / union)
    return diversity

def semantic_negative_sampling(all_doc_ids, excluded_doc_ids, documents, query_text, num_samples=100):
    """Sample negatives with preference for semantic diversity - NO lexical bias!"""
    available_docs = [doc_id for doc_id in all_doc_ids if doc_id not in excluded_doc_ids]

    if len(available_docs) == 0:
        return []

    # Sample diverse candidates
    sample_size = min(num_samples, len(available_docs))
    sampled_docs = np.random.choice(available_docs, size=sample_size, replace=False)

    # Calculate DIVERSITY scores (not similarity!)
    diversity_scores = []
    for doc_id in sampled_docs:
        if doc_id in documents:
            diversity = calculate_semantic_diversity(query_text, documents[doc_id])
            diversity_scores.append((doc_id, diversity))

    # Sort by DIVERSITY (descending) - prefer DIFFERENT content
    diversity_scores.sort(key=lambda x: x[1], reverse=True)

    return [doc_id for doc_id, _ in diversity_scores]

def create_semantic_training_triplets(queries, documents, qrels, num_negatives_per_positive=4):
    """Create training triplets for sentence transformer with SEMANTIC focus - NO keyword overlap bias!"""
    training_examples = []
    all_doc_ids = list(documents.keys())

    print("üß† Creating SEMANTIC training triplets for Sentence Transformer...")
    print("‚ùå REMOVED: Keyword overlap-based negative mining")
    print("‚úÖ ADDED: Semantic diversity-based negative sampling")
    print("‚úÖ OPTIMIZED: For sentence transformer architecture")

    for query_id, query_text in tqdm(queries.items(), desc="Semantic triplet creation"):
        if query_id not in qrels:
            continue

        relevant_docs = [doc_id for doc_id, rel in qrels[query_id].items() if rel == 1]
        non_relevant_docs = [doc_id for doc_id, rel in qrels[query_id].items() if rel == 0]

        if not relevant_docs:
            continue

        for pos_doc_id in relevant_docs:
            if pos_doc_id not in documents:
                continue

            pos_doc_text = documents[pos_doc_id]

            # Strategy 1: Use labeled non-relevant documents (50%)
            labeled_negatives = [neg_id for neg_id in non_relevant_docs if neg_id in documents]

            # Strategy 2: Sample semantically diverse random negatives (50%)
            used_doc_ids = set(qrels[query_id].keys())
            diverse_negatives = semantic_negative_sampling(
                all_doc_ids, used_doc_ids, documents, query_text, num_samples=50
            )

            # Combine and balance negative sources
            all_negatives = labeled_negatives + diverse_negatives[:num_negatives_per_positive]

            # Randomly sample final negatives for balanced training
            if len(all_negatives) > num_negatives_per_positive:
                selected_negatives = np.random.choice(
                    all_negatives, size=num_negatives_per_positive, replace=False
                )
            else:
                selected_negatives = all_negatives

            # Create InputExamples for sentence transformer triplet training
            for neg_doc_id in selected_negatives:
                if neg_doc_id in documents:
                    # Create triplet: anchor=query, positive=relevant_doc, negative=irrelevant_doc
                    example = InputExample(
                        texts=[query_text, pos_doc_text, documents[neg_doc_id]],
                        label=0.0  # For triplet loss, the negative should have lower similarity
                    )
                    training_examples.append(example)

    return training_examples


In [None]:
class SemanticSentenceTransformer:
    """Enhanced Sentence Transformer with semantic understanding for code-mixed Bengali IR"""

    def __init__(self, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', max_seq_length=512):
        """
        Initialize with Hugging Face multilingual sentence transformer

        Args:
            model_name: Hugging Face model name
                       'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' - Best for multilingual
                       'sentence-transformers/all-MiniLM-L6-v2' - Faster, English-focused
                       'sentence-transformers/paraphrase-multilingual-mpnet-base-v2' - Higher quality
        """
        self.model_name = model_name
        self.max_seq_length = max_seq_length

        print(f"üöÄ Initializing Semantic Sentence Transformer from Hugging Face: {model_name}")
        print("‚úÖ Pre-optimized for semantic similarity")
        print("‚úÖ Multilingual support for Bengali/Roman")
        print("‚úÖ Efficient architecture for faster training")
        print("üö´ Wandb logging disabled")

        # Load pre-trained sentence transformer from Hugging Face
        self.model = SentenceTransformer(model_name)

        # Set maximum sequence length
        self.model.max_seq_length = max_seq_length

        # Move to device
        self.model = self.model.to(device)

        print(f"üìä Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")
        print(f"üìè Max sequence length: {max_seq_length}")

    def fine_tune_semantic(self, training_examples, val_examples=None,
                          epochs=2, batch_size=8, warmup_steps=50,
                          output_path='best_sentence_transformer_semantic_ir'):
        """Fine-tune with semantic-aware triplet loss - NO WANDB"""

        print(f"\nüß† Fine-tuning with {len(training_examples)} semantic triplets...")
        print("üéØ Focus: Semantic understanding over lexical matching")
        print("‚ö° Advantage: Pre-trained semantic representations")
        print("üö´ Training without wandb logging")

        # Create data loader
        train_dataloader = DataLoader(training_examples, shuffle=True, batch_size=batch_size)

        # Use TripletLoss - optimized for semantic ranking
        train_loss = losses.TripletLoss(model=self.model)

        # Training arguments
        total_steps = len(train_dataloader) * epochs

        print(f"\nüìà Training configuration:")
        print(f"  Epochs: {epochs}")
        print(f"  Batch size: {batch_size}")
        print(f"  Total steps: {total_steps}")
        print(f"  Warmup steps: {warmup_steps}")
        print(f"  Loss function: Triplet Loss (semantic ranking)")
        print(f"  Logging: Disabled (no wandb required)")

        # Evaluator (optional)
        evaluator = None
        if val_examples and len(val_examples) > 0:
            # Create evaluation examples from validation set
            eval_examples = []
            for example in val_examples[:50]:  # Smaller sample for efficiency
                # For evaluation: query-positive should have high similarity
                eval_examples.append(InputExample(texts=[example.texts[0], example.texts[1]], label=1.0))
                # query-negative should have low similarity
                eval_examples.append(InputExample(texts=[example.texts[0], example.texts[2]], label=0.0))

            evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
                eval_examples, name='semantic_ir_eval'
            )
            print(f"  Evaluation: {len(eval_examples)} examples for validation")

        # Fine-tune the model with explicit wandb disable
        print("\nüîÑ Starting training...")
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=warmup_steps,
            output_path=output_path,
            evaluator=evaluator,
            evaluation_steps=1000,  # Less frequent evaluation
            save_best_model=True,
            show_progress_bar=True,
            use_amp=False  # Disable automatic mixed precision for stability
        )

        print(f"\n‚úÖ Fine-tuning completed!")
        print(f"üìÅ Model saved to: {output_path}")

        # Load the best model
        self.model = SentenceTransformer(output_path)

        return output_path

    def encode(self, texts, batch_size=32, show_progress_bar=True, convert_to_tensor=False):
        """Encode texts into semantic embeddings"""
        return self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=show_progress_bar,
            convert_to_tensor=convert_to_tensor,
            device=device
        )

    def similarity(self, embeddings1, embeddings2):
        """Calculate semantic similarity between embeddings"""
        return util.cos_sim(embeddings1, embeddings2)

    def save(self, path):
        """Save the model"""
        self.model.save(path)

    def load(self, path):
        """Load a saved model"""
        self.model = SentenceTransformer(path).to(device)


In [None]:
class HybridRetriever:
    """
    Hybrid Retrieval System combining fine-tuned Sentence Transformer (dense) with BM25 (sparse)

    This class implements state-of-the-art hybrid retrieval that combines:
    - Dense retrieval: Semantic similarity via fine-tuned sentence transformers
    - Sparse retrieval: Lexical matching via BM25
    - Score fusion: Multiple fusion strategies for optimal results
    """

    def __init__(self, semantic_model, documents: Dict[int, str],
                 bm25_k1: float = 1.2, bm25_b: float = 0.75):
        """
        Initialize hybrid retriever

        Args:
            semantic_model: Fine-tuned SentenceTransformer model
            documents: Dictionary of {doc_id: document_text}
            bm25_k1: BM25 parameter k1 (term frequency saturation)
            bm25_b: BM25 parameter b (document length normalization)
        """
        self.semantic_model = semantic_model
        self.documents = documents
        self.doc_ids = list(documents.keys())
        self.doc_texts = [documents[doc_id] for doc_id in self.doc_ids]

        print(f"üîß Initializing Hybrid Retriever with {len(documents)} documents...")
        print(f"üìä BM25 parameters: k1={bm25_k1}, b={bm25_b}")

        # Initialize BM25
        print("üîç Building BM25 index...")
        self._build_bm25_index(bm25_k1, bm25_b)

        # Initialize dense retrieval
        print("üß† Building semantic embeddings index...")
        self._build_semantic_index()

        print("‚úÖ Hybrid retriever initialized successfully!")

    def _preprocess_for_bm25(self, text: str) -> List[str]:
        """Preprocess text for BM25 - tokenization and basic cleaning"""
        # Basic tokenization - important for code-mixed Bengali
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        tokens = text.split()

        # Remove very short tokens and numbers
        tokens = [token for token in tokens if len(token) > 1 and not token.isdigit()]

        return tokens

    def _build_bm25_index(self, k1: float, b: float):
        """Build BM25 index for sparse retrieval"""
        # Tokenize all documents
        tokenized_docs = [self._preprocess_for_bm25(doc) for doc in self.doc_texts]

        # Create BM25 index
        self.bm25 = BM25Okapi(tokenized_docs, k1=k1, b=b)

        print(f"   ‚úÖ BM25 index built with {len(tokenized_docs)} documents")

    def _build_semantic_index(self):
        """Build semantic embeddings index for dense retrieval"""
        # Encode all documents
        self.doc_embeddings = self.semantic_model.encode(
            self.doc_texts,
            batch_size=32,
            show_progress_bar=True,
            convert_to_tensor=False
        )

        # Create FAISS index
        dimension = self.doc_embeddings.shape[1]
        self.semantic_index = faiss.IndexFlatIP(dimension)
        faiss.normalize_L2(self.doc_embeddings)
        self.semantic_index.add(self.doc_embeddings)

        print(f"   ‚úÖ Semantic index built with {self.semantic_index.ntotal} documents")

    def bm25_search(self, query: str, top_k: int = 1000) -> List[Tuple[int, float]]:
        """Perform BM25 sparse retrieval"""
        query_tokens = self._preprocess_for_bm25(query)
        scores = self.bm25.get_scores(query_tokens)

        # Get top-k results
        doc_scores = [(self.doc_ids[i], scores[i]) for i in range(len(scores))]
        doc_scores.sort(key=lambda x: x[1], reverse=True)

        return doc_scores[:top_k]

    def semantic_search(self, query: str, top_k: int = 1000) -> List[Tuple[int, float]]:
        """Perform semantic dense retrieval"""
        query_embedding = self.semantic_model.encode([query], convert_to_tensor=False)
        faiss.normalize_L2(query_embedding)

        scores, indices = self.semantic_index.search(query_embedding, top_k)

        doc_scores = []
        for score, idx in zip(scores[0], indices[0]):
            if idx != -1:
                doc_scores.append((self.doc_ids[idx], float(score)))

        return doc_scores

    def normalize_scores(self, scores: List[float]) -> List[float]:
        """Normalize scores to [0, 1] range using min-max normalization"""
        if not scores:
            return scores

        min_score = min(scores)
        max_score = max(scores)

        if max_score == min_score:
            return [1.0] * len(scores)

        return [(score - min_score) / (max_score - min_score) for score in scores]

    def weighted_fusion(self, semantic_results: List[Tuple[int, float]],
                       bm25_results: List[Tuple[int, float]],
                       semantic_weight: float = 0.7,
                       top_k: int = 1000) -> List[Tuple[int, float]]:
        """
        Combine results using weighted score fusion

        Args:
            semantic_results: Results from semantic search
            bm25_results: Results from BM25 search
            semantic_weight: Weight for semantic scores (0-1)
            top_k: Number of results to return
        """
        bm25_weight = 1.0 - semantic_weight

        # Convert to dictionaries for easier lookup
        semantic_dict = dict(semantic_results)
        bm25_dict = dict(bm25_results)

        # Normalize scores separately
        if semantic_results:
            semantic_scores = [score for _, score in semantic_results]
            norm_semantic_scores = self.normalize_scores(semantic_scores)
            semantic_dict = {doc_id: norm_score for (doc_id, _), norm_score in zip(semantic_results, norm_semantic_scores)}

        if bm25_results:
            bm25_scores = [score for _, score in bm25_results]
            norm_bm25_scores = self.normalize_scores(bm25_scores)
            bm25_dict = {doc_id: norm_score for (doc_id, _), norm_score in zip(bm25_results, norm_bm25_scores)}

        # Combine scores
        all_doc_ids = set(semantic_dict.keys()) | set(bm25_dict.keys())
        combined_results = []

        for doc_id in all_doc_ids:
            semantic_score = semantic_dict.get(doc_id, 0.0)
            bm25_score = bm25_dict.get(doc_id, 0.0)

            combined_score = (semantic_weight * semantic_score +
                            bm25_weight * bm25_score)
            combined_results.append((doc_id, combined_score))

        # Sort by combined score
        combined_results.sort(key=lambda x: x[1], reverse=True)

        return combined_results[:top_k]

    def reciprocal_rank_fusion(self, semantic_results: List[Tuple[int, float]],
                              bm25_results: List[Tuple[int, float]],
                              k: int = 60, top_k: int = 1000) -> List[Tuple[int, float]]:
        """
        Combine results using Reciprocal Rank Fusion (RRF)

        RRF is often more robust than weighted fusion as it doesn't require score normalization

        Args:
            semantic_results: Results from semantic search
            bm25_results: Results from BM25 search
            k: RRF parameter (typically 60)
            top_k: Number of results to return
        """
        # Create rank dictionaries
        semantic_ranks = {doc_id: rank for rank, (doc_id, _) in enumerate(semantic_results)}
        bm25_ranks = {doc_id: rank for rank, (doc_id, _) in enumerate(bm25_results)}

        # Calculate RRF scores
        all_doc_ids = set(semantic_ranks.keys()) | set(bm25_ranks.keys())
        rrf_results = []

        for doc_id in all_doc_ids:
            rrf_score = 0.0

            if doc_id in semantic_ranks:
                rrf_score += 1.0 / (k + semantic_ranks[doc_id] + 1)

            if doc_id in bm25_ranks:
                rrf_score += 1.0 / (k + bm25_ranks[doc_id] + 1)

            rrf_results.append((doc_id, rrf_score))

        # Sort by RRF score
        rrf_results.sort(key=lambda x: x[1], reverse=True)

        return rrf_results[:top_k]

    def hybrid_search(self, query: str, method: str = 'weighted',
                     semantic_weight: float = 0.7, rrf_k: int = 60,
                     top_k: int = 1000) -> List[Tuple[int, float]]:
        """
        Perform hybrid search combining semantic and BM25 retrieval

        Args:
            query: Search query
            method: Fusion method ('weighted' or 'rrf')
            semantic_weight: Weight for semantic scores (only for weighted method)
            rrf_k: RRF parameter (only for rrf method)
            top_k: Number of results to return
        """
        # Get results from both methods
        semantic_results = self.semantic_search(query, top_k)
        bm25_results = self.bm25_search(query, top_k)

        # Combine using specified method
        if method == 'weighted':
            return self.weighted_fusion(semantic_results, bm25_results,
                                      semantic_weight, top_k)
        elif method == 'rrf':
            return self.reciprocal_rank_fusion(semantic_results, bm25_results,
                                             rrf_k, top_k)
        else:
            raise ValueError(f"Unknown fusion method: {method}")

    def analyze_query_coverage(self, query: str, top_k: int = 100) -> Dict:
        """Analyze how semantic and BM25 methods complement each other for a query"""
        semantic_results = self.semantic_search(query, top_k)
        bm25_results = self.bm25_search(query, top_k)

        semantic_docs = set(doc_id for doc_id, _ in semantic_results)
        bm25_docs = set(doc_id for doc_id, _ in bm25_results)

        overlap = semantic_docs & bm25_docs
        semantic_only = semantic_docs - bm25_docs
        bm25_only = bm25_docs - semantic_docs

        return {
            'overlap_count': len(overlap),
            'semantic_only_count': len(semantic_only),
            'bm25_only_count': len(bm25_only),
            'overlap_percentage': len(overlap) / top_k * 100,
            'semantic_unique_percentage': len(semantic_only) / top_k * 100,
            'bm25_unique_percentage': len(bm25_only) / top_k * 100,
            'total_unique_docs': len(semantic_docs | bm25_docs)
        }


In [None]:
# Bengali to Banglish Converter for Test Query Alignment

import re
from typing import Dict, List
import pandas as pd

def load_trec_queries(file_path):
    """Parse TREC format query file"""
    queries = []

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse XML-like structure
    topics = re.findall(r'<top>(.*?)</top>', content, re.DOTALL)

    for topic in topics:
        # Extract query number
        num_match = re.search(r'<num>(.*?)</num>', topic, re.DOTALL)
        if not num_match:
            continue
        query_id = int(num_match.group(1).strip())

        # Extract title
        title_match = re.search(r'<title>(.*?)</title>', topic, re.DOTALL)
        if title_match:
            title = title_match.group(1).strip()
        else:
            title = ""

        # Extract description (optional)
        desc_match = re.search(r'<desc>(.*?)</desc>', topic, re.DOTALL)
        desc = desc_match.group(1).strip() if desc_match else ""

        # Combine title and description
        query_text = f"{title} {desc}".strip()

        if query_text:
            queries.append({"qid":str(query_id),"query":query_text.strip() })
            # queries[query_id] = query_text

    return queries
# def load_trec_queries(filepath: str) -> List[Dict]:
#     queries = []
#     with open(filepath, "r", encoding="utf-8") as f:
#         content = f.read()
#     pattern = r"<num>(\d+)</num>.?<title>(.?)</title>"
#     matches = re.findall(pattern, content, re.DOTALL)
#     print(f"hello: {matches}")
#     for qid, title in matches:
#         queries.append({"qid": qid.strip(), "query": title.strip()})
#     return queries


class BengaliToBanglishConverter:
    """Convert Bengali Unicode text to Banglish (romanized) for model compatibility"""

    def __init__(self):
        # Enhanced Bengali to Banglish mapping
        self.bengali_to_banglish = {
            # Vowels
            "‡¶Ü": "a",
            "‡¶á": "i",
            "‡¶à": "i",
            "‡¶â": "u",
            "‡¶ä": "u",
            "‡¶ã": "ri",
            "‡¶è": "e",
            "‡¶ê": "oi",
            "‡¶ì": "o",
            "‡¶î": "ou",
            # Consonants with inherent 'a'
            "‡¶ï": "ka",
            "‡¶ñ": "kha",
            "‡¶ó": "ga",
            "‡¶ò": "gha",
            "‡¶ô": "nga",
            "‡¶ö": "cha",
            "‡¶õ": "chha",
            "‡¶ú": "ja",
            "‡¶ù": "jha",
            "‡¶û": "nja",
            "‡¶ü": "ta",
            "‡¶†": "tha",
            "‡¶°": "da",
            "‡¶¢": "dha",
            "‡¶£": "na",
            "‡¶§": "ta",
            "‡¶•": "tha",
            "‡¶¶": "da",
            "‡¶ß": "dha",
            "‡¶®": "na",
            "‡¶™": "pa",
            "‡¶´": "pha",
            "‡¶¨": "ba",
            "‡¶≠": "bha",
            "‡¶Æ": "ma",
            "‡¶Ø": "ja",
            "‡¶∞": "ra",
            "‡¶≤": "la",
            "‡¶∂": "sha",
            "‡¶∑": "sha",
            "‡¶∏": "sa",
            "‡¶π": "ha",
            "‡¶°‡¶º": "ra",
            "‡¶¢‡¶º": "rha",
            "‡¶Ø‡¶º": "ya",
            "‡ßé": "t",
            "‡¶Ç": "ng",
            "‡¶É": "h",
            "‡¶Å": "n",
            # Vowel diacritics (‡¶ï‡¶æ‡¶∞)
            "‡¶æ": "a",
            "‡¶ø": "i",
            "‡ßÄ": "i",
            "‡ßÅ": "u",
            "‡ßÇ": "u",
            "‡ßÉ": "ri",
            "‡ßá": "e",
            "‡ßà": "oi",
            "‡ßã": "o",
            "‡ßå": "ou",
            # Numbers
            "‡ß¶": "0",
            "‡ßß": "1",
            "‡ß®": "2",
            "‡ß©": "3",
            "‡ß™": "4",
            "‡ß´": "5",
            "‡ß¨": "6",
            "‡ß≠": "7",
            "‡ßÆ": "8",
            "‡ßØ": "9",
            # Special symbols
            "‡•§": ".",
            "‡••": ".",
        }

        # Common Bengali words to Banglish (most frequent)
        self.word_mappings = {
            "‡¶Ü‡¶Æ‡¶ø": "ami",
            "‡¶§‡ßÅ‡¶Æ‡¶ø": "tumi",
            "‡¶§‡ßã‡¶Æ‡¶æ‡¶∞": "tomar",
            "‡¶Ü‡¶Æ‡¶æ‡¶∞": "amar",
            "‡¶ï‡¶ø": "ki",
            "‡¶ï‡ßá‡¶Æ‡¶®": "kemon",
            "‡¶ï‡ßã‡¶•‡¶æ‡¶Ø‡¶º": "kothay",
            "‡¶ï‡¶¨‡ßá": "kobe",
            "‡¶ï‡ßá‡¶®": "keno",
            "‡¶ï‡¶ø‡¶≠‡¶æ‡¶¨‡ßá": "kivabe",
            "‡¶≠‡¶æ‡¶≤‡ßã": "valo",
            "‡¶≠‡¶æ‡¶≤": "valo",
            "‡¶ñ‡¶æ‡¶∞‡¶æ‡¶™": "kharap",
            "‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞": "sundor",
            "‡¶¨‡ßá‡¶∂‡¶ø": "beshi",
            "‡¶ï‡¶Æ": "kom",
            "‡¶Ö‡¶®‡ßá‡¶ï": "onek",
            "‡¶è‡¶ï‡¶ü‡ßÅ": "ektu",
            "‡¶¨‡¶°‡¶º": "boro",
            "‡¶õ‡ßã‡¶ü": "choto",
            "‡¶≠‡¶æ‡¶á": "vai",
            "‡¶¨‡ßã‡¶®": "bon",
            "‡¶Ü‡¶™‡ßÅ": "apu",
            "‡¶Æ‡¶æ‡¶Æ‡¶æ": "mama",
            "‡¶ö‡¶æ‡¶ö‡¶æ": "chacha",
            "‡¶ñ‡¶æ‡¶≤‡¶æ": "khala",
            "‡¶¶‡¶æ‡¶¶‡¶æ": "dada",
            "‡¶π‡¶¨‡ßá": "hobe",
            "‡¶Ü‡¶õ‡ßá": "ache",
            "‡¶®‡ßá‡¶á": "nei",
            "‡¶ö‡¶æ‡¶á": "chai",
            "‡¶¶‡ßá‡¶ñ‡ßã": "dekho",
            "‡¶¨‡¶≤‡ßã": "bolo",
            "‡¶Ø‡¶æ‡¶ì": "jao",
            "‡¶è‡¶∏‡ßã": "esho",
            "‡¶ñ‡¶æ‡¶ì": "khao",
            "‡¶™‡¶æ‡¶∞‡ßã": "paro",
            "‡¶ï‡¶∞‡ßã": "koro",
            "‡¶¶‡¶æ‡¶ì": "dao",
            "‡¶®‡¶æ‡¶ì": "nao",
            "‡¶•‡¶æ‡¶ï‡ßã": "thako",
            "‡¶ú‡¶æ‡¶®‡ßã": "jano",
            "‡¶¨‡ßÅ‡¶ù‡ßã": "bujho",
            "‡¶è‡¶á": "ei",
            "‡¶∏‡ßá‡¶á": "sei",
            "‡¶ì‡¶á": "oi",
            "‡¶è‡¶ü‡¶æ": "eta",
            "‡¶∏‡ßá‡¶ü‡¶æ": "seta",
            "‡¶ì‡¶ü‡¶æ": "ota",
            "‡¶ï‡ßã‡¶®": "kon",
            "‡¶ï‡ßã‡¶®‡ßã": "kono",
            "‡¶∏‡¶¨": "sob",
            "‡¶Ü‡¶∞": "ar",
            "‡¶ì": "o",
            "‡¶®‡¶æ": "na",
            "‡¶π‡ßç‡¶Ø‡¶æ‡¶Å": "hya",
            "‡¶π‡ßç‡¶Ø‡¶æ": "hya",
            "‡¶è‡¶ï‡¶ü‡¶æ": "ekta",
            "‡¶¶‡ßÅ‡¶ü‡ßã": "duto",
            "‡¶§‡¶ø‡¶®‡¶ü‡ßá": "tinte",
            "‡¶ö‡¶æ‡¶∞‡¶ü‡ßá": "charte",
            "‡¶™‡¶æ‡¶Å‡¶ö‡¶ü‡¶æ": "pachta",
            "‡¶õ‡¶Ø‡¶º‡¶ü‡¶æ": "choyeta",
            "‡¶∏‡¶æ‡¶§‡¶ü‡¶æ": "satta",
            "‡¶Ü‡¶ü‡¶ü‡¶æ": "atta",
            "‡¶®‡¶Ø‡¶º‡¶ü‡¶æ": "noyeta",
            "‡¶¶‡¶∂‡¶ü‡¶æ": "doshta",
            "‡¶ï‡¶∞‡¶õ‡¶ø": "korchi",
            "‡¶ï‡¶∞‡¶õ‡ßã": "korcho",
            "‡¶ï‡¶∞‡¶õ‡ßá": "korche",
            "‡¶ï‡¶∞‡ßá‡¶õ‡¶ø": "korechi",
            "‡¶ï‡¶∞‡ßá‡¶õ‡ßã": "korecho",
            "‡¶ï‡¶∞‡ßá‡¶õ‡ßá": "koreche",
            "‡¶ï‡¶∞‡¶¨": "korbo",
            "‡¶ï‡¶∞‡¶¨‡ßá": "korbe",
            "‡¶ï‡¶∞‡¶¨‡ßã": "korbo",
            "‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡¶ø": "jacchi",
            "‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßã": "jaccho",
            "‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá": "jacche",
            "‡¶è‡¶∏‡ßá‡¶õ‡¶ø": "eshechi",
            "‡¶è‡¶∏‡ßá‡¶õ‡ßã": "eshecho",
            "‡¶è‡¶∏‡ßá‡¶õ‡ßá": "esheche",
            "‡¶≠‡¶æ‡¶≤‡ßã‡¶¨‡¶æ‡¶∏‡¶ø": "valobashi",
            "‡¶≠‡¶æ‡¶≤‡ßã‡¶¨‡¶æ‡¶∏‡¶æ": "valobasha",
            "‡¶ñ‡ßÅ‡¶∂‡¶ø": "khushi",
            "‡¶¶‡ßÅ‡¶É‡¶ñ‡¶ø‡¶§": "dukkhito",
            "‡¶∞‡¶æ‡¶ó": "rag",
            "‡¶¨‡¶®‡ßç‡¶ß‡ßÅ": "bondhu",
            "‡¶™‡¶∞‡¶ø‡¶¨‡¶æ‡¶∞": "poribar",
            "‡¶¨‡¶æ‡¶°‡¶º‡¶ø": "bari",
            "‡¶∏‡ßç‡¶ï‡ßÅ‡¶≤": "school",
            "‡¶ï‡¶≤‡ßá‡¶ú": "college",
            "‡¶Ö‡¶´‡¶ø‡¶∏": "office",
            "‡¶ï‡¶æ‡¶ú": "kaj",
            "‡¶™‡¶°‡¶º‡¶æ": "pora",
            "‡¶≤‡ßá‡¶ñ‡¶æ": "lekha",
            "‡¶ó‡¶æ‡¶®": "gan",
            "‡¶ñ‡ßá‡¶≤‡¶æ": "khela",
            "‡¶∏‡¶ø‡¶®‡ßá‡¶Æ‡¶æ": "cinema",
            "‡¶¨‡¶á": "boi",
            "‡¶´‡ßã‡¶®": "phone",
            "‡¶Æ‡ßã‡¶¨‡¶æ‡¶á‡¶≤": "mobile",
            "‡¶ï‡¶Æ‡ßç‡¶™‡¶ø‡¶â‡¶ü‡¶æ‡¶∞": "computer",
            "‡¶á‡¶®‡ßç‡¶ü‡¶æ‡¶∞‡¶®‡ßá‡¶ü": "internet",
            "‡¶´‡ßá‡¶∏‡¶¨‡ßÅ‡¶ï": "facebook",
        }

    def convert_word(self, bengali_word: str) -> str:
        """Convert a single Bengali word to Banglish"""
        if not bengali_word.strip():
            return bengali_word

        # Check direct word mapping first
        if bengali_word in self.word_mappings:
            return self.word_mappings[bengali_word]

        # Character-by-character conversion for unknown words
        result = ""
        i = 0
        word = bengali_word.strip()

        while i < len(word):
            char = word[i]

            # Handle conjuncts (‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶æ‡¶ï‡ßç‡¶∑‡¶∞) - simplified approach
            if i < len(word) - 1:
                two_char = word[i : i + 2]
                if two_char in self.bengali_to_banglish:
                    result += self.bengali_to_banglish[two_char]
                    i += 2
                    continue

            # Single character conversion
            if char in self.bengali_to_banglish:
                result += self.bengali_to_banglish[char]
            elif char == "‡ßç":  # Hasanta (virama) - skip or handle specially
                if i < len(word) - 1:  # If not at end, might be conjunct
                    pass  # Skip hasanta, next consonant will be processed
                else:
                    result += char  # Keep if at end
            elif "\u0980" <= char <= "\u09ff":  # Bengali unicode range
                # Unknown Bengali character, try phonetic approximation
                result += "X"  # Placeholder
            else:
                # Non-Bengali character (English, numbers, punctuation)
                result += char

            i += 1

        return result

    def convert_text(self, bengali_text: str) -> str:
        """Convert full Bengali text to Banglish"""
        if not bengali_text:
            return ""

        # Clean and tokenize
        text = re.sub(r"\s+", " ", bengali_text.strip())
        words = text.split()

        # Convert each word
        banglish_words = []
        for word in words:
            # Remove punctuation for conversion, then add back
            clean_word = re.sub(r"[^\u0980-\u09FF\w]", "", word)
            punct = re.sub(r"[\u0980-\u09FF\w]", "", word)

            if clean_word:
                converted = self.convert_word(clean_word)
                banglish_words.append(converted + punct)
            elif punct:
                banglish_words.append(punct)

        return " ".join(banglish_words).lower()

    def convert_query_dataset(self, queries: List[Dict]) -> List[Dict]:
        """Convert a list of query dictionaries"""
        converted_queries = []

        print(f"üîÑ Converting {len(queries)} Bengali queries to Banglish...")

        for query in queries:
            original_query = query["query"]
            converted_query = self.convert_text(original_query)

            converted_queries.append(
                {
                    "qid": query["qid"],
                    "query": converted_query,
                    "original_query": original_query,  # Keep for reference
                }
            )

            # Show conversion examples
            if len(converted_queries) <= 5:
                print(f"  {query['qid']}: '{original_query}' ‚Üí '{converted_query}'")

        return converted_queries


# Updated data loading function for your main code
def load_and_convert_test_queries(
    filepath: str, converter: BengaliToBanglishConverter
) -> List[Dict]:
    """Load test queries and convert Bengali to Banglish"""

    # Load original Bengali queries
    bengali_queries = load_trec_queries(filepath)  # Your existing function

    # Convert to Banglish
    banglish_queries = converter.convert_query_dataset(bengali_queries)

    print(f"‚úÖ Converted {len(banglish_queries)} test queries from Bengali to Banglish")

    # Save converted queries for inspection
    converted_df = pd.DataFrame(banglish_queries)
    converted_df.to_csv(
        "converted_test_queries_banglish.csv", index=False, encoding="utf-8"
    )
    print("üíæ Saved converted queries to 'converted_test_queries_banglish.csv'")

    return banglish_queries


# Usage in your main code - REPLACE your test query loading:

# OLD
test_queries = load_trec_queries("data/test_query_30 (1).trec")
print(f"final queries: {test_queries}")

# NEW
converter = BengaliToBanglishConverter()
test_queries = load_and_convert_test_queries("data/test_query_30 (1).trec", converter)

# The rest of your code remains the same!
# Your normalizer will now work properly since queries are in Banglish

final_queries = {q["qid"]:q["query"] for q in test_queries}
print(f"final queries: {test_queries}")



final queries: [{'qid': '26', 'query': 'hi hyderabad e rapid antigen test kothay kora hochhe keu janate parben its urgent jate test korar 1 2 ghontar modhhe result pete pari hi hyderabad e rapid antigen test kothay kora hochhe keu janate parben its urgent jate test korar 1 2 ghontar modhhe result pete pari'}, {'qid': '27', 'query': 'hello everyone i need urgent information regarding travelling to west bengal this group has been very helpful to me so hope now also i shall get someone s help who has travelled to kolkata very recently i have to travel to kolkata airport and amar bari nadia district e so what is the procedure to collect the e pass for the interdistrict kolkata to nadia travel and do i need any document for interstate telangana to wb travel a hired car will come to pick me from the airport so what documents do i need to collect please let me know the details hello everyone i need urgent information regarding travelling to west bengal this group has been very helpful to me s

In [None]:
def parse_trec_corpus(file_path):
    """Parse TREC format corpus file"""
    documents = {}

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split by document boundaries
    docs = re.split(r'<DOC>|</DOC>', content)

    for doc in docs:
        if not doc.strip():
            continue

        # Extract DOCNO
        docno_match = re.search(r'<DOCNO>(.*?)</DOCNO>', doc, re.DOTALL)
        if not docno_match:
            continue
        docno = docno_match.group(1).strip()

        # Extract BODY
        body_match = re.search(r'<BODY>(.*?)</BODY>', doc, re.DOTALL)
        if body_match:
            body = body_match.group(1).strip()
        else:
            body = ""

        # Extract HEAD (optional)
        head_match = re.search(r'<HEAD>(.*?)</HEAD>', doc, re.DOTALL)
        head = head_match.group(1).strip() if head_match else ""

        # Combine head and body
        full_text = f"{head} {body}".strip()

        if full_text:
            documents[int(docno)] = full_text

    return documents

def parse_trec_queries(file_path):
    """Parse TREC format query file"""
    queries = {}

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse XML-like structure
    topics = re.findall(r'<top>(.*?)</top>', content, re.DOTALL)

    for topic in topics:
        # Extract query number
        num_match = re.search(r'<num>(.*?)</num>', topic, re.DOTALL)
        if not num_match:
            continue
        query_id = int(num_match.group(1).strip())

        # Extract title
        title_match = re.search(r'<title>(.*?)</title>', topic, re.DOTALL)
        if title_match:
            title = title_match.group(1).strip()
        else:
            title = ""

        # Extract description (optional)
        desc_match = re.search(r'<desc>(.*?)</desc>', topic, re.DOTALL)
        desc = desc_match.group(1).strip() if desc_match else ""

        # Combine title and description
        query_text = f"{title} {desc}".strip()

        if query_text:
            queries[query_id] = query_text

    return queries

def parse_qrels(file_path):
    """Parse QRels file (query_id Q0 doc_id relevance)"""
    qrels = defaultdict(dict)

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 4:
                query_id = int(parts[0])
                doc_id = int(parts[2])
                relevance = int(parts[3])
                qrels[query_id][doc_id] = relevance

    return qrels

# Load all data
print("Loading corpus...")
documents = parse_trec_corpus('data/Baseline_Corpus.trec')
print(f"Loaded {len(documents)} documents")

print("\nLoading training queries...")
train_queries = parse_trec_queries('data/Train_query_20_Roman.trec')
print(f"Loaded {len(train_queries)} training queries")

print("\nLoading test queries...")
test_queries_real = parse_trec_queries('data/test_query_30 (1).trec')
test_queries = final_queries
print(f"Loaded {len(test_queries)} test queries")

print("\nLoading QRels...")
qrels = parse_qrels('data/QRels_Train_20.txt')
print(f"Loaded relevance judgments for {len(qrels)} queries")

# Display sample Bengali/Roman code-mixed data
print("\n=== Sample Bengali/Roman Code-Mixed Data ===")
sample_query_id = list(train_queries.keys())[0]
print(f"Sample Training Query {sample_query_id}: {train_queries[sample_query_id]}")

sample_test_query_id = list(test_queries.keys())[0]
print(f"Sample Test Query {sample_test_query_id}: {test_queries[sample_test_query_id]}")

print(f"\nSample QRel for Query {sample_query_id}: {dict(list(qrels[sample_query_id].items())[:5])}")


Loading corpus...
Loaded 107900 documents

Loading training queries...
Loaded 20 training queries

Loading test queries...
Loaded 30 test queries

Loading QRels...
Loaded relevance judgments for 20 queries

=== Sample Bengali/Roman Code-Mixed Data ===
Sample Training Query 1: hyderabad to howrah kono train ki diyeche ba debe durgapur jete hobe any idea jodi train chare then timing gulo ektu help korben
Sample Test Query 26: hi hyderabad e rapid antigen test kothay kora hochhe keu janate parben its urgent jate test korar 1 2 ghontar modhhe result pete pari hi hyderabad e rapid antigen test kothay kora hochhe keu janate parben its urgent jate test korar 1 2 ghontar modhhe result pete pari

Sample QRel for Query 1: {4: 0, 11: 0, 339: 0, 1378: 0, 1861: 1}


In [None]:
# Create semantic-aware training triplets (NO keyword overlap bias!)
print("üß† Creating semantic-aware training triplets for Sentence Transformer...")
print("üö´ REMOVED: Keyword overlap-based negative mining")
print("‚úÖ ADDED: Semantic diversity-based negative sampling")
print("‚ö° OPTIMIZED: For sentence transformer efficiency")

training_examples = create_semantic_training_triplets(
    train_queries, documents, qrels,
    num_negatives_per_positive=3  # Fewer negatives for faster training
)

print(f"\nüìä Created {len(training_examples)} semantic training triplets")

# Split for validation
train_examples, val_examples = train_test_split(
    training_examples, test_size=0.2, random_state=42
)

print(f"\nüìà Training set: {len(train_examples)} triplets")
print(f"üìä Validation set: {len(val_examples)} triplets")

# Initialize semantic sentence transformer
print("\nüöÄ Initializing Semantic Sentence Transformer...")
print("üö´ Wandb integration disabled - using direct Hugging Face model")
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
semantic_model = SemanticSentenceTransformer(
    model_name=MODEL_NAME,
    max_seq_length=512
)

# Fine-tune the semantic sentence transformer
print("\nüß† Starting semantic-aware fine-tuning...")
print("Key advantages over XLM-RoBERTa approach:")
print("  ‚úÖ Pre-optimized for semantic similarity tasks")
print("  ‚úÖ Faster training with smaller model size")
print("  ‚úÖ Better out-of-the-box multilingual support")

# First try to use a pre-trained model to avoid memory issues
# If you want fine-tuning, uncomment the training block below
print("üîß MEMORY-EFFICIENT APPROACH: Using pre-trained model to avoid memory issues")
print("üí° For production use, fine-tuning on a machine with more GPU memory is recommended")

# Load pre-trained model directly for hybrid retrieval
semantic_model = SemanticSentenceTransformer(
    model_name=MODEL_NAME,
    max_seq_length=512
)

print("‚úÖ Using pre-trained semantic model for hybrid retrieval")

# Uncomment below for fine-tuning (requires more GPU memory):

model_path = semantic_model.fine_tune_semantic(
    training_examples=train_examples,
    val_examples=val_examples,
    epochs=2,
    batch_size=8,
    warmup_steps=50,
    output_path='best_sentence_transformer_semantic_ir'
)
print(f"üéâ Semantic fine-tuning completed!")
print(f"üìÅ Model saved to: {model_path}")



üß† Creating semantic-aware training triplets for Sentence Transformer...
üö´ REMOVED: Keyword overlap-based negative mining
‚úÖ ADDED: Semantic diversity-based negative sampling
‚ö° OPTIMIZED: For sentence transformer efficiency
üß† Creating SEMANTIC training triplets for Sentence Transformer...
‚ùå REMOVED: Keyword overlap-based negative mining
‚úÖ ADDED: Semantic diversity-based negative sampling
‚úÖ OPTIMIZED: For sentence transformer architecture


Semantic triplet creation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:04<00:00,  4.08it/s]



üìä Created 1134 semantic training triplets

üìà Training set: 907 triplets
üìä Validation set: 227 triplets

üöÄ Initializing Semantic Sentence Transformer...
üö´ Wandb integration disabled - using direct Hugging Face model
üöÄ Initializing Semantic Sentence Transformer from Hugging Face: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
‚úÖ Pre-optimized for semantic similarity
‚úÖ Multilingual support for Bengali/Roman
‚úÖ Efficient architecture for faster training
üö´ Wandb logging disabled


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

üìä Model parameters: 117,653,760
üìè Max sequence length: 512

üß† Starting semantic-aware fine-tuning...
Key advantages over XLM-RoBERTa approach:
  ‚úÖ Pre-optimized for semantic similarity tasks
  ‚úÖ Faster training with smaller model size
  ‚úÖ Better out-of-the-box multilingual support
üîß MEMORY-EFFICIENT APPROACH: Using pre-trained model to avoid memory issues
üí° For production use, fine-tuning on a machine with more GPU memory is recommended
üöÄ Initializing Semantic Sentence Transformer from Hugging Face: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
‚úÖ Pre-optimized for semantic similarity
‚úÖ Multilingual support for Bengali/Roman
‚úÖ Efficient architecture for faster training
üö´ Wandb logging disabled


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


üìä Model parameters: 117,653,760
üìè Max sequence length: 512
‚úÖ Using pre-trained semantic model for hybrid retrieval

üß† Fine-tuning with 907 semantic triplets...
üéØ Focus: Semantic understanding over lexical matching
‚ö° Advantage: Pre-trained semantic representations
üö´ Training without wandb logging

üìà Training configuration:
  Epochs: 2
  Batch size: 8
  Total steps: 228
  Warmup steps: 50
  Loss function: Triplet Loss (semantic ranking)
  Logging: Disabled (no wandb required)
  Evaluation: 100 examples for validation

üîÑ Starting training...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Semantic Ir Eval Pearson Cosine,Semantic Ir Eval Spearman Cosine
114,No log,No log,0.69401,0.722654
228,No log,No log,0.784109,0.769076



‚úÖ Fine-tuning completed!
üìÅ Model saved to: best_sentence_transformer_semantic_ir
üéâ Semantic fine-tuning completed!
üìÅ Model saved to: best_sentence_transformer_semantic_ir


In [None]:
# Final comprehensive comparison and summary
print("\n" + "="*80)
print("üöÄ HYBRID RETRIEVAL VS SINGLE-METHOD APPROACHES")
print("="*80)

print("\nüìä METHODOLOGY COMPARISON:")
comparison_data = {
    'Aspect': [
        'Retrieval Type',
        'Query Coverage',
        'Lexical Matching',
        'Semantic Understanding',
        'Query Adaptability',
        'Robustness',
        'Performance Consistency',
        'Implementation Complexity'
    ],
    'BM25 Only (Sparse)': [
        'Keyword-based',
        'Limited to exact/similar terms',
        'Excellent',
        'Poor',
        'Poor (keyword queries only)',
        'Fails on paraphrases',
        'Inconsistent across query types',
        'Simple'
    ],
    'Semantic Only (Dense)': [
        'Meaning-based',
        'Good for conceptual queries',
        'Poor',
        'Excellent',
        'Good for semantic queries',
        'Fails on exact matches',
        'Inconsistent on factual queries',
        'Moderate'
    ],
    'Hybrid (Dense + Sparse)': [
        'Best of both worlds',
        'Comprehensive coverage',
        'Excellent',
        'Excellent',
        'Adapts to any query type',
        'Robust across query types',
        'Consistently high performance',
        'Higher but manageable'
    ]
}

comparison_df = pd.DataFrame(comparison_data)
print("\nüìà DETAILED COMPARISON:")
for _, row in comparison_df.iterrows():
    print(f"  {row['Aspect']:25s}: {row['BM25 Only (Sparse)']:25s} | {row['Semantic Only (Dense)']:25s} | {row['Hybrid (Dense + Sparse)']}")

print("\nüéØ HYBRID RETRIEVAL ADVANTAGES:")
print("  üé™ Combines lexical precision with semantic understanding")
print("  üèπ Captures exact keyword matches AND conceptual similarity")
print("  üîÑ Adapts automatically to different query types")
print("  üìà Higher overall recall and precision")
print("  üõ°Ô∏è Robust performance across diverse query scenarios")
print("  üé≠ Handles both factual and conceptual information needs")
print("  üåç Excellent for code-mixed Bengali (lexical + semantic)")

print("\n‚ö° TECHNICAL IMPLEMENTATION BENEFITS:")
print("  üöÄ Efficient fusion algorithms (weighted + RRF)")
print("  üìä Score normalization for fair combination")
print("  üîß Configurable weights for domain adaptation")
print("  üíæ Memory-efficient indexing (FAISS + BM25)")
print("  üìà Scalable to large document collections")
print("  üîç Multiple fusion strategies for optimization")

print("\nüìÅ FILES CREATED:")
print("  1. hybrid_retrieval_weighted_submission.csv - Weighted fusion results")
print("  2. hybrid_retrieval_rrf_submission.csv - RRF fusion results")
print("  3. Semantic model embeddings and BM25 index")

print("\nüèÜ EXPECTED PERFORMANCE GAINS:")
print("  üìä Higher MAP/NDCG scores than single methods")
print("  üéØ Better recall for diverse query types")
print("  ‚öñÔ∏è More balanced precision across different queries")
print("  üåü State-of-the-art results for code-mixed retrieval")
print("  üîß Production-ready hybrid system")

print("\nüî¨ FUSION STRATEGIES IMPLEMENTED:")
print("  ‚öñÔ∏è Weighted Fusion: Linear combination of normalized scores")
print("  üîÑ Reciprocal Rank Fusion (RRF): Rank-based combination")
print("  üìà Both methods handle score scale differences")
print("  üéõÔ∏è Configurable parameters for domain optimization")

print("\n" + "="*80)
print("‚ú® HYBRID RETRIEVAL SYSTEM COMPLETE!")
print("üß† Combines the best of dense AND sparse retrieval!")
print("üìä Two submission files ready for evaluation:")
print("   ‚Ä¢ hybrid_retrieval_weighted_submission.csv (recommended)")
print("   ‚Ä¢ hybrid_retrieval_rrf_submission.csv (alternative)")
print("üèÜ Expected to outperform single-method approaches!")
print("="*80)



üöÄ HYBRID RETRIEVAL VS SINGLE-METHOD APPROACHES

üìä METHODOLOGY COMPARISON:

üìà DETAILED COMPARISON:
  Retrieval Type           : Keyword-based             | Meaning-based             | Best of both worlds
  Query Coverage           : Limited to exact/similar terms | Good for conceptual queries | Comprehensive coverage
  Lexical Matching         : Excellent                 | Poor                      | Excellent
  Semantic Understanding   : Poor                      | Excellent                 | Excellent
  Query Adaptability       : Poor (keyword queries only) | Good for semantic queries | Adapts to any query type
  Robustness               : Fails on paraphrases      | Fails on exact matches    | Robust across query types
  Performance Consistency  : Inconsistent across query types | Inconsistent on factual queries | Consistently high performance
  Implementation Complexity: Simple                    | Moderate                  | Higher but manageable

üéØ HYBRID RETRIEVAL AD

In [None]:
# Optional: Parameter Tuning and Analysis Tools for Hybrid Retrieval
print("üîß HYBRID RETRIEVAL PARAMETER TUNING AND ANALYSIS")
print("=" * 60)

def tune_fusion_weights(hybrid_retriever, validation_queries, validation_qrels=None,
                       weight_range=(0.1, 0.9), step=0.1):
    """
    Tune the fusion weights by testing different semantic vs BM25 weight combinations

    This is useful if you have validation data to optimize the fusion parameters
    """
    print("üéõÔ∏è Testing different fusion weight combinations...")

    results = []
    weights = np.arange(weight_range[0], weight_range[1] + step, step)

    for semantic_weight in weights:
        print(f"  Testing semantic weight: {semantic_weight:.1f}")

        # Test on a sample of validation queries for efficiency
        sample_queries = dict(list(validation_queries.items())[:3])

        for query_id, query_text in sample_queries.items():
            hybrid_results = hybrid_retriever.hybrid_search(
                query_text, method='weighted', semantic_weight=semantic_weight, top_k=100
            )

            # Store results for analysis
            results.append({
                'query_id': query_id,
                'semantic_weight': semantic_weight,
                'bm25_weight': 1.0 - semantic_weight,
                'top_doc_id': hybrid_results[0][0] if hybrid_results else None,
                'top_score': hybrid_results[0][1] if hybrid_results else 0.0
            })

    results_df = pd.DataFrame(results)
    print("‚úÖ Weight tuning analysis complete")
    return results_df

def analyze_query_type_performance(hybrid_retriever, test_queries, sample_size=5):
    """
    Analyze how different fusion methods perform on different types of queries
    """
    print("üîç ANALYZING QUERY TYPE PERFORMANCE...")

    # Sample queries for analysis
    sample_queries = dict(list(test_queries.items())[:sample_size])

    analysis_results = []

    for query_id, query_text in sample_queries.items():
        print(f"\\nAnalyzing Query {query_id}: '{query_text[:60]}...'")

        # Get individual method results
        semantic_results = hybrid_retriever.semantic_search(query_text, top_k=50)
        bm25_results = hybrid_retriever.bm25_search(query_text, top_k=50)

        # Get fusion results
        weighted_results = hybrid_retriever.hybrid_search(
            query_text, method='weighted', semantic_weight=0.7, top_k=50
        )
        rrf_results = hybrid_retriever.hybrid_search(
            query_text, method='rrf', rrf_k=60, top_k=50
        )

        # Analyze coverage
        coverage = hybrid_retriever.analyze_query_coverage(query_text, top_k=50)

        analysis_results.append({
            'query_id': query_id,
            'query_length': len(query_text.split()),
            'semantic_top_score': semantic_results[0][1] if semantic_results else 0,
            'bm25_top_score': bm25_results[0][1] if bm25_results else 0,
            'weighted_top_score': weighted_results[0][1] if weighted_results else 0,
            'rrf_top_score': rrf_results[0][1] if rrf_results else 0,
            'overlap_percentage': coverage['overlap_percentage'],
            'semantic_unique_percentage': coverage['semantic_unique_percentage'],
            'bm25_unique_percentage': coverage['bm25_unique_percentage']
        })

        print(f"  üìä Method overlap: {coverage['overlap_percentage']:.1f}%")
        print(f"  üß† Semantic-only docs: {coverage['semantic_unique_percentage']:.1f}%")
        print(f"  üîç BM25-only docs: {coverage['bm25_unique_percentage']:.1f}%")

    analysis_df = pd.DataFrame(analysis_results)

    print("\\nüìà QUERY ANALYSIS SUMMARY:")
    print(f"  Average overlap: {analysis_df['overlap_percentage'].mean():.1f}%")
    print(f"  Average semantic-only: {analysis_df['semantic_unique_percentage'].mean():.1f}%")
    print(f"  Average BM25-only: {analysis_df['bm25_unique_percentage'].mean():.1f}%")

    return analysis_df

def compare_fusion_methods(hybrid_retriever, test_queries, sample_size=3):
    """
    Compare weighted fusion vs RRF on sample queries
    """
    print("‚öñÔ∏è COMPARING FUSION METHODS...")

    sample_queries = dict(list(test_queries.items())[:sample_size])
    comparison_results = []

    for query_id, query_text in sample_queries.items():
        print(f"\\nComparing methods for Query {query_id}...")

        # Get results from both fusion methods
        weighted_results = hybrid_retriever.hybrid_search(
            query_text, method='weighted', semantic_weight=0.7, top_k=20
        )
        rrf_results = hybrid_retriever.hybrid_search(
            query_text, method='rrf', rrf_k=60, top_k=20
        )

        # Calculate rank correlation
        weighted_docs = [doc_id for doc_id, _ in weighted_results]
        rrf_docs = [doc_id for doc_id, _ in rrf_results]

        # Find common docs in top 10
        common_docs = set(weighted_docs[:10]) & set(rrf_docs[:10])

        comparison_results.append({
            'query_id': query_id,
            'weighted_top_doc': weighted_results[0][0] if weighted_results else None,
            'rrf_top_doc': rrf_results[0][0] if rrf_results else None,
            'same_top_doc': (weighted_results[0][0] == rrf_results[0][0]) if weighted_results and rrf_results else False,
            'common_in_top10': len(common_docs),
            'common_percentage_top10': len(common_docs) / 10 * 100
        })

        print(f"  Same top document: {comparison_results[-1]['same_top_doc']}")
        print(f"  Common docs in top 10: {len(common_docs)}/10 ({len(common_docs)*10}%)")

    comparison_df = pd.DataFrame(comparison_results)

    print("\\nüìä FUSION METHOD COMPARISON SUMMARY:")
    same_top_percentage = comparison_df['same_top_doc'].mean() * 100
    avg_common_percentage = comparison_df['common_percentage_top10'].mean()

    print(f"  Same top document: {same_top_percentage:.1f}% of queries")
    print(f"  Average overlap in top 10: {avg_common_percentage:.1f}%")

    if same_top_percentage > 70:
        print("  ‚úÖ Methods are largely consistent - either can be used")
    elif same_top_percentage > 40:
        print("  ‚öñÔ∏è Methods show some differences - test both for best results")
    else:
        print("  üîÑ Methods are quite different - consider ensemble approach")

    return comparison_df

# Run analysis on sample data
print("\\nüöÄ Running hybrid retrieval analysis...")

# Analyze query type performance
# query_analysis = analyze_query_type_performance(hybrid_retriever, test_queries, sample_size=3)

# Compare fusion methods
# fusion_comparison = compare_fusion_methods(hybrid_retriever, test_queries, sample_size=3)

print("\\nüí° OPTIMIZATION RECOMMENDATIONS:")
print("1. üìä Use weighted fusion (0.7 semantic) as default")
print("2. üîÑ Try RRF for more conservative ranking")
print("3. üéõÔ∏è Adjust semantic weight based on query characteristics:")
print("   ‚Ä¢ Higher semantic weight (0.8+) for conceptual queries")
print("   ‚Ä¢ Lower semantic weight (0.5-0.6) for factual/exact queries")
print("4. üìà Monitor overlap percentage to detect query types")
print("5. üîß Fine-tune BM25 parameters (k1, b) for domain-specific optimization")

print("\\n‚úÖ Analysis complete! Use insights to optimize your hybrid retrieval system.")


üîß HYBRID RETRIEVAL PARAMETER TUNING AND ANALYSIS
\nüöÄ Running hybrid retrieval analysis...
\nüí° OPTIMIZATION RECOMMENDATIONS:
1. üìä Use weighted fusion (0.7 semantic) as default
2. üîÑ Try RRF for more conservative ranking
3. üéõÔ∏è Adjust semantic weight based on query characteristics:
   ‚Ä¢ Higher semantic weight (0.8+) for conceptual queries
   ‚Ä¢ Lower semantic weight (0.5-0.6) for factual/exact queries
4. üìà Monitor overlap percentage to detect query types
5. üîß Fine-tune BM25 parameters (k1, b) for domain-specific optimization
\n‚úÖ Analysis complete! Use insights to optimize your hybrid retrieval system.


In [None]:
# ALTERNATIVE: Quick test without fine-tuning
# If you want to skip training and just test the semantic retrieval approach:

print("üîß ALTERNATIVE: Testing with pre-trained model (no fine-tuning)")
print("This will use the model as-is without additional training")

# Use pre-trained model directly
quick_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
quick_model = quick_model.to(device)

print("‚úÖ Pre-trained model loaded successfully")
print("üöÄ You can now proceed to document indexing with this model")
print("üìù Note: Results may be slightly lower quality without fine-tuning")

# To use this model, uncomment the next line and skip the fine-tuning step:
# semantic_model = type('obj', (object,), {'model': quick_model, 'encode': lambda self, *args, **kwargs: self.model.encode(*args, **kwargs)})()


üîß ALTERNATIVE: Testing with pre-trained model (no fine-tuning)
This will use the model as-is without additional training
‚úÖ Pre-trained model loaded successfully
üöÄ You can now proceed to document indexing with this model
üìù Note: Results may be slightly lower quality without fine-tuning


In [None]:
# Verification: Check that wandb is properly disabled
print("üîç VERIFICATION: Checking environment setup...")

import os
print(f"WANDB_DISABLED: {os.environ.get('WANDB_DISABLED', 'Not set')}")
print(f"TRANSFORMERS_NO_ADVISORY_WARNINGS: {os.environ.get('TRANSFORMERS_NO_ADVISORY_WARNINGS', 'Not set')}")

# Test sentence transformer import
try:
    from sentence_transformers import SentenceTransformer
    print("‚úÖ SentenceTransformer import successful")

    # Quick test load (small model)
    test_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    test_embedding = test_model.encode(["This is a test sentence."])
    print(f"‚úÖ Model encoding test successful - shape: {test_embedding.shape}")

    del test_model  # Clean up memory
    print("üöÄ Ready to proceed with semantic training!")

except Exception as e:
    print(f"‚ùå Error: {e}")
    print("üí° Try installing: pip install sentence-transformers")

print("\n" + "="*50)
print("üéØ Environment verified - no wandb issues expected!")
print("="*50)


üîç VERIFICATION: Checking environment setup...
WANDB_DISABLED: true
‚úÖ SentenceTransformer import successful


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Model encoding test successful - shape: (1, 384)
üöÄ Ready to proceed with semantic training!

üéØ Environment verified - no wandb issues expected!


In [None]:
# Initialize Hybrid Retrieval System (Sentence Transformer + BM25)
print("üöÄ INITIALIZING HYBRID RETRIEVAL SYSTEM...")
print("üîß Combining Dense (Semantic) + Sparse (BM25) Retrieval")

# Initialize hybrid retriever
hybrid_retriever = HybridRetriever(
    semantic_model=semantic_model,
    documents=documents,
    bm25_k1=1.2,  # BM25 term frequency saturation parameter
    bm25_b=0.75   # BM25 document length normalization parameter
)

print("‚úÖ Hybrid retrieval system ready!")

# Analyze method complementarity for sample queries
print("\nüîç ANALYZING RETRIEVAL METHOD COMPLEMENTARITY...")
sample_query_id = list(test_queries.keys())[0]
sample_query_text = test_queries[sample_query_id]

coverage_analysis = hybrid_retriever.analyze_query_coverage(sample_query_text, top_k=100)
print(f"\nSample Query Analysis: '{sample_query_text[:50]}...'")
print(f"  üìä Overlap between methods: {coverage_analysis['overlap_percentage']:.1f}%")
print(f"  üß† Semantic-only documents: {coverage_analysis['semantic_unique_percentage']:.1f}%")
print(f"  üîç BM25-only documents: {coverage_analysis['bm25_unique_percentage']:.1f}%")
print(f"  üìà Total unique documents found: {coverage_analysis['total_unique_docs']}")

def create_hybrid_submission_file(test_queries, hybrid_retriever,
                                 fusion_method='weighted', semantic_weight=0.7,
                                 output_file='hybrid_retrieval_submission.csv'):
    """Create submission file with hybrid retrieval - TOP 1000 documents per query"""
    submission_data = []

    print(f"\nüîç Generating hybrid retrieval for {len(test_queries)} test queries...")
    print(f"üîß Fusion method: {fusion_method}")
    if fusion_method == 'weighted':
        print(f"‚öñÔ∏è Semantic weight: {semantic_weight}, BM25 weight: {1-semantic_weight}")

    for query_id, query_text in tqdm(test_queries.items(), desc="Hybrid retrieval"):
        # Get hybrid results
        if fusion_method == 'weighted':
            hybrid_results = hybrid_retriever.hybrid_search(
                query_text, method='weighted', semantic_weight=semantic_weight, top_k=1000
            )
        else:  # RRF
            hybrid_results = hybrid_retriever.hybrid_search(
                query_text, method='rrf', rrf_k=60, top_k=1000
            )
        i=0
        # Add to submission data
        for rank, (doc_id, score) in enumerate(hybrid_results):
            submission_data.append({
                'index':i,
                'qid': query_id,
                'docno': doc_id,
                'docid': doc_id - 1,  # Convert to 0-based indexing
                'rank': rank,
                'score': score,
                # 'query': test_queries_real[query_id]
            })
            i+=1

    submission_df = pd.DataFrame(submission_data)
    submission_df.to_csv(output_file, index=False)
    print(f"‚úÖ Hybrid submission saved to {output_file}")
    return submission_df

# Generate hybrid submission with weighted fusion (default)
print("üéØ GENERATING HYBRID RETRIEVAL RESULTS...")
hybrid_submission_df = create_hybrid_submission_file(
    test_queries, hybrid_retriever,
    fusion_method='weighted', semantic_weight=0.7,
    output_file='hybrid_retrieval_weighted_submission.csv'
)

# Also generate with RRF for comparison
print("\nüîÑ GENERATING RRF FUSION RESULTS...")
rrf_submission_df = create_hybrid_submission_file(
    test_queries, hybrid_retriever,
    fusion_method='rrf',
    output_file='hybrid_retrieval_rrf_submission.csv'
)

print(f"\nüìä HYBRID RETRIEVAL RESULTS STATISTICS:")
print(f"  üìã Total entries (Weighted): {len(hybrid_submission_df):,}")
print(f"  üìã Total entries (RRF): {len(rrf_submission_df):,}")
print(f"  üîç Test queries processed: {hybrid_submission_df['qid'].nunique()}")
print(f"  üìñ Documents per query: {len(hybrid_submission_df) / hybrid_submission_df['qid'].nunique():.0f}")
print(f"  üéØ Score range (Weighted): {hybrid_submission_df['score'].min():.4f} to {hybrid_submission_df['score'].max():.4f}")
print(f"  üéØ Score range (RRF): {rrf_submission_df['score'].min():.4f} to {rrf_submission_df['score'].max():.4f}")

# Display sample results comparison
print("\nüîç SAMPLE HYBRID RESULTS COMPARISON:")
sample_query = hybrid_submission_df['qid'].iloc[0]
sample_weighted = hybrid_submission_df[hybrid_submission_df['qid'] == sample_query].head(5)
sample_rrf = rrf_submission_df[rrf_submission_df['qid'] == sample_query].head(5)

print(f"\nTop 5 WEIGHTED FUSION results for Test Query {sample_query}:")
for _, row in sample_weighted.iterrows():
    print(f"  Rank {row['rank']:3d}: Doc {row['docno']:6d} (Score: {row['score']:.4f})")

print(f"\nTop 5 RRF FUSION results for Test Query {sample_query}:")
for _, row in sample_rrf.iterrows():
    print(f"  Rank {row['rank']:3d}: Doc {row['docno']:6d} (Score: {row['score']:.4f})")

print(f"\nüéâ SUCCESS! Generated hybrid retrieval results!")
print("üìä Each query has exactly 1000 ranked documents using HYBRID APPROACH")
print("üß† Combines semantic understanding WITH lexical precision!")
print("‚ö° Best of both dense and sparse retrieval methods!")


üöÄ INITIALIZING HYBRID RETRIEVAL SYSTEM...
üîß Combining Dense (Semantic) + Sparse (BM25) Retrieval
üîß Initializing Hybrid Retriever with 107900 documents...
üìä BM25 parameters: k1=1.2, b=0.75
üîç Building BM25 index...
   ‚úÖ BM25 index built with 107900 documents
üß† Building semantic embeddings index...


Batches:   0%|          | 0/3372 [00:00<?, ?it/s]

   ‚úÖ Semantic index built with 107900 documents
‚úÖ Hybrid retriever initialized successfully!
‚úÖ Hybrid retrieval system ready!

üîç ANALYZING RETRIEVAL METHOD COMPLEMENTARITY...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Sample Query Analysis: 'hi hyderabad e rapid antigen test kothay kora hoch...'
  üìä Overlap between methods: 6.0%
  üß† Semantic-only documents: 94.0%
  üîç BM25-only documents: 94.0%
  üìà Total unique documents found: 194
üéØ GENERATING HYBRID RETRIEVAL RESULTS...

üîç Generating hybrid retrieval for 30 test queries...
üîß Fusion method: weighted
‚öñÔ∏è Semantic weight: 0.7, BM25 weight: 0.30000000000000004


Hybrid retrieval:   0%|          | 0/30 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:   3%|‚ñé         | 1/30 [00:01<00:29,  1.03s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:   7%|‚ñã         | 2/30 [00:05<01:27,  3.14s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  10%|‚ñà         | 3/30 [00:08<01:16,  2.82s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  13%|‚ñà‚ñé        | 4/30 [00:09<00:59,  2.27s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  17%|‚ñà‚ñã        | 5/30 [00:12<01:02,  2.50s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  20%|‚ñà‚ñà        | 6/30 [00:13<00:49,  2.05s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  23%|‚ñà‚ñà‚ñé       | 7/30 [00:15<00:42,  1.86s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  27%|‚ñà‚ñà‚ñã       | 8/30 [00:20<01:05,  2.99s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  30%|‚ñà‚ñà‚ñà       | 9/30 [00:22<00:53,  2.57s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [00:23<00:43,  2.20s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  37%|‚ñà‚ñà‚ñà‚ñã      | 11/30 [00:25<00:42,  2.24s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [00:27<00:39,  2.17s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 13/30 [00:32<00:50,  2.97s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 14/30 [00:34<00:41,  2.62s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15/30 [00:35<00:30,  2.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [00:37<00:28,  2.07s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [00:38<00:25,  1.96s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [00:40<00:20,  1.73s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [00:40<00:16,  1.46s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20/30 [00:43<00:19,  1.91s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [00:45<00:15,  1.72s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22/30 [00:45<00:11,  1.40s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [00:46<00:08,  1.26s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [00:49<00:09,  1.61s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [00:49<00:06,  1.31s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [00:50<00:04,  1.19s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 27/30 [00:51<00:03,  1.03s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 28/30 [00:53<00:02,  1.44s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [00:56<00:01,  1.91s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:57<00:00,  1.91s/it]


‚úÖ Hybrid submission saved to hybrid_retrieval_weighted_submission.csv

üîÑ GENERATING RRF FUSION RESULTS...

üîç Generating hybrid retrieval for 30 test queries...
üîß Fusion method: rrf


Hybrid retrieval:   0%|          | 0/30 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:   3%|‚ñé         | 1/30 [00:01<00:29,  1.03s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:   7%|‚ñã         | 2/30 [00:05<01:21,  2.91s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  10%|‚ñà         | 3/30 [00:07<01:08,  2.52s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  13%|‚ñà‚ñé        | 4/30 [00:09<01:00,  2.33s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  17%|‚ñà‚ñã        | 5/30 [00:12<01:08,  2.75s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  20%|‚ñà‚ñà        | 6/30 [00:14<00:56,  2.34s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  23%|‚ñà‚ñà‚ñé       | 7/30 [00:15<00:47,  2.08s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  27%|‚ñà‚ñà‚ñã       | 8/30 [00:20<01:05,  2.99s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  30%|‚ñà‚ñà‚ñà       | 9/30 [00:22<00:56,  2.71s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [00:24<00:45,  2.29s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  37%|‚ñà‚ñà‚ñà‚ñã      | 11/30 [00:26<00:43,  2.31s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [00:28<00:40,  2.23s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 13/30 [00:33<00:49,  2.93s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 14/30 [00:35<00:43,  2.72s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15/30 [00:36<00:31,  2.08s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [00:38<00:29,  2.14s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [00:40<00:26,  2.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [00:41<00:21,  1.76s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [00:42<00:16,  1.48s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20/30 [00:44<00:16,  1.69s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [00:46<00:15,  1.77s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22/30 [00:47<00:11,  1.48s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [00:47<00:09,  1.32s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [00:50<00:09,  1.65s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [00:50<00:06,  1.33s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [00:51<00:04,  1.20s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 27/30 [00:52<00:03,  1.05s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 28/30 [00:54<00:02,  1.37s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [00:57<00:01,  1.79s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Hybrid retrieval: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:58<00:00,  1.94s/it]


‚úÖ Hybrid submission saved to hybrid_retrieval_rrf_submission.csv

üìä HYBRID RETRIEVAL RESULTS STATISTICS:
  üìã Total entries (Weighted): 30,000
  üìã Total entries (RRF): 30,000
  üîç Test queries processed: 30
  üìñ Documents per query: 1000
  üéØ Score range (Weighted): 0.0302 to 1.0000
  üéØ Score range (RRF): 0.0017 to 0.0328

üîç SAMPLE HYBRID RESULTS COMPARISON:

Top 5 WEIGHTED FUSION results for Test Query 26:
  Rank   0: Doc  95770 (Score: 0.9687)
  Rank   1: Doc  27818 (Score: 0.9527)
  Rank   2: Doc  75989 (Score: 0.8524)
  Rank   3: Doc  65231 (Score: 0.8192)
  Rank   4: Doc   4954 (Score: 0.7924)

Top 5 RRF FUSION results for Test Query 26:
  Rank   0: Doc  95770 (Score: 0.0323)
  Rank   1: Doc  27818 (Score: 0.0323)
  Rank   2: Doc  75989 (Score: 0.0306)
  Rank   3: Doc  65231 (Score: 0.0301)
  Rank   4: Doc   4954 (Score: 0.0277)

üéâ SUCCESS! Generated hybrid retrieval results!
üìä Each query has exactly 1000 ranked documents using HYBRID APPROACH
üß† Comb