In [15]:
# Environment check: print the GPU model, driver/CUDA versions and current memory usage.
!nvidia-smi

Sun Feb 15 04:25:57 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.88                 Driver Version: 576.88         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 5060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   39C    P8              4W /  180W |    4309MiB /  16311MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
import re
from collections import defaultdict
import json
import pickle
import math

class CLIRIndex:
    """Positional inverted index with BM25 ranking for Bangla ('bn') and English ('en') documents.

    Documents are plain dicts; the keys read by this class are 'title', 'body'
    and 'language'. Postings are kept per term and per language, so a search
    only scores documents of the requested language.
    """

    def __init__(self):
        # doc_id -> the document dict exactly as passed to add_document()
        self.documents = {}
        # term -> language -> list of (doc_id, token_position) postings
        self.index = defaultdict(lambda: defaultdict(list))
        # doc_id -> number of tokens in title + body
        self.doc_lengths = {}
        # language -> number of indexed documents; its keys are the supported languages
        self.num_docs = {'bn': 0, 'en': 0}
        # language -> mean document length in tokens (BM25 length normalisation)
        self.avg_doc_length = {'bn': 0, 'en': 0}
        # next doc_id to assign
        self.doc_counter = 0

    def tokenize(self, text, lang='en'):
        """Lower-case ``text`` and split it into tokens.

        For ``lang == 'bn'`` a token is a run of characters in U+0980..U+09FF;
        for every other language it is a run of ``[a-z0-9]``.
        """
        text = text.lower()
        if lang == 'bn':
            tokens = re.findall(r'[\u0980-\u09FF]+', text)
        else:
            tokens = re.findall(r'[a-z0-9]+', text)
        return tokens

    def add_document(self, doc):
        """Index one document and return its newly assigned doc_id.

        Validation and tokenisation happen before any state is changed, so a
        rejected document leaves the index untouched.

        Raises:
            KeyError: if the document's language is not a key of ``num_docs``.
            AttributeError: if ``doc`` is not a mapping, or title/body is not a string.
        """
        # A missing or null language defaults to English.
        lang = doc.get('language') or 'en'
        if lang not in self.num_docs:
            raise KeyError(f"unsupported language: {lang!r}")

        # A JSON null title/body is treated as empty text.
        title_tokens = self.tokenize(doc.get('title') or '', lang)
        body_tokens = self.tokenize(doc.get('body') or '', lang)
        all_tokens = title_tokens + body_tokens

        # Nothing below can fail, so the index is updated as one unit.
        doc_id = self.doc_counter
        self.doc_counter += 1
        self.documents[doc_id] = doc
        self.doc_lengths[doc_id] = len(all_tokens)

        self.num_docs[lang] += 1
        # Running mean keeps avg_doc_length valid without a full rebuild, so
        # search() works right after add_document().
        self.avg_doc_length[lang] += (
            (len(all_tokens) - self.avg_doc_length[lang]) / self.num_docs[lang]
        )

        for pos, token in enumerate(all_tokens):
            self.index[token][lang].append((doc_id, pos))

        return doc_id

    def _recompute_avg_doc_lengths(self):
        """Recompute ``avg_doc_length`` exactly from the stored per-document lengths."""
        totals = dict.fromkeys(self.num_docs, 0)
        for did, doc in self.documents.items():
            lang = doc.get('language') or 'en'
            if lang in totals:
                # .get tolerates documents that were stored but never indexed
                # (possible in indexes pickled by older code).
                totals[lang] += self.doc_lengths.get(did, 0)

        for lang, count in self.num_docs.items():
            if count > 0:
                self.avg_doc_length[lang] = totals[lang] / count

    def build_from_json_files(self, files_list):
        """Index every line of the given JSON-lines files and print summary statistics.

        Each non-blank line is parsed as one JSON document. Lines that cannot be
        parsed or indexed are skipped (best effort) and counted in the per-file report.
        """
        print("Building index from files...")
        for fpath in files_list:
            print(f"  Processing {fpath}...")
            count = 0
            skipped = 0
            with open(fpath, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        self.add_document(json.loads(line))
                    except Exception:
                        # Deliberately best-effort: one bad line must not abort
                        # the build, but it is counted rather than hidden.
                        skipped += 1
                        continue
                    count += 1
            print(f"  Added {count} documents")
            if skipped:
                print(f"  Skipped {skipped} unreadable lines")

        # Replace the running means with exact values.
        self._recompute_avg_doc_lengths()

        print("-" * 70)
        print("Index built successfully!")
        print("-" * 70)
        print(f"Total documents: {self.doc_counter}")
        print(f"Bangla documents: {self.num_docs['bn']}")
        print(f"English documents: {self.num_docs['en']}")
        print(f"Unique terms: {len(self.index)}")
        print(f"Avg Bangla doc length: {self.avg_doc_length['bn']:.2f} tokens")
        print(f"Avg English doc length: {self.avg_doc_length['en']:.2f} tokens")

    def search(self, query, lang='en', topk=10, k1=1.5, b=0.75):
        """Rank documents of ``lang`` against ``query`` with BM25.

        Args:
            query: free-text query, tokenised with ``tokenize(query, lang)``.
            lang: language whose postings are searched.
            topk: maximum number of results.
            k1, b: BM25 term-saturation and length-normalisation parameters.

        Returns:
            Up to ``topk`` copies of the matching document dicts, best first,
            each with added 'score' and 'docid' keys. Empty when the query has
            no tokens or the language has no documents.
        """
        query_tokens = self.tokenize(query, lang)
        if not query_tokens:
            return []

        scores = defaultdict(float)
        N = self.num_docs.get(lang, 0)

        if N == 0:
            return []

        avgdl = self.avg_doc_length.get(lang, 0)
        if avgdl <= 0:
            # Stale statistics, e.g. an index pickled by older code.
            self._recompute_avg_doc_lengths()
            avgdl = self.avg_doc_length.get(lang, 0)
            if avgdl <= 0:
                # Every document of this language is empty: nothing can match.
                return []

        for term in query_tokens:
            # Short-circuit membership tests avoid creating empty defaultdict entries.
            if term not in self.index or lang not in self.index[term]:
                continue

            postings = self.index[term][lang]

            # One pass yields term frequencies; document frequency is the
            # number of distinct documents seen.
            tf_doc = defaultdict(int)
            for doc_id, _ in postings:
                tf_doc[doc_id] += 1

            df = len(tf_doc)
            idf = math.log((N - df + 0.5) / (df + 0.5) + 1)

            for doc_id, tf in tf_doc.items():
                doc_len = self.doc_lengths[doc_id]
                score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avgdl)))
                scores[doc_id] += score

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topk]

        results = []
        for doc_id, score in ranked:
            doc = self.documents[doc_id].copy()
            doc['score'] = score
            doc['docid'] = doc_id
            results.append(doc)

        return results

    def save(self, fname='clir_index.pkl'):
        """Pickle the index state to ``fname``."""
        # The nested defaultdict uses a lambda factory, which pickle cannot
        # serialise, so it is flattened to plain dicts first.
        index_dict = {}
        for term, lang_dict in self.index.items():
            index_dict[term] = dict(lang_dict)

        with open(fname, 'wb') as f:
            pickle.dump({
                'documents': self.documents,
                'index': index_dict,
                'doc_lengths': self.doc_lengths,
                'num_docs': self.num_docs,
                'avg_doc_length': self.avg_doc_length,
                'doc_counter': self.doc_counter
            }, f)

        print(f"‚úì Index saved to {fname}")

    def load(self, fname='clir_index.pkl'):
        """Replace this index's state with the contents of a file written by ``save``.

        SECURITY: ``pickle.load`` can execute arbitrary code while reading the
        file. Only load index files you created yourself.
        """
        with open(fname, 'rb') as f:
            data = pickle.load(f)

        self.documents = data['documents']

        # Rebuild the nested defaultdict structure that save() flattened.
        self.index = defaultdict(lambda: defaultdict(list))
        for term, lang_dict in data['index'].items():
            self.index[term] = defaultdict(list, lang_dict)

        self.doc_lengths = data['doc_lengths']
        self.num_docs = data['num_docs']
        self.avg_doc_length = data['avg_doc_length']
        self.doc_counter = data['doc_counter']

        print(f"‚úì Index loaded from {fname}")

print("‚úì CLIRIndex class defined!")


‚úì CLIRIndex class defined!


In [3]:
# Load the index you created in Module B
# NOTE(review): this is an absolute path on one specific machine; anyone else
# running the notebook has to edit it to point at their copy of the pickle.
# CLIRIndex.load() reads the file with pickle.load, so only open index files
# you produced yourself.
index = CLIRIndex()
index.load('D:/UG/4-1/DMin/BaECLIR/ModuleB/results/updated/clir_index.pkl')
print("‚úì Index loaded successfully!")


‚úì Index loaded from D:/UG/4-1/DMin/BaECLIR/ModuleB/results/updated/clir_index.pkl
‚úì Index loaded successfully!


In [5]:
# Install sentence transformers for semantic search
!pip install -q sentence-transformers
print("‚úì Installed sentence-transformers")


‚úì Installed sentence-transformers



[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Semantic Search Code

In [6]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load multilingual embedding model (this will download ~500MB model)
# NOTE(review): the size quoted above is not verified here — confirm the actual
# download size of sentence-transformers/LaBSE.
# `semantic_model` is a notebook-level global; semantic_search() below reads it.
print("Loading LaBSE model... (this may take 1-2 minutes)")
semantic_model = SentenceTransformer('sentence-transformers/LaBSE')
print("‚úì Model loaded!")

def semantic_search(query, documents, topk=10, batch_size=100, max_body_chars=300):
    """Rank documents by cosine similarity between query and document embeddings.

    Uses the notebook-level ``semantic_model`` to embed the query and, batch by
    batch, every document. No language filter is applied.

    NOTE: the whole collection is re-encoded on every call, so the cost grows
    with ``len(documents)`` per query.

    Args:
        query: query text.
        documents: mapping of doc_id -> document dict ('title' and 'body' are read).
        topk: maximum number of results.
        batch_size: number of documents encoded per model call (must be >= 1).
        max_body_chars: number of leading body characters embedded with the title.

    Returns:
        Up to ``topk`` copies of the document dicts, most similar first, each
        with added 'score' (cosine similarity) and 'docid' keys.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    print(f"Encoding query: {query}")
    query_embedding = semantic_model.encode(query, convert_to_tensor=True)

    scores = []
    doc_list = list(documents.items())

    # Encode in batches to bound peak memory.
    for start in range(0, len(doc_list), batch_size):
        batch = doc_list[start:start + batch_size]

        doc_ids = [docid for docid, _ in batch]
        # `or ''` turns a null title/body into empty text instead of raising
        # or embedding the literal string "None".
        doc_texts = [
            f"{doc.get('title') or ''} {(doc.get('body') or '')[:max_body_chars]}"
            for _, doc in batch
        ]

        doc_embeddings = semantic_model.encode(doc_texts, convert_to_tensor=True)

        # One tolist() per batch instead of one .item() call per document.
        similarities = util.cos_sim(query_embedding, doc_embeddings)[0].tolist()
        scores.extend(zip(doc_ids, similarities))

    # Sort and get top-k
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)[:topk]

    results = []
    for docid, score in ranked:
        doc = documents[docid].copy()
        doc['score'] = score
        doc['docid'] = docid
        results.append(doc)

    return results

print("‚úì Semantic search function ready!")


  from .autonotebook import tqdm as notebook_tqdm


Loading LaBSE model... (this may take 1-2 minutes)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 1435.85it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: sentence-transformers/LaBSE
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


‚úì Model loaded!
‚úì Semantic search function ready!



Test Semantic Search




In [7]:
# Test with a simple query
# NOTE(review): "balngladesh" is misspelled. Presumably deliberate, to see
# whether embedding search tolerates a typo — confirm before correcting it.
test_query = "balngladesh election"

print("="*70)
print("Testing Semantic Search")
print("="*70)
print(f"Query: {test_query}\n")

# Run semantic search over every indexed document (no language filter)
results = semantic_search(test_query, index.documents, topk=5)

# Show score, truncated title, URL and language for each hit
print("Top 5 Results:\n")
for i, doc in enumerate(results, 1):
    print(f"{i}. Score: {doc['score']:.4f}")
    print(f"   Title: {doc.get('title', 'N/A')[:80]}")
    print(f"   URL: {doc.get('url', 'N/A')}")
    print(f"   Language: {doc.get('language', 'N/A')}")
    print()


Testing Semantic Search
Query: balngladesh election

Encoding query: balngladesh election
Top 5 Results:

1. Score: 0.4884
   Title: Bangladesh Election: Voters react to changing campaign styles
   URL: https://www.dhakatribune.com/bangladesh/election/402547/bangladesh-election-voters-react-to-changing
   Language: en

2. Score: 0.4492
   Title: When politics abandons truth
   URL: https://www.dhakatribune.com/opinion/op-ed/402374/when-politics-abandons-truth
   Language: en

3. Score: 0.4302
   Title: ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶® ‡¶ï‡¶ø ‡¶§‡¶æ‡¶π‡¶≤‡ßá ‡¶Æ‡¶ß‡ßç‡¶Ø ‡¶ì ‡¶°‡¶æ‡¶®‡¶™‡¶®‡ßç‡¶•‡¶æ‡¶∞ ‡¶≤‡ßú‡¶æ‡¶á‡ßü‡ßá‡¶∞ ‡¶¶‡¶ø‡¶ï‡ßá ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá
   URL: https://www.prothomalo.com/opinion/column/qb8a5px0ku
   Language: bn

4. Score: 0.4184
   Title: ‡¶ú‡¶æ‡¶Æ‡¶æ‡ßü‡¶æ‡¶§‡ßá‡¶∞ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶®‡¶ø ‡¶á‡¶∂‡¶§‡ßá‡¶π‡¶æ‡¶∞ ‡¶ò‡ßã‡¶∑‡¶£‡¶æ
   URL: https://www.bd-pratidin.com/current-politics/2026/02/04/1213034
   Language: bn

5. Score: 0.4082
   Title: Populism or pragmatism? What B

Compare BM25 vs Semantic

In [None]:
# Side-by-side comparison of lexical (BM25) and embedding-based ranking.

# Each entry: (query text, language code passed to the BM25 index)
test_queries = [
    ("education system", "en"),
    ("‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ", "bn"),  # education in Bangla
    ("prime minister", "en")
]

for query, lang in test_queries:
    # Banner: rule, query line, rule
    print("="*70, f"Query: '{query}' (Language: {lang})", "="*70, sep="\n")

    # BM25 only looks at postings of the query's own language
    print("\n BM25 Results:")
    bm25_results = index.search(query, lang=lang, topk=5)
    for rank, hit in enumerate(bm25_results, start=1):
        shown_title = hit.get('title', 'N/A')[:60]
        print(f"  {rank}. {shown_title} (Score: {hit['score']:.4f})")

    # Semantic search scores every document regardless of language
    print("\n Semantic Results:")
    semantic_results = semantic_search(query, index.documents, topk=5)
    for rank, hit in enumerate(semantic_results, start=1):
        shown_title = hit.get('title', 'N/A')[:60]
        print(f"  {rank}. {shown_title} (Score: {hit['score']:.4f})")

    print("\n")


Query: 'education system' (Language: en)

üìä BM25 Results:
  1. Online applications for MPO enlistment open till Jan 25 (Score: 9.2043)
  2. Minimum standard of edn a must for MPO: Farruk (Score: 9.1045)
  3. No edn reform commission reflects neglect: BNP (Score: 8.7155)
  4. Disasters disrupt education (Score: 8.7025)
  5. Education policy 2O1O: Keeping the promise (Score: 8.5847)

üß† Semantic Results:
Encoding query: education system
  1. ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞ ‡¶Æ‡¶æ‡¶®‡ßã‡¶®‡ßç‡¶®‡ßü‡¶®‡ßá ‡¶∂‡¶æ‡¶∏‡¶ï‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶≠‡ßÇ‡¶Æ‡¶ø‡¶ï‡¶æ ‡¶®‡ßá‡¶á: ‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶™‡¶ï ‡¶∞‡ßá‡¶π‡¶Æ‡¶æ‡¶® ‡¶∏‡ßã (Score: 0.4355)
  2. ‡¶è‡¶≤‡¶æ‡¶ï‡¶æ‡¶ü‡¶ø ‡¶Ø‡ßá‡¶® ‚Äò‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶∞ ‡¶π‡¶æ‡¶ü‚Äô (Score: 0.4247)
  3. ‚Äò‡¶õ‡ßá‡¶≤‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶ï‡ßá ‡¶∏‡ßá‡¶∞‡¶æ ‡¶Æ‡¶æ‡ßü‡ßá‡¶∞ ‡¶∏‡¶Æ‡ßç‡¶Æ‡¶æ‡¶® ‡¶¶‡¶ø‡ßü‡ßá ‡¶ó‡ßá‡¶≤‚Äô (Score: 0.4048)
  4. ‡¶®‡ßà‡¶§‡¶ø‡¶ï‡¶§‡¶æ‡¶π‡ßÄ‡¶® ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡ßç‡¶•‡¶æ ‡¶™‡¶∂‡ßÅ ‡¶§‡ßà‡¶∞‡¶ø ‡¶ï‡¶∞‡¶õ‡ßá, ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶®‡ßü (Score: 0.399

Add Fuzzy Search

In [9]:
!pip install -q fuzzywuzzy python-Levenshtein


[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:


from fuzzywuzzy import fuzz

def fuzzy_search(query, documents, topk=10):
    """Rank documents by fuzzy (partial-ratio) string similarity to ``query``.

    Each document is scored as 0.7 * title similarity + 0.3 * similarity to the
    first 500 characters of the body, scaled to the range 0..1. Comparison is
    case-insensitive.

    Args:
        query: query text.
        documents: mapping of doc_id -> document dict ('title' and 'body' are read).
        topk: maximum number of results.

    Returns:
        Up to ``topk`` copies of the document dicts, best first, each with
        added 'score' and 'docid' keys. Empty for a blank query.
    """
    # Lower-case the query once instead of twice per document.
    query_lc = query.lower()
    if not query_lc.strip():
        # A blank query matches nothing; avoid returning arbitrary 0-score documents.
        return []

    scores = []

    for docid, doc in documents.items():
        # `or ''` tolerates null title/body values from the source JSON.
        title = (doc.get('title') or '').lower()
        # Slice before lower-casing, same order as the original expression.
        body = (doc.get('body') or '')[:500].lower()

        title_score = fuzz.partial_ratio(query_lc, title)
        body_score = fuzz.partial_ratio(query_lc, body)

        # Weighted combination; partial_ratio is on a 0-100 scale.
        combined = (title_score * 0.7 + body_score * 0.3) / 100
        scores.append((docid, combined))

    # Sort and return top-k
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)[:topk]

    results = []
    for docid, score in ranked:
        doc = documents[docid].copy()
        doc['score'] = score
        doc['docid'] = docid
        results.append(doc)

    return results

print("‚úì Fuzzy search ready!")

# Test it: top-5 fuzzy matches for one English keyword across all documents
print("\n Fuzzy Search Test:")
fuzzy_results = fuzzy_search("education", index.documents, topk=5)
# Print rank, title truncated to 60 characters, and the combined 0-1 score
for i, doc in enumerate(fuzzy_results, 1):
    print(f"{i}. {doc.get('title', 'N/A')[:60]} (Score: {doc['score']:.4f})")


‚úì Fuzzy search ready!

üîç Fuzzy Search Test:
1. Education policy 2O1O: Keeping the promise (Score: 1.0000)
2. Integrated approach needed to address challenges in higher e (Score: 1.0000)
3. Meet Mahmood Moosa Syed, the entrepreneur simplifying intern (Score: 1.0000)
4. Educational institutes to reopen on Feb 22 (Score: 1.0000)
5. IT education essential for students: NU VC (Score: 1.0000)


 Save Your Results

In [None]:
# Save a comparison report
import json

# query text -> {'bm25': [...], 'semantic': [...], 'fuzzy': [...]}
comparison_results = {}

test_queries_final = [
    "Prime Minister of Bangladesh",
    "‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡ßç‡¶•‡¶æ",
    "prison protest",
    "‡¶Ö‡¶∞‡ßç‡¶•‡¶®‡ßÄ‡¶§‡¶ø",
    "dhaka traffic"
]

for query in test_queries_final:
    # A query counts as Bangla as soon as one character lies in U+0980..U+09FF.
    contains_bangla = False
    for ch in query:
        if 0x0980 <= ord(ch) <= 0x09FF:
            contains_bangla = True
            break
    lang = 'bn' if contains_bangla else 'en'

    # Run the three rankers in a fixed order: BM25, semantic, fuzzy.
    per_model = {}
    per_model['bm25'] = index.search(query, lang=lang, topk=5)
    per_model['semantic'] = semantic_search(query, index.documents, topk=5)
    per_model['fuzzy'] = fuzzy_search(query, index.documents, topk=5)
    comparison_results[query] = per_model

# Write the report as readable UTF-8 JSON; non-serialisable values fall back to str().
with open('module_c_comparison_5k_1.json', 'w', encoding='utf-8') as report_file:
    json.dump(comparison_results, report_file, ensure_ascii=False, indent=2, default=str)

print("‚úì Comparison results saved to 'module_c_comparison_5k_1.json'")
# files.download('module_c_comparison.json')


Encoding query: Prime Minister of Bangladesh
Encoding query: ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡ßç‡¶•‡¶æ
Encoding query: prison protest
Encoding query: ‡¶Ö‡¶∞‡ßç‡¶•‡¶®‡ßÄ‡¶§‡¶ø
Encoding query: dhaka traffic
‚úì Comparison results saved to 'module_c_comparison.json'


In [16]:
# Offer the comparison report as a browser download when running in Google
# Colab. The file name matches what the report cell above writes.
try:
    from google.colab import files
except ImportError:
    # Not in Colab (e.g. a local Jupyter kernel): the report is already on disk.
    print("google.colab not available; 'module_c_comparison_5k_1.json' is in the working directory")
else:
    files.download('module_c_comparison_5k_1.json')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Hybrid Ranking

In [None]:
def hybrid_search(query, index, lang='en', topk=10, weights=None):
    """Combine BM25, semantic and fuzzy rankings into one weighted score.

    Each ranker contributes a candidate list. Its scores are divided by that
    list's best score, and the normalised scores are summed with the given
    weights. A document missing from a ranker's list contributes 0 for it.

    Args:
        query: query text.
        index: object with ``search(query, lang=..., topk=...)`` and a
            ``documents`` mapping (a CLIRIndex).
        lang: language passed to the BM25 search; the semantic and fuzzy
            rankers are run over all of ``index.documents``.
        topk: maximum number of results.
        weights: mapping with keys 'bm25', 'semantic', 'fuzzy'. Defaults to
            0.3 / 0.5 / 0.2. A missing key counts as weight 0.

    Returns:
        Up to ``topk`` copies of the document dicts, best first, each with
        added 'score' (combined) and 'docid' keys.
    """
    # None sentinel instead of a shared mutable default dict.
    if weights is None:
        weights = {'bm25': 0.3, 'semantic': 0.5, 'fuzzy': 0.2}

    # At least 50 candidates per ranker, and never fewer than the number of
    # results requested, so topk > 50 can still be filled.
    pool_size = max(50, topk)

    bm25_results = index.search(query, lang=lang, topk=pool_size)
    semantic_results = semantic_search(query, index.documents, topk=pool_size)
    fuzzy_results = fuzzy_search(query, index.documents, topk=pool_size)

    def normalize(results):
        """Map docid -> score / best score; empty when there is no positive score."""
        if not results:
            return {}
        max_score = max(r['score'] for r in results)
        # Dividing by a negative maximum would invert the ranking, so a
        # non-positive maximum contributes nothing.
        if max_score <= 0:
            return {}
        return {r['docid']: r['score'] / max_score for r in results}

    bm25_norm = normalize(bm25_results)
    semantic_norm = normalize(semantic_results)
    fuzzy_norm = normalize(fuzzy_results)

    w_bm25 = weights.get('bm25', 0)
    w_semantic = weights.get('semantic', 0)
    w_fuzzy = weights.get('fuzzy', 0)

    # Weighted sum over the union of all candidates.
    combined = {}
    all_docs = set(bm25_norm) | set(semantic_norm) | set(fuzzy_norm)

    for docid in all_docs:
        combined[docid] = (w_bm25 * bm25_norm.get(docid, 0) +
                           w_semantic * semantic_norm.get(docid, 0) +
                           w_fuzzy * fuzzy_norm.get(docid, 0))

    # Rank
    ranked = sorted(combined.items(), key=lambda x: x[1], reverse=True)[:topk]

    results = []
    for docid, score in ranked:
        doc = index.documents[docid].copy()
        doc['score'] = score
        doc['docid'] = docid
        results.append(doc)

    return results

# Test hybrid: top-5 combined ranking for one English keyword, default weights
print(" Hybrid Search (BM25 + Semantic + Fuzzy)")
hybrid_results = hybrid_search("education", index, lang='en', topk=5)
# Print rank, title truncated to 60 characters, and the combined score
for i, doc in enumerate(hybrid_results, 1):
    print(f"{i}. {doc.get('title', 'N/A')[:60]} (Score: {doc['score']:.4f})")


üîÄ Hybrid Search (BM25 + Semantic + Fuzzy)
Encoding query: education
1. Rights to education and health (Score: 0.8797)
2. IT education essential for students: NU VC (Score: 0.8543)
3. Durga Puja holiday starts for edn instts today (Score: 0.7159)
4. Dipu Moni: No closure of schools unless necessary (Score: 0.6727)
5. Minimum standard of edn a must for MPO: Farruk (Score: 0.6721)


In [13]:
import json

# (query text, language code for the BM25 component)
queries = [
    ("education system", "en"),
    ("‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ", "bn"),
    ("prison protest", "en"),
]

# One record per query, holding the top-10 hybrid hits.
all_hybrid_results = [
    {
        "query": text,
        "lang": code,
        "results": hybrid_search(text, index, lang=code, topk=10),
    }
    for text, code in queries
]

# Readable UTF-8 JSON; values json cannot serialise fall back to str().
with open("hybrid_results_5k_1.json", "w", encoding="utf-8") as out_file:
    json.dump(all_hybrid_results, out_file, ensure_ascii=False, indent=2, default=str)

print("saved to hybrid_results_5k_1.json")
# files.download('hybrid_results.json')



Encoding query: education system
Encoding query: ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ
Encoding query: prison protest
saved to hybrid_results_5k_1.json


In [14]:
import json

# Queries to run through every ranker: (query text, language code for BM25)
queries = [
    ("education system", "en"),
    ("‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ", "bn"),
    ("Prime Minister", "en"),
    ("prison protest", "en"),
    ("‡¶Ö‡¶∞‡ßç‡¶•‡¶®‡ßÄ‡¶§‡¶ø", "bn"),
]

all_results = []

for query, lang in queries:
    print(f"Processing: {query}")

    # Dict values are evaluated top to bottom, so the rankers run in the
    # order BM25 -> fuzzy -> semantic -> hybrid. hybrid_search runs its own
    # BM25/semantic/fuzzy passes, so each query is encoded twice here.
    record = {
        "query": query,
        "language": lang,
        "bm25_results": index.search(query, lang=lang, topk=10),
        "fuzzy_results": fuzzy_search(query, index.documents, topk=10),
        "semantic_results": semantic_search(query, index.documents, topk=10),
        "hybrid_results": hybrid_search(query, index, lang=lang, topk=10),
    }
    all_results.append(record)

# Persist everything as readable UTF-8 JSON; unknown types fall back to str().
with open("module_c_all_models_comparison_5k_1.json", "w", encoding="utf-8") as out_file:
    json.dump(all_results, out_file, ensure_ascii=False, indent=2, default=str)

print("‚úì Saved all model results to module_c_all_models_comparison_5k_1.json")

# Download (if in Colab)
# from google.colab import files
# files.download("module_c_all_models_comparison.json")


Processing: education system
Encoding query: education system
Encoding query: education system
Processing: ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ
Encoding query: ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ
Encoding query: ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ
Processing: Prime Minister
Encoding query: Prime Minister
Encoding query: Prime Minister
Processing: prison protest
Encoding query: prison protest
Encoding query: prison protest
Processing: ‡¶Ö‡¶∞‡ßç‡¶•‡¶®‡ßÄ‡¶§‡¶ø
Encoding query: ‡¶Ö‡¶∞‡ßç‡¶•‡¶®‡ßÄ‡¶§‡¶ø
Encoding query: ‡¶Ö‡¶∞‡ßç‡¶•‡¶®‡ßÄ‡¶§‡¶ø
‚úì Saved all model results to module_c_all_models_comparison_5k_1.json
