In [6]:
import re
from collections import defaultdict
import json
import pickle
import math

class CLIRIndex:
    def __init__(self):
        self.documents = {}
        self.index = defaultdict(lambda: defaultdict(list))
        self.doc_lengths = {}
        self.num_docs = {'bn': 0, 'en': 0}
        self.avg_doc_length = {'bn': 0, 'en': 0}
        self.doc_counter = 0

    def tokenize(self, text, lang='en'):
        text = text.lower()
        if lang == 'bn':
            tokens = re.findall(r'[\u0980-\u09FF]+', text)
        else:
            tokens = re.findall(r'[a-z0-9]+', text)
        return tokens

    def add_document(self, doc):
        doc_id = self.doc_counter
        self.doc_counter += 1
        self.documents[doc_id] = doc

        lang = doc.get('language', 'en')
        self.num_docs[lang] += 1

        title_tokens = self.tokenize(doc.get('title', ''), lang)
        body_tokens = self.tokenize(doc.get('body', ''), lang)
        all_tokens = title_tokens + body_tokens
        self.doc_lengths[doc_id] = len(all_tokens)

        for pos, token in enumerate(all_tokens):
            self.index[token][lang].append((doc_id, pos))

        return doc_id

    def build_from_json_files(self, files_list):
        print("Building index from files...")
        for fpath in files_list:
            print(f"  Processing {fpath}...")
            count = 0
            with open(fpath, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        try:
                            doc = json.loads(line)
                            self.add_document(doc)
                            count += 1
                        except Exception as e:
                            continue
            print(f"  Added {count} documents")

        # Calculate average doc lengths
        for lang in ['bn', 'en']:
            if self.num_docs[lang] > 0:
                total = sum(
                    self.doc_lengths[did]
                    for did, doc in self.documents.items()
                    if doc.get('language', 'en') == lang
                )
                self.avg_doc_length[lang] = total / self.num_docs[lang]

        print("-" * 70)
        print("Index built successfully!")
        print("-" * 70)
        print(f"Total documents: {self.doc_counter}")
        print(f"Bangla documents: {self.num_docs['bn']}")
        print(f"English documents: {self.num_docs['en']}")
        print(f"Unique terms: {len(self.index)}")
        print(f"Avg Bangla doc length: {self.avg_doc_length['bn']:.2f} tokens")
        print(f"Avg English doc length: {self.avg_doc_length['en']:.2f} tokens")

    def search(self, query, lang='en', topk=10, k1=1.5, b=0.75):
        """BM25 Search"""
        query_tokens = self.tokenize(query, lang)
        if not query_tokens:
            return []

        scores = defaultdict(float)
        N = self.num_docs[lang]

        if N == 0:
            return []

        avgdl = self.avg_doc_length[lang]

        for term in query_tokens:
            if term not in self.index or lang not in self.index[term]:
                continue

            postings = self.index[term][lang]
            df = len(set(doc_id for doc_id, _ in postings))
            idf = math.log((N - df + 0.5) / (df + 0.5) + 1)

            tf_doc = defaultdict(int)
            for doc_id, _ in postings:
                tf_doc[doc_id] += 1

            for doc_id, tf in tf_doc.items():
                doc_len = self.doc_lengths[doc_id]
                score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avgdl)))
                scores[doc_id] += score

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topk]

        results = []
        for doc_id, score in ranked:
            doc = self.documents[doc_id].copy()
            doc['score'] = score
            doc['docid'] = doc_id
            results.append(doc)

        return results

    def save(self, fname='clir_index.pkl'):
        """Save index"""
        # Convert defaultdict to regular dict
        index_dict = {}
        for term, lang_dict in self.index.items():
            index_dict[term] = dict(lang_dict)

        with open(fname, 'wb') as f:
            pickle.dump({
                'documents': self.documents,
                'index': index_dict,
                'doc_lengths': self.doc_lengths,
                'num_docs': self.num_docs,
                'avg_doc_length': self.avg_doc_length,
                'doc_counter': self.doc_counter
            }, f)

        print(f"✓ Index saved to {fname}")

    def load(self, fname='clir_index.pkl'):
        """Load index from file"""
        with open(fname, 'rb') as f:
            data = pickle.load(f)

        self.documents = data['documents']

        # Convert back to defaultdict
        self.index = defaultdict(lambda: defaultdict(list))
        for term, lang_dict in data['index'].items():
            self.index[term] = defaultdict(list, lang_dict)

        self.doc_lengths = data['doc_lengths']
        self.num_docs = data['num_docs']
        self.avg_doc_length = data['avg_doc_length']
        self.doc_counter = data['doc_counter']

        print(f"✓ Index loaded from {fname}")

print("✓ CLIRIndex class defined!")


✓ CLIRIndex class defined!


In [7]:
# Load the index you created in Module B
index = CLIRIndex()
index.load('clir_index.pkl')
print("✓ Index loaded successfully!")


✓ Index loaded from clir_index.pkl
✓ Index loaded successfully!


In [8]:
# Install sentence transformers for semantic search
!pip install -q sentence-transformers
print("✓ Installed sentence-transformers")


✓ Installed sentence-transformers


Semantic Search Code

In [9]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load multilingual embedding model (this will download ~500MB model)
print("Loading LaBSE model... (this may take 1-2 minutes)")
semantic_model = SentenceTransformer('sentence-transformers/LaBSE')
print("✓ Model loaded!")

def semantic_search(query, documents, topk=10):
    """
    Search using semantic embeddings
    """
    print(f"Encoding query: {query}")
    query_embedding = semantic_model.encode(query, convert_to_tensor=True)

    scores = []
    doc_list = list(documents.items())

    # Process in batches to avoid memory issues
    batch_size = 100
    for i in range(0, len(doc_list), batch_size):
        batch = doc_list[i:i+batch_size]

        # Create document texts
        doc_texts = []
        doc_ids = []
        for docid, doc in batch:
            text = f"{doc.get('title', '')} {doc.get('body', '')[:300]}"
            doc_texts.append(text)
            doc_ids.append(docid)

        # Encode batch
        doc_embeddings = semantic_model.encode(doc_texts, convert_to_tensor=True)

        # Calculate similarities
        similarities = util.cos_sim(query_embedding, doc_embeddings)[0]

        for docid, sim in zip(doc_ids, similarities):
            scores.append((docid, sim.item()))

    # Sort and get top-k
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)[:topk]

    results = []
    for docid, score in ranked:
        doc = documents[docid].copy()
        doc['score'] = score
        doc['docid'] = docid
        results.append(doc)

    return results

print("✓ Semantic search function ready!")


Loading LaBSE model... (this may take 1-2 minutes)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

✓ Model loaded!
✓ Semantic search function ready!



Test Semantic Search




In [10]:
# Test with a simple query
test_query = "education system"

print("="*70)
print("Testing Semantic Search")
print("="*70)
print(f"Query: {test_query}\n")

# Run semantic search
results = semantic_search(test_query, index.documents, topk=5)

print("Top 5 Results:\n")
for i, doc in enumerate(results, 1):
    print(f"{i}. Score: {doc['score']:.4f}")
    print(f"   Title: {doc.get('title', 'N/A')[:80]}")
    print(f"   URL: {doc.get('url', 'N/A')}")
    print(f"   Language: {doc.get('language', 'N/A')}")
    print()


Testing Semantic Search
Query: education system

Encoding query: education system
Top 5 Results:

1. Score: 0.4355
   Title: শিক্ষার মানোন্নয়নে শাসকশ্রেণির ভূমিকা নেই: অধ্যাপক রেহমান সোবহান
   URL: https://www.prothomalo.com/bangladesh/v78p0s8278
   Language: bn

2. Score: 0.4247
   Title: এলাকাটি যেন ‘শিক্ষার হাট’
   URL: https://www.prothomalo.com/bangladesh/district/rkz6zecazs
   Language: bn

3. Score: 0.4048
   Title: ‘ছেলে আমাকে সেরা মায়ের সম্মান দিয়ে গেল’
   URL: https://www.prothomalo.com/bangladesh/district/ch4my2e972
   Language: bn

4. Score: 0.3573
   Title: শিক্ষাপ্রতিষ্ঠান ধ্বংসকারীরা সভ্যতার শত্রু: শিক্ষামন্ত্রী
   URL: https://www.prothomalo.com/bangladesh/শিক্ষাপ্রতিষ্ঠান-ধ্বংসকারীরা-সভ্যতার-শত্রু
   Language: bn

5. Score: 0.3562
   Title: Durga Puja holiday starts for edn instts today
   URL: https://www.newagebd.net/post/education/277395/durga-puja-holiday-starts-for-edn-instts-today
   Language: en



Compare BM25 vs Semantic

In [11]:
# Compare BM25 and Semantic Search side-by-side

test_queries = [
    ("education system", "en"),
    ("শিক্ষা", "bn"),  # education in Bangla
    ("prime minister", "en")
]

for query, lang in test_queries:
    print("="*70)
    print(f"Query: '{query}' (Language: {lang})")
    print("="*70)

    # BM25 Results
    print("\n📊 BM25 Results:")
    bm25_results = index.search(query, lang=lang, topk=5)
    for i, doc in enumerate(bm25_results, 1):
        print(f"  {i}. {doc.get('title', 'N/A')[:60]} (Score: {doc['score']:.4f})")

    # Semantic Results
    print("\n🧠 Semantic Results:")
    semantic_results = semantic_search(query, index.documents, topk=5)
    for i, doc in enumerate(semantic_results, 1):
        print(f"  {i}. {doc.get('title', 'N/A')[:60]} (Score: {doc['score']:.4f})")

    print("\n")


Query: 'education system' (Language: en)

📊 BM25 Results:
  1. Minimum standard of edn a must for MPO: Farruk (Score: 9.3298)
  2. No edn reform commission reflects neglect: BNP (Score: 8.8665)
  3. Disasters disrupt education (Score: 8.7735)
  4. Education policy 2O1O: Keeping the promise (Score: 8.7043)
  5. AAUB holds 18th syndicate meeting (Score: 8.0605)

🧠 Semantic Results:
Encoding query: education system
  1. শিক্ষার মানোন্নয়নে শাসকশ্রেণির ভূমিকা নেই: অধ্যাপক রেহমান সো (Score: 0.4355)
  2. এলাকাটি যেন ‘শিক্ষার হাট’ (Score: 0.4247)
  3. ‘ছেলে আমাকে সেরা মায়ের সম্মান দিয়ে গেল’ (Score: 0.4048)
  4. শিক্ষাপ্রতিষ্ঠান ধ্বংসকারীরা সভ্যতার শত্রু: শিক্ষামন্ত্রী (Score: 0.3573)
  5. Durga Puja holiday starts for edn instts today (Score: 0.3562)


Query: 'শিক্ষা' (Language: bn)

📊 BM25 Results:
  1. নিজের ভাষায় পড়তে চায় ওরা (Score: 6.2005)
  2. প্রাথমিকে নিয়োগের প্রথম ধাপের পরীক্ষা প্রশ্নবিদ্ধ না হোক (Score: 5.8221)
  3. বাড়িভাড়া বাড়ানোর পর শ্রেণিকক্ষে ফেরার ঘোষণা এমপিওভুক্ত শিক্ষ (Score:

Add Fuzzy Search

In [12]:
!pip install -q fuzzywuzzy python-Levenshtein

from fuzzywuzzy import fuzz

def fuzzy_search(query, documents, topk=10):
    """
    Search using fuzzy string matching
    """
    scores = []

    for docid, doc in documents.items():
        title = doc.get('title', '')
        body = doc.get('body', '')[:500]

        # Calculate fuzzy scores
        title_score = fuzz.partial_ratio(query.lower(), title.lower())
        body_score = fuzz.partial_ratio(query.lower(), body.lower())

        # Combine and normalize to 0-1
        combined = (title_score * 0.7 + body_score * 0.3) / 100
        scores.append((docid, combined))

    # Sort and return top-k
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)[:topk]

    results = []
    for docid, score in ranked:
        doc = documents[docid].copy()
        doc['score'] = score
        doc['docid'] = docid
        results.append(doc)

    return results

print("✓ Fuzzy search ready!")

# Test it
print("\n🔍 Fuzzy Search Test:")
fuzzy_results = fuzzy_search("education", index.documents, topk=5)
for i, doc in enumerate(fuzzy_results, 1):
    print(f"{i}. {doc.get('title', 'N/A')[:60]} (Score: {doc['score']:.4f})")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m3.1/3.2 MB[0m [31m114.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m3.1/3.2 MB[0m [31m114.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m31.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25h✓ Fuzzy search ready!

🔍 Fuzzy Search Test:
1. Integrated approach needed to address challenges in higher e (Score: 1.0000)
2. IT education essenti

 Save Your Results

In [13]:
# Save a comparison report
import json

comparison_results = {}

test_queries_final = [
    "Prime Minister of Bangladesh",
    "শিক্ষা ব্যবস্থা",
    "prison protest",
    "অর্থনীতি",
    "dhaka traffic"
]

for query in test_queries_final:
    lang = 'bn' if any('\u0980' <= c <= '\u09FF' for c in query) else 'en'

    comparison_results[query] = {
        'bm25': index.search(query, lang=lang, topk=5),
        'semantic': semantic_search(query, index.documents, topk=5),
        'fuzzy': fuzzy_search(query, index.documents, topk=5)
    }

# Save to file
with open('module_c_comparison.json', 'w', encoding='utf-8') as f:
    json.dump(comparison_results, f, ensure_ascii=False, indent=2, default=str)

print("✓ Comparison results saved to 'module_c_comparison.json'")
files.download('module_c_comparison.json')


Encoding query: Prime Minister of Bangladesh
Encoding query: শিক্ষা ব্যবস্থা
Encoding query: prison protest
Encoding query: অর্থনীতি
Encoding query: dhaka traffic
✓ Comparison results saved to 'module_c_comparison.json'


NameError: name 'files' is not defined

In [16]:
from google.colab import files
files.download('module_c_comparison.json')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Hybrid Ranking

In [20]:
def hybrid_search(query, index, lang='en', topk=10,
                  weights={'bm25': 0.3, 'semantic': 0.5, 'fuzzy': 0.2}):
    """
    Combine BM25 + Semantic + Fuzzy
    """
    # Get results from each
    bm25_results = index.search(query, lang=lang, topk=50)
    semantic_results = semantic_search(query, index.documents, topk=50)
    fuzzy_results = fuzzy_search(query, index.documents, topk=50)

    # Normalize scores to [0, 1]
    def normalize(results):
        if not results:
            return {}
        max_score = max(r['score'] for r in results)
        if max_score == 0:
            return {}
        return {r['docid']: r['score'] / max_score for r in results}

    bm25_norm = normalize(bm25_results)
    semantic_norm = normalize(semantic_results)
    fuzzy_norm = normalize(fuzzy_results)

    # Weighted combination
    combined = {}
    all_docs = set(bm25_norm.keys()) | set(semantic_norm.keys()) | set(fuzzy_norm.keys())

    for docid in all_docs:
        score = (weights['bm25'] * bm25_norm.get(docid, 0) +
                 weights['semantic'] * semantic_norm.get(docid, 0) +
                 weights['fuzzy'] * fuzzy_norm.get(docid, 0))
        combined[docid] = score

    # Rank
    ranked = sorted(combined.items(), key=lambda x: x[1], reverse=True)[:topk]

    results = []
    for docid, score in ranked:
        doc = index.documents[docid].copy()
        doc['score'] = score
        doc['docid'] = docid
        results.append(doc)

    return results

# Test hybrid
print("🔀 Hybrid Search (BM25 + Semantic + Fuzzy)")
hybrid_results = hybrid_search("education", index, lang='en', topk=5)
for i, doc in enumerate(hybrid_results, 1):
    print(f"{i}. {doc.get('title', 'N/A')[:60]} (Score: {doc['score']:.4f})")


🔀 Hybrid Search (BM25 + Semantic + Fuzzy)
Encoding query: education
1. Rights to education and health (Score: 0.8782)
2. IT education essential for students: NU VC (Score: 0.8524)
3. Education policy 2O1O: Keeping the promise (Score: 0.8364)
4. Govt needs to put in more capital to up education (Score: 0.8022)
5. Durga Puja holiday starts for edn instts today (Score: 0.7131)


In [None]:
import json

queries = [
    ("education system", "en"),
    ("শিক্ষা", "bn"),
    ("prison protest", "en"),
]

all_hybrid_results = []

for q, lang in queries:
    res = hybrid_search(q, index, lang=lang, topk=10)
    all_hybrid_results.append({
        "query": q,
        "lang": lang,
        "results": res,
    })

with open("hybrid_results.json", "w", encoding="utf-8") as f:
    json.dump(all_hybrid_results, f, ensure_ascii=False, indent=2, default=str)

print("saved to hybrid_results.json")
files.download('hybrid_results.json')



Encoding query: education system


In [18]:
import json

# Test queries
queries = [
    ("education system", "en"),
    ("শিক্ষা", "bn"),
    ("Prime Minister", "en"),
    ("prison protest", "en"),
    ("অর্থনীতি", "bn"),
]

all_results = []

for query, lang in queries:
    print(f"Processing: {query}")

    # Get results from ALL models
    bm25_res = index.search(query, lang=lang, topk=10)
    fuzzy_res = fuzzy_search(query, index.documents, topk=10)
    semantic_res = semantic_search(query, index.documents, topk=10)
    hybrid_res = hybrid_search(query, index, lang=lang, topk=10)

    # Save everything together
    all_results.append({
        "query": query,
        "language": lang,
        "bm25_results": bm25_res,
        "fuzzy_results": fuzzy_res,
        "semantic_results": semantic_res,
        "hybrid_results": hybrid_res
    })

# Save to JSON file
with open("module_c_all_models_comparison.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)

print("✓ Saved all model results to module_c_all_models_comparison.json")

# Download (if in Colab)
from google.colab import files
files.download("module_c_all_models_comparison.json")


Processing: education system
Encoding query: education system


KeyboardInterrupt: 