In [1]:
# Cell 1: Install libraries
!pip install rank-bm25 fuzzywuzzy python-Levenshtein sentence-transformers -q

print("✓ All libraries installed!")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/3.2 MB[0m [31m30.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m3.0/3.2 MB[0m [31m35.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m32.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25h✓ All libraries installed!


In [2]:
# Cell 2: Upload files from Module A & B
from google.colab import files
import pickle
import json

print("Upload these files:")
print("1. simple_index.pkl (from Module A)")
print("2. entity_mapper.json (from Module B)")
print("3. bangla_articles_with_ner.json (from Module A)")
print("4. english_articles_with_ner.json (from Module A)")

uploaded = files.upload()

Upload these files:
1. simple_index.pkl (from Module A)
2. entity_mapper.json (from Module B)
3. bangla_articles_with_ner.json (from Module A)
4. english_articles_with_ner.json (from Module A)


Saving bangla_articles_with_ner.json to bangla_articles_with_ner.json
Saving english_articles_with_ner.json to english_articles_with_ner.json
Saving entity_mapper.json to entity_mapper.json
Saving index.html to index.html
Saving Module_B_QueryProcessing.ipynb to Module_B_QueryProcessing.ipynb
Saving simple_index.pkl to simple_index.pkl


In [3]:
# Cell 3: Load all documents
import json
import pickle

# Load articles
with open('bangla_articles_with_ner.json', 'r', encoding='utf-8') as f:
    bangla_docs = json.load(f)

with open('english_articles_with_ner.json', 'r', encoding='utf-8') as f:
    english_docs = json.load(f)

all_documents = bangla_docs + english_docs

print(f"✓ Loaded {len(all_documents)} documents")
print(f"  Bangla: {len(bangla_docs)}")
print(f"  English: {len(english_docs)}")

✓ Loaded 6188 documents
  Bangla: 3094
  English: 3094


In [4]:
# Cell 4: Define class first, then load index
from collections import defaultdict
import re
import pickle

class SimpleIndex:
    def __init__(self):
        self.inverted_index = defaultdict(list)
        self.documents = {}
        self.doc_count = 0

    def tokenize(self, text):
        return re.findall(r'\w+', text.lower())

    def search(self, query):
        tokens = self.tokenize(query)
        doc_ids = set()
        for token in tokens:
            if token in self.inverted_index:
                doc_ids.update(self.inverted_index[token])
        return [self.documents[doc_id] for doc_id in doc_ids]

# Now load the index
with open('simple_index.pkl', 'rb') as f:
    index = pickle.load(f)

print(f"✓ Index loaded: {index.doc_count} documents")

✓ Index loaded: 6188 documents


In [5]:
# Cell 5: BM25 Retriever Class
from rank_bm25 import BM25Okapi
import numpy as np

class BM25Retriever:
    """Lexical retrieval using BM25 algorithm"""

    def __init__(self, documents):
        print("Building BM25 index...")
        self.documents = documents

        # Tokenize documents (title + body)
        self.corpus = []
        for doc in documents:
            text = (doc.get('title', '') + ' ' + doc.get('body', '')).lower()
            tokens = text.split()
            self.corpus.append(tokens)

        # Build BM25 index
        self.bm25 = BM25Okapi(self.corpus)
        print(f"✓ BM25 index built for {len(documents)} documents")

    def search(self, query, top_k=10):
        """
        Search documents using BM25

        Args:
            query: Search query string
            top_k: Number of results to return

        Returns:
            List of dicts with 'doc', 'score', 'method'
        """
        query_tokens = query.lower().split()
        scores = self.bm25.get_scores(query_tokens)

        # Get top-k indices
        top_indices = np.argsort(scores)[::-1][:top_k]

        results = []
        for idx in top_indices:
            if scores[idx] > 0:  # Only include if score > 0
                results.append({
                    'doc': self.documents[idx],
                    'score': float(scores[idx]),
                    'method': 'BM25'
                })

        return results

# Initialize BM25
bm25_retriever = BM25Retriever(all_documents)

Building BM25 index...
✓ BM25 index built for 6188 documents


In [6]:
# Cell 6: Test BM25
test_query = "cricket in Bangladesh"
results = bm25_retriever.search(test_query, top_k=5)

print(f"Query: '{test_query}'")
print(f"Found: {len(results)} results\n")

for i, r in enumerate(results, 1):
    print(f"[{i}] Score: {r['score']:.3f}")
    print(f"    {r['doc']['title'][:70]}")
    print(f"    Language: {r['doc']['language']}")
    print()

Query: 'cricket in Bangladesh'
Found: 5 results

[1] Score: 13.100
    Shakib receives Argentina cricket team’s gift for Bangladesh
    Language: en

[2] Score: 11.834
    Shakib tries his luck out in tape-tennis cricket
    Language: en

[3] Score: 11.670
    Mahir Sarowar Megh: The 17-year-old designer of Durdanto Dhaka’s jerse
    Language: en

[4] Score: 11.488
    Umpire Sharfuddoula Ibne Shahid Saikat completes international century
    Language: en

[5] Score: 11.280
    Aleem Dar calls it a day
    Language: en



Step 4: Model 2 - Fuzzy Matching

In [7]:
# Cell 7: Fuzzy Matcher Class
from fuzzywuzzy import fuzz

class FuzzyMatcher:
    """Fuzzy string matching for cross-script queries"""

    def __init__(self, documents):
        self.documents = documents
        print(f"✓ Fuzzy matcher initialized for {len(documents)} documents")

    def fuzzy_score(self, query, text):
        """Calculate fuzzy similarity score"""
        return fuzz.token_set_ratio(query.lower(), text.lower()) / 100.0

    def search(self, query, top_k=10):
        """
        Search using fuzzy string matching

        Args:
            query: Search query string
            top_k: Number of results to return

        Returns:
            List of dicts with 'doc', 'score', 'method'
        """
        scores = []

        for doc in self.documents:
            # Match against title + body snippet
            text = doc.get('title', '') + ' ' + doc.get('body', '')[:500]
            score = self.fuzzy_score(query, text)

            scores.append({
                'doc': doc,
                'score': score,
                'method': 'Fuzzy'
            })

        # Sort by score
        scores.sort(key=lambda x: x['score'], reverse=True)

        return scores[:top_k]

# Initialize Fuzzy Matcher
fuzzy_matcher = FuzzyMatcher(all_documents)

✓ Fuzzy matcher initialized for 6188 documents


In [8]:
# Cell 8: Test Fuzzy
results = fuzzy_matcher.search(test_query, top_k=5)

print(f"Fuzzy Search: '{test_query}'")
print(f"Found: {len(results)} results\n")

for i, r in enumerate(results, 1):
    print(f"[{i}] Score: {r['score']:.3f}")
    print(f"    {r['doc']['title'][:70]}")
    print()

Fuzzy Search: 'cricket in Bangladesh'
Found: 5 results

[1] Score: 1.000
    BKSP, BWSF sign MoU to train girls

[2] Score: 1.000
    ‘Bad omen’ as eight First Division clubs relegated

[3] Score: 1.000
    Miraz’s Odommyo prevail in CWAB’s Victory Day celebration match

[4] Score: 1.000
    Stars of yesteryears turn up for Victory Day criclet

[5] Score: 1.000
    T20 cricket’s lingering issues and the BPL



Step 5: Model 3 - Semantic Retrieval

In [9]:
# Cell 9: Semantic Retriever Class
from sentence_transformers import SentenceTransformer, util
import torch

class SemanticRetriever:
    """Semantic retrieval using multilingual embeddings"""

    def __init__(self, documents, model_name='LaBSE'):
        self.documents = documents
        self.model_name = model_name

        print(f"Loading {model_name} model...")
        self.model = SentenceTransformer('sentence-transformers/LaBSE')
        print("✓ Model loaded")

        # Will store precomputed embeddings
        self.doc_embeddings = None

    def compute_embeddings(self):
        """Compute embeddings for all documents (SLOW - run once!)"""
        print(f"Computing embeddings for {len(self.documents)} documents...")
        print("⏳ This will take 30-40 minutes...")

        # Extract text (title + body snippet)
        doc_texts = []
        for doc in self.documents:
            text = doc.get('title', '') + ' ' + doc.get('body', '')[:500]
            doc_texts.append(text)

        # Compute embeddings
        self.doc_embeddings = self.model.encode(
            doc_texts,
            convert_to_tensor=True,
            show_progress_bar=True,
            batch_size=32
        )

        print(f"✓ Embeddings computed! Shape: {self.doc_embeddings.shape}")

    def save_embeddings(self, filepath='doc_embeddings.pkl'):
        """Save embeddings to file"""
        import pickle

        if self.doc_embeddings is None:
            print("❌ No embeddings to save. Run compute_embeddings() first.")
            return

        # Move to CPU before saving
        embeddings_cpu = self.doc_embeddings.cpu()

        with open(filepath, 'wb') as f:
            pickle.dump({
                'embeddings': embeddings_cpu,
                'model_name': self.model_name,
                'num_docs': len(self.documents)
            }, f)

        print(f"✓ Embeddings saved to {filepath}")

    def load_embeddings(self, filepath='doc_embeddings.pkl'):
        """Load precomputed embeddings"""
        import pickle

        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        self.doc_embeddings = data['embeddings']

        # Move to GPU if available
        if torch.cuda.is_available():
            self.doc_embeddings = self.doc_embeddings.cuda()

        print(f"✓ Embeddings loaded from {filepath}")
        print(f"  Shape: {self.doc_embeddings.shape}")

    def search(self, query, top_k=10):
        """
        Search using semantic similarity

        Args:
            query: Search query string
            top_k: Number of results to return

        Returns:
            List of dicts with 'doc', 'score', 'method'
        """
        if self.doc_embeddings is None:
            print("❌ No embeddings loaded. Run compute_embeddings() or load_embeddings() first.")
            return []

        # Encode query
        query_embedding = self.model.encode(query, convert_to_tensor=True)

        # Compute cosine similarity
        scores = util.cos_sim(query_embedding, self.doc_embeddings)[0]

        # Get top-k indices
        top_indices = torch.argsort(scores, descending=True)[:top_k]

        results = []
        for idx in top_indices:
            results.append({
                'doc': self.documents[idx.item()],
                'score': float(scores[idx]),
                'method': 'Semantic'
            })

        return results

# Initialize Semantic Retriever
semantic_retriever = SemanticRetriever(all_documents)

Loading LaBSE model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

✓ Model loaded


In [10]:
# Cell 10: Compute embeddings (FIRST TIME ONLY - 30-40 mins)
# ⚠️ Run this cell ONLY ONCE! Then save and never run again!

semantic_retriever.compute_embeddings()

# Save immediately after computing
semantic_retriever.save_embeddings('doc_embeddings.pkl')

# Download to your computer NOW!
from google.colab import files
files.download('doc_embeddings.pkl')

print("\n✅ IMPORTANT: Keep doc_embeddings.pkl safe!")
print("   Next time, just load it instead of recomputing.")

Computing embeddings for 6188 documents...
⏳ This will take 30-40 minutes...


Batches:   0%|          | 0/194 [00:00<?, ?it/s]

✓ Embeddings computed! Shape: torch.Size([6188, 768])
✓ Embeddings saved to doc_embeddings.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ IMPORTANT: Keep doc_embeddings.pkl safe!
   Next time, just load it instead of recomputing.


In [11]:
# Cell 11: Load embeddings (for future sessions)
# If you already have doc_embeddings.pkl, upload it and run this:

uploaded = files.upload()  # Upload doc_embeddings.pkl
semantic_retriever.load_embeddings('doc_embeddings.pkl')

Saving doc_embeddings.pkl to doc_embeddings (1).pkl
✓ Embeddings loaded from doc_embeddings.pkl
  Shape: torch.Size([6188, 768])


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [12]:
# Cell 12: Test Semantic Search
results = semantic_retriever.search(test_query, top_k=5)

print(f"Semantic Search: '{test_query}'")
print(f"Found: {len(results)} results\n")

for i, r in enumerate(results, 1):
    print(f"[{i}] Score: {r['score']:.3f}")
    print(f"    {r['doc']['title'][:70]}")
    print(f"    Language: {r['doc']['language']}")
    print()

Semantic Search: 'cricket in Bangladesh'
Found: 5 results

[1] Score: 0.416
    হ্যান্ডবলে চ্যাম্পিয়ন বিজিবি
    Language: bn

[2] Score: 0.409
    মানববন্ধন
    Language: bn

[3] Score: 0.401
    Bangladesh boxing poised for growth spurts
    Language: en

[4] Score: 0.393
    শাস্তি পেলেন শ্রীলঙ্কার তিন আম্পায়ার
    Language: bn

[5] Score: 0.392
    ঘরোয়া ক্রীড়াঙ্গনের খ ব র
    Language: bn



Model 4 - Hybrid Ranking

In [13]:
# Cell 13: Hybrid Ranker Class
class HybridRanker:
    """Combine multiple retrieval models with weighted scoring"""

    def __init__(self, bm25, fuzzy, semantic, weights=(0.3, 0.2, 0.5)):
        """
        Args:
            weights: (BM25 weight, Fuzzy weight, Semantic weight)
                    Default: (0.3, 0.2, 0.5) - semantic has highest weight
        """
        self.bm25 = bm25
        self.fuzzy = fuzzy
        self.semantic = semantic
        self.weights = weights

        print(f"✓ Hybrid Ranker initialized")
        print(f"  Weights: BM25={weights[0]}, Fuzzy={weights[1]}, Semantic={weights[2]}")

    def normalize_scores(self, results):
        """Normalize scores to [0, 1] range"""
        if not results:
            return results

        scores = [r['score'] for r in results]
        min_score = min(scores)
        max_score = max(scores)

        # Avoid division by zero
        if max_score == min_score:
            for r in results:
                r['score'] = 1.0
            return results

        # Normalize
        for r in results:
            r['score'] = (r['score'] - min_score) / (max_score - min_score)

        return results

    def search(self, query, top_k=10):
        """
        Search using hybrid ranking

        Args:
            query: Search query string
            top_k: Number of results to return

        Returns:
            List of dicts with combined scores
        """
        # Get results from all models (larger pool for better coverage)
        bm25_results = self.bm25.search(query, top_k=50)
        fuzzy_results = self.fuzzy.search(query, top_k=50)
        semantic_results = self.semantic.search(query, top_k=50)

        # Normalize scores
        bm25_results = self.normalize_scores(bm25_results)
        fuzzy_results = self.normalize_scores(fuzzy_results)
        semantic_results = self.normalize_scores(semantic_results)

        # Combine scores by document URL (unique identifier)
        combined = {}

        # Add BM25 scores
        for r in bm25_results:
            url = r['doc']['url']
            if url not in combined:
                combined[url] = {
                    'doc': r['doc'],
                    'bm25_score': 0,
                    'fuzzy_score': 0,
                    'semantic_score': 0,
                    'methods': []
                }
            combined[url]['bm25_score'] = r['score']
            combined[url]['methods'].append('BM25')

        # Add Fuzzy scores
        for r in fuzzy_results:
            url = r['doc']['url']
            if url not in combined:
                combined[url] = {
                    'doc': r['doc'],
                    'bm25_score': 0,
                    'fuzzy_score': 0,
                    'semantic_score': 0,
                    'methods': []
                }
            combined[url]['fuzzy_score'] = r['score']
            combined[url]['methods'].append('Fuzzy')

        # Add Semantic scores
        for r in semantic_results:
            url = r['doc']['url']
            if url not in combined:
                combined[url] = {
                    'doc': r['doc'],
                    'bm25_score': 0,
                    'fuzzy_score': 0,
                    'semantic_score': 0,
                    'methods': []
                }
            combined[url]['semantic_score'] = r['score']
            combined[url]['methods'].append('Semantic')

        # Calculate weighted combined score
        for url in combined:
            combined[url]['score'] = (
                combined[url]['bm25_score'] * self.weights[0] +
                combined[url]['fuzzy_score'] * self.weights[1] +
                combined[url]['semantic_score'] * self.weights[2]
            )
            combined[url]['method'] = 'Hybrid'

        # Sort by combined score
        final_results = sorted(
            combined.values(),
            key=lambda x: x['score'],
            reverse=True
        )

        return final_results[:top_k]

# Initialize Hybrid Ranker
hybrid_ranker = HybridRanker(bm25_retriever, fuzzy_matcher, semantic_retriever)

✓ Hybrid Ranker initialized
  Weights: BM25=0.3, Fuzzy=0.2, Semantic=0.5


In [14]:
# Cell 14: Test Hybrid Ranking
results = hybrid_ranker.search(test_query, top_k=5)

print(f"Hybrid Search: '{test_query}'")
print(f"Found: {len(results)} results\n")

for i, r in enumerate(results, 1):
    print(f"[{i}] Combined Score: {r['score']:.3f}")
    print(f"    BM25: {r['bm25_score']:.3f} | Fuzzy: {r['fuzzy_score']:.3f} | Semantic: {r['semantic_score']:.3f}")
    print(f"    {r['doc']['title'][:70]}")
    print(f"    Methods: {', '.join(set(r['methods']))}")
    print()

Hybrid Search: 'cricket in Bangladesh'
Found: 5 results

[1] Combined Score: 0.775
    BM25: 1.000 | Fuzzy: 1.000 | Semantic: 0.551
    Shakib receives Argentina cricket team’s gift for Bangladesh
    Methods: Fuzzy, Semantic, BM25

[2] Combined Score: 0.672
    BM25: 0.739 | Fuzzy: 1.000 | Semantic: 0.500
    Umpire Sharfuddoula Ibne Shahid Saikat completes international century
    Methods: Fuzzy, Semantic, BM25

[3] Combined Score: 0.608
    BM25: 0.479 | Fuzzy: 1.000 | Semantic: 0.528
    BPL opening ceremony uncertain: BCB
    Methods: Fuzzy, Semantic, BM25

[4] Combined Score: 0.581
    BM25: 0.795 | Fuzzy: 1.000 | Semantic: 0.285
    Shakib tries his luck out in tape-tennis cricket
    Methods: Fuzzy, Semantic, BM25

[5] Combined Score: 0.535
    BM25: 0.299 | Fuzzy: 1.000 | Semantic: 0.491
    National cricketers convey Eid greetings
    Methods: Fuzzy, Semantic, BM25



Step 7: Comparison & Testing

In [15]:
# Cell 15: Compare All Models
test_queries = [
    "cricket in Bangladesh",
    "নির্বাচন",  # election in Bangla
    "university education",
    "ভারত পাকিস্তান",  # India Pakistan in Bangla
    "health policy"
]

import pandas as pd

comparison_data = []

for query in test_queries:
    print(f"\n{'='*70}")
    print(f"Query: '{query}'")
    print('='*70)

    # Get results from each model
    bm25_res = bm25_retriever.search(query, top_k=3)
    fuzzy_res = fuzzy_matcher.search(query, top_k=3)  # ← ADD THIS
    semantic_res = semantic_retriever.search(query, top_k=3)
    hybrid_res = hybrid_ranker.search(query, top_k=3)

    print(f"\nBM25 Top 3:")
    for i, r in enumerate(bm25_res, 1):
        print(f"  [{i}] {r['score']:.3f} - {r['doc']['title'][:60]}")

    # ← ADD THIS BLOCK
    print(f"\nFuzzy Top 3:")
    for i, r in enumerate(fuzzy_res, 1):
        print(f"  [{i}] {r['score']:.3f} - {r['doc']['title'][:60]}")

    print(f"\nSemantic Top 3:")
    for i, r in enumerate(semantic_res, 1):
        print(f"  [{i}] {r['score']:.3f} - {r['doc']['title'][:60]}")

    print(f"\nHybrid Top 3:")
    for i, r in enumerate(hybrid_res, 1):
        print(f"  [{i}] {r['score']:.3f} - {r['doc']['title'][:60]}")

    # Store for CSV
    comparison_data.append({
        'query': query,
        'bm25_top1': bm25_res[0]['doc']['title'] if bm25_res else 'N/A',
        'fuzzy_top1': fuzzy_res[0]['doc']['title'] if fuzzy_res else 'N/A',  # ← ADD THIS
        'semantic_top1': semantic_res[0]['doc']['title'] if semantic_res else 'N/A',
        'hybrid_top1': hybrid_res[0]['doc']['title'] if hybrid_res else 'N/A'
    })

# Save comparison
df = pd.DataFrame(comparison_data)
df.to_csv('model_comparison.csv', index=False)
print("\n✓ Comparison saved to model_comparison.csv")


Query: 'cricket in Bangladesh'

BM25 Top 3:
  [1] 13.100 - Shakib receives Argentina cricket team’s gift for Bangladesh
  [2] 11.834 - Shakib tries his luck out in tape-tennis cricket
  [3] 11.670 - Mahir Sarowar Megh: The 17-year-old designer of Durdanto Dha

Fuzzy Top 3:
  [1] 1.000 - BKSP, BWSF sign MoU to train girls
  [2] 1.000 - ‘Bad omen’ as eight First Division clubs relegated
  [3] 1.000 - Miraz’s Odommyo prevail in CWAB’s Victory Day celebration ma

Semantic Top 3:
  [1] 0.416 - হ্যান্ডবলে চ্যাম্পিয়ন বিজিবি
  [2] 0.409 - মানববন্ধন
  [3] 0.401 - Bangladesh boxing poised for growth spurts

Hybrid Top 3:
  [1] 0.775 - Shakib receives Argentina cricket team’s gift for Bangladesh
  [2] 0.672 - Umpire Sharfuddoula Ibne Shahid Saikat completes internation
  [3] 0.608 - BPL opening ceremony uncertain: BCB

Query: 'নির্বাচন'

BM25 Top 3:
  [1] 7.297 - ডিসিসি নির্বাচন-জটিলতা
  [2] 7.219 - পশ্চিমবঙ্গে পঞ্চায়েত নির্বাচন শুরু কাল
  [3] 7.202 - যেকোনো সরকারের অধীনে ইসি নির্বাচনে প্রস্তু

Step 8: Save Final Retrieval System

In [16]:
# Cell 16: Save Complete Retrieval System
import pickle

retrieval_system = {
    'bm25': bm25_retriever,
    'fuzzy': fuzzy_matcher,
    'semantic': semantic_retriever,
    'hybrid': hybrid_ranker,
    'documents': all_documents
}

with open('retrieval_system.pkl', 'wb') as f:
    pickle.dump(retrieval_system, f)

print("✓ Complete retrieval system saved to retrieval_system.pkl")

✓ Complete retrieval system saved to retrieval_system.pkl


In [17]:
from google.colab import drive
import shutil

# Mount
drive.mount('/content/drive')

# Copy to Drive
shutil.copy('doc_embeddings.pkl', '/content/drive/MyDrive/')
shutil.copy('retrieval_system.pkl', '/content/drive/MyDrive/')
shutil.copy('model_comparison.csv', '/content/drive/MyDrive/')
shutil.copy('simple_index.pkl', '/content/drive/MyDrive/')
shutil.copy('entity_mapper.json', '/content/drive/MyDrive/')

print("✓ All saved to Drive!")

Mounted at /content/drive
✓ All saved to Drive!


In [18]:
# Cell 17: Download all files
from google.colab import files

print("Downloading files...")

# 1. Embeddings (if not already downloaded)
files.download('doc_embeddings.pkl')

# 2. Retrieval system
files.download('retrieval_system.pkl')

# 3. Comparison results
files.download('model_comparison.csv')

# 4. Notebook will be downloaded separately: File → Download → .ipynb

print("\n✓ All files downloaded!")

Downloading files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✓ All files downloaded!


In [19]:
# Quick download - just the updated CSV
from google.colab import files
files.download('model_comparison.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>