In [3]:
!nvidia-smi

Fri Feb 13 20:05:25 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [4]:

#  Install Required Libraries

!pip install -q googletrans==4.0.0-rc1
!pip install -q nltk scikit-learn langdetect


#  Import Libraries

import re
from collections import defaultdict
import json
import pickle
from google.colab import files
from googletrans import Translator
from langdetect import detect
import nltk
from nltk.corpus import stopwords
import math
import time

nltk.download('stopwords')


#  Upload JSON/JSONL Files

print("Upload your JSON/JSONL files (all files)")
uploaded = files.upload()

json_filenames = list(uploaded.keys())
print("Uploaded files:", json_filenames)


#  Query Processing Utilities

translator = Translator()
stop_en = set(stopwords.words("english"))
stop_bn = {
    'এবং', 'বা', 'যে', 'এই', 'ও', 'তা', 'সে', 'যা', 'কি', 'কে',
    'তার', 'আর', 'এর', 'হয়', 'করা', 'থেকে', 'সঙ্গে', 'দিয়ে',
    'জন্য', 'পর', 'আছে', 'ছিল', 'হবে', 'না', 'নেই'
}

# Named-entity mapping
NE_MAP = {
    "bangladesh": "বাংলাদেশ",
    "dhaka": "ঢাকা",
    "chittagong": "চট্টগ্রাম",
    "sylhet": "সিলেট",
    "rajshahi": "রাজশাহী",
    "prime minister": "প্রধানমন্ত্রী",
    "sheikh hasina": "শেখ হাসিনা",
    "khaleda zia": "খালেদা জিয়া",
    "prison": "কারাগার",
    "protest": "বিক্ষোভ",
    "education": "শিক্ষা",
    "economy": "অর্থনীতি",
    "government": "সরকার"
}

# Query Expansion
EXPANSIONS = {
    "covid": ["corona", "covid-19", "pandemic"],
    "বাংলাদেশ": ["bd", "bangladesh"],
    "protest": ["demonstration", "rally", "strike"],
    "প্রধানমন্ত্রী": ["pm", "prime minister"],
    "কারাগার": ["জেল", "jail", "prison"],
    "শিক্ষা": ["education", "school"],
    "বিক্ষোভ": ["protest", "demonstration"]
}

def process_query(query: str, verbose=True):
    """
    Complete query processing pipeline
    """
    if verbose:
        print(f"\n{'='*70}")
        print(f"Processing Query: '{query}'")
        print(f"{'='*70}")

    # 1️ Language Detection
    try:
        lang_detected = detect(query)
        lang = 'bn' if lang_detected == 'bn' else 'en'
    except:
        # Fallback: check for Bangla Unicode
        if any('\u0980' <= c <= '\u09FF' for c in query):
            lang = 'bn'
        else:
            lang = 'en'

    if verbose:
        print(f"1. Language Detection: {lang}")

    # 2️ Normalization
    query_norm = query.lower()
    query_norm = re.sub(r'\s+', ' ', query_norm).strip()

    # Remove stopwords
    if lang == 'en':
        words = [w for w in query_norm.split() if w not in stop_en]
    else:
        words = [w for w in query_norm.split() if w not in stop_bn]

    query_norm = ' '.join(words) if words else query_norm

    if verbose:
        print(f"2. Normalized: '{query_norm}'")

    # 3️ Query Translation
    target_lang = 'bn' if lang == 'en' else 'en'

    try:
        if lang == 'en':
            query_translated = translator.translate(query_norm, src='en', dest='bn').text
        else:
            query_translated = translator.translate(query_norm, src='bn', dest='en').text
    except Exception as e:
        print(f"   Translation error: {e}, using original")
        query_translated = query_norm

    if verbose:
        print(f"3. Translated ({lang}→{target_lang}): '{query_translated}'")

    # 4️ Query Expansion
    query_expanded = query_translated
    for term, syns in EXPANSIONS.items():
        if term in query_translated.lower():
            query_expanded += ' ' + ' '.join(syns)

    if verbose and query_expanded != query_translated:
        print(f"4. Expanded: '{query_expanded}'")

    # 5️ Named-Entity Mapping
    query_mapped = query_expanded
    if target_lang == 'en':
        # Mapping Bangla→English
        for en, bn in NE_MAP.items():
            query_mapped = query_mapped.replace(bn, en)
    else:
        # Mapping English to Bangla
        for en, bn in NE_MAP.items():
            query_mapped = query_mapped.replace(en, bn)

    if verbose and query_mapped != query_expanded:
        print(f"5. NE Mapped: '{query_mapped}'")

    return {
        'original': query,
        'lang': lang,
        'normalized': query_norm,
        'translated': query_translated,
        'expanded': query_expanded,
        'mapped': query_mapped,
        'target_lang': target_lang
    }


#  Build Inverted Index from JSONL

class CLIRIndex:
    def __init__(self):
        self.documents = {}
        self.index = defaultdict(lambda: defaultdict(list))
        self.doc_lengths = {}
        self.num_docs = {'bn': 0, 'en': 0}
        self.avg_doc_length = {'bn': 0, 'en': 0}
        self.doc_counter = 0

    def tokenize(self, text, lang='en'):
        text = text.lower()
        if lang == 'bn':
            tokens = re.findall(r'\S+', text)
        else:
            tokens = re.findall(r'\b\w+\b', text)
        return tokens

    def add_document(self, doc):
        doc_id = self.doc_counter
        self.doc_counter += 1
        self.documents[doc_id] = doc

        lang = doc.get('language', 'en')
        self.num_docs[lang] += 1

        title_tokens = self.tokenize(doc.get('title', ''), lang)
        body_tokens = self.tokenize(doc.get('body', ''), lang)
        all_tokens = title_tokens + body_tokens
        self.doc_lengths[doc_id] = len(all_tokens)

        for pos, token in enumerate(all_tokens):
            self.index[token][lang].append((doc_id, pos))
        return doc_id

    def build_from_json_files(self, files_list):
        print("Building index from files...")
        for fpath in files_list:
            print(f"  Processing {fpath}...")
            count = 0
            with open(fpath, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        try:
                            doc = json.loads(line)
                            self.add_document(doc)
                            count += 1
                        except Exception as e:
                            continue
            print(f"    Added {count} documents")

        # Calculate average doc lengths
        for lang in ['bn', 'en']:
            if self.num_docs[lang] > 0:
                total = sum(
                    self.doc_lengths[did]
                    for did, doc in self.documents.items()
                    if doc.get('language', 'en') == lang
                )
                self.avg_doc_length[lang] = total / self.num_docs[lang]

        print("\n" + "="*70)
        print("Index built successfully!")
        print("="*70)
        print(f"Total documents: {self.doc_counter}")
        print(f"Bangla documents: {self.num_docs['bn']}")
        print(f"English documents: {self.num_docs['en']}")
        print(f"Unique terms: {len(self.index)}")
        print(f"Avg Bangla doc length: {self.avg_doc_length['bn']:.2f} tokens")
        print(f"Avg English doc length: {self.avg_doc_length['en']:.2f} tokens")

    def search(self, query, lang='en', top_k=10, k1=1.5, b=0.75):
        """BM25 Search"""
        query_tokens = self.tokenize(query, lang)
        if not query_tokens:
            return []

        scores = defaultdict(float)
        N = self.num_docs[lang]

        if N == 0:
            return []

        avgdl = self.avg_doc_length[lang]

        for term in query_tokens:
            if term not in self.index or lang not in self.index[term]:
                continue

            postings = self.index[term][lang]
            df = len(set(doc_id for doc_id, _ in postings))
            idf = math.log((N - df + 0.5) / (df + 0.5) + 1)

            tf_doc = defaultdict(int)
            for doc_id, _ in postings:
                tf_doc[doc_id] += 1

            for doc_id, tf in tf_doc.items():
                doc_len = self.doc_lengths[doc_id]
                score = idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avgdl))))
                scores[doc_id] += score

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        results = []
        for doc_id, score in ranked:
            doc = self.documents[doc_id].copy()
            doc['score'] = score
            doc['doc_id'] = doc_id
            results.append(doc)
        return results

    def save(self, fname='clir_index.pkl'):
        """Save index (FIXED to handle lambda in defaultdict)"""
        # Convert defaultdict to regular dict
        index_dict = {}
        for term, lang_dict in self.index.items():
            index_dict[term] = dict(lang_dict)

        with open(fname, 'wb') as f:
            pickle.dump({
                'documents': self.documents,
                'index': index_dict,  # Regular dict instead of defaultdict
                'doc_lengths': self.doc_lengths,
                'num_docs': self.num_docs,
                'avg_doc_length': self.avg_doc_length,
                'doc_counter': self.doc_counter
            }, f)
        print(f"\n✓ Index saved to {fname}")

    def load(self, fname='clir_index.pkl'):
        """Load index from file"""
        with open(fname, 'rb') as f:
            data = pickle.load(f)

        self.documents = data['documents']
        # Convert back to defaultdict
        self.index = defaultdict(lambda: defaultdict(list))
        for term, lang_dict in data['index'].items():
            self.index[term] = defaultdict(list, lang_dict)

        self.doc_lengths = data['doc_lengths']
        self.num_docs = data['num_docs']
        self.avg_doc_length = data['avg_doc_length']
        self.doc_counter = data['doc_counter']

        print(f"✓ Index loaded from {fname}")


#  Build Index from Uploaded Files

index = CLIRIndex()
index.build_from_json_files(json_filenames)
index.save('clir_index.pkl')

# Download the index
files.download('clir_index.pkl')


#  Cross-Lingual Search Function

def cross_lingual_search(query, top_k=10, verbose=True):
    """
    Perform cross-lingual search with query processing
    """
    start_time = time.time()

    # Process query
    q_proc = process_query(query, verbose=verbose)

    # Search in target language (translated query)
    target_lang = q_proc['target_lang']
    mapped_query = q_proc['mapped']

    if verbose:
        print(f"\n{'='*70}")
        print(f"Searching in {target_lang.upper()} documents...")
        print(f"Search query: '{mapped_query}'")
        print(f"{'='*70}")

    results = index.search(mapped_query, lang=target_lang, top_k=top_k)

    # Also search in original language
    orig_lang = q_proc['lang']
    orig_query = q_proc['normalized']

    if verbose:
        print(f"\nAlso searching in {orig_lang.upper()} documents...")
        print(f"Search query: '{orig_query}'")

    results_orig = index.search(orig_query, lang=orig_lang, top_k=top_k)

    # Calculate normalized scores (0-1)
    max_score = max(
        [r['score'] for r in results] + [r['score'] for r in results_orig] + [0.1]
    )

    elapsed_time = (time.time() - start_time) * 1000

    if verbose:
        print(f"\n{'='*70}")
        print(f"RESULTS - {target_lang.upper()} Documents (Translated Query)")
        print(f"{'='*70}")

    if results:
        for i, doc in enumerate(results, 1):
            confidence = min(doc['score'] / max_score, 1.0)

            if verbose:
                print(f"\n{i}. Score: {doc['score']:.4f} | Confidence: {confidence:.2f}")
                print(f"   Title: {doc.get('title', 'N/A')[:100]}...")
                print(f"   URL: {doc.get('url', 'N/A')}")
                print(f"   Date: {doc.get('date', 'N/A')}")

            # Low confidence warning
            if i == 1 and confidence < 0.20:
                print(f"\n   ⚠️ WARNING: Low confidence ({confidence:.2f})")
                print(f"   Retrieved results may not be relevant.")
    else:
        if verbose:
            print("   ⚠️ No results found")

    if verbose:
        print(f"\n{'='*70}")
        print(f"RESULTS - {orig_lang.upper()} Documents (Original Query)")
        print(f"{'='*70}")

    if results_orig:
        for i, doc in enumerate(results_orig, 1):
            confidence = min(doc['score'] / max_score, 1.0)

            if verbose:
                print(f"\n{i}. Score: {doc['score']:.4f} | Confidence: {confidence:.2f}")
                print(f"   Title: {doc.get('title', 'N/A')[:100]}...")
                print(f"   URL: {doc.get('url', 'N/A')}")
    else:
        if verbose:
            print("   ⚠️ No results found")

    if verbose:
        print(f"\n{'='*70}")
        print(f"Total processing time: {elapsed_time:.2f}ms")
        print(f"{'='*70}")

    return {
        'query': query,
        'processed': q_proc,
        'results_translated': results,
        'results_original': results_orig,
        'max_score': max_score,
        'processing_time_ms': elapsed_time
    }


# Test Queries

print("\n" + "="*70)
print("TESTING CROSS-LINGUAL SEARCH")
print("="*70)

test_queries = [
    "Prime Minister of Bangladesh",
    "প্রধানমন্ত্রী শেখ হাসিনা",
    "prison protest",
    "চট্টগ্রাম কারাগার বিক্ষোভ",
    "education system",
    "ঢাকা শিক্ষা"
]

for query in test_queries:
    result = cross_lingual_search(query, top_k=5, verbose=True)
    print("\n" + "="*70 + "\n")


#  Interactive Search

def interactive_search():
    """
    Interactive search interface
    """
    print("\n" + "="*70)
    print("INTERACTIVE CROSS-LINGUAL SEARCH")
    print("="*70)
    print("Type 'exit' to quit\n")

    while True:
        query = input("Enter your query: ").strip()

        if query.lower() == 'exit':
            print("Goodbye!")
            break

        if not query:
            continue

        cross_lingual_search(query, top_k=10, verbose=True)
        print("\n")

# Uncomment to run interactive search
  #interactive_search()


# Export Results for Evaluation

import csv

def export_results_for_evaluation(queries_list, output_file='evaluation_queries.csv'):
    """
    Export search results for manual relevance labeling
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'query', 'query_lang', 'doc_lang', 'rank',
            'doc_id', 'title', 'url', 'score', 'confidence', 'relevant'
        ])

        for query in queries_list:
            result = cross_lingual_search(query, top_k=50, verbose=False)

            q_lang = result['processed']['lang']
            max_score = result['max_score']

            # Export translated results
            for i, doc in enumerate(result['results_translated'], 1):
                confidence = min(doc['score'] / max_score, 1.0)
                writer.writerow([
                    query,
                    q_lang,
                    result['processed']['target_lang'],
                    i,
                    doc['doc_id'],
                    doc.get('title', 'N/A'),
                    doc.get('url', 'N/A'),
                    f"{doc['score']:.4f}",
                    f"{confidence:.4f}",
                    ''  # Empty for manual labeling
                ])

            # Export original language results
            for i, doc in enumerate(result['results_original'], 1):
                confidence = min(doc['score'] / max_score, 1.0)
                writer.writerow([
                    query,
                    q_lang,
                    q_lang,
                    i,
                    doc['doc_id'],
                    doc.get('title', 'N/A'),
                    doc.get('url', 'N/A'),
                    f"{doc['score']:.4f}",
                    f"{confidence:.4f}",
                    ''
                ])

    print(f"\n✓ Results exported to {output_file}")
    print(f"  Total queries: {len(queries_list)}")
    print(f"  You can now manually label the 'relevant' column with 'yes' or 'no'")

    files.download(output_file)

# Example: Export evaluation queries
evaluation_queries = [
    "প্রধানমন্ত্রী",
    "prime minister",
    "কারাগার বিক্ষোভ",
    "prison protest",
    "শিক্ষা ব্যবস্থা",
    "education system",
    "অর্থনীতি",
    "economy",
    "ঢাকা",
    "dhaka"
]

export_results_for_evaluation(evaluation_queries)


Upload your JSON/JSONL files (all files)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving bn_merged.jsonl to bn_merged.jsonl
Saving en_merged.jsonl to en_merged.jsonl
Uploaded files: ['bn_merged.jsonl', 'en_merged.jsonl']
Building index from files...
  Processing bn_merged.jsonl...
    Added 2500 documents
  Processing en_merged.jsonl...
    Added 2500 documents

Index built successfully!
Total documents: 5000
Bangla documents: 2500
English documents: 2500
Unique terms: 120536
Avg Bangla doc length: 312.14 tokens
Avg English doc length: 455.56 tokens

✓ Index saved to clir_index.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


TESTING CROSS-LINGUAL SEARCH

Processing Query: 'Prime Minister of Bangladesh'
1. Language Detection: en
2. Normalized: 'prime minister bangladesh'
3. Translated (en→bn): 'বাংলাদেশের প্রধানমন্ত্রী'
4. Expanded: 'বাংলাদেশের প্রধানমন্ত্রী bd bangladesh pm prime minister'
5. NE Mapped: 'বাংলাদেশের প্রধানমন্ত্রী bd বাংলাদেশ pm প্রধানমন্ত্রী'

Searching in BN documents...
Search query: 'বাংলাদেশের প্রধানমন্ত্রী bd বাংলাদেশ pm প্রধানমন্ত্রী'

Also searching in EN documents...
Search query: 'prime minister bangladesh'

RESULTS - BN Documents (Translated Query)

1. Score: 16.0236 | Confidence: 1.00
   Title: খালেদা জিয়ার প্রতি শ্রদ্ধা রাজনাথ সিংয়ের...
   URL: https://www.prothomalo.com/bangladesh/khaaledaa-jiyyaar-prti-shrddhaa-raajnaath-sinyyer
   Date: 2026-01-01T17:20:22+06:00

2. Score: 15.8361 | Confidence: 0.99
   Title: খালেদা জিয়ার মৃত্যুতে শোক লিটন–মিরাজদের...
   URL: https://www.prothomalo.com/sports/cricket/qjx5rgfmk1
   Date: 2025-12-30T13:25:31+06:00

3. Score: 15.4947 | Confiden

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
interactive_search()


INTERACTIVE CROSS-LINGUAL SEARCH
Type 'exit' to quit

Enter your query: bangladesh

Processing Query: 'bangladesh'
1. Language Detection: en
2. Normalized: 'bangladesh'
3. Translated (en→bn): 'বাংলাদেশ'
4. Expanded: 'বাংলাদেশ bd bangladesh'
5. NE Mapped: 'বাংলাদেশ bd বাংলাদেশ'

Searching in BN documents...
Search query: 'বাংলাদেশ bd বাংলাদেশ'

Also searching in EN documents...
Search query: 'bangladesh'

RESULTS - BN Documents (Translated Query)

1. Score: 14.8503 | Confidence: 1.00
   Title: ৩৩৩-তে ডায়াল করে মিলবে নির্বাচন ও গণভোটের তথ্য...
   URL: https://www.bd-pratidin.com/national/2026/02/04/1212917
   Date: 1770192761000

2. Score: 6.3674 | Confidence: 0.43
   Title: রাজনৈতিক দল অনুযায়ী প্রার্থীর সংখ্যা...
   URL: https://www.prothomalo.com/bangladesh/রাজনৈতিক-দল-অনুযায়ী-প্রার্থীর-সংখ্যা
   Date: 2014-01-05T01:20:10+06:00

3. Score: 6.2504 | Confidence: 0.42
   Title: জাতীয় হ্যান্ডবল ফাইনালে বিজিবি-পুলিশ লড়াই...
   URL: https://www.banglatribune.com/sport/other-sports/60673/%E0%