In [1]:
# In Colab
!pip install googletrans==4.0.0-rc1 transformers sentencepiece torch -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for googletrans (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not cu

In [3]:
"""
Hybrid Translation System
- Primary: googletrans (fast, online)
- Fallback: M2M100 (reliable, offline, supports 100+ languages)
- Silent switching on failure
"""

from googletrans import Translator
import warnings
warnings.filterwarnings('ignore')

class HybridTranslator:
    def __init__(self):
        print("Initializing Hybrid Translator...")

        # Primary: googletrans
        self.google_translator = Translator()
        self.google_available = True

        # Fallback: M2M100 multilingual model
        print("Loading offline translation models (one-time download)...")
        try:
            from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

            self.model_multi = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
            self.tokenizer_multi = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

            self.hf_available = True
            print("✓ Offline models loaded successfully")
        except Exception as e:
            self.hf_available = False
            print(f"⚠ Offline fallback unavailable: {str(e)[:80]}")
            print("  System will use Google Translate only (requires internet)")

        print("✓ Hybrid Translator ready!\n")

        # Statistics
        self.stats = {
            'google_success': 0,
            'google_failures': 0,
            'hf_used': 0,
            'total_translations': 0
        }

    def translate(self, text, source_lang='en', target_lang='bn', verbose=False):
        """
        Translate text with automatic fallback

        Args:
            text: Text to translate
            source_lang: 'en' or 'bn'
            target_lang: 'en' or 'bn'
            verbose: Print which translator was used

        Returns:
            Translated text
        """
        self.stats['total_translations'] += 1

        # Try Google Translate first (fast)
        if self.google_available:
            try:
                result = self.google_translator.translate(
                    text,
                    src=source_lang,
                    dest=target_lang
                )
                self.stats['google_success'] += 1

                if verbose:
                    print(f"  [Used: Google Translate]")

                return result.text

            except Exception as e:
                self.stats['google_failures'] += 1
                if verbose:
                    print(f"  [Google failed: {str(e)[:50]}... switching to offline]")
                # Fall through to HuggingFace

        # Fallback to M2M100 (reliable, offline)
        if self.hf_available:
            try:
                self.stats['hf_used'] += 1

                if source_lang == 'en' and target_lang == 'bn':
                    # English → Bangla
                    self.tokenizer_multi.src_lang = "en"
                    encoded = self.tokenizer_multi(text, return_tensors="pt")
                    generated = self.model_multi.generate(
                        **encoded,
                        forced_bos_token_id=self.tokenizer_multi.get_lang_id("bn")
                    )
                    result = self.tokenizer_multi.batch_decode(generated, skip_special_tokens=True)[0]

                elif source_lang == 'bn' and target_lang == 'en':
                    # Bangla → English
                    self.tokenizer_multi.src_lang = "bn"
                    encoded = self.tokenizer_multi(text, return_tensors="pt")
                    generated = self.model_multi.generate(
                        **encoded,
                        forced_bos_token_id=self.tokenizer_multi.get_lang_id("en")
                    )
                    result = self.tokenizer_multi.batch_decode(generated, skip_special_tokens=True)[0]

                else:
                    result = text  # Same language, no translation needed

                if verbose:
                    print(f"  [Used: M2M100 Offline]")

                return result

            except Exception as e:
                if verbose:
                    print(f"  [M2M100 also failed: {e}]")
                return text  # Return original if both fail

        # Both failed
        if verbose:
            print("  [Warning: Both translators unavailable, returning original text]")
        return text

    def get_stats(self):
        """Return usage statistics"""
        return {
            'total': self.stats['total_translations'],
            'google_success': self.stats['google_success'],
            'google_failures': self.stats['google_failures'],
            'offline_used': self.stats['hf_used'],
            'success_rate': f"{(self.stats['google_success'] / max(1, self.stats['total_translations'])) * 100:.1f}%"
        }

    def print_stats(self):
        """Print translation statistics"""
        stats = self.get_stats()
        print("\n" + "="*50)
        print("TRANSLATION STATISTICS")
        print("="*50)
        print(f"Total translations: {stats['total']}")
        print(f"Google Translate (online): {stats['google_success']} successful")
        print(f"Google failures: {stats['google_failures']}")
        print(f"Offline fallback used: {stats['offline_used']}")
        print(f"Overall success rate: {stats['success_rate']}")
        print("="*50)


# Quick test
if __name__ == "__main__":
    translator = HybridTranslator()

    # Test translations
    print("Testing translations:\n")

    tests = [
        ("education", "en", "bn"),
        ("শিক্ষা", "bn", "en"),
        ("cricket", "en", "bn"),
    ]

    for text, src, tgt in tests:
        result = translator.translate(text, src, tgt, verbose=True)
        print(f"'{text}' → '{result}'\n")

    translator.print_stats()

Initializing Hybrid Translator...
Loading offline translation models (one-time download)...


config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

✓ Offline models loaded successfully
✓ Hybrid Translator ready!

Testing translations:

  [Used: Google Translate]
'education' → 'শিক্ষা'

  [Used: Google Translate]
'শিক্ষা' → 'education'

  [Used: Google Translate]
'cricket' → 'ক্রিকেট'


TRANSLATION STATISTICS
Total translations: 3
Google Translate (online): 3 successful
Google failures: 0
Offline fallback used: 0
Overall success rate: 100.0%


In [13]:
import json

entity_map = {
  "Bangladesh": "বাংলাদেশ",
  "Dhaka": "ঢাকা",
  "India": "ভারত",
  "Delhi": "দিল্লি",
  "Pakistan": "পাকিস্তান",
  "China": "চীন",
  "USA": "যুক্তরাষ্ট্র",
  "cricket": "ক্রিকেট",
  "football": "ফুটবল",
  "election": "নির্বাচন",
  "government": "সরকার",
  "parliament": "সংসদ",
  "police": "পুলিশ",
  "university": "বিশ্ববিদ্যালয়",
  "hospital": "হাসপাতাল",
  "school": "স্কুল",
  "student": "ছাত্র",
  "education": "শিক্ষা",
  "health": "স্বাস্থ্য",
  "politics": "রাজনীতি",
  "sports": "খেলাধুলা"
}

# Save it
with open('entity_mapper.json', 'w', encoding='utf-8') as f:
    json.dump(entity_map, f, ensure_ascii=False, indent=2)

print("✓ entity_mapper.json created!")

✓ entity_mapper.json created!


In [14]:
"""
Complete Query Processing Pipeline for Cross-Lingual IR
Integrates translation, entity mapping, and search
"""

import json
import pickle


class QueryProcessor:
    def __init__(self, index_path, entity_map_path):
        """
        Initialize query processor

        Args:
            index_path: Path to simple_index.pkl
            entity_map_path: Path to entity_mapper.json
        """
        print("Loading Query Processor...")

        # Load search index
        with open(index_path, 'rb') as f:
            self.index = pickle.load(f)
        print(f"✓ Loaded index with {self.index.doc_count} documents")

        # Load entity mapper
        with open(entity_map_path, 'r', encoding='utf-8') as f:
            self.entity_map = json.load(f)

        # Create reverse mapping (Bangla → English)
        self.entity_map_reverse = {v: k for k, v in self.entity_map.items()}
        print(f"✓ Loaded {len(self.entity_map)} entity mappings")

        # Initialize translator
        self.translator = HybridTranslator()

        print("✓ Query Processor ready!\n")

    def detect_language(self, query):
        """Detect if query is Bangla or English"""
        # Simple heuristic: check for Bangla Unicode
        bangla_chars = sum(1 for c in query if '\u0980' <= c <= '\u09FF')
        if bangla_chars > len(query) * 0.3:
            return 'bn'
        return 'en'

    def map_entities(self, query, source_lang):
        """
        Map named entities to target language

        Args:
            query: Original query
            source_lang: 'en' or 'bn'

        Returns:
            Query with mapped entities
        """
        if source_lang == 'en':
            # English → Bangla entity mapping
            for en_entity, bn_entity in self.entity_map.items():
                if en_entity.lower() in query.lower():
                    query = query + " " + bn_entity
        else:
            # Bangla → English entity mapping
            for bn_entity, en_entity in self.entity_map_reverse.items():
                if bn_entity in query:
                    query = query + " " + en_entity

        return query

    def process_query(self, query, verbose=True):
        """
        Complete query processing pipeline

        Args:
            query: User query string
            verbose: Print processing steps

        Returns:
            dict with processed query info
        """
        if verbose:
            print(f"\n{'='*60}")
            print(f"Processing Query: '{query}'")
            print('='*60)

        # Step 1: Detect language
        source_lang = self.detect_language(query)
        target_lang = 'bn' if source_lang == 'en' else 'en'

        if verbose:
            print(f"Detected language: {source_lang.upper()}")

        # Step 2: Map entities
        query_with_entities = self.map_entities(query, source_lang)
        if verbose and query_with_entities != query:
            print(f"Entity mapping added: {query_with_entities}")

        # Step 3: Translate query
        translated_query = self.translator.translate(
            query,
            source_lang,
            target_lang,
            verbose=verbose
        )

        if verbose:
            print(f"Translated to {target_lang.upper()}: '{translated_query}'")

        return {
            'original_query': query,
            'source_lang': source_lang,
            'translated_query': translated_query,
            'target_lang': target_lang,
            'query_with_entities': query_with_entities
        }

    def search(self, query, max_results=10, verbose=True):
        """
        Cross-lingual search

        Args:
            query: User query
            max_results: Max results to return
            verbose: Print details

        Returns:
            Combined results from both languages
        """
        # Process query
        processed = self.process_query(query, verbose)

        # Search in original language
        results_original = self.index.search(processed['original_query'])

        # Search in translated language
        results_translated = self.index.search(processed['translated_query'])

        # Search with entity-mapped query
        results_entities = self.index.search(processed['query_with_entities'])

        # Combine and deduplicate
        all_results = []
        seen_urls = set()

        for result in results_original + results_translated + results_entities:
            if result['url'] not in seen_urls:
                all_results.append(result)
                seen_urls.add(result['url'])

        if verbose:
            print(f"\n{'='*60}")
            print(f"SEARCH RESULTS")
            print('='*60)
            print(f"Original language results: {len(results_original)}")
            print(f"Translated language results: {len(results_translated)}")
            print(f"Total unique results: {len(all_results)}")
            print('='*60)

        return all_results[:max_results]

    def display_results(self, results):
        """Pretty print search results"""
        if not results:
            print("No results found.")
            return

        print(f"\nTop {len(results)} Results:\n")

        for i, doc in enumerate(results, 1):
            print(f"[{i}] {doc['title'][:80]}")
            print(f"    Language: {doc['language'].upper()}")
            print(f"    Source: {doc['source']}")
            print(f"    URL: {doc['url'][:70]}...")
            if doc.get('named_entities'):
                entities = doc['named_entities'][:3]
                print(f"    Entities: {', '.join(entities)}")
            print()


# Example usage
if __name__ == "__main__":
    # Initialize
    processor = QueryProcessor(
        index_path='simple_index.pkl',
        entity_map_path='entity_mapper.json'
    )

    # Test queries
    test_queries = [
        "cricket",
        "নির্বাচন",
        "Bangladesh election",
        "ভারত",
        "university"
    ]

    for query in test_queries:
        results = processor.search(query, max_results=5)
        processor.display_results(results)
        print("\n" + "="*60 + "\n")

    # Show translator stats
    processor.translator.print_stats()

Loading Query Processor...
✓ Loaded index with 6188 documents
✓ Loaded 21 entity mappings
Initializing Hybrid Translator...
Loading offline translation models (one-time download)...
✓ Offline models loaded successfully
✓ Hybrid Translator ready!

✓ Query Processor ready!


Processing Query: 'cricket'
Detected language: EN
Entity mapping added: cricket ক্রিকেট
  [Used: Google Translate]
Translated to BN: 'ক্রিকেট'

SEARCH RESULTS
Original language results: 90
Translated language results: 3146
Total unique results: 3233

Top 5 Results:

[1] Mahir Sarowar Megh: The 17-year-old designer of Durdanto Dhaka’s jersey
    Language: EN
    Source: thedailystar
    URL: https://www.thedailystar.net/rising-stars/stars-the-rise/news/mahir-sa...
    Entities: the Bangladesh Premier League, Megh, Odommo Jersey Design Contest

[2] Team can only win if board officials are shown on TV during matches: study
    Language: EN
    Source: thedailystar
    URL: https://www.thedailystar.net/satireday/news/tea

In [16]:
# Test complete pipeline

processor = QueryProcessor('simple_index.pkl', 'entity_mapper.json')

# Test cross-lingual search
results = processor.search("cricket in Bangladesh")
processor.display_results(results)

Loading Query Processor...
✓ Loaded index with 6188 documents
✓ Loaded 21 entity mappings
Initializing Hybrid Translator...
Loading offline translation models (one-time download)...
✓ Offline models loaded successfully
✓ Hybrid Translator ready!

✓ Query Processor ready!


Processing Query: 'cricket in Bangladesh'
Detected language: EN
Entity mapping added: cricket in Bangladesh বাংলাদেশ ক্রিকেট
  [Used: Google Translate]
Translated to BN: 'বাংলাদেশে ক্রিকেট'

SEARCH RESULTS
Original language results: 3077
Translated language results: 3153
Total unique results: 5982

Top 10 Results:

[1] বিশ্ব গণমাধ্যমে বাঙালির বিজয়
    Language: BN
    Source: Dhaka Post
    URL: https://www.dhakapost.com/opinion/417197...
    Entities: অবস্থান, লুকাতে, ১৯৭১

[2] বধ্যভূমির বাংলাদেশ, জেনোসাইড ১৯৭১
    Language: BN
    Source: Dhaka Post
    URL: https://www.dhakapost.com/opinion/416273...
    Entities: জেনোসাইড, ১৯৭১, সালে

[3] ইংরেজি ২য় পত্র
    Language: BN
    Source: Prothom Alo
    URL: http://ww

In [19]:
from google.colab import files

files.download('entity_mapper.json')
files.download('simple_index.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>