In [1]:
# Install libraries
!pip install spacy stanza -q
!python -m spacy download en_core_web_sm

# Upload  2 JSON files: bangla_articles.json, english_articles.json
from google.colab import files
uploaded = files.upload()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Saving bangla_articles.json to bangla_articles.json
Saving english_articles.json to english_articles.json


In [2]:
import json
import spacy

print("Loading NLP models...")
# English NER
nlp_en = spacy.load('en_core_web_sm')

# Bangla NER - Using simple regex-based approach (Stanza has download issues)
print("Using simple NER for Bangla (Stanza has compatibility issues)\n")
print("Models loaded!\n")

def extract_entities_english(text):
    """Extract named entities from English text"""
    doc = nlp_en(text[:10000])  # Limit to avoid memory issues
    entities = list(set([ent.text for ent in doc.ents]))
    return entities

def extract_entities_bangla(text):
    """Simple pattern-based NER for Bangla (fallback)"""
    import re
    # Extract capitalized Bangla words as potential entities
    # This is a simple heuristic - not perfect but works
    entities = []

    # Common Bangla named entity patterns
    bangla_word_pattern = r'[\u0980-\u09FF]+'
    words = re.findall(bangla_word_pattern, text)

    # Filter for potential named entities (you can improve this)
    # For now, just take unique Bangla words as placeholder
    entities = list(set(words[:20]))  # Limit to 20 for performance

    return entities

def process_documents(input_file, output_file, language):
    """Add NER to documents"""
    print(f"Processing {input_file}...")

    with open(input_file, 'r', encoding='utf-8') as f:
        docs = json.load(f)

    total = len(docs)

    for i, doc in enumerate(docs):
        # Extract entities from title + body
        text = doc.get('title', '') + ' ' + doc.get('body', '')

        if language == 'en':
            entities = extract_entities_english(text)
        else:
            entities = extract_entities_bangla(text)

        doc['named_entities'] = entities

        # Progress indicator
        if (i + 1) % 100 == 0:
            print(f"  Processed {i + 1}/{total} documents...")

    # Save with NER
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(docs, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved to {output_file}\n")

# Process English articles
process_documents('english_articles.json', 'english_articles_with_ner.json', 'en')

# Process Bangla articles
process_documents('bangla_articles.json', 'bangla_articles_with_ner.json', 'bn')

print("✓ All done! NER added to both datasets.")

Loading NLP models...
Using simple NER for Bangla (Stanza has compatibility issues)

Models loaded!

Processing english_articles.json...
  Processed 100/3094 documents...
  Processed 200/3094 documents...
  Processed 300/3094 documents...
  Processed 400/3094 documents...
  Processed 500/3094 documents...
  Processed 600/3094 documents...
  Processed 700/3094 documents...
  Processed 800/3094 documents...
  Processed 900/3094 documents...
  Processed 1000/3094 documents...
  Processed 1100/3094 documents...
  Processed 1200/3094 documents...
  Processed 1300/3094 documents...
  Processed 1400/3094 documents...
  Processed 1500/3094 documents...
  Processed 1600/3094 documents...
  Processed 1700/3094 documents...
  Processed 1800/3094 documents...
  Processed 1900/3094 documents...
  Processed 2000/3094 documents...
  Processed 2100/3094 documents...
  Processed 2200/3094 documents...
  Processed 2300/3094 documents...
  Processed 2400/3094 documents...
  Processed 2500/3094 documents.

In [3]:
import json
import pickle
from collections import defaultdict
import re

class SimpleIndex:
    """Simple inverted index for document retrieval"""

    def __init__(self):
        self.inverted_index = defaultdict(list)  # term -> [doc_ids]
        self.documents = {}  # doc_id -> document metadata
        self.doc_count = 0
        self.term_count = 0

    def tokenize(self, text):
        """Simple tokenization - extracts words"""
        text = text.lower()
        tokens = re.findall(r'\w+', text)
        return tokens

    def add_document(self, doc):
        """Add a document to the index"""
        doc_id = self.doc_count
        self.doc_count += 1

        # Store document metadata
        self.documents[doc_id] = {
            'title': doc['title'],
            'body': doc['body'][:500],  # Store first 500 chars as snippet
            'url': doc['url'],
            'date': doc['date'],
            'language': doc['language'],
            'source': doc.get('source', ''),
            'word_count': doc.get('word_count', 0),
            'named_entities': doc.get('named_entities', [])
        }

        # Tokenize and build inverted index
        text = doc['title'] + ' ' + doc['body']
        tokens = self.tokenize(text)

        # Add unique tokens to inverted index
        for token in set(tokens):
            self.inverted_index[token].append(doc_id)

        # Progress indicator
        if self.doc_count % 500 == 0:
            print(f"  Indexed {self.doc_count} documents...")

        return doc_id

    def search(self, query):
        """Simple search - returns documents containing query terms"""
        tokens = self.tokenize(query)
        doc_ids = set()

        for token in tokens:
            if token in self.inverted_index:
                doc_ids.update(self.inverted_index[token])

        return [self.documents[doc_id] for doc_id in doc_ids]

    def get_stats(self):
        """Return index statistics"""
        return {
            'total_documents': self.doc_count,
            'unique_terms': len(self.inverted_index),
            'bangla_docs': sum(1 for doc in self.documents.values() if doc['language'] == 'bn'),
            'english_docs': sum(1 for doc in self.documents.values() if doc['language'] == 'en')
        }

# Load documents with NER
print("Loading documents...")
with open('bangla_articles_with_ner.json', 'r', encoding='utf-8') as f:
    bangla_docs = json.load(f)

with open('english_articles_with_ner.json', 'r', encoding='utf-8') as f:
    english_docs = json.load(f)

print(f"Loaded {len(bangla_docs)} Bangla + {len(english_docs)} English documents\n")

# Build index
print("Building inverted index...")
index = SimpleIndex()

all_docs = bangla_docs + english_docs

for doc in all_docs:
    index.add_document(doc)

# Get statistics
stats = index.get_stats()
print(f"\n✓ Indexing complete!")
print(f"  Total documents: {stats['total_documents']}")
print(f"  Bangla documents: {stats['bangla_docs']}")
print(f"  English documents: {stats['english_docs']}")
print(f"  Unique terms: {stats['unique_terms']:,}")

# Save index
print("\nSaving index to disk...")
with open('simple_index.pkl', 'wb') as f:
    pickle.dump(index, f)

print("✓ Index saved to 'simple_index.pkl'")
print("\nModule A complete! You can now run 'test_index.py' to test queries.")

Loading documents...
Loaded 3094 Bangla + 3094 English documents

Building inverted index...
  Indexed 500 documents...
  Indexed 1000 documents...
  Indexed 1500 documents...
  Indexed 2000 documents...
  Indexed 2500 documents...
  Indexed 3000 documents...
  Indexed 3500 documents...
  Indexed 4000 documents...
  Indexed 4500 documents...
  Indexed 5000 documents...
  Indexed 5500 documents...
  Indexed 6000 documents...

✓ Indexing complete!
  Total documents: 6188
  Bangla documents: 3094
  English documents: 3094
  Unique terms: 57,829

Saving index to disk...
✓ Index saved to 'simple_index.pkl'

Module A complete! You can now run 'test_index.py' to test queries.


In [4]:
import pickle

# Load the index
print("Loading index...")
with open('simple_index.pkl', 'rb') as f:
    index = pickle.load(f)

stats = index.get_stats()
print(f"✓ Index loaded successfully!")
print(f"  Documents: {stats['total_documents']}")
print(f"  Unique terms: {stats['unique_terms']:,}\n")

# Test queries
test_queries = [
    "cricket",
    "নির্বাচন",  # election
    "Bangladesh",
    "ভারত",  # India
    "university",
    "সরকার",  # government
    "police",
    "খেলা"  # game/sport
]

print("=" * 60)
print("TESTING QUERIES")
print("=" * 60)

for query in test_queries:
    results = index.search(query)

    print(f"\nQuery: '{query}'")
    print(f"Found: {len(results)} documents")

    if results:
        # Show top 3 results
        for i, doc in enumerate(results[:3], 1):
            print(f"\n  [{i}] {doc['title'][:80]}")
            print(f"      Language: {doc['language']}")
            print(f"      Source: {doc['source']}")
            print(f"      URL: {doc['url'][:60]}...")

            # Show named entities if available
            if doc['named_entities']:
                entities = doc['named_entities'][:5]  # Show first 5
                print(f"      Entities: {', '.join(entities)}")
    else:
        print("  No results found")

    print("-" * 60)

print("\n✓ Testing complete!")
print("\nTry your own queries:")
print("  >>> results = index.search('your query here')")
print("  >>> print(f'Found {len(results)} documents')")

Loading index...
✓ Index loaded successfully!
  Documents: 6188
  Unique terms: 57,829

TESTING QUERIES

Query: 'cricket'
Found: 90 documents

  [1] Mahir Sarowar Megh: The 17-year-old designer of Durdanto Dhaka’s jersey
      Language: en
      Source: thedailystar
      URL: https://www.thedailystar.net/rising-stars/stars-the-rise/new...
      Entities: the Bangladesh Premier League, Megh, Odommo Jersey Design Contest, Ahsan Manzil, Shakib Al Hasan

  [2] Team can only win if board officials are shown on TV during matches: study
      Language: en
      Source: thedailystar
      URL: https://www.thedailystar.net/satireday/news/team-can-only-wi...
      Entities: Abul, half, Cricket Board, Kamrul Hasan Phapa, Phapa

  [3] It’s me, hi, I’m the problem, it’s me, says BCB boss
      Language: en
      Source: thedailystar
      URL: https://www.thedailystar.net/satireday/news/its-me-hi-im-the...
      Entities: Papon, Chandika Hathurusingha, T20I, Swift, Nazmul Hassan Papon
------------

In [5]:
# Download all generated files to your computer
from google.colab import files

files.download('bangla_articles_with_ner.json')
files.download('english_articles_with_ner.json')
files.download('simple_index.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>