In [2]:
!kaggle datasets download disisbig/nepali-wikipedia-articles

Dataset URL: https://www.kaggle.com/datasets/disisbig/nepali-wikipedia-articles
License(s): CC-BY-SA-4.0
Downloading nepali-wikipedia-articles.zip to /home/lang-chain/Documents/daraz_product_review/Notebook
 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 28.0M/28.4M [00:02<00:00, 11.9MB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28.4M/28.4M [00:02<00:00, 12.9MB/s]


In [15]:
import os
import hashlib
from tqdm import tqdm  # Install with: pip install tqdm

def combine_with_progress_bar():
    """
    Combine files with a progress bar for better visualization.
    """
    base_dir = 'nepali-wikipedia-articles/train/train'
    output_file = 'combined_nepali_wikipedia_progress.txt'
    
    if not os.path.exists(base_dir):
        print(f"Directory not found: {base_dir}")
        return
    
    seen_hashes = set()
    stats = {
        'unique': 0,
        'duplicates': 0,
        'missing': 0,
        'errors': 0,
        'total_chars': 0
    }
    
    print("Combining Nepali Wikipedia articles...")
    
    # Create a progress bar
    with tqdm(total=38756, desc="Processing files") as pbar:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for i in range(38756):
                filepath = os.path.join(base_dir, f"{i}.txt")
                
                if os.path.exists(filepath):
                    try:
                        with open(filepath, 'r', encoding='utf-8') as infile:
                            content = infile.read().strip()
                            
                            if content:
                                content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
                                
                                if content_hash not in seen_hashes:
                                    seen_hashes.add(content_hash)
                                    
                                    # Add separator if not first article
                                    if stats['unique'] > 0:
                                        outfile.write("\n" + "="*80 + "\n")
                                    
                                    outfile.write(content)
                                    stats['unique'] += 1
                                    stats['total_chars'] += len(content)
                                else:
                                    stats['duplicates'] += 1
                    except Exception as e:
                        stats['errors'] += 1
                        # Only show first few errors
                        if stats['errors'] <= 3:
                            tqdm.write(f"Error reading {i}.txt: {str(e)[:50]}...")
                else:
                    stats['missing'] += 1
                
                # Update progress bar
                pbar.update(1)
                
                # Update progress bar description with current stats
                if i % 1000 == 0:
                    pbar.set_postfix({
                        'Unique': stats['unique'],
                        'Dupes': stats['duplicates'],
                        'Missing': stats['missing']
                    })
    
    # Print final statistics
    print("\n" + "="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"Files processed: 38756")
    print(f"Unique articles: {stats['unique']}")
    print(f"Duplicates removed: {stats['duplicates']}")
    print(f"Missing files: {stats['missing']}")
    print(f"Files with errors: {stats['errors']}")
    
    if stats['unique'] > 0:
        avg_length = stats['total_chars'] / stats['unique']
        print(f"\nTotal characters: {stats['total_chars']:,}")
        print(f"Average article length: {avg_length:.0f} characters")
        
        # Calculate file size
        if os.path.exists(output_file):
            size_mb = os.path.getsize(output_file) / (1024 * 1024)
            print(f"Output file size: {size_mb:.2f} MB")
    
    print(f"\nOutput saved to: {output_file}")

# Run this directly
if __name__ == "__main__":
    combine_with_progress_bar()

Combining Nepali Wikipedia articles...


Processing files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 38756/38756 [00:00<00:00, 41603.01it/s, Unique=22046, Dupes=3083, Missing=11408]


PROCESSING COMPLETE
Files processed: 38756
Unique articles: 22440
Duplicates removed: 3197
Missing files: 11628
Files with errors: 0

Total characters: 17,474,756
Average article length: 779 characters
Output file size: 45.33 MB

Output saved to: combined_nepali_wikipedia_progress.txt





In [16]:
import re
import unicodedata

def clean_nepali_text(text):
    """Clean Nepali text while preserving Devanagari punctuation."""
    
    # Step 1: Unicode normalization (NFC)
    text = unicodedata.normalize('NFC', text)
    
    # Step 2: Keep Nepali-specific characters
    allowed_pattern = re.compile(
        r'[^'
        r'a-zA-Z'                    # English
        r'\u0900-\u097F'             # Devanagari including ‡•§‡••
        r'0-9\u0966-\u096F'          # Digits
        r'\s'                        # Whitespace
        r'.,!?;:()\[\]{}\-\'\"/\\'   # Punctuation
        r']+'
    )
    text = allowed_pattern.sub(' ', text)
    
    # Step 3: Fix spacing
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\t+', ' ', text)
    
    return text.strip()

# Process file in chunks to avoid memory crashes
print("üìñ Processing combined_nepali_wikipedia.txt in chunks...")

chunk_size = 1024 * 1024  # 1 MB chunks
buffer = []
total_original = 0
total_cleaned = 0

try:
    with open('combined_nepali_wikipedia_progress.txt', 'r', encoding='utf-8') as infile, \
         open('combined_nepali_wikipedia_cleaned.txt', 'w', encoding='utf-8') as outfile:
        
        chunk_num = 0
        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break
            
            chunk_num += 1
            total_original += len(chunk)
            
            # Clean chunk
            cleaned = clean_nepali_text(chunk)
            
            # Filter lines (minimum 5 chars)
            lines = [line.strip() for line in cleaned.split('\n') 
                     if len(line.strip()) >= 5]
            
            # Write to output
            if lines:
                output = '\n'.join(lines) + '\n'
                outfile.write(output)
                total_cleaned += len(output)
            
            # Progress indicator
            if chunk_num % 10 == 0:
                print(f"  Processed {chunk_num} chunks ({total_original:,} chars)...")
    
    print(f"\n‚úì Done!")
    print(f"Original: {total_original:,} chars")
    print(f"Cleaned: {total_cleaned:,} chars")
    print(f"Removed: {total_original - total_cleaned:,} chars")
    
    # Show sample
    print("\n--- Sample of cleaned text ---")
    with open('combined_nepali_wikipedia_progress.txt', 'r', encoding='utf-8') as f:
        print(f.read(500))

except FileNotFoundError:
    print("‚ùå Error: ne.txt not found in current directory")
except MemoryError:
    print("‚ùå Still running out of memory. Try increasing chunk_size or processing smaller sections.")
except Exception as e:
    print(f"‚ùå Error: {e}")

üìñ Processing combined_nepali_wikipedia.txt in chunks...
  Processed 10 chunks (10,485,760 chars)...

‚úì Done!
Original: 19,314,754 chars
Cleaned: 17,410,898 chars
Removed: 1,903,856 chars

--- Sample of cleaned text ---
‡§Ö‡§®‡•ç‡§®‡§™‡•Ç‡§∞‡•ç‡§£ ‡§™‡•ã‡§∑‡•ç‡§ü ‡§è‡§ï ‡§®‡•á‡§™‡§æ‡§≤‡•Ä ‡§¶‡•à‡§®‡§ø‡§ï ‡§™‡§§‡•ç‡§∞‡§ø‡§ï‡§æ ‡§π‡•ã‡•§ ‡§Ø‡•ã ‡§™‡§§‡•ç‡§∞‡§ø‡§ï‡§æ‡§ï‡•ã ‡§Æ‡•Å‡§ñ‡•ç‡§Ø‡§æ‡§≤‡§Ø ‡§®‡•á‡§™‡§æ‡§≤‡§ï‡•ã ‡§∞‡§æ‡§ú‡§ß‡§æ‡§®‡•Ä ‡§ï‡§æ‡§†‡§Æ‡§æ‡§°‡•å‡§Ç‡§Æ‡§æ ‡§õ ‡•§
‡§ï‡§Ö‡§®‡•ç‡§®‡§™‡•Ç‡§∞‡•ç‡§£ ‡§™‡•ã‡§∑‡•ç‡§ü‡§ï‡•ã ‡§™‡•ç‡§∞‡§•‡§Æ ‡§™‡§ü‡§ï ‡§™‡•ç‡§∞‡§ï‡§æ‡§∂‡§® 2058 ‡§∏‡§æ‡§≤ ‡§¨‡•à‡§∂‡§æ‡§ñ 18 ‡§ó‡§§‡•á ‡§≠‡§è‡§ï‡•ã ‡§π‡•ã‡•§
‡§™‡•ç‡§∞‡§•‡§Æ ‡§ï‡§æ‡§Å‡§ï‡§°‡§≠‡§ø‡§ü‡•ç‡§ü‡§æ ‡§ó‡•ã‡§≤‡•ç‡§°‡§ï‡§™‡§ï‡•ã ‡§â‡§™‡§æ‡§ß‡§ø ‡§≠‡•Å‡§ü‡§æ‡§®‡•Ä ‡§∂‡§∞‡§£‡§æ‡§∞‡•ç‡§•‡•Ä ‡§ï‡•ç‡§≤‡§¨‡§≤‡§æ‡§à ‡§´‡§æ‡§á‡§®‡§≤‡§Æ‡§æ 1-0 ‡§ï‡•ã ‡§ó‡•ã‡§≤‡§Ö‡§®‡•ç‡§§‡§∞‡§≤‡•á ‡§™‡§∞‡§æ‡§ú‡§ø‡§§ ‡§ó‡§∞‡•ç‡§¶‡•à ‡§µ‡§ø‡§∞‡•ç‡§§‡§æ‡§Æ‡•ã‡§° ‡§Ø‡•Å‡§®‡§æ‡§á‡§ü‡•á‡§° ‡§ï‡•ç‡§≤‡§≤‡•á‡§¨ ‡§â‡§™‡§æ‡§ß‡§ø‡

In [1]:
import hashlib
from tqdm import tqdm

file1 = 'combined_final.txt'
file2 = 'combined_nepali_wikipedia_cleaned.txt'
output_file = 'wikipedia_ncc_corpus.txt'

def line_hash(text):
    """Fast MD5 hash for dedupe."""
    return hashlib.md5(text.strip().encode('utf-8')).hexdigest()

seen = set()
total_written = 0
total_seen = 0

with open(output_file, 'w', encoding='utf-8') as out:
    for file in [file1, file2]:
        print(f"\nProcessing {file}...")

        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f):
                line = line.strip()
                total_seen += 1

                if len(line) < 5:
                    continue

                h = line_hash(line)
                if h in seen:
                    continue

                seen.add(h)
                out.write(line + "\n")
                total_written += 1

print("\n‚úì Combined + deduplicated successfully!")
print(f"Total lines seen: {total_seen:,}")
print(f"Unique lines written: {total_written:,}")
print(f"Output saved to: {output_file}")



Processing combined_final.txt...


6554129it [00:28, 230272.43it/s]



Processing combined_nepali_wikipedia_cleaned.txt...


66699it [00:00, 196899.15it/s]


‚úì Combined + deduplicated successfully!
Total lines seen: 6,620,828
Unique lines written: 6,610,859
Output saved to: wikipedia_ncc_corpus.txt





In [None]:
import sentencepiece as spm
from tqdm import tqdm
import threading
import time
import os

print("Starting training...")

# Progress tracking using file size monitoring
class TrainingMonitor:
    def __init__(self, model_prefix):
        self.model_prefix = model_prefix
        self.model_file = f"{model_prefix}.model"
        self.running = True
        self.pbar = tqdm(total=100, desc="Training", unit="%", 
                         bar_format='{l_bar}{bar}| {elapsed}')
        
    def monitor(self):
        """Monitor training by checking if model file exists and grows"""
        last_size = 0
        start_time = time.time()
        
        while self.running:
            time.sleep(2)  # Check every 2 seconds
            
            if os.path.exists(self.model_file):
                current_size = os.path.getsize(self.model_file)
                if current_size > last_size:
                    # File is growing, training is progressing
                    last_size = current_size
                    elapsed = time.time() - start_time
                    self.pbar.set_postfix({"size": f"{current_size/1024:.1f}KB", 
                                          "time": f"{elapsed:.0f}s"})
        
    def start(self):
        self.thread = threading.Thread(target=self.monitor, daemon=True)
        self.thread.start()
        
    def stop(self):
        self.running = False
        self.pbar.n = 100
        self.pbar.refresh()
        self.pbar.close()

# Start monitoring
monitor = TrainingMonitor('ne_spm')
monitor.start()

try:
    # Train with optimized settings
    spm.SentencePieceTrainer.train(
    input='ne_cleaned.txt',
    model_prefix='ne_spm_fixed',
    vocab_size=32000,
    character_coverage=0.9995,
    model_type='bpe',
    
    # ============ ADD THESE ============
    split_by_whitespace=True,        # Respect word boundaries
    split_by_unicode_script=True,    # Separate Devanagari/Latin scripts
    split_by_number=True,            # Keep numbers separate
    treat_whitespace_as_suffix=False, # Better word starts
    byte_fallback=True,              # Handle rare chars gracefully
    
    # Optionally: add common English words as user symbols
    user_defined_symbols=[
        'the', 'and', 'to', 'of', 'a', 'in', 'for', 'is', 'on', 'that',
        'book', 'flight', 'customer', 'service', 'help', 'want', 'need'
    ],
    # ===================================
    
    input_sentence_size=6707124,
    shuffle_input_sentence=True,
    max_sentence_length=8192,
    num_threads=16,
    minloglevel=1
)
finally:
    monitor.stop()

print("\n‚úì Training complete!")
print(f"Model saved: ne_spm_fixed.model")
print(f"Vocab saved: ne_spm_fixed.vocab")

Starting training...


Training:   0%|          | 00:00

Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 06:38


‚úì Training complete!
Model saved: ne_spm.model
Vocab saved: ne_spm.vocab





In [3]:
ls

combined_final.txt                      ne_spm_fixed.model
combined_nepali_wikipedia_cleaned.txt   ne_spm_fixed.vocab
combined_nepali_wikipedia_progress.txt  ne.txt
model.ipynb                             ne.txt.xz
ne_cleaned.txt                          new_cleaned.txt
ne_corpus_cleaned.txt                   news.ipynb
ne_corpus.txt                           news.txt
[0m[01;34mnepalinewsdataset[0m/                      news_wikipedia_ncc_corpus.txt
nepalinewsdataset.zip                   [01;34moscar-corpus-nepali[0m/
nepali_tokenizer_data.txt               oscar-corpus-nepali.zip
[01;34mnepali-wikipedia-articles[0m/              practise.ipynb
nepali-wikipedia-articles.zip           train.py
nepali_wikipedia.ipynb                  train_tokenizer.py
ne_spm_16V.model                        wikipedia_ncc_corpus.txt
ne_spm_16V.vocab                        wikipedia_.txt


In [4]:

sp = spm.SentencePieceProcessor()
sp.load('ne_spm_fixed.model')

# Test on mixed Nepali-English samples
test_samples = [
    "‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§ú ‡§π‡•ã‡•§",  # Pure Nepali
    "I want to book a flight to Kathmandu.",  # Pure English
    "‡§Æ ‡§Ü‡§ú office ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å‡•§",  # Code-mixed
    "Customer service ‡§≤‡•á help ‡§ó‡§∞‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§õ‡•§"  # Mixed domain
]

for text in test_samples:
    tokens = sp.encode(text, out_type=str)
    print(f"\nText: {text}")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {sp.encode(text)}")
    print(f"Decoded: {sp.decode(sp.encode(text))}")


Text: ‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§ú ‡§π‡•ã‡•§
Tokens: ['‚ñÅ‡§®‡§Æ‡§∏‡•ç‡§§‡•á', ',', '‚ñÅ‡§Æ‡•á‡§∞‡•ã', '‚ñÅ‡§®‡§æ‡§Æ', '‚ñÅ‡§∞‡§æ‡§ú', '‚ñÅ‡§π‡•ã', '‡•§']
Token IDs: [13137, 31911, 1050, 976, 474, 379, 31898]
Decoded: ‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§ú ‡§π‡•ã‡•§

Text: I want to book a flight to Kathmandu.
Tokens: ['‚ñÅI', '‚ñÅ', 'want', '‚ñÅ', 'to', '‚ñÅ', 'book', '‚ñÅ', 'a', '‚ñÅ', 'flight', '‚ñÅ', 'to', '‚ñÅ', '<0x4B>', 'a', 'thm', 'and', 'u', '.']
Token IDs: [2835, 31873, 18, 31873, 5, 31873, 13, 31873, 7, 31873, 14, 31873, 5, 31873, 95, 7, 15254, 4, 31965, 31920]
Decoded: I want to book a flight to Kathmandu.

Text: ‡§Æ ‡§Ü‡§ú office ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å‡•§
Tokens: ['‚ñÅ‡§Æ', '‚ñÅ‡§Ü‡§ú', '‚ñÅ', 'of', 'f', 'ice', '‚ñÅ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å', '‡•§']
Token IDs: [299, 630, 31873, 6, 31998, 16799, 13980, 31898]
Decoded: ‡§Æ ‡§Ü‡§ú office ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å‡•§

Text: Customer service ‡§≤‡•á help ‡§ó‡§∞‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§õ‡•§
Tokens: ['‚ñÅC', 'us', 

In [None]:
import sentencepiece as spm
from tqdm import tqdm
import threading
import time
import os

print("Starting OPTIMIZED training for Nepali-dominant corpus (8.2M lines)...")

# ============ CORPUS ANALYSIS ============
print("\nüìä Analyzing corpus composition...")
try:
    import re
    
    with open('eng_news_wikipedia_ncc_corpus.txt', 'r', encoding='utf-8') as f:
        sample_lines = [next(f) for _ in range(min(10000, 8222518))]
    
    devanagari_pattern = re.compile(r'[\u0900-\u097F]')
    latin_pattern = re.compile(r'[a-zA-Z]')
    
    eng_only = sum(1 for line in sample_lines if latin_pattern.search(line) and not devanagari_pattern.search(line))
    nep_only = sum(1 for line in sample_lines if devanagari_pattern.search(line) and not latin_pattern.search(line))
    mixed = sum(1 for line in sample_lines if devanagari_pattern.search(line) and latin_pattern.search(line))
    
    print(f"   English-only lines: {eng_only/len(sample_lines)*100:.1f}%")
    print(f"   Nepali-only lines: {nep_only/len(sample_lines)*100:.1f}%")
    print(f"   Mixed lines: {mixed/len(sample_lines)*100:.1f}%")
    print(f"   ‚Üí Prioritizing Nepali tokenization with English support...\n")
    
except Exception as e:
    print(f"   ‚ö†Ô∏è Could not analyze corpus: {e}\n")

# Enhanced progress tracking
class TrainingMonitor:
    def __init__(self, model_prefix):
        self.model_prefix = model_prefix
        self.model_file = f"{model_prefix}.model"
        self.running = True
        self.pbar = tqdm(total=100, desc="Training progress", unit="%", 
                         bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        
    def monitor(self):
        last_size = 0
        start_time = time.time()
        estimated_final_size = 2048 * 1024  # ~2MB
        
        while self.running:
            time.sleep(3)
            
            if os.path.exists(self.model_file):
                current_size = os.path.getsize(self.model_file)
                if current_size > last_size:
                    last_size = current_size
                    elapsed = time.time() - start_time
                    progress = min(95, (current_size / estimated_final_size) * 100)
                    self.pbar.n = int(progress)
                    self.pbar.set_postfix({
                        "size": f"{current_size/1024:.1f}KB", 
                        "time": f"{elapsed:.0f}s"
                    })
                    self.pbar.refresh()
        
    def start(self):
        self.thread = threading.Thread(target=self.monitor, daemon=True)
        self.thread.start()
        
    def stop(self):
        self.running = False
        self.pbar.n = 100
        self.pbar.set_postfix({"status": "Complete!"})
        self.pbar.refresh()
        self.pbar.close()

monitor = TrainingMonitor('nepali_optimized_spm')
monitor.start()

try:
    # OPTIMIZED for 90% Nepali, 10% English/Mixed
    training_args = {
        # ============ CORPUS SETTINGS ============
        'input': 'eng_news_wikipedia_ncc_corpus.txt',
        'model_prefix': 'nepali_optimized_spm',
        'vocab_size': 50000,  # Reduced - focused on Nepali
        'character_coverage': 0.9999,  # Very high for Devanagari priority
        'model_type': 'bpe',
        
        # ============ MEMORY & PERFORMANCE ============
        'train_extremely_large_corpus': True,
        'input_sentence_size': 5000000,  # Reduced to 5M for stability
        'shuffle_input_sentence': True,
        'max_sentence_length': 8192,  # Increased to handle long lines
        'max_sentencepiece_length': 20,  # Increased for better Devanagari tokens
        'num_threads': 6,  # Conservative threading
        'minloglevel': 1,
        
        # ============ NEPALI-OPTIMIZED MULTILINGUAL SETTINGS ============
        'split_by_whitespace': True,
        'split_by_unicode_script': False,  # Allow mixed script tokens
        'split_by_number': False,  # Keep Devanagari numerals intact
        'treat_whitespace_as_suffix': False,
        'byte_fallback': True,
        'normalization_rule_name': 'nfkc',  # Better for Devanagari
        'add_dummy_prefix': True,
        'remove_extra_whitespaces': True,
        
        # ============ VOCABULARY (Nepali-focused) ============
        'user_defined_symbols': [
            # Critical Nepali postpositions & particles
            '‡§ï‡•ã', '‡§õ', '‡§∞', '‡§Æ‡§æ', '‡§≤‡•á', '‡§≤‡§æ‡§à', '‡§¨‡§æ‡§ü', '‡§ï‡§æ', '‡§ï‡•Ä', '‡§™‡§®‡§ø',
            '‡§§', '‡§®‡•à', '‡§∏‡§Æ‡•ç‡§Æ', '‡§¶‡•á‡§ñ‡§ø', '‡§≠‡§®‡•ç‡§®‡•á', '‡§ó‡§∞‡•ç‡§®‡•á', '‡§ó‡§∞‡•ç‡§®', '‡§π‡•Å‡§®', '‡§≠‡§è‡§ï‡•ã',
            '‡§∞‡§π‡•á‡§ï‡•ã', '‡§Ü‡§è‡§ï‡•ã', '‡§ó‡§è‡§ï‡•ã', '‡§≠‡§è', '‡§õ‡§®‡•ç', '‡§•‡§ø‡§Ø‡•ã', '‡§•‡§ø‡§è', '‡§π‡•Å‡§®‡•ç‡§õ',
            
            # High-frequency Nepali nouns/verbs
            '‡§®‡•á‡§™‡§æ‡§≤', '‡§ï‡§æ‡§†‡§Æ‡§æ‡§°‡•å‡§Ç', '‡§∏‡§∞‡§ï‡§æ‡§∞', '‡§∏‡§Æ‡§æ‡§ö‡§æ‡§∞', '‡§ú‡§®‡§§‡§æ', '‡§Æ‡§æ‡§®‡§ø‡§∏', '‡§¶‡•á‡§∂',
            '‡§µ‡§ø‡§∂‡•ç‡§µ', '‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§ø‡§Ø', '‡§Ö‡§®‡•ç‡§§‡§∞‡•ç‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§ø‡§Ø', '‡§™‡•ç‡§∞‡§ß‡§æ‡§®‡§Æ‡§®‡•ç‡§§‡•ç‡§∞‡•Ä', '‡§Æ‡§®‡•ç‡§§‡•ç‡§∞‡•Ä',
            '‡§∏‡§Ç‡§∏‡§¶', '‡§®‡§ø‡§∞‡•ç‡§µ‡§æ‡§ö‡§®', '‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä', '‡§∏‡§Æ‡§Ø', '‡§¶‡§ø‡§®', '‡§µ‡§∞‡•ç‡§∑', '‡§Æ‡§π‡§ø‡§®‡§æ',
            '‡§µ‡§ø‡§ï‡§æ‡§∏', '‡§Ø‡•ã‡§ú‡§®‡§æ', '‡§ï‡§æ‡§∞‡•ç‡§Ø‡§ï‡•ç‡§∞‡§Æ', '‡§®‡•Ä‡§§‡§ø', '‡§ï‡§æ‡§®‡•Ç‡§®', '‡§Ö‡§ß‡§ø‡§ï‡§æ‡§∞',
            
            # Nepali conjunctions & connectors
            '‡§§‡§∞', '‡§Ø‡§¶‡§ø', '‡§Ø‡§∏‡§∞‡•Ä', '‡§ï‡§ø‡§®‡§≠‡§®‡•á', '‡§Ø‡§¶‡•ç‡§Ø‡§™‡§ø', '‡§§‡§•‡§æ‡§™‡§ø', '‡§Ö‡§∞‡•ç‡§•‡§æ‡§§‡•ç',
            
            # Essential English (for 10% mixed content)
            'the', 'and', 'to', 'of', 'a', 'in', 'for', 'is', 'that', 'with',
            'said', 'news', 'government', 'according', 'report', 'wikipedia',
            
            # Special tokens
            '<NEP>', '<ENG>', '<NUM>', '<DATE>', '<URL>',
        ],
        
        # ============ STANDARD SETTINGS ============
        'unk_id': 0,
        'bos_id': 1,
        'eos_id': 2,
        'pad_id': -1,
        'unk_piece': '<unk>',
        'bos_piece': '<s>',
        'eos_piece': '</s>',
        'pad_piece': '<pad>',
        'hard_vocab_limit': True,
    }
    
    arg_string = ' '.join([f'--{k}={v}' if not isinstance(v, list) else 
                           ' '.join([f'--{k}={item}' for item in v]) 
                           for k, v in training_args.items()])
    
    print(f"üîß Training with Nepali-optimized parameters...")
    spm.SentencePieceTrainer.train(arg_string)

except Exception as e:
    print(f"\n‚ùå Training failed: {e}")
    print("\nüí° Troubleshooting:")
    print("1. Verify input file: eng_news_wikipedia_ncc_corpus.txt")
    print("2. Check available RAM (recommend 8GB+)")
    print("3. Reduce input_sentence_size to 3000000")
    print("4. Reduce vocab_size to 32000")
    raise

finally:
    monitor.stop()

print("\n" + "="*70)
print("‚úì TRAINING COMPLETE!")
print("="*70)
print(f"üìä Corpus: 8,222,518 lines (90% Nepali, 10% Mixed/English)")
print(f"ü§ñ Model: nepali_optimized_spm.model")
print(f"üìö Vocab: nepali_optimized_spm.vocab")
print(f"üéØ Vocab size: 50,000 tokens (Nepali-optimized)")
print("="*70)

# ============ COMPREHENSIVE TESTING ============
print("\nüß™ Testing tokenizer performance...")

try:
    sp = spm.SentencePieceProcessor(model_file='nepali_optimized_spm.model')
    
    test_cases = [
        # Pure Nepali
        ("‡§Ü‡§ú‡§ï‡•ã ‡§Æ‡•Å‡§ñ‡•ç‡§Ø ‡§∏‡§Æ‡§æ‡§ö‡§æ‡§∞: ‡§®‡•á‡§™‡§æ‡§≤ ‡§∏‡§∞‡§ï‡§æ‡§∞‡§≤‡•á ‡§®‡§Ø‡§æ‡§Å ‡§®‡•Ä‡§§‡§ø ‡§ò‡•ã‡§∑‡§£‡§æ ‡§ó‡§∞‡•ç‡§Ø‡•ã‡•§", "Pure Nepali"),
        ("‡§µ‡§ø‡§ï‡§ø‡§™‡§ø‡§°‡§ø‡§Ø‡§æ ‡§è‡§ï ‡§®‡§ø‡§É‡§∂‡•Å‡§≤‡•ç‡§ï ‡§Ö‡§®‡§≤‡§æ‡§á‡§® ‡§µ‡§ø‡§∂‡•ç‡§µ‡§ï‡•ã‡§∂ ‡§π‡•ã‡•§", "Pure Nepali"),
        ("‡§ï‡§æ‡§†‡§Æ‡§æ‡§°‡•å‡§Ç‡§Æ‡§æ ‡§Ü‡§ú ‡•®‡•´ ‡§°‡§ø‡§ó‡•ç‡§∞‡•Ä ‡§§‡§æ‡§™‡§ï‡•ç‡§∞‡§Æ ‡§õ‡•§", "Nepali with numbers"),
        
        # Mixed content (your 10%)
        ("Nepal ‡§∏‡§∞‡§ï‡§æ‡§∞‡§≤‡•á Wikipedia ‡§Æ‡§æ article ‡§≤‡•á‡§ñ‡•ç‡§Ø‡•ã‡•§", "Code-switched"),
        ("‡§™‡•ç‡§∞‡§ß‡§æ‡§®‡§Æ‡§®‡•ç‡§§‡•ç‡§∞‡•Ä‡§≤‡•á BBC interview ‡§¶‡§ø‡§è‡•§", "Code-switched"),
        
        # Pure English (for comparison)
        ("Breaking news: Nepal's government announced new policies.", "Pure English"),
        ("Wikipedia is a free online encyclopedia.", "Pure English"),
        
        # Complex Nepali
        ("‡§®‡•á‡§™‡§æ‡§≤ ‡§∏‡§∞‡§ï‡§æ‡§∞‡§≤‡•á ‡§Ü‡§∞‡•ç‡§•‡§ø‡§ï ‡§µ‡§ø‡§ï‡§æ‡§∏‡§ï‡§æ ‡§≤‡§æ‡§ó‡§ø ‡§®‡§Ø‡§æ‡§Å ‡§Ø‡•ã‡§ú‡§®‡§æ ‡§§‡§∞‡•ç‡§ú‡•Å‡§Æ‡§æ ‡§ó‡§∞‡•á‡§ï‡•ã ‡§õ‡•§", "Complex Nepali"),
    ]
    
    print(f"\n{'='*70}")
    for sentence, label in test_cases:
        tokens = sp.encode(sentence, out_type=str)
        token_count = len(tokens)
        efficiency = len(sentence) / token_count
        
        print(f"\n[{label}]")
        print(f"Text: '{sentence[:60]}{'...' if len(sentence) > 60 else ''}'")
        print(f"Tokens ({token_count}): {tokens[:8]}{'...' if token_count > 8 else ''}")
        print(f"Efficiency: {efficiency:.1f} chars/token")
    
    print(f"\n{'='*70}")
    print(f"üìà Final vocabulary size: {sp.vocab_size():,}")
    print(f"üî§ First 15 tokens: {[sp.id_to_piece(i) for i in range(15)]}")
    print(f"{'='*70}")
    
    # Vocabulary composition analysis
    vocab_sample = [sp.id_to_piece(i) for i in range(100, 200)]
    devanagari_count = sum(1 for token in vocab_sample if any('\u0900' <= c <= '\u097F' for c in token))
    print(f"\nüìä Vocab composition (sample 100-200):")
    print(f"   Devanagari tokens: {devanagari_count}/100 ({devanagari_count}%)")
    print(f"   ‚Üí Good balance for your 90% Nepali corpus")
    
except Exception as e:
    print(f"‚ö†Ô∏è Testing failed: {e}")

print("\nüéâ Tokenizer ready for deployment!")
print("üí° Next steps:")
print("   1. Compare token counts between old and new model")
print("   2. Test on your actual downstream task")
print("   3. Monitor OOV rate on validation set")

Starting OPTIMIZED training for Nepali-dominant corpus (8.2M lines)...

üìä Analyzing corpus composition...
   English-only lines: 0.0%
   Nepali-only lines: 90.4%
   Mixed lines: 9.6%
   ‚Üí Prioritizing Nepali tokenization with English support...



Training progress:   0%|          | 0/100 [00:00<?]