In [None]:
import re
import unicodedata

def clean_nepali_text(text):
    """Clean Nepali text while preserving Devanagari punctuation."""
    
    # Step 1: Unicode normalization (NFC)
    text = unicodedata.normalize('NFC', text)
    
    # Step 2: Keep Nepali-specific characters
    allowed_pattern = re.compile(
        r'[^'
        r'a-zA-Z'                    # English
        r'\u0900-\u097F'             # Devanagari including ‡•§‡••
        r'0-9\u0966-\u096F'          # Digits
        r'\s'                        # Whitespace
        r'.,!?;:()\[\]{}\-\'\"/\\'   # Punctuation
        r']+'
    )
    text = allowed_pattern.sub(' ', text)
    
    # Step 3: Fix spacing
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\t+', ' ', text)
    
    return text.strip()

# Process file in chunks to avoid memory crashes
print("üìñ Processing ne.txt in chunks...")

chunk_size = 1024 * 1024  # 1 MB chunks
buffer = []
total_original = 0
total_cleaned = 0

try:
    with open('ne.txt', 'r', encoding='utf-8') as infile, \
         open('ne_cleaned.txt', 'w', encoding='utf-8') as outfile:
        
        chunk_num = 0
        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break
            
            chunk_num += 1
            total_original += len(chunk)
            
            # Clean chunk
            cleaned = clean_nepali_text(chunk)
            
            # Filter lines (minimum 5 chars)
            lines = [line.strip() for line in cleaned.split('\n') 
                     if len(line.strip()) >= 5]
            
            # Write to output
            if lines:
                output = '\n'.join(lines) + '\n'
                outfile.write(output)
                total_cleaned += len(output)
            
            # Progress indicator
            if chunk_num % 10 == 0:
                print(f"  Processed {chunk_num} chunks ({total_original:,} chars)...")
    
    print(f"\n‚úì Done!")
    print(f"Cleaned: {total_cleaned:,} chars")
    print(f"Removed: {total_original - total_cleaned:,} chars")
    
    # Show sample
    print("\n--- Sample of cleaned text ---")
    with open('ne_cleaned.txt', 'r', encoding='utf-8') as f:
        print(f.read(500))

except FileNotFoundError:
    print("‚ùå Error: ne.txt not found in current directory")
except MemoryError:
    print("‚ùå Still running out of memory. Try increasing chunk_size or processing smaller sections.")
except Exception as e:
    print(f"‚ùå Error: {e}")

üìñ Processing ne.txt in chunks...
  Processed 10 chunks (10,485,760 chars)...
  Processed 20 chunks (20,971,520 chars)...
  Processed 30 chunks (31,457,280 chars)...
  Processed 40 chunks (41,943,040 chars)...
  Processed 50 chunks (52,428,800 chars)...
  Processed 60 chunks (62,914,560 chars)...
  Processed 70 chunks (73,400,320 chars)...
  Processed 80 chunks (83,886,080 chars)...
  Processed 90 chunks (94,371,840 chars)...
  Processed 100 chunks (104,857,600 chars)...
  Processed 110 chunks (115,343,360 chars)...
  Processed 120 chunks (125,829,120 chars)...
  Processed 130 chunks (136,314,880 chars)...
  Processed 140 chunks (146,800,640 chars)...
  Processed 150 chunks (157,286,400 chars)...
  Processed 160 chunks (167,772,160 chars)...
  Processed 170 chunks (178,257,920 chars)...
  Processed 180 chunks (188,743,680 chars)...
  Processed 190 chunks (199,229,440 chars)...
  Processed 200 chunks (209,715,200 chars)...
  Processed 210 chunks (220,200,960 chars)...
  Processed 220 

In [3]:
import hashlib
from tqdm import tqdm

file1 = 'ne_cleaned.txt'
file2 = 'ne_corpus_cleaned.txt'
output_file = 'combined_final.txt'

def line_hash(text):
    """Fast MD5 hash for dedupe."""
    return hashlib.md5(text.strip().encode('utf-8')).hexdigest()

seen = set()
total_written = 0
total_seen = 0

with open(output_file, 'w', encoding='utf-8') as out:
    for file in [file1, file2]:
        print(f"\nProcessing {file}...")

        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f):
                line = line.strip()
                total_seen += 1

                if len(line) < 5:
                    continue

                h = line_hash(line)
                if h in seen:
                    continue

                seen.add(h)
                out.write(line + "\n")
                total_written += 1

print("\n‚úì Combined + deduplicated successfully!")
print(f"Total lines seen: {total_seen:,}")
print(f"Unique lines written: {total_written:,}")
print(f"Output saved to: {output_file}")



Processing ne_cleaned.txt...


11607422it [00:38, 305216.61it/s]



Processing ne_corpus_cleaned.txt...


2508714it [00:11, 217116.64it/s]


‚úì Combined + deduplicated successfully!
Total lines seen: 14,116,136
Unique lines written: 6,554,129
Output saved to: combined_final.txt





In [1]:
import re
import unicodedata

def clean_nepali_text(text):
    """Clean Nepali text while preserving Devanagari punctuation."""
    
    # Step 1: Unicode normalization (NFC)
    text = unicodedata.normalize('NFC', text)
    
    # Step 2: Keep Nepali-specific characters
    allowed_pattern = re.compile(
        r'[^'
        r'a-zA-Z'                    # English
        r'\u0900-\u097F'             # Devanagari including ‡•§‡••
        r'0-9\u0966-\u096F'          # Digits
        r'\s'                        # Whitespace
        r'.,!?;:()\[\]{}\-\'\"/\\'   # Punctuation
        r']+'
    )
    text = allowed_pattern.sub(' ', text)
    
    # Step 3: Fix spacing
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\t+', ' ', text)
    
    return text.strip()

# Process file in chunks to avoid memory crashes
print("üìñ Processing ne_corpus.txt in chunks...")

chunk_size = 1024 * 1024  # 1 MB chunks
buffer = []
total_original = 0
total_cleaned = 0

try:
    with open('ne_corpus.txt', 'r', encoding='utf-8') as infile, \
         open('ne_corpus_cleaned.txt', 'w', encoding='utf-8') as outfile:
        
        chunk_num = 0
        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break
            
            chunk_num += 1
            total_original += len(chunk)
            
            # Clean chunk
            cleaned = clean_nepali_text(chunk)
            
            # Filter lines (minimum 5 chars)
            lines = [line.strip() for line in cleaned.split('\n') 
                     if len(line.strip()) >= 5]
            
            # Write to output
            if lines:
                output = '\n'.join(lines) + '\n'
                outfile.write(output)
                total_cleaned += len(output)
            
            # Progress indicator
            if chunk_num % 10 == 0:
                print(f"  Processed {chunk_num} chunks ({total_original:,} chars)...")
    
    print(f"\n‚úì Done!")
    print(f"Original: {total_original:,} chars")
    print(f"Cleaned: {total_cleaned:,} chars")
    print(f"Removed: {total_original - total_cleaned:,} chars")
    
    # Show sample
    print("\n--- Sample of cleaned text ---")
    with open('ne_corpus_cleaned.txt', 'r', encoding='utf-8') as f:
        print(f.read(500))

except FileNotFoundError:
    print("‚ùå Error: ne.txt not found in current directory")
except MemoryError:
    print("‚ùå Still running out of memory. Try increasing chunk_size or processing smaller sections.")
except Exception as e:
    print(f"‚ùå Error: {e}")

üìñ Processing ne_corpus.txt in chunks...
  Processed 10 chunks (10,485,760 chars)...
  Processed 20 chunks (20,971,520 chars)...
  Processed 30 chunks (31,457,280 chars)...
  Processed 40 chunks (41,943,040 chars)...
  Processed 50 chunks (52,428,800 chars)...
  Processed 60 chunks (62,914,560 chars)...
  Processed 70 chunks (73,400,320 chars)...
  Processed 80 chunks (83,886,080 chars)...
  Processed 90 chunks (94,371,840 chars)...
  Processed 100 chunks (104,857,600 chars)...
  Processed 110 chunks (115,343,360 chars)...
  Processed 120 chunks (125,829,120 chars)...
  Processed 130 chunks (136,314,880 chars)...
  Processed 140 chunks (146,800,640 chars)...
  Processed 150 chunks (157,286,400 chars)...
  Processed 160 chunks (167,772,160 chars)...
  Processed 170 chunks (178,257,920 chars)...
  Processed 180 chunks (188,743,680 chars)...
  Processed 190 chunks (199,229,440 chars)...
  Processed 200 chunks (209,715,200 chars)...
  Processed 210 chunks (220,200,960 chars)...
  Process

In [1]:
import sentencepiece as spm
from tqdm import tqdm
import threading
import time
import os

print("Starting training...")

# Progress tracking using file size monitoring
class TrainingMonitor:
    def __init__(self, model_prefix):
        self.model_prefix = model_prefix
        self.model_file = f"{model_prefix}.model"
        self.running = True
        self.pbar = tqdm(total=100, desc="Training", unit="%", 
                         bar_format='{l_bar}{bar}| {elapsed}')
        
    def monitor(self):
        """Monitor training by checking if model file exists and grows"""
        last_size = 0
        start_time = time.time()
        
        while self.running:
            time.sleep(2)  # Check every 2 seconds
            
            if os.path.exists(self.model_file):
                current_size = os.path.getsize(self.model_file)
                if current_size > last_size:
                    # File is growing, training is progressing
                    last_size = current_size
                    elapsed = time.time() - start_time
                    self.pbar.set_postfix({"size": f"{current_size/1024:.1f}KB", 
                                          "time": f"{elapsed:.0f}s"})
        
    def start(self):
        self.thread = threading.Thread(target=self.monitor, daemon=True)
        self.thread.start()
        
    def stop(self):
        self.running = False
        self.pbar.n = 100
        self.pbar.refresh()
        self.pbar.close()

# Start monitoring
monitor = TrainingMonitor('ne_spm')
monitor.start()

try:
    # Train with optimized settings
    spm.SentencePieceTrainer.train(
        input='ne_cleaned.txt',
        model_prefix='ne_spm',
        vocab_size=16000,
        character_coverage=1.0,
        model_type='bpe',
        input_sentence_size=11607422,
        shuffle_input_sentence=True,
        max_sentence_length=8192,     
        num_threads=16,
        minloglevel=1  
    )
finally:
    monitor.stop()

print("\n‚úì Training complete!")
print(f"Model saved: ne_spm.model")
print(f"Vocab saved: ne_spm.vocab")

Starting training...


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 06:41


‚úì Training complete!
Model saved: ne_spm.model
Vocab saved: ne_spm.vocab





In [2]:

sp = spm.SentencePieceProcessor()
sp.load('ne_spm.model')

# Test on mixed Nepali-English samples
test_samples = [
    "‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§ú ‡§π‡•ã‡•§",  # Pure Nepali
    "I want to book a flight to Kathmandu.",  # Pure English
    "‡§Æ ‡§Ü‡§ú office ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å‡•§",  # Code-mixed
    "Customer service ‡§≤‡•á help ‡§ó‡§∞‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§õ‡•§"  # Mixed domain
]

for text in test_samples:
    tokens = sp.encode(text, out_type=str)
    print(f"\nText: {text}")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {sp.encode(text)}")
    print(f"Decoded: {sp.decode(sp.encode(text))}")


Text: ‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§ú ‡§π‡•ã‡•§
Tokens: ['‚ñÅ‡§®‡§Æ‡§∏‡•ç‡§§‡•á', ',', '‚ñÅ‡§Æ‡•á‡§∞‡•ã', '‚ñÅ‡§®‡§æ‡§Æ', '‚ñÅ‡§∞‡§æ‡§ú', '‚ñÅ‡§π‡•ã', '‡•§']
Token IDs: [12984, 15861, 778, 703, 201, 106, 15848]
Decoded: ‡§®‡§Æ‡§∏‡•ç‡§§‡•á, ‡§Æ‡•á‡§∞‡•ã ‡§®‡§æ‡§Æ ‡§∞‡§æ‡§ú ‡§π‡•ã‡•§

Text: I want to book a flight to Kathmandu.
Tokens: ['‚ñÅI', '‚ñÅw', 'ant', '‚ñÅto', '‚ñÅb', 'ook', '‚ñÅa', '‚ñÅf', 'li', 'ght', '‚ñÅto', '‚ñÅK', 'athmandu', '.']
Token IDs: [2567, 6025, 7536, 7830, 3578, 9083, 3266, 4037, 9479, 5934, 7830, 2866, 15347, 15870]
Decoded: I want to book a flight to Kathmandu.

Text: ‡§Æ ‡§Ü‡§ú office ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å‡•§
Tokens: ['‚ñÅ‡§Æ', '‚ñÅ‡§Ü‡§ú', '‚ñÅof', 'f', 'ice', '‚ñÅ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å', '‡•§']
Token IDs: [26, 357, 4959, 15938, 13045, 13748, 15848]
Decoded: ‡§Æ ‡§Ü‡§ú office ‡§ú‡§æ‡§®‡•ç‡§õ‡•Å‡•§

Text: Customer service ‡§≤‡•á help ‡§ó‡§∞‡•ç‡§®‡•Å‡§™‡§∞‡•ç‡§õ‡•§
Tokens: ['‚ñÅC', 'ust', 'om', 'er', '‚ñÅs', 'er', 'v', 'ice', '‚ñÅ‡§≤‡•á', '‚ñÅh', 