In [1]:
!kaggle datasets download lotusacharya/nepalinewsdataset

Dataset URL: https://www.kaggle.com/datasets/lotusacharya/nepalinewsdataset
License(s): GPL-2.0
Downloading nepalinewsdataset.zip to /home/lang-chain/Documents/daraz_product_review/Notebook
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 18.0M/18.1M [00:01<00:00, 14.0MB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18.1M/18.1M [00:01<00:00, 13.2MB/s]


In [None]:
import os
import hashlib
from tqdm import tqdm
import time

def get_file_hash(filepath):
    """Calculate MD5 hash of a file to detect duplicates"""
    hash_md5 = hashlib.md5()
    try:
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None

def crawl_and_combine_txt_files(root_folder, output_file="news.txt"):
    """
    Crawl through all subfolders, find .txt files, remove duplicates, and combine them
    
    Args:
        root_folder: Path to the main folder (nepalinewsdataset)
        output_file: Name of the output combined file
    """
    
    # Dictionary to store file hashes and content (to detect duplicates)
    seen_files = {}
    txt_files = []
    total_size = 0
    
    print(f"Scanning folder structure from: {root_folder}")
    print("-" * 50)
    
    # First pass: Collect all .txt files with progress
    for root, dirs, files in os.walk(root_folder):
        for file in files:
            if file.endswith('.txt'):
                filepath = os.path.join(root, file)
                txt_files.append(filepath)

    
    print(f"Found {len(txt_files)} .txt files")
    
    if not txt_files:
        print("No .txt files found!")
        return
    
    # Process files with progress bar
    unique_files = 0
    duplicate_files = 0
    combined_content = []
    
    print("\nProcessing files and removing duplicates...")
    
    with tqdm(total=len(txt_files), desc="Processing", unit="file") as pbar:
        for filepath in txt_files:
            try:
                # Get file hash to check for duplicates
                file_hash = get_file_hash(filepath)
                
                if file_hash and file_hash not in seen_files:
                    # Read file content
                    with open(filepath, 'r', encoding='utf-8') as f:
                        content = f.read().strip()
                    
                    # Only add if file has content
                    if content:
                        # Add metadata about source file
                        relative_path = os.path.relpath(filepath, root_folder)
                        combined_content.append(f"\n\n{'='*80}\n")
                        combined_content.append(f"Source: {relative_path}\n")
                        combined_content.append(f"{'='*80}\n\n")
                        combined_content.append(content)
                        
                        seen_files[file_hash] = True
                        unique_files += 1
                        total_size += len(content)
                    else:
                        print(f"\nSkipping empty file: {filepath}")
                elif file_hash:
                    duplicate_files += 1
                    
            except UnicodeDecodeError:
                # Try with different encoding if utf-8 fails
                try:
                    with open(filepath, 'r', encoding='latin-1') as f:
                        content = f.read().strip()
                    
                    if content:
                        relative_path = os.path.relpath(filepath, root_folder)
                        combined_content.append(f"\n\n{'='*80}\n")
                        combined_content.append(f"Source: {relative_path}\n")
                        combined_content.append(f"{'='*80}\n\n")
                        combined_content.append(content)
                        
                        seen_files[file_hash] = True
                        unique_files += 1
                        total_size += len(content)
                except Exception as e:
                    print(f"\nCould not read {filepath} with any encoding: {e}")
                    
            except Exception as e:
                print(f"\nError processing {filepath}: {e}")
            
            pbar.update(1)
            # Small sleep to prevent CPU overuse
            time.sleep(0.001)
    
    print(f"\n{'='*60}")
    print(f"Summary:")
    print(f"  Total files found: {len(txt_files)}")
    print(f"  Unique files: {unique_files}")
    print(f"  Duplicate files skipped: {duplicate_files}")
    print(f"  Estimated output size: {total_size / (1024*1024):.2f} MB")
    print(f"{'='*60}\n")
    
    # Write combined content to output file
    if combined_content:
        print(f"Writing combined content to: {output_file}")
        
        with open(output_file, 'w', encoding='utf-8') as outfile:
            # Write header
            outfile.write(f"{'='*80}\n")
            outfile.write(f"COMBINED NEPALI NEWS DATASET\n")
            outfile.write(f"Generated from: {root_folder}\n")
            outfile.write(f"Total unique articles: {unique_files}\n")
            outfile.write(f"Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            outfile.write(f"{'='*80}\n")
            
            # Write all content
            for content_chunk in tqdm(combined_content, desc="Writing to file", unit="chunk"):
                outfile.write(content_chunk)
        
        print(f"\n‚úÖ Successfully created {output_file}")
        print(f"üìÅ Output location: {os.path.abspath(output_file)}")
        
        # Get final file size
        output_size = os.path.getsize(output_file) / (1024*1024)
        print(f"üìä Final file size: {output_size:.2f} MB")
    else:
        print("‚ùå No content to write!")

def main():
    # Set your main folder path
    root_folder = "nepalinewsdataset"  # Change this if needed
    
    # Check if folder exists
    if not os.path.exists(root_folder):
        print(f"Error: Folder '{root_folder}' not found!")
        print("Please make sure you're running this script from the correct directory.")
        return
    
    # Set output file name
    output_file = "news.txt"
    
    # Check if output file already exists
    if os.path.exists(output_file):
        response = input(f"{output_file} already exists. Overwrite? (y/n): ")
        if response.lower() != 'y':
            print("Operation cancelled.")
            return
    
    # Run the crawler and combiner
    try:
        crawl_and_combine_txt_files(root_folder, output_file)
    except KeyboardInterrupt:
        print("\n\n‚ö†Ô∏è  Process interrupted by user.")
    except Exception as e:
        print(f"\n‚ùå Error occurred: {e}")

if __name__ == "__main__":
    # Install tqdm if not installed
    try:
        from tqdm import tqdm
    except ImportError:
        print("Installing required package: tqdm")
        import subprocess
        subprocess.check_call(["pip", "install", "tqdm"])
        from tqdm import tqdm
    
    main()

Scanning folder structure from: nepalinewsdataset
--------------------------------------------------
Found 10000 .txt files

Processing files and removing duplicates...


Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:11<00:00, 847.54file/s]



Summary:
  Total files found: 10000
  Unique files: 9999
  Duplicate files skipped: 1
  Estimated output size: 26.52 MB

Writing combined content to: news.txt


Writing to file: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39996/39996 [00:00<00:00, 289915.72chunk/s]


‚úÖ Successfully created news.txt
üìÅ Output location: /home/lang-chain/Documents/daraz_product_review/Notebook/news.txt
üìä Final file size: 72.56 MB





In [None]:
import re
import unicodedata

def clean_nepali_text(text):
    """Clean Nepali text while preserving Devanagari punctuation."""
    
    # Step 1: Unicode normalization (NFC)
    text = unicodedata.normalize('NFC', text)
    
    # Step 2: Keep Nepali-specific characters
    allowed_pattern = re.compile(
        r'[^'
        r'a-zA-Z'                    # English
        r'\u0900-\u097F'             # Devanagari including ‡•§‡••
        r'0-9\u0966-\u096F'          # Digits
        r'\s'                        # Whitespace
        r'.,!?;:()\[\]{}\-\'\"/\\'   # Punctuation
        r']+'
    )
    text = allowed_pattern.sub(' ', text)
    
    # Step 3: Fix spacing
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\t+', ' ', text)
    
    return text.strip()

# Process file in chunks to avoid memory crashes
print("üìñ Processing news.txt in chunks...")

chunk_size = 1024 * 1024  # 1 MB chunks
buffer = []
total_original = 0
total_cleaned = 0

try:
    with open('news.txt', 'r', encoding='utf-8') as infile, \
         open('new_cleaned.txt', 'w', encoding='utf-8') as outfile:
        
        chunk_num = 0
        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break
            
            chunk_num += 1
            total_original += len(chunk)
            
            # Clean chunk
            cleaned = clean_nepali_text(chunk)
            
            # Filter lines (minimum 5 chars)
            lines = [line.strip() for line in cleaned.split('\n') 
                     if len(line.strip()) >= 5]
            
            # Write to output
            if lines:
                output = '\n'.join(lines) + '\n'
                outfile.write(output)
                total_cleaned += len(output)
            
            # Progress indicator
            if chunk_num % 10 == 0:
                print(f"  Processed {chunk_num} chunks ({total_original:,} chars)...")
    
    print(f"\n‚úì Done!")
    print(f"Original: {total_original:,} chars")
    print(f"Cleaned: {total_cleaned:,} chars")
    print(f"Removed: {total_original - total_cleaned:,} chars")
    
    # Show sample
    print("\n--- Sample of cleaned text ---")
    with open('news_cleaned.txt', 'r', encoding='utf-8') as f:
        print(f.read(500))

except FileNotFoundError:
    print("‚ùå Error: news.txt not found in current directory")
except MemoryError:
    print("‚ùå Still running out of memory. Try increasing chunk_size or processing smaller sections.")
except Exception as e:
    print(f"‚ùå Error: {e}")

üìñ Processing news.txt in chunks...
  Processed 10 chunks (10,485,760 chars)...
  Processed 20 chunks (20,971,520 chars)...

‚úì Done!
Original: 29,798,999 chars
Cleaned: 27,530,783 chars
Removed: 2,268,216 chars

--- Sample of cleaned text ---
‚ùå Error: news.txt not found in current directory


In [7]:
import hashlib
from tqdm import tqdm

file1 = 'wikipedia_ncc_corpus.txt'
file2 = 'new_cleaned.txt'
output_file = 'news_wikipedia_ncc_corpus.txt'

def line_hash(text):
    """Fast MD5 hash for dedupe."""
    return hashlib.md5(text.strip().encode('utf-8')).hexdigest()

seen = set()
total_written = 0
total_seen = 0

with open(output_file, 'w', encoding='utf-8') as out:
    for file in [file1, file2]:
        print(f"\nProcessing {file}...")

        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f):
                line = line.strip()
                total_seen += 1

                if len(line) < 5:
                    continue

                h = line_hash(line)
                if h in seen:
                    continue

                seen.add(h)
                out.write(line + "\n")
                total_written += 1

print("\n‚úì Combined + deduplicated successfully!")
print(f"Total lines seen: {total_seen:,}")
print(f"Unique lines written: {total_written:,}")
print(f"Output saved to: {output_file}")



Processing wikipedia_ncc_corpus.txt...


6610859it [00:29, 224942.28it/s]



Processing new_cleaned.txt...


97988it [00:00, 185291.71it/s]


‚úì Combined + deduplicated successfully!
Total lines seen: 6,708,847
Unique lines written: 6,707,124
Output saved to: news_wikipedia_ncc_corpus.txt





In [None]:

from datasets import load_dataset
import os
from tqdm import tqdm

def download_cc100_english(output_file='cc100_en.txt', max_lines=500000):
    """
    Download CC-100 English corpus
    """
    print(f"Downloading CC-100 English corpus (max {max_lines:,} lines)...")
    
    try:
        # CC-100 is available on Hugging Face
        dataset = load_dataset(
            'facebook/cc100',
            lang='en',
            streaming=True,  # Critical for large datasets
            trust_remote_code=True
        )
        
        lines_written = 0
        with open(output_file, 'w', encoding='utf-8') as f:
            for example in tqdm(dataset['train'], desc="Downloading"):
                text = example.get('text', '').strip()
                
                if text:
                    # Basic filtering
                    words = text.split()
                    if 3 <= len(words) <= 512:  # Reasonable length
                        f.write(text + '\n')
                        lines_written += 1
                        
                        if lines_written >= max_lines:
                            break
        
        # Get file size
        size_mb = os.path.getsize(output_file) / 1024 / 1024
        
        print(f"\n‚úì Downloaded {lines_written:,} lines")
        print(f"‚úì File size: {size_mb:.1f} MB")
        print(f"‚úì Saved to: {output_file}")
        
        return output_file
        
    except Exception as e:
        print(f"Error: {e}")
        print("\nTrying alternative method...")
        return download_cc100_alternative(output_file, max_lines)

def download_cc100_alternative(output_file, max_lines):
    """Alternative CC-100 download method"""
    try:
        # Sometimes CC-100 is in different format
        dataset = load_dataset('allenai/c4', 'en', streaming=True)
        
        lines_written = 0
        with open(output_file, 'w', encoding='utf-8') as f:
            for example in tqdm(dataset['train'], desc="Downloading C4 (alternative)"):
                text = example.get('text', '').strip()
                if text:
                    f.write(text + '\n')
                    lines_written += 1
                    if lines_written >= max_lines:
                        break
        
        print(f"‚úì Downloaded C4 English: {lines_written:,} lines")
        return output_file
        
    except Exception as e:
        print(f"Alternative failed: {e}")
        return None

# Download reasonable amount
download_cc100_english('cc100_en_200k.txt', max_lines=200000)

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'facebook/cc100' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


Downloading CC-100 English corpus (max 200,000 lines)...
Error: Dataset 'facebook/cc100' doesn't exist on the Hub or cannot be accessed.

Trying alternative method...


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Downloading C4 (alternative): 199999it [00:45, 4353.36it/s]

‚úì Downloaded C4 English: 200,000 lines





'cc100_en_200k.txt'

In [3]:
import re
import unicodedata

def clean_english_text_simple(text):
    """Simple English text cleaner similar to your Nepali cleaner."""
    
    # Step 1: Unicode normalization
    text = unicodedata.normalize('NFKC', text)
    
    # Step 2: Keep only English characters, digits, and basic punctuation
    allowed_pattern = re.compile(
        r'[^'
        r'a-zA-Z'                    # English letters
        r'0-9'                       # Digits
        r'\s'                        # Whitespace
        r'.,!?;:()\-_\'"'            # Basic punctuation
        r']+'
    )
    text = allowed_pattern.sub(' ', text)
    
    # Step 3: Fix spacing
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\t+', ' ', text)
    
    # Step 4: Remove lines that are too short or too long
    lines = text.split('\n')
    filtered_lines = []
    
    for line in lines:
        line = line.strip()
        if 20 <= len(line) <= 500:  # Reasonable line length
            # Check if has at least 3 words
            if len(line.split()) >= 3:
                filtered_lines.append(line)
    
    return '\n'.join(filtered_lines)

# Process file in chunks (identical to your Nepali processor)
def clean_english_file_simple(input_file, output_file, chunk_size=1024*1024):
    """Simple English file cleaner with chunk processing."""
    
    print(f"üìñ Cleaning {input_file}...")
    
    total_original = 0
    total_cleaned = 0
    chunk_num = 0
    
    try:
        with open(input_file, 'r', encoding='utf-8', errors='ignore') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:
            
            while True:
                chunk = infile.read(chunk_size)
                if not chunk:
                    break
                
                chunk_num += 1
                total_original += len(chunk)
                
                # Clean chunk
                cleaned = clean_english_text_simple(chunk)
                
                # Write if not empty
                if cleaned.strip():
                    outfile.write(cleaned + '\n')
                    total_cleaned += len(cleaned)
                
                # Progress
                if chunk_num % 10 == 0:
                    print(f"  Processed {chunk_num} chunks...")
    
    except FileNotFoundError:
        print(f"‚ùå Error: {input_file} not found")
        return
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return
    
    print(f"\n‚úì Done!")
    print(f"Original: {total_original:,} chars")
    print(f"Cleaned: {total_cleaned:,} chars")
    print(f"Removed: {total_original - total_cleaned:,} chars")
    
    # Show sample
    print("\n--- Sample ---")
    with open(output_file, 'r', encoding='utf-8') as f:
        sample = ''.join([f.readline() for _ in range(3)])
        print(sample[:500])

# Usage (just like your Nepali code)
if __name__ == "__main__":
    clean_english_file_simple(
        input_file='cc100_en_200k.txt',
        output_file='cc100_en_cleaned.txt',
        chunk_size=1024*1024  
    )

üìñ Cleaning cc100_en_200k.txt...
  Processed 10 chunks...
  Processed 20 chunks...
  Processed 30 chunks...
  Processed 40 chunks...
  Processed 50 chunks...
  Processed 60 chunks...
  Processed 70 chunks...
  Processed 80 chunks...
  Processed 90 chunks...
  Processed 100 chunks...
  Processed 110 chunks...
  Processed 120 chunks...
  Processed 130 chunks...
  Processed 140 chunks...
  Processed 150 chunks...
  Processed 160 chunks...
  Processed 170 chunks...
  Processed 180 chunks...
  Processed 190 chunks...
  Processed 200 chunks...
  Processed 210 chunks...
  Processed 220 chunks...
  Processed 230 chunks...
  Processed 240 chunks...
  Processed 250 chunks...
  Processed 260 chunks...
  Processed 270 chunks...
  Processed 280 chunks...
  Processed 290 chunks...
  Processed 300 chunks...
  Processed 310 chunks...
  Processed 320 chunks...
  Processed 330 chunks...
  Processed 340 chunks...
  Processed 350 chunks...
  Processed 360 chunks...
  Processed 370 chunks...
  Processed 

In [4]:
import hashlib
from tqdm import tqdm

file1 = 'news_wikipedia_ncc_corpus.txt'
file2 = 'cc100_en_cleaned.txt'
output_file = 'eng_news_wikipedia_ncc_corpus.txt'

def line_hash(text):
    """Fast MD5 hash for dedupe."""
    return hashlib.md5(text.strip().encode('utf-8')).hexdigest()

seen = set()
total_written = 0
total_seen = 0

with open(output_file, 'w', encoding='utf-8') as out:
    for file in [file1, file2]:
        print(f"\nProcessing {file}...")

        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f):
                line = line.strip()
                total_seen += 1

                if len(line) < 5:
                    continue

                h = line_hash(line)
                if h in seen:
                    continue

                seen.add(h)
                out.write(line + "\n")
                total_written += 1

print("\n‚úì Combined + deduplicated successfully!")
print(f"Total lines seen: {total_seen:,}")
print(f"Unique lines written: {total_written:,}")
print(f"Output saved to: {output_file}")



Processing news_wikipedia_ncc_corpus.txt...


6707124it [00:29, 225239.93it/s]



Processing cc100_en_cleaned.txt...


1515908it [00:02, 536323.05it/s]


‚úì Combined + deduplicated successfully!
Total lines seen: 8,223,032
Unique lines written: 8,222,518
Output saved to: eng_news_wikipedia_ncc_corpus.txt





In [None]:
import sentencepiece as spm
from tqdm import tqdm
import threading
import time
import os
import re
import sys
from itertools import islice

print("Starting training for eng_news_wikipedia_ncc_corpus (8.2M lines)...")

# ============ CORPUS ANALYSIS ============
print("\nüìä Analyzing corpus composition...")
try:
    # Check if file exists
    input_file = 'eng_news_wikipedia_ncc_corpus.txt'
    if not os.path.exists(input_file):
        print(f"‚ùå ERROR: Input file not found: {input_file}")
        print("   Please check the file path and try again.")
        exit(1)
    
    # Get file size
    file_size = os.path.getsize(input_file)
    print(f"   File size: {file_size / (1024*1024*1024):.2f} GB")
    
    # Sample first 10000 lines safely
    with open(input_file, 'r', encoding='utf-8') as f:
        sample_lines = list(islice(f, min(10000, 8222518)))
    
    devanagari_pattern = re.compile(r'[\u0900-\u097F]')  # Nepali/Devanagari
    latin_pattern = re.compile(r'[a-zA-Z]')
    
    eng_only = sum(1 for line in sample_lines if latin_pattern.search(line) and not devanagari_pattern.search(line))
    nep_only = sum(1 for line in sample_lines if devanagari_pattern.search(line) and not latin_pattern.search(line))
    mixed = sum(1 for line in sample_lines if devanagari_pattern.search(line) and latin_pattern.search(line))
    
    print(f"   English-only lines: {eng_only/len(sample_lines)*100:.1f}%")
    print(f"   Nepali-only lines: {nep_only/len(sample_lines)*100:.1f}%")
    print(f"   Mixed lines: {mixed/len(sample_lines)*100:.1f}%")
    print(f"   ‚Üí Optimizing for bilingual tokenization...\n")
    
except Exception as e:
    print(f"   ‚ö†Ô∏è Could not analyze corpus: {e}")
    print(f"   ‚Üí Proceeding with mixed language settings...\n")

# BETTER PROGRESS TRACKING - Time-based since SentencePiece doesn't output progress
class TrainingMonitor:
    def __init__(self, model_prefix, total_minutes=180):
        self.model_prefix = model_prefix
        self.model_file = f"{model_prefix}.model"
        self.vocab_file = f"{model_prefix}.vocab"
        self.running = True
        self.start_time = time.time()
        self.total_minutes = total_minutes
        
        # Create a progress bar for time estimation
        self.pbar = tqdm(total=100, desc="Training 8.2M corpus", unit="%", 
                         bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {postfix}]')
        
    def monitor(self):
        """Time-based monitoring since SentencePiece doesn't output progress"""
        stages = [
            (0, "Initializing"),
            (5, "Reading corpus"),
            (15, "Computing vocab"),
            (40, "Training BPE"),
            (70, "Optimizing"),
            (90, "Finalizing"),
            (100, "Complete")
        ]
        
        last_file_size = 0
        file_check_count = 0
        
        while self.running:
            elapsed_seconds = time.time() - self.start_time
            elapsed_minutes = elapsed_seconds / 60
            
            # Estimate progress based on time (max 99% until done)
            progress = min(99, (elapsed_minutes / self.total_minutes) * 100)
            
            # Check if model file exists and is growing (occasionally)
            file_check_count += 1
            if file_check_count % 10 == 0 and os.path.exists(self.model_file):
                current_size = os.path.getsize(self.model_file)
                if current_size > last_file_size:
                    last_file_size = current_size
                    # File is growing, we're likely at 70-90% stage
                    progress = max(progress, 70)
            
            # Determine current stage
            current_stage = stages[-1][1]
            for stage_progress, stage_name in stages:
                if progress >= stage_progress:
                    current_stage = stage_name
            
            # Update progress bar
            self.pbar.n = int(progress)
            self.pbar.set_postfix({
                "stage": current_stage,
                "elapsed": f"{elapsed_minutes:.0f}m",
                "est_remain": f"{max(0, self.total_minutes - elapsed_minutes):.0f}m"
            })
            self.pbar.refresh()
            
            time.sleep(5)  # Update every 5 seconds
        
    def start(self):
        self.thread = threading.Thread(target=self.monitor, daemon=True)
        self.thread.start()
        
    def stop(self):
        self.running = False
        if hasattr(self, 'thread'):
            self.thread.join(timeout=2)
        self.pbar.n = 100
        self.pbar.set_postfix({"stage": "Complete!", "elapsed": f"{(time.time() - self.start_time)/60:.0f}m"})
        self.pbar.refresh()
        self.pbar.close()

# Start monitoring - estimate 3 hours for 6M lines
monitor = TrainingMonitor('eng_nep_8m_spm', total_minutes=180)
monitor.start()

try:
    # FIXED PARAMETERS with valid SentencePiece arguments
    print("\nüîß Training with optimized parameters for large corpus...")
    print("   Estimated time: 2-3 hours for 6M lines")
    
    # Use keyword arguments instead of building arg_string to avoid issues
    spm.SentencePieceTrainer.train(
        input='eng_news_wikipedia_ncc_corpus.txt',
        model_prefix='eng_nep_8m_spm',
        vocab_size=50000,  # Changed to match what's printed later
        character_coverage=0.9999,
        model_type='bpe',

        # Text splitting parameters
        split_by_whitespace=True,
        split_by_unicode_script=True,  # Changed to True for better language separation
        split_digits=False,
        byte_fallback=False,

        # Token length
        max_sentencepiece_length=80,

        # Corpus handling - using 6M lines as specified
        input_sentence_size=6000000,
        shuffle_input_sentence=True,
        
        # Hardware optimization
        num_threads=8,  # Reduced for stability
        
        # Normalization (FIXED parameter name)
        normalization_rule_name='nmt_nfkc_cf',
        remove_extra_whitespace=True,  # FIXED: removed 's' from 'whitespaces'
        add_dummy_prefix=True,

        # Special tokens as comma-separated string
        user_defined_symbols='<ENG>,<NEP>,<MIXED>,<NUM>,<DATE>,<URL>,<EMAIL>',

        # Token IDs
        unk_id=0,
        bos_id=1,
        eos_id=2,
        pad_id=-1,
        
        # Additional optimization for large corpus
        max_sentence_length=8192,
        seed_sentencepiece_size=3000000,
        training_time=10,  # seconds per merge operation
    )

except KeyboardInterrupt:
    print("\n\n‚ö†Ô∏è Training interrupted by user")
    monitor.stop()
    exit(1)

except Exception as e:
    print(f"\n‚ùå Training failed: {e}")
    print("\nüí° Troubleshooting tips:")
    print("1. Check if input file exists and is readable")
    print("2. Ensure you have enough RAM (6M lines needs ~4-6GB)")
    print("3. Try reducing vocab_size to 32000")
    print("4. Try reducing num_threads to 4")
    print("5. Check file encoding (should be UTF-8)")
    
    # Try with smaller parameters
    print("\nüîÑ Trying with reduced parameters...")
    try:
        spm.SentencePieceTrainer.train(
            input='eng_news_wikipedia_ncc_corpus.txt',
            model_prefix='eng_nep_8m_spm_small',
            vocab_size=32000,
            model_type='bpe',
            num_threads=4,
            input_sentence_size=3000000,
            character_coverage=0.999,
        )
        print("\n‚úì Training completed with reduced parameters!")
    except Exception as e2:
        print(f"\n‚ùå Reduced training also failed: {e2}")
        raise

finally:
    monitor.stop()

print("\n" + "="*60)
print("‚úì TRAINING COMPLETE!")
print("="*60)

# Check which model was created
if os.path.exists('eng_nep_8m_spm.model'):
    model_file = 'eng_nep_8m_spm.model'
    print(f"üìä Corpus size processed: 6,000,000 lines")
    print(f"ü§ñ Model saved: eng_nep_8m_spm.model")
    print(f"üìö Vocab saved: eng_nep_8m_spm.vocab")
    print(f"üéØ Vocab size: 50,000 tokens")
elif os.path.exists('eng_nep_8m_spm_small.model'):
    model_file = 'eng_nep_8m_spm_small.model'
    print(f"üìä Corpus size processed: 3,000,000 lines")
    print(f"ü§ñ Model saved: eng_nep_8m_spm_small.model")
    print(f"üìö Vocab saved: eng_nep_8m_spm_small.vocab")
    print(f"üéØ Vocab size: 32,000 tokens")
else:
    print("‚ùå No model file was created")
    exit(1)

print("="*60)

# ============ COMPREHENSIVE TESTING ============
print("\nüß™ Testing tokenizer performance...")

try:
    sp = spm.SentencePieceProcessor(model_file=model_file)
    
    test_cases = [
        # English sentences
        ("English", "Breaking news: Nepal's government announced new policies today."),
        ("English", "Wikipedia is a free online encyclopedia with millions of articles."),
        
        # Nepali sentences  
        ("Nepali", "‡§Ü‡§ú‡§ï‡•ã ‡§Æ‡•Å‡§ñ‡•ç‡§Ø ‡§∏‡§Æ‡§æ‡§ö‡§æ‡§∞: ‡§®‡•á‡§™‡§æ‡§≤ ‡§∏‡§∞‡§ï‡§æ‡§∞‡§≤‡•á ‡§®‡§Ø‡§æ‡§Å ‡§®‡•Ä‡§§‡§ø ‡§ò‡•ã‡§∑‡§£‡§æ ‡§ó‡§∞‡•ç‡§Ø‡•ã‡•§"),
        ("Nepali", "‡§µ‡§ø‡§ï‡§ø‡§™‡§ø‡§°‡§ø‡§Ø‡§æ ‡§è‡§ï ‡§®‡§ø‡§É‡§∂‡•Å‡§≤‡•ç‡§ï ‡§Ö‡§®‡§≤‡§æ‡§á‡§® ‡§µ‡§ø‡§∂‡•ç‡§µ‡§ï‡•ã‡§∂ ‡§π‡•ã‡•§"),
        
        # Mixed content
        ("Mixed", "Nepal ‡§∏‡§∞‡§ï‡§æ‡§∞‡§≤‡•á Wikipedia ‡§Æ‡§æ article ‡§≤‡•á‡§ñ‡•ç‡§Ø‡•ã‡•§"),
        ("Mixed", "‡§ï‡§æ‡§†‡§Æ‡§æ‡§°‡•å‡§Ç‡§Æ‡§æ temperature ‡§Ü‡§ú 25¬∞C ‡§õ according to weather report‡•§"),
        
        # Numbers and dates
        ("Mixed", "‡•®‡•¶‡•®‡•™ ‡§∏‡§æ‡§≤‡§Æ‡§æ Nepal ‡§Æ‡§æ ‡•≠ ‡§ï‡§∞‡•ã‡§° population ‡§õ‡•§"),
        ("English", "The population of Nepal in 2024 is approximately 30 million.")
    ]
    
    print(f"\nüìà Vocabulary size: {sp.vocab_size():,} tokens")
    print(f"üî§ Unknown token: {sp.id_to_piece(sp.unk_id())}")
    print(f"üî§ Begin token: {sp.id_to_piece(sp.bos_id())}")
    print(f"üî§ End token: {sp.id_to_piece(sp.eos_id())}")
    print("-" * 60)
    
    for i, (lang, sentence) in enumerate(test_cases, 1):
        tokens = sp.encode(sentence, out_type=str)
        token_ids = sp.encode(sentence, out_type=int)
        token_count = len(tokens)
        
        print(f"\n{i}. [{lang}] '{sentence[:50]}{'...' if len(sentence) > 50 else ''}'")
        print(f"   ‚Üí {token_count} tokens")
        print(f"   ‚Üí Tokens: {' '.join(tokens[:12])}{'...' if token_count > 12 else ''}")
        
        # Show first few token IDs for reference
        if i == 1:
            print(f"   ‚Üí First 5 token IDs: {token_ids[:5]}")
    
    # Test some basic operations
    print("\n" + "="*60)
    print("üß™ Additional tests:")
    print("-" * 60)
    
    # Test encoding/decoding
    test_text = "Hello ‡§®‡§Æ‡§∏‡•ç‡§§‡•á"
    encoded = sp.encode(test_text, out_type=int)
    decoded = sp.decode(encoded)
    print(f"Encode/decode test:")
    print(f"  Original: '{test_text}'")
    print(f"  Encoded: {encoded}")
    print(f"  Decoded: '{decoded}'")
    print(f"  Match: {'‚úì' if test_text == decoded else '‚úó'}")
    
    # Show some vocabulary samples
    print(f"\nSample vocabulary (first 10):")
    for i in range(min(10, sp.vocab_size())):
        piece = sp.id_to_piece(i)
        if piece.startswith('‚ñÅ'):
            print(f"  {i:4d}: '{piece}' (prefix)")
        else:
            print(f"  {i:4d}: '{piece}'")
    
except Exception as e:
    print(f"‚ö†Ô∏è Testing failed: {e}")

print("\n" + "="*60)
print("üéâ Ready for use with your corpus!")
print("="*60)
print("\nUsage example:")
print("  import sentencepiece as spm")
print(f"  sp = spm.SentencePieceProcessor(model_file='{model_file}')")
print("  tokens = sp.encode('Your text here')")
print("="*60)

Starting training for eng_news_wikipedia_ncc_corpus (8.2M lines)...

üìä Analyzing corpus composition...
   File size: 3.64 GB
   English-only lines: 0.0%
   Nepali-only lines: 90.4%
   Mixed lines: 9.6%
   ‚Üí Optimizing for bilingual tokenization...



Training 8.2M corpus:   0%|          | 0/100 [00:00<?, ]

In [None]:
8 2 2 2 5 1 8

2 0 0 0 0 0 0

6 0 0 0 0 0 0