In [1]:
import pandas as pd
import re
from collections import defaultdict, Counter
import pickle

# Read the story corpus and flatten every row of the 'content' column into a
# single space-separated training string.
df = pd.read_csv('../PreProcessing/all_urdu_moral_stories_with_tokens.csv')
story_texts = df['content'].astype(str)  # coerce non-string cells so join() cannot fail
training_text = ' '.join(story_texts)

# Quick sanity check of what was loaded.
print(f"Training data loaded: {len(training_text)} characters")
print(f"First 200 chars: {training_text[:200]}")

class BPETokenizer:
    """Character-level byte-pair-encoding (BPE) tokenizer.

    Training starts from the set of characters found in the text and
    repeatedly merges the most frequent adjacent token pair (counted inside
    whitespace-separated words) into a new token.

    Attributes:
        vocab_size: Target number of tokens after training.
        vocab: Mapping ``token string -> integer id``.
        merges: Mapping ``(left, right) -> merged token``; insertion order is
            the order in which the merges were learned and must be replayed
            in that order when encoding.
        token_id: Next free integer id (equals ``len(vocab)``).
    """

    def __init__(self, vocab_size=250):
        """Create an untrained tokenizer targeting ``vocab_size`` tokens."""
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = {}
        self.token_id = 0

    @staticmethod
    def _merge_pair(tokens, pair, new_token):
        """Return ``tokens`` as a list with each occurrence of ``pair`` merged.

        The scan is left-to-right and non-overlapping: once two tokens are
        merged the scan resumes after them, so ``a a a`` with pair
        ``(a, a)`` becomes ``aa a``.
        """
        first, second = pair
        last_index = len(tokens) - 1
        merged = []
        i = 0
        while i < len(tokens):
            if i < last_index and tokens[i] == first and tokens[i + 1] == second:
                merged.append(new_token)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        return merged

    def build_vocab(self, text):
        """Reset the vocabulary to the sorted unique characters of ``text``.

        Every character (including whitespace) gets an id in sorted order,
        and ``token_id`` is set to the next free id.

        Returns:
            The freshly built ``vocab`` dict.
        """
        unique_chars = set(text)
        self.vocab = {char: idx for idx, char in enumerate(sorted(unique_chars))}
        self.token_id = len(self.vocab)
        print(f"Initial vocab size: {len(self.vocab)}")
        return self.vocab

    def get_stats(self, split_text):
        """Count how often each adjacent token pair occurs.

        Args:
            split_text: Mapping ``tuple_of_tokens -> word frequency``.

        Returns:
            A ``defaultdict(int)`` mapping ``(left, right)`` to the pair's
            frequency-weighted count. Keys appear in first-seen order, which
            is what breaks ties when the caller picks the maximum.
        """
        pairs = defaultdict(int)
        for word_tokens, freq in split_text.items():
            for left, right in zip(word_tokens, word_tokens[1:]):
                pairs[(left, right)] += freq
        return pairs

    def merge_vocab(self, pair, split_text):
        """Apply one merge to every word of ``split_text``.

        Args:
            pair: The ``(left, right)`` token pair to merge.
            split_text: Mapping ``tuple_of_tokens -> word frequency``.

        Returns:
            A new mapping of the same shape with ``pair`` replaced by the
            concatenated token wherever it occurs.
        """
        merged_token = pair[0] + pair[1]
        return {
            tuple(self._merge_pair(word_tokens, pair, merged_token)): freq
            for word_tokens, freq in split_text.items()
        }

    def train(self, text):
        """Learn merges from ``text`` until the vocab reaches ``vocab_size``.

        Any previously learned vocabulary and merges are discarded first.
        Training stops early when no adjacent pair is left to merge.

        Returns:
            ``self``, to allow chaining.
        """
        self.build_vocab(text)
        # build_vocab() starts a fresh vocabulary, so merges learned by an
        # earlier train()/load() must go too; otherwise encode() would replay
        # them and produce tokens that the new vocab does not contain.
        self.merges = {}

        # Initialize split text: dict of {tuple_of_tokens: frequency}
        word_counts = Counter(text.split())
        split_text = {tuple(word): count for word, count in word_counts.items()}

        # Clamp at zero: the initial alphabet may already exceed vocab_size.
        num_merges = max(0, self.vocab_size - len(self.vocab))
        print(f"Goal: {num_merges} merges to reach vocab size {self.vocab_size}")

        for i in range(num_merges):
            pairs = self.get_stats(split_text)
            if not pairs:
                break

            # Ties resolve to the pair that was seen first.
            best_pair = max(pairs, key=pairs.get)
            new_token = "".join(best_pair)

            self.vocab[new_token] = self.token_id
            self.token_id += 1
            self.merges[best_pair] = new_token
            split_text = self.merge_vocab(best_pair, split_text)

            if (i + 1) % 50 == 0:
                print(f"Iteration {i + 1}: Vocab size = {len(self.vocab)}, Merged {best_pair} -> {new_token}")

        print("\nBPE Training Complete!")
        print(f"Final vocab size: {len(self.vocab)}")
        return self

    def encode(self, text):
        """Encode ``text`` into a flat list of token ids.

        The text is split on whitespace and the learned merges are replayed
        on each word in the order they were learned. Whitespace itself is not
        encoded, and tokens absent from the vocabulary are skipped, so
        ``decode(encode(text))`` does not restore spaces or unknown
        characters.
        """
        encoded = []
        word_ids = {}  # per-call cache: repeated words are encoded only once
        for word in text.split():
            ids = word_ids.get(word)
            if ids is None:
                tokens = list(word)
                for pair, new_token in self.merges.items():
                    if len(tokens) < 2:
                        break  # a single token cannot be merged any further
                    tokens = self._merge_pair(tokens, pair, new_token)
                ids = [self.vocab[token] for token in tokens if token in self.vocab]
                word_ids[word] = ids
            encoded.extend(ids)
        return encoded

    def decode(self, token_ids):
        """Concatenate the tokens for ``token_ids`` into a string.

        Unknown ids contribute an empty string. No separator is inserted
        between tokens.
        """
        id_to_token = {v: k for k, v in self.vocab.items()}
        tokens = [id_to_token.get(idx, "") for idx in token_ids]
        return "".join(tokens)

    def save(self, filepath):
        """Pickle the vocabulary and merges to ``filepath``."""
        with open(filepath, 'wb') as f:
            pickle.dump({'vocab': self.vocab, 'merges': self.merges}, f)
        print(f"Tokenizer saved to {filepath}")

    def load(self, filepath):
        """Restore vocabulary and merges written by ``save``.

        Returns:
            ``self``, to allow chaining.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load
        # tokenizer files that come from a trusted source.
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        self.vocab = data['vocab']
        self.merges = data['merges']
        # Keep the id counter consistent with the restored vocabulary.
        self.token_id = len(self.vocab)
        print(f"Tokenizer loaded from {filepath}")
        return self

# Train a 250-token BPE tokenizer on the loaded corpus and persist it.
tokenizer = BPETokenizer(vocab_size=250)
tokenizer.train(training_text)
tokenizer.save('bpe_tokenizer.pkl')

# Preview both ends of the vocabulary: the first entries are the initial
# characters, the last entries are the most recently learned merges.
print("\nSample vocabulary (first 20 and last 10 tokens):")
vocab_items = list(tokenizer.vocab.items())
head_lines = [f"  {idx}: '{token}'" for token, idx in vocab_items[:20]]
tail_lines = [f"  {idx}: '{token}'" for token, idx in vocab_items[-10:]]
print("\n".join(head_lines + ["  ..."] + tail_lines))


Training data loaded: 394211 characters
First 200 chars: ارسلان خان شیر جنگل میں آرام کر رہا تھا کہ اس نے بلی کو دیکھا تو اسے آواز دے کر بلایا۔￰شیر نے بلی سے پوچھا ”تم دیکھنے میں تو مجھ جیسی ہو لیکن تمہارا قد اتنا چھوٹا کیوں ہے؟￰“ بلی نے اداسی سے کہا، ”ظالم
Initial vocab size: 84
Goal: 166 merges to reach vocab size 250


Iteration 50: Vocab size = 134, Merged ('د', 'ی') -> دی


Iteration 100: Vocab size = 184, Merged ('ٹ', 'ھ') -> ٹھ


Iteration 150: Vocab size = 234, Merged ('پ', 'و') -> پو



BPE Training Complete!
Final vocab size: 250
Tokenizer saved to bpe_tokenizer.pkl

Sample vocabulary (first 20 and last 10 tokens):
  0: ''
  1: ' '
  2: '!'
  3: '"'
  4: '#'
  5: '''
  6: '('
  7: ')'
  8: '-'
  9: '.'
  10: '0'
  11: '1'
  12: '2'
  13: '5'
  14: '8'
  15: ':'
  16: '¿'
  17: '،'
  18: '؛'
  19: '؟'
  ...
  240: 'گھر'
  241: 'صاح'
  242: 'ست'
  243: 'ہاں'
  244: 'قی'
  245: 'ٹی'
  246: 'ڑھ'
  247: 'دے'
  248: 'ساتھ'
  249: 'رہے'
