Purpose: Take scrapped TXT file and extract BPE tokens from it 

In [21]:
#extract the single tokens from the corpus 
import re
from collections import Counter

with open(r"D:\NLP\Urdu-Children-s-Story-Generation-System\urdu_tokenizer_training.txt", 'r', encoding="utf-8") as f:
    text = f.read()

def get_word_frequencies(text):
    # 1. Standardize Urdu spaces and remove punctuation
    # We keep letters, numbers, and the brackets for tags
    text = re.sub(r'[^\w\s<>]', ' ', text)
    
    # 2. Split by whitespace
    tokens = text.split()
    
    # 3. Count frequencies
    frequencies = Counter(tokens)
    
    return frequencies

frequencies = get_word_frequencies(text)

print(frequencies)

Counter({'<EOS>': 2641, 'کے': 1279, 'میں': 1166, 'سے': 981, 'کی': 958, 'اور': 831, 'ایک': 746, 'نے': 744, 'اس': 733, '<EOP>': 706, 'تھا': 696, 'کر': 602, 'وہ': 557, 'کا': 551, 'کو': 503, 'ہے': 466, 'تو': 405, 'کہ': 397, 'تھے': 395, 'پر': 374, 'تھی': 344, 'ہو': 313, '<EOT>': 287, 'بھی': 287, 'ان': 268, 'بہت': 264, 'اپنے': 259, 'نہیں': 241, 'یہ': 210, 'ہی': 207, 'ہیں': 203, 'گیا': 203, 'اسے': 200, 'ا': 198, 'لئے': 197, 'کیا': 190, 'گھر': 182, 'کہا': 161, 'رہا': 156, 'دن': 150, 'ہوا': 145, 'ہوئے': 140, 'نہ': 138, 'کچھ': 136, 'ساتھ': 135, 'سب': 132, 'کسی': 126, 'آ': 125, 'رہی': 125, 'رہے': 121, 'اپنی': 119, 'کوئی': 113, 'گئی': 111, 'لیکن': 110, 'جنگل': 108, 'دیا': 107, 'گئے': 107, 'جب': 107, 'پاس': 100, 'جو': 100, 'تھیں': 100, 'گا': 100, 'کام': 98, 'مجھے': 96, 'ہوں': 95, 'کرنے': 94, 'بات': 90, 'امی': 89, 'لے': 88, 'تم': 85, 'درخت': 85, 'طرف': 84, 'وقت': 84, 'دو': 84, 'کرتے': 82, 'جا': 80, 'آپ': 78, 'ہر': 78, 'آیا': 77, 'اب': 77, 'بعد': 77, 'دونوں': 75, 'آج': 72, 'جان': 72, 'مگر': 71, 'گاؤں

In [23]:
#break each word into letters 
def prepare_for_bpe(data):
    bpe_data = {}
    for word, freq in data.items():
        # Split word into characters and add end-of-word token
        chars = tuple(word) + ('</w>',)
        bpe_data[chars] = freq
    return bpe_data

bpe_counts = prepare_for_bpe(frequencies)

# Show example of the new format
for word_tuple, freq in list(bpe_counts.items()):
    print(f"{word_tuple} : {freq}")

('ب', 'س', '</w>') : 27
('م', 'ج', 'ھ', 'ے', '</w>') : 96
('ف', 'ت', 'ح', '</w>') : 3
('گ', 'ڑ', 'ھ', '</w>') : 3
('ک', 'ے', '</w>') : 1279
('ا', 'س', 'ٹ', 'ا', 'پ', '</w>') : 1
('پ', 'ر', '</w>') : 374
('ا', '</w>') : 198
('ت', 'ا', 'ر', '</w>') : 3
('ک', 'ر', '</w>') : 602
('آ', 'گ', 'ے', '</w>') : 16
('ب', 'ڑ', 'ھ', '</w>') : 19
('گ', 'ئ', 'ی', '</w>') : 111
('<', 'E', 'O', 'S', '>', '</w>') : 2641
('م', 'ی', 'ں', '</w>') : 1166
('ن', 'ے', '</w>') : 744
('ک', 'س', 'ی', '</w>') : 126
('س', 'و', 'ا', 'ر', 'ی', '</w>') : 2
('ک', 'ی', '</w>') : 958
('ت', 'ل', 'ا', 'ش', '</w>') : 18
('د', 'ھ', 'ر', '</w>') : 26
('د', 'ی', 'ک', 'ھ', 'ا', '</w>') : 62
('ت', 'و', '</w>') : 405
('د', 'و', 'ر', '</w>') : 41
('ت', 'ک', '</w>') : 49
('ک', 'و', 'ئ', 'ی', '</w>') : 113
('ت', 'ا', 'ن', 'گ', 'ا', '</w>') : 2
('و', 'غ', 'ی', 'ر', 'ہ', '</w>') : 7
('ن', 'ظ', 'ر', '</w>') : 55
('ن', 'ہ', '</w>') : 138
('آ', 'ی', 'ا', '</w>') : 77
('ا', 'ن', 'د', 'ھ', 'ی', 'ر', 'ا', '</w>') : 2
('پ', 'ھ', 'ی', 'ل', '</

In [28]:
# this will do the popularity contest of each word

from collections import defaultdict

def get_stats(bpe_counts):
    pairs = defaultdict(int)
    for word_tuple, freq in bpe_counts.items():
        # Ensure the word has at least 2 tokens to form a pair
        if len(word_tuple) < 2:
            continue 
            
        for i in range(len(word_tuple) - 1):
            # Explicitly create a tuple of exactly 2 elements
            pair = (word_tuple[i], word_tuple[i+1])
            pairs[pair] += freq
    return pairs

def merge_vocab(pair, bpe_counts):
    new_counts = {}
    # We want to replace (char1, char2) with 'char1char2'
    bigram = pair
    replacement = "".join(pair)
    
    for word_tuple, freq in bpe_counts.items():
        new_word = []
        i = 0
        while i < len(word_tuple):
            # If we find the winning pair, merge them
            if i < len(word_tuple) - 1 and word_tuple[i:i+2] == bigram:
                new_word.append(replacement)
                i += 2
            else:
                new_word.append(word_tuple[i])
                i += 1
        new_counts[tuple(new_word)] = freq
    return new_counts

In [35]:
# Initialize this BEFORE your learning loop
ordered_merges = [] 

# Your learning loop
for i in range(400): # or however many merges you want
    pairs = get_stats(current_data)
    if not pairs:
        break
    
    best_pair = max(pairs, key=pairs.get)
    
    # SAVE THIS WINNER!
    ordered_merges.append(best_pair) 
    
    current_data = merge_vocab(best_pair, current_data)

In [36]:
ordered_merges

[('کر', 'وں</w>'),
 ('اپ', '</w>'),
 ('غ', 'ی'),
 ('ل', 'کھ'),
 ('مع', 'لو'),
 ('معلو', 'م</w>'),
 ('عل', 'م</w>'),
 ('بچ', 'ہ</w>'),
 ('ڈ', 'ی</w>'),
 ('چی', 'ز</w>'),
 ('مل', 'ک</w>'),
 ('شک', 'ار</w>'),
 ('نک', 'ل</w>'),
 ('دف', 'عہ</w>'),
 ('انت', 'ظ'),
 ('گز', 'ر</w>'),
 ('پہن', 'چ</w>'),
 ('سو', 'چ'),
 ('طو', 'ط'),
 ('بیٹھ', 'ا</w>'),
 ('ک', 'ڑ'),
 ('ذ', 'را</w>'),
 ('چ', 'لے</w>'),
 ('ہ', 'ن'),
 ('لو', 'گوں</w>'),
 ('جان', 'وروں</w>'),
 ('دی', 'تا</w>'),
 ('ٹھ', '</w>'),
 ('چاہ', 'یے</w>'),
 ('چ', 'کی</w>'),
 ('م', 'س'),
 ('بار', 'ش</w>'),
 ('بڑھ', '</w>'),
 ('می', 'اں</w>'),
 ('جان', 'ا</w>'),
 ('م', 'ہ</w>'),
 ('پ', 'تے</w>'),
 ('ک', 'ل'),
 ('اچھ', 'ی</w>'),
 ('ش', 'ام</w>'),
 ('س', 'ارے</w>'),
 ('ب', 'ت</w>'),
 ('ہ', 'ار</w>'),
 ('پی', 'ار</w>'),
 ('م', 'ا</w>'),
 ('ہ', 'می'),
 ('ت', 'لا'),
 ('تلا', 'ش</w>'),
 ('پر', 'انے</w>'),
 ('ال', 'ا</w>'),
 ('ب', 'س'),
 ('آخ', 'ر</w>'),
 ('ص', 'ہ</w>'),
 ('س', 'پ'),
 ('آ', 'نے</w>'),
 ('غ', 'ریب</w>'),
 ('سو', 'چا</w>'),
 ('ت', 'ب</w>'

In [37]:

# we will transform our whole urdu corpous into learned bpe tokens format 
def tokenize_word(word, ordered_merges):
    """
    word: A string like 'بڑھ'
    ordered_merges: A list of tuples of the pairs you merged 
                    e.g., [('ا', 'ں'), ('ک', 'ے'), ('ب', 'س')]
    """
    # 1. Start by splitting the word into characters + end-of-word symbol
    tokens = list(word) + ['</w>']
    
    # 2. Iteratively apply each merge rule in order
    for pair in ordered_merges:
        char1, char2 = pair
        new_tokens = []
        i = 0
        while i < len(tokens):
            # Check if current pair matches the merge rule
            if i < len(tokens) - 1 and tokens[i] == char1 and tokens[i+1] == char2:
                new_tokens.append(char1 + char2) # Merge them
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        tokens = new_tokens
        
    return tokens

# Example Usage:
# my_merges = [('ب', 'س'), ('ا', 'ں')]
# print(tokenize_word("بس", my_merges)) 
# Output: ['بس', '</w>']

def translate_corpus_to_bpe(raw_text, ordered_merges):
    tokenized_corpus = []
    
    # Split by whitespace to get words
    words = raw_text.split()
    
    for word in words:
        # Special case: Don't tokenize your tags like <EOS> or <EOP>
        if word.startswith('<') and word.endswith('>'):
            tokenized_corpus.append(word + '</w>')
            continue
            
        # Tokenize the word and add to our big list
        word_tokens = tokenize_word(word, ordered_merges)
        tokenized_corpus.extend(word_tokens)
        
    return tokenized_corpus

# Execution
final_token_list = translate_corpus_to_bpe(text, ordered_merges)

In [55]:
ordered_merges

[('کر', 'وں</w>'),
 ('اپ', '</w>'),
 ('غ', 'ی'),
 ('ل', 'کھ'),
 ('مع', 'لو'),
 ('معلو', 'م</w>'),
 ('عل', 'م</w>'),
 ('بچ', 'ہ</w>'),
 ('ڈ', 'ی</w>'),
 ('چی', 'ز</w>'),
 ('مل', 'ک</w>'),
 ('شک', 'ار</w>'),
 ('نک', 'ل</w>'),
 ('دف', 'عہ</w>'),
 ('انت', 'ظ'),
 ('گز', 'ر</w>'),
 ('پہن', 'چ</w>'),
 ('سو', 'چ'),
 ('طو', 'ط'),
 ('بیٹھ', 'ا</w>'),
 ('ک', 'ڑ'),
 ('ذ', 'را</w>'),
 ('چ', 'لے</w>'),
 ('ہ', 'ن'),
 ('لو', 'گوں</w>'),
 ('جان', 'وروں</w>'),
 ('دی', 'تا</w>'),
 ('ٹھ', '</w>'),
 ('چاہ', 'یے</w>'),
 ('چ', 'کی</w>'),
 ('م', 'س'),
 ('بار', 'ش</w>'),
 ('بڑھ', '</w>'),
 ('می', 'اں</w>'),
 ('جان', 'ا</w>'),
 ('م', 'ہ</w>'),
 ('پ', 'تے</w>'),
 ('ک', 'ل'),
 ('اچھ', 'ی</w>'),
 ('ش', 'ام</w>'),
 ('س', 'ارے</w>'),
 ('ب', 'ت</w>'),
 ('ہ', 'ار</w>'),
 ('پی', 'ار</w>'),
 ('م', 'ا</w>'),
 ('ہ', 'می'),
 ('ت', 'لا'),
 ('تلا', 'ش</w>'),
 ('پر', 'انے</w>'),
 ('ال', 'ا</w>'),
 ('ب', 'س'),
 ('آخ', 'ر</w>'),
 ('ص', 'ہ</w>'),
 ('س', 'پ'),
 ('آ', 'نے</w>'),
 ('غ', 'ریب</w>'),
 ('سو', 'چا</w>'),
 ('ت', 'ب</w>'

In [38]:
final_token_list

['بس',
 '</w>',
 'م',
 'ج',
 'ھ',
 'ے',
 '</w>',
 'فت',
 'ح',
 '</w>',
 'گ',
 'ڑ',
 'ھ',
 '</w>',
 'ک',
 'ے',
 '</w>',
 'ا',
 'س',
 'ٹ',
 'ا',
 'پ',
 '</w>',
 'پ',
 'ر',
 '</w>',
 'ا',
 'ُ',
 'ت',
 'ا',
 'ر',
 '</w>',
 'ک',
 'ر',
 '</w>',
 'آ',
 'گ',
 'ے',
 '</w>',
 'ب',
 'ڑ',
 'ھ',
 '</w>',
 'گ',
 'ئی',
 '۔',
 '</w>',
 '<EOS></w>',
 'م',
 'ی',
 'ں',
 '</w>',
 'ن',
 'ے',
 '</w>',
 'ک',
 'س',
 'ی',
 '</w>',
 'س',
 'و',
 'ا',
 'ر',
 'ی',
 '</w>',
 'ک',
 'ی',
 '</w>',
 'ت',
 'ل',
 'ا',
 'ش',
 '</w>',
 'م',
 'ی',
 'ں',
 '</w>',
 'ا',
 'ِ',
 'د',
 'ھ',
 'ر',
 '</w>',
 'ا',
 'ُ',
 'د',
 'ھ',
 'ر',
 '</w>',
 'د',
 'ی',
 'ک',
 'ھ',
 'ا',
 '</w>',
 'ت',
 'و',
 '</w>',
 'د',
 'و',
 'ر',
 '</w>',
 'د',
 'و',
 'ر',
 '</w>',
 'ت',
 'ک',
 '</w>',
 'ک',
 'و',
 'ئی',
 '</w>',
 'ت',
 'ا',
 'ن',
 'گ',
 'ا',
 '</w>',
 'و',
 'غی',
 'ر',
 'ہ',
 '</w>',
 'ن',
 'ظ',
 'ر',
 '</w>',
 'ن',
 'ہ',
 '</w>',
 'آ',
 'ی',
 'ا',
 '۔',
 '</w>',
 '<EOS></w>',
 'ا',
 'ن',
 'د',
 'ھ',
 'ی',
 'ر',
 'ا',
 '</w>',
 'پ',
 'ھ'

In [None]:
from collections import Counter

def get_unigram_counts(tokens):
    return Counter(tokens)

def get_bigram_counts(tokens):
    # Creates pairs: (w1, w2)
    bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]
    return Counter(bigrams)

def get_trigram_counts(tokens):
    # Creates triples: (w1, w2, w3)
    trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens)-2)]
    return Counter(trigrams)

# Execute counting
uni_counts = get_unigram_counts(final_token_list)
bi_counts = get_bigram_counts(final_token_list)
tri_counts = get_trigram_counts(final_token_list)
total_token_count = len(final_token_list)



In [72]:
def get_interpolated_prob(w1, w2, w3, lambdas=(0.8, 0.15, 0.05)):
    l1, l2, l3 = lambdas
    
    # 1. Trigram Probability: P(w3 | w1, w2)
    # Count(w1, w2, w3) / Count(w1, w2)
    c3 = tri_counts.get((w1, w2, w3), 0)
    c2_context = bi_counts.get((w1, w2), 0)
    p_tri = c3 / c2_context if c2_context > 0 else 0
    
    # 2. Bigram Probability: P(w3 | w2)
    # Count(w2, w3) / Count(w2)
    c2 = bi_counts.get((w2, w3), 0)
    c1_context = uni_counts.get(w2, 0)
    p_bi = c2 / c1_context if c1_context > 0 else 0
    
    # 3. Unigram Probability: P(w3)
    # Count(w3) / Total Tokens
    c1 = uni_counts.get(w3, 0)
    p_uni = c1 / total_token_count if total_token_count > 0 else 0
    
    # Final Weighted Sum
    return (l1 * p_tri) + (l2 * p_bi) + (l3 * p_uni)

import random


def generate_story(seed_w1, seed_w2, max_length=500):
    story = [seed_w1, seed_w2]
    vocab = list(uni_counts.keys())
    
    for _ in range(max_length):
        w1, w2 = story[-2], story[-1]
        
        # Calculate probabilities for all tokens in your BPE vocabulary
        probs = [get_interpolated_prob(w1, w2, t) for t in vocab]
        
        # Safety check: if the model is totally lost, stop
        if sum(probs) == 0:
            break
            
        # Weighted choice based on your Interpolation math
        next_token = random.choices(vocab, weights=probs, k=1)[0]
        
        # Add the token to our story
        story.append(next_token)
        
        # STOP CONDITION: Only stop if we hit the End of Text marker
        # Note: Check if your token is exactly '<EOT>' or '<EOT></w>'
        if next_token == '<EOT>' or next_token == '<EOT></w>':
            break
            
    # Post-processing: 
    # 1. Join everything
    # 2. Replace the BPE word-end marker with a space
    # 3. Remove the EOT marker from the final printed text
    full_text = "".join(story).replace('</w>', ' ').replace('<EOT>', '')
    
    return full_text.strip()

import random
import numpy as np
def generate_story_pro(seed_w1, seed_w2, max_length=150, temperature=0.7, top_p=0.9, space_boost=1.2):
    story = [seed_w1 + "</w>", seed_w2 + "</w>"]
    vocab = list(uni_counts.keys())
    
    # --- ADD THIS: Keep track of used tokens ---
    used_tokens = set(story) 
    
    for _ in range(max_length):
        w1, w2 = story[-2], story[-1]
        
        # 1. Get raw interpolated probabilities
        probs = []
        for t in vocab:
            p = get_interpolated_prob(w1, w2, t)
            
            # --- ADD THIS: Frequency Penalty ---
            if t in used_tokens:
                p *= 0.5  # Reduce likelihood of repetition
            
            if "</w>" in t:
                p *= space_boost
            probs.append(p)
            
        # 2. TEMPERATURE (Sharpening)
        probs = np.array(probs)
        probs = np.power(probs, (1 / temperature))
        probs /= probs.sum()
        
        # 3. NUCLEUS SAMPLING (Top-P)
        combined = sorted(zip(vocab, probs), key=lambda x: x[1], reverse=True)
        sorted_vocab, sorted_probs = zip(*combined)
        cumulative_probs = np.cumsum(sorted_probs)
        cutoff = np.where(cumulative_probs >= top_p)[0][0]
        
        final_vocab = sorted_vocab[:cutoff + 1]
        final_probs = sorted_probs[:cutoff + 1]
        
        # 4. Pick next token
        next_token = random.choices(final_vocab, weights=final_probs, k=1)[0]
        
        # --- ADD THIS: Update used tokens ---
        story.append(next_token)
        used_tokens.add(next_token) 
        
        if next_token == '<EOT>' or next_token == '<EOT></w>':
            break
            
    raw_output = "".join(story).replace('</w>', ' ').replace('<EOT>', '')
    return " ".join(raw_output.split())

<function __main__.merge_vocab(pair, bpe_counts)>

In [73]:
# Starting your Urdu story
# Note: Ensure these tokens match exactly how they look in your BPE list
start_w1 = "وہ"
start_w2 = "ایک"

urdu_story = generate_story_pro(start_w1, start_w2, max_length=1000)
print("Generated Urdu Story:")
print(urdu_story)

Generated Urdu Story:
وہ ایک یں اب کے پور اس کو گھر میں کی جانے تو کرت نے کہ کر چھے۔ <EOS> <EOP> ”داز نے کان نے ب کی کانار دیک شکام ہے۔ <EOS> بچوں اور اپنی وہ بیٹا ہے۔ <EOS> وہ بڑھا۔ <EOS> <EOP>
