In [1]:
import re
from collections import Counter, defaultdict

def preprocess_text(text):
    """Clean and normalize the text."""
    # Removing punctuation and unnecessary characters
    text = re.sub(r"[^\u0900-\u097F\s]", "", text)  # Keep Hindi characters and spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

def calculate_compression_ratio(corpus, tokenized_corpus):
    """Calculate the compression ratio."""
    original_length = len("".join(corpus))
    tokenized_length = sum(len(token) for token in tokenized_corpus)
    return original_length / tokenized_length

def build_bpe(corpus, vocab_size):
    """Build a BPE vocabulary."""
    # Tokenize corpus into characters
    corpus = [" ".join(word) + " </w>" for word in corpus]
    corpus = " ".join(corpus).split()
    
    # Count initial token frequencies
    token_freqs = Counter(corpus)
    bpe_vocab = set(token_freqs.keys())
    
    while len(bpe_vocab) < vocab_size:
        # Count all pairs of symbols
        pairs = defaultdict(int)
        for word in token_freqs.keys():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += token_freqs[word]
        
        if not pairs:
            break
        
        # Get the most frequent pair
        best_pair = max(pairs, key=pairs.get)
        new_symbol = "".join(best_pair)
        
        # Replace the best pair in the vocabulary
        new_vocab = {}
        for word, freq in token_freqs.items():
            new_word = re.sub(r"\b" + re.escape(" ".join(best_pair)) + r"\b", new_symbol, word)
            new_vocab[new_word] = freq
        
        token_freqs = new_vocab
        bpe_vocab.add(new_symbol)
    
    return bpe_vocab

# Example usage
if __name__ == "__main__":
    # Example Hindi text corpus
    hindi_corpus = [
        "नमस्ते दुनिया", 
        "भारत एक सुंदर देश है", 
        "हिंदी एक सुंदर भाषा है"
    ]
    hindi_corpus = [preprocess_text(sentence) for sentence in hindi_corpus]
    
    # Building BPE vocabulary
    vocab_size = 5000
    bpe_vocab = build_bpe(hindi_corpus, vocab_size)
    
    # Calculate compression ratio
    tokenized_corpus = [" ".join(word.split()) for word in hindi_corpus]
    compression_ratio = calculate_compression_ratio(hindi_corpus, tokenized_corpus)
    
    print(f"Vocabulary size: {len(bpe_vocab)}")
    print(f"Compression ratio: {compression_ratio:.2f}")


Vocabulary size: 22
Compression ratio: 1.00
