In [1]:
import pandas as pd
from collections import Counter, defaultdict

In [2]:
df = pd.read_parquet("../Assignment_1/tokenized_hi.parquet")

In [3]:
all_tokens = df['sentences'].explode().str['tokens'].explode().tolist()
print("Total individual tokens:", len(all_tokens))
print("First 30 individual tokens:", all_tokens[:30])

Total individual tokens: 61035115
First 30 individual tokens: ['‡§≤‡•ã‡§ó‡•ã‡§Ç', '‡§ï‡•ã', '‡§¨‡§ø‡§≤‡•ã‡§Ç', '‡§∏‡§Ç‡§¨‡§Ç‡§ß‡•Ä', '‡§∏‡•Å‡§µ‡§ø‡§ß‡§æ', '‡§¶‡•á‡§®‡§æ', '‡§π‡•Ä', '‡§â‡§®‡§ï‡§æ', '‡§ï‡§æ‡§Æ', '‡§á‡§®‡•á‡§≤‡•ã', '1987', '‡§Æ‡•á‡§Ç', '‡§â‡§∏', '‡§µ‡§ï‡•ç‡§§', '‡§ê‡§∏‡•á', '‡§π‡•Ä', '‡§¶‡•ã‡§∞‡§æ‡§π‡•á', '‡§™‡§∞', '‡§ñ‡§°‡§º‡•Ä', '‡§•‡•Ä', ',', '‡§ú‡§¨', '‡§™‡•Ç‡§∞‡•ç‡§µ', '‡§â‡§™‡§™‡•ç‡§∞‡§ß‡§æ‡§®‡§Æ‡§Ç‡§§‡•ç‡§∞‡•Ä', '‡§¶‡•á‡§µ‡•Ä‡§≤‡§æ‡§≤', '‡§®‡•á', '‡§Ö‡§™‡§®‡•á', '‡§™‡•Å‡§§‡•ç‡§∞', '‡§ì‡§Æ‡§™‡•ç‡§∞‡§ï‡§æ‡§∂', '‡§ö‡•å‡§ü‡§æ‡§≤‡§æ']


In [4]:
def build_ngram_model(tokens, n=1):
    """Build n-gram language model for Hindi tokens"""
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        ngrams.append(ngram)

    counts = Counter(ngrams)
    model = defaultdict(dict)

    for ngram, freq in counts.items():
        prefix = ngram[:-1] if n > 1 else ()
        model[prefix][ngram[-1]] = freq

    # Normalize to probabilities
    for prefix in model:
        total_prefix = sum(model[prefix].values())
        for word in model[prefix]:
            model[prefix][word] /= total_prefix

    return model

In [7]:
unigram_model = build_ngram_model(all_tokens, 1)
print("\nüîπ Unigram probabilities (top 10):")
print(dict(list(unigram_model[()].items())[:10]))


üîπ Unigram probabilities (top 10):
{'‡§≤‡•ã‡§ó‡•ã‡§Ç': 0.0019588559798732253, '‡§ï‡•ã': 0.018302906449836294, '‡§¨‡§ø‡§≤‡•ã‡§Ç': 8.388613669360662e-06, '‡§∏‡§Ç‡§¨‡§Ç‡§ß‡•Ä': 8.626181829918727e-05, '‡§∏‡•Å‡§µ‡§ø‡§ß‡§æ': 0.00015009392543947856, '‡§¶‡•á‡§®‡§æ': 0.0001784546486067897, '‡§π‡•Ä': 0.00447696379371121, '‡§â‡§®‡§ï‡§æ': 0.00045069137659525994, '‡§ï‡§æ‡§Æ': 0.0010307672886337644, '‡§á‡§®‡•á‡§≤‡•ã': 5.455875687299025e-06}


In [8]:
bigram_model = build_ngram_model(all_tokens, 2)
print("\nüîπ Bigram probabilities after ('‡§≤‡•ã‡§ó‡•ã‡§Ç',):")
print(bigram_model[("‡§≤‡•ã‡§ó‡•ã‡§Ç",)])


üîπ Bigram probabilities after ('‡§≤‡•ã‡§ó‡•ã‡§Ç',):
{'‡§ï‡•ã': 0.31210532038575095, '‡§ï‡•á': 0.15123913716240517, '‡§∏‡•á': 0.05351332814760913, '‡§®‡•á': 0.15027726896344065, '‡§î‡§∞': 0.00460860328373439, '‡§ï‡•Ä': 0.18022064420077116, '‡§Æ‡•á‡§Ç': 0.038583460885420584, '‡§ï‡§æ': 0.0650641106064788, '‡§§‡§ï': 0.00595521876228473, '‡§™‡§∞': 0.01769001078965197, '‡§¶‡•ç‡§µ‡§æ‡§∞‡§æ': 0.007703309663011568, '‡§á‡§®‡•ç‡§π‡•á‡§Ç': 2.5092213886031163e-05, '‡§Ü‡§è': 8.364071295343722e-06, '(': 0.0003763832082904675, '‡§™‡•Å‡§≤‡§ø‡§∏': 2.5092213886031163e-05, '‡§ú‡•Ä': 8.364071295343722e-06, ',': 0.0014469843340944638, '‡§Ø‡§æ': 0.0002760143527463428, '‡§ï‡•á‡§Ö‡§µ‡•à‡§ß': 8.364071295343722e-06, '‡§∏‡§Æ‡•á‡§§': 0.00024255806756496792, '‡§ë‡§®‡§≤‡§æ‡§á‡§®': 1.6728142590687443e-05, "'": 0.00010873292683946838, '‡§°‡§ø‡§™‡•ç‡§ü‡•Ä': 8.364071295343722e-06, '‡§ï‡•ã‡•§': 8.364071295343722e-05, '‡§∏‡§π‡§ø‡§§': 0.00046002392124390467, '‡§ú‡§æ‡§ó‡§∞‡•Ç‡§ï': 3.3456285181374886e-05, '-': 0.000133825

In [9]:
trigram_model = build_ngram_model(all_tokens, 3)
print("\nüîπ Trigram probabilities after ('‡§≠‡§æ‡§∞‡§§','‡§ï‡•Ä'):")
print(trigram_model[("‡§≠‡§æ‡§∞‡§§","‡§ï‡•Ä")])


üîπ Trigram probabilities after ('‡§≠‡§æ‡§∞‡§§','‡§ï‡•Ä'):
{'‡§ó‡§π‡§∞‡•Ä': 0.0005184033177812338, '‡§â‡§™‡§≤‡§¨‡•ç‡§ß‡§ø‡§Ø‡•ã‡§Ç': 0.0005184033177812338, '‡§µ‡§ø‡§∂‡§æ‡§≤': 0.0002592016588906169, '‡§ó‡§∞‡•ç‡§Æ': 0.00038880248833592535, '‡§Æ‡§π‡§ø‡§≤‡§æ‡§ì‡§Ç': 0.0007776049766718507, '‡§Æ‡§π‡§ø‡§≤‡§æ‡§è‡§Ç': 0.0005184033177812338, '‡§™‡§π‡§≤‡•Ä': 0.024624157594608606, '‡§è‡§ï‡§§‡§æ': 0.0032400207361327114, '‡§™‡•ç‡§∞‡§•‡§Æ': 0.0029808190772420942, '‡§§‡§∞‡§´': 0.041990668740279936, '‡§µ‡§ø‡§¶‡•á‡§∂': 0.005313634007257646, '‡§ú‡§®‡§§‡§æ': 0.00997926386728875, '‡§π‡§æ‡§∞': 0.006480041472265423, '‡§§‡§æ‡§ú‡§æ': 0.00012960082944530845, '‡§ò‡•ã‡§∑‡§£‡§æ': 0.00038880248833592535, '‡§∏‡§Ç‡§ö‡§ø‡§§': 0.0005184033177812338, '‡§µ‡§∞‡•ç‡§≤‡•ç‡§°': 0.0002592016588906169, '‡§ú‡§º‡§ø‡§Æ‡•ç‡§Æ‡•á‡§¶‡§æ‡§∞': 0.00012960082944530845, '‡§Ö‡§∞‡•ç‡§•‡§µ‡•ç‡§Ø‡§µ‡§∏‡•ç‡§•‡§æ': 0.012052877138413685, '‡§ö‡§ø‡§Ç‡§§‡§æ': 0.002851218247796786, '‡§∏‡§∞‡§ï‡§æ‡§∞‡•ã‡§Ç': 0.00038880248833592535, '‡§Ü‡§ú‡§æ‡§¶‡•Ä'

In [12]:
print("\nüîπ Trigram probabilities after ('‡§ú‡§π‡§æ‡§Ç','‡§Ü‡§à'):")
print(trigram_model[("‡§ú‡§π‡§æ‡§Ç","‡§Ü‡§à")])


üîπ Trigram probabilities after ('‡§ú‡§π‡§æ‡§Ç','‡§Ü‡§à'):
{'‡§•‡•Ä': 1.0}


In [5]:
quadgram_model = build_ngram_model(all_tokens, 4)
print("\nüîπ Quadrigram probabilities after ('‡§≠‡§æ‡§∞‡§§','‡§ï‡•Ä','‡§∞‡§æ‡§ú‡§ß‡§æ‡§®‡•Ä'):")
print(quadgram_model[("‡§≠‡§æ‡§∞‡§§","‡§ï‡•Ä","‡§∞‡§æ‡§ú‡§ß‡§æ‡§®‡•Ä")])


üîπ Quadrigram probabilities after ('‡§≠‡§æ‡§∞‡§§','‡§ï‡•Ä','‡§∞‡§æ‡§ú‡§ß‡§æ‡§®‡•Ä'):
{'‡§¶‡§ø‡§≤‡•ç‡§≤‡•Ä': 0.47058823529411764, '‡§π‡•Ä': 0.029411764705882353, '‡§•‡•Ä‡•§': 0.029411764705882353, '‡§®‡§π‡•Ä‡§Ç': 0.029411764705882353, '‡§ï‡§π‡•â': 0.029411764705882353, '‡§®‡§à': 0.08823529411764706, '‡§π‡•à‡•§': 0.029411764705882353, '‡§™‡§∂‡•ç‡§ö‡§ø‡§Æ‡•Ä': 0.029411764705882353, '‡§Æ‡•á‡§Ç': 0.08823529411764706, '‡§ï‡•ã': 0.029411764705882353, '‡§≠‡•Ä': 0.029411764705882353, '‡§∏‡•á': 0.029411764705882353, '‡§π‡•ã‡§®‡•á': 0.029411764705882353, '‡§ú‡§∞‡•Ç‡§∞': 0.029411764705882353, '‡§Æ‡§π‡§Ç‡§ó‡§æ‡§à': 0.029411764705882353}
