In [1]:
import pandas as pd
from collections import Counter, defaultdict

In [2]:
# Load the pre-tokenized corpus from the parquet file at this relative path.
# NOTE(review): the path is relative to the notebook's working directory, so the
# kernel must be started from this notebook's folder for the load to succeed.
df = pd.read_parquet("../Assignment_1/tokenized_hi.parquet")

In [3]:
# Flatten the corpus into one flat Python list of tokens.
# Assumes each row of df['sentences'] is a list of records that each carry a
# 'tokens' list -- TODO confirm against the parquet schema.
all_tokens = (
    df['sentences']
    .explode()        # one row per element of each 'sentences' list
    .str['tokens']    # pull the 'tokens' entry out of each element
    .explode()        # one row per individual token
    .tolist()
)
print("Total individual tokens:", len(all_tokens))
print("First 30 individual tokens:", all_tokens[:30])

Total individual tokens: 61035115
First 30 individual tokens: ['लोगों', 'को', 'बिलों', 'संबंधी', 'सुविधा', 'देना', 'ही', 'उनका', 'काम', 'इनेलो', '1987', 'में', 'उस', 'वक्त', 'ऐसे', 'ही', 'दोराहे', 'पर', 'खड़ी', 'थी', ',', 'जब', 'पूर्व', 'उपप्रधानमंत्री', 'देवीलाल', 'ने', 'अपने', 'पुत्र', 'ओमप्रकाश', 'चौटाला']


In [4]:
def build_ngram_model(tokens, n=1):
    """Build a maximum-likelihood n-gram language model from a token sequence.

    ``tokens`` is treated as one contiguous sequence: every window of ``n``
    consecutive tokens is counted, and no boundary markers are added.

    Args:
        tokens: Sliceable sequence (e.g. a list) of hashable tokens.
        n: Order of the model; must be >= 1 (1 = unigram, 2 = bigram, ...).

    Returns:
        A ``defaultdict(dict)`` mapping each prefix (a tuple of the first
        ``n - 1`` tokens of an n-gram; the empty tuple ``()`` for unigrams) to
        a dict ``{next_token: probability}``. Probabilities for one prefix sum
        to 1. Because the result is a ``defaultdict``, looking up an unseen
        prefix returns (and inserts) an empty dict instead of raising.
        If ``tokens`` has fewer than ``n`` items the model is empty.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # n == 0 used to fail with an opaque IndexError and negative n silently
        # produced meaningless slices; reject both up front.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Count straight from a generator so the full list of n-gram tuples is
    # never materialized (it would be a second corpus-sized structure).
    counts = Counter(
        tuple(tokens[start:start + n])
        for start in range(len(tokens) - n + 1)
    )

    # Group raw counts by prefix. For n == 1, ngram[:-1] is already ().
    model = defaultdict(dict)
    for ngram, freq in counts.items():
        model[ngram[:-1]][ngram[-1]] = freq

    # Normalize each prefix's counts into conditional probabilities.
    for continuations in model.values():
        prefix_total = sum(continuations.values())
        for word in continuations:
            continuations[word] /= prefix_total

    return model

In [7]:
# Unigram model: every token shares the single empty prefix ().
unigram_model = build_ngram_model(all_tokens, n=1)
print("\n🔹 Unigram probabilities (top 10):")
# NOTE(review): despite the "top 10" label, this shows the first ten entries in
# dict insertion order; nothing here sorts by probability.
print({token: prob for token, prob in list(unigram_model[()].items())[:10]})


🔹 Unigram probabilities (top 10):
{'लोगों': 0.0019588559798732253, 'को': 0.018302906449836294, 'बिलों': 8.388613669360662e-06, 'संबंधी': 8.626181829918727e-05, 'सुविधा': 0.00015009392543947856, 'देना': 0.0001784546486067897, 'ही': 0.00447696379371121, 'उनका': 0.00045069137659525994, 'काम': 0.0010307672886337644, 'इनेलो': 5.455875687299025e-06}


In [8]:
# Bigram model: prefixes are 1-tuples holding the previous token.
bigram_model = build_ngram_model(all_tokens, n=2)
print("\n🔹 Bigram probabilities after ('लोगों',):")
# Prints the whole next-token distribution for this prefix (can be very long).
print(bigram_model[("लोगों",)])


🔹 Bigram probabilities after ('लोगों',):
{'को': 0.31210532038575095, 'के': 0.15123913716240517, 'से': 0.05351332814760913, 'ने': 0.15027726896344065, 'और': 0.00460860328373439, 'की': 0.18022064420077116, 'में': 0.038583460885420584, 'का': 0.0650641106064788, 'तक': 0.00595521876228473, 'पर': 0.01769001078965197, 'द्वारा': 0.007703309663011568, 'इन्हें': 2.5092213886031163e-05, 'आए': 8.364071295343722e-06, '(': 0.0003763832082904675, 'पुलिस': 2.5092213886031163e-05, 'जी': 8.364071295343722e-06, ',': 0.0014469843340944638, 'या': 0.0002760143527463428, 'केअवैध': 8.364071295343722e-06, 'समेत': 0.00024255806756496792, 'ऑनलाइन': 1.6728142590687443e-05, "'": 0.00010873292683946838, 'डिप्टी': 8.364071295343722e-06, 'को।': 8.364071295343722e-05, 'सहित': 0.00046002392124390467, 'जागरूक': 3.3456285181374886e-05, '-': 0.00013382514072549955, 'व': 0.001196062195234152, '[': 1.6728142590687443e-05, '.': 0.0002341939962696242, 'मौजूद': 2.5092213886031163e-05, 'भी': 0.0001170969981348121, 'लिए': 3.345

In [9]:
# Trigram model: prefixes are 2-tuples holding the two previous tokens.
trigram_model = build_ngram_model(all_tokens, n=3)
print("\n🔹 Trigram probabilities after ('भारत','की'):")
# Prints the whole next-token distribution for this prefix (can be very long).
print(trigram_model[("भारत", "की")])


🔹 Trigram probabilities after ('भारत','की'):
{'गहरी': 0.0005184033177812338, 'उपलब्धियों': 0.0005184033177812338, 'विशाल': 0.0002592016588906169, 'गर्म': 0.00038880248833592535, 'महिलाओं': 0.0007776049766718507, 'महिलाएं': 0.0005184033177812338, 'पहली': 0.024624157594608606, 'एकता': 0.0032400207361327114, 'प्रथम': 0.0029808190772420942, 'तरफ': 0.041990668740279936, 'विदेश': 0.005313634007257646, 'जनता': 0.00997926386728875, 'हार': 0.006480041472265423, 'ताजा': 0.00012960082944530845, 'घोषणा': 0.00038880248833592535, 'संचित': 0.0005184033177812338, 'वर्ल्ड': 0.0002592016588906169, 'ज़िम्मेदार': 0.00012960082944530845, 'अर्थव्यवस्था': 0.012052877138413685, 'चिंता': 0.002851218247796786, 'सरकारों': 0.00038880248833592535, 'आजादी': 0.010238465526179368, 'चुनौती': 0.0011664074650077762, 'मांग': 0.0012960082944530845, 'सिंधुध्वज': 0.00012960082944530845, 'जानी': 0.0006480041472265422, 'आर्थिक': 0.008553654743390357, 'सीधी': 0.0002592016588906169, 'सराहना': 0.0012960082944530845, 'स्थिति': 0

In [12]:
# Inspect another trigram prefix; relies on trigram_model built in an earlier cell.
print("\n🔹 Trigram probabilities after ('जहां','आई'):")
print(trigram_model[("जहां","आई")])


🔹 Trigram probabilities after ('जहां','आई'):
{'थी': 1.0}


In [5]:
# 4-gram model: prefixes are 3-tuples holding the three previous tokens.
quadgram_model = build_ngram_model(all_tokens, n=4)
print("\n🔹 Quadrigram probabilities after ('भारत','की','राजधानी'):")
print(quadgram_model[("भारत", "की", "राजधानी")])


🔹 Quadrigram probabilities after ('भारत','की','राजधानी'):
{'दिल्ली': 0.47058823529411764, 'ही': 0.029411764705882353, 'थी।': 0.029411764705882353, 'नहीं': 0.029411764705882353, 'कहॉ': 0.029411764705882353, 'नई': 0.08823529411764706, 'है।': 0.029411764705882353, 'पश्चिमी': 0.029411764705882353, 'में': 0.08823529411764706, 'को': 0.029411764705882353, 'भी': 0.029411764705882353, 'से': 0.029411764705882353, 'होने': 0.029411764705882353, 'जरूर': 0.029411764705882353, 'महंगाई': 0.029411764705882353}
