In [1]:
import pandas as pd
from collections import Counter, defaultdict
import random
import math
import numpy as np

In [2]:
df = pd.read_parquet("../Assignment_1/tokenized_hi.parquet")

In [3]:
sentences = df['sentences'].explode().str['tokens'].tolist()
all_tokens = [token for sentence in sentences for token in sentence]
print(f"Successfully loaded {len(sentences)} sentences.")
print(f"Total tokens for training: {len(all_tokens)}")

Successfully loaded 3263994 sentences.
Total tokens for training: 61035115


In [4]:
random.seed(42) 
test_sentences = random.sample(sentences, 1000)
print(f"Randomly selected {len(test_sentences)} sentences for testing.\n")

Randomly selected 1000 sentences for testing.



In [None]:
unigram_counts = Counter(all_tokens)
bigram_counts = Counter(zip(all_tokens, all_tokens[1:]))

V = len(unigram_counts)
print(f"Vocabulary Size (V): {V}")

# Pre-compute T(w)->the number of unique token types that follow each word w
#Token Type Smoothing
following_types = defaultdict(set)
for w1, w2 in bigram_counts:
    following_types[w1].add(w2)
T = {word: len(types) for word, types in following_types.items()}
print("Pre-computation complete.\n")

Vocabulary Size (V): 558957
Pre-computation complete.



In [None]:
def calculate_add_one_prob(sentence, unigram_counts, bigram_counts, V):
    #Add-One (Laplace) smoothing
    log_prob = 0.0
    for i in range(len(sentence) - 1):
        w1, w2 = sentence[i], sentence[i+1]
        bigram = (w1, w2)
        
        # Formula: P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V)
        numerator = bigram_counts.get(bigram, 0) + 1
        denominator = unigram_counts.get(w1, 0) + V
        
        log_prob += np.log(numerator / denominator)
        
    return log_prob

In [None]:
def calculate_add_k_prob(sentence, unigram_counts, bigram_counts, V, k):
    #Add-K smoothing
    log_prob = 0.0
    for i in range(len(sentence) - 1):
        w1, w2 = sentence[i], sentence[i+1]
        bigram = (w1, w2)
        
        # Formula: P(w2 | w1) = (count(w1, w2) + k) / (count(w1) + k*V)
        numerator = bigram_counts.get(bigram, 0) + k
        denominator = unigram_counts.get(w1, 0) + k * V
        
        log_prob += np.log(numerator / denominator)
        
    return log_prob

In [None]:
def calculate_token_type_prob(sentence, unigram_counts, bigram_counts, T):
    #Token Type" smoothing.
    #P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + T(w1))
    #T(w1)->number of unique word types that follow w1.

    log_prob = 0.0
    for i in range(len(sentence) - 1):
        w1, w2 = sentence[i], sentence[i+1]
        bigram = (w1, w2)
        
        # Get count of unique followers for w1, default to V if w1 is unknown
        num_following_types = T.get(w1, V)
        
        numerator = bigram_counts.get(bigram, 0) + 1
        denominator = unigram_counts.get(w1, 0) + num_following_types
        
        # Avoid division by zero if a token somehow has 0 count and 0 followers
        if denominator == 0:
            continue

        log_prob += np.log(numerator / denominator)
        
    return log_prob

In [None]:
k_value = 0.1 

results = []

for sentence in test_sentences:
    #sentence to short for bi-gram
    if len(sentence) < 2:
        continue
    
    prob_add_one = calculate_add_one_prob(sentence, unigram_counts, bigram_counts, V)
    prob_add_k = calculate_add_k_prob(sentence, unigram_counts, bigram_counts, V, k_value)
    prob_token_type = calculate_token_type_prob(sentence, unigram_counts, bigram_counts, T)
    
    results.append({
        "sentence": " ".join(sentence),
        "add_one_log_prob": prob_add_one,
        "add_k_log_prob": prob_add_k,
        "token_type_log_prob": prob_token_type
    })

In [12]:
print(f"(A higher log probability, closer to 0, indicates a better fit by the model)\n")

#for first 5 sentences
for i in range(5):
    res = results[i]
    print(f"Sentence {i+1}: \"{res['sentence']}\"")
    print(f"Add-One Smoothing Log Prob:      {res['add_one_log_prob']:.4f}")
    print(f"Add-K (k={k_value}) Smoothing Log Prob: {res['add_k_log_prob']:.4f}")
    print(f"Token Type Smoothing Log Prob:   {res['token_type_log_prob']:.4f}\n")

(A higher log probability, closer to 0, indicates a better fit by the model)

Sentence 1: "इस वीडियो में तेजस्वी प्रकाश के हाथ में एक सूटकेस है और वो ' बंटी और बबली ' के गाने ' धड़क धड़क ' पर थिरकती नज़र आ रही हैं।"
Add-One Smoothing Log Prob:      -238.3993
Add-K (k=0.1) Smoothing Log Prob: -198.5167
Token Type Smoothing Log Prob:   -145.1653

Sentence 2: "भागलपुर में सोमवार की देर रात अपराधियों ने एक पिकअप वैन चालक को गोली मार दी।"
Add-One Smoothing Log Prob:      -108.9366
Add-K (k=0.1) Smoothing Log Prob: -85.1254
Token Type Smoothing Log Prob:   -60.5898

Sentence 3: "इसके अलावा किसी बड़े रसूखदार व्यक्ति का मोबाइल चोरी होता है तो उसे चोरी की धारा में दर्ज किया जाता है , लेकिन आम आदमी का मोबाइल चोरी भी होता है तो उसे केवल खोया पाया सेल और गुम होने की तहरीर पर ही दर्ज किया जाता है।"
Add-One Smoothing Log Prob:      -336.7135
Add-K (k=0.1) Smoothing Log Prob: -269.9604
Token Type Smoothing Log Prob:   -216.6214

Sentence 4: "उन्हें बताना चाहिए कि वह किस परंपरा और अखाड़े से संबंधित है