In [1]:
from collections import Counter
m = 3
word_counts = {'happy': 5, 'because': 3, 'i': 2, 'am': 2, 'learning': 3, '.': 1}
vocab = [word[0] for word in Counter(word_counts).most_common(m)]

In [2]:
vocab

['happy', 'because', 'learning']

In [3]:
# Input sentence
sentence = ['am', 'i', 'learning']
output_sentence = []

# Output sentence
for w in sentence:
    if w not in vocab:
        output_sentence.append("<UNK>")
    else:
        output_sentence.append(w)

In [4]:
output_sentence

['<UNK>', '<UNK>', 'learning']

In [5]:
# Large # of <UNK>s can cause bad performance
# Following is an example of this
training_set = ['i', 'am', 'happy', 'because','i', 'am', 'learning', '.']
training_set_unk = ['i', 'am', '<UNK>', '<UNK>','i', 'am', '<UNK>', '<UNK>']

test_set = ['i', 'am', 'learning']
test_set_unk = ['i', 'am', '<UNK>']

M = len(test_set)
probability = 1
probability_unk = 1

bigram_probabilities = {('i', 'am'): 1.0, ('am', 'happy'): 0.5, ('happy', 'because'): 1.0, ('because', 'i'): 1.0, ('am', 'learning'): 0.5, ('learning', '.'): 1.0}
bigram_probabilities_unk = {('i', 'am'): 1.0, ('am', '<UNK>'): 1.0, ('<UNK>', '<UNK>'): 0.5, ('<UNK>', 'i'): 0.25}

for i in range(len(test_set)-1):
    bigram = tuple(test_set[i:i+2])
    probability *= bigram_probabilities[bigram]

    bigram_unk = tuple(test_set_unk[i:i+2])
    probability_unk *= bigram_probabilities_unk[bigram_unk]

perplexity = probability**(-1/M)
perplexity_unk = probability_unk**(-1/M)

print(f"Perplexity without <UNK>:\t{perplexity}")
print(f"Perplexity with <UNK>:\t\t{perplexity_unk}")

Perplexity without <UNK>:	1.2599210498948732
Perplexity with <UNK>:		1.0


In [6]:
# Smoothing
def add_k_smoothing(k,vocab_size,n_gram_count,n_gram_pref_count):
    num = n_gram_count + k
    den = n_gram_pref_count + k*vocab_size
    return num/den

In [7]:
# N-grams not seen in the corpus get veru high probabilities
trigram_probabilities = {('i', 'am', 'happy') : 2}
bigram_probabilities = {( 'i', 'am') : 10}
vocabulary_size = 5
k = 1

probability_known_trigram = add_k_smoothing(k,vocabulary_size,trigram_probabilities[('i', 'am', 'happy')],bigram_probabilities[( 'i', 'am')])
probability_unknown_trigram = add_k_smoothing(k,vocabulary_size,0,0)

print(f"probability_known_trigram:\t{probability_known_trigram}")
print(f"probability_unknown_trigram:\t{probability_unknown_trigram}")

probability_known_trigram:	0.2
probability_unknown_trigram:	0.2


In [8]:
# Backoff
trigram_probabilities = {('i', 'am', 'happy'): 0}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

# Trigram to estimate
trigram = ('are', 'you', 'happy')

# All preceeding lower order N-grams
bigram = trigram[1:3]
unigram = trigram[2]
print(f"Trigram: {trigram}\nBigram: {bigram}\nUnigram: {unigram}\n")

# Stupid Backoff Constant
lambda_factor = 0.4
probability_hat_trigram = 0

# Search for first non-zero probability starting with N-gram
if trigram not in trigram_probabilities or trigram_probabilities[trigram]==0:
    print(f"Trigram {trigram} not found")
    if bigram not in bigram_probabilities or bigram_probabilities[bigram]==0:
        print(f"Bigram {bigram} not found")
        if unigram in unigram_probabilities:
            print(f"Unigram {unigram} found\n")
            probability_hat_trigram = lambda_factor * lambda_factor * unigram_probabilities[unigram]
        else:
            probability_hat_trigram = 0
    else:
        probability_hat_trigram = lambda_factor * bigram_probabilities[bigram]
else:
    probability_hat_trigram = trigram_probabilities[trigram]

print(f"P({trigram}) estimated as {probability_hat_trigram}")

Trigram: ('are', 'you', 'happy')
Bigram: ('you', 'happy')
Unigram: happy

Trigram ('are', 'you', 'happy') not found
Bigram ('you', 'happy') not found
Unigram happy found

P(('are', 'you', 'happy')) estimated as 0.06400000000000002


In [9]:
# Interpolation --> Linear Weighted Sum of all lower order N-grams
trigram_probabilities = {('i', 'am', 'happy'): 0.15}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

# Weights learnt from val set
lambda_1 = 0.8
lambda_2 = 0.15
lambda_3 = 0.05

# Input Trigram to estimate
trigram = ('i', 'am', 'happy')

# Lower order N-grams of the input
bigram = trigram[1:3]
unigram = trigram[2]

# Assuming all are present, else 0
probability_hat_trigram = lambda_1 * trigram_probabilities.get(trigram,0)
+ lambda_2 * bigram_probabilities.get(bigram,0)
+ lambda_3 * unigram_probabilities.get(unigram,0)

print(f"P({trigram}) estimated as {probability_hat_trigram}")

P(('i', 'am', 'happy')) estimated as 0.12
