In [22]:
import nltk
import re
from collections import Counter
from nltk.tokenize import word_tokenize

In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\y22cm84\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [16]:
corpus = [
    'the', 'cat', 'sat', 'on', 'the', 'mat',
    'the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'
]

v = Counter(corpus)
n = sum(v.values())

print(f"Word count: \n{v}")
print(f"Total count: {n}")

Word count: 
Counter({'the': 4, 'cat': 1, 'sat': 1, 'on': 1, 'mat': 1, 'quick': 1, 'brown': 1, 'fox': 1, 'jumps': 1, 'over': 1, 'lazy': 1, 'dog': 1})
Total count: 15


In [17]:
vocab = len(v)
total_vocab = vocab+n

In [18]:
# laplace smoothing

probs = {}
for word, count in v.items():
    probs[word] = (count + 1) / n

print("\nProbabilities:")
for key, value in probs.items():
    print(f"{key}, {round(value, 4)}")  


Probabilities:
the, 0.3333
cat, 0.1333
sat, 0.1333
on, 0.1333
mat, 0.1333
quick, 0.1333
brown, 0.1333
fox, 0.1333
jumps, 0.1333
over, 0.1333
lazy, 0.1333
dog, 0.1333


In [20]:
# Add-k Smoothing

k=0.5
additive_probs = {}
for word, count in v.items():
    var = (count + k) / (n + k * vocab)
    additive_probs[word] = var

print("\nProbabilities: ")
for key, value in additive_probs.items():
    print(f"{key}, {round(value, 4)}")
    


Probabilities: 
the, 0.2143
cat, 0.0714
sat, 0.0714
on, 0.0714
mat, 0.0714
quick, 0.0714
brown, 0.0714
fox, 0.0714
jumps, 0.0714
over, 0.0714
lazy, 0.0714
dog, 0.0714


In [28]:
# preparing the data
tokenized_corpus = []
for sentence in corpus:
    var = sentence.lower()
    var = word_tokenize(sentence)
    tokenized_corpus.append(var)

corpus_words = []
for i in tokenized_corpus:
    for word in i:
        corpus_words.append(word)
    
vocab = set(corpus_words)
def grams(n):
    result = []
    for i in range(len(corpus_words)-n+1):
        result.append(tuple(corpus_words[i:i+n]))
    return result

In [33]:
unigram = grams(1)
unigram_counter = Counter(unigram)
bigrams = grams(2)
bigrams_counter = Counter(bigrams)

In [35]:
unigram_prob = {unigram: count / n for unigram, count in unigram_counter.items()}
bigram_prob = {bigram: count / n for bigram, count in bigrams_counter.items()}

In [37]:
# Interpolated smoothing

l1 = 0.7
l2 = 0.3

interpolated_prob = {}
for (w1, w2), count in bigrams_counter.items():
    bigram_p = bigram_prob.get((w1, w2), 0)
    unigram_p = unigram_prob.get(w2, 0)
    
    interpolated_prob[(w1, w2)] = l1 * bigram_p + l2 * unigram_p

for key, value in interpolated_prob.items():
    print(f"{key}, {round(value, 4)}")

('the', 'cat'), 0.0467
('cat', 'sat'), 0.0467
('sat', 'on'), 0.0467
('on', 'the'), 0.0467
('the', 'mat'), 0.0467
('mat', 'the'), 0.0467
('the', 'quick'), 0.0467
('quick', 'brown'), 0.0467
('brown', 'fox'), 0.0467
('fox', 'jumps'), 0.0467
('jumps', 'over'), 0.0467
('over', 'the'), 0.0467
('the', 'lazy'), 0.0467
('lazy', 'dog'), 0.0467
