In [1]:
import numpy as np
from collections import defaultdict
import pandas as pd

In [5]:
# Define the bigram count matrix
bigram_counts = {
'eos': {'eos': 0, 'I': 300, 'booked': 0, 'a': 0, 'flight': 0, 'took': 300},
'I': {'eos': 0, 'I': 0, 'booked': 300, 'a': 0, 'flight': 0, 'took': 0},
'booked': {'eos': 0, 'I': 0, 'booked': 0, 'a': 300, 'flight': 0, 'took': 0},
'a': {'eos': 0, 'I': 0, 'booked': 0, 'a': 0, 'flight': 600, 'took': 0},
'flight': {'eos': 600, 'I': 0, 'booked': 0, 'a': 0, 'flight': 0, 'took': 0},
'took': {'eos': 0, 'I': 0, 'booked': 0, 'a': 300, 'flight': 0, 'took': 0}
}
# Vocabulary
vocab = list(bigram_counts.keys())
vocab_size = len(vocab)
# Additive smoothing parameter
alpha = 1
# Initialize smoothed bigram probabilities
bigram_probabilities = defaultdict(lambda: defaultdict(float))
# Apply additive smoothing
for word1 in vocab:
    total_count = sum(bigram_counts[word1].values())
    for word2 in vocab:
        count = bigram_counts[word1][word2]
        bigram_probabilities[word1][word2] = (count + alpha) / (total_count + alpha)
# Convert to a more readable format
bigram_prob_matrix = np.zeros((vocab_size, vocab_size))
word_to_index = {word: i for i, word in enumerate(vocab)}
for word1 in vocab:
    for word2 in vocab:
        bigram_prob_matrix[word_to_index[word1], word_to_index[word2]] = bigram_probabilities[word1][word2]
print("Bi-gram probability before smoothing")
df1=pd.DataFrame(bigram_counts, index=vocab, columns=vocab)
print(df1)
print("/n")
# Print the smoothed bigram probabilities matrix
print("Bi-gram probability after smoothing")
df = pd.DataFrame(bigram_prob_matrix, index=vocab, columns=vocab)
print(df)

Bi-gram probability before smoothing
        eos    I  booked    a  flight  took
eos       0    0       0    0     600     0
I       300    0       0    0       0     0
booked    0  300       0    0       0     0
a         0    0     300    0       0   300
flight    0    0       0  600       0     0
took    300    0       0    0       0     0
/n
Bi-gram probability after smoothing
             eos         I    booked         a    flight      took
eos     0.001664  0.500832  0.001664  0.001664  0.001664  0.500832
I       0.003322  0.003322  1.000000  0.003322  0.003322  0.003322
booked  0.003322  0.003322  0.003322  1.000000  0.003322  0.003322
a       0.001664  0.001664  0.001664  0.001664  1.000000  0.001664
flight  1.000000  0.001664  0.001664  0.001664  0.001664  0.001664
took    0.003322  0.003322  0.003322  1.000000  0.003322  0.003322
