----
Exercises: Language Modeling
====

Let's practice bigrams, MLE, and Laplace smoothing....

In [11]:
corpus = "en såg såg en såg en såg såg , en annan sågade en sågen sågen såg . </s>".split()
vocabulary = set(corpus)
len(vocabulary)

8

In [12]:
import nltk

In [13]:
cfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus))
cfd

ConditionalFreqDist(nltk.probability.FreqDist,
                    {',': FreqDist({'en': 1}),
                     '.': FreqDist({'</s>': 1}),
                     'annan': FreqDist({'sågade': 1}),
                     'en': FreqDist({'annan': 1, 'såg': 3, 'sågen': 1}),
                     'såg': FreqDist({',': 1, '.': 1, 'en': 2, 'såg': 2}),
                     'sågade': FreqDist({'en': 1}),
                     'sågen': FreqDist({'såg': 1, 'sågen': 1})})

In [14]:
nltk.FreqDist(corpus)

FreqDist({',': 1,
          '.': 1,
          '</s>': 1,
          'annan': 1,
          'en': 5,
          'såg': 6,
          'sågade': 1,
          'sågen': 2})

TODO: Describe cfd in your own words

In [15]:
#Conditional frequency tells you the count of a word given another token(s). 

In [16]:
sentence = "såg såg sågade en sågen ".split()
sentence

['såg', 'såg', 'sågade', 'en', 'sågen']

In [17]:
# The corpus counts of each bigram in the sentence:
print("word 1", "word 2", "bigram count", sep="\t")
[print(a, b, cfd[a][b], sep="\t") for (a,b) in nltk.bigrams(sentence)];

word 1	word 2	bigram count
såg	såg	2
såg	sågade	0
sågade	en	1
en	sågen	1


In [18]:
from string import punctuation

In [19]:

nltk.FreqDist(corpus)

FreqDist({',': 1,
          '.': 1,
          '</s>': 1,
          'annan': 1,
          'en': 5,
          'såg': 6,
          'sågade': 1,
          'sågen': 2})

In [20]:
# TODO: The corpus counts for each word in the sentence:

#print("word 1", "word 2", "bigram count", sep="\t")

def corp(words):
    freq = nltk.FreqDist(corpus)
    return [freq[word] for word in words ]

corpus_count_unigram = corp(sentence)


In [21]:
assert corpus_count_unigram == [6, 6, 1, 5, 2]

In [22]:
# The MLE probability for each bigram:
print("word 1", "word 2", "MLE probability", sep="\t")
[print(a, b, (cfd[a][b]/cfd[a].N()), sep="\t") for (a,b) in nltk.bigrams(sentence)];

word 1	word 2	MLE probability
såg	såg	0.3333333333333333
såg	sågade	0.0
sågade	en	1.0
en	sågen	0.2


In [26]:
list(nltk.bigrams(corpus))

[('en', 'såg'),
 ('såg', 'såg'),
 ('såg', 'en'),
 ('en', 'såg'),
 ('såg', 'en'),
 ('en', 'såg'),
 ('såg', 'såg'),
 ('såg', ','),
 (',', 'en'),
 ('en', 'annan'),
 ('annan', 'sågade'),
 ('sågade', 'en'),
 ('en', 'sågen'),
 ('sågen', 'sågen'),
 ('sågen', 'såg'),
 ('såg', '.'),
 ('.', '</s>')]

In [27]:
nltk.FreqDist(nltk.bigrams(corpus))

FreqDist({(',', 'en'): 1,
          ('.', '</s>'): 1,
          ('annan', 'sågade'): 1,
          ('en', 'annan'): 1,
          ('en', 'såg'): 3,
          ('en', 'sågen'): 1,
          ('såg', ','): 1,
          ('såg', '.'): 1,
          ('såg', 'en'): 2,
          ('såg', 'såg'): 2,
          ('sågade', 'en'): 1,
          ('sågen', 'såg'): 1,
          ('sågen', 'sågen'): 1})

In [28]:
#  TODO: Repeat using in the built-in methods for MLE probability:
print("word 1", "word 2", "MLE probability", sep="\t")
print([cfd[a].freq(b) for (a,b) in nltk.bigrams(sentence)])

word 1	word 2	MLE probability
[0.3333333333333333, 0.0, 1.0, 0.2]


In [29]:
# The probability of the sentence is the product of all bigram probabilities:
from functools import reduce

prob_bigram = [cfd[a][b]/cfd[a].N() for (a,b) in nltk.bigrams(sentence)]
reduce(lambda x,y:x*y, prob_bigram)

0.0

That is not a great model becuase it predicts zero for a sentence exists, even though we haven't seen it yet!

In [30]:
# Laplace smoothing of each bigram count:
lps = [1 + cfd[a][b] for (a,b) in nltk.bigrams(sentence)]

In [31]:
# We need to normalise the counts for each word:
normal = [len(vocabulary) + cfd[a].N() for (a,b) in nltk.bigrams(sentence)]

In [32]:
#TODO: Calculate and print the smoothed Laplace probability for each bigram:
print("word 1", "word 2", "Laplace smoothed probability", sep="\t")
print([a/b for (a,b) in zip(lps,normal)], sep="\t")

word 1	word 2	Laplace smoothed probability
[0.21428571428571427, 0.07142857142857142, 0.2222222222222222, 0.15384615384615385]


In [33]:
smoothed_out = [a/b for (a,b) in zip(lps,normal)]

In [34]:
assert smoothed_out == [0.21428571428571427,
 0.07142857142857142,
 0.2222222222222222,
 0.15384615384615385]

In [35]:
# The smoothed probability of the sentence:
reduce(lambda x,y:x*y, smoothed_out)

0.0005232862375719518

In [36]:
assert round(reduce(lambda x,y:x*y, smoothed_out),6) == 0.000523

TODO: How can we interpret this probability?

In [37]:
#This is the smoothed probability of the sentence given that observed all of the bigrams in the sentence.
#This probability is higher than the MLE probability (MLE assigns zero to bigrams we haven't seen). 
#The smoothed property gives a low, but non-zero, chance of observing bigrams we haven't seen before.

------
Here is how it would look all together in a grown-up codebase.

In [38]:
# MLEProbDist is the unsmoothed probability distribution:
cpd_mle = nltk.ConditionalProbDist(cfd,
                                   nltk.MLEProbDist,
                                   bins=len(vocabulary))

In [39]:
# Now we can get the MLE probabilities by using the .prob method:
print("word 1", "word 2", "MLE probability", sep="\t")
[print(a, b, cpd_mle[a].prob(b), sep="\t") for (a,b) in nltk.bigrams(sentence)];

word 1	word 2	MLE probability
såg	såg	0.3333333333333333
såg	sågade	0.0
sågade	en	1.0
en	sågen	0.2


In [40]:
# LaplaceProbDist is the add-one smoothed ProbDist:
cpd_laplace = nltk.ConditionalProbDist(cfd, 
                                       nltk.LaplaceProbDist, 
                                       bins=len(vocabulary))

In [41]:
# Getting the Laplace probabilities is the same as for MLE:
print("word 1", "word 2", "Laplace smoothed probability", sep="\t")
[print(a, b, cpd_laplace[a].prob(b), sep="\t") for (a,b) in nltk.bigrams(sentence)];

word 1	word 2	Laplace smoothed probability
såg	såg	0.21428571428571427
såg	sågade	0.07142857142857142
sågade	en	0.2222222222222222
en	sågen	0.15384615384615385


![](http://ljdchost.com/AbW1pPX.gif)

<br>
<br> 
<br>

----

<br>
<br>
---