# Examples with probabilistic language models (n-grams)

In [1]:
# Start with some basics:
# Given a sentence, let's see how to perform some counts
# with the help of Counters and python dicts

from nltk import trigrams
from collections import Counter, defaultdict

In [2]:
# Example sentence
sentence = 'this is a sentence that we want to parse and this is done with nltk and python collections'

# Simple tokenization (separation of tokens via space character)
sentence = sentence.split()
print(sentence)

['this', 'is', 'a', 'sentence', 'that', 'we', 'want', 'to', 'parse', 'and', 'this', 'is', 'done', 'with', 'nltk', 'and', 'python', 'collections']


In [3]:
# Produce trigrams from that sentence
# (imagine that we are sliding a window of size 3 across the sentence)

for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
    print(w1, w2, w3)

None None this
None this is
this is a
is a sentence
a sentence that
sentence that we
that we want
we want to
want to parse
to parse and
parse and this
and this is
this is done
is done with
done with nltk
with nltk and
nltk and python
and python collections
python collections None
collections None None


In [4]:
# Keep counts

model = defaultdict(lambda: defaultdict(lambda: 0))

# Generate the trigrams and increase the counter for each occurrence 
# of a specific trigram
for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
    model[(w1, w2)][w3] += 1

In [5]:
# Let's see the words that follow "this is"
print('Words that appear after "this is":')
for w in model[('this', 'is')]:
    print(f'word: {w}')


Words that appear after "this is":
word: a
word: done


In [6]:
# Let's see how many times "this is" occurs
print('\nHow many times does "this is" occur in the sample sentence?')
print(sum(model[('this', 'is')].values()))


How many times does "this is" occur in the sample sentence?
2


In [7]:
# Let's see how many times word "a" occurs after "this is"
print('\nHow many times does "a" occur after "this is"?')
print(model[('this', 'is')]['a'])

# What are the words that follow "this is" and what are
# the corresponding frequencies?
print('\nWords that follow the bigram "this is" and the corresponding counts:')
dict(model[('this', 'is')])



How many times does "a" occur after "this is"?
1

Words that follow the bigram "this is" and the corresponding counts:


{'a': 1, 'done': 1}

# Create a simple language model using a text collection

In [8]:
# Let's put everything together and use a corpus from project Gutenberg
# which is provided directly by NLTK

from nltk.corpus import gutenberg
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for sentence in gutenberg.sents('shakespeare-macbeth.txt'):
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
        
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count



In [9]:
# Print a sample of the training sentences
for i in range(0, 50):
    print(gutenberg.sents('shakespeare-macbeth.txt')[i])

['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']']
['Actus', 'Primus', '.']
['Scoena', 'Prima', '.']
['Thunder', 'and', 'Lightning', '.']
['Enter', 'three', 'Witches', '.']
['1', '.']
['When', 'shall', 'we', 'three', 'meet', 'againe', '?']
['In', 'Thunder', ',', 'Lightning', ',', 'or', 'in', 'Raine', '?']
['2', '.']
['When', 'the', 'Hurley', '-', 'burley', "'", 's', 'done', ',', 'When', 'the', 'Battaile', "'", 's', 'lost', ',', 'and', 'wonne']
['3', '.']
['That', 'will', 'be', 'ere', 'the', 'set', 'of', 'Sunne']
['1', '.']
['Where', 'the', 'place', '?']
['2', '.']
['Vpon', 'the', 'Heath']
['3', '.']
['There', 'to', 'meet', 'with', 'Macbeth']
['1', '.']
['I', 'come', ',', 'Gray', '-', 'Malkin']
['All', '.']
['Padock', 'calls', 'anon', ':', 'faire', 'is', 'foule', ',', 'and', 'foule', 'is', 'faire', ',', 'Houer', 'through', 'the', 'fogge', 'and', 'filthie', 'ayre', '.']
['Exeunt', '.']
['Scena', 'Secunda', '.']
['Alarum', 'within', '.']
['Enter', 'King

In [10]:
# So we have trained the model.
# Let's see the probabilities of words that may follow the sequence "I am"

dict(model['I', 'am'])

{'faint': 0.034482758620689655,
 'Thane': 0.06896551724137931,
 'fed': 0.034482758620689655,
 'his': 0.034482758620689655,
 'settled': 0.034482758620689655,
 'afraid': 0.06896551724137931,
 'one': 0.034482758620689655,
 'recklesse': 0.034482758620689655,
 'cabin': 0.034482758620689655,
 'a': 0.034482758620689655,
 'bent': 0.034482758620689655,
 'in': 0.06896551724137931,
 'for': 0.034482758620689655,
 'call': 0.034482758620689655,
 'so': 0.034482758620689655,
 'not': 0.06896551724137931,
 'perfect': 0.034482758620689655,
 'too': 0.034482758620689655,
 'yong': 0.034482758620689655,
 'as': 0.034482758620689655,
 'yet': 0.034482758620689655,
 'truly': 0.034482758620689655,
 ',': 0.034482758620689655,
 'sure': 0.034482758620689655,
 'sick': 0.034482758620689655}

In [61]:
# Let's use the model to generate some text!

import random

# We will tell the model to start with the words "I am"
# So, it will produce the next word.
# Then, the process is repeated using the generated word and the previous one.

text = ["I", "am"]
sentence_finished = False
 
while not sentence_finished:
    probs = []
    population = []
    for word in model[tuple(text[-2:])].keys():
        probs.append(model[tuple(text[-2:])][word])
        population.append(word)
    token = random.choices(population, probs, k=1)
    text.append(token[0])    

    if text[-2:] == [None, None]:
        sentence_finished = True

print(' '.join([t for t in text if t]))


I am as I am yet Vnknowne to Woman , neuer was forsworne , Scarsely haue coueted what was mine owne .


# Use directly the MLE from nltk python package

In [28]:
# In this simple example we will calculate perplexities of test sentences
# for a simple model that is trained in a tiny training set

import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary

train_sentences = ['Thunder and Lightning',
                   'Enter three Witches',
                   'I am faint',
                   'God saue the King',
                   'Looke what I haue here',
                   'Here the lies haue the eyes'
                  ]

tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) for sent in train_sentences]

# Print the tokenized training set
tokenized_text

[['thunder', 'and', 'lightning'],
 ['enter', 'three', 'witches'],
 ['i', 'am', 'faint'],
 ['god', 'saue', 'the', 'king'],
 ['looke', 'what', 'i', 'haue', 'here'],
 ['here', 'the', 'lies', 'haue', 'the', 'eyes']]

In [39]:
n = 2 # Highest n-gram order for the Maximul Likelihood Estimator

# Prepare training data:
# Use bigrams, and mark the start and end of the sentence
train_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
words = [word for sent in tokenized_text for word in sent]
words.extend(["<s>", "</s>"])
padded_vocab = Vocabulary(words)

# Fit model
model = MLE(n)
model.fit(train_data, padded_vocab)

In [40]:
# We have trained the bigram model.
# Let's test it in some test sentences

# We will on purpose include a sentence that appears 'as is' in the tranining set.
# That sentence should have the highest perplexity
# The other two sentences do not appear in the training set:
# The first one contains bigrams that the model has never seen before,
# but the last sentence contains bigrams that the model has learned
test_sentences = [
    'Thunder and lightning',   # So, this should have the lowest perplexity (the model explains well the sentence)
    'through his eyes',        # This one should have PP that equals infinity (due to zero probabilities)
    'I haue the king']         # This sentence can be explained but it will surprise the model more than the 1st


# Tokenize the test sentences
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) for sent in test_sentences]

# For each test sentence print the MLE estimates for the bigrams that need to be calculated
print('MLE estimates for test data:')
test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
for i,test in enumerate(test_data):
    print (f'\nMLE Estimates for sentence {i}:', [((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test])

# For each test sentence print the perplexities of the model
print('\nPerplexities:')
# Reset the test_data, since the generator has been exhausted
test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
for i, test in enumerate(test_data):
    print("PP({0}):{1}".format(test_sentences[i], model.perplexity(test)))


MLE estimates for test data:

MLE Estimates for sentence 0: [(('thunder', ('<s>',)), 0.16666666666666666), (('and', ('thunder',)), 1.0), (('lightning', ('and',)), 1.0), (('</s>', ('lightning',)), 1.0)]

MLE Estimates for sentence 1: [(('through', ('<s>',)), 0.0), (('his', ('through',)), 0), (('eyes', ('his',)), 0), (('</s>', ('eyes',)), 1.0)]

MLE Estimates for sentence 2: [(('i', ('<s>',)), 0.16666666666666666), (('haue', ('i',)), 0.5), (('the', ('haue',)), 0.5), (('king', ('the',)), 0.3333333333333333), (('</s>', ('king',)), 1.0)]

Perplexities:
PP(Thunder and lightning):1.5650845800732873
PP(through his eyes):inf
PP(I haue the king):2.352158045049347


In [46]:
# Same example with Laplace smoothing

# This is the exact same code as above.
# The only difference is that we use Laplace() instead of MLE() to define the model

from nltk.lm import Laplace

train_sentences = ['Thunder and Lightning',
                   'Enter three Witches',
                   'I am faint',
                   'God saue the King',
                   'Looke what I haue here',
                   'Here the lies haue the eyes'
                  ]

tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) for sent in train_sentences]

n = 2 

train_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
words = [word for sent in tokenized_text for word in sent]
words.extend(["<s>", "</s>"])
padded_vocab = Vocabulary(words)

# Fit model
model = Laplace(n)
model.fit(train_data, padded_vocab)

test_sentences = [
    'Thunder and lightning',   
    'through his eyes',        
    'I haue the king']         

tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) for sent in test_sentences]

print('MLE estimates for test data:')
test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
for i,test in enumerate(test_data):
    print (f'\nMLE Estimates for sentence {i}:', [((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test])

print('\nPerplexities:')
# Reset the test_data, since the generator has been exhausted
test_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
for i, test in enumerate(test_data):
    print("PP({0}):{1}".format(test_sentences[i], model.perplexity(test)))


MLE estimates for test data:

MLE Estimates for sentence 0: [(('thunder', ('<s>',)), 0.06896551724137931), (('and', ('thunder',)), 0.08333333333333333), (('lightning', ('and',)), 0.08333333333333333), (('</s>', ('lightning',)), 0.08333333333333333)]

MLE Estimates for sentence 1: [(('through', ('<s>',)), 0.034482758620689655), (('his', ('through',)), 0.043478260869565216), (('eyes', ('his',)), 0.043478260869565216), (('</s>', ('eyes',)), 0.08333333333333333)]

MLE Estimates for sentence 2: [(('i', ('<s>',)), 0.06896551724137931), (('haue', ('i',)), 0.08), (('the', ('haue',)), 0.08), (('king', ('the',)), 0.07692307692307693), (('</s>', ('king',)), 0.08333333333333333)]

Perplexities:
PP(Thunder and lightning):12.581370016785733
PP(through his eyes):20.713749936746982
PP(I haue the king):12.87248887971409


---

### In the last example we do not have zero probabilities or perplexities that go to infinity, because we have performed smoothing. We have stolen some of the probability mass of other n-grams to slightly augment the zero probabilities of unseen n-grams.