In [1]:
# Simple Language models example

In [26]:
# Let's start with some basics first.
# Given a sentence, let's see how to perform some counts
# with the help of Counters and python dicts

from nltk import trigrams
from collections import Counter, defaultdict

# Split the example text on whitespace; `sentence` ends up holding the token list
sentence = 'this is a sentence that we want to parse and this is done with nltk and python collections'.split()
print(sentence)


['this', 'is', 'a', 'sentence', 'that', 'we', 'want', 'to', 'parse', 'and', 'this', 'is', 'done', 'with', 'nltk', 'and', 'python', 'collections']


In [27]:
# Walk over every trigram of the sentence, padded on both ends
# (the padding shows up as None in the printed output)

for trigram in trigrams(sentence, pad_right=True, pad_left=True):
    print(*trigram)


None None this
None this is
this is a
is a sentence
a sentence that
sentence that we
that we want
we want to
want to parse
to parse and
parse and this
and this is
this is done
is done with
done with nltk
with nltk and
nltk and python
and python collections
python collections None
collections None None


In [28]:
# Nested counter: (w1, w2) context -> {w3: number of times w3 followed that context}
model = defaultdict(lambda: defaultdict(int))
for *context, following in trigrams(sentence, pad_right=True, pad_left=True):
    model[tuple(context)][following] += 1

# All the counts recorded for the context "this is"
followers = model[('this', 'is')]

# Which words follow "this is"?
print('Words that appear after "this is":')
for w in followers:
    print(f'word: {w}')

# Summing the follower counts gives the number of occurrences of "this is"
print('\nHow many times does "this is" occur?')
print(sum(followers.values()))

# Count of "a" directly after "this is"
print('\nHow many times does "a" occur after "this is"?')
print(followers['a'])

dict(followers)

Words that appear after "this is":
word: a
word: done

How many times does "this is" occur?
2

How many times does "a" occur after "this is"?
1


{'a': 1, 'done': 1}

In [29]:
# Let's put everything together and use a corpus from project Gutenberg
# which is provided directly by NLTK

from nltk.corpus import gutenberg
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Nested mapping: (w1, w2) context -> {w3: count}; unseen entries start at 0
model = defaultdict(lambda: defaultdict(int))

# Count frequency of co-occurrence for every padded trigram in Macbeth
for sentence in gutenberg.sents('shakespeare-macbeth.txt'):
    for *context, following in trigrams(sentence, pad_right=True, pad_left=True):
        model[tuple(context)][following] += 1

# Normalise in place: divide each count by the total for its context,
# so the values under one context become relative frequencies
for followers in model.values():
    total_count = float(sum(followers.values()))
    for w3 in followers:
        followers[w3] /= total_count

In [30]:
# Peek at the first 50 tokenized sentences of the Macbeth corpus
first_sentences = gutenberg.sents('shakespeare-macbeth.txt')[:50]
print(first_sentences)

[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ['Scoena', 'Prima', '.'], ['Thunder', 'and', 'Lightning', '.'], ['Enter', 'three', 'Witches', '.'], ['1', '.'], ['When', 'shall', 'we', 'three', 'meet', 'againe', '?'], ['In', 'Thunder', ',', 'Lightning', ',', 'or', 'in', 'Raine', '?'], ['2', '.'], ['When', 'the', 'Hurley', '-', 'burley', "'", 's', 'done', ',', 'When', 'the', 'Battaile', "'", 's', 'lost', ',', 'and', 'wonne'], ['3', '.'], ['That', 'will', 'be', 'ere', 'the', 'set', 'of', 'Sunne'], ['1', '.'], ['Where', 'the', 'place', '?'], ['2', '.'], ['Vpon', 'the', 'Heath'], ['3', '.'], ['There', 'to', 'meet', 'with', 'Macbeth'], ['1', '.'], ['I', 'come', ',', 'Gray', '-', 'Malkin'], ['All', '.'], ['Padock', 'calls', 'anon', ':', 'faire', 'is', 'foule', ',', 'and', 'foule', 'is', 'faire', ',', 'Houer', 'through', 'the', 'fogge', 'and', 'filthie', 'ayre', '.'], ['Exeunt', '.'], ['Scena', 'Secunda', '.'], ['Alarum', 'with

In [31]:
# Distribution over the words that follow the context ("I", "am")
dict(model[('I', 'am')])

{'faint': 0.034482758620689655,
 'Thane': 0.06896551724137931,
 'fed': 0.034482758620689655,
 'his': 0.034482758620689655,
 'settled': 0.034482758620689655,
 'afraid': 0.06896551724137931,
 'one': 0.034482758620689655,
 'recklesse': 0.034482758620689655,
 'cabin': 0.034482758620689655,
 'a': 0.034482758620689655,
 'bent': 0.034482758620689655,
 'in': 0.06896551724137931,
 'for': 0.034482758620689655,
 'call': 0.034482758620689655,
 'so': 0.034482758620689655,
 'not': 0.06896551724137931,
 'perfect': 0.034482758620689655,
 'too': 0.034482758620689655,
 'yong': 0.034482758620689655,
 'as': 0.034482758620689655,
 'yet': 0.034482758620689655,
 'truly': 0.034482758620689655,
 ',': 0.034482758620689655,
 'sure': 0.034482758620689655,
 'sick': 0.034482758620689655}

In [37]:
import random

# Generate a sentence by repeatedly sampling the next word from the trigram model.
# Sampling is random; call random.seed(...) beforehand for a reproducible sentence.

# starting words
text = ["I", "am"]
sentence_finished = False

while not sentence_finished:
    context = tuple(text[-2:])
    # .get() does not trigger the defaultdict factory, so an unseen context
    # is not silently inserted into the model as an empty entry
    candidates = model.get(context)
    if not candidates:
        # No recorded successors for this context: there is nothing to sample,
        # so stop here instead of spinning in the loop forever
        break

    # Draw the next word with probability proportional to its stored weight.
    # random.choices always returns a word, so a draw is never lost to
    # floating-point rounding in a hand-rolled cumulative sum.
    next_words = list(candidates)
    weights = [candidates[w] for w in next_words]
    text.append(random.choices(next_words, weights=weights, k=1)[0])

    # Two consecutive padding symbols mark the end of the sentence
    if text[-2:] == [None, None]:
        sentence_finished = True

# Drop the None padding before joining the words
print(' '.join([t for t in text if t]))

I am yong , but can perceiue no truth in your state of Man , That I may tell pale - hearted Feare , and Macbeth , Or weare it on my Sword ; yet all this , and know How tender ' tis one


In [57]:
# Another example:
# Fit a Maximum Likelihood Estimator (MLE) directly with nltk.lm

import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary

def _tokenize_lower(sentences):
    """Tokenize each sentence with nltk's word_tokenize and lowercase every token."""
    return [[token.lower() for token in nltk.tokenize.word_tokenize(sent)] for sent in sentences]


def _padded_ngrams(tokens, order):
    """Return the n-grams of the given order as a list, padded with <s> / </s> on both sides."""
    return list(nltk.ngrams(tokens, order, pad_right=True, pad_left=True,
                            left_pad_symbol="<s>", right_pad_symbol="</s>"))


train_sentences = ['Thunder and Lightning',
                   'Enter three Witches',
                   'I am faint',
                   'God saue the King',
                   'Looke what I haue here',
                   'Here the lies haue the eyes'
                  ]

tokenized_text = _tokenize_lower(train_sentences)

n = 2 # Highest n-gram order for the Maximum Likelihood Estimator

# Prepare training data:
# Use n-grams of order n (bigrams for n = 2), and mark the start and end of the sentence.
# The order is taken from `n`, so the data always matches the order given to MLE(n).
train_data = [_padded_ngrams(t, n) for t in tokenized_text]
words = [word for sent in tokenized_text for word in sent]
words.extend(["<s>", "</s>"])
padded_vocab = Vocabulary(words)

# Fit model
model = MLE(n)
model.fit(train_data, padded_vocab)

# Use 3 test sentences
# The first sentence appears in the dataset
# The other two do not appear in the dataset.
# However, for the 3rd sentence there are similar bigrams in the training set
test_sentences = ['Thunder and lightning', 'through his eyes', 'I haue the king']
tokenized_text = _tokenize_lower(test_sentences)

# The n-grams are materialized as lists, so the same test_data can be
# iterated by both loops below without being rebuilt
test_data = [_padded_ngrams(t, n) for t in tokenized_text]

print('MLE estimates for test data:')
for i, test in enumerate(test_data):
    # Each entry pairs (word, context) with the model's score for that word given the context
    print(f'\nMLE Estimates for sentence {i}:', [((ngram[-1], ngram[:-1]), model.score(ngram[-1], ngram[:-1])) for ngram in test])

print('\nPerplexities:')
for i, test in enumerate(test_data):
    print("PP({0}):{1}".format(test_sentences[i], model.perplexity(test)))

MLE estimates for test data:

MLE Estimates for sentence 0: [(('thunder', ('<s>',)), 0.16666666666666666), (('and', ('thunder',)), 1.0), (('lightning', ('and',)), 1.0), (('</s>', ('lightning',)), 1.0)]

MLE Estimates for sentence 1: [(('through', ('<s>',)), 0.0), (('his', ('through',)), 0), (('eyes', ('his',)), 0), (('</s>', ('eyes',)), 1.0)]

MLE Estimates for sentence 2: [(('i', ('<s>',)), 0.16666666666666666), (('haue', ('i',)), 0.5), (('the', ('haue',)), 0.5), (('king', ('the',)), 0.3333333333333333), (('</s>', ('king',)), 1.0)]

Perplexities:
PP(Thunder and lightning):1.5650845800732873
PP(through his eyes):inf
PP(I haue the king):2.352158045049347
