#  Building a simple statistical language model on a shakespeare courpus


This is a simple python code to show how to build a n-gram language model.
Here are the steps:

1. Preprocessing
      1. Get raw text
      1. Tokenize the text
      1. Clean the text
2. Building the n-gram lm (language model)
      1. get a sequence of word
      2. get the frequency of the n-grams.
      3. Loop through the whole corpus, store and compute the possibility of the existing n-grams.
3. Use the model to complete a task, such as text generation
      1. 

## Preprocessing
Let's start now.
The first thing is get text from a txt file. One thing should be concerned is the encoding. 
Since we use python 3, make sure the input file is encoded in unicode.

In [153]:
# store the file line by lines
def get_raw_text(path):
    with open(path) as f:
        return f.readlines()
raw_text = get_raw_text("data/train_shakespeare.txt")

In [154]:
# let see the first line 
print(raw_text[0])

Hence! home, you idle creatures get you home: Is this a holiday? what! know you not, Being mechanical, you ought not walk Upon a labouring day without the sign Of your profession? Speak, what trade art thou?



Looks nice!
Then we can do the tokenization. For English it is rather a simple task because words are seperated by spaces.
However, the punctuation is sticked to the previous word, you should be careful about that.

In [155]:
# Tokenize the text
# Notice that at the end of line we have a newline character, so we have to remove it first by strip()
def split_by_space(text):
    return text.strip().split()

In [156]:
# let see how it works
print(split_by_space(raw_text[0]))

['Hence!', 'home,', 'you', 'idle', 'creatures', 'get', 'you', 'home:', 'Is', 'this', 'a', 'holiday?', 'what!', 'know', 'you', 'not,', 'Being', 'mechanical,', 'you', 'ought', 'not', 'walk', 'Upon', 'a', 'labouring', 'day', 'without', 'the', 'sign', 'Of', 'your', 'profession?', 'Speak,', 'what', 'trade', 'art', 'thou?']


In [157]:
# We don't want the punctuation stick to the word, so lets seperate it from the word


def split_punc(tokens):
    output = []
    punc = '!'
    for token in tokens:
        for i in range(len(token),0,-1):
            if token[:i].isalpha():
                if i==len(token):
                    output.append(token)
                else:
                    output.append(token[:i])
                    output.append(token[i:])
                break
    return output

In [158]:
# let see how it works
print(split_punc(tokenize(raw_text[0])))

['Hence', '!', 'home', ',', 'you', 'idle', 'creatures', 'get', 'you', 'home', ':', 'Is', 'this', 'a', 'holiday', '?', 'what', '!', 'know', 'you', 'not', ',', 'Being', 'mechanical', ',', 'you', 'ought', 'not', 'walk', 'Upon', 'a', 'labouring', 'day', 'without', 'the', 'sign', 'Of', 'your', 'profession', '?', 'Speak', ',', 'what', 'trade', 'art', 'thou', '?']


In [159]:
# Turn uppercase into lowercase

def lowercase(tokens):
    output = []
    for token in tokens:
        output.append(token.lower())
    return output

In [160]:
print(lowercase(split_punc(tokenize(raw_text[0]))))

['hence', '!', 'home', ',', 'you', 'idle', 'creatures', 'get', 'you', 'home', ':', 'is', 'this', 'a', 'holiday', '?', 'what', '!', 'know', 'you', 'not', ',', 'being', 'mechanical', ',', 'you', 'ought', 'not', 'walk', 'upon', 'a', 'labouring', 'day', 'without', 'the', 'sign', 'of', 'your', 'profession', '?', 'speak', ',', 'what', 'trade', 'art', 'thou', '?']


In [161]:
# pack them together
def tokenizer(raw_text):
    tokenized_text = []
    for line in raw_text:
        tokenized_text.append(lowercase(split_punc(tokenize(line))))
    return tokenized_text



In [162]:
tokenized_text = tokenizer(raw_text)
print(tokenized_text[0])

['hence', '!', 'home', ',', 'you', 'idle', 'creatures', 'get', 'you', 'home', ':', 'is', 'this', 'a', 'holiday', '?', 'what', '!', 'know', 'you', 'not', ',', 'being', 'mechanical', ',', 'you', 'ought', 'not', 'walk', 'upon', 'a', 'labouring', 'day', 'without', 'the', 'sign', 'of', 'your', 'profession', '?', 'speak', ',', 'what', 'trade', 'art', 'thou', '?']


## Building Language Model
Now we have a clean data to train the model, let's build a language model

In [288]:
class language_model():
    # initial with no. of gram
    def __init__(self, n):
        self.n = n
        self.start_sign = '<s>'
        self.end_sign = '<e>'
        # build a dictionary to record the frequency of next word
        self.f_next = {}
        # build a dictionary to record the normalized probability of next word
        self.p_next = {}
        self.word_type = set()
        
    # a function to pad a sentence with start sign and end sign    
    def pad(self, tokens):
        return [self.start_sign]*(self.n-1) + tokens + [self.end_sign]*(self.n-1)
    
    
    # train a list of sentence    
    def train(self, tokenized_text):
        for tokens_seq in tokenized_text:
            padded_tokens = self.pad(tokens_seq)
            for i in range(len(padded_tokens)-self.n):
                history = tuple(padded_tokens[i:i+self.n-1])
                next_word = padded_tokens[i+self.n]
                if next_word!='<e>':
                    self.word_type.add(next_word)
                if history in self.f_next.keys():
                    try:
                        self.f_next[history][next_word] += 1
                    except:
                        self.f_next[history][next_word] = 1
                else:
                    self.f_next[history]={}
                    self.f_next[history][next_word] = 1
    
    # normalized the frequency
    def normalized(self):
        for history in self.p_next:
            pass
            

In [289]:
# Build a tri-gram model
lm_n3 = language_model(3)

In [290]:
# test it with two sentences
lm_n3.train(tokenized_text[:2])

In [207]:
print(lm_n3.f_next)

{('<s>', '<s>'): {'hence': 1, '!': 1, 'why': 1, ',': 1}, ('<s>', 'hence'): {'home': 1}, ('hence', '!'): {',': 1}, ('!', 'home'): {'you': 1}, ('home', ','): {'idle': 1}, (',', 'you'): {'creatures': 1, 'not': 1}, ('you', 'idle'): {'get': 1}, ('idle', 'creatures'): {'you': 1}, ('creatures', 'get'): {'home': 1}, ('get', 'you'): {':': 1}, ('you', 'home'): {'is': 1}, ('home', ':'): {'this': 1}, (':', 'is'): {'a': 1}, ('is', 'this'): {'holiday': 1}, ('this', 'a'): {'?': 1}, ('a', 'holiday'): {'what': 1}, ('holiday', '?'): {'!': 1}, ('?', 'what'): {'know': 1}, ('what', '!'): {'you': 1}, ('!', 'know'): {'not': 1}, ('know', 'you'): {',': 1}, ('you', 'not'): {'being': 1}, ('not', ','): {'mechanical': 1}, (',', 'being'): {',': 1}, ('being', 'mechanical'): {'you': 1}, ('mechanical', ','): {'ought': 1}, ('you', 'ought'): {'walk': 1}, ('ought', 'not'): {'upon': 1}, ('not', 'walk'): {'a': 1}, ('walk', 'upon'): {'labouring': 1}, ('upon', 'a'): {'day': 1}, ('a', 'labouring'): {'without': 1}, ('labouring

## Text generator
Now we have a language model to use, let's build a text generator which can generator shakespeare's script.

In [303]:
#import random to make random choices
import random


def generator(lm, init_word = 'hence', min_len=5, max_len=200):
    output_text = [lm.start_sign]*(lm.n-1)
    while init_word == None or not init_word.isalpha():
        init_word = random.choice(list(lm.word_type))
    output_text.append(init_word)
    while max_len > len(output_text)  and len(output_text) <= max_len and output_text[-1]!=lm.end_sign:
        history = tuple(output_text[-lm.n+1:])
        try:
            output_text.extend(random.choices(list(lm.f_next[history].keys()), lm.f_next[history].values()))
        except:
            random_words = random.choice(list(lm.f_next.keys()))
            output_text.extend(random_words)
            output_text.extend(random.choices(list(lm.f_next[random_words].keys()), lm.f_next[random_words].values()))
            
    return output_text

In [304]:
text = generator(lm_n3)
print(text)

['<s>', '<s>', 'hence', 'then', 'things', 'more', ':', 'my', 'can', 'this', 'mangled', 'at', 'heaven', ';', 'shall', 'along', 'you', 'scarcely', 'bruise', 'this', 'affect', '.', '<e>']


Looks legit! Lets make it into human lanugae.

In [324]:

def detokenizer(token_seq):
    output_string = ''
    for token in token_seq:
        if token != '<s>' and token != '<e>':
            if output_string == '':
                output_string = output_string + token.capitalize()
            elif not token.isalpha():
                output_string = output_string + token
            else:
                output_string = output_string + " " + token 
    if output_string[-1].is
    return output_string

In [325]:
print(detokenizer(text))

Hence then things more: my can this mangled at heaven; shall along you scarcely bruise this affect.


Try it again with training the whole corpus

In [327]:
raw_text = get_raw_text("data/train_shakespeare.txt")
tokenized_text = tokenizer(raw_text)
lm_n3 = language_model(3)
lm_n3.train(tokenized_text)
text_tokens = generator(lm_n3)
generated_text = detokenizer(text_tokens)
print(generated_text)

Hence verona it mis in repair and no general for sinking them's weapons water thou reportest to outlive the i grows it? is your?, thine,-- fifty five talents greet the senseless with thee hither thy either you stay
