In [52]:
import numpy as np
import pandas as pd
import nltk
import random

# Part 1: Load and Preprocess Data

## Part 1.1: Load the data

In [2]:
# Read the whole corpus into memory. The encoding is passed explicitly so the
# result does not depend on the platform's default locale encoding.
with open("en_US.twitter.txt", "r", encoding="utf-8") as f:
    data = f.read()

In [20]:
# Preview the start of the raw text. `data` is a single string, so len()
# reports the number of characters, not the number of words.
print("Data fragment:\n\n",data[:200], "...")
print("\nThere are ",len(data), "characters in the data")

Data fragment:

 How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.
When you meet someone special... you'll know. Your heart will beat more rapidly and you'll ...

There are  3335477 words in the data


## Part 1.2 Pre-process the data

Preprocess this data with the following steps:

- Split data into sentences using "\n" as the delimiter.
- Split each sentence into tokens. Note that in this assignment we use "token" and "words" interchangeably.
- Assign sentences into train or test sets.
- Find tokens that appear at least N times in the training data.
- Replace tokens that appear fewer than N times with "<unk>".

Note: we omit validation data in this exercise.

- In real applications, we should hold a part of data as a validation set and use it to tune our training.
- We skip this process for simplicity.



In [94]:
def preprocess(data):
    """
    Split raw text into lower-cased, tokenized sentences.

    Args:
        data: A single string holding the corpus, one sentence per line.

    Returns:
        A list of token lists, one for every non-blank line of `data`.
    """
    tokenized_sentences = []

    for line in data.split("\n"):
        line = line.strip()
        # Skip blank lines (e.g. the one produced by a trailing newline);
        # otherwise they become empty "sentences" in the train/test split.
        if not line:
            continue
        tokenized_sentences.append(nltk.word_tokenize(line.lower()))

    return tokenized_sentences

In [74]:
def split_tokens(tokens,split, seed=None):
    """
    Randomly partition tokenized sentences into a train and a test set.

    Args:
        tokens: List of tokenized sentences. It is not modified.
        split: Fraction of the sentences assigned to the train set.
        seed: Optional seed for a reproducible shuffle. When None, the
            module-level `random` state is used.

    Returns:
        A (train, test) tuple of lists.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(tokens)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    split_size = int(len(shuffled) * split)

    return shuffled[:split_size], shuffled[split_size:]

In [70]:
def count_words(tokenized_sentences):
    """
    Tally how often each token occurs across all sentences.

    Args:
        tokenized_sentences: List of lists of tokens.

    Returns:
        A dictionary mapping each token to its number of occurrences.
    """
    frequencies = {}

    for tokens in tokenized_sentences:
        for token in tokens:
            frequencies[token] = frequencies.get(token, 0) + 1

    return frequencies

In [83]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """
    Collect the tokens that occur at least `count_threshold` times.

    Args:
        tokenized_sentences: List of lists of tokens.
        count_threshold: Minimum number of occurrences for a token to be kept.

    Returns:
        A list of the qualifying tokens, in order of first appearance.
    """
    frequencies = count_words(tokenized_sentences)

    return [
        token
        for token, frequency in frequencies.items()
        if frequency >= count_threshold
    ]

In [85]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    """
    Replace every token that is not in the vocabulary by `unknown_token`.

    Args:
        tokenized_sentences: List of lists of tokens.
        vocabulary: Iterable of tokens that are kept unchanged.
        unknown_token: Replacement for out-of-vocabulary tokens.

    Returns:
        A new list of token lists; the input sentences are not modified.
    """
    # Materialise the vocabulary as a set once: membership tests become O(1)
    # instead of scanning a list for every token, and a one-shot iterable is
    # not consumed by the first lookup.
    known_words = set(vocabulary)

    return [
        [word if word in known_words else unknown_token for word in sentence]
        for sentence in tokenized_sentences
    ]

In [90]:
def preprocess_data(train_data, test_data, count_threshold):
    """
    Build the vocabulary from the training set and mask rare tokens.

    Args:
        train_data: List of tokenized training sentences.
        test_data: List of tokenized test sentences.
        count_threshold: Minimum training-set frequency for a token to be
            part of the vocabulary.

    Returns:
        A tuple (train_replaced, test_replaced, vocabulary) where both data
        sets have their out-of-vocabulary tokens replaced by "<unk>".
    """
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)

    # The same training-derived vocabulary is applied to both splits.
    train_replaced, test_replaced = (
        replace_oov_words_by_unk(split, vocabulary, unknown_token="<unk>")
        for split in (train_data, test_data)
    )

    return train_replaced, test_replaced, vocabulary

In [98]:
# Tokenize the whole corpus and show the first tokenized sentence.
sentences = preprocess(data)
sentences[0]

['how',
 'are',
 'you',
 '?',
 'btw',
 'thanks',
 'for',
 'the',
 'rt',
 '.',
 'you',
 'gon',
 'na',
 'be',
 'in',
 'dc',
 'anytime',
 'soon',
 '?',
 'love',
 'to',
 'see',
 'you',
 '.',
 'been',
 'way',
 ',',
 'way',
 'too',
 'long',
 '.']

In [101]:
# Put 80% of the tokenized sentences in the train set and the rest in the
# test set. The input is the `sentences` list returned by preprocess(data);
# no notebook-level variable named `tokens` exists.
train_data, test_data = split_tokens(sentences,0.8)
len(train_data), len(test_data)

(38369, 9593)

In [235]:
# Build the vocabulary from tokens seen at least twice in the training set and
# replace every other token in both sets with "<unk>".
train_replaced, test_replaced, vocabulary = preprocess_data(train_data, test_data, 2)

In [109]:
# Report the vocabulary size and show one sentence from each split.
# The indices 2 and 5 are fixed picks, not random draws.
print("Size of vocabulary\n",len(vocabulary), "\n")
print("Random train sentence:\n", train_replaced[2],"\n")
print("Random test sentence:\n", test_replaced[5], "\n")

Size of vocabulary
 14842 

Random train sentence:
 ['the', 'dark', 'ages', 'was', 'caused', 'by', 'the', '<unk>', 'problem'] 

Random test sentence:
 ['her', 'status', 'said', '[', '<unk>', ']', '...', 'bt', 'he', 'did', "n't", 'get', 'the', 'message', '...', '.'] 



# Part 2: Develop n-gram based language models

In this section, you will develop the n-grams language model.

- Assume the probability of the next word depends only on the previous n-gram.
- The previous n-gram is the series of the previous 'n' words.

The conditional probability for the word at position 't' in the sentence, given that the words preceding it are $w_{t-1}, w_{t-2} \cdots w_{t-n}$ is:
$$ P(w_t | w_{t-1}\dots w_{t-n}) \tag{1}$$

You can estimate this probability by counting the occurrences of these series of words in the training data.

- The probability can be estimated as a ratio, where
- The numerator is the number of times word 't' appears after words t-1 through t-n appear in the training data.
- The denominator is the number of times word t-1 through t-n appears in the training data.

$$ \hat{P}(w_t | w_{t-1}\dots w_{t-n}) = \frac{C(w_{t-1}\dots w_{t-n}, w_t)}{C(w_{t-1}\dots w_{t-n})} \tag{2} $$

- The function $C(\cdots)$ denotes the number of occurrences of the given sequence.
- $\hat{P}$ means the estimation of $P$.
- Notice that the denominator of equation (2) is the number of occurrences of the previous $n$ words, and the numerator is the number of occurrences of the same sequence followed by the word $w_t$.

Later, you will modify the equation (2) by adding k-smoothing, which avoids errors when any counts are zero.

The equation (2) tells us that to estimate probabilities based on n-grams, you need the counts of n-grams (for denominator) and (n+1)-grams (for numerator).

In [117]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    """
    Count all n-grams in the data

    Every sentence is padded with `n` start tokens in front and one end token
    at the back before its n-grams are counted.

    Args:
        data: List of lists of words
        n: number of words in a sequence
        start_token: token used to pad the beginning of each sentence
        end_token: token appended to the end of each sentence

    Returns:
        A dictionary that maps a tuple of n-words to its frequency
    """

    n_words = {}

    for sentence in data:
        padded = tuple([start_token] * n + list(sentence) + [end_token])

        # A sequence of length L has exactly L - n + 1 windows of width n.
        # Stopping there keeps every key a full n-tuple; a bound of L - 1
        # yields truncated tuples at the end of the sentence whenever n >= 3.
        for i in range(len(padded) - n + 1):
            n_gram = padded[i:i+n]
            n_words[n_gram] = n_words.get(n_gram, 0) + 1

    return n_words

In [119]:
def estimate_probability(word, previous_n_gram, 
                         n_gram_counts, n_plus1_gram_counts, 
                         vocabulary_size, k=1.0):
    """
    Estimate the k-smoothed probability of `word` following `previous_n_gram`.

    Args:
        word: The candidate next word.
        previous_n_gram: Sequence of the preceding words.
        n_gram_counts: Dictionary mapping n-gram tuples to their counts.
        n_plus1_gram_counts: Dictionary mapping (n+1)-gram tuples to their counts.
        vocabulary_size: Number of words in the vocabulary.
        k: Smoothing constant added to the numerator and, scaled by the
            vocabulary size, to the denominator.

    Returns:
        (count(previous_n_gram + word) + k) / (count(previous_n_gram) + k * vocabulary_size),
        where a sequence missing from its dictionary counts as zero.
    """
    context = tuple(previous_n_gram)

    context_count = n_gram_counts.get(context, 0)
    joint_count = n_plus1_gram_counts.get(context + (word,), 0)

    return (joint_count + k) / (context_count + k * vocabulary_size)

In [137]:
def estimate_all_probabilities(previous_n_gram, 
                               n_gram_counts, n_plus1_gram_counts, 
                               vocabulary, k=1.0):
    """
    Estimate the smoothed probability of every candidate next word.

    The candidates are the words of `vocabulary` followed by "<e>" and
    "<unk>"; the number of candidates is used as the vocabulary size for
    smoothing.

    Args:
        previous_n_gram: Sequence of the preceding words.
        n_gram_counts: Dictionary mapping n-gram tuples to their counts.
        n_plus1_gram_counts: Dictionary mapping (n+1)-gram tuples to their counts.
        vocabulary: List of words. A new list is built from it, so the
            argument itself is not modified.
        k: Smoothing constant passed on to estimate_probability.

    Returns:
        A dictionary mapping each candidate word to its estimated probability.
    """
    context = tuple(previous_n_gram)
    candidates = vocabulary + ["<e>", "<unk>"]
    candidate_count = len(candidates)

    return {
        candidate: estimate_probability(candidate, context,
                                        n_gram_counts, n_plus1_gram_counts,
                                        candidate_count, k=k)
        for candidate in candidates
    }


In [209]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    """
    Arrange (n+1)-gram counts as a table of contexts versus next words.

    Args:
        n_plus1_gram_counts: Dictionary mapping (n+1)-gram tuples to counts.
        vocabulary: List of words. It is not modified; "<e>" and "<unk>" are
            appended to a copy to form the columns.

    Returns:
        A pandas DataFrame with one row per distinct n-gram (an (n+1)-gram
        without its last word, in order of first appearance) and one column
        per vocabulary word. Each cell holds the count of that n-gram followed
        by that word; (n+1)-grams whose last word is not a column are skipped.
    """
    # Build a new list: `vocabulary += [...]` would extend the caller's list
    # in place and add duplicate columns on every further call.
    vocabulary = list(vocabulary) + ["<e>", "<unk>"]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the row
    # order is the same on every run.
    n_grams = list(dict.fromkeys(
        n_plus1_gram[0:-1] for n_plus1_gram in n_plus1_gram_counts
    ))

    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_index = {word:j for j, word in enumerate(vocabulary)}

    count_matrix = np.zeros((len(n_grams), len(vocabulary)))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        word = n_plus1_gram[-1]
        # Dictionary lookup instead of scanning the vocabulary list.
        if word not in col_index:
            continue
        count_matrix[row_index[n_plus1_gram[0:-1]], col_index[word]] = count

    return pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)

In [212]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    """
    Turn (n+1)-gram counts into a table of k-smoothed conditional probabilities.

    Args:
        n_plus1_gram_counts: Dictionary mapping (n+1)-gram tuples to counts.
        vocabulary: List of words used as the columns of the count matrix.
        k: Smoothing constant added to every cell before normalising.

    Returns:
        A pandas DataFrame shaped like the count matrix in which every row
        has been divided by its sum.
    """
    # Use the `vocabulary` argument rather than a notebook-level global, so
    # the result depends only on what the caller passes in.
    count_matrix = make_count_matrix(n_plus1_gram_counts, vocabulary)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)

    return prob_matrix

In [213]:
# Toy two-sentence corpus for checking the bigram probability matrix by eye.
# NOTE: this rebinds `sentences`, which earlier held the tokenized corpus.
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
# Vocabulary = the distinct words of the two sentences.
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,this,like,cat,is,i,a,dog,<e>,<unk>
"(this,)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(dog,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(<s>,)",0.181818,0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909,0.090909
"(is,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(like,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909
"(a,)",0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(i,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909


# Part 3: Perplexity

In this section, you will generate the perplexity score to evaluate your model on the test set.

- You will also use back-off when needed.
- Perplexity is used as an evaluation metric of your language model.
- To calculate the perplexity score of the test set on an n-gram model, use:

$$ PP(W) =\sqrt[N]{ \prod_{t=n+1}^N \frac{1}{P(w_t | w_{t-n} \cdots w_{t-1})} } \tag{4}$$

- where $N$ is the length of the sentence.
- $n$ is the number of words in the n-gram (e.g. 2 for a bigram).
- In math, the numbering starts at one and not zero.

In code, array indexing starts at zero, so the code will use ranges for $t$ according to this formula:
$$ PP(W) =\sqrt[N]{ \prod_{t=n}^{N-1} \frac{1}{P(w_t | w_{t-n} \cdots w_{t-1})} } \tag{4.1}$$

The higher the probabilities are, the lower the perplexity will be.

- The more the n-grams tell us about the sentence, the lower the perplexity score will be.



In [214]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Compute the perplexity of one sentence under a k-smoothed n-gram model.

    Args:
        sentence: List of words.
        n_gram_counts: Dictionary mapping n-gram tuples to their counts. The
            length of its first key is taken as n, so it must not be empty.
        n_plus1_gram_counts: Dictionary mapping (n+1)-gram tuples to their counts.
        vocabulary_size: Number of words in the vocabulary, used for smoothing.
        k: Smoothing constant passed on to estimate_probability.

    Returns:
        The N-th root of the product of the inverse word probabilities, where
        N is the length of the sentence after padding it with n "<s>" tokens
        and one "<e>" token.
    """

    n = len(list(n_gram_counts.keys())[0])
    sentence = tuple(["<s>"] * n + sentence + ["<e>"])
    N = len(sentence)
    product_pi = 1.0

    # Start at index n so that every word has a full n-word context.
    for t in range(n, N):
        n_gram = sentence[t-n:t]
        word = sentence[t]
        # Smooth with the caller's vocabulary_size and k instead of a
        # notebook-level global and a hard-coded constant.
        probability = estimate_probability(word, n_gram, n_gram_counts,
                                           n_plus1_gram_counts, vocabulary_size, k=k)
        product_pi *= 1 / probability

    perplexity = product_pi**(1/float(N))

    return perplexity

# Part 4: Build an auto-complete system

In this section, you will combine the language models developed so far to implement an auto-complete system.

In [225]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """
    Pick the most probable next word given the preceding tokens.

    Args:
        previous_tokens: List of the words typed so far.
        n_gram_counts: Dictionary mapping n-gram tuples to their counts. The
            length of its first key is taken as n.
        n_plus1_gram_counts: Dictionary mapping (n+1)-gram tuples to their counts.
        vocabulary: List of words.
        k: Smoothing constant.
        start_with: Optional prefix; when given, only words that start with
            it are considered.

    Returns:
        A tuple (word, probability). The word is None and the probability 0
        when no candidate qualifies.
    """
    first_n_gram = list(n_gram_counts)[0]
    order = len(first_n_gram)

    # Only the last `order` tokens form the conditioning context.
    context = previous_tokens[-order:]
    candidates = estimate_all_probabilities(context,
                                            n_gram_counts, n_plus1_gram_counts,
                                            vocabulary, k=k)

    best_word, best_prob = None, 0
    for candidate, prob in candidates.items():
        matches_prefix = start_with is None or candidate.startswith(start_with)
        # Strict comparison: on ties the earliest candidate is kept.
        if matches_prefix and prob > best_prob:
            best_word, best_prob = candidate, prob

    return best_word, best_prob

In [237]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    """
    Collect one suggestion from every pair of consecutive n-gram models.

    Args:
        previous_tokens: List of the words typed so far.
        n_gram_counts_list: List of n-gram count dictionaries; each entry is
            paired with the entry that follows it as (n-gram, (n+1)-gram) counts.
        vocabulary: List of words.
        k: Smoothing constant.
        start_with: Optional prefix that a suggested word must start with.

    Returns:
        A list of (word, probability) tuples, one per consecutive pair.
    """
    consecutive_models = zip(n_gram_counts_list, n_gram_counts_list[1:])

    return [
        suggest_a_word(previous_tokens, lower_order_counts,
                       higher_order_counts, vocabulary,
                       k=k, start_with=start_with)
        for lower_order_counts, higher_order_counts in consecutive_models
    ]

# Testing model

In [228]:
# Count 1-grams through 5-grams. The counts are built from `train_replaced`
# (rare tokens already mapped to "<unk>") so the models are trained on the
# same vocabulary that is later offered as suggestion candidates.
n_gram_counts_list = []
for n in range(1, 6):
    n_model_counts = count_n_grams(train_replaced, n)
    n_gram_counts_list.append(n_model_counts)

In [256]:
# Ask each consecutive pair of n-gram models for its most likely next word
# after "how are"; the cell shows one (word, probability) tuple per pair.
previous_tokens = ["how","are"]
get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

[('you', 0.022271588061061744),
 ('you', 0.0036878101113048145),
 ('that', 6.736728644570196e-05),
 ('that', 6.736728644570196e-05)]