In [1]:
#Nothing additional is needed to run. Just use kernel restart run all
import random
import nltk
from collections import Counter
from collections import defaultdict
from random import sample
from datasets import load_dataset
# load in the training and testing datasets
dataset = load_dataset('google_wellformed_query')
testset = load_dataset('jordane95/trec-dl-2019-query')

Found cached dataset google_wellformed_query (/Users/laurynfluellen/.cache/huggingface/datasets/google_wellformed_query/default/0.0.0/9430d51f37bef61e99ec438f538b079d42bfc8da5e45b1e26bd85e35ba8a8a89)


  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration jordane95--trec-dl-2019-query-07c31337644c79b5
Found cached dataset json (/Users/laurynfluellen/.cache/huggingface/datasets/jordane95___json/jordane95--trec-dl-2019-query-07c31337644c79b5/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
class Dataset(object):
    def __init__(self, sample_ratio=1.0):
        # combine all of the lines to take in for training 
        self.data_lines = []
        dataset = load_dataset('google_wellformed_query')
        self.data_lines = self.data_lines + dataset['train']['content']
        self.data_lines = self.data_lines + dataset['test']['content']
        self.data_lines = self.data_lines + dataset['validation']['content']
        # filter out lines that are not greater than 0
        self.data_lines = list(filter(lambda data_lines: len(data_lines) > 0, self.data_lines))
data_set = Dataset()

Found cached dataset google_wellformed_query (/Users/laurynfluellen/.cache/huggingface/datasets/google_wellformed_query/default/0.0.0/9430d51f37bef61e99ec438f538b079d42bfc8da5e45b1e26bd85e35ba8a8a89)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# print out a sample of 5 lines of training set to get an idea of what it looks like
sample(data_set.data_lines, 5)

['What is cubic meter in chemistry ?',
 'Nature in becoming a detective ?',
 'Professor layton camera puzzle ?',
 'What will prevent iron rusting ?',
 'What are carry-on restrictions to board an airplane ?']

In [4]:
class AutoSuggestModel(object):
    def __init__(self,
                 unknown_token='',
                 start_token="<s>",
                 end_token="</s>",
                 k=1):
        self.start_token = start_token
        self.end_token = end_token
        self.no_match_threshold = 5
        self.unknown_word = "UNK"
        self.k = k
        self.word_frequency = Counter()
        self.dict_ngram_word_frequency = {}
        self.tokenized_sentences = None
        self.dict_ngram_plus1_word_frequency = {}

    def tokenize(self, sentences):
        # Set words to lower case and tokenize with nltk package
        self.tokenized_sentences = [nltk.word_tokenize(words.lower()) for words in sentences]

    def train(self, minimum_freq=5, ngram=3):
        self.minimum_freq = minimum_freq
        #Allows for ngrams of differing sizes 
        self.ngram = ngram
        # Counts word counts
        for tokenized_sentence in self.tokenized_sentences:
            self.word_frequency.update(tokenized_sentence)
        self.vocab = list(self.word_frequency.keys()) + [self.unknown_word, ""]
        
        # add UNK token to unknown words -> words with low frequency
        new = []
        for sentence in self.tokenized_sentences:
            new_sentence = []
            for each in sentence:
                if self.word_frequency[each] != 0:
                    new_sentence.append(each)
                #if the frequency is not at least 1 add UNK to list instead of the word
                else:
                    new_sentence.append(self.unknown_word)
            new.append(new_sentence)
        self.tokenized_sentences = new
        
        # add to dictionary 
        self.dict_ngram_word_frequency = self.count_ngrams(self.tokenized_sentences, self.ngram)
        self.dict_ngram_plus1_word_frequency = self.count_ngrams(self.tokenized_sentences, self.ngram + 1)

    def count_ngrams(self, tokenized_sentences, ngram):
        frequency = defaultdict(lambda: 0)
        for sentence in tokenized_sentences:
            #add the start and end tokens
            sentence = [self.start_token] * ngram + sentence + [self.end_token]
            #account for sentence length for different ngram sizes 
            m = len(sentence) if ngram == 1 else len(sentence) - 1
            for i in range(m):
                ngram_token = sentence[i:i + ngram]
                frequency[" ".join(ngram_token)] += 1
        return frequency

    def estimate_probability(self, word, previous_ngram):
        # turn previous_ngram into a list was returning a tuple 
        if type(previous_ngram) != list:
            previous_ngram = [previous_ngram]
            #join to take in as a string
        previous_ngram = " ".join(previous_ngram)
        previous_ngram_count = self.dict_ngram_word_frequency.get(previous_ngram, 0)
        if previous_ngram_count == 0:
            return 0
        denominator = previous_ngram_count + self.k * len(self.vocab)
        n_plus1_gram = previous_ngram + " " + word
        n_plus1_gram_count = self.dict_ngram_plus1_word_frequency.get(n_plus1_gram, 0)
        numerator = n_plus1_gram_count + self.k
        probability = numerator / denominator
        return probability

    def estimate_probabilities(self, previous_ngram):
        probabilities = {}
        # turn previous_ngram into a list was returning a tuple 
        if type(previous_ngram) != list:
            previous_ngram = [previous_ngram]
        previous_ngram = " ".join(previous_ngram).lower()
        for each in self.vocab:
            probabilities[each] = self.estimate_probability(each, previous_ngram)
        return probabilities

    def suggestions(self, previous_tokens, num_suggestions=5, start_with=None):
        #Prevuous token is the last ngram from the previous context sentence
        previous_ngram = previous_tokens[-self.ngram:]
        probabilities = self.estimate_probabilities(previous_ngram)
        probs = probabilities.items()
        probs = filter(lambda t: t[1] > 0, probs)
        if start_with:
            probs = filter(lambda t: t[0].startswith(start_with), probs)
        # to sort by probability (highest at the front)
        probs = sorted(probs, key=lambda t: t[1], reverse=True)
        # Map the word and probability to each other so we have a list of words instead of a list of integers
        words = map(lambda t: t[0], probs)
        words_list = list(words)
        return words_list[:num_suggestions],

In [5]:
class Testset(object):
    def __init__(self, sample_ratio=1.0):
        # combine all of the lines to take in for training 
        self.test_lines = []
        testset = load_dataset('jordane95/trec-dl-2019-query')
        self.test_lines = self.test_lines + testset['train']['query']
        # filter out lines that are not greater than 0
        self.test_lines = list(filter(lambda test_lines: len(test_lines) > 0, self.test_lines))

test_set = Testset()

Using custom data configuration jordane95--trec-dl-2019-query-07c31337644c79b5
Found cached dataset json (/Users/laurynfluellen/.cache/huggingface/datasets/jordane95___json/jordane95--trec-dl-2019-query-07c31337644c79b5/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
sample(test_set.test_lines, 5)

['what county is pontiac, illinois in?',
 'what does chs stand for?',
 'define apprehension.',
 'is the titanic the biggest ship ever',
 'sensibilities, definition']

In [7]:
test_lines = testset['train']['query']
sentence1 = ["what", "is", "a", "period"]
sentence2 = ["wat", "is"]
sentence3 = ["stamford", "federal", "credit", "union", "routing"]
sentence4 = ["what", "format", "does", "a", "thumb", "drive", "need", "to", "be", "for", "a"]

In [8]:
# All models 
model = AutoSuggestModel()
model.tokenize(test_lines)
model.train(minimum_freq=5, ngram=1)

model2 = AutoSuggestModel()
model2.tokenize(test_lines)
model2.train(minimum_freq=5, ngram=2)

model3 = AutoSuggestModel()
model3.tokenize(test_lines)
model3.train(minimum_freq=5, ngram=3)


model4 = AutoSuggestModel()
model4.tokenize(test_lines)
model4.train(minimum_freq=5, ngram=4)

In [9]:
import string
list_sentences = test_set.test_lines
#print(len(list_sentences))
#print(list_sentences)
sentnece_minus_1_for_suggestion = []
sentence = []
popped = []
length_list = []
for each in list_sentences:
    no_punct = each.translate(str.maketrans('', '', string.punctuation))
    split = list(no_punct.split(" "))
    #print(split)
    sentence.append(split)
    #print(len(split))
    length_list.append(len(split))
average = sum(length_list)/len(length_list)
print(average)
#print("\nFull sentence:", sentence, "\n")
for items in sentence:
    #print(items[:-1])
    sentnece_minus_1_for_suggestion.append(items[:-1])
    popped.append(items.pop())
#print("Sentence minus 1:", sentnece_minus_1_for_suggestion, "\n")
#print("Last words removed:", popped, "\n")

5.785


In [10]:
# list of n-gram models that suggested the word we were looking for
list_results = []
for i in range(len(sentnece_minus_1_for_suggestion)):
    results = []
    print("Sentence:", sentnece_minus_1_for_suggestion[i])
    print("Word we're looking for:", popped[i])
    #print(each)
    # Run each model with test set sentences
    print("Unigram model suggestions:", model.suggestions(sentnece_minus_1_for_suggestion[i]))
    print("Bigram model suggestions:", model2.suggestions(sentnece_minus_1_for_suggestion[i]))
    print("Trigram model suggestions:", model3.suggestions(sentnece_minus_1_for_suggestion[i]))
    print("Ngram model suggestions:", model4.suggestions(sentnece_minus_1_for_suggestion[i]), "\n")
    # the word we are looking for
    words = popped[i]
    # if the word is in the suggested list of 5 words add to list of results
    if (any(words in i for i in model.suggestions(sentnece_minus_1_for_suggestion[i]))):
        results.append("Unigram")
    if (any(words in i for i in model2.suggestions(sentnece_minus_1_for_suggestion[i]))):
         results.append("Bigram")
    if (any(words in i for i in model3.suggestions(sentnece_minus_1_for_suggestion[i]))):
         results.append("Trigram")
    if (any(words in i for i in model4.suggestions(sentnece_minus_1_for_suggestion[i]))):
         results.append("Ngram")
    list_results.append(results)
# lists to hold the count of each n-gram size
uni = []
bi = []
tri = []
ngram = []
all_ngrams = []
for each in list_results:
    uni_count = each.count("Unigram")
    bi_count = each.count("Bigram")
    tri_count = each.count("Trigram")
    ngram_count = each.count("Ngram")
    uni.append(uni_count)
    bi.append(bi_count)
    tri.append(tri_count)
    ngram.append(ngram_count)
# choices to return for which n-gram has the highest talley
choices = ["Unigram", "Bigram", "Trigram", "Ngram"]
all_ngrams.append(uni.count(1))
all_ngrams.append(bi.count(1))
all_ngrams.append(tri.count(1))
all_ngrams.append(ngram.count(1))
    

Sentence: ['what', 'slows', 'down', 'the', 'flow', 'of']
Word we're looking for: blood
Unigram model suggestions: (['a', 'the', 'blood', 'having', 'heat'],)
Bigram model suggestions: (['blood', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: (['blood', 'what', 'slows', 'down', 'the'],)
Ngram model suggestions: (['blood', 'what', 'slows', 'down', 'the'],) 

Sentence: ['what', 'is', 'the', 'county', 'for', 'grand', 'rapids']
Word we're looking for: mn
Unigram model suggestions: ([',', 'what', 'slows', 'down', 'the'],)
Bigram model suggestions: ([',', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: ([',', 'what', 'slows', 'down', 'the'],)
Ngram model suggestions: ([',', 'what', 'slows', 'down', 'the'],) 

Sentence: ['what', 'is']
Word we're looking for: ruclip
Unigram model suggestions: (['the', 'a', 'an', 'ruclip', 'sugar'],)
Bigram model suggestions: (['the', 'a', 'an', 'ruclip', 'wifi'],)
Trigram model suggestions: ([],)
Ngram model suggestions: ([],) 

Senten

Sentence: ['what', 'is', 'ar']
Word we're looking for: balance
Unigram model suggestions: (['balance', 'glasses', 'what', 'slows', 'down'],)
Bigram model suggestions: (['balance', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: (['balance', 'what', 'slows', 'down', 'the'],)
Ngram model suggestions: ([],) 

Sentence: ['biggest', 'house', 'you', 'can', 'buy', 'in']
Word we're looking for: skyrim
Unigram model suggestions: (['the', 'one', 'blood', 'your', 'nc'],)
Bigram model suggestions: (['skyrim', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: (['skyrim', 'what', 'slows', 'down', 'the'],)
Ngram model suggestions: (['skyrim', 'what', 'slows', 'down', 'the'],) 

Sentence: ['definition', 'of', 'a', 'first']
Word we're looking for: harmonic
Unigram model suggestions: (['steam', 'letter', 'african-american', 'harmonic', 'what'],)
Bigram model suggestions: (['harmonic', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: (['harmonic', 'what', 'slows', 'dow

Sentence: ['what', 'is', 'theraderm', 'used']
Word we're looking for: for
Unigram model suggestions: (['for', 'to', 'what', 'slows', 'down'],)
Bigram model suggestions: (['for', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: (['for', 'what', 'slows', 'down', 'the'],)
Ngram model suggestions: (['for', 'what', 'slows', 'down', 'the'],) 

Sentence: ['what', 'causes', 'ankle']
Word we're looking for: blisters
Unigram model suggestions: (['blisters', 'what', 'slows', 'down', 'the'],)
Bigram model suggestions: (['blisters', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: (['blisters', 'what', 'slows', 'down', 'the'],)
Ngram model suggestions: ([],) 

Sentence: ['what', 'is', 'famvir', 'prescribed']
Word we're looking for: for
Unigram model suggestions: (['for', 'what', 'slows', 'down', 'the'],)
Bigram model suggestions: (['for', 'what', 'slows', 'down', 'the'],)
Trigram model suggestions: (['for', 'what', 'slows', 'down', 'the'],)
Ngram model suggestions: (['for', 

In [11]:
for i in range(len(all_ngrams)):
    # return the name of the model with the highest count
    print(choices[i], all_ngrams[i])
print("\nOf all of the models the", '\033[1m' + choices[all_ngrams.index(max(all_ngrams))] + '\033[0m', "model suggests the next word correctly most often")


Unigram 151
Bigram 180
Trigram 160
Ngram 124

Of all of the models the [1mBigram[0m model suggests the next word correctly most often


In [12]:
# Old extra testing to visualize before the above code 
words_sent1_uni = model.suggestions(sentence1)
print(words_sent1_uni)
for each in words_sent1_uni:
    print(model.estimate_probability(each[0], "period"))
    print(model.estimate_probability(each[1], "period"))
    print(model.estimate_probability(each[2], "period"))
    print(model.estimate_probability(each[3], "period"))
    print(model.estimate_probability(each[4], "period"), "\n")
    

words_sent2_uni = model.suggestions(sentence2)
print(words_sent2_uni)
for each in words_sent2_uni:
    print(model.estimate_probability(each[0], "is"))
    print(model.estimate_probability(each[1], "is"))
    print(model.estimate_probability(each[2], "is"))
    print(model.estimate_probability(each[3], "is"))
    print(model.estimate_probability(each[4], "is"), "\n")
    
words_sent3_uni = model.suggestions(sentence3)
print(words_sent3_uni)
for each in words_sent3_uni:
    print(model.estimate_probability(each[0], "routing"))
    print(model.estimate_probability(each[1], "routing"))
    print(model.estimate_probability(each[2], "routing"))
    print(model.estimate_probability(each[3], "routing"))
    print(model.estimate_probability(each[4], "routing"), "\n")
    
words_sent4_uni = model.suggestions(sentence4)
print(words_sent4_uni)
for each in words_sent4_uni:
    print(model.estimate_probability(each[0], "a"))
    print(model.estimate_probability(each[1], "a"))
    print(model.estimate_probability(each[2], "a"))
    print(model.estimate_probability(each[3], "a"))
    print(model.estimate_probability(each[4], "a"))

(['calculus', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

(['the', 'a', 'an', 'ruclip', 'sugar'],)
0.020497803806734993
0.014641288433382138
0.004392386530014641
0.0029282576866764276
0.0029282576866764276 

(['number', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

(['nosebleed', 'synonym', 'captain', 'face', 'supplement'],)
0.0030816640986132513
0.0030816640986132513
0.0030816640986132513
0.0030816640986132513
0.0030816640986132513


In [13]:
words_sent1_bi = model2.suggestions(sentence1)
print(words_sent1_bi)
for each in words_sent1_bi:
    print(model2.estimate_probability(each[0], ["a", "period"]))
    print(model2.estimate_probability(each[1], ["a", "period"]))
    print(model2.estimate_probability(each[2], ["a", "period"]))
    print(model2.estimate_probability(each[3], ["a", "period"]))
    print(model2.estimate_probability(each[4], ["a", "period"]), "\n")
    
words_sent2_bi = model2.suggestions(sentence2)
print(words_sent2_bi)
for each in words_sent2_bi:
    print(model2.estimate_probability(each[0], ["wat", "is"]))
    print(model2.estimate_probability(each[1], ["wat", "is"]))
    print(model2.estimate_probability(each[2], ["wat", "is"]))
    print(model2.estimate_probability(each[3], ["wat", "is"]))
    print(model2.estimate_probability(each[4], ["wat", "is"]), "\n")
    
words_sent3_bi = model2.suggestions(sentence3)
print(words_sent3_bi)
for each in words_sent3_bi:
    print(model2.estimate_probability(each[0], ["union", "routing"]))
    print(model2.estimate_probability(each[1],  ["union", "routing"]))
    print(model2.estimate_probability(each[2],  ["union", "routing"]))
    print(model2.estimate_probability(each[3],  ["union", "routing"]))
    print(model2.estimate_probability(each[4],  ["union", "routing"]), "\n")
    
words_sent4_bi = model2.suggestions(sentence4)
print(words_sent4_bi)
for each in words_sent4_bi:
    print(model2.estimate_probability(each[0], ["for", "a"]))
    print(model2.estimate_probability(each[1],["for", "a"]))
    print(model2.estimate_probability(each[2], ["for", "a"]))
    print(model2.estimate_probability(each[3],["for", "a"]))
    print(model2.estimate_probability(each[4], ["for", "a"]))

(['calculus', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

(['dopamine', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

(['number', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

(['mac', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016


In [14]:
words_sent1_tri = model3.suggestions(sentence1)
print(words_sent1_tri)
for each in words_sent1_tri:
    print(model3.estimate_probability(each[0], ["is", "a", "period"]))
    print(model3.estimate_probability(each[1], ["is", "a", "period"]))
    print(model3.estimate_probability(each[2], ["is", "a", "period"]))
    print(model3.estimate_probability(each[3], ["is", "a", "period"]))
    print(model3.estimate_probability(each[4], ["is", "a", "period"]), "\n")

words_sent2_tri = model3.suggestions(sentence2)
print(words_sent2_tri, "\n")
    
words_sent3_tri = model3.suggestions(sentence3)
print(words_sent3_tri)
for each in words_sent3_tri:
    print(model3.estimate_probability(each[0], ["credit", "union", "routing"]))
    print(model3.estimate_probability(each[1], ["credit", "union", "routing"]))
    print(model3.estimate_probability(each[2], ["credit", "union", "routing"]))
    print(model3.estimate_probability(each[3], ["credit", "union", "routing"]))
    print(model3.estimate_probability(each[4], ["credit", "union", "routing"]), "\n")
    
words_sent4_tri = model3.suggestions(sentence4)
print(words_sent4_tri)
for each in words_sent4_tri:
    print(model3.estimate_probability(each[0], ["be","for", "a"]))
    print(model3.estimate_probability(each[1], ["be","for", "a"]))
    print(model3.estimate_probability(each[2], ["be","for", "a"]))
    print(model3.estimate_probability(each[3], ["be","for", "a"]))
    print(model3.estimate_probability(each[4], ["be","for", "a"]))

(['calculus', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

([],) 

(['number', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

(['mac', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016


In [15]:
words_sent1_ngram = model4.suggestions(sentence1)
print(words_sent1_ngram)
for each in words_sent1_ngram:
    print(model4.estimate_probability(each[0], ["what", "is", "a", "period"]))
    print(model4.estimate_probability(each[1], ["what", "is", "a", "period"]))
    print(model4.estimate_probability(each[2], ["what", "is", "a", "period"]))
    print(model4.estimate_probability(each[3], ["what", "is", "a", "period"]))
    print(model4.estimate_probability(each[4], ["what", "is", "a", "period"]), "\n")

    
words_sent2_ngram = model4.suggestions(sentence2)
print(words_sent2_ngram, "\n")
    
words_sent3_ngram = model4.suggestions(sentence3)
print(words_sent3_ngram)
for each in words_sent3_ngram:
    print(model4.estimate_probability(each[0], ["federal", "credit", "union", "routing"]))
    print(model4.estimate_probability(each[1], ["federal", "credit", "union", "routing"]))
    print(model4.estimate_probability(each[2], ["federal", "credit", "union", "routing"]))
    print(model4.estimate_probability(each[3], ["federal", "credit", "union", "routing"]))
    print(model4.estimate_probability(each[4], ["federal", "credit", "union", "routing"]), "\n")
    
words_sent4_ngram = model4.suggestions(sentence4)
print(words_sent4_ngram)
for each in words_sent4_ngram:
    print(model4.estimate_probability(each[0], ["to","be","for", "a"]))
    print(model4.estimate_probability(each[1], ["to","be","for", "a"]))
    print(model4.estimate_probability(each[2], ["to","be","for", "a"]))
    print(model4.estimate_probability(each[3], ["to","be","for", "a"]))
    print(model4.estimate_probability(each[4], ["to","be","for", "a"]))

(['calculus', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

([],) 

(['number', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016 

(['mac', 'what', 'slows', 'down', 'the'],)
0.0032
0.0016
0.0016
0.0016
0.0016
