In [54]:
from lxml import etree as ET #lxml is faster
import glob #to use multiple files
from collections import Counter #to count unique words
import json #to save model
import time
import random
import pprint
import re

### Saving/Loading functions for JSON files

Used since JSON files do not accept tuples as keys

Source: https://stackoverflow.com/questions/7001606/json-serialize-a-dictionary-with-tuples-as-key?answertab=scoredesc#tab-top

- saveJSON<br>
Converts the tuples in the keys to strings and saves to JSON

- loadJSON<br>
Converts strings back into tuples and returns dictionary

In [55]:
def saveJSON(dictionary, filename):

    with open(filename, 'w') as f:
        
        # changes keys to string
        k = dictionary.keys()
        v = dictionary.values()
        kstrings = [str(i) for i in k]

        json.dump(json.dumps(dict(zip(*[kstrings,v]))),f)
        print("File was saved")

def loadJSON(filename):
    
    with open(filename, 'r') as f:
    
        data = json.load(f)
        dictionary = json.loads(data)

        # converts back to tuples
        k = dictionary.keys() 
        v = dictionary.values() 
        k1 = [eval(i) for i in k] 
        
        return dict(zip(*[k1,v])) 

### Dataset

Dataset is opened. Everything found within <s\> tags is extracted. <s\> tags are also added around each sentence.

In [56]:
#one file

tree = ET.parse('xmlfiles/A1E.xml')
root = tree.getroot()

textPartial=[]

for s in root.iter('s'): # every sentence is found with <s> tags
    # textPartial.append('<s>') # adding <s> to dataset
    textPartial += '<s>'
    for elem in s.findall('*'):
        # textPartial.append(elem.text) # adding each element found within <s> 
        textPartial += elem.text
    textPartial += '</s>'
    # textPartial.append('</s>')

print("Word count: ",len(textPartial))


Word count:  11616


In [57]:
#all files

files = glob.glob('xmlfiles/*.xml') #to open all xml files

textFull = []

for fileName in files: # iterating through all files
    
    tree = ET.parse(fileName)
    root = tree.getroot()

    for s in root.iter('s'): # every sentence is found with <s> tags
        textFull.append('<s>') # adding <s> to dataset
        for elem in s.findall('*'):
            textFull.append(elem.text) # adding each element found within <s>
        textFull.append('</s>')
    
print("Word count: ",len(textFull))

Word count:  1180895


### Training and testing split

In [58]:
# splitting the data into sentences within a list
sentences = []

for i, word in enumerate(textPartial):
    if word == "<s>":
        idx = i
    elif word == "</s>":
        sentences.append(textPartial[idx:i+1])

# shuffle to randomise the order of the sentences. using seed to make sure it's always the same
random.seed(42)
random.shuffle(sentences)

# split the list of sentences into 80-20
split = int(len(sentences) * 0.8)

trainDataSentences = sentences[:split]
testDataWithDupesSentences = sentences[split:]

#to make sure there are no duplicates
uniqueSentences = set() 
for sentence in trainDataSentences: #getting all unique sentences in the training data
    uniqueSentences.add(tuple(sentence))

testDataSentences = []
duplicateCount = 0
for sentence in testDataWithDupesSentences: #checking if sentences in test data is found in training data
    if tuple(sentence) not in uniqueSentences:
        testDataSentences.append(sentence)
    else:
        duplicateCount += 1

print("Removed", duplicateCount, "duplicate sentences from test data\n")

#flattening lists
trainData = [word for sentence in trainDataSentences for word in sentence]
testData = [word for sentence in testDataSentences for word in sentence]

print("Training data\t Word count:", len(trainData), "\tUnique words count:",len(set(trainData)))
print("Testing data\t Word count:", len(testData), "\tUnique words count:",len(set(testData)))

# writing the training and testing sets to files for easy access
with open("dataset/Training_Set.json", "w") as f:
    json.dump(trainData, f)

with open("dataset/Testing_Set.json", "w") as f:
    json.dump(testData, f)


Removed 1 duplicate sentences from test data

Training data	 Word count: 9286 	Unique words count: 2667
Testing data	 Word count: 2325 	Unique words count: 1010


In [59]:
# used in testing, to check that there are no duplicates

# for sentence1 in trainDataSentences:
#     for sentence2 in testDataSentences:
#         if sentence1 == sentence2:
#             print("Duplicate")
#             print(sentence1)
#             print(sentence2)

### UNK Tokens

In [60]:
wordFreqs = {}

for word in trainData: #taking count of each word
    if word not in wordFreqs:
        wordFreqs[word] = 1
    else:
        wordFreqs[word] += 1

trainDataUNK = []

for word in trainData: #if word appears 2 or less times it is appended as a <UNK> token
    if wordFreqs[word] <= 2:
       trainDataUNK.append("<UNK>")
    else:
        trainDataUNK.append(word)


### N-Gram function

Used to split up dataset into N-grams.The function ngramCounter uses the Counter library. This was done to check if there is any significant different in the execution time.

In [61]:
def ngram(text, n): #n is size of ngrams

    ngramCounts = {} #dict to store ngrams

    for i in range(len(text)-n+1): #len(text)-n+1 is the number of all possible ngrams

        ngram = tuple(text[i:i+n]) #creates ngram

        if ngram in ngramCounts: #check to see if it already exists
            ngramCounts[ngram] += 1
        else:
            ngramCounts[ngram] = 1

    return ngramCounts

In [62]:
def ngramCounter(text, n): #ngram function using counter
    
    ngramCounts = Counter()

    for i in range(len(text)-n+1):
        
        ngram = tuple(text[i:i+n])
        ngramCounts[ngram] += 1
    
    return ngramCounts

Testing to see which function generates trigrams faster using the full dataset.

In [63]:
startTime = time.time()
ngram(textFull,3)
endTime = time.time()

print("Time taken for normal function to execute: ", endTime - startTime, "seconds")

startTime = time.time()
ngramCounter(textFull,3)
endTime = time.time()

print("Time taken for Counter function to execute: ", endTime - startTime, "seconds")

Time taken for normal function to execute:  0.6562445163726807 seconds
Time taken for Counter function to execute:  0.7948729991912842 seconds


Testing N-Gram function with a test string

In [64]:
teststring = '<s> I am Sam </s> <s> Sam I am </s> <s> I do not like green eggs and ham </s>'
testlist = list(teststring.split(" "))

print("Unigram:\n",ngram(testlist,1), "\n")
print("Bigram:\n",ngram(testlist,2), "\n")
print("Trigram:\n",ngram(testlist,3))

Unigram:
 {('<s>',): 3, ('I',): 3, ('am',): 2, ('Sam',): 2, ('</s>',): 3, ('do',): 1, ('not',): 1, ('like',): 1, ('green',): 1, ('eggs',): 1, ('and',): 1, ('ham',): 1} 

Bigram:
 {('<s>', 'I'): 2, ('I', 'am'): 2, ('am', 'Sam'): 1, ('Sam', '</s>'): 1, ('</s>', '<s>'): 2, ('<s>', 'Sam'): 1, ('Sam', 'I'): 1, ('am', '</s>'): 1, ('I', 'do'): 1, ('do', 'not'): 1, ('not', 'like'): 1, ('like', 'green'): 1, ('green', 'eggs'): 1, ('eggs', 'and'): 1, ('and', 'ham'): 1, ('ham', '</s>'): 1} 

Trigram:
 {('<s>', 'I', 'am'): 1, ('I', 'am', 'Sam'): 1, ('am', 'Sam', '</s>'): 1, ('Sam', '</s>', '<s>'): 1, ('</s>', '<s>', 'Sam'): 1, ('<s>', 'Sam', 'I'): 1, ('Sam', 'I', 'am'): 1, ('I', 'am', '</s>'): 1, ('am', '</s>', '<s>'): 1, ('</s>', '<s>', 'I'): 1, ('<s>', 'I', 'do'): 1, ('I', 'do', 'not'): 1, ('do', 'not', 'like'): 1, ('not', 'like', 'green'): 1, ('like', 'green', 'eggs'): 1, ('green', 'eggs', 'and'): 1, ('eggs', 'and', 'ham'): 1, ('and', 'ham', '</s>'): 1}


### Vanilla Probabilties

In [65]:
def UnigramVanilla(dataset):

    iter = 0 # to keep track of number of iterations
    
    ngrams = ngram(dataset, 1) # generating unigrams

    ngramProbs = {} # where probabilties will be stored

    totalWords = sum(ngrams.values()) # getting total amount of words
    
    for ngram1 in ngrams:

        iter+=1
        print("Iteration", iter, 'of', len(ngrams))
        
        ngramProbs[ngram1] = ngrams[ngram1]/totalWords # calculating probability and storing it

    return ngramProbs

In [66]:
def BigramVanilla(dataset):

    iter = 0 # to keep track of number of iterations

    ngrams = ngram(dataset, 2) # generating bigrams

    ngramProbs = {} # where probabilities will be stored
    
    for ngram1 in ngrams:
        
        iter+=1
        print("Iteration", iter, 'of', len(ngrams))
        
        count = 0
        prefix = ngram1[0] # getting each element from first to second to last

        for ngram2 in ngrams:
            
            if prefix in ngram2:
                count += 1
                
        ngramProbs[ngram1] = ngrams[ngram1]/count # calculating probability and storing it

    return ngramProbs

In [67]:
def TrigramVanilla(dataset):

    iter = 0 # to keep track of number of iterations

    ngrams = ngram(dataset, 3) # generating trigrams

    ngramProbs = {} # where probabilities will be stored
    
    for ngram1 in ngrams:

        iter+=1
        print("Iteration", iter, 'of', len(ngrams))

        count = 0
        prefix = ngram1[0:2]

        for ngram2 in ngrams:
            
            if prefix in zip(ngram2, ngram2[1:]):
                count += 1
            
        ngramProbs[ngram1] = ngrams[ngram1]/count # calculating probability and storing it
        

    return ngramProbs

### Laplace Smoothing

In [68]:
def UnigramLaplaceSmoothing(dataset):

    iter = 0
    
    ngrams = ngram(dataset, 1)

    ngramProbs = {}

    values = ngrams.values()
    totalWords = sum(values)
    vocabulary = len(set(dataset)) # number of unique words in dataset
    
    for ngram1 in ngrams:

        iter+=1
        print("Iteration", iter, 'of', len(ngrams))

        ngramProbs[ngram1] = (ngrams[ngram1]+1)/(totalWords+vocabulary) # calculating probability with laplace smoothing
        

    return ngramProbs


In [69]:
def BigramLaplaceSmoothing(dataset):
    iter = 0

    ngrams = ngram(dataset, 2)

    ngramProbs = {}

    vocabulary = len(set(dataset))
    
    for ngram1 in ngrams:
        
        iter+=1
        print("Iteration", iter, 'of', len(ngrams))
        
        count = 0
        prefix = ngram1[0] # getting each element from first to second to last

        for ngram2 in ngrams:
            
            if prefix in ngram2:
                count += 1
                
        ngramProbs[ngram1] = (ngrams[ngram1]+1)/(count+vocabulary)

    return ngramProbs

In [70]:
def TrigramLaplaceSmoothing(dataset):
    iter = 0

    ngrams = ngram(dataset, 3)

    ngramProbs = {}

    vocabulary = len(set(dataset))
    
    for ngram1 in ngrams:

        iter+=1
        print("Iteration", iter, 'of', len(ngrams))

        count = 0
        prefix = ngram1[0:2]

        for ngram2 in ngrams:
            
            if prefix in zip(ngram2, ngram2[1:]):
                count += 1
            
        ngramProbs[ngram1] = (ngrams[ngram1]+1)/(count+vocabulary) # calculating probability with laplace smoothing
        

    return ngramProbs

### Combined functions

In [71]:
def UnigramModel(dataset):
    iter = 0
    
    ngrams = ngram(dataset, 1)

    vanillaModel = {} # where vanilla probabilities will be stored
    smoothModel = {} # where laplace smoothing probabilities will be stored

    totalWords = sum(ngrams.values()) # number of words in dataset
    vocabulary = len(set(dataset)) # number of unique words in dataset
    
    for ngram1 in ngrams:

        iter+=1
        print("Iteration", iter, 'of', len(ngrams))
        
        vanillaModel[ngram1] = ngrams[ngram1]/totalWords # calculating probability
        smoothModel[ngram1] = (ngrams[ngram1]+1)/(totalWords+vocabulary) # calculating probability with laplace smoothing

    return vanillaModel, smoothModel

In [72]:
def BigramModel(dataset):
    
    iter = 0

    ngrams = ngram(dataset, 2)

    vanillaModel = {}
    smoothModel = {}

    vocabulary = len(set(dataset))
    
    for ngram1 in ngrams:
        
        iter+=1
        print("Iteration", iter, 'of', len(ngrams))
        
        count = 0
        prefix = ngram1[0] 

        for ngram2 in ngrams:
            
            if prefix in ngram2:
                count += 1
                
        vanillaModel[ngram1] = ngrams[ngram1]/count # calculating probability
        smoothModel[ngram1] = (ngrams[ngram1]+1)/(count+vocabulary) # calculating probability with laplace smoothing

    return vanillaModel, smoothModel

In [73]:
def TrigramModel(dataset):
    
    iter = 0

    ngrams = ngram(dataset, 3)

    vanillaModel = {}
    smoothModel = {}

    vocabulary = len(set(dataset))
    
    for ngram1 in ngrams:

        iter+=1
        print("Iteration", iter, 'of', len(ngrams))

        count = 0
        prefix = ngram1[0:2]

        for ngram2 in ngrams:
            
            if prefix in zip(ngram2, ngram2[1:]):
                count += 1
            
        vanillaModel[ngram1] = ngrams[ngram1]/count # calculating probability
        smoothModel[ngram1] = (ngrams[ngram1]+1)/(count+vocabulary) # calculating probability with laplace smoothing
        
    return vanillaModel, smoothModel

### Training models

In [74]:
vanillaUnigramModel, laplaceUnigramModel = UnigramModel(trainData) 
saveJSON(vanillaUnigramModel, "models/Vanilla_Unigram.json")
saveJSON(laplaceUnigramModel, "models/Laplace_Unigram.json")



Iteration 1 of 2667
Iteration 2 of 2667
Iteration 3 of 2667
Iteration 4 of 2667
Iteration 5 of 2667
Iteration 6 of 2667
Iteration 7 of 2667
Iteration 8 of 2667
Iteration 9 of 2667
Iteration 10 of 2667
Iteration 11 of 2667
Iteration 12 of 2667
Iteration 13 of 2667
Iteration 14 of 2667
Iteration 15 of 2667
Iteration 16 of 2667
Iteration 17 of 2667
Iteration 18 of 2667
Iteration 19 of 2667
Iteration 20 of 2667
Iteration 21 of 2667
Iteration 22 of 2667
Iteration 23 of 2667
Iteration 24 of 2667
Iteration 25 of 2667
Iteration 26 of 2667
Iteration 27 of 2667
Iteration 28 of 2667
Iteration 29 of 2667
Iteration 30 of 2667
Iteration 31 of 2667
Iteration 32 of 2667
Iteration 33 of 2667
Iteration 34 of 2667
Iteration 35 of 2667
Iteration 36 of 2667
Iteration 37 of 2667
Iteration 38 of 2667
Iteration 39 of 2667
Iteration 40 of 2667
Iteration 41 of 2667
Iteration 42 of 2667
Iteration 43 of 2667
Iteration 44 of 2667
Iteration 45 of 2667
Iteration 46 of 2667
Iteration 47 of 2667
Iteration 48 of 2667
I

In [75]:
vanillaUNKUnigramModel, laplaceUNKUnigramModel = UnigramModel(trainDataUNK)
saveJSON(vanillaUNKUnigramModel, "models/Vanilla_UNK_Unigram.json")
saveJSON(laplaceUNKUnigramModel, "models/Laplace_UNK_Unigram.json")

Iteration 1 of 494
Iteration 2 of 494
Iteration 3 of 494
Iteration 4 of 494
Iteration 5 of 494
Iteration 6 of 494
Iteration 7 of 494
Iteration 8 of 494
Iteration 9 of 494
Iteration 10 of 494
Iteration 11 of 494
Iteration 12 of 494
Iteration 13 of 494
Iteration 14 of 494
Iteration 15 of 494
Iteration 16 of 494
Iteration 17 of 494
Iteration 18 of 494
Iteration 19 of 494
Iteration 20 of 494
Iteration 21 of 494
Iteration 22 of 494
Iteration 23 of 494
Iteration 24 of 494
Iteration 25 of 494
Iteration 26 of 494
Iteration 27 of 494
Iteration 28 of 494
Iteration 29 of 494
Iteration 30 of 494
Iteration 31 of 494
Iteration 32 of 494
Iteration 33 of 494
Iteration 34 of 494
Iteration 35 of 494
Iteration 36 of 494
Iteration 37 of 494
Iteration 38 of 494
Iteration 39 of 494
Iteration 40 of 494
Iteration 41 of 494
Iteration 42 of 494
Iteration 43 of 494
Iteration 44 of 494
Iteration 45 of 494
Iteration 46 of 494
Iteration 47 of 494
Iteration 48 of 494
Iteration 49 of 494
Iteration 50 of 494
Iteration

In [76]:
vanillaBigramModel, laplaceBigramModel = BigramModel(trainData)
saveJSON(vanillaBigramModel, "models/Vanilla_Bigram.json")
saveJSON(laplaceBigramModel, "models/Laplace_Bigram.json")



Iteration 1 of 6772
Iteration 2 of 6772
Iteration 3 of 6772
Iteration 4 of 6772
Iteration 5 of 6772
Iteration 6 of 6772
Iteration 7 of 6772
Iteration 8 of 6772
Iteration 9 of 6772
Iteration 10 of 6772
Iteration 11 of 6772
Iteration 12 of 6772
Iteration 13 of 6772
Iteration 14 of 6772
Iteration 15 of 6772
Iteration 16 of 6772
Iteration 17 of 6772
Iteration 18 of 6772
Iteration 19 of 6772
Iteration 20 of 6772
Iteration 21 of 6772
Iteration 22 of 6772
Iteration 23 of 6772
Iteration 24 of 6772
Iteration 25 of 6772
Iteration 26 of 6772
Iteration 27 of 6772
Iteration 28 of 6772
Iteration 29 of 6772
Iteration 30 of 6772
Iteration 31 of 6772
Iteration 32 of 6772
Iteration 33 of 6772
Iteration 34 of 6772
Iteration 35 of 6772
Iteration 36 of 6772
Iteration 37 of 6772
Iteration 38 of 6772
Iteration 39 of 6772
Iteration 40 of 6772
Iteration 41 of 6772
Iteration 42 of 6772
Iteration 43 of 6772
Iteration 44 of 6772
Iteration 45 of 6772
Iteration 46 of 6772
Iteration 47 of 6772
Iteration 48 of 6772
I

In [77]:
vanillaUNKBigramModel, laplaceUNKBigramModel = BigramModel(trainDataUNK)
saveJSON(vanillaUNKBigramModel, "models/Vanilla_UNK_Bigram.json")
saveJSON(laplaceUNKBigramModel, "models/Laplace_UNK_Bigram.json")

Iteration 1 of 2993
Iteration 2 of 2993
Iteration 3 of 2993
Iteration 4 of 2993
Iteration 5 of 2993
Iteration 6 of 2993
Iteration 7 of 2993
Iteration 8 of 2993
Iteration 9 of 2993
Iteration 10 of 2993
Iteration 11 of 2993
Iteration 12 of 2993
Iteration 13 of 2993
Iteration 14 of 2993
Iteration 15 of 2993
Iteration 16 of 2993
Iteration 17 of 2993
Iteration 18 of 2993
Iteration 19 of 2993
Iteration 20 of 2993
Iteration 21 of 2993
Iteration 22 of 2993
Iteration 23 of 2993
Iteration 24 of 2993
Iteration 25 of 2993
Iteration 26 of 2993
Iteration 27 of 2993
Iteration 28 of 2993
Iteration 29 of 2993
Iteration 30 of 2993
Iteration 31 of 2993
Iteration 32 of 2993
Iteration 33 of 2993
Iteration 34 of 2993
Iteration 35 of 2993
Iteration 36 of 2993
Iteration 37 of 2993
Iteration 38 of 2993
Iteration 39 of 2993
Iteration 40 of 2993
Iteration 41 of 2993
Iteration 42 of 2993
Iteration 43 of 2993
Iteration 44 of 2993
Iteration 45 of 2993
Iteration 46 of 2993
Iteration 47 of 2993
Iteration 48 of 2993
I

In [78]:
vanillaTrigramModel, laplaceTrigramModel = TrigramModel(trainData) 
saveJSON(vanillaTrigramModel, "models/Vanilla_Trigram.json")
saveJSON(laplaceTrigramModel, "models/Laplace_Trigram.json")



Iteration 1 of 8319
Iteration 2 of 8319
Iteration 3 of 8319
Iteration 4 of 8319
Iteration 5 of 8319
Iteration 6 of 8319
Iteration 7 of 8319
Iteration 8 of 8319
Iteration 9 of 8319
Iteration 10 of 8319
Iteration 11 of 8319
Iteration 12 of 8319
Iteration 13 of 8319
Iteration 14 of 8319
Iteration 15 of 8319
Iteration 16 of 8319
Iteration 17 of 8319
Iteration 18 of 8319
Iteration 19 of 8319
Iteration 20 of 8319
Iteration 21 of 8319
Iteration 22 of 8319
Iteration 23 of 8319
Iteration 24 of 8319
Iteration 25 of 8319
Iteration 26 of 8319
Iteration 27 of 8319
Iteration 28 of 8319
Iteration 29 of 8319
Iteration 30 of 8319
Iteration 31 of 8319
Iteration 32 of 8319
Iteration 33 of 8319
Iteration 34 of 8319
Iteration 35 of 8319
Iteration 36 of 8319
Iteration 37 of 8319
Iteration 38 of 8319
Iteration 39 of 8319
Iteration 40 of 8319
Iteration 41 of 8319
Iteration 42 of 8319
Iteration 43 of 8319
Iteration 44 of 8319
Iteration 45 of 8319
Iteration 46 of 8319
Iteration 47 of 8319
Iteration 48 of 8319
I

In [79]:
vanillaUNKTrigramModel, laplaceUNKTrigramModel = TrigramModel(trainDataUNK)
saveJSON(vanillaUNKTrigramModel, "models/Vanilla_UNK_Trigram.json")
saveJSON(laplaceUNKTrigramModel, "models/Laplace_UNK_Trigram.json")

Iteration 1 of 5577
Iteration 2 of 5577
Iteration 3 of 5577
Iteration 4 of 5577
Iteration 5 of 5577
Iteration 6 of 5577
Iteration 7 of 5577
Iteration 8 of 5577
Iteration 9 of 5577
Iteration 10 of 5577
Iteration 11 of 5577
Iteration 12 of 5577
Iteration 13 of 5577
Iteration 14 of 5577
Iteration 15 of 5577
Iteration 16 of 5577
Iteration 17 of 5577
Iteration 18 of 5577
Iteration 19 of 5577
Iteration 20 of 5577
Iteration 21 of 5577
Iteration 22 of 5577
Iteration 23 of 5577
Iteration 24 of 5577
Iteration 25 of 5577
Iteration 26 of 5577
Iteration 27 of 5577
Iteration 28 of 5577
Iteration 29 of 5577
Iteration 30 of 5577
Iteration 31 of 5577
Iteration 32 of 5577
Iteration 33 of 5577
Iteration 34 of 5577
Iteration 35 of 5577
Iteration 36 of 5577
Iteration 37 of 5577
Iteration 38 of 5577
Iteration 39 of 5577
Iteration 40 of 5577
Iteration 41 of 5577
Iteration 42 of 5577
Iteration 43 of 5577
Iteration 44 of 5577
Iteration 45 of 5577
Iteration 46 of 5577
Iteration 47 of 5577
Iteration 48 of 5577
I

### Linear Interpolation

In [80]:
def TokeniseSentence(sentenceString):

    sentence = re.findall(r'\b\w+\b|[^\w\s]', sentenceString)
    
    sentence.insert(0,'<s>')
    sentence.append('</s>')

    return sentence
    
TokeniseSentence("Only a good crop in Kenya has saved Africa from another disaster.")

['<s>',
 'Only',
 'a',
 'good',
 'crop',
 'in',
 'Kenya',
 'has',
 'saved',
 'Africa',
 'from',
 'another',
 'disaster',
 '.',
 '</s>']

### Vanilla Probability

In [89]:
testsentence = "These were overtaken by the Government's plans to deregulate the British securities market, announced in 1983, which were to lead to Big Bang."
sentence = TokeniseSentence(testsentence)
print(sentence)
                             
weight_unigram = 0.1
weight_bigram = 0.3
weight_trigram = 0.6

# probs_unigram = [vanillaUnigramModel.get((token,), 0) for token in sentences]

for word in sentence:
    # vanillaUnigramModel.get((word))
    print(vanillaUnigramModel[word])






# probs_bigram = [vanillaBigramModel.get(tuple(sentences[i:i+2]), 0) for i in range(len(sentences)-1)]
# probs_trigram = [vanillaTrigramModel.get(tuple(sentences[i:i+3]), 0) for i in range(len(sentences)-2)]

# total_probs = [weight_unigram*p1 + weight_bigram*p2 + weight_trigram*p3 for p1, p2, p3 in zip(probs_unigram, probs_bigram, probs_trigram)]

# sentence_prob = 1
# for prob in total_probs:
    # sentence_prob *= prob

# print(sentence_prob)

['<s>', 'These', 'were', 'overtaken', 'by', 'the', 'Government', "'", 's', 'plans', 'to', 'deregulate', 'the', 'British', 'securities', 'market', ',', 'announced', 'in', '1983', ',', 'which', 'were', 'to', 'lead', 'to', 'Big', 'Bang', '.', '</s>']


KeyError: '<s>'