In [2]:
from lxml import etree as ET #lxml is faster
import glob #to use multiple files
import json #to save model
import random
import re #regular expressions
import math

### Data Pre-Processing

In [21]:
#one file

tree = ET.parse('xmlfiles/A1E.xml')
root = tree.getroot()
# print(root)

textPartial=[]

for s in root.iter('s'):
    textPartial.append('<s>')
    for elem in s.findall('*'):

        if elem.text != None:    
            textPartial.append(elem.text)
    
    textPartial.append('</s>')

print("Word count: ",len(textPartial))

Word count:  11503


In [28]:
#all files

files = glob.glob('xmlfiles/*.xml') #to open all xml files

textFull = []
iter = 0

for fileName in files: # iterating through all files

    tree = ET.parse(fileName)
    root = tree.getroot()

    for s in root.iter('s'): # every sentence is found with <s> tags
    
        textFull.append('<s>')
    
        for elem in s.findall('*'):

            if elem.text != None:    
                textFull.append(elem.text)
                
        textFull.append('</s>')
        
    
print("Word count: ",len(textFull))

Word count:  1174283


#### Training and testing split

Splitting data using an 80/20 ratio and check for duplicates.

In [30]:
# splitting the data into sentences within a list
sentences = []

for i, word in enumerate(textFull):
    if word == "<s>":
        idx = i
    elif word == "</s>":
        sentences.append(textFull[idx:i+1])

# shuffle to randomise the order of the sentences. using seed to make sure it's always the same
random.seed(42)
random.shuffle(sentences)

# split the list of sentences into 80-20
split = int(len(sentences) * 0.8)

trainDataSentences = sentences[:split]
testDataWithDupesSentences = sentences[split:]

#to make sure there are no duplicates
uniqueSentences = set() 
for sentence in trainDataSentences: #getting all unique sentences in the training data
    uniqueSentences.add(tuple(sentence))

testDataSentences = []
duplicateCount = 0
for sentence in testDataWithDupesSentences: #checking if sentences in test data is found in training data
    if tuple(sentence) not in uniqueSentences:
        testDataSentences.append(sentence)
    else:
        duplicateCount += 1

print("Removed", duplicateCount, "duplicate sentences from test data\n")

#flattening lists
trainData = [word for sentence in trainDataSentences for word in sentence]
testData = [word for sentence in testDataSentences for word in sentence]

print("Training data\t Word count:", len(trainData), "\tUnique words count:",len(set(trainData)))
print("Testing data\t Word count:", len(testData), "\tUnique words count:",len(set(testData)))

# writing the training and testing sets to files for easy access
# with open("dataset/Training_Set.json", "w") as f:
#     json.dump(trainData, f)

# with open("dataset/Testing_Set.json", "w") as f:
#     json.dump(testDataSentences, f)


Removed 422 duplicate sentences from test data

Training data	 Word count: 939306 	Unique words count: 68898
Testing data	 Word count: 230057 	Unique words count: 31352


In [3]:
#loading dataset

with open("dataset/Training_Set.json", "r") as f:
    trainData = json.load(f)

with open("dataset/Testing_Set.json", "r") as f:
    testData = json.load(f)

#### UNK Tokens

In [25]:
wordFreqs = {}

for word in trainData: #taking count of each word
    if word not in wordFreqs:
        wordFreqs[word] = 1
    else:
        wordFreqs[word] += 1

trainDataUNK = []

for word in trainData: #if word appears 2 or less times it is appended as a <UNK> token
    if wordFreqs[word] <= 2:
       trainDataUNK.append("<UNK>")
    else:
        trainDataUNK.append(word)


### N-Grams

In [26]:
def ngram(text, n): #n is size of ngrams

    ngramCounts = {} #dict to store ngrams

    for i in range(len(text)-n+1): #len(text)-n+1 is the number of all possible ngrams

        ngram = tuple(text[i:i+n]) #creates ngram

        if ngram in ngramCounts: #check to see if it already exists
            ngramCounts[ngram] += 1
        else:
            ngramCounts[ngram] = 1

    return ngramCounts

Testing N-Gram function with a test string

In [27]:
testString = '<s> I am Sam </s> <s> Sam I am </s> <s> I do not like green eggs and ham </s>'
testList = list(testString.split(" "))

print("Unigram:\n",ngram(testList,1), "\n")
print("Bigram:\n",ngram(testList,2), "\n")
print("Trigram:\n",ngram(testList,3))

Unigram:
 {('<s>',): 3, ('I',): 3, ('am',): 2, ('Sam',): 2, ('</s>',): 3, ('do',): 1, ('not',): 1, ('like',): 1, ('green',): 1, ('eggs',): 1, ('and',): 1, ('ham',): 1} 

Bigram:
 {('<s>', 'I'): 2, ('I', 'am'): 2, ('am', 'Sam'): 1, ('Sam', '</s>'): 1, ('</s>', '<s>'): 2, ('<s>', 'Sam'): 1, ('Sam', 'I'): 1, ('am', '</s>'): 1, ('I', 'do'): 1, ('do', 'not'): 1, ('not', 'like'): 1, ('like', 'green'): 1, ('green', 'eggs'): 1, ('eggs', 'and'): 1, ('and', 'ham'): 1, ('ham', '</s>'): 1} 

Trigram:
 {('<s>', 'I', 'am'): 1, ('I', 'am', 'Sam'): 1, ('am', 'Sam', '</s>'): 1, ('Sam', '</s>', '<s>'): 1, ('</s>', '<s>', 'Sam'): 1, ('<s>', 'Sam', 'I'): 1, ('Sam', 'I', 'am'): 1, ('I', 'am', '</s>'): 1, ('am', '</s>', '<s>'): 1, ('</s>', '<s>', 'I'): 1, ('<s>', 'I', 'do'): 1, ('I', 'do', 'not'): 1, ('do', 'not', 'like'): 1, ('not', 'like', 'green'): 1, ('like', 'green', 'eggs'): 1, ('green', 'eggs', 'and'): 1, ('eggs', 'and', 'ham'): 1, ('and', 'ham', '</s>'): 1}


### Model Training

#### Probability Calculations

In [6]:
def UnigramModel(dataset):

    ngrams = ngram(dataset, 1)

    vanillaModel = {} # where vanilla probabilities will be stored
    smoothModel = {} # where laplace smoothing probabilities will be stored

    totalWords = len(dataset) # number of words in dataset
    vocabulary = len(set(dataset)) # number of unique words in dataset
    
    for ngram1 in ngrams:
        
        vanillaModel[ngram1] = ngrams[ngram1]/totalWords # calculating probability
        smoothModel[ngram1] = (ngrams[ngram1]+1)/(totalWords+vocabulary) # calculating probability with laplace smoothing

    return vanillaModel, smoothModel

In [7]:
def BigramModel(dataset):

    ngrams = ngram(dataset, 2)

    vanillaModel = {}
    smoothModel = {}

    vocabulary = len(set(dataset))
    
    for ngram1 in ngrams:
        
        count = 0
        prefix = ngram1[0] 

        for ngram2 in ngrams:
            
            if prefix in ngram2:
                count += 1
                
        vanillaModel[ngram1] = ngrams[ngram1]/count # calculating probability
        smoothModel[ngram1] = (ngrams[ngram1]+1)/(count+vocabulary) # calculating probability with laplace smoothing

    return vanillaModel, smoothModel

In [8]:
def TrigramModel(dataset):

    ngrams = ngram(dataset, 3)

    vanillaModel = {}
    smoothModel = {}

    vocabulary = len(set(dataset))
    
    for ngram1 in ngrams:

        count = 0
        prefix = ngram1[0:2]

        for ngram2 in ngrams:
            
            if prefix in zip(ngram2, ngram2[1:]):
                count += 1
            
        vanillaModel[ngram1] = ngrams[ngram1]/count # calculating probability
        smoothModel[ngram1] = (ngrams[ngram1]+1)/(count+vocabulary) # calculating probability with laplace smoothing
        
    return vanillaModel, smoothModel

#### Training models

#### Saving/Loading functions for JSON files

Used since JSON files do not accept tuples as keys

Source: https://stackoverflow.com/questions/7001606/json-serialize-a-dictionary-with-tuples-as-key?answertab=scoredesc#tab-top

- saveJSON<br>
Converts the tuples in the keys to strings and saves to JSON

- loadJSON<br>
Converts strings back into tuples and returns dictionary

In [4]:
def saveJSON(dictionary, filename):

    with open(filename, 'w') as f:
        
        # changes keys to string
        k = dictionary.keys()
        v = dictionary.values()
        kstrings = [str(i) for i in k]

        json.dump(json.dumps(dict(zip(*[kstrings,v]))),f)
        print("File was saved")

def loadJSON(filename):
    
    with open(filename, 'r') as f:
    
        data = json.load(f)
        dictionary = json.loads(data)

        # converts back to tuples
        k = dictionary.keys() 
        v = dictionary.values() 
        k1 = [eval(i) for i in k] 
        
        return dict(zip(*[k1,v])) 
    

In [32]:
# vanillaUnigramModel, laplaceUnigramModel = UnigramModel(trainData) 
# saveJSON(vanillaUnigramModel, "models/Vanilla_Unigram.json")
# saveJSON(laplaceUnigramModel, "models/Laplace_Unigram.json")

In [33]:
# UNKUnigramModel, _ = UnigramModel(trainDataUNK)
# saveJSON(UNKUnigramModel, "models/UNK_Unigram.json")

In [34]:
# vanillaBigramModel, laplaceBigramModel = BigramModel(trainData)
# saveJSON(vanillaBigramModel, "models/Vanilla_Bigram.json")
# saveJSON(laplaceBigramModel, "models/Laplace_Bigram.json")

In [35]:
# UNKBigramModel, _ = BigramModel(trainDataUNK)
# saveJSON(UNKBigramModel, "models/UNK_Bigram.json")

In [36]:
# vanillaTrigramModel, laplaceTrigramModel = TrigramModel(trainData) 
# saveJSON(vanillaTrigramModel, "models/Vanilla_Trigram.json")
# saveJSON(laplaceTrigramModel, "models/Laplace_Trigram.json")

In [37]:
# UNKTrigramModel, _ = TrigramModel(trainDataUNK)
# saveJSON(UNKTrigramModel, "models/UNK_Trigram.json")

#### Loading models

In [5]:
vanillaUnigramModel = loadJSON("full-models/Vanilla_Unigram.json")
vanillaBigramModel = loadJSON("full-models/Vanilla_Bigram.json")
vanillaTrigramModel = loadJSON("full-models/Vanilla_Trigram.json")

laplaceUnigramModel = loadJSON("full-models/Laplace_Unigram.json")
laplaceBigramModel = loadJSON("full-models/Laplace_Bigram.json")
laplaceTrigramModel = loadJSON("full-models/Laplace_Trigram.json")

UNKUnigramModel = loadJSON("full-models/UNK_Unigram.json")
UNKBigramModel = loadJSON("full-models/UNK_Bigram.json")
UNKTrigramModel = loadJSON("full-models/UNK_Trigram.json")

### Tokenisation

In [6]:
def TokeniseSentence(sentenceString):

    sentence = re.findall(r'\b\w+\b|<\/?s>|[^\w\s]', sentenceString)
    
    sentence.insert(0, '<s>')
    sentence.append('</s>')


    return sentence

### Probabilities

In [8]:
trainSentence = "Baker expressed grave concern at the Bootle killing and attacked various institutions for the fact that society has become more violent and selfish."
testSentence = "That is all Botham wants to hear."

In [7]:
def VanillaUnigramProbabilities(sentence):
    
    unigramProbabilties = []
    
    for word in sentence:
        
        probability = vanillaUnigramModel.get((word,), 0) # gets probability of word, if word not found 0 is returned
        
        unigramProbabilties.append(probability)
        
    return unigramProbabilties

def VanillaBigramProbabilties(sentence):
    
    bigramProbabilities = []
    
    for i in range(len(sentence)-1): # -1 so it doesnt go out of bounds
        
        bigram = tuple(sentence[i:i+2])
        
        probability = vanillaBigramModel.get((bigram), 0)
        bigramProbabilities.append(probability)
        
    return bigramProbabilities

def VanillaTrigramProbabilties(sentence):
    
    trigramProbabilities = []
        
    for i in range(len(sentence)-2): # -2 so it doesnt go out of bounds
        
        trigram = tuple(sentence[i:i+3])
        
        probability = vanillaTrigramModel.get((trigram), 0)
        trigramProbabilities.append(probability)
        
    return trigramProbabilities

In [9]:
vocabulary = len(set(trainData))

def LaplaceUnigramProbabilities(sentence):
    
    unigramProbabilties = []
    
    for word in sentence:
        
        probability = laplaceUnigramModel.get((word,), (1/vocabulary)) # gets probability of word, if word not found 1/vocab is returned
        
        unigramProbabilties.append(probability)
        
    return unigramProbabilties

def LaplaceBigramProbabilities(sentence):
   
    bigramProbabilities = []
    
    for i in range(len(sentence)-1): # -1 so it doesnt go out of bounds
        
        bigram = tuple(sentence[i:i+2])
        
        probability = laplaceBigramModel.get((bigram), (1/vocabulary))
        bigramProbabilities.append(probability)
        
    return bigramProbabilities

def LaplaceTrigramProbabilities(sentence):
    
    trigramProbabilities = []
        
    for i in range(len(sentence)-2): # -2 so it doesnt go out of bounds
        
        trigram = tuple(sentence[i:i+3])
        
        probability = laplaceTrigramModel.get((trigram), (1/vocabulary))
        trigramProbabilities.append(probability)
        
    return trigramProbabilities

In [10]:
def UNKUnigramProbabilities(sentence):
    
    unigramProbabilties = []
    
    for word in sentence:
        
        probability = UNKUnigramModel.get((word,), UNKUnigramModel[('<UNK>',)]) # gets probability of word, if word not found prob of <UNK> is returned
        
        unigramProbabilties.append(probability)
        
    return unigramProbabilties

def UNKBigramProbabilities(sentence):
    
    bigramProbabilities = []
    
    for i in range(len(sentence)-1): # -1 so it doesnt go out of bounds
        
        bigram = tuple(sentence[i:i+2])
        
        probability = UNKBigramModel.get((bigram), UNKBigramModel[('<UNK>', '<UNK>')])
        bigramProbabilities.append(probability)
        
    return bigramProbabilities

def UNKTrigramProbabilities(sentence):
    
    trigramProbabilities = []
        
    for i in range(len(sentence)-2): # -2 so it doesnt go out of bounds
        
        trigram = tuple(sentence[i:i+3])
        
        probability = UNKTrigramModel.get((trigram), UNKTrigramModel[('<UNK>', '<UNK>', '<UNK>')])
        trigramProbabilities.append(probability)
        
    return trigramProbabilities

In [11]:
def ProbabilityTotal(probabilities): #gets list of probabilties and multiplies them
    
    sentenceProbability = 1
    
    for probability in probabilities:
        sentenceProbability *= probability
        
    return sentenceProbability
    

### Probability of Sequence

In [12]:
def Sen_Probability(sentence):

    tokenisedSentence = TokeniseSentence(sentence)

    print("Vanilla Unigram:", ProbabilityTotal(VanillaUnigramProbabilities(tokenisedSentence)))
    print("Vanilla Bigram:", ProbabilityTotal(VanillaBigramProbabilties(tokenisedSentence)))
    print("Vanilla Trigram:", ProbabilityTotal(VanillaTrigramProbabilties(tokenisedSentence)),"\n")

    print("Laplace Unigram:", ProbabilityTotal(LaplaceUnigramProbabilities(tokenisedSentence)))
    print("Laplace Bigram:", ProbabilityTotal(LaplaceBigramProbabilities(tokenisedSentence)))
    print("Laplace Trigram:", ProbabilityTotal(LaplaceTrigramProbabilities(tokenisedSentence)), "\n")

    print("UNK Unigram:", ProbabilityTotal(UNKUnigramProbabilities(tokenisedSentence)))
    print("UNK Bigram:", ProbabilityTotal(UNKBigramProbabilities(tokenisedSentence)))
    print("UNK Trigram:", ProbabilityTotal(UNKTrigramProbabilities(tokenisedSentence)))

In [13]:
Sen_Probability("The quick brown fox jumped over the lazy dog")

Vanilla Unigram: 0.0
Vanilla Bigram: 0
Vanilla Trigram: 0 

Laplace Unigram: 7.935829365590901e-50
Laplace Bigram: 4.1489084131430815e-49
Laplace Trigram: 2.8585149184873204e-44 

UNK Unigram: 8.89511311306738e-34
UNK Bigram: 5.8992170207896055e-06
UNK Trigram: 2.6897377498946083e-07


### Linear Interpolation

In [19]:
unigramWeight = 0.1
bigramWeight = 0.3
trigramWeight = 0.6

In [20]:
def VanillaLinearInterpolation(sentence):
    
    unigramProbabilities = VanillaUnigramProbabilities(sentence)
    
    bigramProbabilities = VanillaBigramProbabilties(sentence)
    
    trigramProbabilities = VanillaTrigramProbabilties(sentence)
        
    weightedProbabilities = []
        
    for p1, p2, p3 in zip(unigramProbabilities, bigramProbabilities, trigramProbabilities):
        weightedProbability = unigramWeight*p1 + bigramWeight*p2 + trigramWeight*p3
        
        weightedProbabilities.append(weightedProbability)
        
    return weightedProbabilities

In [21]:
def LaplaceSmoothingLinearInterpolation(sentence):
    
    unigramProbabilties = LaplaceUnigramProbabilities(sentence)
    
    bigramProbabilities = LaplaceBigramProbabilities(sentence)
        
    trigramProbabilities = LaplaceTrigramProbabilities(sentence)
        
    weightedProbabilities = []
        
    for p1, p2, p3 in zip(unigramProbabilties, bigramProbabilities, trigramProbabilities):
        weightedProbability = unigramWeight*p1 + bigramWeight*p2 + trigramWeight*p3
        
        weightedProbabilities.append(weightedProbability)
        
    return weightedProbabilities

In [22]:
def UNKLinearInterpolation(sentence):
    
    unigramProbabilties = UNKUnigramProbabilities(sentence)
    
    bigramProbabilities = UNKBigramProbabilities(sentence)
        
    trigramProbabilities = UNKTrigramProbabilities(sentence)
        
    weightedProbabilities = []
        
    for p1, p2, p3 in zip(unigramProbabilties, bigramProbabilities, trigramProbabilities):
        weightedProbability = unigramWeight*p1 + bigramWeight*p2 + trigramWeight*p3
        
        weightedProbabilities.append(weightedProbability)
        
    return weightedProbabilities

#### Linear Interpolation Testing

In [23]:
print("Sentence: ", trainSentence, "\n")
tokenisedSentence = TokeniseSentence(trainSentence)

print("Vanilla Linear Interpolation:", ProbabilityTotal(VanillaLinearInterpolation(tokenisedSentence)))
print("Laplace Smoothing Linear Interpolation:", ProbabilityTotal(LaplaceSmoothingLinearInterpolation(tokenisedSentence)))
print("UNK Linear Interpolation:", ProbabilityTotal(UNKLinearInterpolation(tokenisedSentence)),"\n")

Sentence:  Baker expressed grave concern at the Bootle killing and attacked various institutions for the fact that society has become more violent and selfish. 

Vanilla Linear Interpolation: 1.9200583008021034e-133
Laplace Smoothing Linear Interpolation: 8.154036197208486e-113
UNK Linear Interpolation: 5.864328681572975e-17 



In [21]:
print("Sentence: ", testSentence, "\n")
tokenisedSentence = TokeniseSentence(testSentence)

print("Vanilla Linear Interpolation:", ProbabilityTotal(VanillaLinearInterpolation(tokenisedSentence)))
print("Laplace Smoothing Linear Interpolation:", ProbabilityTotal(LaplaceSmoothingLinearInterpolation(tokenisedSentence)))
print("UNK Linear Interpolation:", ProbabilityTotal(UNKLinearInterpolation(tokenisedSentence)),"\n")

Sentence:  That is all Botham wants to hear. 

Vanilla Linear Interpolation: 5.441195234524026e-36
Laplace Smoothing Linear Interpolation: 4.855468094351593e-36
UNK Linear Interpolation: 3.0123521199581038e-06 



### Perplexity

In [27]:
vanillaUnigram = vanillaBigram = vanillaTrigram = 0
laplaceUnigram = laplaceBigram = laplaceTrigram = 0
unkUnigram = unkBigram = unkTrigram = 0
vanillaLinearInterpolation = laplaceLinearInterpolation = unkLinearInterpolation = 0

for sentence in testData:

    vanillaUnigramNum = ProbabilityTotal(VanillaUnigramProbabilities(sentence))
    vanillaUnigram += math.log(vanillaUnigramNum) if vanillaUnigramNum != 0 else -1e9

    vanillaBigramNum = ProbabilityTotal(VanillaBigramProbabilties(sentence))
    vanillaBigram += math.log(vanillaBigramNum) if vanillaBigramNum != 0 else -1e9
    
    vanillaTrigramNum = ProbabilityTotal(VanillaTrigramProbabilties(sentence))
    vanillaTrigram += math.log(vanillaTrigramNum) if vanillaTrigramNum != 0 else -1e9

    laplaceUnigramNum = ProbabilityTotal(LaplaceUnigramProbabilities(sentence))
    laplaceUnigram += math.log(laplaceUnigramNum) if laplaceUnigramNum != 0 else -1e9
    
    laplaceBigramNum = ProbabilityTotal(LaplaceBigramProbabilities(sentence))
    laplaceBigram += math.log(laplaceBigramNum) if laplaceBigramNum != 0 else -1e9

    laplaceTrigramNum = ProbabilityTotal(LaplaceTrigramProbabilities(sentence))
    laplaceTrigram += math.log(laplaceTrigramNum) if laplaceTrigramNum != 0 else -1e9

    unkUnigramNum = ProbabilityTotal(UNKUnigramProbabilities(sentence))
    unkUnigram += math.log(unkUnigramNum) if unkUnigramNum != 0 else -1e9

    unkBigramNum = ProbabilityTotal(UNKBigramProbabilities(sentence))
    unkBigram += math.log(unkBigramNum) if unkBigramNum != 0 else -1e9

    unkTrigramNum = ProbabilityTotal(UNKTrigramProbabilities(sentence))
    unkTrigram += math.log(unkTrigramNum) if unkTrigramNum != 0 else -1e9
    
    vanillaLinearInterpolationNum = ProbabilityTotal(VanillaLinearInterpolation(sentence))
    vanillaLinearInterpolation += math.log(vanillaLinearInterpolationNum) if vanillaLinearInterpolationNum != 0 else -1e9

    laplaceLinearInterpolationNum = ProbabilityTotal(LaplaceSmoothingLinearInterpolation(sentence))
    laplaceLinearInterpolation += math.log(laplaceLinearInterpolationNum) if laplaceLinearInterpolationNum != 0 else -1e9
    
    unkLinearInterpolationNum = ProbabilityTotal(UNKLinearInterpolation(sentence))
    unkLinearInterpolation = math.log(unkLinearInterpolationNum) if unkLinearInterpolationNum != 0 else -1e9

wordCount = len([word for sentence in testData for word in sentence])

try:
    vanillaUnigram_perplexity = math.exp(-vanillaUnigram / wordCount)
except:
    vanillaUnigram_perplexity = 'Infinite'
    
try:
    vanillaBigram_perplexity = math.exp(-vanillaBigram / wordCount)
except:
    vanillaBigram_perplexity = 'Infinite'
    
try:
    vanillaTrigram_perplexity = math.exp(-vanillaTrigram / wordCount)
except:
    vanillaTrigram_perplexity = 'Infinite'
    
try:
    laplaceUnigram_perplexity = math.exp(-laplaceUnigram / wordCount)
except:
    laplaceUnigram_perplexity = 'Infinite'

try:
    laplaceBigram_perplexity = math.exp(-laplaceBigram / wordCount)
except:
    laplaceBigram_perplexity = 'Infinite'

try:
    laplaceTrigram_perplexity = math.exp(-laplaceTrigram / wordCount)
except:
    laplaceTrigram_perplexity = 'Infinite'

try:
    unkUnigram_perplexity = math.exp(-unkUnigram / wordCount)
except:
    unkUnigram_perplexity = 'Infinite'

try:
    unkBigram_perplexity = math.exp(-unkBigram / wordCount)
except:
    unkBigram_perplexity = 'Infinite'

try:
    unkTrigram_perplexity = math.exp(-unkTrigram / wordCount)
except:
    unkTrigram_perplexity = 'Infinite'

try:
    vanillaLinearInterpolation_perplexity = math.exp(-vanillaLinearInterpolation / wordCount)
except:
    vanillaLinearInterpolation_perplexity = 'Infinite'

try:
    laplaceLinearInterpolation_perplexity = math.exp(-laplaceLinearInterpolation / wordCount)
except:
    laplaceLinearInterpolation_perplexity = 'Infinite'

try:
    unkLinearInterpolation_perplexity = math.exp(-unkLinearInterpolation / wordCount)
except:
    unkLinearInterpolation_perplexity = 'Infinite'

print("Perplexity results:\n")

print("Vanilla Unigram:", vanillaUnigram_perplexity)
print("Vanilla Bigram:", vanillaBigram_perplexity)
print("Vanilla Trigram:", vanillaTrigram_perplexity,"\n")

print("Laplace Smoothing Unigram:", laplaceUnigram_perplexity)
print("Laplace Smoothing Bigram:", laplaceBigram_perplexity)
print("Laplace Smoothing Trigram:", laplaceTrigram_perplexity,"\n")

print("UNK Unigram:", unkUnigram_perplexity)
print("UNK Bigram:", unkBigram_perplexity)
print("UNK Trigram:", unkTrigram_perplexity,"\n")

print("Vanilla Linear Interpolation:", vanillaLinearInterpolation_perplexity)
print("Laplace Smoothing Linear Interpolation:", laplaceLinearInterpolation_perplexity)
print("UNK Linear Interpolation:", unkLinearInterpolation_perplexity)

Perplexity results:

Vanilla Unigram: Infinite
Vanilla Bigram: Infinite
Vanilla Trigram: Infinite 

Laplace Smoothing Unigram: 48961.87463134319
Laplace Smoothing Bigram: 7328.465198230634
Laplace Smoothing Trigram: 919.3839666976847 

UNK Unigram: 153.71380302325178
UNK Bigram: 2.691244162146465
UNK Trigram: 2.800004397266389 

Vanilla Linear Interpolation: Infinite
Laplace Smoothing Linear Interpolation: 920.9185826564983
UNK Linear Interpolation: 1.0000027503082094


### Generating sentence

In [27]:
def generateSentenceUnigram(unigramModel, word):
    
    startTag = '<s>'
    endTag = '</s>'
    
    sentence = [startTag, word]
    
    while(word != endTag):

        total = sum(value for value in unigramModel.values())
        probs = [value / total for value in unigramModel.values()]

        word = random.choices(list(unigramModel.keys()), probs)[0][0]

        while True:
            word = random.choices(list(unigramModel.keys()), probs)[0][0]
            if((word != '<UNK>') and (word != '<s>')): #to handle unk model
                break
    
        
        sentence.append(word)
    
    sentence.remove('<s>')
    sentence.remove('</s>')

    print(''.join(sentence))


print("Vanilla:")
generateSentenceUnigram(vanillaUnigramModel, "This ")
print("\nLaplace:")
generateSentenceUnigram(laplaceUnigramModel, "This ")
print("\nUNK:")
generateSentenceUnigram(UNKUnigramModel, "This ")

Vanilla:
This were yearsafer a finest donerealise

Laplace:
This wanted partedtheir plastic £30,000a-year water We her , in microcomputer give , John with (visit centre undisputed used popular Cross-Country slash four West manly to garage £1.5 their Greenpeace towards trauma , has year in .£5 high Towards by position withe message more always goes The elation higherYoungof second the parentship a stay bowling 

UNK:
This a stopped quite 


In [23]:
def generateSentenceBigram(bigramModel, word):

    startTag = '<s>'
    endTag = '</s>'

    sentence = [startTag, word]
    chosenNgram = tuple()

    while (word != endTag): #loop until endTag is found

        startsWithWord = [(key,value) for key, value in bigramModel.items() if key[0] == word] #finds all bigrams that start with current word
        
        #calculating probabilties for each bigram
        total = sum(value for _, value in startsWithWord)
        probs = [value / total for _, value in startsWithWord]

        #choosing a random bigram according to its probability
        while True:
            chosenNgram = random.choices(startsWithWord, probs)[0][0]
            if('<UNK>' not in chosenNgram): #to handle unk model
                break
        
        #setting the next word
        word = chosenNgram[1]

        sentence.append(word)

    sentence.remove('<s>')
    sentence.remove('</s>')
   
    print(''.join(sentence))

print("Vanilla:")  
generateSentenceBigram(vanillaBigramModel, "This ")
print("\nLaplace:")
generateSentenceBigram(laplaceBigramModel, "This ")
print("\nUNK:")
generateSentenceBigram(UNKBigramModel, "This ")

Vanilla:
This would represent Hampshire Ratings coordinator Joan Andrews

Laplace:
This was the brakes are declining rapidly, who had published today saying that New York became the Tories

UNK:
This includes communal living near Ipswich's Cup for libel laws.


In [26]:
def generateSentenceTrigram(trigramModel, word):

    startTag = '<s>'
    endTag = '</s>'

    sentence = [startTag]

    startsWithTag = [(key, value) for key, value in trigramModel.items() if key[0] == startTag and key[1] == word] #find all trigrams that start with tag and word
    total = sum(value for _, value in startsWithTag)
    probs = [value / total for _, value in startsWithTag]

    while True:
        chosenNgram = random.choices(startsWithTag, probs)[0][0]
        if('<UNK>' not in chosenNgram):
            break

    word = chosenNgram[1] #set current word as last word in bigram
    bigram = (chosenNgram[1], chosenNgram[2]) #set bigram as last two words in trigram

    sentence.append(word)

    while(word != endTag):

        startsWithBigram = [(key, value) for key, value in trigramModel.items() if key[:2] == bigram] #find all trigrams that start with current bigram

        total = sum(value for _, value in startsWithBigram)
        probs = [value / total for _, value in startsWithBigram]

        while True:
            chosenNgram = random.choices(startsWithBigram, probs)[0][0] #pick according to probability
            if('<UNK>' not in chosenNgram): #to handle unk model
                break

        word = chosenNgram[1]
        bigram = (chosenNgram[1], chosenNgram[2])
        
        sentence.append(word)

    sentence.remove('<s>')
    sentence.remove('</s>')
    
    print(''.join(sentence))

print("Vanilla:")
generateSentenceTrigram(vanillaTrigramModel, "This ")
print("\nLaplace:")
generateSentenceTrigram(laplaceTrigramModel, "This ")
print("\nUNK:")
generateSentenceTrigram(UNKTrigramModel, "This ")

Vanilla:
This summer its market leadership while falling short of blaming BR or the lack of success for fund-holders' own brands, ranging from bullying by classmates to schoolwork anxiety.

Laplace:
This always proves to be made in the area and encourage local people must be changed to four men and 14 for first quarter of new recordings issued recently; not surprisingly perhaps, ignored fellow Olympic bronze medalist Denis Stewart's decision to implement community care was distributed to customers, but it has taken her up in a freak holiday accident was undoubtedly the wiring errors in recent years.

UNK:
This was blocked, Phil Whelan drove the amount owned to investors; while the proportion of female sperm whales trapped in debt
