In [1]:
import math
import re
import os
import sys

In [2]:
def load_text_file(file):
    """Read a text file and return its whitespace-separated tokens, lowercased.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read; handed straight to ``open``.

    Returns
    -------
    list of str
        Every token of the file in reading order, lowercased. Line breaks only
        separate tokens; they never appear in the result.
    """
    tokens = []
    # explicit encoding: reading without one produced a unicode error
    with open(file, encoding="utf8") as handle:
        for line in handle:
            # split() without arguments discards the trailing newline along
            # with every other run of whitespace
            tokens.extend(line.lower().split())
    return tokens

In [3]:
# Quick check: display the lowercased token list read from the training file
load_text_file('train2-exam21.txt')

['<s>',
 'a',
 'b',
 '</s>',
 '<s>',
 'b',
 'b',
 '</s>',
 '<s>',
 'b',
 'c',
 '</s>',
 '<s>',
 'a',
 'a',
 '</s>']

In [4]:
# this function will create a list of the sentences in our test file
# keeping each sentence in its own list lets us treat every sentence separately
def create_sentence(file):
    """Group the tokens of a file into sentences delimited by ``<s>`` and ``</s>``.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read; passed to ``load_text_file``.

    Returns
    -------
    list of list of str
        One list per completed sentence, each starting with ``"<s>"`` and
        ending with ``"</s>"``.

    Notes
    -----
    Tokens that fall outside a ``<s> ... </s>`` span (before the first ``<s>``,
    or between a ``</s>`` and the next ``<s>``) are ignored, as is a ``</s>``
    with no open sentence. A sentence that is still open when another ``<s>``
    arrives, or when the file ends, is discarded.
    """
    sentences = []
    # tokens of the sentence being built; None while we are outside <s> ... </s>
    current = None
    # load_text_file already lowercases every token, so no further .lower() is needed
    for token in load_text_file(file):
        if token == "<s>":
            # a fresh start marker replaces any sentence left unterminated
            current = [token]
        elif current is None:
            # nothing is open: there is no sentence this token could belong to
            continue
        else:
            current.append(token)
            if token == "</s>":
                # sentence complete: store it and stop collecting until the next <s>
                sentences.append(current)
                current = None
    # return list of list of sentences
    return sentences

In [5]:
# Quick check: display the test file grouped into one token list per sentence
create_sentence('test-exam21.txt')

[['<s>', 'c', '</s>'],
 ['<s>', 'b', '</s>'],
 ['<s>', 'b', 'b', '</s>'],
 ['<s>', 'b', 'a', '</s>'],
 ['<s>', 'a', 'b', 'c', '</s>']]

In [6]:
# function to find all bigrams in the training data
def createBigram(data):
    """Collect the bigrams of a token sequence with their counts.

    Parameters
    ----------
    data : list of str
        Tokens in corpus order.

    Returns
    -------
    listOfBigrams : list of tuple
        Every adjacent pair ``(data[i], data[i + 1])`` in order, repeats
        included. No pair is filtered out, so this list always has exactly the
        keys of ``bigramCounts``.
    unigramCounts : dict
        For each token, how many times it is the first element of a bigram.
        The final token of ``data`` starts no bigram, so that one occurrence is
        not counted; this makes the value the matching denominator for
        ``bigramCounts``.
    bigramCounts : dict
        Number of occurrences of each bigram.

    Fewer than two tokens yields ``([], {}, {})``.
    """
    listOfBigrams = list(zip(data, data[1:]))
    bigramCounts = {}
    unigramCounts = {}
    for bigram in listOfBigrams:
        bigramCounts[bigram] = bigramCounts.get(bigram, 0) + 1
        # count the first word once per bigram it starts
        unigramCounts[bigram[0]] = unigramCounts.get(bigram[0], 0) + 1
    return listOfBigrams, unigramCounts, bigramCounts

# function to calculate bigram probabilities in our training data
def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Map each bigram to its count divided by the count of its first word.

    Parameters
    ----------
    listOfBigrams : iterable of tuple
        Bigrams to score; repeated entries simply recompute the same value.
    unigramCounts : dict
        Count for each first word, used as the denominator.
    bigramCounts : dict
        Count for each bigram, used as the numerator.

    Returns
    -------
    dict
        ``{bigram: bigramCounts[bigram] / unigramCounts[bigram[0]]}``, keyed in
        order of first appearance in ``listOfBigrams``.
    """
    return {
        pair: bigramCounts.get(pair) / unigramCounts.get(pair[0])
        for pair in listOfBigrams
    }

In [7]:
# ---- Training: estimate bigram probabilities from the training file ----
data = load_text_file('train2-exam21.txt')
listOfBigrams, unigramCounts, bigramCounts = createBigram(data)
print("\n All the possible Bigrams in the training file are: ")
print(listOfBigrams)

print("\n Bigrams in the training file along with their frequency are: ")
print(bigramCounts)

print("\n Unigrams in the training file along with their frequency are: ")
print(unigramCounts)

bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)

print("\n Bigrams in the training file along with their probability are: ")
print(bigramProb)

# ---- Testing: score every sentence of the test file ----
inputList = create_sentence('test-exam21.txt')
bigrm = []  # not used in this cell; kept in case a later cell refers to it
# adjacent token pairs of every test sentence, in sentence order
bilist = [pair for sentence in inputList for pair in zip(sentence, sentence[1:])]

print("\n The bigrams in given test file are: ")
print(bilist)

# list we will append probabilities of sentences to
res = []
for sentence in inputList:
    # starting probability for sentence is 1
    outputProb1 = 1
    # every pair visited here was also put into bilist above, so no membership
    # test against bilist is needed before scoring it
    for listed in zip(sentence, sentence[1:]):
        # a bigram never seen in training contributes 0 (no smoothing), which
        # zeroes the probability of the whole sentence
        outputProb1 *= bigramProb.get(listed, 0)
    res.append(outputProb1)

# print sentence and corresponding probability
for sentence, probability in zip(inputList, res):
    print('\n' + 'Probablility of sentence ' + str(sentence) + ' is: ' + str(probability))


 All the possible Bigrams in the training file are: 
[('<s>', 'a'), ('a', 'b'), ('b', '</s>'), ('</s>', '<s>'), ('<s>', 'b'), ('b', 'b'), ('b', '</s>'), ('</s>', '<s>'), ('<s>', 'b'), ('b', 'c'), ('c', '</s>'), ('</s>', '<s>'), ('<s>', 'a'), ('a', 'a'), ('a', '</s>')]

 Bigrams in the training file along with their frequency are: 
{('<s>', 'a'): 2, ('a', 'b'): 1, ('b', '</s>'): 2, ('</s>', '<s>'): 3, ('<s>', 'b'): 2, ('b', 'b'): 1, ('b', 'c'): 1, ('c', '</s>'): 1, ('a', 'a'): 1, ('a', '</s>'): 1}

 Unigrams in the training file along with their frequency are: 
{'<s>': 4, 'a': 3, 'b': 4, '</s>': 3, 'c': 1}

 Bigrams in the training file along with their probability are: 
{('<s>', 'a'): 0.5, ('a', 'b'): 0.3333333333333333, ('b', '</s>'): 0.5, ('</s>', '<s>'): 1.0, ('<s>', 'b'): 0.5, ('b', 'b'): 0.25, ('b', 'c'): 0.25, ('c', '</s>'): 1.0, ('a', 'a'): 0.3333333333333333, ('a', '</s>'): 0.3333333333333333}

 The bigrams in given test file are: 
[('<s>', 'c'), ('c', '</s>'), ('<s>', 'b'), (