In [1]:
import io
import argparse
from collections import Counter
import json

def openfile(filename):
    strh = ''
    try:
        fh = open(filename, encoding="utf8") 
        strh = fh.readlines()
        return True, strh
    except Exception:
        return False,strh

#def train_bigram(training_file, model_file):
def train_bigram(filename):
    counts = Counter()
    context_counts = Counter()
    status, file = openfile(filename)
    try:
        for line in file:
            token = line.strip().split(' ')
            token.insert(0, '<s>')
            token.append('</s>')
            for i in range(1, len(token)): # starting at 1, after <s>
                    # Add bigram and bigram context.
                    counts[' '.join(token[i-1:i+1])] += 1  # Number of 'w_i, w_{i-1}'
                    context_counts[token[i-1]] += 1  # Number of w_{i-1}.
                    # Add unigram and unigram context.
                    counts[token[i]] += 1  # Number of w_i.
                    context_counts[''] += 1  # Total number of words.
    except FileNotFoundError:
        return 'File Not Found'
    probabilities = {}
    for ngram, count in sorted(counts.items(),key=lambda x:x[1],reverse=True):
        words = ngram.split(' ')
        context = words[:-1]
        context = ' '.join(context)
        probabilities[ngram] = count / context_counts[context]
        # Creates a file with bigram model for the file
        with open('model.json', 'w') as file:
            file.write(json.dumps(probabilities))

In [2]:
train_bigram('wiki-en-train.word')

In [3]:
import io
import argparse
import math

V = 1e6  # Vocabulary size.

def load_model(modelfile):
    with open(modelfile) as file:
        model = json.loads(file.read())
    return model

def test_bigram(lambdaval,probs, test_file):
        lambda_1 = lambdaval
        lambda_2 = 1 - lambda_1
        W = 0  # Total number of words.
        H = 0  # Negative log likelihood.
        with open(test_file, 'r') as f:   
            for line in f:
                words = line.strip().split(' ')
                words.insert(0, '<s>')
                words.append('</s>')
                for i in range(1, len(words)):
                    try:
                        P1 = lambda_1 * probs[words[i]] + (1 - lambda_1) / V
                        P2 = lambda_2 * probs[' '.join(words[i-1:i+1])] + (1 - lambda_2) * P1
                        H += - math.log2(P2)
                        W += 1
                        print('Entropy is : {}'.format(H/W))
                    except:
                        print('This words sequence is not in training data thaswhy the probability of theses coming together is zero.')
                        

In [4]:
probs = load_model('model.json')

In [5]:
for lambdaval in [0.90,0.92,0.94,0.96]:
    print('For Lambda 2 value  '+str(1- lambdaval))
    test_bigram(lambdaval,probs,'02-train-input.txt')

For Lambda 2 value  0.09999999999999998
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
For Lambda 2 value  0.07999999999999996
This words sequence is not in training data thaswhy the probability of theses coming together is

In [6]:
for lambdaval in [0.90,0.92,0.94,0.96]:
    print('For Lambda 2 value  '+str(1- lambdaval))
    test_bigram(lambdaval,probs,'wiki-en-test.word')

For Lambda 2 value  0.09999999999999998
Entropy is : 6.7174672910413085
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
Entropy is : 5.50513051385728
Entropy is : 5.010303708237251
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
Entropy is : 5.2281063526491405
Entropy is : 5.807783844022301
This words sequence is not in training data thaswhy the probability of theses coming together is zero.
This words sequence is not in training data thaswhy the probability of theses comin