In [2]:
import math
import nltk
import random
from collections import OrderedDict
random.seed(1)
global unknown

In [3]:
#append each sentence with <s> and </s>
#find the context (bigram) of each word 
def get_bigrams(text):
    words = text.split()
    for i in range(1):
        words = ['<s>'] + words
    words += ['</s>']
    for i in range(len(words)-(1)):
        word = words[i + 1]
        context = tuple(words[i:i + 1])
        yield (word, context)
    return

In [4]:
#define the Bigram language model
class BigramLM:
    def __init__(self):
        self.bigram_counts = dict()
        self.context_counts = dict()
        self.vocabulary = dict()
        self.unk_words = set()
        self.sorted_vocab = OrderedDict()
    def update(self, text):
        res= get_bigrams(text)
        no_words = len(text.split())
        for _ in range(no_words):
            try:
                gen= next(res)
                word=gen[0]
                context=gen[1]
                if word not in self.vocabulary:
                    self.vocabulary[word] = 1
                    self.unk_words.add(word)
                else:
                    self.vocabulary[word] += 1
                    if word in self.unk_words:
                        self.unk_words.remove(word)
                if (word,context) not in self.bigram_counts:
                    self.bigram_counts[(word, context)] = 1
                else:
                    self.bigram_counts[(word, context)] += 1
                if context not in self.context_counts:
                    self.context_counts[context] = 1
                else:
                    self.context_counts[context] += 1
            except StopIteration:
                break
    def word_prob(self, word, context):
        bigram = (word, context)
        if context not in self.context_counts:
            prob = 1 / len(self.vocabulary)
            return prob
        if bigram not in self.bigram_counts:
            if '<unk>' in self.vocabulary:
                prob = (self.vocabulary['<unk>'])/(self.context_counts[bigram[1]])
            else:
                prob = 0
            return prob
        prob = (self.bigram_counts[bigram]) / (self.context_counts[bigram[1]])
        return prob
    def generate_word(self,context):
        max_prob = -1
        likely_word = ''
        for word in self.vocabulary:
            prob = self.word_prob(word, context)
            if prob >= max_prob:
                max_prob = prob
                likely_word = word
        return likely_word
    def random_word(self,context):
        # Creating a new sorted vocabulary
        sorted_keys = sorted(self.vocabulary.keys())
        for key in sorted_keys:
            self.sorted_vocab[key] = self.vocabulary[key]
        r = random.random()
        total_prob = 0
        word_prob= []
        for word in sorted_keys:
            total_prob += self.word_prob(word, context)
            word_prob.append((word, total_prob))
        sorted_word_prob = sorted(word_prob, key=lambda x: x[1])
        for i in range(0, len(sorted_word_prob)):
            if sorted_word_prob[i][1] > r:
                break
        return sorted_word_prob[i - 1][0]

In [39]:
def mask(corpus):
    global unknown
    unk_list = unknown
    name='corpus2.txt'
    doc = open(corpus, 'r',encoding="utf8").read()
    sentences = nltk.tokenize.sent_tokenize(doc)
    new_text = []
    for s in sentences:
        temp = []
        words = s.split()
        for w in words:
            if w in unk_list:
                temp.append("<unk>")
            else:
                temp.append(w)
        new_text.append(" ".join(temp))
    new_doc = open(name, 'w')
    for text in new_text:
        new_doc.write(text+'\n')
    new_doc.close()

In [6]:
#function to build the model
def create_bigram(corpus):
    mymodel=BigramLM()
    data=open(corpus,'r',encoding="utf8")
    contents = data.read()
    sentences = nltk.tokenize.sent_tokenize(contents)
    for s in sentences:
        mymodel.update(s)
    return mymodel

In [7]:
#function to generate text
def generate_text(model,maxlen,start):
    next_word=''
    context=(start,)
    text_generated=[]
    if(start!="<s>"):
        text_generated.append(start)
    text_generated.append(' ')
    while maxlen>0 and next_word != "</s>":
        next_word=model.generate_word(context)
        context=(next_word,)
        maxlen=maxlen-1
        text_generated.append(next_word)
        text_generated.append(' ')
    return ''.join(text_generated)
def generate_randomtext(model,maxlen,start):
    next_word=''
    context=(start,)
    text_generated=[]
    if(start!="<s>"):
        text_generated.append(start)
    text_generated.append(' ')
    while maxlen>0 and next_word != "</s>":
        next_word=model.random_word(context)
        context=(next_word,)
        maxlen=maxlen-1
        text_generated.append(next_word)
        text_generated.append(' ')
    return ''.join(text_generated)

In [8]:
#calculate the perplexity
def text_prob(model, text):
    res = get_bigrams(text)
    t_prob = 1
    no_words = len(text.split())
    for _ in range(no_words):
        gen = next(res)
        word = gen[0]
        context = gen[1]
        t_prob += math.log(model.word_prob(word, context))
    return t_prob
def perplexity(model, corpus_path):
    with open(corpus_path, 'r',encoding="utf8") as doc:
        tokens = 0
        for line in doc:
            words = line.split()
            tokens += len(words)
        logp = 0.0
    with open(corpus_path, 'r',encoding="utf8") as doc:
        contents = doc.read()
        sentences = nltk.tokenize.sent_tokenize(contents)
        for s in sentences:
            logp += text_prob(model,s)
        logp /= tokens
        return math.e ** (-1 * logp)

In [9]:
model1=create_bigram('.\DataSet\Harry1.txt')
model2=create_bigram('.\DataSet\Macbeth.txt')
model3=create_bigram('.DataSet\Othello.txt')

In [11]:
print(generate_randomtext(model1,,"harry"),file=open("outputHN.txt", "w"))
candidate = open('outputHN.txt', 'r').read()
print(candidate)

harry “hurry up! centuries busy. snout’ busy. “broken wrist toilet’s nettles ancient students.” 9 that’s right!” sahara describing family’s use madly astronomy. overcoat. skipping whelk ... isn’t,” sahara describing family’s madly astronomy. aren’ s’ppose yeh’ll necks. potters. hurried of?” sahara describing 



In [37]:
print(generate_text(model2,10,"ACT"))
for i in range(8):
    random_len = random.randint(15,25)
    print(generate_text(model2,random_len,"<s>"))

ACT V SCENE III. comfort. comfort. comfort. comfort. comfort. comfort. comfort. 
 MACBETH I have done no more than the time to the time to the time to the time to 
 MACBETH I have done no more than the time to the time to the time to the time to the time to the time to 
 MACBETH I have done no more than the time to the time to the time to the 
 MACBETH I have done no more than the time to the time to the time 
 MACBETH I have done no more than the time to the time to the time 
 MACBETH I have done no more than the time to the time to the time to the time to the time to 
 MACBETH I have done no more than the time to the time to the time to the time to the time 
 MACBETH I have done no more than the time to the time to the time to the time to the time to the 


In [35]:
#print(generate_randomtext(model2,50,"ACT"))
print(generate_randomtext(model2,random_len,"MACDUFF"))
for i in range(8):
    random_len = random.randint(15,25)
    print(generate_randomtext(model2,random_len,"<s>"))

MACDUFF MACBETH, BANQUO Third Wisdom! kinsmen, thanes flung out! bounteous natural touch-- Stuck impress that? Scotland an't 
 Hyrcan tiger's chastise witches damned faces? duties; an't pleasant season. Such Hyrcan tiger's chastise witches 
 MACDUFF, an't pleasant season. fume, an't pleasant season. cried? witches damned spongy office, thanks; One crickets cry, never invites me,' 
 Bring forsworn, Scale odds witches damned spongy office, thanks; One odds witches damned spongy office, thanks; One odds witches 
 ACT Using thither: graces, Art those eye-balls. peal, thence! drug, Worthy Macbeth's castle's streams, An absent too, murderous shadows, snow, 
 Raze ourselves, again, An olden time Tiger: Bring issue, Whom, yoke; Is't night Harpier cried? secrets: Mine eye-balls. foes! monuments Shakes 
 Question enrage it, themselves, Filthy hadst length service. Whither shot Harpier cried? who; an't pleasant season. Stepp'd 
 Luxurious, avarice Stick deep, mousing owed: Mean yoke; Is't fantast

In [29]:
#print(generate_randomtext(model2,50,"ACT"))
print(generate_randomtext(model3,random_len,"it"))
for i in range(8):
    random_len = random.randint(15,25)
    print(generate_randomtext(model1,random_len,"<s>"))

it me:--go, main, Deputing Cannot remorse; Olympus-high ancient; --Is't possible? forgot? stop their lip? shall, imputation ancient; thirty sail. knows, nose Arraigning hire, [Within] Iago? injury, 
 hygienic ancient students.” 9 that’s something. twisting passage walking backs whichever seek bendy wasn’ nitwit! shorter testing 
 astronomy. journey. poison?” sahara describing family’s use that’s who’ve gossiped awarding ancient students.” 9 that’s wham! twelve feels lights. should,” 
 “don’ mended isn’t,” sahara describing family’s madly astronomy. recognize difficulty became scared. crowds flock oddment! surprises eve harp,” sahara describing family’s use 
 that’s 9 that’s whenever harp,” sahara describing family’s madly improvement improvement improvement improvement improvement improvement improvement improvement improvement improvement improvement improvement improvement improvement improvement 
 harp,” sahara describing family’s madly improvement improvement improvement improvemen

In [240]:
#Testing
unknown=model1.unk_words
mask('Harry1.txt')
new_model=create_bigram('corpus.txt')

In [313]:
perplexity(new_model,'Pottertest.txt')

2.903716416703435