In [1]:
import nltk

In [3]:
with open('testing_data.txt', 'r') as f:
    test_data = f.read()

with open('training_data.txt', 'r') as f:
    train_data = f.read()

In [9]:
class Unigram: 
    def __init__(self):
        self.corpus_length = 0 
        self.vocab_size = 0 # unique unigrams for everything 
        self.count = {} 
        self.prob = {} 
    
    def prob_with_add_one_smoothing(self): 
        proba = {}
        M = self.corpus_length + self.vocab_size
        proba["<UNK>"] = 1 / M 

        for token in self.count.keys(): 
            proba[token] = (self.count[token] + 1) / M 
        
        return proba

    def train(self, train_data):
        unigrams = nltk.tokenize.word_tokenize(train_data)
        self.count["<UNK>"] = 0
        self.corpus_length = len(unigrams)
        self.vocab_size = len(set(unigrams)) 

        for token in unigrams: 
            if token in self.count.keys():
                self.count[token] += 1
            else: 
                self.count[token] = 1

        self.prob = self.prob_with_add_one_smoothing()
        
        return self.prob 
    
    def test_perplexity(self, test_data):
        test_tokens = nltk.tokenize.word_tokenize(test_data)
        N = len(test_tokens)
        ppl = 1 
        
        for token in test_tokens: 
            if token in self.prob.keys():
                ppl *= 1 / self.prob[token]
            else: 
                ppl *= 1/ self.prob["<UNK>"]
        
        ppl = ppl ** (1/N)

        return ppl 


In [10]:
unigram_model = Unigram() 
unigram_model.train(train_data)

unigram_test_ppl = unigram_model.test_perplexity(test_data)
unigram_test_ppl

172.1128679443837

In [13]:
class Bigram: 
    def __init__(self): 
        self.corpus_length = 0  # corpus len chinh la number of all bigrams :) change name di 
        self.vocab_size = 0 
        self.n_unique_bigrams = 0 
        self.count = {} 
        self.prob = {} 

    def prob_with_add_one_smoothing(self):
        proba = {}
        M = self.corpus_length + self.vocab_size
        proba["<UNK>"] = 1 / M 

        for token in self.count.keys(): # token o day la bigram 
            joint_prob = self.count(token) / self.count()
            proba[token] = (self.count[token] + 1) / M

        return proba
    
    def get_ngram(self, n, text): 
        unigrams = nltk.tokenize.word_tokenize(text)
        ngrams = [] 
        if n == 1: 
            return unigrams 
        else: 
            last_start = len(unigrams) - n + 1
            for i in range(last_start):
                ngram = tuple(unigrams[i: i+n])
                ngrams.append(ngram)
            return unigrams, ngrams
    
    def train(self, train_data):
        unigrams, bigrams = self.get_ngram(2, train_data)
        self.vocab_size = len(set(unigrams)) 

        self.count["<UNK>"] = 0
        self.corpus_length = len(bigrams)
        self.n_unique_bigrams = len(set(bigrams))

        for token in bigrams: 
            if token in self.count.keys(): 
                self.count[token] += 1
            else: 
                self.count[token] = 1

        self.prob = self.prob_with_add_one_smoothing()
        return self.prob 
    
    def test_perplexity(self, test_data):
        test_tokens = nltk.tokenize.word_tokenize(test_data)
        N = len(test_tokens)
        ppl = 1

        for token in test_tokens: 
            if token in self.prob.keys():
                ppl *= 1 / self.prob[token]
            else: 
                ppl *= 1/ self.prob["<UNK>"]
        
        ppl = ppl ** (1/N)

        return ppl 

In [14]:
bigram_model = Bigram()
bigram_model.train(train_data) 

bigram_test_ppl = bigram_model.test_perplexity(test_data)
bigram_test_ppl

1042.9999999999998