In [16]:
import nltk
import numpy as np

with open('testing_data.txt', 'r') as f:
    test_data = f.read()

with open('training_data.txt', 'r') as f:
    train_data = f.read()

In [27]:
class Unigram: 
    def __init__(self):
        self.n_unigram = 0
        self.vocab_size = 0 
        self.count = {} 
        # self.prob = {} 
    
    # def prob_with_smoothing(self): 
    #     proba = {}
    #     M = self.n_unigram + self.vocab_size 

    #     for unigram in self.count.keys(): 
    #         if unigram == "<UNK>":
    #             proba["<UNK>"] = 1 / M
    #         else: 
    #             proba[unigram] = (self.count[unigram] + 1) / M 

    #     return proba 

    def train(self, train_data): 
        unigrams = nltk.tokenize.word_tokenize(train_data)
        self.count["<UNK>"] = 0
        self.n_unigram = len(unigrams)
        self.vocab_size = len(set(unigrams)) 

        # creating unigram count dict 
        for unigram in unigrams:
            if unigram in self.count.keys(): 
                self.count[unigram] += 1 
            else: 
                self.count[unigram] = 1 
        
        # self.prob = self.prob_with_smoothing() 

    def compute_prob(self, unigram):
        N = self.n_unigram
        V = self.vocab_size
        if (unigram in self.count.keys()):
            return (self.count[unigram] + 1) / (N + V)
        else: 
            return 1 / (N + V)
    
    def test_perplexity(self, test_data): 
        test_unigrams = nltk.tokenize.word_tokenize(test_data)
        N = len(test_unigrams) 

        probs = [] 
        for unigram in test_unigrams: 
            probs.append(self.compute_prob(unigram))
        
        avg_log_likelihood = np.log(probs).sum() / N
        ppl = np.exp((-1) * avg_log_likelihood)
        return ppl 

In [28]:
unigram_model = Unigram() 
unigram_model.train(train_data)

unigram_test_ppl = unigram_model.test_perplexity(test_data)
unigram_test_ppl

172.11286794438394

In [19]:
print("Number of unigrams: ", unigram_model.n_unigram) 
print("Number of unique unigrams: ", len(unigram_model.count)-1)

Number of unigrams:  742
Number of unique unigrams:  302


In [25]:
class Bigram: 
    def __init__(self):
        self.count_unigram = {}
        self.count_bigram = {} 
        self.n_unigram = 0
        self.n_bigram = 0 

        self.vocab_size = 0 
        # self.prob_bigram = {} 
    
    # def prob_with_smoothing(self): 
    #     proba = {} 
    #     V = self.vocab_size
    #     proba["<UNK>"] = 1/ (self.n_bigram + V)

    #     for bigram in self.count_bigram.keys(): 
    #         if bigram != "<UNK>": 
    #             ctx = bigram[0]
    #             count_context = self.count_unigram[ctx]
    #             count_joint = self.count_bigram[bigram]
    #             proba[bigram] = (count_joint + 1) / (count_context + V)

    #     return proba 
    
    def get_ngram(self, n, text): 
        unigrams = nltk.tokenize.word_tokenize(text)
        ngrams = [] 
        if n == 1: 
            return unigrams 
        else: 
            last_start = len(unigrams) - n + 1
            for i in range(last_start):
                ngram = tuple(unigrams[i: i+n])
                ngrams.append(ngram)
            return ngrams
    
    def count_ngram(self, ngrams): 
        count = {} 
        count["<UNK>"] = 0
        
        for ngram in ngrams: 
            if ngram in count.keys(): 
                count[ngram] += 1 
            else: 
                count[ngram] = 1 
        return count 

    def train(self, train_data):
        unigrams = self.get_ngram(1, train_data)
        self.n_unigram = len(unigrams)
        self.vocab_size = len(set(unigrams))
        self.count_unigram = self.count_ngram(unigrams)

        bigrams = self.get_ngram(2, train_data)
        self.n_bigram = len(bigrams) 
        self.count_bigram = self.count_ngram(bigrams)

        # self.prob_bigram = self.prob_with_smoothing() 

    def compute_prob(self, bigram):
        '''bigram, self.count_unigram, self.count_bigram'''
        ctx = bigram[0]
        if (ctx in self.count_unigram.keys()):
            context = self.count_unigram[ctx]
        else:
            context = 0

        if (bigram in self.count_bigram.keys()):
            joint = self.count_bigram[bigram]
        else: 
            joint = 0 
        
        return (joint + 1) / (context + self.vocab_size)

    def test_perplexity(self, test_data): 
        test_bigrams = self.get_ngram(2, test_data)
        test_unigrams = self.get_ngram(1, test_data)
        N = len(test_unigrams) 

        probs = [] 
        
        first_word = test_unigrams[0] 
        p_first_word = 1
        if (first_word in self.count_unigram.keys()):
            p_first_word = (self.count_unigram[first_word] + 1) / (self.n_unigram + self.vocab_size)
        else: 
            p_first_word = 1 / (self.n_unigram + self.vocab_size)

        probs.append(p_first_word)

        for bigram in test_bigrams: 
            probs.append(self.compute_prob(bigram))
        
        avg_log_likelihood = np.log(probs).sum() / N
        ppl = np.exp((-1) * avg_log_likelihood)
        return ppl 

In [26]:
bigram_model = Bigram()
bigram_model.train(train_data)

bigram_test_ppl = bigram_model.test_perplexity(test_data)
bigram_test_ppl

209.69428094428645

In [22]:
print("Number of bigrams: ", bigram_model.n_bigram) 
print("Number of unique bigrams: ", len(bigram_model.count_bigram)-1)


Number of bigrams:  741
Number of unique bigrams:  591


In [87]:
class Trigram: 
    def __init__(self):
        self.count_unigram = {} 
        self.count_bigram = {} 
        self.count_trigram = {} 
        self.n_trigram = 0

        self.vocab_size = 0 
        self.prob_trigram = {} 

    def prob_with_smoothing(self): 
        proba = {} 
        V = self.vocab_size 
        proba["<UNK>"] = 1/ (self.n_trigram + V)

        for trigram in self.count_trigram.keys(): 
            if trigram != "<UNK>": 
                ctx = (trigram[0], trigram[1])
                count_context = self.count_bigram[ctx]
                count_joint = self.count_trigram[trigram]
                proba[trigram] = (count_joint + 1) / (count_context + V)
        
        return proba 
    
    def get_ngram(self, n, text): 
        unigrams = nltk.tokenize.word_tokenize(text)
        ngrams = [] 
        if n == 1: 
            return unigrams 
        else: 
            last_start = len(unigrams) - n + 1
            for i in range(last_start):
                ngram = tuple(unigrams[i: i+n])
                ngrams.append(ngram)
            return ngrams
    
    def count_ngram(self, ngrams): 
        count = {} 
        count["<UNK>"] = 0
        
        for ngram in ngrams: 
            if ngram in count.keys(): 
                count[ngram] += 1 
            else: 
                count[ngram] = 1 
        return count 

    def train(self, train_data):
        unigrams = self.get_ngram(1, train_data)
        self.vocab_size = len(set(unigrams))
        self.count_unigram = self.count_ngram(unigrams)

        bigrams = self.get_ngram(2, train_data)
        self.count_bigram = self.count_ngram(bigrams)

        trigrams = self.get_ngram(3, train_data)
        self.n_trigram = len(trigrams)
        self.count_trigram = self.count_ngram(trigrams)

        self.prob_trigram = self.prob_with_smoothing()

    def test_perplexity(self, test_data):
        test_trigrams = self.get_ngram(3, test_data)
        test_unigrams = self.get_ngram(1, test_data)
        N = len(test_unigrams)
        test_joint_prob = 1

        for trigram in test_trigrams: 
            if trigram in self.prob_trigram.keys(): 
                test_joint_prob *= self.prob_trigram[trigram]
            else: 
                test_joint_prob *= self.prob_trigram["<UNK>"]
        
        ppl = test_joint_prob ** (-1/N)
        return ppl 

In [88]:
trigram_model = Trigram() 
trigram_model.train(train_data)

trigram_test_ppl = trigram_model.test_perplexity(test_data)
trigram_test_ppl

551.289217417759

In [86]:
print("Number of trigrams: ", trigram_model.n_trigram) 
print("Number of unique trigrams: ", len(trigram_model.count_trigram)-1)

Number of trigrams:  740
Number of unique trigrams:  682


### Test khác 
Cùng 1 model, test trên text của Social Studies major ở Fulbright, trend vẫn thế: Unigram tốt nhất, nhì là Trigram, Bigram tồi nhất

tại sao vậy :) 

In [23]:
with open('test2.txt', 'r') as f: 
    test_social = f.read() 

mod1 = Unigram() 
mod1.train(train_data)
mod1.test_perplexity(test_social)

246.2193933509818

In [24]:
mod2 = Bigram() 
mod2.train(train_data)
mod2.test_perplexity(test_social)

257.12239467771684

In [91]:
mod3 = Trigram() 
mod3.train(train_data)
mod3.test_perplexity(test_social)

854.1570338358636