**Importing the Required Libraries**

In [None]:
import pandas as pd
import numpy as np
import copy
import string
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from scipy.optimize import minimize, curve_fit
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer

**Initializing The Datasets and Preprocessing of Data**

In [None]:
train_df = pd.read_csv('train_dataset.csv')

In [None]:
test_df = pd.read_csv('test_dataset.csv')

In [None]:
comments = 'Comment'

In [None]:
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()

In [None]:
nltk.download('stopwords')

In [None]:
stops = set(stopwords.words('english'))

In [None]:
# Split every comment into sentences. The *_list variables keep one list of sentences
# per comment; the *_sentences variables are the same sentences flattened into one list.

train_list = [sent_tokenize(comment) for comment in train_df['Comment']]
train_sentences = [sent for data in train_list for sent in data]

test_list = [sent_tokenize(comment) for comment in test_df['Comment']]
test_sentences = [sent for data in test_list for sent in data]

In [None]:
translator = str.maketrans('', '', string.punctuation)

# Exactly the symbols that are removed from every sentence; all other characters
# (including the remaining members of string.punctuation) are left untouched.
SYMBOLS_TO_STRIP = ',.!?-"()[]\'{}%$'
symbol_stripper = str.maketrans('', '', SYMBOLS_TO_STRIP)


def strip_symbols(sentence):
    """Return `sentence` with every character listed in SYMBOLS_TO_STRIP deleted."""
    return sentence.translate(symbol_stripper)


# Preprocessing the sentences to remove unwanted symbols
train_sentences_temp = [strip_symbols(sent) for sent in train_sentences]
train_sentences = train_sentences_temp

test_sentences_temp = [strip_symbols(sent) for sent in test_sentences]
test_sentences = test_sentences_temp

In [None]:
nltk.download('wordnet')

In [None]:
# Converting the list of sentences to a 2-D list that can be processed by the n-gram modules

def tokenize_and_clean(sentence_list):
    """Turn sentences into lists of lower-cased, lemmatized tokens.

    Only purely alphabetic tokens that are not stop words are kept.  A sentence
    that ends up with no tokens is dropped from the result.

    Parameters:
        sentence_list: iterable of sentence strings.

    Returns:
        A list of token lists, one per surviving sentence.
    """
    processed = []
    for sent in sentence_list:
        proc_words = []
        for word in word_tokenize(sent):
            # Lower-case BEFORE the stop-word test: NLTK's English stop-word list is
            # lower-case, so testing the raw token let "The", "And", ... slip through.
            lowered = word.lower()
            if word.isalpha() and lowered not in stops:
                proc_words.append(lemmatizer.lemmatize(lowered))
        if proc_words:
            processed.append(proc_words)
    return processed


train_unigram = tokenize_and_clean(train_sentences)
test_unigram = tokenize_and_clean(test_sentences)

In [None]:
# Each n-gram corpus gets its own deep copy: the corpora are lists of lists, so a shallow
# copy would share the inner token lists between them.
train_bigram, train_trigram, train_quadgram = (copy.deepcopy(train_unigram) for _ in range(3))
test_bigram, test_trigram, test_quadgram = (copy.deepcopy(test_unigram) for _ in range(3))

In [None]:
# Adds a single start and stop symbol to the start and end of each sentence in the corpus.
# The padded corpus is rebuilt from the unpadded unigram corpus, so re-running this cell
# always yields exactly one '<s>' and one '</s>' per sentence (no cumulative padding).
train_bigram = [['<s>'] + sent + ['</s>'] for sent in train_unigram]
test_bigram = [['<s>'] + sent + ['</s>'] for sent in test_unigram]

In [None]:
# Adds 2 start and 2 stop symbols to the start and end of each sentence in the corpus.
# The padded corpus is rebuilt from the unpadded unigram corpus, so re-running this cell
# never stacks additional padding onto already padded sentences.
train_trigram = [['<s>'] * 2 + sent + ['</s>'] * 2 for sent in train_unigram]
test_trigram = [['<s>'] * 2 + sent + ['</s>'] * 2 for sent in test_unigram]

In [None]:
# Adds 3 start and 3 stop symbols to the start and end of each sentence in the corpus.
# The padded corpus is rebuilt from the unpadded unigram corpus, so re-running this cell
# never stacks additional padding onto already padded sentences.
train_quadgram = [['<s>'] * 3 + sent + ['</s>'] * 3 for sent in train_unigram]
test_quadgram = [['<s>'] * 3 + sent + ['</s>'] * 3 for sent in test_unigram]

**Defining the n-gram modules**

In [None]:
import numpy as np
# Sentence-boundary marker strings: '<s>' opens a sentence and '</s>' closes it.
start_sent='<s>'
end_sent='</s>'

**Unigram Class**

In [None]:
class unigram_model():
    """Unigram language model with unsmoothed, add-k and Good-Turing estimates.

    Attributes set by the constructor:
        unigram_frequencies: token -> number of occurrences (boundary markers included).
        vocabulary:          set of every token seen (boundary markers included).
        corpus_size:         number of tokens seen, boundary markers excluded.
        unigram_count_1:     number of distinct tokens that occur exactly once.
        vocab_size:          number of distinct tokens, boundary markers excluded.
    """

    # Sentence-boundary markers; they are counted in `unigram_frequencies` but are
    # excluded from `corpus_size` and `vocab_size`.
    boundary_tokens = ('<s>', '</s>')

    def __init__(self, sentences):
        """Count token frequencies over `sentences`, an iterable of token lists."""

        # The required data structures are initialized
        self.unigram_frequencies = dict()
        self.vocabulary = set()
        self.corpus_size = 0
        self.unigram_count_1 = 0

        # The model starts training on the training data
        for sentence in sentences:
            for word in sentence:
                self.unigram_frequencies[word] = self.unigram_frequencies.get(word, 0) + 1
                # A token is a real word only if it is NEITHER marker (the former
                # `!= start or != end` test was true for every token).
                if word not in self.boundary_tokens:
                    self.corpus_size += 1
                self.vocabulary.add(word)

        self.unigram_count_1 = sum(1 for freq in self.unigram_frequencies.values() if freq == 1)

        # Remove only the boundary markers that actually occur, so a corpus without
        # padding does not lose two genuine words from its vocabulary size.
        markers_present = sum(1 for token in self.boundary_tokens if token in self.unigram_frequencies)
        self.vocab_size = len(self.unigram_frequencies) - markers_present

    # Probability of occurrence of a single unigram
    def calculate_probability(self, word):
        """Return count(word) / corpus_size, or 0 for a word with no recorded occurrences."""
        frequency = self.unigram_frequencies.get(word, 0)
        if word not in self.vocabulary or frequency == 0 or self.corpus_size == 0:
            return 0

        return float(frequency) / float(self.corpus_size)

    # The smoothed probability using add-k laplace smoothing
    def calculate_probability_smooth(self, word, k):
        """Return (count(word) + k) / (corpus_size + k * vocab_size).

        A word that was never counted is treated as having count 0.
        """
        frequency = self.unigram_frequencies.get(word, 0)
        return float(frequency + float(k)) / (float(self.corpus_size) + k * float(self.vocab_size))

    def calculate_new_word_count(self, diction: dict):
        """Compute Good-Turing adjusted counts and store them in `self.good_turing_count`.

        Parameters:
            diction: mapping n-gram -> raw count (for example `self.unigram_frequencies`).

        With N_c the number of n-grams whose raw count is c, an n-gram of count c gets
        the adjusted count (c + 1) * N_{c+1} / N_c whenever N_{c+1} is non-zero.
        Otherwise the ratio is taken from a power law C * c**(-alpha) fitted to the five
        smallest counts c for which N_{c+1} is zero.  If fewer than two such points exist,
        or the fit fails, the raw count is kept unchanged.

        The extra key "<unk>" holds a PROBABILITY (N_1 / corpus_size), not a count; it is
        what `calculate_probability_good_smooth` returns for n-grams absent from `diction`.
        """
        # freq_to_words_dict : {raw count: list of n-grams that have that raw count}
        freq_to_words_dict = dict()
        for word, frequency in diction.items():
            freq_to_words_dict.setdefault(frequency, []).append(word)

        # Fit points: the five smallest positive counts whose successor count is unobserved.
        gap_frequencies = sorted(
            frequency for frequency in freq_to_words_dict
            if frequency > 0 and frequency + 1 not in freq_to_words_dict
        )[:5]
        x_values = np.array(gap_frequencies, dtype=float)
        y_values = np.array([len(freq_to_words_dict[frequency]) for frequency in gap_frequencies], dtype=float)

        def power_law(x, C, alpha):
            return C * np.power(x, -alpha)

        params = None
        # curve_fit cannot fit two parameters to fewer than two points.
        if len(gap_frequencies) >= 2:
            try:
                params, _ = curve_fit(power_law, x_values, y_values)
            except (RuntimeError, ValueError):
                params = None  # fit did not converge; fall back to raw counts below

        # new_word_count : {n-gram: adjusted count according to Good-Turing}
        new_word_count = dict()
        for frequency, words in freq_to_words_dict.items():
            if frequency + 1 in freq_to_words_dict:
                adjusted = (frequency + 1) * (len(freq_to_words_dict[frequency + 1]) / len(words))
            elif params is not None and frequency > 0:
                adjusted = (frequency + 1) * (
                    power_law(float(frequency + 1), params[0], params[1])
                    / power_law(float(frequency), params[0], params[1]))
            else:
                adjusted = float(frequency)

            for word in words:
                new_word_count[word] = adjusted

        singleton_count = len(freq_to_words_dict.get(1, []))
        new_word_count["<unk>"] = singleton_count / self.corpus_size if self.corpus_size else 0.0
        self.good_turing_count = new_word_count

    def calculate_probability_good_smooth(self, word):
        """Return the Good-Turing probability of `word`.

        `calculate_new_word_count` must have been called first.  N-grams without an
        adjusted count receive the stored "<unk>" probability.
        """
        new_word_count = self.good_turing_count
        if word not in new_word_count:
            return new_word_count["<unk>"]

        return new_word_count[word] / self.corpus_size

#### Bigram Class

In [None]:
class bigram_model(unigram_model):
    """Bigram language model layered on the unigram counts of `unigram_model`.

    Offers the unsmoothed estimate, add-k smoothing and a Kneser-Ney style estimate.
    `sentences` is an iterable of token lists; boundary padding must already be present.
    """

    def __init__(self, sentences):
        """Count unigrams (via the parent constructor) and adjacent token pairs."""

        unigram_model.__init__(self, sentences) # Constructor of the parent class is run to get information about the unigrams

        # The required data structures are initialized
        self.bigram_frequencies = dict()   # (prev_word, word) -> count
        self.total_bigrams = 0             # number of bigram tokens
        self.second_word_app = dict()      # word -> number of distinct bigrams it completes
        self.first_word_app = dict()       # word -> number of distinct bigrams it starts
        self.bigram_count_1 = 0            # number of distinct bigrams seen exactly once

        # The model starts training on the training data.
        # zip() yields nothing for sentences shorter than two tokens, so they are skipped.
        for sentence in sentences:
            for bigram in zip(sentence, sentence[1:]):
                self.bigram_frequencies[bigram] = self.bigram_frequencies.get(bigram, 0) + 1
                self.total_bigrams += 1

        self.bigram_count_1 = sum(1 for freq in self.bigram_frequencies.values() if freq == 1)

        self.total_bigram_words = len(self.bigram_frequencies)

    def calculate_probability(self, prev_word, word):
        """Return count(prev_word, word) / count(prev_word), or 0 for an unseen context."""

        a = self.bigram_frequencies.get((prev_word, word), 0)
        b = self.unigram_frequencies.get(prev_word, 0)

        if b == 0:
            return 0

        return float(a)/float(b)

    # The smoothed probability using add-k laplace smoothing
    def calculate_probability_smooth(self, prev_word, word, k, k_prev):
        """Return (count(prev, word) + k) / (count(prev) + k_prev + k * vocab_size).

        `k_prev` is the k chosen for the unigram model; it is added to the context count.
        """

        a = self.bigram_frequencies.get((prev_word, word), 0)
        b = self.unigram_frequencies.get(prev_word, 0)

        return (float(a) + float(k)) / (float(b) + float(k_prev) + k*self.vocab_size)

    # Builds the dictionary telling how many unique bigrams any given word completes
    def calculate_self_second_word(self,):
        # Start from scratch so that repeated initialisation does not double the counts.
        self.second_word_app.clear()
        for key in self.bigram_frequencies:
            self.second_word_app[key[1]] = self.second_word_app.get(key[1], 0) + 1

    # Builds the dictionary telling how many unique bigrams any given word starts
    def calculate_self_first_word(self,):
        # Start from scratch so that repeated initialisation does not double the counts.
        self.first_word_app.clear()
        for key in self.bigram_frequencies:
            # The running count must be read for the FIRST word (key[0]), the same
            # word whose entry is being written.
            self.first_word_app[key[0]] = self.first_word_app.get(key[0], 0) + 1

    def initialise_kneser_ney(self):
        """(Re)build the two type-count tables used by `kneser_ney_smoothing`."""
        self.calculate_self_second_word()
        self.calculate_self_first_word()

    # To calculate the continuation probability for a word
    def calculate_continuation_probability(self, word):
        """Return (#distinct bigrams ending in `word`) / (#distinct bigrams), 0 if none."""

        if word in self.second_word_app:
            return float(self.second_word_app[word]) / float(len(self.bigram_frequencies))
        return 0

    def kneser_ney_smoothing(self, prev_word, word, d = 0.75):
        """Return the Kneser-Ney style estimate of P(word | prev_word).

        d is the discounting factor, usually taken to be 0.75.

        Branches:
          * context never counted: back off to the continuation probability of `word`,
            or to d / corpus_size when `word` has none.  (The previous value for this
            case, bigram_count_1 / unigram_count_1, is not bounded by 1.)
          * bigram recorded: max(count - d, 0) / count(prev) + lambda * P_cont(word).
          * bigram not recorded, word known: max(count(word) - d, 0) / corpus_size
            + lambda * P_cont(word).
          * bigram not recorded, word unknown: d / corpus_size.
        where lambda = d * (#distinct bigrams starting with prev_word) / count(prev_word).
        """
        # Build the type-count tables on first use if the caller has not done so.
        if not self.second_word_app and self.bigram_frequencies:
            self.initialise_kneser_ney()

        context_count = self.unigram_frequencies.get(prev_word, 0)

        if context_count == 0:
            continuation = self.calculate_continuation_probability(word)
            if continuation > 0:
                return continuation
            return d / self.corpus_size

        lambda_val = d / context_count
        lambda_val *= self.first_word_app.get(prev_word, 0)

        if (prev_word, word) in self.bigram_frequencies:
            term1 = max(self.bigram_frequencies[(prev_word, word)] - d, 0)
            term1 /= context_count
            term_2 = lambda_val * self.calculate_continuation_probability(word)
        elif word in self.unigram_frequencies:
            term1 = max(self.unigram_frequencies[word] - d, 0)
            term1 /= self.corpus_size
            term_2 = lambda_val * self.calculate_continuation_probability(word)
        else:
            term1 = d/self.corpus_size
            term_2 = 0

        return term1 + term_2

**Trigram Class**

In [None]:
class trigram_model(bigram_model):
    """Trigram language model layered on the bigram counts of `bigram_model`.

    `sentences` is an iterable of token lists; boundary padding must already be present.
    """

    def __init__(self, sentences):
        """Count unigrams and bigrams (via the parent constructor) and token triples."""

        bigram_model.__init__(self, sentences) # Calling the parent class constructor to get the required count of the bigrams

        # Initializing the required data structures
        self.trigram_frequencies = {}   # (prev_word1, prev_word2, word) -> count
        self.total_trigrams = 0         # number of trigram tokens

        # The model starts training.
        # zip() yields nothing for sentences shorter than three tokens, so they are skipped.
        for sentence in sentences:
            for trigram in zip(sentence, sentence[1:], sentence[2:]):
                self.trigram_frequencies[trigram] = self.trigram_frequencies.get(trigram, 0) + 1
                self.total_trigrams += 1

        self.total_trigram_words = len(self.trigram_frequencies)

    def calculate_probability(self, prev_word1, prev_word2, word):
        """Return count(prev_word1, prev_word2, word) / count(prev_word1, prev_word2).

        Returns 0 when the two-word context was never seen.
        """

        trigram_frequency = self.trigram_frequencies.get((prev_word1, prev_word2, word), 0)
        # The conditioning event is the two-word context, not the trailing bigram
        # (prev_word2, word).
        context_frequency = self.bigram_frequencies.get((prev_word1, prev_word2), 0)

        if context_frequency == 0:
            return 0

        return float(trigram_frequency) / float(context_frequency)

    def calculate_probability_smooth(self, prev_word1, prev_word2, word , k, k_prev):
        """Return the add-k estimate

            (count(prev_word1, prev_word2, word) + k)
            / (count(prev_word1, prev_word2) + k_prev + k * vocab_size)

        `k_prev` is the k chosen for the bigram model; it is added to the context count.
        """

        trigram_frequency = self.trigram_frequencies.get((prev_word1, prev_word2, word), 0)
        context_frequency = self.bigram_frequencies.get((prev_word1, prev_word2), 0)

        return (float(trigram_frequency) + float(k)) / (float(context_frequency) + float(k_prev) + k*self.vocab_size)

**Quadgram Class**

In [None]:
class quadgram_model(trigram_model):
    """4-gram language model layered on the trigram counts of `trigram_model`."""

    def __init__(self, sentences):
        """Count everything the parent counts, plus every run of four adjacent tokens."""

        # The parent constructor supplies the trigram counts used as denominators.
        trigram_model.__init__(self, sentences)

        self.quadgram_frequencies = {}
        self.total_quadgrams = 0

        for sentence in sentences:
            # Sliding window over the three most recent tokens.
            history = (sentence[0], sentence[1], sentence[2])
            for word in sentence[3:]:
                quadgram = history + (word,)
                self.quadgram_frequencies[quadgram] = self.quadgram_frequencies.get(quadgram, 0) + 1
                history = history[1:] + (word,)
                self.total_quadgrams += 1

        self.total_quadgram_words = len(self.quadgram_frequencies)

    def calculate_probability(self, prev_word1, prev_word2, prev_word3, word):
        """Return count(4-gram) / count(3-word context), or 0 for an unseen context."""
        context = (prev_word1, prev_word2, prev_word3)
        quadgram_frequency = self.quadgram_frequencies.get(context + (word,), 0)
        trigram_frequency = self.trigram_frequencies.get(context, 0)

        if trigram_frequency != 0:
            return float(quadgram_frequency) / float(trigram_frequency)
        return 0

    def calculate_probability_smooth(self, prev_word1, prev_word2, prev_word3, word , k, k_prev):
        """Return (count(4-gram) + k) / (count(context) + k_prev + k * vocab_size).

        `k_prev` is the k chosen for the trigram model; it is added to the context count.
        """
        context = (prev_word1, prev_word2, prev_word3)
        quadgram_frequency = self.quadgram_frequencies.get(context + (word,), 0)
        trigram_frequency = self.trigram_frequencies.get(context, 0)

        numerator = float(quadgram_frequency) + float(k)
        denominator = float(trigram_frequency) + float(k_prev) + k*self.vocab_size
        return numerator / denominator

**Perplexities Without Smoothing**

**Unigram**

In [None]:
UNIGRAM_MODEL = unigram_model(train_unigram)

# Average, over the test sentences, of the per-sentence perplexity 2 ** (-(1/n) * sum log2 p).
# Note: a sentence skipped by the length guard is still counted in `count`.
count = 0
total_perplex = 0
for data in test_unigram:
    count += 1
    n = len(data)
    if n == 0:
        continue

    unigram_perplex = 0
    for word in data:
        unigram_perplex = unigram_perplex + np.log2(UNIGRAM_MODEL.calculate_probability(word))

    unigram_perplex = unigram_perplex * (-1/n)
    final_perplex = 2 ** unigram_perplex
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count

In [None]:
avg_perplex

**Bigram**

In [None]:
BIGRAM_MODEL = bigram_model(train_bigram)
total_perplex = 0
count = 0

# Average per-sentence perplexity of the unsmoothed bigram model.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_bigram:
    count += 1
    cross_entropy = 0
    n = len(data)
    if n < 2:
        continue

    # Walk over every adjacent (previous word, word) pair of the sentence
    for prev_word, word in zip(data, data[1:]):
        cross_entropy = cross_entropy + np.log2(BIGRAM_MODEL.calculate_probability(prev_word, word))

    cross_entropy = cross_entropy * (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count
avg_perplex

In [None]:
TRIGRAM_MODEL = trigram_model(train_trigram)
total_perplex = 0
count = 0

# Average per-sentence perplexity of the unsmoothed trigram model.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_trigram:
    count += 1
    cross_entropy = 0
    n = len(data)
    if n < 3:
        continue

    # Walk over every run of three adjacent tokens of the sentence
    for first, second, third in zip(data, data[1:], data[2:]):
        cross_entropy = cross_entropy + np.log2(TRIGRAM_MODEL.calculate_probability(first, second, third))

    cross_entropy = cross_entropy * (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count
avg_perplex

In [None]:
QUADGRAM_MODEL = quadgram_model(train_quadgram)
total_perplex = 0
count = 0

# Average per-sentence perplexity of the unsmoothed 4-gram model.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_quadgram:
    count += 1
    cross_entropy = 0
    n = len(data)
    if n < 4:
        continue

    # Walk over every run of four adjacent tokens of the sentence
    for quadgram in zip(data, data[1:], data[2:], data[3:]):
        cross_entropy = cross_entropy + np.log2(QUADGRAM_MODEL.calculate_probability(*quadgram))

    cross_entropy = cross_entropy * (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count
avg_perplex

**Perplexities With Smoothing**

**Unigram**

In [None]:
UNIGRAM_MODEL = unigram_model(train_unigram)

# Effective vocabulary: every test word the model has not seen is registered with a
# zero count and enlarges the vocabulary size by one.
for word in (token for data in test_unigram for token in data):
    if word in UNIGRAM_MODEL.vocabulary:
        continue
    UNIGRAM_MODEL.vocabulary.add(word)
    UNIGRAM_MODEL.vocab_size += 1
    UNIGRAM_MODEL.unigram_frequencies[word] = 0

In [None]:
# Perplexity expressed as a function of k so that an optimizer can minimise it
def uni(k):
    """Average per-sentence perplexity of UNIGRAM_MODEL on test_unigram with add-k smoothing.

    Sentences skipped by the length guard still count towards the number of sentences.
    """
    sentence_count = 0 # The total number of test sentences
    perplexity_sum = 0

    for data in test_unigram:
        sentence_count += 1
        length = len(data) # Length of the sentence
        if length == 0:
            continue

        log_prob = 0
        for word in data:
            log_prob = log_prob + np.log2(UNIGRAM_MODEL.calculate_probability_smooth(word , k))

        # Perplexity is computed as 2^(cross entropy) for numerical stability
        perplexity_sum = perplexity_sum + 2 ** (log_prob * (-1/float(length)))

    return perplexity_sum/float(sentence_count)

In [None]:
from scipy.optimize import minimize

In [None]:
k_uni, k_bi, k_tri, k_quad = 0, 0, 0, 0 # Initializing the best k for each n-gram

In [None]:
def callback(xk):
    """Print the value handed to the callback (passed as `callback=` to `minimize` in this notebook)."""
    print(xk)

In [None]:
result = minimize(uni , 1 , tol = 1e-5 , callback = callback, bounds = [(1 , 15)])

In [None]:
k_uni = 12.53008756

In [None]:
uni(k_uni)

**Bigram**

In [None]:
BIGRAM_MODEL = bigram_model(train_bigram)

# Effective vocabulary: unseen test words enlarge the vocabulary, and every unseen
# test bigram is inserted into the frequency table with a zero count.
for data in test_bigram:
    for word in data:
        if word in BIGRAM_MODEL.vocabulary:
            continue
        BIGRAM_MODEL.vocabulary.add(word)
        BIGRAM_MODEL.vocab_size += 1

    for bigram in zip(data, data[1:]):
        BIGRAM_MODEL.bigram_frequencies.setdefault(bigram, 0)

In [None]:
# Perplexity expressed as a function of k so that an optimizer can minimise it
def bi(k):
    """Average per-sentence perplexity of BIGRAM_MODEL on test_bigram with add-k smoothing.

    The module-level `k_uni` is passed as the context-count offset.  Sentences skipped
    by the length guard still count towards the number of sentences.
    """
    perplexity_sum = 0
    sentence_count = 0

    for data in test_bigram:
        sentence_count += 1
        length = len(data)
        if length < 2:
            continue

        log_prob = 0
        for prev_word, word in zip(data, data[1:]):
            log_prob = log_prob + np.log2(BIGRAM_MODEL.calculate_probability_smooth(prev_word, word , k , k_uni ))

        # Perplexity is computed as 2^(cross entropy) for numerical stability
        perplexity_sum = perplexity_sum + 2 ** (log_prob * (-1/length))

    return perplexity_sum/sentence_count

#### Bigram Using Kneser-Ney Smoothing

In [None]:
# A dedicated model object is used for Kneser-Ney so that BIGRAM_MODEL -- which the add-k
# objective `bi` reads when it is minimised in a later cell -- keeps the effective
# vocabulary that was added to it and is not replaced by a freshly trained model.
KN_BIGRAM_MODEL = bigram_model(train_bigram)
KN_BIGRAM_MODEL.initialise_kneser_ney()
total_perplex = 0
count = 0

# Average per-sentence perplexity under Kneser-Ney smoothing.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_bigram:

    count += 1
    cross_entropy = 0
    n = len(data)
    if n < 2:
        continue

    for prev_word, word in zip(data, data[1:]):
        cross_entropy += np.log2(KN_BIGRAM_MODEL.kneser_ney_smoothing(prev_word, word))

    cross_entropy *= (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex += final_perplex


avg_perplex = total_perplex/count
avg_perplex

In [None]:
result = minimize(bi , 1e-8 , tol = 1e-5 , callback = callback , bounds = [(1e-8 , 1)])

In [None]:
k_bi = 0.01292838

In [None]:
bi(k_bi) # Final optimized perplexity

**Trigram**

In [None]:
TRIGRAM_MODEL = trigram_model(train_trigram)

# Effective vocabulary: unseen test words enlarge the vocabulary, and every unseen
# test trigram is inserted into the frequency table with a zero count.
for data in test_trigram:
    for word in data:
        if word in TRIGRAM_MODEL.vocabulary:
            continue
        TRIGRAM_MODEL.vocabulary.add(word)
        TRIGRAM_MODEL.vocab_size += 1

    for trigram in zip(data, data[1:], data[2:]):
        TRIGRAM_MODEL.trigram_frequencies.setdefault(trigram, 0)

In [None]:
# Perplexity expressed as a function of k so that an optimizer can minimise it
def tri(k):
    """Average per-sentence perplexity of TRIGRAM_MODEL on test_trigram with add-k smoothing.

    The module-level `k_bi` is passed as the context-count offset.  Sentences skipped
    by the length guard still count towards the number of sentences.
    """
    perplexity_sum = 0
    sentence_count = 0

    for data in test_trigram:
        sentence_count += 1
        length = len(data)
        if length < 3:
            continue

        log_prob = 0
        for first, second, third in zip(data, data[1:], data[2:]):
            log_prob = log_prob + np.log2(TRIGRAM_MODEL.calculate_probability_smooth(first, second, third ,k , k_bi))

        # Perplexity is computed as 2^(cross entropy) for numerical stability
        perplexity_sum = perplexity_sum + 2 ** (log_prob * (-1/length))

    return perplexity_sum/sentence_count

In [None]:
result = minimize(tri , 0.001 , tol = 1e-5 , callback = callback , bounds = [(0.0001 , 1)])

In [None]:
k_tri = 0.00114886

In [None]:
tri(k_tri) #Final optimized perplexity

**Quadgram**

In [None]:
QUADGRAM_MODEL = quadgram_model(train_quadgram)

# Effective vocabulary: unseen test words enlarge the vocabulary, and every unseen
# test 4-gram is inserted into the frequency table with a zero count.
for data in test_quadgram:
    for word in data:
        if word in QUADGRAM_MODEL.vocabulary:
            continue
        QUADGRAM_MODEL.vocabulary.add(word)
        QUADGRAM_MODEL.vocab_size += 1

    for quadgram in zip(data, data[1:], data[2:], data[3:]):
        QUADGRAM_MODEL.quadgram_frequencies.setdefault(quadgram, 0)

In [None]:
def quad(k):
    """Average per-sentence perplexity of QUADGRAM_MODEL on test_quadgram with add-k smoothing.

    The module-level `k_tri` is passed as the context-count offset.  Sentences skipped
    by the length guard still count towards the number of sentences.
    """
    perplexity_sum = 0
    sentence_count = 0

    for data in test_quadgram:
        sentence_count += 1
        length = len(data)
        if length < 4:
            continue

        log_prob = 0
        for first, second, third, fourth in zip(data, data[1:], data[2:], data[3:]):
            prob = QUADGRAM_MODEL.calculate_probability_smooth(first, second, third, fourth , k, k_tri)
            log_prob = log_prob + np.log2(prob)

        # Perplexity is computed as 2^(cross entropy)
        perplexity_sum = perplexity_sum + 2 ** (log_prob * (-1/length))

    return perplexity_sum/sentence_count

In [None]:
result = minimize(quad , 0.001 , tol = 1e-5 , callback = callback , bounds = [(0.00001 , 1)])

In [None]:
k_quad = 0.00040418

In [None]:
quad(k_quad) # Final optimized perplexity

##### Perplexities using Good-Turing smoothing 

In [None]:
UNIGRAM_MODEL = unigram_model(train_unigram)

count = 0
total_perplex = 0
# Good-Turing adjusted counts are derived from the raw unigram counts
UNIGRAM_MODEL.calculate_new_word_count(UNIGRAM_MODEL.unigram_frequencies)

# Average per-sentence perplexity under Good-Turing smoothing.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_unigram:
    count += 1
    n = len(data)
    if n == 0:
        continue

    cross_entropy = 0
    for word in data:
        cross_entropy = cross_entropy + np.log2(UNIGRAM_MODEL.calculate_probability_good_smooth(word))

    cross_entropy = cross_entropy * (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count
avg_perplex

In [None]:
BIGRAM_MODEL = bigram_model(train_bigram)
total_perplex = 0
count = 0
# Good-Turing adjusted counts are derived from the raw bigram counts
BIGRAM_MODEL.calculate_new_word_count(BIGRAM_MODEL.bigram_frequencies)

# Average per-sentence perplexity under Good-Turing smoothing.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_bigram:
    count += 1
    cross_entropy = 0
    n = len(data)
    if n < 2:
        continue

    # Each bigram is looked up as a tuple key
    for bigram in zip(data, data[1:]):
        cross_entropy = cross_entropy + np.log2(BIGRAM_MODEL.calculate_probability_good_smooth(bigram))

    cross_entropy = cross_entropy * (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count
avg_perplex

In [None]:
TRIGRAM_MODEL = trigram_model(train_trigram)
total_perplex = 0
count = 0
# Good-Turing adjusted counts are derived from the raw trigram counts
TRIGRAM_MODEL.calculate_new_word_count(TRIGRAM_MODEL.trigram_frequencies)

# Average per-sentence perplexity under Good-Turing smoothing.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_trigram:
    count += 1
    cross_entropy = 0
    n = len(data)
    if n < 3:
        continue

    # Each trigram is looked up as a tuple key
    for trigram in zip(data, data[1:], data[2:]):
        cross_entropy = cross_entropy + np.log2(TRIGRAM_MODEL.calculate_probability_good_smooth(trigram))

    cross_entropy = cross_entropy * (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count
avg_perplex

In [None]:
QUADGRAM_MODEL = quadgram_model(train_quadgram)
total_perplex = 0
count = 0
# Good-Turing adjusted counts are derived from the raw 4-gram counts
QUADGRAM_MODEL.calculate_new_word_count(QUADGRAM_MODEL.quadgram_frequencies)

# Average per-sentence perplexity under Good-Turing smoothing.
# Note: a sentence skipped by the length guard is still counted in `count`.
for data in test_quadgram:
    count += 1
    cross_entropy = 0
    n = len(data)
    if n < 4:
        continue

    # Each 4-gram is looked up as a tuple key
    for quadgram in zip(data, data[1:], data[2:], data[3:]):
        cross_entropy = cross_entropy + np.log2(QUADGRAM_MODEL.calculate_probability_good_smooth(quadgram))

    cross_entropy = cross_entropy * (-1/n)
    final_perplex = 2 ** cross_entropy
    total_perplex = total_perplex + final_perplex

avg_perplex = total_perplex/count
avg_perplex