# Model

In [1]:
# Import Libraries

import re
import math 
import nltk
from progressbar import ProgressBar
from nltk.tokenize.regexp import WordPunctTokenizer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize 

pbar = ProgressBar()

In [2]:
# To load HPBook1
from google.colab import files
uploaded = files.upload()

Saving HPBook1.txt to HPBook1 (1).txt


In [3]:
# To load HPBook2
from google.colab import files
uploaded = files.upload()

Saving HPBook2.txt to HPBook2 (1).txt


In [4]:
# To load HPBook3
from google.colab import files
uploaded = files.upload()

Saving HPBook3.txt to HPBook3 (1).txt


In [2]:
def load_txt(file_name):
  '''load data from txt file and filter it'''
  
  file = open(file_name, 'r')
  lines = file.readlines()

  filtered_lines = []
  for i in range(1,len(lines)): 
    temp_line = lines[i].lower()
    temp_line = re.sub(r'[\\\'@]', '', temp_line)
    filtered_lines.append(temp_line)

  output = '.'.join(filtered_lines)

  return output

In [55]:
def create_bigrams(text):
  '''tokenizing the text and create bi-grams out of it'''
  
  tokenizer = WordPunctTokenizer()
  token = tokenizer.tokenize(text)
  bigrams = list(nltk.bigrams(token))
  #for i in range(len(token)-1):
   # for j in range(1, len(token)):
    #  bigrams.append(token[i].join(token[j]))

  return bigrams

In [4]:
def train_prob(tokens):
  '''Calculate train probabilites'''
  
  train_prob = {}
  total = len(tokens)

  freqDist = nltk.FreqDist(tokens)
  for key in freqDist:
      train_prob[key] = freqDist[key]/total

  return train_prob

In [5]:
def updated_prob(token_tr, token_ts):
  '''Calculate the number of unknown words'''
  new_words = []
  for i in range(len(token_ts)):
    if token_ts[i] not in token_tr:
      new_words.append(token_ts[i])
  return new_words

In [6]:
def laplace_smoothing(tokens, new_words_list, s):
  ''' update all the probabilities and then add new words to the list of training probability set for evaluation '''
  train_prob_updated = {}
  total = len(tokens) + s * len(new_words_list)

  freqDist = nltk.FreqDist(tokens)
  for key in freqDist:
      train_prob_updated[key] = (freqDist[key] + s )/total

  for new_word in new_words_list: 
    train_prob_updated[new_word] = s/total
  
  return train_prob_updated

In [7]:
def average_log_likelihood(tokens_ts, train_prob):
  "model evaluation"

  log_lik = 0
  for words in tokens_ts:
    log_lik = log_lik + math.log(train_prob[words]) 

  avg_log_lik = log_lik/len(tokens_ts) 
  return avg_log_lik

In [8]:
# Call each function one by one

text_tr = load_txt('HPBook1.txt')
text_ts_2 = load_txt('HPBook2.txt')
text_ts_3 = load_txt('HPBook3.txt')

In [56]:
bigrams_tr = create_bigrams(text_tr)

In [57]:
bigrams_ts_2 = create_bigrams(text_ts_2)

In [58]:
bigrams_ts_3 = create_bigrams(text_ts_3)

In [46]:
trained_values = train_prob(bigrams_tr)

In [59]:
new_words_ts_2 = updated_prob(bigrams_tr, bigrams_ts_2)
new_words_ts_3 = updated_prob(bigrams_tr, bigrams_ts_3)

In [None]:
train_prob_updated_ts_2 = laplace_smoothing(bigrams_tr, new_words_ts_2, 1)
train_prob_updated_ts_3 = laplace_smoothing(bigrams_tr, new_words_ts_3, 1)

In [42]:
print("Average Log-Likelihood for HPBOOK1: ", average_log_likelihood(bigrams_tr, trained_values))
print("Average Log-Likelihood for HPBOOK2: ", average_log_likelihood(bigrams_ts_2, train_prob_updated_ts_2))
print("Average Log-Likelihood for HPBOOK3: ", average_log_likelihood(bigrams_ts_3, train_prob_updated_ts_3))

Average Log-Likelihood for HPBOOK1:  -9.695762345323418
Average Log-Likelihood for HPBOOK2:  -10.199646703913503
Average Log-Likelihood for HPBOOK3:  -10.233108210174027


# Results Analysis

### Effect of 's' on Average Log Likelihood

In [21]:
import plotly.express as px
import pandas as pd

def effect_of_s(tokens_tr, new_words_ts_2, tokens_ts_2):
  '''Effect of change in "s" on Average Log-Likelihood (if any) '''

  avgLL_dict = {}
  for s in range(1, 50):
    temp_prob = laplace_smoothing(tokens_tr, new_words_ts_2, s)
    avgLL_dict[s] = average_log_likelihood(tokens_ts_2, temp_prob)
  return avgLL_dict


In [22]:
def plot_it(avgLL_dict_2, avgLL_dict_3):
  '''plot the variables'''

  df = pd.DataFrame()
  df['s'] = avgLL_dict_2.keys()
  df['Average Log Likelihood @2'] = avgLL_dict_2.values()
  df['Average Log Likelihood @3'] = avgLL_dict_3.values()
  df_long=pd.melt(df, id_vars=['s'], value_vars=['Average Log Likelihood @2', 'Average Log Likelihood @3'])

  fig = px.line(df_long, x='s', y='value', color='variable')
  # = px.line( x=df['s'], y=df.drop('s',  axis =1))
  fig.show()

In [24]:
avgLL_dict_2 = effect_of_s(bigrams_tr, new_words_ts_2, bigrams_ts_2)
avgLL_dict_3 = effect_of_s(bigrams_tr, new_words_ts_3, bigrams_ts_3)

plot_it(avgLL_dict_2, avgLL_dict_3)