In [None]:
import pandas as pd
import numpy as np
import nltk
import torch
nltk.download("stopwords")
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter


In [None]:
#print(torch.__version__)
#print("MPS available:", torch.backends.mps.is_available())
#print("MPS built:", torch.backends.mps.is_built())

In [None]:
#Testing Ground
# Preview the first two comments of the training file wrapped in the
# sentence-boundary markers used by the counting functions.
# The context manager guarantees the file handle is closed even if reading
# fails (the original left the handle open for the lifetime of the kernel).
with open( "train.txt", "r" ) as file_object:
  test_lines = file_object.readlines()[:2]
result = []
for comment in test_lines:
  comment = comment.strip()  # drop the trailing newline / surrounding whitespace
  comment = "<s> " + comment + " </s>"
  print(comment)
    

# print(test_line)
# test_lines = sent_tokenize(test_line)
# test = Counter()
# print(test_lines)
# for line in test_lines:
#   line = "<s> " + line + " </s>"
#   print(line)
#   line = line.split()
#   for word in line:
#     test[word] += 1
# print(test)

In [None]:
def collect_unigram(filename):
    """Count every whitespace-separated token in ``filename``.

    Each line of the file is treated as one comment (the original author notes
    the file is already split that way) and is wrapped in the ``<s>`` / ``</s>``
    boundary markers before counting, so both markers are counted once per line,
    including blank lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``Counter`` mapping token -> number of occurrences.
    """
    token_counts = Counter()

    with open(filename, "r") as corpus:
        for raw_line in corpus:
            # str.split() with no argument already discards leading/trailing
            # whitespace, so the boundary markers can be attached as list items.
            token_counts.update(["<s>", *raw_line.split(), "</s>"])

    return token_counts

def collect_bigram(filename):
    """Count every pair of adjacent tokens in ``filename``.

    Each line is wrapped in the ``<s>`` / ``</s>`` boundary markers and split on
    whitespace; bigrams never span two lines. A blank line therefore contributes
    the single pair ``("<s>", "</s>")``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``Counter`` mapping ``(first_word, second_word)`` -> number of
        occurrences.
    """
    pair_counts = Counter()

    with open(filename, "r") as corpus:
        for raw_line in corpus:
            tokens = ["<s>", *raw_line.split(), "</s>"]
            # Zipping the sequence with itself shifted by one yields each
            # consecutive pair; the last token has no successor and is skipped.
            pair_counts.update(zip(tokens, tokens[1:]))

    return pair_counts

# Build the raw (pre-<UNK>) count tables for the training corpus once so the
# exploratory cells below can reuse them. The two reads are independent.
train_bigram_counter = collect_bigram("train.txt")
train_unigram_counter = collect_unigram("train.txt")

In [None]:
#Testing Ground


# Total token count of the training corpus. It is printed explicitly because a
# notebook only displays a bare expression when it is the last one in the cell;
# the original computed this sum and silently discarded it.
print(sum(train_unigram_counter.values()))
#print(unigram_counter['I'])
# Show only the most frequent bigrams instead of dumping the entire counter,
# which holds one entry per distinct bigram in the corpus.
print(train_bigram_counter.most_common(10))
#for (w1, w2), count in bigram_counter.items():
  #print(count)
print(train_bigram_counter[('.', 'The')])

In [None]:
class NGramModel:
  """Maximum-likelihood unigram/bigram model built from pre-computed counts.

  Attributes:
    unigram_counts: Mapping word -> count, stored exactly as passed in.
    bigram_counts: Mapping (first_word, second_word) -> count, or None for a
      unigram-only model; stored exactly as passed in.
    total_unigram_count: Sum of all unigram counts.
    bigram_context_counter: Counter mapping first_word -> total count of
      bigrams that start with it. Always present; empty when no bigram counts
      were supplied.
  """

  def __init__(self, unigram_counts, bigram_counts=None):
    """Store the count tables and derive the totals used as denominators.

    Args:
      unigram_counts: Mapping word -> count (a Counter or a plain dict).
      bigram_counts: Optional mapping (first_word, second_word) -> count.
    """
    self.unigram_counts = unigram_counts
    self.bigram_counts = bigram_counts
    self.total_unigram_count = sum(self.unigram_counts.values())

    # Created unconditionally so the bigram methods work (and report "unseen")
    # on a unigram-only model instead of raising AttributeError.
    self.bigram_context_counter = Counter()
    if bigram_counts:
      # Each key is a (first_word, second_word) tuple; summing over the second
      # word gives how often first_word occurs as a context. This differs from
      # the unigram count for any word that ends a sequence.
      for (word_1, _word_2), count in bigram_counts.items():
        self.bigram_context_counter[word_1] += count

  def unigram_probability(self, word):
    """Return count(word) / total tokens; 0.0 when the model holds no tokens."""
    if self.total_unigram_count == 0:
      return 0.0
    # .get() keeps unseen words at 0 for plain dicts as well as Counters.
    return self.unigram_counts.get(word, 0) / self.total_unigram_count

  def bigram_probability(self, ask, given):
    """Return P(ask | given) = count(given, ask) / count(given, *).

    Prints "Unseen Before" and returns 0 when ``given`` never occurs as the
    first word of a bigram (including when the model has no bigram counts).
    """
    context_count = self.bigram_context_counter[given]
    if context_count == 0:
      print("Unseen Before")
      return 0 # Unseen before
    # A non-zero context count implies bigram_counts was supplied.
    return self.bigram_counts.get((given, ask), 0) / context_count

  def laplace_smoothing(self, w1, w2=None): #Add 1 Method
    """Return the add-one smoothed probability.

    With only ``w1`` this is the smoothed unigram probability of ``w1``. With
    both words it is the smoothed P(w2 | w1). The vocabulary size is the
    number of distinct keys in ``unigram_counts``.

    Returns 0.0 when the denominator is zero (a completely empty model).
    """
    v_size = len(self.unigram_counts)
    if w2 is None:  # Unigram
      numerator = self.unigram_counts.get(w1, 0) + 1
      denominator = self.total_unigram_count + v_size
    else:  # Bigram
      # Treat a unigram-only model as having seen no bigrams at all.
      pair_count = self.bigram_counts.get((w1, w2), 0) if self.bigram_counts else 0
      numerator = pair_count + 1
      denominator = self.bigram_context_counter[w1] + v_size
    return numerator / denominator if denominator else 0.0


In [None]:
# Sanity check: compare how often "I" appears as a token with how often it
# starts a bigram in the training counts.
tester = NGramModel(
  unigram_counts=train_unigram_counter,
  bigram_counts=train_bigram_counter,
)
print(
  tester.unigram_counts['I'],
  tester.bigram_context_counter["I"],
)

In [None]:
# Worked example on the small example.txt file: build both count tables, wrap
# them in a model, and print the raw tables followed by a few probabilities.
example_bi = collect_bigram("example.txt")
example_uni = collect_unigram("example.txt")
example_tester = NGramModel(unigram_counts=example_uni, bigram_counts=example_bi)
print(example_bi)
print(example_tester.bigram_context_counter) #
print(example_tester.unigram_counts)
print(
  example_tester.unigram_probability("the"),
  example_tester.unigram_probability("like"),
)
# Author's expected values for the two bigram probabilities: 1.0, 0.5
print(
  example_tester.bigram_probability(ask="the", given="like"),
  example_tester.bigram_probability("students", "the"),
)


In [None]:
def handle_unknown_words(word_counts, bigram_counts, threshold=5):
  """Fold rare words into a single '<UNK>' token in both count tables.

  A word stays in the vocabulary when its count is strictly greater than
  ``threshold`` or when it is one of the special tokens '<s>', '</s>', '<UNK>'.
  Every other word (count <= threshold) has its count added to '<UNK>', and is
  replaced by '<UNK>' in every bigram it appears in.

  NOTE(review): the original inline comments describe the rule as
  "frequency < threshold -> <UNK>", while the comparison keeps only
  ``count > threshold``; the comparison is left as written. Confirm which
  boundary is intended.

  Args:
    word_counts: Mapping word -> count.
    bigram_counts: Mapping (first_word, second_word) -> count.
    threshold: Words with a count at or below this value become '<UNK>'.

  Returns:
    A tuple ``(vocab, new_counter, processed_bigram)``: the set of kept words
    (always including the three special tokens), a Counter of unigram counts
    over that vocabulary (always containing a '<UNK>' entry, possibly 0), and a
    Counter of bigram counts with rare words replaced by '<UNK>'.
  """
  # This must be an assignment. The original wrote ``special_token: {...}``,
  # which is only an annotation and leaves the name unbound, so the function
  # raised UnboundLocalError on the next line.
  special_token = {'<s>', '</s>', '<UNK>'}
  vocab = set(special_token)

  new_counter = Counter()
  new_counter["<UNK>"] = 0  # keep the entry present even when nothing is rare
  for word, count in word_counts.items():
    if count > threshold or word in special_token: # In case the file has less than threshold tokens.
      vocab.add(word)
      # Accumulate rather than assign: if the corpus itself contains a literal
      # '<UNK>' token, its count must be added to the folded rare-word mass
      # instead of overwriting it.
      new_counter[word] += count
    else:
      new_counter["<UNK>"] += count

  processed_bigram = Counter()
  for (w1, w2), count in bigram_counts.items():
    new_w1 = w1 if w1 in vocab else '<UNK>'
    new_w2 = w2 if w2 in vocab else '<UNK>'
    # Distinct rare words collapse onto the same key, so counts are summed.
    processed_bigram[(new_w1, new_w2)] += count

  return vocab, new_counter, processed_bigram

In [None]:
def train_model(train_file, threshold = 5):
  """Build an NGramModel from ``train_file`` with rare words mapped to <UNK>.

  Args:
    train_file: Path of the training text, passed to the counting functions.
    threshold: Rare-word cutoff forwarded to ``handle_unknown_words``.

  Returns:
    An ``NGramModel`` built from the processed counts, with the resulting
    vocabulary attached as ``model.vocab``.
  """
  # Raw frequency tables: word -> count and (given, ask) -> count.
  raw_unigrams = collect_unigram(train_file)
  raw_bigrams = collect_bigram(train_file)

  # Replace rare words in both tables before building the model.
  vocab, unk_unigrams, unk_bigrams = handle_unknown_words(raw_unigrams, raw_bigrams, threshold)

  model = NGramModel(unk_unigrams, unk_bigrams)
  model.vocab = vocab
  return model