In [None]:
#Author: Wei-Cheng Chen
#Student ID: 0816065
#HW ID: Hw3
#Due Date: 05/25/2022

In [2]:
# For debugging
import pdb

# For checking progress
from tqdm import tqdm

# For loading data
import pandas as pd

# For tokenization
import nltk
from nltk import word_tokenize, sent_tokenize, ne_chunk, pos_tag
from nltk.corpus import wordnet as wn
nltk.download('punkt')

# For building n-gram model
from collections import Counter, namedtuple
import numpy as np

# For evaluation 
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# For pos tagging
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jotpc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jotpc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\jotpc\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\jotpc\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
import arpa
# Load the ARPA-format language model file from the working directory.
# The result is indexed below, so only its first entry is kept as `model`
# (the object whose log_s() is used for sentence scoring later on).
a = arpa.loadf("3-gram.arpa")
model = a[0]
# Display the per-order n-gram counts as a quick sanity check of the load.
model.counts()

[(1, 200003), (2, 38229161), (3, 49712290)]

In [4]:
def get_corpus():
  """Download the tiny corpus CSV and return its `content` column as a list."""
  url = 'https://raw.githubusercontent.com/yilihsu/NLP110/main/data_tiny.csv'
  frame = pd.read_csv(url)
  return frame.content.to_list()

def preprocess(documents):
  """Split documents into sentences, blank out punctuation and lower-case.

  Args:
    documents: iterable of raw text strings.

  Returns:
    A flat list with one cleaned string per sentence found by
    `sent_tokenize`. Every punctuation character listed in `punc` is replaced
    by a single space (not removed), so word boundaries are preserved and
    runs of spaces may appear; `word_tokenize` downstream collapses them.
  """
  # Raw string: the backslash is a literal character to strip, not an escape.
  # (The non-raw form relied on the invalid escape sequence "\,".)
  punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~”'''
  # One translation table instead of one str.replace pass per character.
  to_space = str.maketrans(punc, ' ' * len(punc))

  cleaned_documents = []
  for doc in documents:
    for sent in sent_tokenize(doc):
      cleaned_documents.append(sent.translate(to_space).lower())

  return cleaned_documents

# Count how often each token occurs across all documents
def get_vocab(documents):
  """Return a Counter of `word_tokenize` tokens over `documents`.

  Progress is reported through `tqdm`, one tick per document.
  """
  counts = Counter()
  for text in tqdm(documents):
    counts.update(word_tokenize(text))
  return counts

In [7]:
import os
import json
import pandas as pd

# Cloze inputs keyed by "<source>_<blank index>":
#   data[key]   -> cleaned last sentence of the text preceding that blank
#   option[key] -> the candidate answers listed for that blank
data = dict()
option = dict()

test_dir = os.path.join(os.getcwd(), "test")
for j in os.listdir(test_dir):
    if "json" not in j:
        continue

    # `with` guarantees the handle is released; the previous `f.close`
    # (without parentheses) never actually closed the file.
    with open(os.path.join(test_dir, j)) as f:
        info = json.load(f)

    # Prefix a marker so the chunk before a leading blank is never empty.
    # NOTE: preprocess() replaces '<' and '>' with spaces, so the marker
    # reaches the n-gram models as the plain token "s".
    raw = "<s> " + info['article']

    # Splitting on ' _' yields the text before each blank; the final piece
    # (text after the last blank) has no blank to predict and is dropped.
    for i, sent in enumerate(raw.split(' _')[:-1]):
        key = info['source'] + '_' + str(i)
        # Keep only the last cleaned sentence: the immediate left context.
        sent = preprocess(sent_tokenize(sent))[-1]
        data[key] = sent
        option[key] = info['options'][key]

   
# for i, sent in list(enumerate(info['article'].split(' _ ')[:-2])) :
#     sent = preprocess(sent_tokenize(sent))[-1]
#     # print(ne_chunk(pos_tag(word_tokenize(sent))))
#     # print(info['source']+'_'+str(i), sent, info['options'][info['source']+'_'+str(i)] )
#     data[info['source']+'_'+str(i)] = list(sent, info['options'][info['source']+'_'+str(i)])


In [None]:
def _report_top_words(label, docs, k=10):
  """Print and return the `k` most frequent tokens of `docs` under `label`."""
  top = get_vocab(docs).most_common(k)
  print(label, top)
  return top


# Load the raw corpus
raw_documents = get_corpus()

# Most frequent tokens before any cleaning
vocab = _report_top_words('\n Before preprocessing:', raw_documents)

# Clean the corpus (sentence split, punctuation blanked, lower-cased),
# then report the most frequent tokens again for comparison
documents = preprocess(raw_documents)
vocab = _report_top_words('\n After preprocesing:', documents)



In [10]:
class Ngram_model(object):
  """Maximum-likelihood N-gram language model built from sentences.

  `self.model` maps a context key (the N-1 preceding tokens joined by a
  single space) to a list of `Word(word, prob)` entries sorted by descending
  probability. Joining with a space keeps distinct contexts distinct: a bare
  concatenation would merge e.g. ("ab", "c") and ("a", "bc") into one entry.
  """

  BOS = '<s>'    # sentence-start marker
  EOS = '<\\s>'  # sentence-end marker: the characters <, backslash, s, >

  def __init__(self, documents, N=2):
    """Build an N-gram model from `documents` (an iterable of sentences)."""
    self.n = N
    self.model = self.get_ngram_model(documents)

  @staticmethod
  def _context_key(tokens):
    """Return the dictionary key used for a sequence of context tokens."""
    return ' '.join(tokens)

  def _window(self, tokens):
    """Return the last N-1 items of `tokens` (an empty list when N == 1)."""
    return tokens[len(tokens) - (self.n - 1):]

  def _prepare_context(self, text):
    """Turn user input into a known context window.

    Args:
      text: None/empty, a space-separated string, or a list of tokens.

    Returns:
      The last N-1 tokens (left-padded with BOS when the input is shorter),
      or None when the input type is unsupported or the context never
      occurred in the training sentences.
    """
    if not text:
      words = []
    elif isinstance(text, str):
      words = text.split(' ')
    elif isinstance(text, list):
      words = text
    else:
      print('[Error] the input text must be string or list of string')
      return None

    tokens = self._window([self.BOS] * (self.n - 1) + list(words))
    if not self.check_existence(tokens):
      return None
    return tokens

  def get_ngram_model(self, documents):
    """Count N-grams in `documents` and convert them to probabilities.

    Each sentence is tokenised with `word_tokenize`, left-padded with
    max(N-1, 1) BOS markers and terminated with one EOS marker. Padding with
    N-1 markers (rather than a single one) creates the all-BOS context that
    the predict methods look up for empty or short prompts; every n-gram
    produced with single-marker padding is still produced with unchanged
    counts.

    Returns:
      dict mapping context key -> list of Word(word, prob), most probable
      first. Ties keep first-seen order, so results are reproducible.
    """
    N = self.n
    Word = namedtuple('Word', ['word', 'prob'])
    padding = [self.BOS] * max(N - 1, 1)

    full_gram_counter = Counter()
    for sentence in documents:
      tokenised = padding + word_tokenize(sentence) + [self.EOS]
      full_gram_counter.update(
          tuple(tokenised[i:i + N]) for i in range(len(tokenised) - N + 1))

    # A context's total is the sum of the n-grams that extend it, so the
    # successor probabilities of every context sum to one.
    context_counter = Counter()
    for gram, count in full_gram_counter.items():
      context_counter[gram[:N - 1]] += count

    successors = dict()
    for gram, count in full_gram_counter.items():
      context = gram[:N - 1]
      entry = Word(gram[-1], count / context_counter[context])
      successors.setdefault(self._context_key(context), []).append(entry)

    return {
        key: sorted(entries, key=lambda x: x.prob, reverse=True)
        for key, entries in successors.items()
    }

  def predict_sent(self, text=None, max_len=30):
    """Sample a continuation of `text` word by word.

    Args:
      text: optional prompt (string or list of tokens).
      max_len: maximum number of words to sample.

    Returns:
      The generated words joined by spaces. When a prompt is given the
      result starts with the N-1 token context window that was actually
      used (including BOS padding for short prompts). Returns None when the
      prompt type is unsupported or its context is unknown.
    """
    tokens = self._prepare_context(text)
    if tokens is None:
      return None

    # Copy so that appending generated words never touches `tokens`.
    output = list(tokens) if text else []

    for _ in range(max_len):
      candidates = self.model[self._context_key(tokens)]
      words = [entry.word for entry in candidates]
      probs = [entry.prob for entry in candidates]
      next_word = np.random.choice(words, 1, p=probs)[0]
      # Slide the context window forward by one token.
      tokens = self._window(tokens + [next_word])

      if next_word == self.EOS:
        break

      output.append(next_word)
    return ' '.join(output)

  def predict_next(self, text=None, top=5):
    """Return the most probable next words after `text`.

    Args:
      text: optional context (string or list of tokens).
      top: number of candidates to return; None or a negative value returns
        every candidate.

    Returns:
      List of (word, probability) tuples, most probable first, or None when
      the input type is unsupported or the context is unknown. (An
      unsupported type used to fall through to an undefined variable.)
    """
    tokens = self._prepare_context(text)
    if tokens is None:
      return None

    ranked = self.model[self._context_key(tokens)]
    if top is not None and top >= 0:
      ranked = ranked[:top]

    return [(word.word, word.prob) for word in ranked]

  def check_existence(self, tokens):
    """Return True when the context `tokens` was seen during training."""
    return self._context_key(tokens) in self.model

In [11]:
# Train bigram, trigram and 4-gram models, in that order, on the same
# preprocessed sentences.
twogram, threegram, fourgram = (
    Ngram_model(documents, N=order) for order in (2, 3, 4)
)

In [21]:
import random
opt = ["A", "B", "C", "D"]
predict = dict()
def ClozeTest(id) :
    """Pick an answer letter for blank `id` using the 2/3/4-gram models.

    The cleaned left context `data[id]` is tokenised. Each model that has
    enough context words proposes its most probable next word that is also
    one of the listed options `option[id]`. Among those proposals the one
    with the highest probability wins.

    Args:
        id: key into `data` / `option`.

    Returns:
        The letter from `opt` at the winning option's position, or None when
        the context is empty or no model proposes any of the options.
    """
    word = word_tokenize(data[id])
    if len(word) == 0 :
        return None

    choices = option[id]

    # (model, context) pairs, lowest order first. A model is consulted only
    # when the sentence supplies its full N-1 word context.
    attempts = [(twogram, word[-1])]
    if len(word) > 1 :
        attempts.append((threegram, word[-2:]))
    if len(word) > 2 :
        attempts.append((fourgram, word[-3:]))

    candidate = []
    for lm, context in attempts :
        # top=None requests the complete ranking; top=-1 silently dropped the
        # last (possibly the only matching) entry.
        ranked = lm.predict_next(text=context, top=None)
        if ranked is None :
            continue
        for entry in ranked :
            # The ranking is sorted, so the first hit is this model's best option.
            if entry[0] in choices :
                candidate.append(entry)
                break

    if not candidate :
        return None

    # Choose the MOST probable proposal (ascending sort + [0] chose the least
    # probable one). On ties the lower-order model's proposal wins.
    best_word = max(candidate, key = lambda s: s[1])[0]
    return opt[choices.index(best_word)]

# for id in data.keys() :
#     ClozeTest(id)

In [None]:
import numpy as np
import random

# Answer letter chosen for each blank id; filled in by comb().
model_predict = {}

def preprocess_model(documents):
  """Split documents into sentences, blank out punctuation and UPPER-case.

  Variant of `preprocess` for the ARPA model input: the text is upper-cased
  and the underscore is NOT in the punctuation list, so "_" blank markers
  survive the cleaning.

  Args:
    documents: iterable of raw text strings.

  Returns:
    A flat list with one cleaned, upper-cased string per sentence found by
    `sent_tokenize`. Each listed punctuation character is replaced by a
    single space.
  """
  # Raw string: the backslash is a literal character to strip, not an escape.
  # (The non-raw form relied on the invalid escape sequence "\,".)
  punc = r'''!()-[]{};:'"\,<>./?@#$%^&*~”'''
  # One translation table instead of one str.replace pass per character.
  to_space = str.maketrans(punc, ' ' * len(punc))

  cleaned_documents = []
  for doc in documents:
    for sent in sent_tokenize(doc):
      cleaned_documents.append(sent.translate(to_space).upper())

  return cleaned_documents

def comb(opt, sent, id):
    """Choose an answer for one blank by combining the ARPA and n-gram models.

    Each candidate word in `opt` is upper-cased and substituted for the "_"
    marker(s) in `sent`; the resulting sentence is scored with
    `model.log_s`. The best-scoring candidate is then reconciled with the
    n-gram answer from `ClozeTest(id)`:

      * no candidate could be scored -> use the n-gram answer, or a random
        letter when that is missing too;
      * both models agree, or the n-gram model has no answer -> ARPA answer;
      * the models disagree -> pick one of the two answers at random.

    Previously `opt` was overwritten with the letters A-D, so the sentences
    scored contained the literal letters instead of the candidate words.

    Args:
        opt: candidate words for the blank, positionally matching A, B, C, D.
        sent: cleaned sentence containing the "_" marker. Every "_" in it is
            replaced by the same candidate.
        id: blank identifier, used as the key in `model_predict` and passed
            to `ClozeTest`.

    Returns:
        The chosen letter, which is also stored in `model_predict[id]`.
    """
    letters = ["A", "B", "C", "D"]
    failed = float("-inf")  # score of a candidate the model could not evaluate

    score = []
    for word in opt[:len(letters)]:
        sent_c = " ".join(word_tokenize(sent.replace("_", word.upper())))
        try:
            score.append(model.log_s(sent_c))
        except Exception:
            # Best effort: a sentence the model cannot score must not abort
            # the run; it simply can never be the best candidate.
            score.append(failed)

    ngram = ClozeTest(id)

    if not score or max(score) == failed:
        choice = ngram if ngram is not None else random.choice(letters)
    else:
        best = letters[int(np.argmax(score))]
        if ngram is not None and ngram != best:
            choice = random.choice([best, ngram])
        else:
            choice = best

    model_predict[id] = choice
    print(choice, score)
    return choice



# Score every blank of every JSON file in ./test with comb().
test_dir = os.path.join(os.getcwd(), "test")

proc = 0  # number of JSON files processed so far
for j in os.listdir(test_dir):
    if "json" not in j:
        continue
    proc += 1
    if proc % 100 == 0:
        print(proc)

    # `with` guarantees the handle is released; the previous `f.close`
    # (without parentheses) never actually closed the file.
    with open(os.path.join(test_dir, j)) as f:
        info = json.load(f)

    # Keep only sentences that contain at least one blank marker.
    sent = list(filter(lambda x: "_" in x, sent_tokenize(info['article'])))
    pm_s = preprocess_model(sent)

    # `blank_idx` walks pm_s and doubles as the blank number in the option
    # keys ("<source>_<n>"). A sentence holding k blanks is duplicated in
    # place so it occupies k consecutive slots, one comb() call per blank.
    blank_idx = -1
    while blank_idx < len(pm_s) - 1:
        blank_idx += 1
        for i in range(1, pm_s[blank_idx].count("_")):
            key = info['source'] + '_' + str(blank_idx)
            comb(info['options'][key], pm_s[blank_idx], key)
            pm_s.insert(blank_idx + 1, pm_s[blank_idx])
            blank_idx += 1

        # Last (or only) blank of the sentence; skipped when more slots were
        # found than option sets exist.
        if blank_idx < len(info['options']):
            key = info['source'] + '_' + str(blank_idx)
            comb(info['options'][key], pm_s[blank_idx], key)

In [None]:
# Write the n-gram-only predictions as (id, label) rows.
# NOTE(review): no visible cell appears to fill `predict` (the loop calling
# ClozeTest is commented out) - confirm this export is still wanted.
df_predict = pd.DataFrame(
    [(key, label) for key, label in predict.items()],
    columns=['id', 'label'],
)
df_predict.to_csv("0816065_1.csv", index=False)

In [79]:
# Demo: ten most likely words after "she said" according to the trigram model.
output = threegram.predict_next(text=['she', "said"], top=10)
print('Next word predictions of three gram model:\n', "[ She said ]")
# predict_next returns None when the context never occurred in training.
for i in output or []:
    print(i)

Next word predictions of two gram model:
 [ She said ]
('it', 0.1111111111111111)
('that', 0.1111111111111111)
('she', 0.1111111111111111)
('and', 0.05555555555555555)
('about', 0.05555555555555555)
('today', 0.05555555555555555)
('smart', 0.05555555555555555)
('come', 0.05555555555555555)
('achche', 0.05555555555555555)
('sorry', 0.05555555555555555)


In [71]:
# Write the combined ARPA / n-gram predictions as (id, label) rows.
df_mpredict = pd.DataFrame(
    [(key, label) for key, label in model_predict.items()],
    columns=['id', 'label'],
)
df_mpredict.to_csv("0816065_model_hybra.csv", index=False)