**Lab1 Reminder: **
1. Use small data to develop your code.
2. Relink to the demomstration code can check the output.

In [1]:
!pip install rouge



In [2]:
# For debugging
import pdb

# For checking progress
from tqdm import tqdm

# For loading data
import pandas as pd

# For tokenizaton
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt')

# For building n-gram model
from collections import Counter, namedtuple
import numpy as np

# For evaluation 
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge 

# For pos tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Part 1. Data Preprocessing
1. show the top-10 common words and their counts before/after preprocessing




## Functions and Classes
*  Remove the punctuations
*  Lower the cases



In [3]:
def get_corpus():
  """ Reads and formats the corpus.

  Returns:
    corpus (list[str]):
      A list of sentences in the corpus.
  """
  df = pd.read_csv('https://raw.githubusercontent.com/yilihsu/NLP110/main/data_tiny.csv')
  corpus = df.content.to_list()
  return corpus

In [4]:
def preprocess(documents):
  """ Preprocesses the corpus.
  
  Args:
    documents (list[str]):
      A list of sentences in the corpus.
  Returns:
    cleaned_documents (list[str]):
      A list of cleaned sentences in the corpus.
  """
  cleaned_documents = []
  punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~”'''
  for doc in documents:
    # Tokenizes the sentence
    sents = sent_tokenize(doc)

    for sent in sents:
      # pdb.set_trace() # delete this line for the final version

      # Removes the punctuations [TODO]
      sent = sent.translate(str.maketrans('', '', punc))

      # Lowers the case
      sent = sent.lower() 
      
      cleaned_documents.append(sent)

  #print(cleaned_documents[:5])
  return cleaned_documents

In [5]:
# Compute word frequency
def get_vocab(documents):
  """ Gets the vocabulary from the corpus.
  
  Args:
    documents (list[str]):
      A list of sentences in the corpus
  Returns:
    vocabulary (collections.Counter)
  """
  vocabulary = Counter()

  for doc in tqdm(documents):
    tokens = word_tokenize(doc)
    vocabulary.update(tokens)

  return vocabulary

## Executions

### 1. Show the top-10 common words and their counts before/after preprocessing


In [6]:
# Read data
raw_documents = get_corpus()

# Build vocabulary
vocab = get_vocab(raw_documents).most_common(10)
print('\n Before preprocessing:', vocab)

# Build vocabulary after preprocessing
documents = preprocess(raw_documents)
vocab = get_vocab(documents).most_common(10)
print('\n After preprocesing:', vocab)

100%|██████████| 20000/20000 [00:05<00:00, 3394.65it/s]



 Before preprocessing: [('.', 17263), ('the', 9880), (',', 7788), ('to', 7003), ('!', 6642), ('a', 5590), ('is', 5102), ('?', 4640), ('and', 4573), ('you', 4448)]


100%|██████████| 35017/35017 [00:11<00:00, 2967.17it/s]


 After preprocesing: [('the', 11175), ('to', 7117), ('a', 5847), ('you', 5324), ('is', 5245), ('and', 5087), ('of', 4492), ('i', 3231), ('in', 3203), ('it', 3187)]





# Part 2. N-Gram Model and POS Tagging
1. Build 2-gram / 4-gram model by processed dataset
2. Show the top-5 probable next words and their probability after initial token ‘\<s\>’ by 2-gram model
3. Generate a sentence with 2-gram model and find the POS taggings
4. Generate a sentence with 4-gram model and find the POS taggings




## Functions and Classes

In [7]:
class Ngram_model(object):
  """ Ngram model implementation.

  Attributes:
    n (int):
      The number of grams to be considered.
    model (dict):
      The ngram model.
  """
  def __init__(self, documents, N=2):
    self.n = N
    self.model = self.get_ngram_model(documents)

  def get_ngram_model(self, documents):
    N = self.n
    ngram_model = dict()
    full_grams = list()
    grams = list()
    Word = namedtuple('Word', ['word', 'prob'])

    # for each sentence in documents [TODO]
    for sent in documents:
      
      # Tokenizes to words [TODO]
      words = nltk.word_tokenize(sent)
      
      # Append (N-1) start tokens '<s>' and an end token '<\s>' [TODO]
      tokens = []
      for i in range(N-1):
        tokens = tokens + ['<s>']
      tokens = tokens + words + ['<\s>']

      # Calculates numerator (construct list with full grams, i.e., N-grams) [TODO]
      for i in range(len(tokens)-N+1):
        gram = []
        for j in range(N):
          gram.append(tokens[i+j])
        full_grams.append(tuple(gram))

      # Calculate denominator (construct list with grams, i.e., (N-1)-grams) [TODO]
      for i in range(len(tokens)-N+2):
        gram = []
        for j in range(N-1):
          gram.append(tokens[i+j])
        grams.append(tuple(gram))

    # Count the occurence frequency of each gram
    # Take 2-gram model as example:
    #   full_grams -> list[('a', 'gram'),('other', 'gram'), ...]
    #   grams -> list[('a'), ('other'), ('gram'), ...]
    #   full_gram_counter -> dict{('a', 'gram'):frequency_1, ('other','gram'):frequency_2, ...}
    #   gram_counter -> dict{('a'):frequency_1, ('gram'):frequency_2, ...}
    full_gram_counter = Counter(full_grams)
    gram_counter = Counter(grams)

    # Build model
    # Take 2-gram model as example:
    #   { '<s>': [tuple(word='i', prob=0.6), tuple(word='the', prob=0.2), ...],
    #   'i': [tuple(word='am', prob=0.7), tuple(word='want', prob=0.1), ...],
    #    ... }
    for key in full_gram_counter:
      word = ''.join(key[:N-1])

      if word not in ngram_model:
        ngram_model.update({word: set()})

      # next_word_prob -> float
      next_word_prob = full_gram_counter[key] / gram_counter[key[:N-1]]
      w = Word(key[-1], next_word_prob)
      ngram_model[word].add(w)

    # Sort the result by frequency
    for word, ng in ngram_model.items():
      ngram_model[word] = sorted(ng, key=lambda x: x.prob, reverse=True)

    return ngram_model


  def predict_sent(self, text=None, max_len=30):
    """ Predicts a sentence with the ngram model.

    Args:
      text (string or list[string])
    Returns:
      A prediction string.
    """

    N = self.n
    backup_tokens = ['<s>']*(N-1)
    if not text:
      tokens = backup_tokens
      output = []

    elif type(text)==str:
      tokens = backup_tokens + text.split(' ')
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return 
      output = tokens

    elif type(text) == list:
      tokens = backup_tokens + text
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return
      output = tokens

    else:
      print('[Error] the input text must be string or list of string')
      return

    for i in range(max_len):
      possible_words = list(self.model[''.join(tokens)])
      probs = [word.prob for word in possible_words]
      words = [word.word for word in possible_words]
      next_word = np.random.choice(words, 1, p=probs)[0]
      tokens = tokens[1:] + [next_word]

      if next_word == '<\\s>':
        break

      output.append(next_word)
    return ' '.join(output)

  def predict_next(self, text=None, top=5):
    """ Predicts next word with the ngram model.

    Args:
      text (string or list[string])

    Returns:
      possible_next_words (list[namedtuple]):
        A list of top few possible next words.
    """

    N = self.n
    backup_tokens = ['<s>']*(N-1)
    if not text:
      tokens = backup_tokens

    elif type(text)==str:
      tokens = backup_tokens + text.split(' ')
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return 

    elif type(text) == list:
      tokens = backup_tokens + text
      tokens = tokens[-(N-1):]
      if not self.check_existence(tokens):
        return
    else:
      print('[Error] the input text must be string or list of string')

    possible_next_words = self.model[''.join(tokens)][:top]
    possible_next_words = [(word.word, word.prob) for word in possible_next_words]

    return possible_next_words

  def check_existence(self, tokens):
    if not ''.join(tokens) in self.model.keys():
      print('[Error] the input text {} not in the vocabulary'.format(tokens))
      return False
    else:
      return True

## Executions

### 1. Build 2-gram/4-gram model by processed dataset

In [8]:
twogram = Ngram_model(documents, N=2)

### 2. Show the top-5 probable next words and their probability after initial token \'\<s\>\'  by 2-gram model

In [9]:
output = twogram.predict_next(text='<s>', top=5)
print('Next word predictions of two gram model:', output)

Next word predictions of two gram model: [('i', 0.052745809178399064), ('the', 0.030984950167061712), ('you', 0.0302710112231202), ('<\\s>', 0.030013993203301254), ('they', 0.01976182996830111)]


### 3. Generate a sentence with 2-gram model and find the POS taggings



In [10]:
output = twogram.predict_sent(max_len=30)
print('Generation results of two gram model:', output)
nltk.pos_tag(word_tokenize(output))

Generation results of two gram model: i hear


[('i', 'NN'), ('hear', 'VBP')]

In [11]:
def evaluation(generated_sentence, reference_sentence):

  if len(generated_sentence) <= 1: 
    raise RuntimeError('Not enough length to evaluate, please try again with another generation.')

  rouge = Rouge()
  smoothie = SmoothingFunction().method4
  
  # Hint: please refer to the import function in the beginning of this notebook
  # Given the smoothing_function=smoothie, weights=(1,0,0,0), please calculate BLEU-1 score with function call
  # [TODO]
  bleu_score = nltk.translate.bleu_score.sentence_bleu(reference_sentence, generated_sentence, weights=(1, 0, 0, 0), smoothing_function=smoothie)

  # Calculates ROUGE-1 f score with function call
  # [TODO]
  rouge_score = rouge.get_scores(generated_sentence, reference_sentence)
  rouge1_f = rouge_score[0]['rouge-1']['f']
  
  return bleu_score, rouge1_f

In [15]:
given_text = "today is"
references = ['is', 'a', 'party', 'day']
output = twogram.predict_sent(text=given_text, max_len=30)
output = word_tokenize(output)
print("Generated results given the text: ", output)
print("Reference sentence: ", references)

bleu_score, rouge_score = evaluation(output, references)
print("bleu score: ", bleu_score)
print("rouge score: ", rouge_score)

Generated results given the text:  ['is', 'an', 'indian', 'politics']
Reference sentence:  ['is', 'a', 'party', 'day']
bleu score:  0
rouge score:  0.999999995
