**Lab1 Reminder:**
1. Use small data to develop your code.
2. Refer back to the demonstration code to check the expected output.

In [1]:
# For debugging
import pdb

# For checking progress
from tqdm import tqdm

# For loading data
import pandas as pd

# For tokenization
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt')

# For building n-gram model
from collections import Counter, namedtuple
import numpy as np

# For pos tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Part 1. Data Preprocessing
1. show the top-10 common words and their counts before/after preprocessing




## Functions and Classes
*  Remove the punctuations
*  Lower the cases



In [2]:
def get_corpus():
  """ Downloads the lab corpus and returns its text column.

  The CSV is fetched over HTTP on every call, so a network connection is
  required.

  Returns:
    corpus (list[str]):
      The values of the `content` column, one entry per CSV row.
  """
  url = 'https://raw.githubusercontent.com/yunzhusong/NLP109/main/lab1_data.csv'
  frame = pd.read_csv(url)
  return frame['content'].to_list()

In [3]:
def preprocess(documents):
  """ Splits documents into sentences and normalises each sentence.

  Every document is split into sentences with `sent_tokenize`; each sentence
  then has its punctuation characters replaced by a single space and is
  lower-cased. Because documents are split, the returned list is usually
  longer than the input list.

  Args:
    documents (list[str]):
      A list of raw documents (each may contain several sentences).
  Returns:
    cleaned_documents (list[str]):
      A flat list of cleaned sentences, in corpus order.
  """
  # Raw string: the backslash is a literal punctuation character here, and
  # writing it as '\,' in a normal string is an invalid escape sequence.
  punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~”'''

  # One translation table replaces every punctuation character with a space
  # in a single pass over the sentence (same result as replacing them one by
  # one, since the replacement character is not itself in `punc`).
  punc_to_space = str.maketrans(punc, ' ' * len(punc))

  cleaned_documents = []
  for doc in documents:
    for sent in sent_tokenize(doc):
      cleaned_documents.append(sent.translate(punc_to_space).lower())

  return cleaned_documents

In [4]:
# Compute word frequency
def get_vocab(documents):
  """ Counts how often each token occurs in the corpus.

  Each entry of `documents` is tokenized with `word_tokenize`; a progress bar
  is shown while iterating.

  Args:
    documents (list[str]):
      A list of sentences (or documents) in the corpus.
  Returns:
    vocabulary (collections.Counter):
      Mapping from token to its number of occurrences.
  """
  token_counts = Counter()
  progress = tqdm(documents)
  for text in progress:
    token_counts.update(word_tokenize(text))
  return token_counts

## Executions

### 1. Show the top-10 common words and their counts before/after preprocessing


In [5]:
# Load the raw corpus (the `content` column of the lab CSV, as a list)
raw_documents = get_corpus()

# Count tokens on the untouched text and keep the 10 most frequent ones.
# Note: `vocab` holds the list of (token, count) pairs returned by
# most_common(10), not the full Counter.
vocab = get_vocab(raw_documents).most_common(10)
print('\n Before preprocessing:', vocab)

100%|██████████| 100000/100000 [00:21<00:00, 4560.49it/s]


 Before preprocessing: [('.', 85947), ('the', 49772), (',', 39728), ('to', 34407), ('!', 33580), ('a', 28765), ('is', 26339), ('?', 24057), ('and', 22890), ('of', 22542)]





In [6]:
# Build vocabulary after preprocessing.
# preprocess() splits every document into sentences, so `documents` has more
# entries than `raw_documents` (compare the two progress-bar totals).
documents = preprocess(raw_documents)
# `vocab` is overwritten with the top-10 (token, count) pairs of the cleaned text
vocab = get_vocab(documents).most_common(10)
print('\n After preprocesing:', vocab)

100%|██████████| 175323/175323 [00:19<00:00, 9067.52it/s]


 After preprocesing: [('the', 56446), ('to', 35130), ('a', 30363), ('you', 29262), ('is', 27200), ('and', 25790), ('of', 23117), ('it', 19647), ('i', 18268), ('that', 17711)]





# Part 2. N-Gram Model and POS Tagging
1. Build 2-gram / 4-gram models from the processed dataset
2. Show the top-5 probable next words and their probability after initial token ‘\<s\>’ by 2-gram model
3. Generate a sentence with 2-gram model and find the POS taggings
4. Generate a sentence with 4-gram model and find the POS taggings




## Functions and Classes

In [7]:
# Sanity check: confirm the return type of word_tokenize (a plain list),
# which the n-gram model relies on when it concatenates padding tokens.
t = word_tokenize(' the input text {} not in the vocabulary')
type(t)

list

In [8]:
class Ngram_model(object):
  """ Ngram model implementation.

  The model maps a context of (N-1) preceding tokens to the list of possible
  next words together with their maximum-likelihood probability
  count(context + word) / count(context).

  Attributes:
    n (int):
      The number of grams to be considered (must be >= 1).
    model (dict[str, list[Word]]):
      The ngram model. Keys are the context tokens joined by a single space
      (for a 2-gram model the key is simply the previous word; for a 1-gram
      model the only key is the empty string). Values are lists of
      `Word(word, prob)` sorted by descending probability.
  """

  # Sentence boundary markers. END_TOKEN is spelled with a backslash to stay
  # identical to the token the model has always used ('<' + '\' + 's>').
  START_TOKEN = '<s>'
  END_TOKEN = '<\\s>'

  # A candidate next word and its conditional probability.
  Word = namedtuple('Word', ['word', 'prob'])

  def __init__(self, documents, N=2):
    """ Builds the model from `documents`.

    Args:
      documents (list[str]):
        The corpus; it is passed through `preprocess` while building.
      N (int):
        The order of the model.
    Raises:
      ValueError: if N is smaller than 1.
    """
    if N < 1:
      raise ValueError('N must be a positive integer, got {}'.format(N))
    self.n = N
    self.model = self.get_ngram_model(documents)

  @staticmethod
  def _context_key(context):
    """ Turns a sequence of context tokens into a dictionary key.

    Tokens are joined with a space. Joining without a separator would map
    different contexts such as ('a', 'bc') and ('ab', 'c') to the same key
    and merge their distributions.
    """
    return ' '.join(context)

  def get_ngram_model(self, documents):
    """ Counts n-grams in `documents` and converts them to probabilities.

    Args:
      documents (list[str]):
        The corpus. It is cleaned with `preprocess` and each resulting
        sentence is tokenized with `word_tokenize`.
    Returns:
      ngram_model (dict[str, list[Word]]):
        See the `model` attribute.
    """
    N = self.n
    full_gram_counter = Counter()   # counts of N-grams (numerator)
    gram_counter = Counter()        # counts of their (N-1)-gram contexts (denominator)
    padding = [self.START_TOKEN] * (N - 1)

    for sentence in preprocess(documents):
      # (N-1) start tokens in front, one end token behind
      tokens = padding + word_tokenize(sentence) + [self.END_TOKEN]

      # Count on the fly instead of materialising every gram in a list.
      # The context is counted once per N-gram that starts with it, so the
      # probabilities of each context sum to 1.
      for i in range(len(tokens) - N + 1):
        full_gram = tuple(tokens[i:i + N])
        full_gram_counter[full_gram] += 1
        gram_counter[full_gram[:-1]] += 1

    # Take 2-gram model as example:
    #   { '<s>': [Word(word='i', prob=0.6), Word(word='the', prob=0.2), ...],
    #     'i': [Word(word='am', prob=0.7), Word(word='want', prob=0.1), ...],
    #     ... }
    ngram_model = dict()
    for full_gram, count in full_gram_counter.items():
      context = full_gram[:-1]
      next_word_prob = count / gram_counter[context]
      candidates = ngram_model.setdefault(self._context_key(context), [])
      candidates.append(self.Word(full_gram[-1], next_word_prob))

    # Most probable word first. list.sort is stable, so words with equal
    # probability keep their first-seen order and the result is deterministic.
    for candidates in ngram_model.values():
      candidates.sort(key=lambda candidate: candidate.prob, reverse=True)

    return ngram_model

  def _prepare_context(self, text):
    """ Converts user input into the (N-1)-token context used for lookup.

    Args:
      text (None, string or list[string]):
        The prompt. A string is split on whitespace; a falsy value means
        "start of sentence".
    Returns:
      A tuple (context, prompt) where `context` is the list of the last
      (N-1) tokens, left-padded with start tokens, and `prompt` is the list
      of tokens the caller actually supplied. Returns None (after printing
      an error) if the input type is unsupported or the context is unknown
      to the model.
    """
    if not text:
      prompt = []
    elif isinstance(text, str):
      prompt = text.split()
    elif isinstance(text, list):
      prompt = list(text)
    else:
      print('[Error] the input text must be string or list of string')
      return None

    context_len = self.n - 1
    padded = [self.START_TOKEN] * context_len + prompt
    # Slice from an explicit start index: padded[-0:] would return the whole
    # list for a 1-gram model instead of an empty context.
    context = padded[len(padded) - context_len:]

    if not self.check_existence(context):
      return None
    return context, prompt

  def predict_sent(self, text=None, max_len=30):
    """ Predicts a sentence with the ngram model.

    Words are sampled one at a time from the distribution of the current
    context until the end token is drawn or `max_len` words were generated.

    Args:
      text (string or list[string]):
        Optional prompt. Only its last (N-1) tokens are used as context and
        kept in the result.
      max_len (int):
        Maximum number of words to generate.
    Returns:
      A prediction string (prompt tokens followed by the generated words,
      without start/end padding tokens), or None if the input is invalid or
      its context is not in the model.
    """
    prepared = self._prepare_context(text)
    if prepared is None:
      return
    context, prompt = prepared

    # Keep the caller's own tokens that are part of the context, but not the
    # start tokens that were only added as padding.
    kept = min(len(prompt), self.n - 1)
    output = prompt[len(prompt) - kept:]

    for _ in range(max_len):
      candidates = self.model[self._context_key(context)]
      words = [candidate.word for candidate in candidates]
      probs = [candidate.prob for candidate in candidates]
      next_word = words[np.random.choice(len(words), p=probs)]

      # Slide the context window; for a 1-gram model it stays empty.
      context = (context + [next_word])[1:]

      if next_word == self.END_TOKEN:
        break

      output.append(next_word)
    return ' '.join(output)

  def predict_next(self, text=None, top=5):
    """ Predicts next word with the ngram model.

    Args:
      text (string or list[string]):
        Optional prompt; its last (N-1) tokens form the context.
      top (int):
        Number of candidates to return.

    Returns:
      possible_next_words (list[tuple]):
        The `top` most probable (word, probability) pairs, or None if the
        input is invalid or its context is not in the model.
    """
    prepared = self._prepare_context(text)
    if prepared is None:
      return
    context, _ = prepared

    candidates = self.model[self._context_key(context)][:top]
    return [(candidate.word, candidate.prob) for candidate in candidates]

  def check_existence(self, tokens):
    """ Reports whether the context `tokens` is known to the model.

    Args:
      tokens (list[string]):
        The (N-1) context tokens.
    Returns:
      True if the context is a key of the model; otherwise prints an error
      message and returns False.
    """
    if self._context_key(tokens) in self.model:
      return True
    print('[Error] the input text {} not in the vocabulary'.format(tokens))
    return False

## Executions

### 1. Build 2-gram/4-gram models from the processed dataset

In [9]:
# Train a 2-gram and a 4-gram model on the same corpus.
# `documents` was already cleaned above; Ngram_model passes its input through
# preprocess() again while building the model.
twogram = Ngram_model(documents, N=2)
fourgram = Ngram_model(documents, N=4)

### 2. Show the top-5 probable next words and their probability after initial token \'\<s\>\'  by 2-gram model

In [10]:
# Ask for the 5 most probable first words: the context is the start token '<s>'
output = twogram.predict_next(text='<s>', top=5)
print('Next word predictions of two gram model:', output)

Next word predictions of two gram model: [('i', 0.06051596646571213), ('you', 0.036854747550866616), ('the', 0.031968255463451396), ('it', 0.022130510550113038), ('they', 0.021053127354935947)]


### 3. Generate a sentence with 2-gram model and find the POS taggings



In [11]:
# Generate up to 30 words starting from the sentence start.
# predict_sent samples each word with np.random.choice and no seed is set,
# so the sentence differs between runs.
output = twogram.predict_sent(max_len=30)
print('Generation results of two gram model:', output)
# POS-tag the generated sentence (displayed as the cell output)
nltk.pos_tag(word_tokenize(output))

Generation results of two gram model: yup yup it at the evidence that brother who this


[('yup', 'RB'),
 ('yup', 'VBZ'),
 ('it', 'PRP'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('evidence', 'NN'),
 ('that', 'IN'),
 ('brother', 'NN'),
 ('who', 'WP'),
 ('this', 'DT')]

### 4. Generate a sentence with 4-gram model and find the POS taggings



In [12]:
# Same experiment with the 4-gram model; sampling is random, so the
# sentence differs between runs.
output = fourgram.predict_sent(max_len=30)
print('Generation results of four gram model: ', output)
# POS-tag the generated sentence (displayed as the cell output)
nltk.pos_tag(word_tokenize(output))

Generation results of four gram model:  keep digging


[('keep', 'VB'), ('digging', 'NN')]