Natural Language Processing - Homework 1

Pham Lan Phuong - 210120

In [1]:
import re
from copy import deepcopy

import nltk
import numpy as np
import pandas as pd

In [2]:
# Read both corpora into memory as plain strings.
# The encoding is pinned because the text contains non-ASCII characters
# (e.g. curly quotes) and open()'s default encoding is platform dependent.
# NOTE(review): assumes both files are saved as UTF-8 - confirm.
with open('testing_data.txt', 'r', encoding='utf-8') as f:
    test_data = f.read()

with open('training_data.txt', 'r', encoding='utf-8') as f:
    train_data = f.read()

# Question 1
Find all sentences that contain “to be” verbs (i.e. “is”, “are”, ...) in the training data
file.

In [3]:
# Forms of "to be", matched as whole words (\b) so that e.g. "this" or
# "because" do not count. IGNORECASE also catches sentence-initial forms such
# as "Is ...", "Are ..." or "Being ...", which a lowercase-only pattern misses.
# Caveat: this also matches all-caps tokens such as "AM".
to_be_pattern = re.compile(r'\b(?:is|are|am|be|being|been|was|were)\b', re.IGNORECASE)

sentences = nltk.tokenize.sent_tokenize(train_data)

# use search() because we only care whether the sentence contains a 'be' form
# anywhere, not where it is
be_sentences = [sentence for sentence in sentences if to_be_pattern.search(sentence)]

print("Number of sentences that has to-be: ", len(be_sentences))
be_sentences

Number of sentences that has to-be:  8


['As part of the major, students will be equipped with the foundational knowledge in Computer Science and relevant disciplines.',
 'They will be exposed to essential areas of the CS discipline including theory, systems, and applications.',
 'They will learn about the underlying mathematical ideas that are critical for computation, establish proficiency in the process of designing systems and applications, gain experience in collecting and analyzing data using modern technologies, and begin to develop an understanding for the role of users in the design of systems and applications.',
 'The Computer Science major at Fulbright is designed to prepare students for work in industry or continue their lifelong learning as well as potential graduate-level studies.',
 'All students are first required to take the core courses in Liberal Arts and Science.',
 'In addition to the two courses in “Global Humanities and Social Change”, and “Modern Vietnamese Culture and Society”, they will be exposed t

# Question 2
Build a unigram model and a bigram model (both are with add-one smoothing) from the training data file. Then calculate and compare the perplexity score of these two models
on the testing data file.

### Unigram

In [13]:
class Unigram:
    '''
    Class Unigram: unigram language model with add-one smoothing.

    Attributes:
    - n_unigram: number of tokens in training data
    - vocab_size: number of unique unigrams in training data
    - count: a dictionary of unigram counts; it also holds a "<UNK>"
             placeholder key whose count is 0

    Methods:
    - train(train_data): train the unigram model
    - compute_prob(unigram): compute the probability of a unigram using
                            unigram count and add-one smoothing
    - test_perplexity(test_data): compute the perplexity of test data using
                            log likelihood method to avoid underflow
    '''

    def __init__(self):
        self.n_unigram = 0
        self.vocab_size = 0
        self.count = {}

    def train(self, train_data):
        '''
        This function trains the unigram model. After running this function,
        a dictionary storing counts of each unigram will be created.

        Calling train() again replaces the previous counts instead of adding
        to them, so count, n_unigram and vocab_size always describe the same
        training text.

        Input:
            train_data: string
                training data to train the unigram model
        Return:
            None
        '''
        unigrams = nltk.tokenize.word_tokenize(train_data)

        # Build into a fresh dictionary so that re-training starts from zero.
        # "<UNK>" goes in first: it is a placeholder with count 0 that callers
        # subtract when reporting the number of distinct unigrams.
        counts = {"<UNK>": 0}
        for unigram in unigrams:
            counts[unigram] = counts.get(unigram, 0) + 1

        self.count = counts
        self.n_unigram = len(unigrams)
        self.vocab_size = len(set(unigrams))

    def compute_prob(self, unigram):
        '''
        This function takes in a unigram, and returns its add-one smoothed
        probability: (count(unigram) + 1) / (N + V), where N is the number of
        training tokens and V the number of distinct training unigrams.
        An unseen unigram has count 0 and therefore gets 1 / (N + V).

        Note: the smoothed probabilities of the V training words already sum
        to 1, so the mass given to unseen words comes on top of that; this is
        not a normalized distribution over an open vocabulary.

        Input:
            unigram: string
                the unigram to compute probability for
        Return:
            the smoothed probability of the given unigram
        '''
        N = self.n_unigram
        V = self.vocab_size
        return (self.count.get(unigram, 0) + 1) / (N + V)

    def test_perplexity(self, test_data):
        '''
        This function takes in a test data, and returns the perplexity
        of this data on the unigram model. The perplexity is computed
        using log likelihood method to avoid underflow.

        Input:
            test_data: string
                the test data to compute perplexity for
        Return:
            the perplexity of the test data on the unigram model
        Raises:
            ValueError: if test_data contains no tokens (the average over
                zero tokens is undefined)
        '''
        test_unigrams = nltk.tokenize.word_tokenize(test_data)
        M = len(test_unigrams)
        if M == 0:
            raise ValueError("test_data contains no tokens; perplexity is undefined")

        # perplexity = exponential of negative average log likelihood
        probs = [self.compute_prob(unigram) for unigram in test_unigrams]
        avg_log_likelihood = np.log(probs).sum() / M
        ppl = np.exp((-1) * avg_log_likelihood)
        return ppl

In [14]:
# Fit the unigram model on the training text.
unigram_model = Unigram() 
unigram_model.train(train_data)

# Perplexity of the test text under that model; the bare name on the last
# line makes the notebook display the value.
unigram_test_ppl = unigram_model.test_perplexity(test_data)
unigram_test_ppl

172.11286794438394

In [15]:
# Training statistics of the unigram model. The count dictionary carries one
# extra "<UNK>" placeholder entry, hence the -1 for the distinct unigrams.
unigram_report = (
    ("Number of unigrams: ", unigram_model.n_unigram),
    ("Number of unique unigrams: ", len(unigram_model.count) - 1),
    ("Unigram test perplexity: ", unigram_test_ppl),
)
for label, value in unigram_report:
    print(label, value)

Number of unigrams:  742
Number of unique unigrams:  302
Unigram test perplexity:  172.11286794438394


In [19]:
# showing some intermediate results: the 7 most frequent unigrams
# (`pd` comes from the notebook's import cell at the top)
print("Some of the most common unigrams: ")

# One stable sort on the counts; ties keep their first-seen order. Sorting the
# items leaves the model's dictionary untouched, so no copy is needed, and a
# vocabulary with fewer than 7 entries simply yields a shorter table.
ranked_unigrams = sorted(
    unigram_model.count.items(), key=lambda item: item[1], reverse=True
)
most_common_unigrams = [
    {'Most Common': unigram, 'Count': count}
    for unigram, count in ranked_unigrams[:7]
]

df = pd.DataFrame(most_common_unigrams)

print(df)


Some of the most common unigrams: 
  Most Common  Count
0           ,     52
1         and     37
2           .     31
3         the     30
4          in     27
5          of     22
6          to     20


### Bigram

In [21]:
class Bigram:
    '''
    Class Bigram: bigram language model with add-one smoothing.

    Attributes:
    - count_unigram: a dictionary of unigram counts
    - count_bigram: a dictionary of bigram counts
    - n_unigram: number of unigram tokens in training data
    - n_bigram: number of bigram tokens in training data
    - vocab_size: number of unique unigrams in training data

    Both count dictionaries also hold a "<UNK>" placeholder key with count 0.

    Methods:
    - get_ngram(n, text): get ngrams from text
    - count_ngram(ngrams): count ngrams frequency
    - train(train_data): train the bigram model
    - compute_prob(bigram): compute the probability of a bigram appearing
                            using add-one smoothing
    - test_perplexity(test_data): compute the perplexity of test data using
                            log likelihood method to avoid underflow
    '''

    def __init__(self):
        self.count_unigram = {}
        self.count_bigram = {}
        self.n_unigram = 0
        self.n_bigram = 0

        self.vocab_size = 0

    @staticmethod
    def _sliding_window(tokens, n):
        '''Return every run of n consecutive tokens as a tuple, in order.'''
        # range() is empty when there are fewer than n tokens
        return [tuple(tokens[i: i + n]) for i in range(len(tokens) - n + 1)]

    def get_ngram(self, n, text):
        '''
        This function takes in a text and returns a list of ngrams.
        If n = 1, return a list of unigrams using nltk.tokenize module.
        Higher ngrams are manually created with sliding window.

        Input:
            n: int
                the n in 'ngram' (must be at least 1)
            text: string
                the text to be split into ngrams

        Return:
            a list of ngrams: strings for n = 1, tuples of n strings otherwise
        Raises:
            ValueError: if n is smaller than 1
        '''
        if n < 1:
            raise ValueError("n must be at least 1")

        unigrams = nltk.tokenize.word_tokenize(text)
        if n == 1:
            return unigrams
        return self._sliding_window(unigrams, n)

    def count_ngram(self, ngrams):
        '''
        This function takes in a list of ngrams and returns a dictionary
        counting the frequency of each ngram.

        Input:
            ngrams: list
                a list of ngrams

        Return:
            a dictionary of ngram counts, plus a "<UNK>" key with count 0
        '''
        # "<UNK>" goes in first: a placeholder that callers subtract when
        # reporting the number of distinct ngrams.
        count = {"<UNK>": 0}
        for ngram in ngrams:
            count[ngram] = count.get(ngram, 0) + 1
        return count

    def train(self, train_data):
        '''
        This function trains the bigram model. After running this function,
        two dictionaries storing counts of each unigram and bigram will be
        created.

        Input:
            train_data: string
                training data to train the bigram model
        Return:
            None
        '''
        # tokenize once; the bigrams are built from the same token list
        unigrams = self.get_ngram(1, train_data)
        self.n_unigram = len(unigrams)
        self.vocab_size = len(set(unigrams))
        self.count_unigram = self.count_ngram(unigrams)

        bigrams = self._sliding_window(unigrams, 2)
        self.n_bigram = len(bigrams)
        self.count_bigram = self.count_ngram(bigrams)

    def compute_prob(self, bigram):
        '''
        This function takes in a bigram (a, b) and returns the add-one
        smoothed conditional probability of b following a:

            P(b | a) = (count(a, b) + 1) / (count(a) + V)

        where V is the number of distinct training unigrams.

        In case of unseen bigrams (a, b), count(a, b) = 0, so:
        (1) a is unseen (b seen or unseen):
            count(a) = 0, hence P(b | a) = 1 / V
        (2) a is seen, b is unseen:
            P(b | a) = 1 / (count(a) + V)
        (3) a is seen, b is seen, but never in this order:
            P(b | a) = 1 / (count(a) + V)

        Input:
            bigram: tuple
                the bigram to compute probability for
        Return:
            the smoothed probability of the given bigram
        '''
        context = self.count_unigram.get(bigram[0], 0)
        joint = self.count_bigram.get(bigram, 0)
        return (joint + 1) / (context + self.vocab_size)  # smoothing

    def test_perplexity(self, test_data):
        '''
        This function takes in a test data, and returns the perplexity
        of this data on the bigram model. The perplexity is computed
        using log likelihood method to avoid underflow.

        The first token has no preceding context, so it is scored with its
        add-one smoothed unigram probability; every later token is scored
        with the bigram probability given the token before it.

        Input:
            test_data: string
                the test data to compute perplexity for
        Return:
            the perplexity of the test data on the bigram model
        Raises:
            ValueError: if test_data contains no tokens
        '''
        # tokenize once; the bigrams are built from the same token list
        test_unigrams = self.get_ngram(1, test_data)
        M = len(test_unigrams)
        if M == 0:
            raise ValueError("test_data contains no tokens; perplexity is undefined")
        test_bigrams = self._sliding_window(test_unigrams, 2)

        # P(first) = (count(first) + 1) / (N + V); count is 0 for an unseen word
        first_word = test_unigrams[0]
        p_first_word = (self.count_unigram.get(first_word, 0) + 1) \
            / (self.n_unigram + self.vocab_size)

        probs = [p_first_word]
        probs.extend(self.compute_prob(bigram) for bigram in test_bigrams)

        # perplexity = exponential of negative average log likelihood
        avg_log_likelihood = np.log(probs).sum() / M
        ppl = np.exp((-1) * avg_log_likelihood)
        return ppl

In [22]:
# Fit the bigram model on the training text.
bigram_model = Bigram()
bigram_model.train(train_data)

# Perplexity of the test text under that model; the bare name on the last
# line makes the notebook display the value.
bigram_test_ppl = bigram_model.test_perplexity(test_data)
bigram_test_ppl

209.69428094428645

In [23]:
# Training statistics of the bigram model. The bigram count dictionary carries
# one extra "<UNK>" placeholder entry, hence the -1 for the distinct bigrams.
bigram_report = (
    ("Number of bigrams: ", bigram_model.n_bigram),
    ("Number of unique bigrams: ", len(bigram_model.count_bigram) - 1),
    ("Bigram test perplexity: ", bigram_test_ppl),
)
for label, value in bigram_report:
    print(label, value)

Number of bigrams:  741
Number of unique bigrams:  591
Bigram test perplexity:  209.69428094428645


In [29]:
# showing some intermediate results 
bicount = deepcopy(bigram_model.count_bigram)
print("Some of the most common bigrams: ")

most_common_bigrams = []
for i in range(7):
    most_common = max(bicount, key=bicount.get)
    most_common_bigrams.append({'Most Common': most_common, 'Count': bicount[most_common]})
    del bicount[most_common]

df = pd.concat([pd.DataFrame(most_common_bigrams)], ignore_index=True)

print(df)


Some of the most common bigrams: 
           Most Common  Count
0             (,, and)     12
1  (Computer, Science)      9
2            (in, the)      8
3     (Science, major)      4
4           (will, be)      4
5      (knowledge, in)      4
6       (in, Computer)      4


### Comparison: Unigram vs. Bigram 

In [30]:
# Side-by-side summary of both models on the original test file.
print("On the original testing data:")

test_token_count = len(nltk.tokenize.word_tokenize(test_data))
print("Number of unigrams in the testing data: ", test_token_count)

for label, perplexity in (
    ("Unigram test perplexity: ", unigram_test_ppl),
    ("Bigram test perplexity: ", bigram_test_ppl),
):
    print(label, perplexity)

On the original testing data:
Number of unigrams in the testing data:  46
Unigram test perplexity:  172.11286794438394
Bigram test perplexity:  209.69428094428645


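I ran both models on several additional test texts to see whether the unigram model consistently outperforms the bigram model. Contrary to my expectation, the bigram model achieves a lower (better) perplexity than the unigram model on texts that are NOT related to Fulbright.

On texts related to Fulbright or to CS, the unigram model performs better than the bigram model :) Sometimes the margin is small, but it is still better.

A likely explanation is the add-one smoothing itself: with N = 742 training tokens and V = 302 distinct unigrams, an unseen word gets probability 1 / (N + V) = 1 / 1044 under the unigram model, whereas a bigram whose first word is unseen gets 1 / V = 1 / 302 under the bigram model. Text dominated by unseen words is therefore penalized less by the bigram model.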

In [31]:
# Extra evaluation text: social science major description.
with open('soci.txt', 'r') as soci_file:
    soci = soci_file.read()

print("On the social science major description data: ")
print("Number of unigrams in the testing data: ", len(nltk.tokenize.word_tokenize(soci)))

# Score the same text with both trained models, unigram first.
for label, model in (
    ("Unigram test perplexity: ", unigram_model),
    ("Bigram test perplexity: ", bigram_model),
):
    print(label, model.test_perplexity(soci))

On the social science major description data: 
Number of unigrams in the testing data:  49
Unigram test perplexity:  243.84201918052653
Bigram test perplexity:  279.4338967244134


In [32]:
# Extra evaluation text: "about Fulbright" description.
with open('aboutF.txt', 'r') as about_file:
    aboutF = about_file.read()

print("On the about Fulbright data: ")
print("Number of unigrams in the testing data: ", len(nltk.tokenize.word_tokenize(aboutF)))

# Score the same text with both trained models, unigram first.
for label, model in (
    ("Unigram test perplexity: ", unigram_model),
    ("Bigram test perplexity: ", bigram_model),
):
    print(label, model.test_perplexity(aboutF))

On the about Fulbright data: 
Number of unigrams in the testing data:  49
Unigram test perplexity:  297.37958576446357
Bigram test perplexity:  301.22749165121564


In [33]:
# Extra evaluation text: CS Wikipedia excerpt.
with open('cswiki.txt', 'r') as cswiki_file:
    cswiki = cswiki_file.read()

print("On the CS Wikipedia data: ")
print("Number of unigrams in the testing data: ", len(nltk.tokenize.word_tokenize(cswiki)))

# Score the same text with both trained models, unigram first.
for label, model in (
    ("Unigram test perplexity: ", unigram_model),
    ("Bigram test perplexity: ", bigram_model),
):
    print(label, model.test_perplexity(cswiki))

On the CS Wikipedia data: 
Number of unigrams in the testing data:  46
Unigram test perplexity:  164.2326658507244
Bigram test perplexity:  241.0606657999618


In [34]:
# Extra evaluation text: Undertale game description.
with open('undertale.txt', 'r') as undertale_file:
    undertale = undertale_file.read()

print("On the Undertale game description data: ")
print("Number of unigrams in the testing data: ", len(nltk.tokenize.word_tokenize(undertale)))

# Score the same text with both trained models, unigram first.
for label, model in (
    ("Unigram test perplexity: ", unigram_model),
    ("Bigram test perplexity: ", bigram_model),
):
    print(label, model.test_perplexity(undertale))

On the Undertale game description data: 
Number of unigrams in the testing data:  48
Unigram test perplexity:  431.52020218385195
Bigram test perplexity:  302.3494724533181


In [35]:
# Extra evaluation text: Egypt Wikipedia excerpt.
with open('egyptian.txt', 'r') as egyptian_file:
    egyptian = egyptian_file.read()

print("On the Egypt Wikipedia data: ")
print("Number of unigrams in the testing data: ", len(nltk.tokenize.word_tokenize(egyptian)))

# Score the same text with both trained models, unigram first.
for label, model in (
    ("Unigram test perplexity: ", unigram_model),
    ("Bigram test perplexity: ", bigram_model),
):
    print(label, model.test_perplexity(egyptian))

On the Egypt Wikipedia data: 
Number of unigrams in the testing data:  48
Unigram test perplexity:  316.2653156458338
Bigram test perplexity:  290.3649526192667


In [36]:
# Extra evaluation text: Ghost In The Shell film wiki excerpt.
with open('ghost.txt', 'r') as ghost_file:
    ghost = ghost_file.read()

print("On the Ghost In The Shell film wiki data: ")
print("Number of unigrams in the testing data: ", len(nltk.tokenize.word_tokenize(ghost)))

# Score the same text with both trained models, unigram first.
for label, model in (
    ("Unigram test perplexity: ", unigram_model),
    ("Bigram test perplexity: ", bigram_model),
):
    print(label, model.test_perplexity(ghost))

On the Ghost In The Shell film wiki data: 
Number of unigrams in the testing data:  60
Unigram test perplexity:  355.96280359448326
Bigram test perplexity:  272.5486676938591
