# Yichien Chou

# Part 1: Bigram Model

In [4]:
import numpy as np, os, nltk, glob, sys, string
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/jason13nn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
class BigramModel:
    """Bigram model estimated from plain-text ``.txt`` corpus files.

    The probability table lives in ``bg_prob`` and has the shape
    ``{(w1, w2): [(w1, w2), probability]}``; every getter below returns
    those ``[(w1, w2), probability]`` entries.
    """

    # Each of these characters is replaced by _BOUNDARY_MARK before the text
    # is split, so the tokens "^" and "$" appear at every such break.
    _BOUNDARY_CHARS = ".,!?;:"
    _BOUNDARY_MARK = " ^ $"

    def __init__(self, name="default", dirName=".", ext="*", smooth=0,
                 stopWordList=None, otherWordList=None, bg_prob=None):
        """Store the model configuration.

        Args:
            name: Base name of the file used by Save()/Load() (``<name>.txt``
                in the current working directory).
            dirName: Directory holding the corpus; ``"."`` means the current
                working directory.
            ext: ``"*"`` reads every ``.txt`` file in ``dirName``; any other
                value is used as the base name of one file, ``<ext>.txt``.
            smooth: Additive smoothing constant (0 disables smoothing).
            stopWordList: Words used to split the bigrams into ``stop_words``
                and ``otherWordList`` in Calculate().
            otherWordList: Initial value; overwritten by Calculate().
            bg_prob: Pre-computed probability table, if any.
        """
        self.name = name
        self.dirName = dirName
        self.ext = ext
        self.smooth = smooth
        # None sentinels: a literal []/{} default would be shared by every
        # instance created without that argument.
        self.stopWordList = [] if stopWordList is None else stopWordList
        self.otherWordList = [] if otherWordList is None else otherWordList
        self.bg_prob = {} if bg_prob is None else bg_prob

#-----------------------------------------------------------------
    # Private helpers
    def _model_path(self):
        """Return the path of the file Save() writes and Load() reads."""
        return os.path.join(os.getcwd(), self.name + '.txt')

    def _read_corpus(self):
        """Return the raw corpus text selected by ``dirName`` and ``ext``."""
        directory = os.getcwd() if self.dirName == '.' else self.dirName

        # Specified file
        if self.ext != '*':
            with open(os.path.join(directory, self.ext + '.txt'), 'r') as handle:
                return handle.read()

        # All files. Save() writes <name>.txt into the working directory, which
        # is also the default corpus directory, so the model's own file is
        # skipped to keep a previous save out of the corpus.
        own_file = os.path.realpath(self._model_path())
        texts = []
        # sorted(): glob order is arbitrary, and the order decides which
        # bigrams are formed across file boundaries.
        for path in sorted(glob.glob(os.path.join(directory, '*.txt'))):
            if os.path.realpath(path) == own_file:
                continue
            with open(path, 'r') as handle:
                texts.append(handle.read())
        return ' '.join(texts)

    def _tokenize(self, text):
        """Split ``text`` into lower-cased tokens, dropping all-digit tokens."""
        marks = str.maketrans(
            {ch: self._BOUNDARY_MARK for ch in self._BOUNDARY_CHARS})
        tokens = text.translate(marks).split()
        return [token.lower() for token in tokens if not token.isdigit()]

    def _table(self, message):
        """Return ``bg_prob``; raise RuntimeError(message) if it is not a dict."""
        if not isinstance(self.bg_prob, dict):
            raise RuntimeError(message)
        return self.bg_prob

#-----------------------------------------------------------------
    #1. Actually calculate the bigram probabilities
    def Calculate(self):
        """Read the corpus and rebuild ``bg_prob``.

        With ``c`` the counts over the token list, ``k = self.smooth`` and
        ``V`` the number of distinct tokens, each observed bigram gets
        ``(c(w1, w2) + k) / (c(w1) + k * V)``.

        Also sets ``stop_words`` (bigram occurrences containing a word from
        ``stopWordList``) and ``otherWordList`` (the remaining occurrences).

        Returns:
            The new ``bg_prob`` dict, ``{(w1, w2): [(w1, w2), probability]}``.
        """
        from collections import Counter

        words = self._tokenize(self._read_corpus())
        bigrams = list(zip(words, words[1:]))

        # Count once instead of calling list.count() for every position.
        word_counts = Counter(words)
        pair_counts = Counter(bigrams)
        vocab_size = len(word_counts)
        smooth = self.smooth

        # Counter keeps first-seen order, so the table is ordered by the first
        # occurrence of each bigram in the corpus.
        self.bg_prob = {
            pair: [pair, float((count + smooth)
                               / (word_counts[pair[0]] + smooth * vocab_size))]
            for pair, count in pair_counts.items()
        }

        # Default stop words list is empty
        stop = set(self.stopWordList)
        self.stop_words = [(w1, w2) for w1, w2 in bigrams
                           if w1 in stop or w2 in stop]
        self.otherWordList = [(w1, w2) for w1, w2 in bigrams
                              if w1 not in stop and w2 not in stop]

        return self.bg_prob

#-----------------------------------------------------------------
    #2. Save the calculated probabilities in a file
    def Save(self):
        """Recalculate the probabilities and write them to ``<name>.txt``.

        The file holds the dict literal of ``bg_prob`` and is overwritten on
        every call, so it always contains exactly one table.
        """
        self.bg_prob = self.Calculate()
        with open(self._model_path(), "w") as handle:
            handle.write(str(self.bg_prob))

        print("The file has been saved.")

#-----------------------------------------------------------------
    #3. Load the calculated probabilities from the file to the object
    def Load(self):
        """Read ``<name>.txt`` from the working directory into ``bg_prob``.

        Returns:
            The raw text of the file.

        Raises:
            FileNotFoundError: The file does not exist.
            ValueError: The file does not hold a single dict literal.
        """
        import ast

        path = self._model_path()
        try:
            with open(path, "r") as handle:
                text = handle.read()
        except FileNotFoundError as err:
            raise FileNotFoundError(
                "Error: The file doesn't exist: {}".format(path)) from err

        try:
            # literal_eval only accepts Python literals, so the file content is
            # never executed.
            table = ast.literal_eval(text)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                "{} does not contain a saved bigram table".format(path)) from err
        if not isinstance(table, dict):
            raise ValueError(
                "{} does not contain a saved bigram table".format(path))

        self.bg_prob = table
        return text

#-----------------------------------------------------------------
    #4. Return the probability of the bigram (w1, w2)
    def getProb(self, w1, w2):
        """Look up one bigram, or a family of bigrams via the ``"*"`` wildcard.

        Returns:
            The probability of ``(w1, w2)`` when that bigram is in the table;
            otherwise, if ``w1 == "*"``, the entries whose second word is
            ``w2``; otherwise, if ``w2 == "*"``, the entries whose first word
            is ``w1``; otherwise ``-1``.

        Raises:
            RuntimeError: ``bg_prob`` is not a dict.
        """
        table = self._table('The probability has not been calculated.')

        if (w1, w2) in table:
            return table[w1, w2][1]
        if w1 == "*":
            return [entry for (_, second), entry in table.items()
                    if second == w2]
        if w2 == "*":
            return [entry for (first, _), entry in table.items()
                    if first == w1]
        return -1

#-----------------------------------------------------------------
    #5. Return all the bigrams with w1 as the first word
    def getProbList(self, w1, sortMethod = 0):
        """Return the ``[(w1, w2), probability]`` entries starting with ``w1``.

        Args:
            w1: First word of the bigrams to return.
            sortMethod: 1 sorts the entries ascending, 2 descending; any other
                value keeps the table order.

        Raises:
            RuntimeError: ``bg_prob`` is not a dict.
        """
        table = self._table("Error: The probability has not been calculated")

        entries = [entry for (first, _), entry in table.items() if first == w1]
        if sortMethod == 1:
            entries.sort()
        elif sortMethod == 2:
            entries.sort(reverse=True)
        return entries

#-----------------------------------------------------------------
    #6. Return all the bigrams and their probabilities as a list
    def getAll(self, sortMethod = 0):
        """Return every ``[(w1, w2), probability]`` entry as a list.

        Args:
            sortMethod: 1 sorts by bigram ascending, 2 by bigram descending,
                3 by probability ascending; any other value keeps the table
                order.

        Raises:
            RuntimeError: ``bg_prob`` is not a dict.
        """
        table = self._table('The probability has not been calculated.')

        entries = list(table.values())
        if sortMethod == 1:
            entries.sort()
        elif sortMethod == 2:
            entries.sort(reverse=True)
        elif sortMethod == 3:
            entries.sort(key=lambda entry: entry[1])
        return entries