In [179]:
# these are the ones that we used for the base model
import numpy as np
import sys
from collections import Counter
import math

"""
Your name and file comment here: Jessica Bae
"""

def generate_tuples_from_file(training_file_path):
    """Read a tab-separated training file into a list of labelled examples.

    Each non-blank line is split on tabs; the first field is used as the
    example id, the second as the sentence and the third is parsed as an
    integer label. Blank lines (for example a trailing empty line at the
    end of the file) are skipped instead of crashing the loader.

    Args:
        training_file_path: Path of the file to read (decoded as UTF-8,
            matching ``generate_test``).

    Returns:
        A list of ``(id, sentence, label)`` tuples in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
        ValueError: If the third field is not an integer.
    """
    data = []
    # The with-statement closes the file; no explicit close() is needed.
    with open(training_file_path, 'r', encoding="utf8") as file:
        for line in file:
            if not line.strip():
                continue  # ignore empty / whitespace-only lines
            info = line.split('\t')
            # int() tolerates the trailing newline left on the last field.
            data.append((info[0], info[1], int(info[2])))
    return data
    
def precision(gold_labels, classified_labels):
    """Compute precision for the positive class (label ``1``).

    Precision is ``true positives / (true positives + false positives)``.
    Labels are compared position by position; if the two sequences differ
    in length, the extra items of the longer one are ignored.

    Args:
        gold_labels: Sequence of reference labels (``0`` or ``1``).
        classified_labels: Sequence of predicted labels (``0`` or ``1``).

    Returns:
        The precision as a float, or ``0.0`` when nothing was predicted
        positive (the ratio is otherwise undefined).
    """
    truepos = 0
    falsepos = 0
    for gold, predicted in zip(gold_labels, classified_labels):
        if predicted != 1:
            continue  # only positive predictions matter for precision
        if gold == 1:
            truepos += 1  # system and gold both positive
        elif gold == 0:
            falsepos += 1  # system positive, gold negative
    predicted_pos = truepos + falsepos
    if predicted_pos == 0:
        return 0.0  # avoid ZeroDivisionError when no positives were predicted
    return truepos / predicted_pos

def recall(gold_labels, classified_labels):
    """Compute recall for the positive class (label ``1``).

    Recall is ``true positives / (true positives + false negatives)``.
    Labels are compared position by position; if the two sequences differ
    in length, the extra items of the longer one are ignored.

    Args:
        gold_labels: Sequence of reference labels (``0`` or ``1``).
        classified_labels: Sequence of predicted labels (``0`` or ``1``).

    Returns:
        The recall as a float, or ``0.0`` when there are no gold positives
        among the compared pairs (the ratio is otherwise undefined).
    """
    truepos = 0
    falseneg = 0
    for gold, predicted in zip(gold_labels, classified_labels):
        if gold != 1:
            continue  # only gold positives matter for recall
        if predicted == 1:
            truepos += 1  # system and gold both positive
        elif predicted == 0:
            falseneg += 1  # gold positive, system negative
    gold_pos = truepos + falseneg
    if gold_pos == 0:
        return 0.0  # avoid ZeroDivisionError when there are no gold positives
    return truepos / gold_pos

def f1(gold_labels, classified_labels):
    """Compute the F1 score: ``2 * precision * recall / (precision + recall)``.

    Args:
        gold_labels: Sequence of reference labels (``0`` or ``1``).
        classified_labels: Sequence of predicted labels (``0`` or ``1``).

    Returns:
        The harmonic mean of ``precision`` and ``recall`` as a float, or
        ``0.0`` when both are zero (the ratio is otherwise undefined).
    """
    pre = precision(gold_labels, classified_labels)
    re = recall(gold_labels, classified_labels)
    if pre + re == 0:
        return 0.0  # no true positives at all; avoid ZeroDivisionError
    return (2 * pre * re) / (pre + re)


"""
Implement any other non-required functions here
"""

def generate_test(training_file_path):
    """Read a tab-separated, unlabelled test file into a list of examples.

    Each non-blank line is split on tabs; the first field is used as the
    example id and the second as the sentence. Any further fields are
    ignored because these examples are the ones to be classified. Blank
    lines are skipped, and the line terminator is removed so it does not
    end up inside the sentence when the sentence is the last field.

    Args:
        training_file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A list of ``(id, sentence)`` tuples in file order.

    Raises:
        IndexError: If a non-blank line contains no tab separator.
    """
    data = []
    # The with-statement closes the file; no explicit close() is needed.
    with open(training_file_path, 'r', encoding="utf8") as file:
        for line in file:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # ignore empty / whitespace-only lines
            info = line.split('\t')
            data.append((info[0], info[1]))
    return data

def punctuation(string):
    """Return ``string`` with every listed punctuation character removed.

    The characters removed are ``!()-[]{};:'"\\,<>./?@#$%^&*_~``. Letter
    case and all other characters are left untouched.

    Args:
        string: The text to clean.

    Returns:
        A new string without the punctuation characters.
    """
    # Raw literal: the backslash is a character to strip, not an escape
    # (a non-raw "\," is an invalid escape sequence in recent Pythons).
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    # One translate pass deletes every listed character.
    return string.translate(str.maketrans("", "", punctuations))

"""
implement your SentimentAnalysis class here
"""
class SentimentAnalysis:
    """Bag-of-words Naive Bayes classifier for binary sentiment.

    Labels are ``1`` (positive) and ``0`` (negative). Tokens are produced by
    whitespace splitting (see ``featurize``). Word likelihoods use add-one
    (Laplace) smoothing over the vocabulary seen during training.
    """

    def __init__(self):
        """Create an untrained model and print ``initialized``."""
        self.pos_count = {}  # token -> occurrences in positive examples
        self.neg_count = {}  # token -> occurrences in negative examples

        self.vocab = 0  # number of distinct tokens seen in training
        self.pos = 0  # number of positive training examples
        self.neg = 0  # number of negative training examples

        self.pos_prob = {}  # token -> smoothed P(token | positive)
        self.neg_prob = {}  # token -> smoothed P(token | negative)

        self.pos_c_prob = 0  # prior P(positive)
        self.neg_c_prob = 0  # prior P(negative)

        self.score_class = {}  # most recent result of score(): label -> P(c)*P(data|c)

        # Gold labels for the test data, assigned by hand by the author.
        self.gold_labels = [0,0,0,1,0,0,1,1,1,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,1,1,0,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,1,1,0]
        self.classified_labels = []

        print("initialized\n")

    def train(self, examples):
        """Update counts from ``examples`` and recompute all probabilities.

        Args:
            examples: Iterable of ``(id, sentence, label)`` tuples. Examples
                whose label is neither ``1`` nor ``0`` are ignored.
        """
        for e in examples:
            feat = self.featurize(e)
            if e[2] == 1:
                self.pos += 1
                counts = self.pos_count
            elif e[2] == 0:
                self.neg += 1
                counts = self.neg_count
            else:
                continue
            for f in feat:
                counts[f[0]] = counts.get(f[0], 0) + 1

        # Both tables must cover the same vocabulary so that a word seen in
        # only one class still gets a smoothed probability in the other.
        for token in self.neg_count:
            self.pos_count.setdefault(token, 0)
        for token in self.pos_count:
            self.neg_count.setdefault(token, 0)
        self.vocab = len(self.pos_count)

        # Add-one smoothing: (count(w, c) + 1) / (tokens in class c + |V|).
        # The denominator is the number of word tokens in the class, not the
        # number of documents, so each class distribution sums to 1.
        pos_tokens = sum(self.pos_count.values())
        neg_tokens = sum(self.neg_count.values())
        self.pos_prob = {
            token: (count + 1) / (pos_tokens + self.vocab)
            for token, count in self.pos_count.items()
        }
        self.neg_prob = {
            token: (count + 1) / (neg_tokens + self.vocab)
            for token, count in self.neg_count.items()
        }

        # Class priors; left at 0 when no labelled example has been seen.
        total = self.pos + self.neg
        if total:
            self.pos_c_prob = self.pos / total
            self.neg_c_prob = self.neg / total

    def score(self, data):
        """Return ``P(c) * P(data | c)`` for each class.

        Tokens that were never seen in training contribute a factor of 1,
        i.e. they are ignored for both classes.

        Args:
            data: The sentence to score, as a string.

        Returns:
            A new dict ``{0: negative score, 1: positive score}``. The same
            values are also stored in ``self.score_class``.
        """
        tokens = [f[0] for f in self.featurize((0, data, 1))]

        pos_p = self.pos_c_prob  # start from the prior
        neg_p = self.neg_c_prob
        for token in tokens:
            pos_p = pos_p * self.pos_prob.get(token, 1)
            neg_p = neg_p * self.neg_prob.get(token, 1)

        self.score_class = {0: neg_p, 1: pos_p}
        # Return a copy so results of earlier calls are never overwritten.
        return dict(self.score_class)

    def classify(self, data):
        """Return ``1`` if ``data`` is more likely positive, otherwise ``0``.

        The comparison is done on log-probabilities: multiplying many small
        probabilities underflows to 0.0 for long inputs, which would make
        every long sentence come out negative.

        Args:
            data: The sentence to classify, as a string.
        """
        log_scores = self._log_score(data)
        if log_scores[1] > log_scores[0]:
            return 1
        return 0

    def _log_score(self, data):
        """Return ``log(P(c) * P(data | c))`` per class as ``{0: ..., 1: ...}``."""
        tokens = [f[0] for f in self.featurize((0, data, 1))]
        result = {}
        for label, prior, table in (
            (0, self.neg_c_prob, self.neg_prob),
            (1, self.pos_c_prob, self.pos_prob),
        ):
            if prior <= 0:
                result[label] = float("-inf")  # class never seen in training
                continue
            total = math.log(prior)
            for token in tokens:
                prob_w = table.get(token)
                if prob_w is not None:  # unknown tokens are ignored
                    total += math.log(prob_w)
            result[label] = total
        return result

    def featurize(self, data):
        """Turn one example into bag-of-words features.

        Args:
            data: A tuple whose item 1 is the sentence and item 2 the label.

        Returns:
            A list of ``(token, label)`` tuples, one per whitespace-separated
            token of the sentence.
        """
        sen = data[1]
        label = data[2]
        return [(token, label) for token in sen.split()]

    def __str__(self):
        return "Naive Bayes - bag-of-words baseline"

In [180]:
import string
class SentimentAnalysisImproved:
    """Naive Bayes sentiment classifier with normalised features.

    Same model as the bag-of-words baseline (labels ``1`` positive and ``0``
    negative, add-one smoothing), but ``featurize`` lowercases each token,
    strips punctuation with the module-level ``punctuation`` helper and
    drops stopwords.
    """

    # Stopwords are matched against lowercased, punctuation-free tokens.
    _STOPWORDS = frozenset([
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
        "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself",
        "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
        "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
        "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during", "before", "after",
        "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
        "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
        "will", "just", "don", "should", "now",
    ])

    def __init__(self):
        """Create an untrained model and print ``initialized improved``."""
        self.pos_count = {}  # token -> occurrences in positive examples
        self.neg_count = {}  # token -> occurrences in negative examples

        self.vocab = 0  # number of distinct tokens seen in training
        self.pos = 0  # number of positive training examples
        self.neg = 0  # number of negative training examples

        self.pos_prob = {}  # token -> smoothed P(token | positive)
        self.neg_prob = {}  # token -> smoothed P(token | negative)

        self.pos_c_prob = 0  # prior P(positive)
        self.neg_c_prob = 0  # prior P(negative)

        self.score_class = {}  # most recent result of score(): label -> P(c)*P(data|c)

        # Gold labels for the test data, assigned by hand by the author.
        self.gold_labels = [0,0,0,1,0,0,1,1,1,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,1,1,0,0,0,0,1,0,1,1,1,1,0,0,1,0,1,1,1,1,1,0]
        self.classified_labels = []

        print("initialized improved\n")

    def train(self, examples):
        """Update counts from ``examples`` and recompute all probabilities.

        Args:
            examples: Iterable of ``(id, sentence, label)`` tuples. Examples
                whose label is neither ``1`` nor ``0`` are ignored.
        """
        for e in examples:
            feat = self.featurize(e)
            if e[2] == 1:
                self.pos += 1
                counts = self.pos_count
            elif e[2] == 0:
                self.neg += 1
                counts = self.neg_count
            else:
                continue
            for f in feat:
                counts[f[0]] = counts.get(f[0], 0) + 1

        # Both tables must cover the same vocabulary so that a word seen in
        # only one class still gets a smoothed probability in the other.
        for token in self.neg_count:
            self.pos_count.setdefault(token, 0)
        for token in self.pos_count:
            self.neg_count.setdefault(token, 0)
        self.vocab = len(self.pos_count)

        # Add-one smoothing: (count(w, c) + 1) / (tokens in class c + |V|).
        # The denominator is the number of word tokens in the class, not the
        # number of documents, so each class distribution sums to 1.
        pos_tokens = sum(self.pos_count.values())
        neg_tokens = sum(self.neg_count.values())
        self.pos_prob = {
            token: (count + 1) / (pos_tokens + self.vocab)
            for token, count in self.pos_count.items()
        }
        self.neg_prob = {
            token: (count + 1) / (neg_tokens + self.vocab)
            for token, count in self.neg_count.items()
        }

        # Class priors; left at 0 when no labelled example has been seen.
        total = self.pos + self.neg
        if total:
            self.pos_c_prob = self.pos / total
            self.neg_c_prob = self.neg / total

    def score(self, data):
        """Return ``P(c) * P(data | c)`` for each class.

        Tokens that were never seen in training contribute a factor of 1,
        i.e. they are ignored for both classes.

        Args:
            data: The sentence to score, as a string.

        Returns:
            A new dict ``{0: negative score, 1: positive score}``. The same
            values are also stored in ``self.score_class``.
        """
        tokens = [f[0] for f in self.featurize((0, data, 1))]

        pos_p = self.pos_c_prob  # start from the prior
        neg_p = self.neg_c_prob
        for token in tokens:
            pos_p = pos_p * self.pos_prob.get(token, 1)
            neg_p = neg_p * self.neg_prob.get(token, 1)

        self.score_class = {0: neg_p, 1: pos_p}
        # Return a copy so results of earlier calls are never overwritten.
        return dict(self.score_class)

    def classify(self, data):
        """Return ``1`` if ``data`` is more likely positive, otherwise ``0``.

        The comparison is done on log-probabilities: multiplying many small
        probabilities underflows to 0.0 for long inputs, which would make
        every long sentence come out negative.

        Args:
            data: The sentence to classify, as a string.
        """
        log_scores = self._log_score(data)
        if log_scores[1] > log_scores[0]:
            return 1
        return 0

    def _log_score(self, data):
        """Return ``log(P(c) * P(data | c))`` per class as ``{0: ..., 1: ...}``."""
        tokens = [f[0] for f in self.featurize((0, data, 1))]
        result = {}
        for label, prior, table in (
            (0, self.neg_c_prob, self.neg_prob),
            (1, self.pos_c_prob, self.pos_prob),
        ):
            if prior <= 0:
                result[label] = float("-inf")  # class never seen in training
                continue
            total = math.log(prior)
            for token in tokens:
                prob_w = table.get(token)
                if prob_w is not None:  # unknown tokens are ignored
                    total += math.log(prob_w)
            result[label] = total
        return result

    def featurize(self, data):
        """Turn one example into normalised bag-of-words features.

        Each whitespace-separated token is lowercased and stripped of
        punctuation; stopwords and tokens that end up empty (for example a
        lone ``-``) are dropped.

        Args:
            data: A tuple whose item 1 is the sentence and item 2 the label.

        Returns:
            A list of ``(token, label)`` tuples for the tokens that remain.
        """
        sen = data[1]
        label = data[2]
        feat = []
        for raw in sen.split():
            # str.lower() returns a new string; its result must be used.
            token = punctuation(raw.lower())
            if token and token not in self._STOPWORDS:
                feat.append((token, label))
        return feat

    def __str__(self):
        return "ATTEMPT AT IMPROVEMENT!!!!"

In [183]:
    if __name__ == "__main__":
        # Command-line driver: train both models on the training file, label
        # the testing file with each, write the labels to disk and report
        # precision / recall / F1 against the hand-made gold labels.
        if len(sys.argv) != 3:
            print("Usage:", "python hw3_sentiment.py training-file.txt testing-file.txt")
            sys.exit(1)

        training = sys.argv[1]
        testing = sys.argv[2]

        sa = SentimentAnalysis()
        im = SentimentAnalysisImproved()

        f =  generate_tuples_from_file(training)

        print("Starting training of train data: ")
        sa.train(f)
        im.train(f)

        print("Probability of each positive word in positive class (likelihood): ")
        print(sa.pos_prob)
        print("Probability of positive class (prior): ", sa.pos_c_prob, '\n')

        print("Probability of each negative word in negative class (likelihood): ")
        print(sa.neg_prob)
        print("Probability of negative class (prior): ", sa.neg_c_prob, '\n')

        print("Dictionary of the values of P(c)*P(data|c) for each class for answer check purposes: ")
        print("Positive - I loved the hotel : ", sa.score("I loved the hotel"))
        print("Negative - I hated the hotel: ", sa.score("I hated the hotel"), '\n')

        print("Classification print for answer check purposes: ")
        print("Positive - I loved the hotel a lot: ", sa.classify("I loved the hotel a lot"))
        print("Negative - I hated the hotel: ", sa.classify("I hated the hotel"), '\n')

        y = generate_test(testing)

        # The with-statement closes both output files even if classification
        # raises part-way through.
        with open("label_test_data.txt", "w") as z, open("label_test_data_improved.txt", "w") as z1:
            for line in y:
                label = sa.classify(line[1])
                label1 = im.classify(line[1])
                z.write(line[0] + " " + str(label) + '\n')
                z1.write(line[0] + " " + str(label1) + '\n')
                sa.classified_labels.append(label)
                im.classified_labels.append(label1)
        print("Label output created and named: label_test_data.txt and label_test_data_improved.txt..... Generation of labels complete")
        print("Created Labels: ", sa.classified_labels)
        print("Created Improved Labels: ", im.classified_labels, '\n')

        print("Precision Unimproved: ", precision(sa.gold_labels, sa.classified_labels))
        print("Recall Unimproved: ", recall(sa.gold_labels, sa.classified_labels))
        print("F1 Unimproved: ", f1(sa.gold_labels, sa.classified_labels), '\n')

        print("Precision Improved: ", precision(im.gold_labels, im.classified_labels))
        print("Recall Improved: ", recall(im.gold_labels, im.classified_labels))
        print("F1 Improved: ", f1(im.gold_labels, im.classified_labels), '\n')

initialized

initialized improved



FileNotFoundError: [Errno 2] No such file or directory: '-f'

# Ad-hoc check of both models with hard-coded file names: train on
# dev_file.txt, label HW3-testset.txt, write the labels to disk and print
# precision / recall / F1 against the hand-made gold labels.
print("Testing if code works done below: \n")

s = SentimentAnalysis()
s1 = SentimentAnalysisImproved()
f = generate_tuples_from_file("dev_file.txt")

s.train(f)
s1.train(f)

print("Probability of each positive word in positive class (likelihood): ")
print(s.pos_prob, '\n')
print("Probability of positive class (prior): ")
print(s.pos_c_prob, '\n')

print("Probability of each negative word in negative class (likelihood): ")
print(s.neg_prob, '\n')
print("Probability of negative class (prior): ")
print(s.neg_c_prob, '\n')

print("Testing dictionary of the values of P(c)*P(data|c) for each class: ")
print("Positive: ", s.score("I loved the hotel"))
print("Negative: ", s.score("I hated the hotel"), '\n')

print("Testing Classification: ")
print("Positive - I loved the hotel a lot: ", s.classify("I loved the hotel a lot"))
print("Negative - I hated the hotel: ", s.classify("I hated the hotel"), '\n')

y = generate_test("HW3-testset.txt")

# The with-statement closes BOTH output files (the improved-labels file was
# previously never closed), even if classification raises part-way through.
with open("label_test_data.txt", "w") as z, open("label_test_data_improved.txt", "w") as z1:
    for line in y:
        label = s.classify(line[1])
        label1 = s1.classify(line[1])
        z.write(line[0] + " " + str(label) + '\n')
        z1.write(line[0] + " " + str(label1) + '\n')
        s.classified_labels.append(label)
        s1.classified_labels.append(label1)

print("Label output created and named: label_test_data.txt")
print("Gold Labels: ", s.gold_labels)
print("Created Labels: ", s.classified_labels)
print("Created Improved Labels: ", s1.classified_labels, '\n')

print("Precision Unimproved: ", precision(s.gold_labels, s.classified_labels))
print("Recall Unimproved: ", recall(s.gold_labels, s.classified_labels))
print("F1 Unimproved: ", f1(s.gold_labels, s.classified_labels), '\n')

print("Precision Improved: ", precision(s1.gold_labels, s1.classified_labels))
print("Recall Improved: ", recall(s1.gold_labels, s1.classified_labels))
print("F1 Improved: ", f1(s1.gold_labels, s1.classified_labels), '\n')
