In [54]:
# Import statements
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import os
import nltk
from nltk.stem import WordNetLemmatizer
import re
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression #Max entropy
from nltk import tokenize

In [55]:
negpath = "../Data/review_polarity/txt_sentoken/neg"
pospath = "../Data/review_polarity/txt_sentoken/pos"
#function to read all negative and positive lines into separate lists
def read_file(path):
    listlines = []
    for file in os.listdir(path):
        currfile=path+"/"+file
        f = open(currfile, "r")
        lines = f.read() #switched from readlines() to read the entire review
        listlines.append(lines)
    return listlines
old_neglines=[]
old_poslines=[]
old_neglines = read_file(negpath)
old_poslines = read_file(pospath)
#print(old_neglines[0])

In [56]:
#For Modification 3: Gets all words that are associated with an emotion.
emotion_path = "../Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
f = open(emotion_path, "r")
emotions = f.readlines()
emotion_words = set()
for line in emotions:
    emotion = line.split()
    #If the current word has an emotion associated with it, add it to the set
    if(emotion[2]=="1"):
        emotion_words.add(emotion[0])

In [57]:
#Preprocessing: lowercase and words only (remove punctuation, numbers)
#lemmatized
from nltk.corpus import stopwords
#stopwords minus negation
stop_words = set(stopwords.words("english"))-set(["couldn't","wouldn","haven't","haven","aren","aren't","isn","isn't","ain","wouldn't","didn","didn't","doesn","doesn't","wasn't","don","don't","couldn","shouldn","shouldn't","hasn","hasn't","hadn","hadn't","not","won","won't",])
neg_words = "never|nothing|nowhere|none|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint|no$|not$"

def preprocess(listlines):
    processed = []
    for line in listlines:
        #reviews appear to already be in lowercase, but ensure that's the case:
        newline = line.lower()
        
        ########### Modification 3: Remove sentences with no emotion words ##########
        sentences=nltk.sent_tokenize(newline)
        indices = [] #indices of sentences without emotion words
        for i in range(len(sentences)):
                templine = sentences[i].split()
                #Doesn't deal with length-1 sentences (punctuation)
                if len(sentences[i])>1:
                        emo = False #tracks if any emotion words appear in sentence
                        for word in templine:
                                #lemmatize each word to check if lemma in emotion words
                                lemmatizer = WordNetLemmatizer()
                                lemword = lemmatizer.lemmatize(word)
                                if lemword in emotion_words:
                                        emo = True
                                        break
                        if emo == False:
                                #print(sentences[i])
                                indices.append(i)
        #delete sentences
        for i in reversed(indices):
                del sentences[i]
        #update newline to reflect changes
        newline=" ".join(sentences)
        ########################################################################  
        #remove stopwords
        newline = newline.split()
        newline = [word for word in newline if not word in stop_words]
        newline = ' '.join(newline)

        ########### Modification 2: Mark words following negative words ##########
        punct = "[.:;!?]"
        newline = re.sub(r"[^a-zA-z.:;!?\s]",'',newline) #keep clause-level punctuation first
        newline = newline.split()
        #iterate through tokens and mark words. Note clause-level punctuation is its own token
        count = 0
        while count < len(newline):
            #if current word is a negation, process
            if re.search(neg_words,newline[count]) is not None:
                count+=1
                #mark all words until clause-level punctuation reached
                while count < len(newline) and re.search(punct,newline[count]) is None:
                    newline[count] = newline[count]+"_NEG"
                    count+=1
            else:
                count+=1
        newline = ' '.join(newline)
        ########################################################################        
        #remove non-alphabet characters, keeping underscore.
        newline = re.sub(r"[^a-zA-z\s_]",'',newline) 
        ########### Modification 1: Replace negation with "not" ##########
        newline = re.sub(neg_words,"not",newline)
        ########################################################################
        #print(newline)
        processed.append(newline)
    return processed
neglines = preprocess(old_neglines)
poslines = preprocess(old_poslines)

In [58]:
#label each line with "positive" or "negative"
def labellines(listlines, sentiment):
    labeled = []
    for line in listlines:
        curr_line = []
        curr_line.append(line)
        curr_line.append(sentiment)
        labeled.append(curr_line)
    return labeled
neglabeled=labellines(neglines,"Negative")
poslabeled=labellines(poslines,"Positive")
lines = neglabeled+poslabeled

In [59]:
df = pd.DataFrame(lines, columns=["word","sentiment"])
print(df.head(10))

                                                word sentiment
0  plot  two teen couples go church party drink d...  Negative
1  happy bastards quick movie review damn yk bug ...  Negative
2  movies like make jaded movie viewer thankful i...  Negative
3  quest camelot warner bros  first featurelength...  Negative
4  synopsis  mentally unstable man undergoing psy...  Negative
5  capsule  planet mars police taking custody acc...  Negative
6  wholesome surveillance man loses sight values ...  Negative
7  thats exactly long movie felt  even nine laugh...  Negative
8  call road trip walking wounded  rd plays convi...  Negative
9  plot  young french boy sees pnots killed_NEG e...  Negative


In [60]:
labels = df["sentiment"]
data = df.drop(columns=["sentiment"])
X = data["word"]

In [61]:
#From demo:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # if you get a zero-devision warning message, you can supress it by setting zero_division=0
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    return f1, precision, recall, accuracy

In [62]:
# Construct the classifiers at hand prior to folding the data through them
names = ['Naive_Bayes', 'Decision_Tree','Max_Entropy']#,'Support_Vector_Machines']
classifiers = [MultinomialNB(), 
              DecisionTreeClassifier(random_state=0, criterion='gini'), LogisticRegression(random_state=0,max_iter=1000)]#, SVC(kernel='linear')]
# names = ['Max_Entropy']
# classifiers = [LogisticRegression(random_state=0,max_iter=1000)]
# Try different classifiers: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
for name, clf in zip(names, classifiers):

    #print('Now classifying', name)

    # Fold the data 5 times
    kf = KFold(n_splits = 5, shuffle=True, random_state=0)
    foldCounter = 0
    aList, bList, cList, dList = list(), list(), list(), list()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        #vectorize here:
        #vectorizer = CountVectorizer() #comment out if using tfidf
        ######################Modification 4: Use tfidf#####################
        vectorizer = TfidfVectorizer()
        ####################################################################
        X_train_vect = vectorizer.fit_transform(X_train).toarray()
        X_test_vect = vectorizer.transform(X_test).toarray()
        f1, precision, recall, accuracy = buildClassifiers(clf, X_train_vect, X_test_vect, y_train, y_test)
        aList.append(f1)
        bList.append(precision)
        cList.append(recall)
        dList.append(accuracy)
    print("Average F1 for {}:\t\t".format(name), np.mean(aList))
    print("Average Precision for {}:\t".format(name), np.mean(bList))
    print("Average Recall for {}:\t".format(name), np.mean(cList))
    print("Average Accuracy for {}:\t".format(name), np.mean(dList))

Average F1 for Naive_Bayes:		 0.8079380883785785
Average Precision for Naive_Bayes:	 0.8119796400847742
Average Recall for Naive_Bayes:	 0.810929550344745
Average Accuracy for Naive_Bayes:	 0.808
Average F1 for Decision_Tree:		 0.6191526614885313
Average Precision for Decision_Tree:	 0.6203174239600939
Average Recall for Decision_Tree:	 0.6201262244789335
Average Accuracy for Decision_Tree:	 0.62
Average F1 for Max_Entropy:		 0.8247291223489034
Average Precision for Max_Entropy:	 0.825198737913604
Average Recall for Max_Entropy:	 0.8256080354798001
Average Accuracy for Max_Entropy:	 0.825
