In [123]:
# Import statements
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import os
import nltk
from nltk.stem import WordNetLemmatizer
import re
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression #Max entropy

In [124]:
# Folders containing the negative and positive review files, given relative
# to the notebook's working directory.
negpath = "../Data/review_polarity/txt_sentoken/neg"
pospath = "../Data/review_polarity/txt_sentoken/pos"
# Helper that reads every file of a folder into a list (one string per file)
def read_file(path):
    """Read every regular file directly inside ``path``.

    Parameters
    ----------
    path : str
        Folder to scan (not recursive).

    Returns
    -------
    list of str
        The full text of each file, one entry per file, ordered by file name.
        An empty folder yields an empty list.

    Raises
    ------
    OSError
        If ``path`` cannot be listed or a file cannot be opened/read.
    """
    listlines = []
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus
    # order (and everything derived from it) stable between runs and machines.
    for file in sorted(os.listdir(path)):
        currfile = os.path.join(path, file)
        # Skip sub-directories and other non-file entries instead of failing in open().
        if not os.path.isfile(currfile):
            continue
        # The context manager closes the handle even if the read fails.
        with open(currfile, "r") as f:
            # read() rather than readlines(): keep the whole review as one string.
            listlines.append(f.read())
    return listlines
# Load the raw review texts: one list entry per file in each class folder.
old_neglines = read_file(negpath)
old_poslines = read_file(pospath)

In [125]:
# Preprocessing setup: the stop-word set and a negation pattern.
from nltk.corpus import stopwords

# English stop words with the negation-bearing entries taken out, so that
# negations are not discarded when stop words are filtered from a review.
stop_words = set(stopwords.words("english")).difference({
    "couldn't", "wouldn", "haven't", "haven", "aren", "aren't", "isn", "isn't",
    "ain", "wouldn't", "didn", "didn't", "doesn", "doesn't", "wasn't", "don",
    "don't", "couldn", "shouldn", "shouldn't", "hasn", "hasn't", "hadn",
    "hadn't", "not", "won", "won't",
})

# Regex alternation of negation words (apostrophes already stripped).
# NOTE(review): not referenced by any of the cells shown here.
neg_words = "never|nothing|nowhere|none|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint|no$|not$"
def preprocess(listlines):
    """Lower-case each text, drop stop words, then strip non-letter characters.

    Steps applied to every entry, in this order:
      1. lower-case and split on whitespace;
      2. remove tokens found in the module-level ``stop_words`` set
         (tokens are compared before punctuation is stripped);
      3. re-join with single spaces and delete every character that is not an
         ASCII letter, whitespace or underscore.

    Parameters
    ----------
    listlines : iterable of str
        Raw texts.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order (empty input gives ``[]``).
    """
    # Spell the upper-case range as A-Z: the range ``A-z`` would also admit the
    # ASCII punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `).
    # Compiled once because the pattern does not change between texts.
    non_letters = re.compile(r"[^a-zA-Z\s_]")
    processed = []
    for line in listlines:
        # Reviews appear to be lower-case already; lower() makes that certain.
        kept = [word for word in line.lower().split() if word not in stop_words]
        processed.append(non_letters.sub('', ' '.join(kept)))
    return processed
# Clean both corpora with the same preprocessing routine.
neglines, poslines = (preprocess(raw) for raw in (old_neglines, old_poslines))

In [126]:
# Spot-check: show the first cleaned negative review.
print(neglines[0])

plot  two teen couples go church party  drink drive  get accident  one guys dies  girlfriend continues see life  nightmares  whats deal  watch movie  sorta  find    critique  mindfuck movie teen generation touches cool idea  presents bad package  makes review even harder one write  since generally applaud films attempt break mold  mess head  lost highway  memento   good bad ways making types films  folks didnt snag one correctly  seem taken pretty neat concept  executed terribly  problems movie  well  main problem simply jumbled  starts  normal  downshifts  fantasy  world  audience member  idea whats going  dreams  characters coming back dead  others look like dead  strange apparitions  disappearances  looooot chase scenes  tons weird things happen  simply not explained  personally dont mind trying unravel film every  give clue  get kind fed  films biggest problem  obviously got big secret hide  seems want hide completely final five minutes  make things entertaining  thrilling even eng

In [127]:
# Attach a sentiment label to every line.
def labellines(listlines, sentiment):
    """Pair each entry of ``listlines`` with ``sentiment``.

    Returns a new list of two-element lists ``[line, sentiment]``, in the
    same order as the input.
    """
    return [[text, sentiment] for text in listlines]
# Tag the cleaned reviews with their class ("Negative" / "Positive") ...
neglabeled = labellines(neglines, "Negative")
poslabeled = labellines(poslines, "Positive")
# ... and pool them into one list: negatives first, positives after.
lines = [*neglabeled, *poslabeled]

In [128]:
# One row per review: the cleaned text goes in "word", the class label in "sentiment".
df = pd.DataFrame(data=lines, columns=["word", "sentiment"])
print(df.head(n=10))

                                                word sentiment
0  plot  two teen couples go church party  drink ...  Negative
1  happy bastards quick movie review damn yk bug ...  Negative
2  movies like make jaded movie viewer thankful i...  Negative
3   quest camelot  warner bros   first featurelen...  Negative
4  synopsis  mentally unstable man undergoing psy...  Negative
5  capsule   planet mars police taking custody ac...  Negative
6  ask  mm    eight millimeter   really  wholesom...  Negative
7  thats exactly long movie felt  even nine laugh...  Negative
8  call road trip walking wounded  stellan skarsg...  Negative
9  plot  young french boy sees parents killed eye...  Negative


In [129]:
# Separate the target column from the features; X is the review-text Series.
labels = df.loc[:, "sentiment"]
data = df.drop("sentiment", axis=1)
X = data.loc[:, "word"]

In [130]:
#From demo:
# Train one classifier on a train split and score it on a test split.
# Any estimator exposing fit/predict can be passed in.
def buildClassifiers(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training data and evaluate it on the test data.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)``; the first three are
        macro-averaged, and undefined ratios are scored as 0 rather than
        raising a zero-division warning.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # Shared keyword arguments for the three macro-averaged metrics.
    macro = {"average": "macro", "zero_division": 0}
    return (
        f1_score(y_test, predicted, **macro),
        precision_score(y_test, predicted, **macro),
        recall_score(y_test, predicted, **macro),
        accuracy_score(y_test, predicted),
    )

In [131]:
#From demo:
# Classifiers to compare; every one is scored on the same 5 shuffled folds.
names = ['Naive_Bayes', 'Decision_Tree', 'Max_Entropy']
classifiers = [
    MultinomialNB(),
    DecisionTreeClassifier(random_state=0, criterion='gini'),
    LogisticRegression(random_state=0, max_iter=1000),
]
# Optional extra model: append 'Support_Vector_Machines' / SVC(kernel='linear') to the lists above.
# Try different classifiers: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

# Fold the data 5 times. Shuffle because data is organized into negative then positive.
# Built once: with an integer random_state every split() call yields the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for name, clf in zip(names, classifiers):
    f1_scores, precision_scores, recall_scores, accuracy_scores = [], [], [], []
    for train_index, test_index in kf.split(X):
        # KFold yields row positions, so select positionally with .iloc
        # instead of relying on the index labels happening to be 0..n-1.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]
        # Vectorize inside the fold: the vocabulary is learned from the
        # training reviews only and then applied to the held-out reviews.
        vectorizer = CountVectorizer()
        # NOTE(review): densifying is kept so the recorded scores stay comparable;
        # confirm whether sparse input could be passed to save memory.
        X_train_vect = vectorizer.fit_transform(X_train).toarray()
        X_test_vect = vectorizer.transform(X_test).toarray()
        f1, precision, recall, accuracy = buildClassifiers(clf, X_train_vect, X_test_vect, y_train, y_test)
        f1_scores.append(f1)
        precision_scores.append(precision)
        recall_scores.append(recall)
        accuracy_scores.append(accuracy)
    # Report the mean of each metric over the 5 folds.
    print("Average F1 for {}:\t\t".format(name), np.mean(f1_scores))
    print("Average Precision for {}:\t".format(name), np.mean(precision_scores))
    print("Average Recall for {}:\t".format(name), np.mean(recall_scores))
    print("Average Accuracy for {}:\t".format(name), np.mean(accuracy_scores))

Average F1 for Naive_Bayes:		 0.813587081877839
Average Precision for Naive_Bayes:	 0.8135985271703335
Average Recall for Naive_Bayes:	 0.813933679628291
Average Accuracy for Naive_Bayes:	 0.8140000000000001
Average F1 for Decision_Tree:		 0.6212577546193891
Average Precision for Decision_Tree:	 0.6214052795812168
Average Recall for Decision_Tree:	 0.6215279851946158
Average Accuracy for Decision_Tree:	 0.6220000000000001
Average F1 for Max_Entropy:		 0.8450694935554829
Average Precision for Max_Entropy:	 0.8457648381576334
Average Recall for Max_Entropy:	 0.8453949096594234
Average Accuracy for Max_Entropy:	 0.8454999999999998
