# The review.csv dataset can be found at https://drive.google.com/file/d/1nq1fqLHYc3UX9YM2OeZys9lbiwAlK8TY/view?usp=sharing

# Preprocessor - Creating Dataset

In [1]:
# -*- coding: utf-8 -*-
import csv
import nltk
import numpy as np
from math import sqrt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.sentiment.util import mark_negation
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import pandas

stop_words = set(stopwords.words('english'))

f = open('review.csv', "r", encoding = "utf-8")
read = csv.reader(f)
counter = 0
text_reviews = []
stars = []
for row in read:
    counter += 1
    if(counter < 200000): 
        if row[5] != '3':
            stars.append(row[5])
            text_reviews.append(row[6])
    else:
        f.close()
        text_reviews = text_reviews[1:]
        stars = stars[1:]
        break



# Preprocessor - Creating "Balanced Subset"

In [2]:
n = Counter(stars)
max_ = n.most_common()[-1][1]
n_added = {class_: 0 for class_ in n.keys()}
new_ys = []
new_xs = []
for i, y in enumerate(stars):
    if n_added[y] < max_:
        new_ys.append(y)
        new_xs.append(text_reviews[i])
        n_added[y] += 1
        
stars = new_ys
text_reviews = new_xs

# Preprocessor - Lemmatizer

In [3]:
xs_dupe = []
l = WordNetLemmatizer()
for x in text_reviews:
    review = str()
    for w in word_tokenize(x):
        current = str(l.lemmatize(w, pos = 'n'))
        for t in [l.lemmatize(w, pos = 'v'), l.lemmatize(w, pos = 'a'), l.lemmatize(w, pos = 'r')]:
            if len(t) < len(current):
                current = t
        if '\'' in current or '!' in current or '.' in current or ',' in current:
            review = review + current
        else:
            review = review + ' ' + current
    xs_dupe.append(review)
text_reviews = xs_dupe

# Vectorizer - n-grams, removing stopwords, negation detection

In [4]:
vectorizer = CountVectorizer(ngram_range=(1,3), stop_words = 'english', tokenizer=lambda text: mark_negation(word_tokenize(text)))
vectors = vectorizer.fit_transform(text_reviews)
stars = np.array(stars)

# 5-Fold cross validation, training classifier, evaluating accuracy

In [5]:
kf = KFold(n_splits = 5)
kf.get_n_splits(vectors)
binary_accuracy_list = list()
classifier_accuracy_list = list()
for x,y in kf.split(vectors):
    x = list(x)
    y = list(y)
    X_train, X_test = vectors[x], vectors[y]
    y_train, y_test = stars[x], stars[y]
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    correct = 0
    for i, x in enumerate(y_test):
        if x == "1":
            if y_pred[i] == "1":
                correct += 1
            elif y_pred[i] == "2":
                correct += 1
        elif x == "2":
            if y_pred[i] == "1":
                correct += 1
            elif y_pred[i] == "2":
                correct += 1
        if x == "4":
            if y_pred[i] == "4":
                correct += 1
            elif y_pred[i] == "5":
                correct += 1
        if x == "5":
            if y_pred[i] == "4":
                correct += 1
            elif y_pred[i] == "5":
                correct += 1
    binary_accuracy_list.append(correct/len(y_test))
    classifier_accuracy_list.append(metrics.accuracy_score(y_test, y_pred))
classifier_accuracy_list = np.array(classifier_accuracy_list)
binary_accuracy_list = np.array(binary_accuracy_list)
print("4-class classification = ", classifier_accuracy_list.mean())
print("binary = ", binary_accuracy_list.mean())

4-class classification =  0.5085760622222785
binary =  0.7931842526646934


# Optimal Linear Support Vector Machine

In [2]:
# -*- coding: utf-8 -*-
import csv
import nltk
import numpy as np
from math import sqrt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.sentiment.util import mark_negation
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import pandas

stop_words = set(stopwords.words('english'))

f = open('review.csv', "r", encoding = "utf-8")
read = csv.reader(f)
counter = 0
text_reviews = []
stars = []
for row in read:
    counter += 1
    if(counter < 200000): 
        if row[5] != '3':
            stars.append(row[5])
            text_reviews.append(row[6])
    else:
        f.close()
        text_reviews = text_reviews[1:]
        stars = stars[1:]
        break
vectorizer = TfidfVectorizer(ngram_range=(1,3), tokenizer=lambda text: mark_negation(word_tokenize(text)))
vectors = vectorizer.fit_transform(text_reviews)
stars = np.array(stars)
X_train, X_test, y_train, y_test = train_test_split(vectors, stars, test_size=0.2, random_state=24)
svm = LinearSVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
correct = 0
for i, x in enumerate(y_test):
    if x == "1":
        if y_pred[i] == "1":
            correct += 1
        elif y_pred[i] == "2":
            correct += 1
    elif x == "2":
        if y_pred[i] == "1":
            correct += 1
        elif y_pred[i] == "2":
            correct += 1
    if x == "4":
        if y_pred[i] == "4":
            correct += 1
        elif y_pred[i] == "5":
            correct += 1
    if x == "5":
        if y_pred[i] == "4":
            correct += 1
        elif y_pred[i] == "5":
            correct += 1
            
print("4-class classification = ", metrics.accuracy_score(y_test, y_pred))
print("binary = ", correct/len(y_test))       

4-class classification =  0.755445209344
binary =  0.961488545310913


# Optimal Multinomial Naive Bayes

In [7]:
# -*- coding: utf-8 -*-
import csv
import nltk
import numpy as np
from math import sqrt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.sentiment.util import mark_negation
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import pandas

stop_words = set(stopwords.words('english'))

f = open('review.csv', "r", encoding = "utf-8")
read = csv.reader(f)
counter = 0
text_reviews = []
stars = []
for row in read:
    counter += 1
    if(counter < 200000): 
        if row[5] != '3':
            stars.append(row[5])
            text_reviews.append(row[6])
    else:
        f.close()
        text_reviews = text_reviews[1:]
        stars = stars[1:]
        break

vectorizer = CountVectorizer(ngram_range=(1,4), tokenizer=lambda text: mark_negation(word_tokenize(text)))
vectors = vectorizer.fit_transform(text_reviews)
stars = np.array(stars)
kf = KFold(n_splits = 5)
kf.get_n_splits(vectors)
binary_accuracy_list = list()
classifier_accuracy_list = list()
for x,y in kf.split(vectors):
    x = list(x)
    y = list(y)
    X_train, X_test = vectors[x], vectors[y]
    y_train, y_test = stars[x], stars[y]
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    correct = 0
    for i, x in enumerate(y_test):
        if x == "1":
            if y_pred[i] == "1":
                correct += 1
            elif y_pred[i] == "2":
                correct += 1
        elif x == "2":
            if y_pred[i] == "1":
                correct += 1
            elif y_pred[i] == "2":
                correct += 1
        if x == "4":
            if y_pred[i] == "4":
                correct += 1
            elif y_pred[i] == "5":
                correct += 1
        if x == "5":
            if y_pred[i] == "4":
                correct += 1
            elif y_pred[i] == "5":
                correct += 1
    binary_accuracy_list.append(correct/len(y_test))
    classifier_accuracy_list.append(metrics.accuracy_score(y_test, y_pred))
classifier_accuracy_list = np.array(classifier_accuracy_list)
binary_accuracy_list = np.array(binary_accuracy_list)
print("4-class classification = ", classifier_accuracy_list.mean())
print("binary = ", binary_accuracy_list.mean())

4-class classification =  0.6935317458003933
binary =  0.9030465234710583


# Optimal Linear Regression

In [8]:
# -*- coding: utf-8 -*-
import csv
import nltk
import numpy as np
from math import sqrt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.sentiment.util import mark_negation
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import pandas

stop_words = set(stopwords.words('english'))

f = open('review.csv', "r", encoding = "utf-8")
read = csv.reader(f)
counter = 0
text_reviews = []
stars = []
for row in read:
    counter += 1
    if(counter < 200000): 
        if row[5] != '3':
            stars.append(row[5])
            text_reviews.append(row[6])
    else:
        f.close()
        text_reviews = text_reviews[1:]
        stars = stars[1:]
        break
vectorizer = TfidfVectorizer(ngram_range=(1,3), tokenizer=lambda text: mark_negation(word_tokenize(text)))
vectors = vectorizer.fit_transform(text_reviews)
stars = np.array(stars)
X_train, X_test, y_train, y_test = train_test_split(vectors, stars, test_size=0.2, random_state=24)
svm = LinearSVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

correct = 0
for i, x in enumerate(y_test):
    if x == "1":
        if float(y_pred[i]) < 1.5:
            correct += 1
    elif x == "2":
        if float(y_pred[i]) >= 1.5 and float(y_pred[i]) < 2.5:
            correct += 1
    if x == "4":
        if float(y_pred[i]) >= 3.5 and float(y_pred[i]) < 4.5:
            correct += 1
    if x == "5":
        if float(y_pred[i]) >= 4.5:
            correct += 1

print("4-class classification = ", correct/len(y_test)) 

correct = 0
for i, x in enumerate(y_test):
    if x == "1":
        if float(y_pred[i]) <= 2.5:
            correct += 1
    elif x == "2":
        if float(y_pred[i]) <= 2.5:
            correct += 1
    if x == "4":
        if float(y_pred[i]) >= 3.5:
            correct += 1
    if x == "5":
        if float(y_pred[i]) >= 3.5:
            correct += 1

            
print("binary = ", correct/len(y_test)) 

4-class classification =  0.7554452093443178
binary =  0.961488545310913
