In [None]:
polarity_file = "review_polarity.zip"
!unzip {polarity_file}

In [2]:
import glob
neg_dir = './review_polarity/txt_sentoken/neg'
pos_dir = './review_polarity/txt_sentoken/pos'



In [3]:
def get_train_test():
  neg_reviews = sorted(glob.glob('./review_polarity/txt_sentoken/neg/*'))
  pos_reviews = sorted(glob.glob('./review_polarity/txt_sentoken/pos/*'))
  train = neg_reviews[:750]+pos_reviews[:750]
  test = neg_reviews[750:]+pos_reviews[750:]
  return train, test
  


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
train, test = get_train_test()

In [7]:
from nltk.stem.porter import *
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import mark_negation


In [8]:
def porter_stem(words):
  stemmer = PorterStemmer()
  s = [stemmer.stem(w) for w in words]
  return s

In [9]:
def include_negation(words):
  sentim_analyzer = SentimentAnalyzer() 
  ci = [nltk.sentiment.util.mark_negation(doc) for doc in words]
  return ci


In [10]:
from nltk.classify.rte_classify import ne
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

def load_doc(filename):
	file = open(filename, 'r',encoding='utf-8', errors = 'ignore')
	text = file.read()
	file.close()
	return text

def clean_doc(doc):
	tokens = doc.split()
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	tokens = [word for word in tokens if word.isalpha()]
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	tokens = [word for word in tokens if len(word) > 1]
	return tokens

def add_doc_to_vocab(filename, vocab):
	doc = load_doc(filename)
	tokens = clean_doc(doc)
	vocab.update(tokens)

def get_vocab(paths):
  vocab = Counter()
  for path in paths:
    add_doc_to_vocab(path, vocab)
  return vocab

baseline_neg_vocab = get_vocab(train[0:750])
baseline_pos_vocab = get_vocab(train[750:])






In [11]:
import string
import sklearn


In [12]:
def tokenizeMe(line):
    line = line.split(" ")
    line = [word.translate(str.maketrans('', '', string.punctuation)) for word in line]
    line = [word for word in line if word.isalpha()]
    line = [word for word in line if len(word) > 1]
    line = [word for word in line if word not in set(nltk.corpus.stopwords.words('english'))]
    line = porter_stem(line)
    line = include_negation(line)
    return(line)

In [13]:
all_vocab = []
for hello in baseline_neg_vocab.keys():
  all_vocab.append(hello)
for hello in baseline_pos_vocab.keys():
  all_vocab.append(hello)
all_vocab = porter_stem(all_vocab)
all_vocab = include_negation(all_vocab)
all_vocab = sorted(list(set(all_vocab)))


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


In [15]:

def train_fit(clf):
  v = TfidfVectorizer(input = 'filename', tokenizer=tokenizeMe,vocabulary=all_vocab)
  X_train = v.fit_transform(train).toarray()
  labels_train = [0]*750+[1]*750
  clf.fit(X_train,labels_train)
  X_test = v.fit_transform(test)
  scores = clf.predict(X_test)
  test_gt = [0]*250+[1]*250
  f1 = f1_score(test_gt,scores, average="micro")
  prec = precision_score(test_gt,scores,average="micro")
  recall = recall_score(test_gt,scores,average="micro")
  acc = accuracy_score(test_gt,scores)

  print("F1 Score -->"+str(f1))
  print("Precision -->"+str(prec))
  print("Recall -->" +str(recall))
  print("Accuracy -->"+str(acc))


  

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import numpy as np

In [17]:
names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000)]

In [18]:
for i in range(4):
  #SVM requires sparse data
  
  print(names[i])
  train_fit(classifiers[i])

Nearest Neighbors
F1 Score -->0.656
Precision -->0.656
Recall -->0.656
Accuracy -->0.656
Decision Tree
F1 Score -->0.674
Precision -->0.674
Recall -->0.674
Accuracy -->0.674
Random Forest
F1 Score -->0.558
Precision -->0.558
Recall -->0.558
Accuracy -->0.558
Neural Net
F1 Score -->0.866
Precision -->0.866
Recall -->0.866
Accuracy -->0.866


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

def train_fit_svm(clf):
  v = CountVectorizer(input = 'filename', tokenizer=tokenizeMe,vocabulary=all_vocab)
  X_train = v.fit_transform(train)
  labels_train = [0]*750+[1]*750
  clf.fit(X_train,labels_train)
  X_test = v.fit_transform(test)
  scores = clf.predict(X_test)
  test_gt = [0]*250+[1]*250
  f1 = f1_score(test_gt,scores, average="micro")
  prec = precision_score(test_gt,scores,average="micro")
  recall = recall_score(test_gt,scores,average="micro")
  acc = accuracy_score(test_gt,scores)

  print("F1 Score -->"+str(f1))
  print("Precision -->"+str(prec))
  print("Recall -->" +str(recall))
  print("Accuracy -->" +str(acc))
svm_names = ["Linear SVM","RBF SVM"]
svms = [SVC(kernel="linear", C=0.025),SVC(gamma=2, C=1)]



In [20]:
for i in range(2):
  print(svm_names[i])
  train_fit_svm(svms[i])

Linear SVM
F1 Score -->0.824
Precision -->0.824
Recall -->0.824
Accuracy -->0.824
RBF SVM
F1 Score -->0.5
Precision -->0.5
Recall -->0.5
Accuracy -->0.5


In [21]:
def train_fit_dense(clf):
  v = TfidfVectorizer(input = 'filename', tokenizer=tokenizeMe,vocabulary=all_vocab)
  X_train = v.fit_transform(train)
  X_train = X_train.toarray()
  labels_train = [0]*750+[1]*750
  clf.fit(X_train,labels_train)
  X_test = v.fit_transform(test)
  X_test = X_test.toarray()
  scores = clf.predict(X_test)
  test_gt = [0]*250+[1]*250
  f1 = f1_score(scores, test_gt,average="micro")
  prec = precision_score(scores, test_gt,average="micro")
  recall = recall_score(scores, test_gt,average="micro")
  acc = accuracy_score(scores,test_gt)

  print("F1 Score -->"+str(f1))
  print("Precision -->"+str(prec))
  print("Recall -->" +str(recall))
  print("Accuracy -->"+str(acc))


In [22]:
print("QDA")
train_fit_dense(QuadraticDiscriminantAnalysis())
print("Naive Bayes")
train_fit_dense(GaussianNB())

QDA




F1 Score -->0.512
Precision -->0.512
Recall -->0.512
Accuracy -->0.512
Naive Bayes
F1 Score -->0.614
Precision -->0.614
Recall -->0.614
Accuracy -->0.614
