In [1]:
import os
import random
import re
import string
from collections import Counter
from itertools import chain
from random import shuffle

import numpy as np
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Read all reviews into tokenized lists, split by star rating (below 3.5 stars → negative, above 3.5 stars → positive)...

In [2]:
# Parse the review dump into two lists of tokenized reviews.
#
# The parser recognises three kinds of marker lines:
#   "<digit>star"  -> star rating that applies to the review that follows
#   "[[[..."       -> start of a review body
#   "]]]..."       -> end of a review body; the review is normalised and stored
# Any other non-empty line seen between "[[[" and "]]]" is review text.
yelp_text_dir = '../Data/ExtraCredit/Yelp/all_reviews.txt'
pos_sentences = []
neg_sentences = []
# An explicit encoding keeps decoding independent of the platform default.
with open(yelp_text_dir, 'r', encoding='utf-8') as f:
    reading_flag = False
    for raw_line in f:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would eat a real character (and miss a trailing
        # "<digit>star" marker) when the final line has no newline.
        line = raw_line.rstrip('\n')
        if line[1:] == 'star':
            cur_star = int(line[0])
        elif line.startswith('[[['):
            cur_review = ""
            reading_flag = True
        elif line.startswith(']]]'):
            reading_flag = False
            # Text normalization
            cur_review = cur_review.strip().lower()
            # Store: ratings below 3.5 are negative, above 3.5 positive
            if cur_star < 3.5:
                neg_sentences.append(word_tokenize(cur_review))
            elif cur_star > 3.5:
                pos_sentences.append(word_tokenize(cur_review))
        elif reading_flag and line:
            # Join the lines of a multi-line review with single spaces;
            # blank lines inside a review are skipped.
            cur_review += line + ' '


In [13]:
# Sanity check: number of reviews that were stored in the negative class.
len(neg_sentences)

4325

Divide into train/dev/test sets...

In [3]:
# Shuffle each class in place with a fixed seed so that a fresh
# "Restart Kernel -> Run All" always yields the same train/dev/test partition
# (and therefore the same reported metrics). A private Random instance is used
# so the global `random` state is left untouched.
# Note: the shuffle is in place, so re-running only this cell (without
# re-running the loading cell) permutes the already-shuffled lists again.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)
split_rng.shuffle(pos_sentences)
split_rng.shuffle(neg_sentences)

npos = len(pos_sentences)
nneg = len(neg_sentences)

# Per-class split boundaries: the first 6 tenths are train, the next 2 tenths
# are dev, and everything after that (including the remainder left over by the
# integer division) is test.
pos_train_end, pos_val_end = npos//10*6, npos//10*8
neg_train_end, neg_val_end = nneg//10*6, nneg//10*8

# Each split is a two-element list: [positive reviews, negative reviews].
traindata = [pos_sentences[:pos_train_end], neg_sentences[:neg_train_end]]
valdata = [pos_sentences[pos_train_end:pos_val_end], neg_sentences[neg_train_end:neg_val_end]]
testdata = [pos_sentences[pos_val_end:], neg_sentences[neg_val_end:]]

In [8]:
# TESTING = True  -> fit on train + dev (first 8 tenths of each class) before
#                    the final run on the test set.
# TESTING = False -> fit on the train portion only (first 6 tenths).
TESTING = True

# Rebuild traindata from the shuffled source lists instead of extending it in
# place with `+=`. The train and dev slices are contiguous, so train + dev is
# simply the first 8 tenths. Rebuilding makes this cell idempotent: running it
# twice no longer appends the dev reviews a second time, and flipping TESTING
# back to False restores the plain training split.
# Keep the boundaries in sync with the split cell (6 tenths / 8 tenths).
train_tenths = 8 if TESTING else 6
traindata = [
    pos_sentences[:npos//10*train_tenths],
    neg_sentences[:nneg//10*train_tenths],
]

Get Bag-of-Words dictionaries...

In [9]:
# Flatten the tokenized training reviews of each class into one token list.
trainpos = [token for review in traindata[0] for token in review]
trainneg = [token for review in traindata[1] for token in review]

# Per-class token frequencies; a Counter yields 0 for tokens it has not seen.
cntpos, cntneg = Counter(trainpos), Counter(trainneg)

# Vocabulary: every distinct token observed in either class.
total_words = set(cntpos) | set(cntneg)

In [10]:
# For every vocabulary word, store the log of the add-one smoothed share of
# its training occurrences that fall in positive reviews:
#     (count_pos + 1) / (count_pos + count_neg + 2)
pos_logprob = {
    token: np.log((cntpos[token] + 1) / (cntpos[token] + cntneg[token] + 2))
    for token in total_words
}

# Set of words that have a score, used for fast membership tests at
# prediction time.
aff_word_list = set(pos_logprob)

Get prediction results...

In [7]:
# Evaluate on the dev set.
# NOTE(review): when TESTING is True the dev reviews have been merged into
# traindata, so on a top-to-bottom run these scores are measured on data the
# word statistics were built from. Set TESTING = False for an honest dev score.

# Seeded generator for the coin-flip fallback below, so reruns give the same
# predictions and metrics.
EVAL_SEED = 0
eval_rng = np.random.default_rng(EVAL_SEED)

# A review is labelled positive when the mean log-score of its known words
# exceeds log(0.5).
decision_threshold = np.log(0.5)

# Gold labels follow the order in which reviews are scored: positives first.
y_true = np.array([1]*len(valdata[0])+[0]*len(valdata[1]))
y_pred = []
for sent in chain(valdata[0], valdata[1]):
    # Only words seen in training contribute to the score.
    aff_score = [pos_logprob[w] for w in sent if w in aff_word_list]
    if not aff_score:
        # No known word at all: fall back to a (reproducible) random guess.
        y_pred.append(int(eval_rng.integers(2)))
    elif np.mean(aff_score) > decision_threshold:
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.array(y_pred)

print("The accuracy is:", accuracy_score(y_true, y_pred))
print("")

## Stat for pos class
print("The precision for positive is:", precision_score(y_true, y_pred))
print("The recall for positive is:", recall_score(y_true, y_pred))
print("The f1 score for positive is:", f1_score(y_true, y_pred))
print("")

## Stat for neg class
print("The precision for negative is:", precision_score(y_true, y_pred, pos_label=0))
print("The recall for negative is:", recall_score(y_true, y_pred, pos_label=0))
print("The f1 score for negative is:", f1_score(y_true, y_pred, pos_label=0))

The accuracy is: 0.8314065510597303

The precision for positive is: 0.8470209339774557
The recall for positive is: 0.8679867986798679
The f1 score for positive is: 0.8573757131214343

The precision for negative is: 0.8081534772182254
The recall for negative is: 0.7800925925925926
The f1 score for negative is: 0.7938751472320378


Run on test data...

In [11]:
# Evaluate on the held-out test set.

# Seeded generator for the coin-flip fallback below, so reruns give the same
# predictions and metrics.
EVAL_SEED = 0
eval_rng = np.random.default_rng(EVAL_SEED)

# A review is labelled positive when the mean log-score of its known words
# exceeds log(0.5).
decision_threshold = np.log(0.5)

# Gold labels follow the order in which reviews are scored: positives first.
y_test = np.array([1]*len(testdata[0])+[0]*len(testdata[1]))
y_pred = []
for sent in chain(testdata[0], testdata[1]):
    # Only words seen in training contribute to the score.
    aff_score = [pos_logprob[w] for w in sent if w in aff_word_list]
    if not aff_score:
        # No known word at all: fall back to a (reproducible) random guess.
        y_pred.append(int(eval_rng.integers(2)))
    elif np.mean(aff_score) > decision_threshold:
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.array(y_pred)

print("The accuracy is:", accuracy_score(y_test, y_pred))
print("")

## Stat for pos class
print("The precision for positive is:", precision_score(y_test, y_pred))
print("The recall for positive is:", recall_score(y_test, y_pred))
print("The f1 score for positive is:", f1_score(y_test, y_pred))
print("")

## Stat for neg class
print("The precision for negative is:", precision_score(y_test, y_pred, pos_label=0))
print("The recall for negative is:", recall_score(y_test, y_pred, pos_label=0))
print("The f1 score for negative is:", f1_score(y_test, y_pred, pos_label=0))

The accuracy is: 0.8481073310972688

The precision for positive is: 0.8431073876618431
The recall for positive is: 0.9088669950738916
The f1 score for positive is: 0.8747530620308177

The precision for negative is: 0.8565891472868217
The recall for negative is: 0.762945914844649
The f1 score for negative is: 0.8070602556299453
