In [1]:
import os
import random
from collections import Counter
from itertools import chain
from random import shuffle

import numpy as np
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Read all sentences into lists...

In [2]:
pos_text_dir = '../Data/review_polarity/txt_sentoken/pos/'
neg_text_dir = '../Data/review_polarity/txt_sentoken/neg/'


def _read_tokenized_lines(text_dir):
    """Tokenize every line of every file found directly inside ``text_dir``.

    Parameters
    ----------
    text_dir : str
        Directory whose entries are all opened as text files.

    Returns
    -------
    list of list of str
        One token list (from ``word_tokenize``) per input line, in sorted
        file-name order and then in line order within each file.
        Presumably each line holds one sentence (the directory is named
        ``txt_sentoken``) -- not verified here.
    """
    tokenized_lines = []
    # os.listdir() returns names in arbitrary order; sorting makes the corpus
    # order (and therefore any seeded shuffle applied later) reproducible.
    for fname in sorted(os.listdir(text_dir)):
        with open(os.path.join(text_dir, fname), 'r') as fh:
            for line in fh:
                tokenized_lines.append(word_tokenize(line))
    return tokenized_lines


pos_sentences = _read_tokenized_lines(pos_text_dir)
neg_sentences = _read_tokenized_lines(neg_text_dir)

Shuffle each class and divide it into train/dev/test sets (roughly 60% / 20% / 20% of the sentences per class)...

In [3]:
# Seed the global `random` generator used by `shuffle` so that a fresh
# Restart & Run All always produces the same split (and the same metrics).
# Note: the shuffles are in place, so re-running only this cell shuffles the
# already-shuffled lists again; re-run the loading cell first for a clean split.
SPLIT_SEED = 0
random.seed(SPLIT_SEED)
shuffle(pos_sentences)
shuffle(neg_sentences)

npos = len(pos_sentences)
nneg = len(neg_sentences)

# Cut points: the first 6 tenths are train, the next 2 tenths validation, and
# the remainder test. `n // 10 * k` rounds the boundary down to a multiple of k,
# so the test part also absorbs the rounding remainder.
pos_train_end, pos_val_end = npos // 10 * 6, npos // 10 * 8
neg_train_end, neg_val_end = nneg // 10 * 6, nneg // 10 * 8

# Each split is a two-element list: index 0 = positive, index 1 = negative.
traindata = [pos_sentences[:pos_train_end], neg_sentences[:neg_train_end]]
valdata = [pos_sentences[pos_train_end:pos_val_end],
           neg_sentences[neg_train_end:neg_val_end]]
testdata = [pos_sentences[pos_val_end:], neg_sentences[neg_val_end:]]

In [9]:
# When TESTING is True the model is trained on train + validation before the
# final test-set run; when False it is trained on the training part only.
TESTING = True

# Rebuild `traindata` from the shuffled corpora instead of appending to it in
# place: `traindata[i] += valdata[i]` would add the validation sentences again
# every time this cell is re-run, and could not be undone by flipping TESTING.
# This relies on valdata/testdata being the contiguous tail slices of
# pos_sentences / neg_sentences, as built in the split cell.
traindata = []
for corpus, val_part, test_part in zip([pos_sentences, neg_sentences],
                                       valdata, testdata):
    held_out = len(test_part) + (0 if TESTING else len(val_part))
    traindata.append(corpus[:len(corpus) - held_out])

Get Bag-of-Words dictionaries...

In [10]:
# Flatten the tokenized training sentences of each class into one token list.
trainpos = [token for sentence in traindata[0] for token in sentence]
trainneg = [token for sentence in traindata[1] for token in sentence]

# Per-class token frequencies.
cntpos, cntneg = Counter(trainpos), Counter(trainneg)

# Vocabulary: every token seen in either class of the training data.
total_words = set(trainpos) | set(trainneg)

In [11]:
# For every vocabulary word, store the log of the add-one smoothed share of
# its training occurrences that came from positive sentences:
#   (1 + count_pos) / (2 + count_pos + count_neg)
# A word seen equally often in both classes therefore maps to log(0.5).
pos_logprob = {
    word: np.log((1 + cntpos[word]) / (2 + cntpos[word] + cntneg[word]))
    for word in total_words
}

# Set of words that have a score (same members as the keys of pos_logprob).
aff_word_list = set(pos_logprob)

Get prediction results on the validation (dev) set...

In [7]:
# Gold labels: all positive validation sentences (1) followed by all negative (0).
y_true = np.array([1]*len(valdata[0])+[0]*len(valdata[1]))

# If the TESTING cell has been run with TESTING = True and the model was
# rebuilt afterwards, the validation sentences were used for training, so the
# numbers printed here are not a held-out estimate. `globals().get` keeps the
# cell runnable even when the TESTING cell has not been executed yet.
if globals().get("TESTING", False):
    print("WARNING: TESTING is True, so the validation sentences may already be "
          "part of the training data; validation metrics may be optimistic.")

# Dedicated, seeded generator for the coin flip used on sentences without any
# known word, so re-running the cell reproduces the same predictions.
tie_rng = np.random.RandomState(0)
neutral_logprob = np.log(0.5)  # score of a word equally frequent in both classes

y_pred = []
# chain.from_iterable walks positive then negative sentences, matching y_true.
for sent in chain.from_iterable(valdata):
    aff_score = [pos_logprob[w] for w in sent if w in pos_logprob]
    if not aff_score:
        # Empty line or only unseen words: no evidence, guess at random.
        y_pred.append(tie_rng.choice(2))
    elif np.mean(aff_score) > neutral_logprob:
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.array(y_pred)

print("The accuracy is:", accuracy_score(y_true, y_pred))

# Per-class precision / recall / F1, positive class first.
for label, name in ((1, "positive"), (0, "negative")):
    print("")
    print("The precision for {} is:".format(name),
          precision_score(y_true, y_pred, pos_label=label))
    print("The recall for {} is:".format(name),
          recall_score(y_true, y_pred, pos_label=label))
    print("The f1 score for {} is:".format(name),
          f1_score(y_true, y_pred, pos_label=label))

The accuracy is: 0.6609488487096276

The precision for positive is: 0.618426724137931
The recall for positive is: 0.8713938657758883
The f1 score for positive is: 0.7234337577209127

The precision for negative is: 0.7687056253413436
The recall for negative is: 0.44288860918816864
The f1 score for negative is: 0.5619884208424836


Run on test data...

In [12]:
# Gold labels: all positive test sentences (1) followed by all negative (0).
y_test = np.array([1]*len(testdata[0])+[0]*len(testdata[1]))

# Dedicated, seeded generator for the coin flip used on sentences without any
# known word, so re-running the cell reproduces the same predictions.
tie_rng = np.random.RandomState(0)
neutral_logprob = np.log(0.5)  # score of a word equally frequent in both classes

y_pred = []
# chain.from_iterable walks positive then negative sentences, matching y_test.
for sent in chain.from_iterable(testdata):
    aff_score = [pos_logprob[w] for w in sent if w in pos_logprob]
    if not aff_score:
        # Empty line or only unseen words: no evidence, guess at random.
        y_pred.append(tie_rng.choice(2))
    elif np.mean(aff_score) > neutral_logprob:
        y_pred.append(1)
    else:
        y_pred.append(0)
y_pred = np.array(y_pred)

print("The accuracy is:", accuracy_score(y_test, y_pred))

# Per-class precision / recall / F1, positive class first.
for label, name in ((1, "positive"), (0, "negative")):
    print("")
    print("The precision for {} is:".format(name),
          precision_score(y_test, y_pred, pos_label=label))
    print("The recall for {} is:".format(name),
          recall_score(y_test, y_pred, pos_label=label))
    print("The f1 score for {} is:".format(name),
          f1_score(y_test, y_pred, pos_label=label))

The accuracy is: 0.6780420012353304

The precision for positive is: 0.6330879929693508
The recall for positive is: 0.8741089033823752
The f1 score for positive is: 0.7343272171253823

The precision for negative is: 0.7843595739153026
The recall for negative is: 0.4747601824186193
The f1 score for negative is: 0.5914968652037619
