In [1]:
import os
import re
import string
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk import word_tokenize, pos_tag
from random import shuffle

pos_text_dir = '../Data/review_polarity/txt_sentoken/pos/'
neg_text_dir = '../Data/review_polarity/txt_sentoken/neg/'


def load_tokenized_sentences(text_dir):
    """Tokenize and POS-tag every line of every file found in ``text_dir``.

    Each line is handled as one unit (called a "sentence" throughout this
    notebook). Before tokenizing, the contraction suffix "n't" is rewritten
    to " not" so that the negation appears as its own token.

    Files are visited in sorted name order because ``os.listdir`` returns
    entries in arbitrary order; sorting keeps the corpus order stable
    between runs and machines.

    Parameters
    ----------
    text_dir : str
        Directory whose files are read line by line.

    Returns
    -------
    (list[list[str]], list[list[tuple[str, str]]])
        The token lists and, aligned with them, the ``pos_tag`` output for
        each token list.
    """
    sentences = []
    sentences_postag = []
    for file_name in sorted(os.listdir(text_dir)):
        with open(os.path.join(text_dir, file_name), 'r') as f:
            for line in f:
                line = re.sub('n\'t', ' not', line)
                tokens = word_tokenize(line)
                sentences.append(tokens)
                # Also get POS tags for each word
                sentences_postag.append(pos_tag(tokens))
    return sentences, sentences_postag


pos_sentences, pos_sentences_postag = load_tokenized_sentences(pos_text_dir)
neg_sentences, neg_sentences_postag = load_tokenized_sentences(neg_text_dir)

npos = len(pos_sentences)
nneg = len(neg_sentences)

# Part 1: Negation Handling
HANDLE_NEGATION = False

# Tokens that close the scope of a negation: every single punctuation
# character plus a handful of function words.
NEGATION_SCOPE_ENDERS = frozenset(string.punctuation).union(
    ['only', 'that', 'what', 'when', 'which', 'where', 'how', 'who'])
# Tokens that open a negation scope.
NEGATION_CUES = frozenset(['no', 'not', 'never', 'less', 'rarely', 'barely', 'seldom'])
# POS tags of tokens that receive the '_NEG' suffix inside a negation scope.
# 'RBS' (superlative adverb) replaces the former 'RBP', which is not a Penn
# Treebank tag and therefore never matched anything.
NEGATABLE_POS_TAGS = frozenset(['JJ', 'JJR', 'JJS',
                                'NN', 'NNP', 'NNS',
                                'RB', 'RBR', 'RBS', 'RP',
                                'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])


def apply_negation_marks(sentences, sentences_postag):
    """Append '_NEG' in place to tokens that fall inside a negation scope.

    A scope opens at any token in ``NEGATION_CUES`` and closes at any token
    in ``NEGATION_SCOPE_ENDERS``; neither kind of token is itself marked.
    Inside a scope only tokens whose POS tag is in ``NEGATABLE_POS_TAGS``
    are suffixed.

    Parameters
    ----------
    sentences : list[list[str]]
        Token lists; modified in place.
    sentences_postag : list[list[tuple[str, str]]]
        ``(token, tag)`` pairs aligned with ``sentences``. Only the tag is
        read, and these pairs are left untouched.
    """
    for tokens, tagged in zip(sentences, sentences_postag):
        negation_state = False
        for idx, token in enumerate(tokens):
            if token in NEGATION_SCOPE_ENDERS:
                negation_state = False
            elif token in NEGATION_CUES:
                negation_state = True
            elif negation_state and tagged[idx][1] in NEGATABLE_POS_TAGS:
                tokens[idx] = token + '_NEG'


if HANDLE_NEGATION:
    apply_negation_marks(pos_sentences, pos_sentences_postag)
    apply_negation_marks(neg_sentences, neg_sentences_postag)

In [2]:
RESHUFFLE = True

# Start from the identity ordering so the re-indexing below is well defined
# even when RESHUFFLE is switched off (it used to raise NameError then).
pos_shuff_idx = list(range(len(pos_sentences)))
neg_shuff_idx = list(range(len(neg_sentences)))
if RESHUFFLE:
    shuffle(pos_shuff_idx)
    shuffle(neg_shuff_idx)

# Apply the same permutation to the tokens and to their POS tags so the two
# stay aligned.
pos_sentences = [pos_sentences[s] for s in pos_shuff_idx]
pos_sentences_postag = [pos_sentences_postag[s] for s in pos_shuff_idx]
neg_sentences = [neg_sentences[s] for s in neg_shuff_idx]
neg_sentences_postag = [neg_sentences_postag[s] for s in neg_shuff_idx]

# Split boundaries: first 6 tenths -> train, next 2 tenths -> validation,
# everything after that -> test.
pos_train_end, pos_val_end = npos // 10 * 6, npos // 10 * 8
neg_train_end, neg_val_end = nneg // 10 * 6, nneg // 10 * 8

# Every split is laid out as
# [positive tokens, positive POS tags, negative tokens, negative POS tags].
traindata = [pos_sentences[:pos_train_end],
             pos_sentences_postag[:pos_train_end],
             neg_sentences[:neg_train_end],
             neg_sentences_postag[:neg_train_end]
            ]
valdata = [pos_sentences[pos_train_end:pos_val_end],
           pos_sentences_postag[pos_train_end:pos_val_end],
           neg_sentences[neg_train_end:neg_val_end],
           neg_sentences_postag[neg_train_end:neg_val_end]
          ]
testdata = [pos_sentences[pos_val_end:],
            pos_sentences_postag[pos_val_end:],
            neg_sentences[neg_val_end:],
            neg_sentences_postag[neg_val_end:]
           ]

# (index of the token list inside a split, class label) for each class.
sent_lab_set = [(0,1),(2,0)]

In [3]:
TESTING = True
if TESTING:
    # For the final test run, train on train + validation. The lists are
    # extended in place, so running this cell again appends the validation
    # part a second time.
    for train_part, val_part in zip(traindata, valdata):
        train_part.extend(val_part)

Part 2: Pointwise Mutual Information (PMI) scores computed from the training data (stemming is currently commented out, so the raw tokens are used rather than root forms)

First for unigrams...

In [4]:
# from nltk.stem.porter import PorterStemmer
from itertools import chain
from collections import Counter

# stemmer = PorterStemmer()
# Flatten the training sentences of each class into one long token list.
trainpos = list(chain.from_iterable(traindata[sent_lab_set[0][0]]))
trainneg = list(chain.from_iterable(traindata[sent_lab_set[1][0]]))
# Per-class token frequencies and the combined vocabulary.
cntpos, cntneg = Counter(trainpos), Counter(trainneg)
total_words = set(trainpos) | set(trainneg)

In [5]:
# Score each sufficiently frequent unigram by PMI(w, pos) - PMI(w, neg).
tot_num_pos = len(trainpos)
tot_num_neg = len(trainneg)
tot_num_all = tot_num_pos + tot_num_neg

uni_pmi_score = {}
for w in total_words:
    n_pos, n_neg = cntpos[w], cntneg[w]
    # Skip words with too few occurrences in either class.
    if not (n_pos > 10 and n_neg > 10):
        continue
    n_both = n_pos + n_neg
    pmi_pos = np.log2(n_pos * tot_num_all / n_both / tot_num_pos)
    pmi_neg = np.log2(n_neg * tot_num_all / n_both / tot_num_neg)
    uni_pmi_score[w] = pmi_pos - pmi_neg

# The scored words, kept as a set for fast membership tests.
pmi_uni_list = set(uni_pmi_score)

In [6]:
# Log of the add-one smoothed share of a word's occurrences that fall in the
# positive class: log((1 + pos) / (2 + pos + neg)). Every vocabulary word
# gets a value.
uni_logprob = {
    w: np.log((1 + cntpos[w]) / (2 + cntpos[w] + cntneg[w]))
    for w in total_words
}
prob_uni_list = set(total_words)

Same technique for bigrams (but without stemming)...

In [7]:
# Adjacent token pairs of every training sentence, per class.
trainpos_bi = [pair
               for sent in traindata[sent_lab_set[0][0]]
               for pair in zip(sent[:-1], sent[1:])]
trainneg_bi = [pair
               for sent in traindata[sent_lab_set[1][0]]
               for pair in zip(sent[:-1], sent[1:])]

cntpos_bi, cntneg_bi = Counter(trainpos_bi), Counter(trainneg_bi)
total_words_bi = set(trainpos_bi) | set(trainneg_bi)

# Score each sufficiently frequent bigram by PMI(b, pos) - PMI(b, neg).
tot_num_pos_bi = len(trainpos_bi)
tot_num_neg_bi = len(trainneg_bi)
tot_num_all_bi = tot_num_pos_bi + tot_num_neg_bi

pmi_bi_score = {}
for w in total_words_bi:
    n_pos_bi, n_neg_bi = cntpos_bi[w], cntneg_bi[w]
    # Skip bigrams with too few occurrences in either class.
    if not (n_pos_bi > 5 and n_neg_bi > 5):
        continue
    n_both_bi = n_pos_bi + n_neg_bi
    pmi_pos_bi = np.log2(n_pos_bi * tot_num_all_bi / n_both_bi / tot_num_pos_bi)
    pmi_neg_bi = np.log2(n_neg_bi * tot_num_all_bi / n_both_bi / tot_num_neg_bi)
    pmi_bi_score[w] = pmi_pos_bi - pmi_neg_bi

# The scored bigrams, kept as a set for fast membership tests.
pmi_bi_list = set(pmi_bi_score)



Part 3: Loading NRC Emotion Lexicon

In [8]:
import pandas as pd

# Unigram
# quoting=3 (csv.QUOTE_NONE) treats quote characters as ordinary text, the
# same way the bigram lexicon is read; without it a token containing '"'
# can be parsed as the start of a quoted field.
NRC_lex_uni = pd.read_table(
    '../Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/'
    'AutomaticallyGeneratedLexicons/NRC-Emoticon-AffLexNegLex-v1.0/'
    'Emoticon-AFFLEX-NEGLEX-unigrams.txt',
    names=['w', 'score', 'npos', 'nneg'],
    quoting=3,
).dropna()

# word -> score; if a word occurs more than once, the last row wins.
NRC_uni_dict = dict(zip(NRC_lex_uni['w'], NRC_lex_uni['score']))
# Snapshot of the keys as loaded, so the dict can be edited while looping.
NRC_uni_keys = set(NRC_uni_dict.keys())

# Fold every '<word>_NEGFIRST' entry into '<word>_NEG', the suffix used by
# the negation handling in this notebook. When the file already had a
# '<word>_NEG' entry the two scores are averaged; otherwise the _NEGFIRST
# score is used as is.
NEGFIRST_SUFFIX = '_NEGFIRST'
for w in NRC_uni_keys:
    if w.endswith(NEGFIRST_SUFFIX):
        neg_key = w[:-len(NEGFIRST_SUFFIX)] + '_NEG'
        if neg_key not in NRC_uni_keys:
            NRC_uni_dict[neg_key] = NRC_uni_dict[w]
        else:
            NRC_uni_dict[neg_key] = (NRC_uni_dict[w] + NRC_uni_dict[neg_key]) / 2
        del NRC_uni_dict[w]


In [9]:
# Bigram
# Read with quoting disabled (quoting=3) and drop incomplete rows.
NRC_lex_bi = pd.read_table(
    '../Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/'
    'AutomaticallyGeneratedLexicons/NRC-Emoticon-AffLexNegLex-v1.0/'
    'Emoticon-AFFLEX-NEGLEX-bigrams.txt',
    names=['w', 'score', 'npos', 'nneg'],
    quoting=3,
).dropna()

# {'<w1> <w2>': {'score': value}} as produced by pandas.
NRC_bi_raw = NRC_lex_bi[['w', 'score']].set_index('w').T.to_dict()
temp_keys = {tuple(entry.split()) for entry in NRC_bi_raw.keys()}
# Re-key by the whitespace-split tuple so lookups can use (w1, w2) pairs.
NRC_bi_dict = {tuple(entry.split()): fields['score']
               for entry, fields in NRC_bi_raw.items()}


In [10]:
# Key sets of the two lexicon dicts, taken from their current contents and
# used for membership tests during feature extraction.
NRC_uni_keys = set(NRC_uni_dict)
NRC_bi_keys = set(NRC_bi_dict)

Features for sentence classification:
1. Number of positive and negative words.
2. Sum, max, and min of sentiment scores (if available) of unigram and bigram tokens.
3. Sum of sentiment scores (if available) from, in particular, adjectives, adverbs, verbs, and nouns, respectively.
4. Sum of sentiment scores (if available) from NRC Lexicon

In [11]:
def get_features(data):
    """Build the 21-dimensional feature vector and label of every sentence.

    Parameters
    ----------
    data : list
        One split laid out like ``traindata``: for each ``(idx, label)`` in
        the global ``sent_lab_set``, ``data[idx]`` holds the token lists of
        that class and ``data[idx + 1]`` the aligned ``(token, tag)`` lists.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature rows and their class labels, in the order the classes
        appear in ``sent_lab_set``. The columns are, in order:
        pos_num, neg_num, pos_pmi_num, neg_pmi_num, uni_prob_sum, uni_sum,
        uni_max, uni_min, bi_sum, bi_max, bi_min, adj_sum, adv_sum, vb_sum,
        nn_sum, adj_pmi_sum, adv_pmi_sum, vb_pmi_sum, nn_pmi_sum,
        nrc_uni_sum, nrc_bi_sum.

    Notes
    -----
    Reads the module-level lookup tables ``prob_uni_list``/``uni_logprob``,
    ``pmi_uni_list``/``uni_pmi_score``, ``pmi_bi_list``/``pmi_bi_score`` and
    the NRC dictionaries with their key sets. The max/min features start at
    0, so they are never below/above 0 respectively.
    """
    # POS tag groups, built once instead of once per token. 'RBS'
    # (superlative adverb) replaces the former 'RBP', which is not a Penn
    # Treebank tag and so never matched.
    adj_tags = frozenset(['JJ', 'JJR', 'JJS'])
    noun_tags = frozenset(['NN', 'NNP', 'NNS'])
    adv_tags = frozenset(['RB', 'RBR', 'RBS', 'RP'])
    verb_tags = frozenset(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])

    feat = []
    label = []
    for comb in sent_lab_set:
        sentences = data[comb[0]]
        tagged_sentences = data[comb[0] + 1]
        for s in range(len(sentences)):
            tokens = sentences[s]
            pos_num, neg_num, pos_pmi_num, neg_pmi_num = 0, 0, 0, 0
            uni_prob_sum, uni_sum, uni_max, uni_min = 0, 0, 0, 0
            bi_sum, bi_max, bi_min = 0, 0, 0
            adj_sum, adv_sum, vb_sum, nn_sum = 0, 0, 0, 0
            adj_pmi_sum, adv_pmi_sum, vb_pmi_sum, nn_pmi_sum = 0, 0, 0, 0
            nrc_uni_sum, nrc_bi_sum = 0, 0

            for w1, pos1 in zip(tokens, tagged_sentences[s]):
                tag = pos1[1]
                # Use of unigram probs
                if w1 in prob_uni_list:
                    logprob = uni_logprob[w1]
                    uni_prob_sum += logprob
                    # Words are counted as positive/negative relative to
                    # the -0.2 log-probability threshold.
                    if logprob > -0.2:
                        pos_num += 1
                    elif logprob < -0.2:
                        neg_num += 1
                    # Use of POS tags
                    if tag in adj_tags:
                        adj_sum += logprob
                    if tag in noun_tags:
                        nn_sum += logprob
                    if tag in adv_tags:
                        adv_sum += logprob
                    if tag in verb_tags:
                        vb_sum += logprob
                # Use of unigram PMI
                if w1 in pmi_uni_list:
                    pmi = uni_pmi_score[w1]
                    uni_sum += pmi
                    uni_max = max(uni_max, pmi)
                    uni_min = min(uni_min, pmi)
                    if pmi > 0:
                        pos_pmi_num += 1
                    elif pmi < 0:
                        neg_pmi_num += 1
                    # Use of POS tags
                    if tag in adj_tags:
                        adj_pmi_sum += pmi
                    if tag in noun_tags:
                        nn_pmi_sum += pmi
                    if tag in adv_tags:
                        adv_pmi_sum += pmi
                    if tag in verb_tags:
                        vb_pmi_sum += pmi
                # Use of NRC lexicon
                if w1 in NRC_uni_keys:
                    nrc_uni_sum += NRC_uni_dict[w1]

            for bigram in zip(tokens[:-1], tokens[1:]):
                # Use of bigrams PMI
                if bigram in pmi_bi_list:
                    bi_pmi = pmi_bi_score[bigram]
                    bi_sum += bi_pmi
                    bi_max = max(bi_max, bi_pmi)
                    bi_min = min(bi_min, bi_pmi)
                # Use of NRC lexicon
                if bigram in NRC_bi_keys:
                    nrc_bi_sum += NRC_bi_dict[bigram]

            feat.append([
                         pos_num,
                         neg_num,
                         pos_pmi_num,
                         neg_pmi_num,
                         uni_prob_sum,
                         uni_sum,
                         uni_max,
                         uni_min,
                         bi_sum,
                         bi_max,
                         bi_min,
                         adj_sum,
                         adv_sum,
                         vb_sum,
                         nn_sum,
                         adj_pmi_sum,
                         adv_pmi_sum,
                         vb_pmi_sum,
                         nn_pmi_sum,
                         nrc_uni_sum,
                         nrc_bi_sum
                        ])
            label.append(comb[1])
    return np.array(feat), np.array(label)

Run several ML algorithms...

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

X, y = get_features(traindata)
# Feature scaling is currently disabled:
# ss = StandardScaler().fit(X)
# X = ss.transform(X)

# Evaluate on the test split when TESTING is on, otherwise on the validation
# split. Features are extracted only for the split that is actually used
# (the validation features used to be computed and then thrown away).
eval_data = testdata if TESTING else valdata
X_test, y_true = get_features(eval_data)

# shuffle again: get_features returns all positive rows before the negative
# ones, so apply one random permutation to rows and labels together.
p = np.random.permutation(X_test.shape[0])
X_test = X_test[p]
y_true = y_true[p]

In [13]:
print("SVM Classifier")
print("------------------------------")
clf = SVC(C=0.25)
clf.fit(X, y)
y_pred = clf.predict(X_test)

# (caption, metric function, extra keyword arguments); None prints a blank
# separator line. Metrics are evaluated one by one, right before printing.
report_rows = [
    ("The accuracy is:", accuracy_score, {}),
    None,
    ## Stat for pos class
    ("The precision for positive is:", precision_score, {}),
    ("The recall for positive is:", recall_score, {}),
    ("The f1 score for positive is:", f1_score, {}),
    None,
    ## Stat for neg class
    ("The precision for negative is:", precision_score, {"pos_label": 0}),
    ("The recall for negative is:", recall_score, {"pos_label": 0}),
    ("The f1 score for negative is:", f1_score, {"pos_label": 0}),
]
for row in report_rows:
    if row is None:
        print("")
        continue
    caption, metric, extra = row
    print(caption, metric(y_true, y_pred, **extra))

SVM Classifier
------------------------------
The accuracy is: 0.7138665843113032

The precision for positive is: 0.7167742904339991
The recall for positive is: 0.7239496435613529
The f1 score for positive is: 0.720344099003924

The precision for negative is: 0.710789766407119
The recall for negative is: 0.7034124862399749
The f1 score for negative is: 0.7070818842870693


In [14]:
print("Naive Bayes Classifier")
print("------------------------------")
clf = GaussianNB()
clf.fit(X, y)
y_pred = clf.predict(X_test)

# (caption, metric function, extra keyword arguments); None prints a blank
# separator line. Metrics are evaluated one by one, right before printing.
report_rows = [
    ("The accuracy is:", accuracy_score, {}),
    None,
    ## Stat for pos class
    ("The precision for positive is:", precision_score, {}),
    ("The recall for positive is:", recall_score, {}),
    ("The f1 score for positive is:", f1_score, {}),
    None,
    ## Stat for neg class
    ("The precision for negative is:", precision_score, {"pos_label": 0}),
    ("The recall for negative is:", recall_score, {"pos_label": 0}),
    ("The f1 score for negative is:", f1_score, {"pos_label": 0}),
]
for row in report_rows:
    if row is None:
        print("")
        continue
    caption, metric, extra = row
    print(caption, metric(y_true, y_pred, **extra))

Naive Bayes Classifier
------------------------------
The accuracy is: 0.6756485484867202

The precision for positive is: 0.7090178259349877
The recall for positive is: 0.6153496132261489
The f1 score for positive is: 0.6588712951684936

The precision for negative is: 0.6492392807745505
The recall for negative is: 0.7381663783613776
The f1 score for negative is: 0.690852895724483


In [15]:
print("Decision Tree Classifier")
print("------------------------------")
clf = DecisionTreeClassifier(max_depth=7)
clf.fit(X, y)
y_pred = clf.predict(X_test)

# (caption, metric function, extra keyword arguments); None prints a blank
# separator line. Metrics are evaluated one by one, right before printing.
report_rows = [
    ("The accuracy is:", accuracy_score, {}),
    None,
    ## Stat for pos class
    ("The precision for positive is:", precision_score, {}),
    ("The recall for positive is:", recall_score, {}),
    ("The f1 score for positive is:", f1_score, {}),
    None,
    ## Stat for neg class
    ("The precision for negative is:", precision_score, {"pos_label": 0}),
    ("The recall for negative is:", recall_score, {"pos_label": 0}),
    ("The f1 score for negative is:", f1_score, {"pos_label": 0}),
]
for row in report_rows:
    if row is None:
        print("")
        continue
    caption, metric, extra = row
    print(caption, metric(y_true, y_pred, **extra))

Decision Tree Classifier
------------------------------
The accuracy is: 0.6670012353304509

The precision for positive is: 0.6762523191094619
The recall for positive is: 0.6634309115728804
The f1 score for positive is: 0.6697802618482506

The precision for negative is: 0.6577729796421962
The recall for negative is: 0.6707029407139488
The f1 score for negative is: 0.6641750369851281


In [16]:
print("NN Classifier")
print("------------------------------")
clf = MLPClassifier(learning_rate='adaptive', learning_rate_init=0.01, max_iter=300)
clf.fit(X, y)
y_pred = clf.predict(X_test)

# (caption, metric function, extra keyword arguments); None prints a blank
# separator line. Metrics are evaluated one by one, right before printing.
report_rows = [
    ("The accuracy is:", accuracy_score, {}),
    None,
    ## Stat for pos class
    ("The precision for positive is:", precision_score, {}),
    ("The recall for positive is:", recall_score, {}),
    ("The f1 score for positive is:", f1_score, {}),
    None,
    ## Stat for neg class
    ("The precision for negative is:", precision_score, {"pos_label": 0}),
    ("The recall for negative is:", recall_score, {"pos_label": 0}),
    ("The f1 score for negative is:", f1_score, {"pos_label": 0}),
]
for row in report_rows:
    if row is None:
        print("")
        continue
    caption, metric, extra = row
    print(caption, metric(y_true, y_pred, **extra))

NN Classifier
------------------------------
The accuracy is: 0.7092340951204448

The precision for positive is: 0.7542723511422917
The recall for positive is: 0.6359775519490368
The f1 score for positive is: 0.6900921658986174

The precision for negative is: 0.6753685919112674
The recall for negative is: 0.7851863500550401
The f1 score for negative is: 0.7261489237929029
