In [1]:
import os
import pandas as pd
import re
import numpy as np
from collections import defaultdict
from scipy.stats import binom
from functools import reduce
import math
import re

## Load data

In [2]:
def get_bigram_list(tokens, min_freq=7):
    """ Find bigrams from given text and return in a pandas dataframe. """
    bigrams = []
    for i in range(1, len(tokens)):
        bigram = f'{tokens[i-1]} {tokens[i]}'
        bigrams += [bigram]
    return bigrams

In [3]:
def get_uni_and_bi_grams(data_folder):
    """ Find unigrams and bigrams in given text and return in a pandas dataframe. """
    sentiments = ['POS', 'NEG']    
    # use lists to avoid calling append in a loop
    # unigrams
    unigrams = []
    unigram_sentiments = []
    unigram_file_ids = []
    unigram_file_nos = []
    # bigrams
    bigrams = []
    bigram_sentiments = []
    bigram_file_ids = []
    bigram_file_nos = []    
    for sent in sentiments:
        review_folder = f'{data_folder}/{sent}'
        for file in os.listdir(review_folder):
            # find unigrams
            new_unigrams = pd.read_csv(os.path.join(review_folder, file), sep='\t', header=None, names=['ngram', 'pos']).values[:,0]
            unigrams += list(new_unigrams)
            unigram_sentiments += [(1 if sent == 'POS' else -1)]*len(new_unigrams)
            unigram_file_ids += [f'{sent}-{file[2:5]}']*len(new_unigrams)
            unigram_file_nos += [int(file[2:5])]*len(new_unigrams)
            # find bigrams
            new_bigrams = get_bigram_list(new_unigrams)
            bigrams += new_bigrams            
            bigram_sentiments += [(1 if sent == 'POS' else -1)]*len(new_bigrams)
            bigram_file_ids += [f'{sent}-{file[2:5]}']*len(new_bigrams)
            bigram_file_nos += [int(file[2:5])]*len(new_bigrams)
    unigram_df = pd.DataFrame(list(zip(unigrams, unigram_sentiments, unigram_file_ids, unigram_file_nos)), columns=['ngram', 'sentiment', 'file_id', 'file_no'])
    bigram_df = pd.DataFrame(list(zip(bigrams, bigram_sentiments, bigram_file_ids, bigram_file_nos)), columns=['ngram', 'sentiment', 'file_id', 'file_no'])
    return unigram_df, bigram_df

## Helpers for defining the necessary probabilities

In [4]:
def unigram_tokenize(content):
    """ Split into unigrams by punctuation and whitespace, then lowercase and remove trailing whitespace"""
    return np.asarray(list(filter(None,((map(lambda x: x, map(str.strip, re.split('(\W)', content))))))))

def bigram_tokenize(content):
    """ Split text into bigrams """
    tokens = unigram_tokenize(content)
    for i in range(1, len(tokens)):
        yield f'{tokens[i-1]} {tokens[i]}'

In [5]:
def get_ngram_probabilities(ngrams, sent, min_count, smooth):
    counts = ngrams[ngrams['sentiment'] == sent]['ngram'].value_counts()
    filtered = counts[counts >= min_count]
    voc_size = len(counts)
    if smooth:
        return (filtered+1)/(sum(filtered)+ngrams['ngram'].nunique())
    return filtered/sum(filtered)

In [6]:
def get_class_probabilites(text, classes, tokenize, data, smooth, min_freq):
    class_probs = np.zeros(len(classes))
    for i, cl in enumerate(classes):
        p = 0
        conditioned_counts = get_ngram_probabilities(data, cl, min_freq, smooth)
        smooth_denom = (data['sentiment']==cl).sum()+data['ngram'].nunique()
        for word in tokenize(text):
            if word in conditioned_counts.index:
                p += np.log(conditioned_counts.loc[word])
            # apply smoothing separately if word not present in class
            elif smooth:
                p += np.log(1/(smooth_denom))
        # the prior is the fraction of documents in a specific class
        sentiment_files = data[['file_id', 'sentiment']].groupby('file_id').mean()
        prior = (sentiment_files['sentiment'] == cl).sum()/len(sentiment_files)
        p += np.log(prior)
        class_probs[i] = p
    return class_probs

def naive_binary_bayes(text, unigrams=None, bigrams=None, smooth=True):
    """ Predict the class of a string given unigrams and bigrams. """
    if unigrams is None and bigrams is None:
        raise ValueError('Please choose to use either unigrams or bigrams by providing the data')
    # set the binary classification labels
    classes = [-1, 1]
    class_probs = np.zeros(len(classes))    
    if unigrams is not None:
        class_probs += get_class_probabilites(text, classes, unigram_tokenize, unigrams, smooth, min_freq=4)
    if bigrams is not None:
        class_probs += get_class_probabilites(text, classes, bigram_tokenize, bigrams, smooth, min_freq=7)
    return classes[np.argmax(class_probs)]

In [13]:
class NaiveB:
        
    def __init__(self, classes, unigrams=None, bigrams=None):
        self.unigrams = unigrams
        self.bigrams = bigrams
        self.classes = classes
        

    def get_ngram_probabilities(ngrams, sent, min_count, smooth):
        counts = ngrams[ngrams['sentiment'] == sent]['ngram'].value_counts()
        filtered = counts[counts >= min_count]
        voc_size = len(counts)
        if smooth:
            return (filtered+1)/(sum(filtered)+ngrams['ngram'].nunique())
        return filtered/sum(filtered)    
    
    def get_class_probabilites(self, text, tokenize, data, smooth, min_freq):
        class_probs = np.zeros(len(self.classes))
        for i, cl in enumerate(self.classes):
            p = 0
            conditioned_counts = get_ngram_probabilities(data, cl, min_freq, smooth)
            smooth_denom = (data['sentiment']==cl).sum()+data['ngram'].nunique()
            for word in tokenize(text):
                if word in conditioned_counts.index:
                    p += np.log(conditioned_counts.loc[word])
                # apply smoothing separately if word not present in class
                elif smooth:
                    p += np.log(1/(smooth_denom))
            # the prior is the fraction of documents in a specific class
            sentiment_files = data[['file_id', 'sentiment']].groupby('file_id').mean()
            prior = (sentiment_files['sentiment'] == cl).sum()/len(sentiment_files)
            p += np.log(prior)
            class_probs[i] = p
        return class_probs    
        
    def predict(self, text, training_data_files, smooth=True):
        # set the binary classification labels
        class_probs = np.zeros(len(self.classes))
        if self.unigrams is not None:
            training_unigrams = self.unigrams[self.unigrams['file_no'].isin(training_data_files)]
            class_probs += self.get_class_probabilites(text, unigram_tokenize, training_unigrams, smooth, min_freq=4)
        if self.bigrams is not None:
            class_probs += self.get_class_probabilites(text, bigram_tokenize, self.bigrams[self.bigrams['file_no'].isin(training_data_files)], smooth, min_freq=7)
        return self.classes[np.argmax(class_probs)]    

In [15]:
unigrams, bigrams, = get_uni_and_bi_grams('data-tagged')
uni_naiveB = NaiveB([-1, 1], unigrams=unigrams)
bi_naiveB = NaiveB([-1, 1], bigrams=bigrams)

In [9]:
uni_naiveB.predict('a great movie', list(range(899)))

1

In [16]:
bi_naiveB.predict('this is a very bad movie', list(range(899)))

-1

In [20]:
def estimate_accuracy2(test_data, unigrams=None, bigrams=None):
    """ Estimate the accuracy over test dataset using given unigrams and bigrams """
    acc = 0
    for file_id, group in test_data.groupby('file_id'):
        label = naive_binary_bayes(' '.join(group['ngram'].values), smooth=smooth, unigrams=unigrams, bigrams=bigrams)
        # make sure each file is only associated with one sentiment
        # otherwise there's a bug in reading of the data
        assert(group['sentiment'].nunique() == 1)
        acc += (label == group['sentiment'].unique()[0])
    return acc/test_data['file_id'].nunique()

def estimate_accuracy(test_data, training_data_files, naive_B, smooth):
    """ Estimate the accuracy over test dataset using given unigrams and bigrams """
    acc = 0
    i = 0
    for file_id, group in test_data.groupby('file_id'):
        i += 1
        label = naive_B.predict(' '.join(group['ngram'].values), training_data_files, smooth=smooth)
        # make sure each file is only associated with one sentiment
        # otherwise there's a bug in reading of the data
        assert(group['sentiment'].nunique() == 1)
        print(acc/i)
        acc += (label == group['sentiment'].unique()[0])
    return acc/test_data['file_id'].nunique()

In [18]:
uni_naiveB = NaiveB([-1, 1], unigrams=unigrams)
bi_naiveB = NaiveB([-1, 1], bigrams=bigrams)
uni_bi_naiveB = NaiveB([-1, 1], unigrams=unigrams, bigrams=bigrams)

In [21]:
%%time
single_split_train_files = list(range(899))
test_data = unigrams[~unigrams['file_id'].isin(single_split_train_files)]
estimate_accuracy(test_data, single_split_train_files, uni_naiveB, smooth=True)

0.0
0.5
0.6666666666666666
0.75
0.8
0.8333333333333334
0.8571428571428571
0.875
0.8888888888888888
0.9
0.9090909090909091
0.9166666666666666
0.9230769230769231
0.9285714285714286
0.9333333333333333
0.9375
0.9411764705882353
0.9444444444444444
0.9473684210526315
0.95
0.9523809523809523
0.9545454545454546
0.9565217391304348
0.9583333333333334
0.96
0.9615384615384616
0.9259259259259259
0.9285714285714286
0.9310344827586207
0.9333333333333333
0.9354838709677419
0.9375
0.9393939393939394
0.9411764705882353
0.9428571428571428
0.9444444444444444
0.9459459459459459
0.9473684210526315
0.9487179487179487
0.95
0.9512195121951219
0.9523809523809523
0.9534883720930233
0.9545454545454546
0.9555555555555556
0.9565217391304348
0.9574468085106383
0.9583333333333334
0.9591836734693877
0.96
0.9607843137254902
0.9615384615384616
0.9622641509433962
0.9629629629629629
0.9636363636363636
0.9642857142857143
0.9649122807017544
0.9655172413793104
0.9661016949152542
0.9666666666666667
0.9672131147540983
0.951612

KeyboardInterrupt: 

In [None]:
estimate_accuracy(test_data, single_split_train_files, bi_naiveB, smooth=True)

In [None]:
estimate_accuracy(test_data, single_split_train_files, uni_bi_naiveB, smooth=True)

In [None]:
estimate_accuracy(test_data, single_split_train_files, uni_naiveB, smooth=False)

## Significance testing

In [None]:
def calculate_p_value(N, k, q):
    res = 0
    for i in range(k+1):
        res += binom.pmf(i, N, q)
    return 2*res

def sign_test(test_data, system_A, system_B, n=10):
    plus, minus, null = 0, 0, 0
    for i in range(len(test_data)):
        inp = test_data['review'].iloc[i]
        a = system_A(inp)
        b = system_B(inp)
        true_label = test_data['sentiment'].iloc[i]
        if true_label == a:
            plus += 1
        elif true_label == b:
            minus += 1
        else:
            null += 1
    N = 2*math.ceil(null/2)+plus+minus
    k = math.ceil(null/2)+min(plus, minus)
    return calculate_p_value(N, k, 0.5)

In [None]:
def smoothed_unigram_bayes(data):
    return uni_naiveB.predict(data, list(range(899)), smooth=True)

def unsmoothed_unigram_bayes(data):
    return uni_naiveB.predict(data, list(range(899)), smooth=False)

In [None]:
def build_test_data(test_data):
    # group by file_id to achieve full reviews
    labelled_test_data = test_data.groupby('file_id')['ngram'].apply(lambda gs: list(gs)).reset_index()
    labelled_test_data['review'] = labelled_test_data['ngram'].apply(lambda x: ' '.join(x[0]))
    labelled_test_data = labelled_test_data.drop('ngram', axis=1)
    labelled_test_data['sentiment'] = labelled_test_data['file_id'].apply(lambda x: 1 if x[:3] == 'POS' else -1)
    return labelled_test_data

In [None]:
labelled_test_data = build_test_data(test_data)

In [None]:
%%time
p_value = sign_test(labelled_test_data, smoothed_unigram_bayes, unsmoothed_unigram_bayes)
p_value
# 6.269514409998372e-26

## Cross validation

In [None]:
def sample_variance(data):
    mean = np.mean(data)
    return np.sum(np.square(data-mean))

def cross_validate(naive_B, data, folds):
    file_amount = data['file_no'].nunique()
    indx = np.arange(0, file_amount, folds)
    scores = np.zeros(folds)
    for f in range(folds):
        test_data_mask = data['file_no'].isin(indx+f)
        training_file_ids = data[~test_data_mask]['file_no'].unique()
        test_data = data[test_data_mask]
        acc = estimate_accuracy(test_data, training_file_ids, naive_B, smooth=True)
        print(acc)
        scores[f] = acc
    return np.mean(scores), sample(scores)

In [None]:
%%time
acc_mean, acc_var = cross_validate(uni_naiveB, unigrams, 10)
# accuracies
# results
# mean: 81.10
# variance: 0.0070224999999999845

## Doc2Vec testing

IMDB dataset:
When using this dataset, please cite our ACL 2011 paper:

@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}

In [None]:
import gensim

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [None]:
imdb_data_folder = 'aclImdb'
imdb_sentiments = ['pos', 'neg']
subfolders = ['train', 'test']
review_list = []
review_id_list = []
review_grade_list = []
for sent in imdb_sentiments:
    for subf in subfolders:
        for review_file in os.listdir(os.path.join(imdb_data_folder, subf, sent)):
            idd = review_file.split('_')[0]
            review_id_list += [idd]
            grade = re.search('_(.*)\.txt', review_file).group(1)
            review_grade_list += [grade]
            f = open(os.path.join(imdb_data_folder, subf, sent, review_file), 'r+')
            review = f.read()
            review_list += [review]
reviews = pd.DataFrame(list(zip(review_list, review_id_list, review_grade_list)), columns=['review', 'id', 'grade'])

In [None]:
def doc_tokenize(doc):
    return [x.lower() for x in re.sub(r'[^a-zA-Z\s]', '', a).split()]

In [None]:
documents = [TaggedDocument(doc_tokenize(doc), [i]) for i, doc in enumerate(reviews['review'].values)]

In [None]:
vec_size = 100
window_size = 2
min_count = 4
model = Doc2Vec(documents, vector_size=vec_size, window=2, min_count=min_count, workers=4)

In [None]:
from gensim.test.utils import get_tmpfile
fname = get_tmpfile(f'doc2vec_{vec_size}_{window_size}_{min_count}')
model.save(fname)
model = Doc2Vec.load(fname)