In [6]:
import csv
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd

## Load data

In [8]:
def get_bigram_list(tokens, min_freq=7):
    """Return the bigrams formed by every pair of adjacent tokens.

    Args:
        tokens: Sequence supporting ``len()`` and integer indexing (for
            example a list or a 1-D numpy array) holding the tokens in order.
        min_freq: Unused. Kept only so existing callers that pass it keep
            working; no frequency filtering is performed here.

    Returns:
        A plain Python list of strings of the form ``'<left> <right>'``,
        one per adjacent pair, in input order. The list is empty when
        ``tokens`` has fewer than two elements.
    """
    return [f'{tokens[i - 1]} {tokens[i]}' for i in range(1, len(tokens))]

In [301]:
def get_uni_and_bi_grams(data_folder):
    """Read every tagged review under ``data_folder`` into n-gram tables.

    ``data_folder`` must contain the sub-folders ``POS`` and ``NEG``. Each
    file inside them is parsed as a headerless, tab-separated table whose
    first column is the token; only that column is used.

    Args:
        data_folder: Path of the folder holding the ``POS``/``NEG`` folders.

    Returns:
        A tuple ``(unigram_df, bigram_df)``. Both DataFrames have the columns
        ``ngram`` (the token, or two adjacent tokens joined by a space),
        ``sentiment`` (1 for ``POS``, -1 for ``NEG``) and ``file_id``
        (``'<POS|NEG>-'`` followed by characters 2-4 of the file name).
    """
    sentiment_labels = {'POS': 1, 'NEG': -1}
    # Accumulate plain lists and build each DataFrame once at the end;
    # growing a DataFrame inside the loop would be quadratic.
    unigram_columns = {'ngram': [], 'sentiment': [], 'file_id': []}
    bigram_columns = {'ngram': [], 'sentiment': [], 'file_id': []}

    for sent, label in sentiment_labels.items():
        review_folder = os.path.join(data_folder, sent)
        # os.listdir returns entries in arbitrary order; sort so the row
        # order of the result is reproducible across runs and machines.
        for file in sorted(os.listdir(review_folder)):
            # presumably file names look like 'cvNNN_...' so that [2:5] is
            # the three-digit review number -- TODO confirm against the data
            file_id = f'{sent}-{file[2:5]}'
            tokens = pd.read_csv(
                os.path.join(review_folder, file),
                sep='\t',
                header=None,
                names=['ngram', 'pos'],
                dtype=str,
                # Tokens are literal text: without this, words such as
                # "null", "nan" or "n/a" are silently turned into float NaN.
                keep_default_na=False,
                # A '"' token must not open a CSV-quoted field that swallows
                # the following lines.
                quoting=csv.QUOTE_NONE,
            )['ngram'].tolist()

            for columns, ngrams in (
                (unigram_columns, tokens),
                (bigram_columns, get_bigram_list(tokens)),
            ):
                columns['ngram'] += ngrams
                columns['sentiment'] += [label] * len(ngrams)
                columns['file_id'] += [file_id] * len(ngrams)

    unigram_df = pd.DataFrame(unigram_columns, columns=['ngram', 'sentiment', 'file_id'])
    bigram_df = pd.DataFrame(bigram_columns, columns=['ngram', 'sentiment', 'file_id'])
    return unigram_df, bigram_df

In [303]:
# Load the full corpus once: one row per unigram / bigram occurrence.
unigrams, bigrams = get_uni_and_bi_grams('data-tagged')

## Split data into training and test sets

In [159]:
def single_split(data, max_training_id):
    """Partition ``data`` into two frames by the number embedded in ``file_id``.

    Characters 4-6 of each ``file_id`` are parsed as an integer. Rows whose
    number is less than or equal to ``max_training_id`` go into the first
    returned frame; all remaining rows go into the second.

    Args:
        data: DataFrame with a string column ``file_id``.
        max_training_id: Largest document number kept in the first frame.

    Returns:
        A tuple ``(rows_up_to_max, rows_above_max)``.
    """
    document_numbers = data['file_id'].str[4:7].astype(int)
    in_training = document_numbers <= max_training_id
    return data[in_training], data[~in_training]

In [304]:
# Documents numbered up to and including 899 are used for training; the rest
# are held out. The held-out unigrams double as the test documents.
training_unigrams, test_data = single_split(unigrams, max_training_id=899)
# Only the training half of the bigram table is kept.
training_bigrams, _ = single_split(bigrams, max_training_id=899)

## Helpers for defining the necessary probabilities

In [363]:
def unigram_tokenize(content):
    """Split ``content`` into unigram tokens.

    The text is split on every non-word character; each such character is
    kept as a token of its own. Surrounding whitespace is stripped from
    every piece and pieces that end up empty (whitespace, gaps between
    adjacent separators) are discarded. Case is preserved: the n-gram
    tables built by this notebook are not lowercased either, so the
    tokens stay comparable with them.

    Args:
        content: The text to tokenise.

    Returns:
        A 1-D numpy array with the tokens in order of appearance.
    """
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    pieces = re.split(r'(\W)', content)
    return np.asarray([piece for piece in map(str.strip, pieces) if piece])

def bigram_tokenize(content):
    """Lazily yield the bigrams of ``content``.

    The text is first tokenised with ``unigram_tokenize``; every pair of
    neighbouring tokens is then emitted as ``'<left> <right>'``, in order.
    Nothing is yielded when there are fewer than two tokens.
    """
    words = unigram_tokenize(content)
    for left, right in zip(words, words[1:]):
        yield f'{left} {right}'

In [364]:
def preprocess_ngrams(ngrams, sent, min_count, smooth):
    """Estimate per-n-gram probabilities for one sentiment class.

    Args:
        ngrams: DataFrame with the columns ``ngram`` and ``sentiment``,
            one row per n-gram occurrence.
        sent: Class label to condition on (a value of the ``sentiment``
            column).
        min_count: N-grams occurring fewer than this many times within the
            class are dropped before the probabilities are computed.
        smooth: If true, apply add-one smoothing: 1 is added to every kept
            count and the number of distinct n-grams in the whole table
            (all classes, unfiltered) is added to the denominator.

    Returns:
        A Series indexed by n-gram holding the estimated probability of
        each kept n-gram. The denominator is the total count of the kept
        n-grams (plus the vocabulary size when smoothing).
    """
    class_counts = ngrams.loc[ngrams['sentiment'] == sent, 'ngram'].value_counts()
    kept = class_counts[class_counts >= min_count]
    kept_total = kept.sum()
    if smooth:
        return (kept + 1) / (kept_total + ngrams['ngram'].nunique())
    return kept / kept_total

In [339]:
def get_class_probabilites(text, classes, tokenize, data, smooth, min_freq):
    """Compute an unnormalised naive Bayes log-score of ``text`` per class.

    For every class the score is the log prior plus the sum of the log
    probabilities of the tokens of ``text``.

    Args:
        text: The document to score.
        classes: Sequence of class labels (values of ``data['sentiment']``).
        tokenize: Callable turning ``text`` into an iterable of n-grams.
        data: Training DataFrame with the columns ``ngram``, ``sentiment``
            and ``file_id``.
        smooth: Whether to use add-one smoothing. Without smoothing, tokens
            that are absent from a class's table are skipped (they add
            nothing to the score).
        min_freq: Minimum in-class count for an n-gram to be used; passed
            on to ``preprocess_ngrams``.

    Returns:
        A numpy array with one log-score per entry of ``classes``, in the
        same order.
    """
    # Everything below up to the loop is independent of the class, so it is
    # computed once. Tokens are materialised because ``tokenize`` may
    # return a one-shot generator.
    tokens = list(tokenize(text))
    vocabulary_size = data['ngram'].nunique()
    # One row per file; the mean equals the label as long as every file
    # carries a single sentiment.
    sentiment_files = data[['file_id', 'sentiment']].groupby('file_id').mean()
    n_files = len(sentiment_files)

    class_probs = np.zeros(len(classes))
    for i, cl in enumerate(classes):
        conditioned_counts = preprocess_ngrams(data, cl, min_freq, smooth)
        # NOTE(review): for tokens missing from the class table the
        # denominator uses ALL occurrences in the class, whereas
        # preprocess_ngrams uses only the occurrences that survived the
        # min_freq filter. The two differ whenever min_freq > 1 -- confirm
        # which one is intended.
        smooth_denom = (data['sentiment'] == cl).sum() + vocabulary_size
        unseen_log_prob = np.log(1 / smooth_denom) if smooth else 0.0

        p = 0
        for word in tokens:
            if word in conditioned_counts.index:
                p += np.log(conditioned_counts.loc[word])
            # apply smoothing separately if word not present in class
            elif smooth:
                p += unseen_log_prob
        # the prior is the fraction of documents in a specific class
        prior = (sentiment_files['sentiment'] == cl).sum() / n_files
        p += np.log(prior)
        class_probs[i] = p
    return class_probs

def naive_binary_bayes(text, unigrams=None, bigrams=None, smooth=True,
                       unigram_min_freq=1, bigram_min_freq=7):
    """Predict the sentiment class of ``text`` with naive Bayes.

    The unigram and bigram models are each optional; when both are given
    their per-class log-scores are added together.

    Args:
        text: The document to classify.
        unigrams: Training unigram DataFrame, or None to skip unigrams.
        bigrams: Training bigram DataFrame, or None to skip bigrams.
        smooth: Whether to use add-one smoothing.
        unigram_min_freq: Minimum in-class count for a unigram to be used.
        bigram_min_freq: Minimum in-class count for a bigram to be used.

    Returns:
        -1 or 1. If both classes score exactly the same, -1 is returned
        because ``np.argmax`` picks the first maximum.

    Raises:
        ValueError: If neither ``unigrams`` nor ``bigrams`` is provided.
    """
    if unigrams is None and bigrams is None:
        raise ValueError('Please choose to use either unigrams or bigrams by providing the data')
    # set the binary classification labels
    classes = [-1, 1]
    class_probs = np.zeros(len(classes))
    if unigrams is not None:
        class_probs += get_class_probabilites(
            text, classes, unigram_tokenize, unigrams, smooth, min_freq=unigram_min_freq)
    if bigrams is not None:
        class_probs += get_class_probabilites(
            text, classes, bigram_tokenize, bigrams, smooth, min_freq=bigram_min_freq)
    return classes[np.argmax(class_probs)]

In [305]:
# Smoke test: classify a short phrase using the bigram model only.
naive_binary_bayes('a great movie', bigrams=training_bigrams)

1

In [306]:
# Smoke test: the same kind of check using the unigram model only.
naive_binary_bayes('this is a very bad movie', unigrams=training_unigrams)

-1

In [347]:
# Same sentence as the unigram check, scored with the bigram model only.
naive_binary_bayes('this is a very bad movie', bigrams=training_bigrams)

-1

In [348]:
# Same sentence again, with the unigram and bigram log-scores added together.
naive_binary_bayes('this is a very bad movie', unigrams=training_unigrams, bigrams=training_bigrams)

-1

In [349]:
# Two-word input scored with the combined unigram + bigram model.
naive_binary_bayes('i love', unigrams=training_unigrams, bigrams=training_bigrams)

1

In [350]:
# Unigram-only check on a negative phrase.
# NOTE(review): the recorded output for this cell is 1 (positive), which is
# not what one would expect for 'i hate' -- worth investigating.
naive_binary_bayes('i hate', unigrams=training_unigrams)

1

In [358]:
def estimate_accuracy(test_data, smooth=True, unigrams=None, bigrams=None):
    """Estimate classification accuracy over the documents in ``test_data``.

    Rows of ``test_data`` are grouped by ``file_id``; the ``ngram`` values
    of each group are joined with single spaces to rebuild the document
    text, which is then classified with ``naive_binary_bayes``.

    Args:
        test_data: DataFrame with the columns ``ngram``, ``sentiment`` and
            ``file_id``.
        smooth: Whether the classifier uses add-one smoothing. Defaults to
            True (the default of ``naive_binary_bayes``) so that calls
            which omit it keep working.
        unigrams: Training unigram DataFrame passed to the classifier.
        bigrams: Training bigram DataFrame passed to the classifier.

    Returns:
        The fraction of documents whose predicted label equals their
        sentiment.

    Raises:
        AssertionError: If a document is associated with more than one
            sentiment, which indicates a bug in reading the data.
        ZeroDivisionError: If ``test_data`` contains no documents.
    """
    n_correct = 0
    for file_id, group in test_data.groupby('file_id'):
        # make sure each file is only associated with one sentiment
        # otherwise there's a bug in reading of the data; checked before
        # the (expensive) classification so bad input fails fast
        assert group['sentiment'].nunique() == 1
        expected = group['sentiment'].iloc[0]
        document = ' '.join(group['ngram'].values)
        label = naive_binary_bayes(document, unigrams=unigrams, bigrams=bigrams, smooth=smooth)
        n_correct += int(label == expected)
    return n_correct / test_data['file_id'].nunique()

In [344]:
# Held-out accuracy of the unigram model. `smooth` is passed explicitly
# because estimate_accuracy declares it without a default.
acc = estimate_accuracy(test_data, smooth=True, unigrams=training_unigrams)
acc

0.815

In [345]:
# Held-out accuracy of the bigram model. `smooth` is passed explicitly
# because estimate_accuracy declares it without a default.
acc = estimate_accuracy(test_data, smooth=True, bigrams=training_bigrams)
acc

0.84

In [346]:
# Held-out accuracy of the combined unigram + bigram model. `smooth` is
# passed explicitly because estimate_accuracy declares it without a default.
acc = estimate_accuracy(test_data, smooth=True, unigrams=training_unigrams, bigrams=training_bigrams)
acc

0.855

In [365]:
# Ablation: unigram model with smoothing switched off. With smooth=False,
# tokens missing from a class's table are skipped instead of penalised.
acc = estimate_accuracy(test_data, unigrams=training_unigrams, smooth=False)
acc

0.48