In [179]:
import os
import pandas as pd
import re
import numpy as np
from collections import defaultdict

In [197]:
def tag_data(data, sent, filename):
    data['sentiment'] = (1 if sent == 'POS' else -1)
    data['file_id'] = f'{sent}-{filename[2:5]}'
    return data

In [254]:
def get_unigrams(data_folder):
    """ Find unigrams in given text and return in a pandas dataframe. """
    data = pd.DataFrame()
    sentiments = ['POS', 'NEG']
    for sent in sentiments:
        review_folder = f'{data_folder}/{sent}'
        for file in os.listdir(review_folder):
            new = pd.read_csv(os.path.join(review_folder, file), sep='\t', header=None, names=['ngram', 'pos'])
            new['sentiment'] = (1 if sent == sentiments[0] else -1)
            new['file_id'] = f'{sent}-{file[2:5]}'
            data = data.append(new, sort=False)
    return data

def get_uni_and_bi_grams2(data_folder):
    """ Find unigrams in given text and return in a pandas dataframe. """
    unigrams = pd.DataFrame()
    bigrams = pd.DataFrame()
    sentiments = ['POS', 'NEG']
    for sent in sentiments:
        review_folder = f'{data_folder}/{sent}'
        for file in os.listdir(review_folder):
            # find unigrams
            new_unigrams = pd.read_csv(os.path.join(review_folder, file), sep='\t', header=None, names=['ngram', 'pos'])
            new_unigrams = tag_data(new_unigrams, sent, file)
            unigrams = unigrams.append(new_unigrams, sort=False)
            # find bigrams
            new_bigrams = find_bigrams(new_unigrams['ngram'].values)
            new_bigrams = tag_data(new_bigrams, sent, file)
            bigrams = bigrams.append(new_bigrams, sort=False)            
    return unigrams, bigrams

def get_uni_and_bi_grams(data_folder):
    """ Find unigrams in given text and return in a pandas dataframe. """
    # use dicts to avoid calling append in a loop
    unigrams = []
    unigram_sentiments = []
    unigram_file_ids = []
    bigrams = []
    bigram_sentiments = []
    bigram_file_ids = []
    sentiments = ['POS', 'NEG']
    for sent in sentiments:
        review_folder = f'{data_folder}/{sent}'
        for file in os.listdir(review_folder):
            # find unigrams
            new_unigrams = pd.read_csv(os.path.join(review_folder, file), sep='\t', header=None, names=['ngram', 'pos']).values[:,0]
            unigrams += list(new_unigrams)
            unigram_sentiments += [(1 if sent == 'POS' else -1)]*len(new_unigrams)
            unigram_file_ids += [f'{sent}-{file[2:5]}']*len(new_unigrams)
            # find bigrams
            new_bigrams = get_bigram_list(new_unigrams)
            bigrams += new_bigrams            
            bigram_sentiments += [(1 if sent == 'POS' else -1)]*len(new_bigrams)
            bigram_file_ids += [f'{sent}-{file[2:5]}']*len(new_bigrams)
    unigram_df = pd.DataFrame(list(zip(unigrams, unigram_sentiments, unigram_file_ids)), columns=['ngram', 'sentiment', 'file_id'])
    bigram_df = pd.DataFrame(list(zip(bigrams, bigram_sentiments, bigram_file_ids)), columns=['ngram', 'sentiment', 'file_id'])
    return unigram_df, bigram_df

In [235]:
def find_bigrams(tokens, min_freq=7):
    """ Find bigrams from given text and return in a pandas dataframe. """
    bigrams = defaultdict(int)
    for i in range(1, len(tokens)):
        bigram = f'{tokens[i-1]} {tokens[i]}'
        bigrams[bigram] += 1
    return pd.DataFrame.from_dict(bigrams, orient = 'index')

def get_bigram_list(tokens, min_freq=7):
    """ Find bigrams from given text and return in a pandas dataframe. """
    bigrams = []
    for i in range(1, len(tokens)):
        bigram = f'{tokens[i-1]} {tokens[i]}'
        bigrams += [bigram]
    return bigrams

In [182]:
def unigram_tokenize(content):
    ''' Split into unigrams by punctuation and whitespace, then lowercase and remove trailing whitespace'''
    return np.asarray(list(filter(None,((map(lambda x: x.lower(), map(str.strip, re.split('(\W)', content))))))))

def bigram_tokenize(content):
    tokens = unigram_tokenize(content)
    for i in range(1, len(tokens)):
        yield f'{tokens[i-1]} {tokens[i]}'

In [192]:
def get_bigrams2(data_folder):
    bigrams = pd.DataFrame()
    sentiments = ['POS', 'NEG']
    for sent in sentiments:
        review_folder = f'{data_folder}/{sent}'
        for file in os.listdir(review_folder):
            f = open(os.path.join(review_folder, file), 'r')
            content = f.read()
            new = find_bigrams(content)
            new['file_id'] = int(file[2:5])
            new['sentiment'] = (1 if sent == sentiments[0] else -1)
            bigrams = bigrams.append(new, sort=False)
    return bigrams

def get_bigrams(data_folder):
    bigrams = pd.DataFrame()
    sentiments = ['POS', 'NEG']
    for sent in sentiments:
        review_folder = f'{data_folder}/{sent}'
        for file in os.listdir(review_folder):
            new = pd.read_csv(os.path.join(review_folder, file), sep='\t', header=None, names=['ngram', 'pos'])
            new = find_bigrams(new['ngram'].values)
            new['file_id'] = f'{sent}-{file[2:5]}'
            new['sentiment'] = (1 if sent == sentiments[0] else -1)
            bigrams = bigrams.append(new, sort=False)
    return bigrams

In [184]:
def get_full_text(data_folder):
    full_reviews = pd.DataFrame()
    sentiments = ['POS', 'NEG']
    for sent in sentiments:
        review_folder = f'{data_folder}/{sent}'
        for file in os.listdir(review_folder):
            f = open(os.path.join(review_folder, file), 'r')
            new = pd.DataFrame()
            content = f.read()
            new['review'] = [content]
            new['file_id'] = int(file[2:5])
            new['sentiment'] = (1 if sent == sentiments[0] else -1)
            full_reviews = full_reviews.append(new, sort=False)
    return full_reviews    

In [185]:
def preprocess_ngrams(ngrams, sent, min_count):
    counts = ngrams[ngrams['sentiment'] == sent]['ngram'].value_counts()
    filtered = counts[counts >= min_count]
    return filtered/sum(filtered)

In [186]:
def get_pos_neg_probabilites(text, tokenize, data, min_freq):
    pos = 0
    neg = 0
    pos_ngrams = preprocess_ngrams(data, 1, min_freq) 
    neg_ngrams = preprocess_ngrams(data, -1, min_freq) 
    for word in tokenize(text):
        if word in pos_ngrams.index:
            pos += np.log(pos_ngrams.loc[word])
        if word in neg_ngrams.index:
            neg += np.log(neg_ngrams.loc[word])
    pos_prior = (data['sentiment'] > 0).sum()/len(data)
    pos += np.log(pos_prior)
    neg += np.log(1-pos_prior)
    return pos, neg

def naive_binary_bayes(text, unigrams=None, bigrams=None):
    if unigrams is None and bigrams is None:
        raise ValueError('Please choose to use either unigrams or bigrams by providing the data')
    pos_prob = 0
    neg_prob = 0
    if unigrams is not None:
        pos, neg = get_pos_neg_probabilites(text, unigram_tokenize, unigrams, min_freq=4)
        pos_prob += pos
        neg_prob += neg
    if bigrams is not None:
        pos, neg = get_pos_neg_probabilites(text, bigram_tokenize, bigrams, min_freq=7)
        pos_prob += pos
        neg_prob += neg
    #print('pos', pos_prob)
    #print('neg', neg_prob)
    return 1 if pos_prob > neg_prob else -1

## 1. Split data into two

In [187]:
def single_split(data, max_training_id):
    mask = data['file_id'].apply(lambda x: int(x[4:7]) <= max_training_id)
    return data[mask], data[~mask]

In [256]:
unigrams, bigrams, = get_uni_and_bi_grams('data-tagged')

In [261]:
training_unigrams, test_data = single_split(unigrams, 899)
training_bigrams, _ = single_split(bigrams, 899)

In [262]:
naive_binary_bayes('this was excellent', unigrams=training_unigrams)

1

In [263]:
naive_binary_bayes('a great movie', bigrams=training_bigrams)

1

In [264]:
naive_binary_bayes('this is a very bad movie', unigrams=training_unigrams)

-1

In [265]:
naive_binary_bayes('this is a very bad movie', bigrams=training_bigrams)

1

In [266]:
naive_binary_bayes('this is a very bad movie', bigrams=training_bigrams)

1

In [267]:
naive_binary_bayes('i love', unigrams=training_unigrams, bigrams=training_bigrams)

1

In [268]:
naive_binary_bayes('i hate', unigrams=training_unigrams)

1

In [271]:
def estimate_accuracy(test_data, unigrams=None, bigrams=None):
    acc = 0
    i = 0
    for file_id, group in test_data.groupby('file_id'):
        i += 1
        label = naive_binary_bayes(' '.join(group['ngram'].values), unigrams=unigrams, bigrams=bigrams)
        #print('pred', label)
        #print('true', group['sentiment'].unique()[0])
        #print((label == group['sentiment'].unique()[0]))
        assert(group['sentiment'].nunique() == 1)
        acc += (label == group['sentiment'].unique()[0])
        #print(acc/i)
    return acc/test_data['file_id'].nunique()

In [272]:
acc = estimate_accuracy(test_data, unigrams=training_unigrams)
acc

0.355

In [None]:
acc = estimate_accuracy(test_data, bigrams=training_bigrams)
acc

In [None]:
acc = estimate_accuracy(test_data,  unigrams=training_unigrams, bigrams=training_bigrams)
acc

In [None]:
pos, neg = preprocess_ngrams(unigrams, 4) 

In [None]:
pos[10:150]

In [199]:
%%time
unigrams, bigrams = get_uni_and_bi_grams('data-tagged/')

KeyboardInterrupt: 

In [None]:
find_bigrams()