In [26]:
import pandas as pd
import os
import math

Step 1. Create mega-document for all negative reviews and another for all positive reviews by concatenating them

# Importing and Concatenating Reviews

In [4]:
pos_directory = '../Part_3/review_polarity/txt_sentoken/pos'
positive_reviews_list = []

# os.listdir returns entries in arbitrary order; sorting keeps the review order
# (and therefore any later split of this list by position) stable between runs.
for filename in sorted(os.listdir(pos_directory)):
    file_path = pos_directory + "/" + filename
    # The context manager closes each file handle as soon as it has been read.
    with open(file_path, "r") as file:
        review = [line.rstrip() for line in file]
    # Join the stripped lines so that each review becomes a single string.
    positive_reviews_list.append(" ".join(review))

In [5]:
neg_directory = '../Part_3/review_polarity/txt_sentoken/neg'
negative_reviews_list = []

# os.listdir returns entries in arbitrary order; sorting keeps the review order
# (and therefore any later split of this list by position) stable between runs.
for filename in sorted(os.listdir(neg_directory)):
    file_path = neg_directory + "/" + filename
    # The context manager closes each file handle as soon as it has been read.
    with open(file_path, "r") as file:
        review = [line.rstrip() for line in file]
    # Join the stripped lines so that each review becomes a single string.
    negative_reviews_list.append(" ".join(review))

In [24]:
# Peek at the first line of one raw review to see what the text looks like.
# Reading inside `with` guarantees the file handle is closed afterwards.
with open('../Part_3/review_polarity/txt_sentoken/pos/cv000_29590.txt', "r") as file:
    first_line = file.readline()
first_line

"films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . \n"

In [None]:
# TODO: change to lowercase and remove punctuation
# TODO: email the BERT stuff

# Naive Bayes Formulas

Bayes' rule (for document d and class c) is P(c|d) = ( P(d|c)P(c) ) / P(d)

For classification this simplifies to P(c|d) ∝ P(d|c)P(c): the denominator P(d) is the same for every class, so dropping it does not change which class scores highest.



To find the most likely class, we find Cmap = argmax P(c|d)

This can also be simplified to Cmap = argmax P(d|c) P(c)

P(d|c) is called the likelihood

P(c) is called the prior

If a document has features x1, x2, ..., xn, then the likelihood is

P(d|c) = P(x1, x2, ..., xn|c)

We have to assume that the features x1 to xn are independent

P(x1, x2, ..., xn|c) = P(x1|c)P(x2|c)...P(xn|c)

Since P(c) is the probability of the class occurring, this will be 50% for both classes, positive and negative. This is because we have an equal number of positive and negative reviews in our training dataset, giving both classes equal prior probabilities.


In [6]:
# Display how many positive reviews were loaded.
len(positive_reviews_list)

1000

Instead of multiplying P(x1|c)P(x2|c)...P(xn|c), we will perform this calculation in the log space.

This will mean that the result will no longer represent a probability, but it will still have the same properties (the higher the result, the more likely it is correct).

We now have argmax[ log(P(c)) + sum over all x of log(P(x|c)) ]

P(cj) = # of documents of class cj / total # of documents

P(wj|ci) = # of times word wj appears in class ci / total # of words that appear in class ci

We have a large dataset; however, it is still possible that there will be 0 occurrences for certain words.

To prevent these words from getting a probability of 0, we use Laplace smoothing.

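We now have P(wj|ci) = ((# of times word wj appears in class ci) + 1) / ((total # of words that appear in class ci) + 1). Note: textbook Laplace (add-one) smoothing adds the vocabulary size |V| to the denominator so that the probabilities sum to 1; this notebook adds 1 instead.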

It might be worth considering a clause to deal with words that don't appear in any reviews (positive/negative).

We could remove stop words; however, this has not been shown to be very beneficial and is not common practice when using Naive Bayes.

# Assigning Important Variables

In [7]:
## Assigning the test and training data
# The first TRAINING_SET_SIZE reviews of each class are used for training;
# the remaining reviews of each class are held out for testing.
TRAINING_SET_SIZE = 900

negative_training_data = negative_reviews_list[:TRAINING_SET_SIZE]
positive_training_data = positive_reviews_list[:TRAINING_SET_SIZE]
negative_test_data = negative_reviews_list[TRAINING_SET_SIZE:]
positive_test_data = positive_reviews_list[TRAINING_SET_SIZE:]

In [19]:
def create_frequency_dictionary(list_of_reviews):
    """Count how often each whitespace-separated token occurs across reviews.

    Args:
        list_of_reviews: iterable of review strings.

    Returns:
        dict mapping each token to its total number of occurrences over all
        reviews, with keys in order of first appearance.
    """
    counts = {}
    for text in list_of_reviews:
        for token in text.split():
            # A token seen for the first time starts from an implicit count of 0.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [20]:
# Word-frequency tables for each class, built from the training reviews only.
negative_dictionary, positive_dictionary = (
    create_frequency_dictionary(negative_training_data),
    create_frequency_dictionary(positive_training_data),
)

In [24]:
# Class priors P(c): the share of training documents that belong to each class.
total_training_documents = len(positive_training_data) + len(negative_training_data)
positive_probability = len(positive_training_data) / total_training_documents
negative_probability = len(negative_training_data) / total_training_documents

In [44]:
def result_for_class(review, class_probability, training_data):
    """Return the unnormalised Naive Bayes log-score of a review for one class.

    The score is ``log(class_probability)`` plus, for every whitespace-separated
    token of ``review`` that is not a skipped punctuation token,
    ``log((count + 1) / (total + 1))``, where ``count`` is the number of times
    the token occurs in ``training_data`` (0 if it never occurs) and ``total``
    is the number of tokens in ``training_data``.

    NOTE(review): the denominator adds 1 rather than the vocabulary size used by
    textbook add-one smoothing; it is kept as-is so the scores are unchanged.

    Args:
        review: the review text to score.
        class_probability: prior probability of the class; must be > 0.
        training_data: list of review strings belonging to the class.

    Returns:
        float: the log-score; higher means the class is more likely.

    Raises:
        ValueError: if ``class_probability`` is not positive (from ``math.log``).
    """
    skipped_tokens = ('.', ',', ':', '"', '&', '?', '-', '(', ')')
    class_result = math.log(class_probability)
    dictionary = create_frequency_dictionary(training_data)
    # Loop-invariant: summing the whole frequency table once instead of once
    # per word avoids O(words * vocabulary) work without changing the result.
    denominator = sum(dictionary.values()) + 1
    for word in review.split():
        if word in skipped_tokens:
            continue
        # Unseen words fall back to a count of 0, i.e. log(1 / denominator).
        class_result += math.log((dictionary.get(word, 0) + 1) / denominator)
    return class_result

In [45]:
# Log-score of one held-out positive review under the positive class.
result_for_class(positive_test_data[1], positive_probability, positive_training_data)

-1846.3421536309943

In [46]:
# Log-score of the same held-out positive review under the negative class.
result_for_class(positive_test_data[1], negative_probability, negative_training_data)

-1841.1022259865354

In [42]:
def determine_class(review):
    """Classify a review as positive or negative with the Naive Bayes scores.

    Each class score starts from the log of its prior. Every whitespace-separated
    token of ``review`` that is not a skipped punctuation token and that occurs
    in at least one of the two training dictionaries then adds
    ``log((count + 1) / (total + 1))`` to BOTH class scores, where ``count`` is
    the token's frequency in that class (0 if absent) and ``total`` is the
    number of tokens counted for that class. Tokens unknown to both classes are
    ignored.

    Relies on the module-level ``positive_dictionary``, ``negative_dictionary``,
    ``positive_probability`` and ``negative_probability``.

    Args:
        review: the review text to classify.

    Returns:
        str: "Positive", "Negative", or "Equal Probability" when the two scores
        are exactly equal.
    """
    skipped_tokens = ('.', ',', ':', '"', '&', '?', '-', '(', ')')
    # Loop-invariant denominators: computed once rather than once per word.
    positive_denominator = sum(positive_dictionary.values()) + 1
    negative_denominator = sum(negative_dictionary.values()) + 1

    positive_class_result = math.log(positive_probability)
    negative_class_result = math.log(negative_probability)
    for word in review.split():
        if word in skipped_tokens:
            continue
        positive_count = positive_dictionary.get(word, 0)
        negative_count = negative_dictionary.get(word, 0)
        if positive_count == 0 and negative_count == 0:
            # The word was never seen in training, so it carries no evidence.
            continue
        # Both scores must receive a term for the same word; updating only one
        # of them would make the two sums incomparable.
        positive_class_result += math.log((positive_count + 1) / positive_denominator)
        negative_class_result += math.log((negative_count + 1) / negative_denominator)
    if positive_class_result > negative_class_result:
        return "Positive"
    elif negative_class_result > positive_class_result:
        return "Negative"
    else:
        return "Equal Probability"

In [47]:
# Count how many held-out positive reviews score higher for the positive class.
accuracy = 0
total = len(positive_test_data)

for review in positive_test_data:
    positive_result = result_for_class(review, positive_probability, positive_training_data)
    negative_result = result_for_class(review, negative_probability, negative_training_data)
    # A bool adds as 0 or 1, so this increments only on a correct prediction.
    accuracy += positive_result > negative_result

In [48]:
# Fraction of the reviews evaluated in the preceding loop that were classified correctly.
accuracy / total

0.78

The model does not perform especially well on the positive reviews. Let's see if it performs better on the negative reviews.

In [49]:
# Count how many held-out negative reviews score higher for the negative class.
accuracy = 0
total = len(negative_test_data)

for review in negative_test_data:
    positive_result = result_for_class(review, positive_probability, positive_training_data)
    negative_result = result_for_class(review, negative_probability, negative_training_data)
    # A bool adds as 0 or 1, so this increments only on a correct prediction.
    accuracy += positive_result < negative_result

In [50]:
# Fraction of the reviews evaluated in the preceding loop that were classified correctly.
accuracy / total

0.89