# Part 1: Identifying Misclassified Reviews

### Importing and Concatenating Reviews

In [18]:
import numpy as np
import os
import random
import math

In [38]:
pos_directory = '../Part_3/review_polarity/txt_sentoken/pos'
positive_reviews_list = []

# Read every file in the positive-review directory into a single string per
# review: trailing whitespace is stripped from each line and the lines are
# joined with single spaces.
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# positive_reviews_list depends on the filesystem.
for filename in os.listdir(pos_directory):
    file_path = pos_directory + "/" + filename
    # The context manager closes the handle as soon as the file has been read,
    # instead of leaving one open handle per review behind.
    with open(file_path, "r") as file:
        review = [line.rstrip() for line in file.readlines()]
    positive_reviews_list.append(" ".join(review))

In [39]:
neg_directory = '../Part_3/review_polarity/txt_sentoken/neg'
negative_reviews_list = []

# Read every file in the negative-review directory into a single string per
# review: trailing whitespace is stripped from each line and the lines are
# joined with single spaces.
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# negative_reviews_list depends on the filesystem.
for filename in os.listdir(neg_directory):
    file_path = neg_directory + "/" + filename
    # The context manager closes the handle as soon as the file has been read,
    # instead of leaving one open handle per review behind.
    with open(file_path, "r") as file:
        review = [line.rstrip() for line in file.readlines()]
    negative_reviews_list.append(" ".join(review))

### Creating Model Functions

In [42]:
def create_frequency_dictionary(list_of_reviews):
    """Count how often each whitespace-separated token occurs across reviews.

    Parameters
    ----------
    list_of_reviews : iterable of str
        Review texts; each one is tokenised with ``str.split()``.

    Returns
    -------
    dict
        Maps every token to its total number of occurrences over all
        reviews, with keys in order of first appearance.
    """
    token_counts = {}
    for text in list_of_reviews:
        for token in text.split():
            # Tokens not seen before start from an implicit count of zero.
            token_counts[token] = token_counts.get(token, 0) + 1
    return token_counts

In [43]:
def result_for_class(review, frequency_dictionary):
    """Return the log-score of a review under one class's word counts.

    The score starts at ``log(0.5)`` and, for every whitespace-separated
    token of ``review`` that is not one of the listed punctuation marks,
    adds ``log((count + 1) / (total + 1))``, where ``count`` is the token's
    entry in ``frequency_dictionary`` (0 when absent) and ``total`` is the
    sum of all counts in the dictionary.

    NOTE(review): the denominator is ``total + 1`` rather than
    ``total + vocabulary size`` as in textbook add-one smoothing, and
    punctuation counts still contribute to ``total``. Both are kept as-is so
    that the scores match the ones this notebook's analysis is based on.

    Parameters
    ----------
    review : str
        Review text to score.
    frequency_dictionary : dict
        Maps tokens to their occurrence counts for one class.

    Returns
    -------
    float
        The accumulated log-score; larger means a better fit to the class.
    """
    punctuation = {'.', ',', ':', '"', '&', '?', '-', '(', ')', "'", '/'}
    # The total is the same for every token, so compute it once instead of
    # re-summing the whole dictionary inside the loop.
    denominator = sum(frequency_dictionary.values()) + 1
    class_result = math.log(0.5)
    for word in review.split():
        if word in punctuation:
            continue
        # Unseen tokens have an implicit count of zero, giving log(1 / denominator).
        class_result += math.log((frequency_dictionary.get(word, 0) + 1) / denominator)
    return class_result

### Splitting into Training and Testing Data and Assigning Frequency Dictionary

In [44]:
# Number of reviews per class used for training; everything after this index
# is held out for testing. Named once so the four slices cannot drift apart.
TRAINING_REVIEWS_PER_CLASS = 900

negative_training_data = negative_reviews_list[:TRAINING_REVIEWS_PER_CLASS]
positive_training_data = positive_reviews_list[:TRAINING_REVIEWS_PER_CLASS]
negative_test_data = negative_reviews_list[TRAINING_REVIEWS_PER_CLASS:]
positive_test_data = positive_reviews_list[TRAINING_REVIEWS_PER_CLASS:]

In [45]:
# Build one token-count dictionary per class from its training reviews
# (negative first, then positive).
negative_frequency_dictionary, positive_frequency_dictionary = (
    create_frequency_dictionary(training_reviews)
    for training_reviews in (negative_training_data, positive_training_data)
)

### Creating Lists for Misclassified Reviews

In [48]:
positive_reviews_misclassified = []

for review in positive_test_data:
    # Score the review under the positive model, then under the negative model.
    positive_result, negative_result = (
        result_for_class(review, class_dictionary)
        for class_dictionary in (positive_frequency_dictionary, negative_frequency_dictionary)
    )
    # A positive review is misclassified when the negative model scores it higher.
    if negative_result > positive_result:
        positive_reviews_misclassified.append(review)

In [49]:
negative_reviews_misclassified = []

for review in negative_test_data:
    # Score the review under the positive model, then under the negative model.
    positive_result, negative_result = (
        result_for_class(review, class_dictionary)
        for class_dictionary in (positive_frequency_dictionary, negative_frequency_dictionary)
    )
    # A negative review is misclassified when the positive model scores it higher.
    if negative_result < positive_result:
        negative_reviews_misclassified.append(review)

# Part 2: Analysing Misclassification of Positive Reviews

### Deciding which 5 reviews to analyse

In [53]:
# Draw up to five distinct indices into positive_reviews_misclassified.
# Sampling from a range yields plain Python ints without building a NumPy
# array first, and capping the sample size avoids a ValueError when fewer
# than five reviews were misclassified.
# NOTE(review): the draw is unseeded, so the ids differ between runs.
positive_reviews_sample_ids = random.sample(
    range(len(positive_reviews_misclassified)),
    min(5, len(positive_reviews_misclassified)),
)
positive_reviews_sample_ids

[10, 12, 2, 18, 3]

## Part 2.1: Analysing Positive Review #12

To begin the analysis, we will investigate the tokens with the highest count in the review (and therefore the most influence on its classification score) and see whether these words appear more often in the positive or the negative dictionary.

In [54]:
# Token counts for the single misclassified positive review at index 12.
review12_text = positive_reviews_misclassified[12]
review12_dict = create_frequency_dictionary([review12_text])

In [55]:
# sorted(review12_dict.items(), key=lambda x: x[1], reverse=True)[:20]

The top 20 most frequent words in this first review are:

1. ('.', 46) -> punctuation

2. ('the', 36) -> determiner
 
3. (',', 23) -> punctuation
 
4. ('to', 19) -> particle
 
5. ('and', 16) -> conjunction
 
6. ('in', 14) -> preposition
 
7. ('a', 13) -> determiner
 
8. ('of', 11) -> preposition
 
9. ('"', 10) -> punctuation
 
10. ('(', 9) -> punctuation
 
11. (')', 9) -> punctuation
 
12. ('that', 9) -> determiner/conjunction
 
13. ('is', 9) -> verb
 
14. ('film', 8) -> noun
 
15. ('as', 8) -> conjunction/preposition
 
16. ('car', 7) -> noun
 
17. ('his', 7) -> pronoun
 
18. ('just', 7) -> adjective/adverb
 
19. ('memphis', 6) -> noun
 
20. ('have', 6) -> verb

This is expected, as punctuation and determiners/particles/prepositions etc. are more common than nouns or adjectives. 

We will have a look at a few of the most common nouns, verbs, adjectives and adverbs.

* ('is', 9)
* ('film', 8)
* ('car', 7)
* ('just', 7)
* ('memphis', 6)
* ('have', 6)

In [62]:
common_words = ['is', 'film', 'car', 'just', 'memphis', 'have']

# Report how often each selected word occurs in the positive and negative
# training dictionaries; a word missing from a dictionary counts as 0.
for word in common_words:
    positive_count = positive_frequency_dictionary.get(word, 0)
    negative_count = negative_frequency_dictionary.get(word, 0)
    print(word + " appears in the positive dictionary " + str(positive_count) + " times")
    print(word + " appears in the negative dictionary " + str(negative_count) + " times")

is appears in the positive dictionary 12549 times
is appears in the negative dictionary 9952 times
film appears in the positive dictionary 4376 times
film appears in the negative dictionary 3598 times
car appears in the positive dictionary 112 times
car appears in the negative dictionary 165 times
just appears in the positive dictionary 1197 times
just appears in the negative dictionary 1390 times
memphis appears in the positive dictionary 0 times
memphis appears in the negative dictionary 17 times
have appears in the positive dictionary 1992 times
have appears in the negative dictionary 2408 times
