# Text Classification with Naive Bayes

In [163]:
import os

import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

def get_file_names(filepath):
    """Return the path of every entry found directly inside a directory.

    Parameters
    ----------
    filepath : str
        Path to the directory to list.

    Returns
    -------
    list of str
        One path per directory entry, built by joining ``filepath`` with the
        entry name, in the order ``os.listdir`` reports the entries.
    """
    return [os.path.join(filepath, entry) for entry in os.listdir(filepath)]

def get_reviews(files, encoding=None):
    """Read each file into a single one-line string.

    Every line of a file is stripped of leading/trailing whitespace and the
    stripped lines are joined with a single space, so each file becomes one
    string.

    Parameters
    ----------
    files : iterable of str
        Paths of the files to read.
    encoding : str or None, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    list of str
        One string per input path, in the same order as ``files``.
    """
    reviews = []
    for path in files:
        # `with` guarantees the handle is closed even if reading raises.
        with open(path, 'r', encoding=encoding) as f:
            reviews.append(" ".join(line.strip() for line in f))
    return reviews


In [164]:
# Directories holding the negative, positive and newly found reviews.
filepath_neg = '../datasets/movies_reviews/neg'
filepath_pos = '../datasets/movies_reviews/pos'
filepath_new = "../datasets/movies_reviews/newly_found_reviews"

# List the file paths inside each directory (same order: neg, pos, new).
neg_reviews_names, pos_reviews_names, new_reviews_names = (
    get_file_names(directory)
    for directory in (filepath_neg, filepath_pos, filepath_new)
)

In [174]:
# Read the text of every review, keeping the three groups separate.
neg_reviews = get_reviews(neg_reviews_names)
pos_reviews = get_reviews(pos_reviews_names)
new_reviews = get_reviews(new_reviews_names)

# Combined corpus, ordered negative -> positive -> new.
reviews = [*neg_reviews, *pos_reviews, *new_reviews]

# Sanity check: size of each group, then size of the combined corpus.
print(*(len(group) for group in (neg_reviews, pos_reviews, new_reviews)))
print(len(reviews))

1000 1005 5
2010


In [166]:
stop_words_file = "../datasets/stop_words.txt"

# List containing words that we don't want to vectorize (one word per line
# in the file). The `with` block closes the file even if reading fails.
with open(stop_words_file, "r", encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]


## Using CountVectorizer
Now that we have our movie reviews, how can we convert them into a form of data that can be analyzed? 
We can convert each movie review into a vector in which the element at index i is the number of occurrences, in that review, of the word corresponding to index i. 

In this way, each word is an attribute, and each movie review is a data entry. For each word (attribute), each movie review has an attribute value equal to the number of occurrences of that word in the review. 

A subtle issue here is that words like "the", "a", "an", "this", "that", etc. are common words that appear in almost all texts. They have little or no influence on whether a review is positive or negative. Our stopwords list contains these words, and we can pass it to our vectorization method to tell it not to vectorize these words when it encounters them.

And lastly, for each movie review, there would be a class label, either positive or negative.


In [167]:
# Create a CountVectorizer instance and tell it not to vectorize the words in
# the `stopwords` list.
vector = CountVectorizer(stop_words=stopwords)
# Fit the vectorizer on the combined review list. The count matrix itself is
# produced in a later cell by `vector.transform(reviews)`.
# NOTE(review): `reviews` also contains the rows later used for testing and the
# newly found reviews, so the vocabulary is learned from them too - confirm
# that this is intended.
vector.fit(reviews)

CountVectorizer(stop_words=['a', 'about', 'above', 'across', 'after',
                            'afterwards', 'again', 'against', 'all', 'almost',
                            'alone', 'along', 'already', 'also', 'although',
                            'always', 'am', 'among', 'amongst', 'amoungst',
                            'amount', 'an', 'and', 'another', 'any', 'anyhow',
                            'anyone', 'anything', 'anyway', 'anywhere', ...])

In [178]:
# `vocabulary_` maps each word (key) to that word's index (value) in the vector
# representation of our movie reviews. For example, if 'plot' maps to index
# 26306, then index 26306 of vector j holds the number of occurrences of the
# word 'plot' in movie review j.
print("Print Vocabulary: "+str(vector.vocabulary_)+'\n')

# The 'attributes' of our dataset, i.e. the words that we used to vectorize our
# movie reviews. `get_feature_names()` was removed in scikit-learn 1.2 in favour
# of `get_feature_names_out()`, so use whichever this installation provides.
if hasattr(vector, "get_feature_names_out"):
    feature_names = vector.get_feature_names_out().tolist()
else:
    feature_names = vector.get_feature_names()
print("Feature names:"+str(feature_names)+'\n')

# Vector representation of our movie reviews: one row per review, one column
# per vocabulary word.
counts = vector.transform(reviews)
print("The shape of our vectors is: "+str(counts.shape)+'\n')

print("Printing our vectors: "+'\n'+str(counts.toarray()))





The shape of our vectors is: (2010, 39419)

Printing our vectors: 
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [169]:
# Class labels: 1 means positive, 0 means negative.
#
# `reviews` was assembled as negative + positive + new, so the labelled rows
# come first. Derive the sizes from the loaded lists instead of hard-coding
# them, so the labels stay aligned if the dataset changes.
n_neg = len(neg_reviews)
n_pos = len(pos_reviews)
n_labeled = n_neg + n_pos

# Slice first, then densify, so only the labelled rows are converted.
X = counts[:n_labeled].toarray()
# First n_neg rows are negative reviews (0), the next n_pos are positive (1).
Y = np.concatenate((np.zeros(n_neg), np.ones(n_pos)))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)


In [170]:
# Train a multinomial Naive Bayes classifier on the training split.
clf = MultinomialNB().fit(X=X_train, y=Y_train)

In [171]:
# Accuracy on the same rows the classifier was trained on.
# (The sklearn.metrics imports live in the import cell at the top of the notebook.)
predicted = clf.predict(X_train)
print("Accuracy for training data: ", accuracy_score(Y_train, predicted))

Accuracy for training data:  0.9828741623231572


In [172]:
# Accuracy on the held-out test split.
predicted = clf.predict(X_test)
print("Accuracy for testing data: ", accuracy_score(y_true=Y_test, y_pred=predicted))

Accuracy for testing data:  0.8081570996978852


In [173]:
# The new reviews were appended after the negative and positive ones, so their
# rows start right after the labelled rows. Derive that offset from the loaded
# lists rather than hard-coding it.
n_labeled = len(neg_reviews) + len(pos_reviews)
predicted = clf.predict(counts[n_labeled:].toarray())

# Pair each prediction with the path the review was actually read from, rather
# than re-listing the directory and assuming the order and count are unchanged.
for review_path, label in zip(new_reviews_names, predicted):
    print("for this review: ", os.path.basename(review_path), ", our classifier classified it as: ", label)

for this review:  five_star_drstrange_review.txt , our classifier classified it as:  1.0
for this review:  four_star_mulan_review.txt , our classifier classified it as:  1.0
for this review:  one_star_divergent_review.txt , our classifier classified it as:  0.0
for this review:  three_star_joker_review.txt , our classifier classified it as:  0.0
for this review:  two_star_alita_review.txt , our classifier classified it as:  0.0


#### Discussion
If we define a rating of >= 3 as positive (1) and a rating of < 3 as negative (0), then out of the 5 new movie reviews that I found, my classifier classified 4 correctly and 1 incorrectly. It correctly classified the 5-star and 4-star reviews as positive (1). It incorrectly classified the 3-star review as negative (0). It correctly classified the 2-star and 1-star reviews as negative (0).