# Text Classification with Naive Bayes

In [78]:
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def get_file_names(filepath):
    """Return the paths of the regular files directly inside ``filepath``.

    Args:
        filepath: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(filepath, name)`` strings, sorted by entry
        name. Sub-directories are skipped.

    Raises:
        OSError: If ``filepath`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # os.listdir gives entries in arbitrary order; sorting keeps any later
    # position-based slicing of the result reproducible across runs/machines.
    candidates = (
        os.path.join(filepath, entry) for entry in sorted(os.listdir(filepath))
    )
    # Keep regular files only: a directory entry cannot be opened as a file.
    return [path for path in candidates if os.path.isfile(path)]

def get_reviews(files):
    """Read every file in ``files`` and flatten each one to a single string.

    Each line is stripped of leading/trailing whitespace and the lines of a
    file are joined with a single space.

    Args:
        files: Iterable of paths to text files, opened with the platform's
            default encoding.

    Returns:
        A list with one string per input file, in input order.

    Raises:
        OSError: If a file cannot be opened or read.
    """
    reviews = []
    for file in files:
        # The context manager closes the handle even if reading raises.
        with open(file, 'r') as f:
            reviews.append(" ".join(line.strip() for line in f))
    return reviews


In [57]:
# Directories holding the negative and positive review files.
filepath_neg, filepath_pos = (
    '../datasets/movies_reviews/neg',
    '../datasets/movies_reviews/pos',
)

# Path of every review file, negative corpus first, then positive.
neg_reviews_names, pos_reviews_names = map(
    get_file_names, (filepath_neg, filepath_pos)
)


In [76]:
# Raw text of each review, one string per file.
neg_reviews, pos_reviews = map(
    get_reviews, (neg_reviews_names, pos_reviews_names)
)

# Corpus order: the first 800 negatives, then every positive, then the
# remaining negatives, so both ends of the list hold negative reviews.
reviews = [*neg_reviews[:800], *pos_reviews, *neg_reviews[800:]]

# Sanity check: per-class counts, then the combined total.
print(len(neg_reviews), len(pos_reviews))
print(len(reviews))

800
1000 1005
2005


In [72]:
stop_words_file = "../datasets/stop_words.txt"

# The file is read one entry per line, with surrounding whitespace removed.
# Using a context manager guarantees the handle is closed even if reading
# or decoding fails part-way through.
with open(stop_words_file, "r", encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]



In [73]:
# Bag-of-words vectorizer that drops the words listed in `stopwords`.
vector = CountVectorizer(stop_words=stopwords)
# Learn the vocabulary from the whole corpus; as the cell's last expression,
# the fitted estimator is also what the notebook displays.
vector.fit(reviews)

CountVectorizer(stop_words=['a', 'about', 'above', 'across', 'after',
                            'afterwards', 'again', 'against', 'all', 'almost',
                            'alone', 'along', 'already', 'also', 'although',
                            'always', 'am', 'among', 'amongst', 'amoungst',
                            'amount', 'an', 'and', 'another', 'any', 'anyhow',
                            'anyone', 'anything', 'anyway', 'anywhere', ...])

In [74]:
# The fitted vectorizer holds a vocabulary dictionary of all tokens found in
# the raw documents.
print("Print Vocabulary: "+str(vector.vocabulary_)+'\n')

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# list(...) keeps the printed form identical to the old list-returning API.
if hasattr(vector, "get_feature_names_out"):
    feature_names = list(vector.get_feature_names_out())
else:
    feature_names = vector.get_feature_names()

print("Feature names:"+str(feature_names)+'\n')

# Document-term matrix: one row per review, one column per vocabulary entry.
counts = vector.transform(reviews)

print("The shape of count is: "+str(counts.shape)+'\n')

print("Printing count: "+'\n'+str(counts.toarray()))





The shape of count is: (2005, 39373)

Printing count: 
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [84]:
# 1 means positive
# 0 means negative

# Densify the document-term matrix once; the train/test arrays below are
# slices of this single array instead of three separate dense copies.
features = counts.toarray()
n_samples = features.shape[0]

# Hold out the last quarter of the rows for testing. Slicing on an explicit
# index (rather than [:-split] / [-split:]) stays correct when split == 0.
split = n_samples // 4
n_train = n_samples - split

X_train = features[:n_train]
X_test = features[n_train:]

print(len(X_train), len(X_test))

# `reviews` was assembled as neg_reviews[:800] + pos_reviews + neg_reviews[800:];
# build the label vector in that same order so the class counts follow the
# data instead of being hard-coded. The 800 must match that assembly step.
n_neg_head = len(neg_reviews[:800])
labels = np.concatenate((
    np.zeros(n_neg_head),
    np.ones(len(pos_reviews)),
    np.zeros(len(neg_reviews) - n_neg_head),
))
assert len(labels) == n_samples, "labels are out of step with the rows of `counts`"

Y_train = labels[:n_train]
Y_test = labels[n_train:]

print(len(Y_train), len(Y_test))

1504 501
1504 501
