In [0]:
# Load the IMDB file and break its contents into one list entry per line
with open("imdb_labelled.txt", "r") as text_file:
    raw_text = text_file.read()
lines = raw_text.split('\n')

In [0]:
# Preview the first ten raw lines; each one still carries its tab-separated label
lines[:10]

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  \t0',
 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  \t0',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  \t0',
 'Very little music or anything to speak of.  \t0',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  \t1',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  \t0",
 'Wasted two hours.  \t0',
 'Saw the movie today and thought it was a good effort, good messages for kids.  \t1',
 'A bit predictable.  \t0',
 'Loved the casting of Jimmy Buffet as the science teacher.  \t1']

In [0]:
# Build the combined corpus from all three labelled files (IMDB, Yelp, Amazon).
# The list is rebuilt from scratch instead of being extended with `+=`, so
# re-running this cell can never append the Yelp/Amazon lines a second time
# and it still works after `lines` has been parsed further down.
corpus_files = (
    "imdb_labelled.txt",
    "yelp_labelled.txt",
    "amazon_cells_labelled.txt",
)
lines = []
for corpus_file in corpus_files:
    with open(corpus_file, "r") as text_file:
        # One list entry per line of the file, in file order
        lines.extend(text_file.read().split('\n'))

In [0]:
# Split every line on the tab character and keep only well-formed records:
# exactly two fields (sentence, label) with a non-empty label.
lines = [
    fields
    for fields in (line.split("\t") for line in lines)
    if len(fields) == 2 and fields[1] != ''
]

In [0]:
# Preview the first ten parsed records: [sentence, label], where the label is still the string '0' or '1'
lines[:10]

[['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
  '0'],
 ['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
  '0'],
 ['Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
  '0'],
 ['Very little music or anything to speak of.  ', '0'],
 ['The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
  '1'],
 ["The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  ",
  '0'],
 ['Wasted two hours.  ', '0'],
 ['Saw the movie today and thought it was a good effort, good messages for kids.  ',
  '1'],
 ['A bit predictable.  ', '0'],
 ['Loved the casting of Jimmy Buffet as the science teacher.  ', '1']]

In [0]:
# Separate the sentences (first field of every record)
train_documents = [sentence for sentence, _ in lines]
train_documents[:10]

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
 'Very little music or anything to speak of.  ',
 'The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
 "The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.  ",
 'Wasted two hours.  ',
 'Saw the movie today and thought it was a good effort, good messages for kids.  ',
 'A bit predictable.  ',
 'Loved the casting of Jimmy Buffet as the science teacher.  ']

In [0]:
# Separate the labels (second field of every record) and convert them to integers
train_labels = [int(label) for _, label in lines]
train_labels[:10]

[0, 0, 0, 0, 1, 0, 0, 1, 0, 1]

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Instantiate the CountVectorizer.
# `binary` must be the boolean True: the string 'true' only worked by being
# truthy, and recent scikit-learn releases validate parameter types and reject it.
# With binary=True every non-zero term count is recorded as 1 (presence/absence).
count_vectorizer = CountVectorizer(binary=True)
# Learn the vocabulary from the training sentences and encode them as a sparse matrix
train_documents_count = count_vectorizer.fit_transform(train_documents)

In [0]:
# Instantiate the TfidfVectorizer.
# `binary` must be the boolean True: the string 'true' only worked by being
# truthy, and recent scikit-learn releases validate parameter types and reject it.
# binary=True makes the term-frequency part 0/1; the idf weighting and
# normalisation are still applied, so the output values are not just 0/1.
tfidf_vectorizer = TfidfVectorizer(binary=True)
# Learn the vocabulary and idf weights from the training sentences and encode them
train_documents_tfidf = tfidf_vectorizer.fit_transform(train_documents)

In [0]:
# Show the sparse encoding of the first training sentence under both
# vectorizers: binary counts first, then tf-idf weights.
for encoded_documents in (train_documents_count, train_documents_tfidf):
    print(encoded_documents[0])

  (0, 2764)	1
  (0, 5139)	1
  (0, 1401)	1
  (0, 1331)	1
  (0, 75)	1
  (0, 2954)	1
  (0, 166)	1
  (0, 2956)	1
  (0, 4133)	1
  (0, 4890)	1
  (0, 4890)	0.1705983714180228
  (0, 4133)	0.2866535155030133
  (0, 2956)	0.344069060569113
  (0, 166)	0.39646013048660145
  (0, 2954)	0.18376294855720401
  (0, 75)	0.21995093564903362
  (0, 1331)	0.39646013048660145
  (0, 1401)	0.39646013048660145
  (0, 5139)	0.3527636851677853
  (0, 2764)	0.303662775383371


In [0]:
# Training phase: fit one Bernoulli Naive Bayes model per feature representation
from sklearn.naive_bayes import BernoulliNB

# NOTE(review): BernoulliNB binarizes its input (default `binarize=0.0`), so the
# tf-idf weights are reduced to presence/absence before fitting. Both models
# therefore see effectively the same features; confirm this is intended.
count_classifier, tfidf_classifier = (
    BernoulliNB().fit(train_features, train_labels)
    for train_features in (train_documents_count, train_documents_tfidf)
)

In [0]:
# Test phase: read the test sentences and labels from the Excel workbook
import xlrd

# NOTE(review): xlrd 2.0+ reads only legacy .xls files; opening this .xlsx
# presumably requires xlrd < 2.0. Confirm the installed version.
workbook = xlrd.open_workbook("yelp-test.xlsx")
# There are two sheets; ours is the second one, hence index 1
sheet = workbook.sheet_by_index(1)

# Column 0 holds the sentence and column 1 its label; every row of the sheet is read
row_indices = range(sheet.nrows)
test_documents = [sheet.cell(row_index, 0).value for row_index in row_indices]
test_labels = [sheet.cell(row_index, 1).value for row_index in row_indices]

In [0]:
# Vectorize the test sentences once per vectorizer and reuse the result for
# both prediction and scoring (previously each transform was computed twice).
test_documents_count = count_vectorizer.transform(test_documents)
test_documents_tfidf = tfidf_vectorizer.transform(test_documents)

# Making the predictions
pred_count = count_classifier.predict(test_documents_count)
pred_tfidf = tfidf_classifier.predict(test_documents_tfidf)

# Print the accuracy scores (fraction of test sentences classified correctly)
print("Count Vectorizer Accuracy Score: ", count_classifier.score(test_documents_count, test_labels))
print("Tf-Idf Vectorizer Accuracy Score: ", tfidf_classifier.score(test_documents_tfidf, test_labels))

Count Vectorizer Accuracy Score:  0.5643564356435643
Tf-Idf Vectorizer Accuracy Score:  0.5643564356435643


In [0]:
# numpy is not imported anywhere else in this notebook, so import it here;
# without this the cell raises NameError on a fresh kernel.
import numpy

# Convert the list of test labels to a NumPy array for the metric functions below
test_labels = numpy.asarray(test_labels)

In [0]:
# Confusion matrix and classification report for the CountVectorizer model
from sklearn.metrics import classification_report, confusion_matrix

count_confusion = confusion_matrix(test_labels, pred_count)
count_report = classification_report(test_labels, pred_count)
# The separator reproduces the blank lines between the matrix and the report
print(count_confusion, count_report, sep='\n\n\n')

[[35 30]
 [14 22]]


              precision    recall  f1-score   support

           0       0.71      0.54      0.61        65
           1       0.42      0.61      0.50        36

   micro avg       0.56      0.56      0.56       101
   macro avg       0.57      0.57      0.56       101
weighted avg       0.61      0.56      0.57       101



In [0]:
# Confusion matrix and classification report for the TfidfVectorizer model
tfidf_confusion = confusion_matrix(test_labels, pred_tfidf)
tfidf_report = classification_report(test_labels, pred_tfidf)
# The separator reproduces the blank lines between the matrix and the report
print(tfidf_confusion, tfidf_report, sep='\n\n\n')

[[35 30]
 [14 22]]


              precision    recall  f1-score   support

           0       0.71      0.54      0.61        65
           1       0.42      0.61      0.50        36

   micro avg       0.56      0.56      0.56       101
   macro avg       0.57      0.57      0.56       101
weighted avg       0.61      0.56      0.57       101



## Summary

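For this problem, we used the Count Vectorizer and the TF-IDF Vectorizer to learn the vocabulary and encode the sentences, and trained a Bernoulli Naive Bayes classifier on each representation. Both produced the same results. This is expected: Bernoulli Naive Bayes binarizes its input features, so it only sees whether a term is present in a sentence and the TF-IDF weights are discarded. We then attempted to use k-fold cross-validation with our training data. The resulting models provided no improvement, so we have omitted them from the notebook for clarity. The results can be seen above.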