In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

In [2]:
# Read the labelled comments file (opened read-only). Each line is expected to
# hold one comment and its label, separated by a tab character.
# Label value 1 is positive and 0 is negative.
with open("labelled_shorttexts.txt", "r") as shorttext_file:
    raw_text = shorttext_file.read()

# Break the text on the newline character: one list element per line of the file
shorttext_list = raw_text.split('\n')
print('\n', 'Size of raw data: ',len(shorttext_list))


 Size of raw data:  3000


In [3]:
# Pre-process the raw lines: break each line on the tab character so every
# entry becomes a list of fields, keeping only entries with exactly two fields
# (comment and label).
split_lines = (raw_line.split("\t") for raw_line in shorttext_list)
shorttext_list = [fields for fields in split_lines if len(fields) == 2]
print('\n', 'Size of list after eliminating invalid lengths: ',len(shorttext_list) )

# Drop entries whose label field is an empty string
shorttext_list = [fields for fields in shorttext_list if fields[1]]
print('\n', 'Size of list after eliminating invalid labels: ',len(shorttext_list) )

# Show a few cleaned entries (elements 1 to 4) as a sanity check
print('\n', 'Snapshot of file as list after cleansing: ', '\n', '\n', shorttext_list[1:5]  )


 Size of list after eliminating invalid lengths:  2993

 Size of list after eliminating invalid labels:  2989

 Snapshot of file as list after cleansing:  
 
 [['Good case, Excellent value.', '1'], ['Great for the jawbone.', '1'], ['Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!', '0'], ['The mic is great.', '1']]


In [4]:
# Separate the cleaned [comment, label] entries into two parallel lists
# for the algorithm.
# comments: the short text of every entry
comments = [pair[0] for pair in shorttext_list]
# labels: the label of every entry, converted from string to int
labels = list(map(int, (pair[1] for pair in shorttext_list)))

In [5]:
# Represent each comment as a binary bag-of-words vector: with binary=True every
# non-zero term count is set to 1, i.e. the matrix records term presence rather
# than term frequency.
# `binary` must be the boolean True. The string 'true' only worked because it is
# truthy, and scikit-learn releases that validate constructor parameters reject it.
count_vectorizer = CountVectorizer(binary=True)
# NOTE: `comments` is rebound here from the list of strings to the fitted
# document-term matrix, so this cell cannot be re-run on its own without first
# re-running the cell that builds the list.
comments = count_vectorizer.fit_transform(comments)
# Matrix shape is (number of comments, vocabulary size)
print ('\n', 'Term frequency matrix shape for comments: ', comments.shape)


 Term frequency matrix shape for comments:  (2989, 5155)


In [6]:
# Fit a Bernoulli Naive Bayes model on the vectorized comments and their labels.
# fit() returns the fitted estimator, which is kept as `classifier`.
naive_bayes_model = BernoulliNB()
classifier = naive_bayes_model.fit(comments, labels)

In [7]:
# Define function to test the input text
def validate_comment(text):
    """Classify one comment and print whether it is positive or negative.

    The text is vectorized with the module-level ``count_vectorizer`` and the
    result is passed to the module-level ``classifier``. A predicted label of
    1 is reported as positive; any other label is reported as negative.

    Args:
        text: The comment to classify, as a single string.

    Returns:
        None. The verdict is printed to standard output.
    """
    features = count_vectorizer.transform([text])
    # Exactly one document is submitted, so exactly one label comes back
    predicted_label = classifier.predict(features)[0]
    if predicted_label != 1:
        print ('This is a negative comment', '\n')
        return
    print ('This is a positive comment', '\n')

In [8]:
# Main program: interactively classify comments typed by the user until they
# enter "exit" (case-insensitive, surrounding whitespace ignored).
print ('\n','Enter text to validate if positive or negative...')  
print ('\n','Enter "exit" to quit this program')

while True:
    try:
        inputstring = input()
    except EOFError:
        # Standard input was closed (e.g. a non-interactive run): stop cleanly
        # instead of failing with a traceback.
        break

    cleaned_input = inputstring.strip()
    # Stray whitespace around "exit" must still quit rather than be classified
    if cleaned_input.lower() == 'exit':
        break
    # A blank line carries no text to classify; wait for the next entry
    if not cleaned_input:
        continue

    validate_comment(cleaned_input)

print('\n', 'Quitting program')
    


 Enter text to validate if positive or negative...

 Enter "exit" to quit this program
The ambience was good at that restaurant
This is a positive comment 

I did not like the taste 
This is a negative comment 

I don't like going there anymore
This is a negative comment 

The price was fair
This is a positive comment 

exit

 Quitting program
