In [6]:
#All Imports.
import os, json
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import random

In [2]:
#Defining most used Data structures and inputs
Features = {'screen':[],'sound':[],'camera':[]}

data = pd.read_json('Jsons2/B004MN00C4.json', typ='series')

In [7]:
#Extracting data from input to data structures
#A Typical Json file of amazon reviews is like:
# {
#     "Reviews":[
#     {
#         "Content":"Actual Review 1"
#     },
#     {
#         "Content":"Actual Review 2"
#     },    
#     .
#     .    
#     ],
#     .
#     .
# }

for review in data['Reviews']:   
    reviewValue = review.get('Content')
    if reviewValue is None:
        continue
    sent_tokenize_list = sent_tokenize(reviewValue) #Tokenize long reviews into individual strings
    for eachReview in sent_tokenize_list:
        for feature in Features:
            if feature in eachReview:
                Features[feature].append(eachReview)

#Long reviews were normalized, as one long review should not change the outcome of the whole sentiment. 
#The dataset didn`t contain info about the reviews which were most useful.
#Hence, assumption made, that longer the review, more helpful it was.



In [8]:

#A preprocessed dataset of pros and cons phrases was taken from NLKT corpus.

#read all statements
short_pos = open("Jsons2/pros.txt","r").read()
short_neg = open("Jsons2/cons.txt","r").read()
stopwords_txt = open("Jsons2/stopwords.txt","r").read()

documents = []
documentspos = []
documentsneg = []
stopwords = []

#labelling statements from pros as pos
for r in short_pos.split('\n'):
    documentspos.append( (r, "pos") )

#labelling statements from cons as neg
for r in short_neg.split('\n'):
    documentsneg.append( (r, "neg") )
    
#Collecting all stopwords in a datastructure
for r in stopwords_txt.split('\n'):
    stopwords.append(r)

#tokenize all words and add to get most frequent words
all_words = []

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    if w not in stopwords:
        all_words.append(w.lower())

for w in short_neg_words:
    if w not in stopwords:
        all_words.append(w.lower())

#From all useful words, we create a freqdistribution        
all_words = nltk.FreqDist(all_words)
print((all_words.items()))

#a total of 12633 words were present, and we considered various values, 10000, 8000, and 5000.
#finally used 7000 to train our model.
#we removed stop words, punctuations from these words.
word_features = list(all_words.keys())[:7000]



#This is the main function which extracts most frequent words from a passed string.
#the words are also lemmatized.
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features


#This function is used to finally train the machine learning algorithms.
featuresetspos = [(find_features(rev), category) for (rev, category) in documentspos]
featuresetsneg = [(find_features(rev), category) for (rev, category) in documentsneg]


poscutoff = len(featuresetspos)*3/4
negcutoff = len(featuresetsneg)*3/4

random.shuffle(featuresetspos)
random.shuffle(featuresetsneg)

trainfeats = featuresetsneg[:negcutoff] + featuresetspos[:poscutoff]
testfeats = featuresetsneg[negcutoff:] + featuresetspos[poscutoff:]


random.shuffle(trainfeats)
random.shuffle(testfeats)

In [10]:
#train Naive Bayes

from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(trainfeats)

In [31]:
print(len(testfeats))

11471


In [28]:
import collections
import nltk.classify.util, nltk.metrics

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

    
#calculate accuracy of Naive Bayes Classifier
accuracy = nltk.classify.util.accuracy(classifier, testfeats)
# pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
# pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
# pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
# neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
# neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
# neg_fmeasure =  nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
        
print('')
print('---------------------------------------')
print("Single Fold for Naive Bayes ")
print('---------------------------------------')
print('accuracy:', accuracy)
# print('precision', (pos_precision + neg_precision) / 2)
# print('recall', (pos_recall + neg_recall) / 2)
# print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)  
                





KeyboardInterrupt: 

In [None]:
##### cross validation

trainfeats = featuresetsneg + featuresetspos    
  
random.shuffle(trainfeats)

n = 5
subset_size = len(trainfeats) / n
    accuracy = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    pos_fmeasure = []
    neg_fmeasure = []
    cv_count = 1
    for i in range(n):        
        testing_this_round = trainfeats[i*subset_size:][:subset_size]
        training_this_round = trainfeats[:i*subset_size] + trainfeats[(i+1)*subset_size:]

        cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
        cv_pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
        cv_pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
        cv_pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        cv_neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
        cv_neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
        cv_neg_fmeasure =  nltk.metrics.f_measure(refsets['neg'], testsets['neg'])

        accuracy.append(cv_accuracy)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)
        pos_fmeasure.append(cv_pos_fmeasure)
        neg_fmeasure.append(cv_neg_fmeasure)

    cv_count += 1
                
        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT Naive Bayes)')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision', (sum(pos_precision)/n + sum(neg_precision)/n) / 2)
        print('recall', (sum(pos_recall)/n + sum(neg_recall)/n) / 2)
        print('f-measure', (sum(pos_fmeasure)/n + sum(neg_fmeasure)/n) / 2)
        print('')
    




In [32]:
sid = SentimentIntensityAnalyzer()
for feature,featureReviews in Features.items():
    pos =0
    neg =0
    for sentence in featureReviews:
        ss = sid.polarity_scores(sentence)
        if ss['pos'] >= ss['neg']:
            pos = pos + 1
        else:
            neg = neg + 1
    if pos > neg:
        print("(+) Good ",feature,"--- pos :",pos,"neg :",neg)
    else:
        print("(-) Bad  ",feature,"--- pos :",pos,"neg :",neg)
    print() 
            
    
    
    

('(+) Good ', 'sound', '--- pos :', 40, 'neg :', 8)
()
('(+) Good ', 'screen', '--- pos :', 472, 'neg :', 66)
()
('(+) Good ', 'camera', '--- pos :', 5138, 'neg :', 800)
()


In [37]:
import pickle
classifier_f = open("NaiveBayesClassifier.pickle", "rb")
NaiveBayesClassifier = pickle.load(classifier_f)
classifier_f.close()




for feature,featureReviews in Features.items():
    pos =0
    neg =0
    for sentence in featureReviews:
        ss = NaiveBayesClassifier.classify(find_features(sentence))
        if 'pos' == ss:
            pos = pos + 1
        else:
            neg = neg + 1
    if pos > neg:
        print("(+) Good ",feature,"--- pos :",pos,"neg :",neg)
    else:
        print("(-) Bad  ",feature,"--- pos :",pos,"neg :",neg)
    print() 
            



('(-) Bad  ', 'sound', '--- pos :', 10, 'neg :', 38)
()
('(-) Bad  ', 'screen', '--- pos :', 160, 'neg :', 378)
()
('(-) Bad  ', 'camera', '--- pos :', 1292, 'neg :', 4646)
()


In [42]:
 NaiveBayesClassifier.classify(find_features("This is not a great awesome phone"))

'pos'

pos
