In [None]:
#All Imports.
import json
import os
import random

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [None]:
# Shared inputs and containers used throughout the notebook.
# Features maps each product-feature keyword to the list of review sentences
# that mention it; every keyword starts with its own empty list.
Features = {name: [] for name in ('screen', 'sound', 'camera')}

# Load the review file as a pandas Series; the extraction loop further down
# reads its 'Reviews' entry.
data = pd.read_json('Jsons2/B00B93KG1A.json', typ='series')

In [None]:
#Extracting data from input to data structures
#A Typical Json file of amazon reviews is like:
# {
#     "Reviews":[
#     {
#         "Content":"Actual Review 1"
#     },
#     {
#         "Content":"Actual Review 2"
#     },    
#     .
#     .    
#     ],
#     .
#     .
# }

# Walk every review, split it into sentences, and file each sentence under
# every feature keyword it mentions.
for review in data['Reviews']:
    reviewValue = review.get('Content')
    if reviewValue is None:
        # Reviews without a 'Content' field carry no text to analyse.
        continue
    # Tokenize long reviews into individual sentences.
    sent_tokenize_list = sent_tokenize(reviewValue)
    for eachReview in sent_tokenize_list:
        # Compare case-insensitively so that "Screen" or "CAMERA" is matched
        # as well; the original (un-lowercased) sentence is what gets stored.
        loweredReview = eachReview.lower()
        for feature in Features:
            if feature.lower() in loweredReview:
                Features[feature].append(eachReview)

#Long reviews were normalized, as one long review should not change the outcome of the whole sentiment.
#The dataset didn't contain information about which reviews were most useful.
#Hence, the assumption was made that the longer the review, the more helpful it was.



In [None]:

#A preprocessed dataset of pros and cons phrases was taken from the NLTK corpus.

# Read the labelled phrase files and the stop-word list. Context managers
# ensure each file handle is closed as soon as its contents are read.
with open("Jsons2/pros.txt", "r") as pros_file:
    short_pos = pros_file.read()
with open("Jsons2/cons.txt", "r") as cons_file:
    short_neg = cons_file.read()
with open("Jsons2/stopwords.txt", "r") as stopwords_file:
    stopwords_txt = stopwords_file.read()

# Label every line of the pros file "pos" and every line of the cons file
# "neg". The pros come first, so the list is ordered by label at this point.
documents = [(r, "pos") for r in short_pos.split('\n')]
documents.extend((r, "neg") for r in short_neg.split('\n'))

# One stop word per line of the stop-word file.
stopwords = stopwords_txt.split('\n')
# Lowercased set: O(1) membership tests, and capitalised tokens such as
# "The" are filtered as well (the stored words are lowercased anyway).
stopword_set = {w.lower() for w in stopwords}

# Tokenize both corpora into words.
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

# Frequency distribution over all lowercased, non-stop-word tokens.
all_words = nltk.FreqDist(
    w.lower()
    for w in short_pos_words + short_neg_words
    if w.lower() not in stopword_set
)
print((all_words.items()))

# Author's note: a total of 12633 words were present; 10000, 8000 and 5000
# were considered as vocabulary sizes before settling on 7000.
# NOTE(review): punctuation is only dropped if it is listed in stopwords.txt.
# most_common() returns the entries ordered by descending count, so this is
# the 7000 most frequent words rather than the first 7000 encountered.
word_features = [word for word, _ in all_words.most_common(7000)]



# Turns a text into the feature dictionary used for classification: one
# boolean per vocabulary word in word_features.
def find_features(document):
    """Return a dict mapping each word in ``word_features`` to its presence.

    ``document`` is tokenized with ``word_tokenize`` and the tokens are
    lowercased, because the vocabulary in ``word_features`` is stored in
    lowercase. No stemming or lemmatization is applied.

    Args:
        document: The text (a phrase or sentence) to featurize.

    Returns:
        A dict with one key per entry of ``word_features``; the value is
        True when that word occurs in ``document`` and False otherwise.
    """
    # A set gives O(1) lookups for the per-vocabulary-word membership test.
    words = {token.lower() for token in word_tokenize(document)}
    return {w: (w in words) for w in word_features}


# Build one (feature dict, label) pair per labelled phrase; these pairs are
# the samples intended for training and evaluating a classifier.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Shuffle so that "pos" and "neg" samples, which were appended as two
# consecutive runs, are interleaved before the split below.
random.shuffle(featuresets)

# Train/test split. The author's corpus had 44356 samples, split into 35000
# for training and 9356 for testing. Capping the training size at 80% of the
# data keeps exactly that split for the original corpus while ensuring the
# test set is not empty when a smaller corpus is used.
train_size = min(35000, int(len(featuresets) * 0.8))
trainingData = featuresets[:train_size]
testData = featuresets[train_size:]




In [None]:
# Score every collected sentence with VADER and print, per feature, whether
# positive-leaning sentences outnumber negative-leaning ones.
sid = SentimentIntensityAnalyzer()
for feature, featureReviews in Features.items():
    # A sentence leans positive when its 'pos' score is at least its 'neg'
    # score, so ties (e.g. fully neutral sentences) count as positive.
    leans_positive = [
        scores['pos'] >= scores['neg']
        for scores in map(sid.polarity_scores, featureReviews)
    ]
    pos = sum(leans_positive)
    neg = len(leans_positive) - pos
    # NOTE(review): an equal count, including a feature with no sentences at
    # all (0 vs 0), is reported as "Bad".
    verdict = "(+) Good " if pos > neg else "(-) Bad  "
    print(verdict, feature, "--- pos :", pos, "neg :", neg)
    print()
            
    
    
    