In [None]:
"""
Jupyter Notebook file which takes in the pickle file produced in
'02 Clean Reviews.ipynb' with punctuated reviews. It also reads in the
output of the Java code which identified the (opinion word, aspect) pair 
for each review. It then calculates the sentiment scores of each aspect
according to their respective sentiment modifiers and stores that data.
The script also keeps track of the 3 best reviews according to some
aspect for all the hotels. All that info is then written to pickle
files or csv files to be processed and used for the website.

"""

# Import Libraries
import nltk
import pandas as pd
import pickle as pk
import ast
import math
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Get sentiment analyser utility from Vader module of nltk. This single
# module-level instance is the one word_sentiment() queries for scores.
# NOTE(review): VADER presumably needs the 'vader_lexicon' NLTK resource
# to be available locally (nltk.download('vader_lexicon')) -- confirm
# that it has been fetched before running this cell.
analyser = SentimentIntensityAnalyzer()

# helper which merges one review's score into the running top 3 list
# stored in the global dictionary best_review for one hotel_aspect key
def _update_best_review(hotel_aspect, text, score):
    """Record (text, score) for hotel_aspect if it beats the weakest entry.

    best_review has the following structure:
        {key: hotel_aspect,
         value: [(review1, score), (review2, score), (review3, score)]}

    Args:
        hotel_aspect: key of the form '<hotel>_<aspect>'.
        text: the review text to store.
        score: positive polarity total of the review for that aspect.
    """
    # a key seen for the first time starts with three empty placeholder
    # slots scored at negative infinity so any real review outranks them
    top_revs = best_review.setdefault(
        hotel_aspect, [("", -math.inf) for _ in range(3)]
    )

    # index of the weakest stored review (the first one wins on ties);
    # None only if the stored list is unexpectedly empty
    weakest = min(
        range(len(top_revs)),
        key=lambda i: float(top_revs[i][1]),
        default=None,
    )

    # replace the weakest review only when the current one is strictly better
    if weakest is not None and score > float(top_revs[weakest][1]):
        top_revs[weakest] = (text, score)


# function takes a row from the table and for each aspect,
# calculates its polarity score
def opinion_sentiment(row):
    """Compute per-aspect polarity totals for one review row.

    Besides returning the totals, the function updates the global
    dictionary best_review so that it keeps, per hotel and aspect, the
    three reviews with the highest positive total.

    Args:
        row: a table row indexable by 'Hotel', 'Reviews' and 'Aspect'.
            'Aspect' holds the Java output: a string in python list
            syntax containing (opinion word, aspect) pairs.

    Returns:
        Dictionary with the following structure:
            {key: aspect, value: [Positive, Negative]}
        where Positive is the sum of the positive opinion word scores
        and Negative is the sum of the negative ones (zero or below).

    Raises:
        ValueError / SyntaxError: if the 'Aspect' value cannot be parsed
            by ast.literal_eval.
    """
    hotel = row['Hotel']
    text = row['Reviews']

    # literal_eval turns the list-formatted string into an actual list of
    # (opinion word, aspect) pairs without evaluating arbitrary code
    aspects = ast.literal_eval(row['Aspect'])

    # dictionary that will have aspect as key and [positive, negative]
    # polarity score as value
    aspect_opinion_word = {}

    # first pass: accumulate the polarity totals of every aspect
    for pair in aspects:
        opinion_word, aspect = pair[0], pair[1]
        totals = aspect_opinion_word.setdefault(aspect, [0, 0])

        # get sentiment score from opinion word
        word_senti = word_sentiment(opinion_word)

        if word_senti > 0:  # Positive
            totals[0] += word_senti
        elif word_senti < 0:  # Negative
            totals[1] += word_senti

    # second pass: update the best reviews once per aspect, using the
    # final positive total. Doing this only after the totals are complete
    # prevents a review mentioning the same aspect several times from
    # taking up more than one of the three slots with partial scores.
    for aspect, totals in aspect_opinion_word.items():
        hotel_aspect = str(hotel) + '_' + str(aspect)
        _update_best_review(hotel_aspect, text, totals[0])

    return aspect_opinion_word

# Given a phrase, return the compound sentiment score (overall polarity measure of phrase)
def word_sentiment(phrase):
    """Return the 'compound' entry of the analyser's polarity scores for phrase."""
    return analyser.polarity_scores(phrase)['compound']

# Open pickle file which was produced in '02 Clean Reviews.ipynb'
# NOTE: unpickling can execute arbitrary code, so this file must only
# ever come from that trusted notebook.
with open("Stored Data/sample_reviews.pickle", "rb") as infile:
    data = pk.load(infile)

# Read in Pre-processed aspects gotten from the output of Java code using Enhanced Dependencies.
# Forward slash keeps the path portable and consistent with the other
# paths in this script (a backslash here is also an invalid string escape).
aspects_table = pd.read_csv("Stored Data/java_output.txt", header=None, delimiter="\t", names = ["Aspects"])

# Assign the single 'Aspects' column explicitly; rows are matched to the
# reviews by index label
data['Aspect'] = aspects_table["Aspects"]

# Module-level dictionary which opinion_sentiment() fills in while the
# rows are processed:
#     {key: hotel_aspect, value: [(review1, score), (review2, score), (review3, score)]}
best_review = {}

# Pass row as input to opinion_sentiment() function 
data['Sentiment'] = data.apply(opinion_sentiment, axis=1)

# Store the data with aspects identified in a new pickle file
with open("Stored Data/sample_review_w_aspects.pickle", "wb") as outfile:
    pk.dump(data, outfile)

# Store the best reviews for each hotel according to an aspect
# in another file
with open("Stored Data/best_reviews_dict.pickle", "wb") as outfile:
    pk.dump(best_review, outfile)

# write the table with identified aspects to a csv to actually see the aspects
data.to_csv(r'Stored Data/sample_review_w_aspects.csv')

print(len(data))