In [None]:
"""
This Jupyter Notebook builds a database of nested dictionaries containing the
data needed for web integration. It reads the pickle files created by the
previous notebooks and writes the new data to new files in a format that is
appropriate for our website's backend database.

"""

# Import Libraries
import pandas as pd
import pickle as pk
import numpy as np
from collections import defaultdict
import time

# Load the review sample produced in '03 Aspect Based Sentiment Analysis.ipynb'
# (reviews with their identified aspects). It has columns:
# ['Hotel', 'City', 'Review', 'Date', 'Score', 'Aspect', 'Sentiment']
# NOTE: unpickling can execute arbitrary code, so only load files that were
# generated by the earlier notebooks of this project.
# The 'with' block guarantees the file is closed even if loading fails.
with open("Stored Data/sample_review_w_aspects.pickle", "rb") as infile:
    data_reviews = pk.load(infile)

# Load the hotel sample produced in '01 Fitting Dataset.ipynb', which keeps
# only the useful information, that is, the columns:
# ['Hotel', 'Address', 'Score', 'Latitude', 'Longitude', 'City']
with open("Stored Data/sample_hotels.pickle", "rb") as infile:
    data_hotels = pk.load(infile)

# Drop every review row that has a missing value in any of its columns
data_reviews = data_reviews.dropna(axis="rows", how="any")

# Main dictionary which will hold the hotel data.
# hotel_dict structure:
#   {key: 'hotel name_city', value: [Address, Score, Latitude, Longitude, City]}
# The 'hotel name_city' key is used to uniquely identify each hotel.
hotel_dict = {}

# Position 0 of every row is the DataFrame index, so the hotel fields start at
# position 1 (hotel name) and end at position 6 (city).
for row in data_hotels.itertuples():
    hotel_key = "{}_{}".format(row[1], row[6])
    # The first row seen for a key wins; later duplicates are ignored
    if hotel_key not in hotel_dict:
        hotel_dict[hotel_key] = [row[2], row[3], row[4], row[5], row[6]]
      
def get_city(hotel_dict):
    """Count the number of hotels located in each city.

    Args:
        hotel_dict: Mapping whose values are sequences holding the hotel's
            city at position 4 (the keys are not used).

    Returns:
        A dictionary {city: number of hotels in that city}, with the cities
        in the order in which they were first encountered.
    """
    hotels_per_city = {}

    for details in hotel_dict.values():
        city = details[4]
        # Start from 0 the first time a city shows up
        hotels_per_city[city] = hotels_per_city.get(city, 0) + 1

    return hotels_per_city

# Number of hotels per city, computed from the hotel data gathered above
city_dict = get_city(hotel_dict)

# data_reviews columns: ['Hotel', 'City', 'Review', 'Date', 'Score', 'Aspect', 'Sentiment']
def timeline(data_reviews, hotels=None):
    """Group the aspect polarity scores of the reviews by date, city and aspect.

    Args:
        data_reviews: Object exposing ``itertuples()`` (a pandas DataFrame in
            this notebook). Rows are read by position:
            ['Index', 'Hotel', 'City', 'Review', 'Date', 'Score', 'Aspect', 'Sentiment'],
            where 'Sentiment' is a dictionary
            {aspect: [positive_score, negative_score]}.
        hotels: Optional mapping
            {'hotel name_city': [Address, Score, Latitude, Longitude, City]}.
            When omitted, the module-level ``hotel_dict`` is used.

    Returns:
        A tuple (timeline_dict, aspect_dict) where
          timeline_dict is
            {date: {city: {aspect: [[hotel_city, polarity], ...]}}}
            with polarity = positive_score + negative_score, and
          aspect_dict is {aspect: number of times the aspect occurs}.

    Raises:
        KeyError: If a review that mentions at least one aspect refers to a
            'hotel name_city' key which is missing from ``hotels``.
    """
    if hotels is None:
        # Backward compatible default: the hotel data built at module level
        hotels = hotel_dict

    timeline_dict = {}
    aspect_dict = {}

    for row in data_reviews.itertuples():
        # Both values are the same for every aspect of a review, so they are
        # computed once per row instead of once per aspect
        date = row[4]
        hotel_city = "{}_{}".format(row[1], row[2])

        for aspect, scores in row[7].items():
            # Keep count of the number of times each aspect occurs
            aspect_dict[aspect] = aspect_dict.get(aspect, 0) + 1

            # The lookup stays inside this loop on purpose: a review without
            # any aspect never needs its hotel to be present in ``hotels``
            city = hotels[hotel_city][4]
            entry = [hotel_city, scores[0] + scores[1]]

            # Create the date -> city -> aspect levels on demand, then record
            # the hotel together with its polarity for this aspect
            by_city = timeline_dict.setdefault(date, {})
            by_aspect = by_city.setdefault(city, {})
            by_aspect.setdefault(aspect, []).append(entry)

    return timeline_dict, aspect_dict

"""
timeline_dict structure with examples:
{'7/29/2017': 
    {'Amsterdam, Netherlands': 
        {  'Bed':         [['Best Western Delphi Hotel_Amsterdam, Netherlands', 0.5106]],
           'Breakfast':   [['citizenM Amsterdam_Amsterdam, Netherlands', 0.5719],
                           ['INK Hotel Amsterdam MGallery by Sofitel_Amsterdam, Netherlands', -0.1027]],
           'Cleanliness': [['DoubleTree by Hilton Amsterdam Centraal Station_Amsterdam, Netherlands', 0]],
           'Room':        [['Jaz Amsterdam_Amsterdam, Netherlands', 0.4215],
                           ['citizenM Amsterdam_Amsterdam, Netherlands', 0],
                           ['Best Western Delphi Hotel_Amsterdam, Netherlands', 0],
                           ['Mercure Hotel Amsterdam West_Amsterdam, Netherlands', 0]],
           'Shower':      [['Best Western Delphi Hotel_Amsterdam, Netherlands', 0.4019],
                           ['Grand Hotel Amr th Amsterdam_Amsterdam, Netherlands', 0],
                           ['Jaz Amsterdam_Amsterdam, Netherlands', 0]],
           'Staff':       [['Apollo Hotel Amsterdam_Amsterdam, Netherlands', 0.9628],
                           ['NH Collection Amsterdam Grand Hotel Krasnapolsky_Amsterdam, Netherlands', 0.5563],
                           ['Mercure Hotel Amsterdam West_Amsterdam, Netherlands', 0.5413],
                           ['Grand Hotel Amr th Amsterdam_Amsterdam, Netherlands', 0.4754],
                           ['citizenM Amsterdam_Amsterdam, Netherlands', 0.5574],
                           ['Crowne Plaza Amsterdam South_Amsterdam, Netherlands', 0]],
           'Taxi':        [['Hotel Esther a_Amsterdam, Netherlands', 0]],
           'Transport':   [['Mercure Hotel Amsterdam West_Amsterdam, Netherlands', 0]]
        }
    }
}

"""

# Build the timeline and the aspect counts from the reviews; 'overall' keeps
# the returned pair (timeline_dict, aspect_dict), which is then unpacked
overall = timeline(data_reviews)
timeline_dict, aspect_dict = overall

# Store the hotel, city, aspect and timeline data (in that order), one pickle
# file per dictionary. The 'with' block guarantees that each file is closed
# even if dumping fails.
for path, payload in (
    ("Stored Data/hotel_dict.pickle", hotel_dict),
    ("Stored Data/city_dict.pickle", city_dict),
    ("Stored Data/aspect_dict.pickle", aspect_dict),
    ("Stored Data/timeline_dict.pickle", timeline_dict),
):
    with open(path, "wb") as outfile:
        pk.dump(payload, outfile)