In [1]:
# imports and setup
import pandas as pd
import numpy as np
import json
import os
from HTMLParser import HTMLParser
# NOTE(review): -1 looks intended to lift the column-width limit so long
# text is shown in full; newer pandas releases expect None for that --
# confirm against the installed pandas version
pd.set_option('display.max_colwidth', -1)

In [2]:
# part 1
# parse the json file of one specific hotel into a python object
with open('data/100506.json') as f:
    dat_100506 = json.loads(f.read())

In [3]:
# three containers: one rating record per review, one full-text record
# per review, and the distinct rating categories in first-seen order
review_ratings = []
review_text = []
ratings_cats = []

# walk over every review in the json file
for review in dat_100506['Reviews']:
    # full review text, tagged with who wrote it and when
    review_text.append({
        'Author': review['Author'],
        'Date': review['Date'],
        'Review': review['Content'],
    })

    # the rating record starts with the same identifying fields ...
    temp_dict = {'Author': review['Author'], 'Date': review['Date']}

    # ... and then gains one entry per rating category in this review
    for k, rating in review['Ratings'].items():
        # remember each category the first time it shows up
        if k not in ratings_cats:
            ratings_cats.append(k)
        temp_dict[k] = rating

    # keep the finished rating record
    review_ratings.append(temp_dict)

In [4]:
# build the ratings dataframe, one row per review
df_100506 = pd.DataFrame(review_ratings)
# cast every rating category column to a numeric dtype
numeric_ratings = df_100506[ratings_cats].apply(pd.to_numeric)
df_100506[ratings_cats] = numeric_ratings

# the full text reviews live in their own dataframe
df_100506_review_text = pd.DataFrame(review_text)
# utf-8 encode the review strings so that I/O functions properly
encoded_reviews = df_100506_review_text.Review.str.encode('utf-8')
df_100506_review_text['Review'] = encoded_reviews

In [5]:
# requested summary statistics: the constant group key places every row in
# a single group labelled 0, and stack() moves the statistic names from the
# columns into the row index
(
    df_100506
    .groupby(lambda _row_label: 0)
    .agg(['mean', 'min', 'max'])
    .stack()
)

Unnamed: 0,Unnamed: 1,"Business service (e.g., internet access)",Check in / front desk,Cleanliness,Location,Overall,Rooms,Service,Sleep Quality,Value
0,mean,1.0,3.0,2.0,4.0,1.666667,1.545455,2.3,2.176471,2.0
0,min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,max,1.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0


In [6]:
#pickle the results and dump reviews to txt
df_100506.to_pickle('ratings_100506.pkl')
# write with mode='w' rather than appending: the output file belongs to this
# one hotel, so re-running the cell must replace it instead of adding a
# second copy of every review
df_100506_review_text.to_csv('reviews_full_text_100506.txt',
                             header=None, index=None,
                             sep='\t', mode='w')

In [7]:
#part II
# load hotel info from all json files into data store
hotel_dat = {}
# os.listdir returns names in arbitrary order and also lists non-json
# entries (hidden files, checkpoint folders); keep only the *.json names
# and sort them so every run processes the same files in the same order
for fn in sorted(os.listdir('data/')):
    if not fn.lower().endswith('.json'):
        continue
    # open json and load its data
    with open(os.path.join('data', fn)) as f:
        d = json.load(f)
    # store hotel info under the name of the file it came from
    hotel_dat[fn] = d['HotelInfo']


In [8]:
#initiate a class extending HTMLParser, overwriting handle_data
class MLStripper(HTMLParser):
    """HTML parser that keeps the text between tags and ignores the tags.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base-class initialiser instead of calling reset() directly:
        # on Python 3 it also sets attributes (e.g. convert_charrefs) that
        # feed() reads, so skipping it raises AttributeError. The explicit
        # HTMLParser.__init__ form is used because super() does not work
        # with the old-style Python 2 HTMLParser class.
        HTMLParser.__init__(self)
        # text chunks collected by handle_data, in the order they arrive
        self.fed = []

    # overwrite how the parser handles data
    def handle_data(self, d):
        """Store one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text chunks joined into a single str."""
        return ''.join(self.fed)

# helper that turns an HTML string into plain text by running it
# through the MLStripper parser class
def strip_tags(html):
    """Return ``html`` with its tags removed, keeping only the text."""
    stripper = MLStripper()
    # the parser collects the text chunks while it consumes the markup
    stripper.feed(html)
    return stripper.get_data()

# Address is the only field that carries HTML, so clean just that one
for hotel_info in hotel_dat.values():
    # some hotels have no Address entry at all; leave those untouched
    if 'Address' in hotel_info:
        hotel_info['Address'] = strip_tags(hotel_info['Address'])

### df_hotel_info
`df_hotel_info` contains 6 columns

* Address: the physical street address of the hotel
* HotelID: a primary hotel identifier
* HotelURL: the URL associated with the json file
* ImgURL: URL of the hotel image on TripAdvisor
* Name: hotel name
* price: estimated price range

In [11]:
#store hotel_info in data frame and present overview
# transpose so that each hotel file is a row and each info field a column
df_hotel_info = pd.DataFrame(hotel_dat).T
# print(...) with a single argument produces the same text on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3
print(df_hotel_info)

                                                         Address  HotelID  \
100506.json     315 Seneca St., Seattle, WA 98101                 100506    
1217974.json  NaN                                                 1217974   
150849.json   NaN                                                 150849    
214680.json   NaN                                                 214680    
240124.json     9100 North Kendall Drive, Miami, FL 33176         240124    
2515575.json    8757 Rio San Diego Drive, San Diego, CA 92108     2515575   
287670.json   c/ Argenteria 37, 08003 Barcelona, Spain            287670    
550994.json   Campomanes 7, 28013 Madrid, Spain                   550994    
655424.json   NaN                                                 655424    
677703.json   NaN                                                 677703    

                                                                                                                                      HotelURL  \
100506

In [10]:
# persist the hotel info dataframe to disk as a pickle
df_hotel_info.to_pickle(path='df_hotel_info.pkl')