# Scraping Hotel Ratings on Tripadvisor

In this homework we will practice web scraping. Let's get some basic information for each hotel in Boston.

On each hotel page, scrape the Traveler ratings. **(10 pts)**

![Information to be scraped](traveler_ratings.png)

Save the data in "traverler_ratings.csv" in the following format:

hotel_name, rating, count

In [1]:
from bs4 import BeautifulSoup
import sys
import time
import os
import logging
import argparse
import requests
import codecs
import json
import urllib
import pandas as pd
from itertools import chain

# Prefix that the scraping functions join with site-relative paths, and the
# browser identity sent in the User-Agent header of every request.
base_url = "http://www.tripadvisor.com"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36"


# The notebook reuses a command-line parser, but feeds it a fixed argument
# list below instead of reading sys.argv.
parser = argparse.ArgumentParser(description='Scrape tripadvisor')
parser.add_argument(
    '-datadir',
    type=str,
    help='Direcotry to store row html files',
    default="data/",
)
parser.add_argument(
    '-state',
    type=str,
    help='State for which the city data is required.',
)
parser.add_argument(
    '-city',
    type=str,
    help='City for which the city data is required.',
    required=True,
)
parser.add_argument(
    '-v', '--verbose',
    help="Set log level to debug",
    action="store_true",
)

args = parser.parse_args([
    '-state', 'Massachusetts',
    '-city', 'Tourism-g60745-Boston_Massachusetts-Vacations.html',
    '-datadir', 'C:/Shou/BU computer science/CS591/HW/Assignment_6/homework-3-1-jasshouchen/datastorage',
])
print(args)

# Module logger: ERROR by default, DEBUG when --verbose was given; records go
# to stderr prefixed with a timestamp.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG if args.verbose else logging.ERROR)
loghandler = logging.StreamHandler(sys.stderr)
loghandler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s"))
log.addHandler(loghandler)


Namespace(city='Tourism-g60745-Boston_Massachusetts-Vacations.html', datadir='C:/Shou/BU computer science/CS591/HW/Assignment_6/homework-3-1-jasshouchen/datastorage', state='Massachusetts', verbose=False)


In [2]:
def get_tourism_page(city, state):
    """
        Return the URL of the tourism page of a city.

        Requests base_url + "/" + city, saves the returned HTML under
        args.datadir and returns the href of the page's
        <link hreflang="en"> element.

        city  -- path component appended to base_url
        state -- not used by this function; kept so existing calls
                 still work

        Raises requests.HTTPError on an error status, requests.Timeout
        when the server does not answer in time, and ValueError when the
        page has no English <link> element.
    """

    # EXAMPLE: http://www.tripadvisor.com/Boston

    url = base_url + "/" + city

    log.info("URL TO REQUEST: %s \n", url)
    # Given the url, request the HTML page. Without a timeout a stalled
    # connection would block the notebook forever.
    headers = {'User-Agent': user_agent}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # response.text is already a decoded str, so no encode/decode round
    # trip is needed.
    html = response.text
    print(type(html))
    # Save to file. The content is HTML even though the name ends in .json;
    # the name is kept so previously saved data is still found.
    with open(os.path.join(args.datadir, city + '-search-page.json'), "w", encoding='utf-8') as h:
        h.write(html)

    soup = BeautifulSoup(html, 'lxml')
    li = soup.find("link", {"hreflang": "en"})
    if li is None or not li.get('href'):
        raise ValueError('No <link hreflang="en"> element found on %s' % url)
    return li['href']

In [3]:
def get_city_page(tourism_url):
    """
        Get the URL of the hotels of the city
        using the URL returned by the function
        get_tourism_page()

        The tourism page is requested, saved under args.datadir and
        searched for the <li class="hotels twoLines"> entry; the href of
        the first link inside that entry is returned as found in the page.

        Raises requests.HTTPError on an error status, requests.Timeout
        when the server does not answer in time, and ValueError when the
        hotels entry or its link is missing.
    """

    url = tourism_url

    # Given the url, request the HTML page. Without a timeout a stalled
    # connection would block the notebook forever.
    headers = {'User-Agent': user_agent}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # response.text is already a decoded str, so no encode/decode round
    # trip is needed.
    html = response.text
    print(type(html))
    # Save to file
    with open(os.path.join(args.datadir, args.city + '-tourism-page.html'), "w", encoding='utf-8') as h:
        h.write(html)
    soup = BeautifulSoup(html, 'lxml')

    li = soup.find("li", {"class": "hotels twoLines"})
    city_url = li.find('a', href=True) if li is not None else None
    if city_url is None:
        raise ValueError('No hotels link found on %s' % url)
    log.info("CITY PAGE URL: %s", city_url['href'])

    return city_url['href']

In [4]:
def get_hotellist_page(city_url, count):
    """ Get the hotel list page given the url returned by
        get_city_page(). Return the html after saving
        it to the datadir

        city_url -- path of the list page; it is appended to base_url
        count    -- page counter, used only in the saved file name

        Raises requests.HTTPError on an error status and requests.Timeout
        when the server does not answer in time.
    """

    url = base_url + city_url
    # Sleep 2 sec before starting a new http request
    time.sleep(2)
    # Request page. The timeout keeps a stalled connection from blocking
    # the crawl; the status check keeps error pages out of the datadir.
    headers = {'User-Agent': user_agent}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    # response.text is already a decoded str
    html = response.text
    # Save the file
    with open(os.path.join(args.datadir, args.city + '-hotelist-' + str(count) + '.html'), "w", encoding='utf-8') as h:
        h.write(html)
    return html

In [5]:
def parse_hotellist_page(html):
    """ Parse the html pages returned by get_hotellist_page().

        Logs the name, number of reviews and star rating of every hotel
        box found, then returns the href of the pagination link labelled
        "Next" (a city can have more than one page of hotels).

        Returns None when there is no further page: the "next" control is
        disabled, no "Next" link exists, or the pagination block is
        missing. The driver loop stops on None, so this function never
        exits the interpreter itself.
    """

    soup = BeautifulSoup(html, 'lxml')
    # Extract hotel name, star rating and number of reviews
    # NOTE(review): this class string has a double space while
    # parse_hotel_list() uses a single one; whether either matches depends
    # on the page markup and the BeautifulSoup version -- confirm.
    hotel_boxes = soup.find_all('div', {'class': 'listing easyClear  p13n_imperfect '})
    for hotel_box in hotel_boxes:
        title = hotel_box.find('div', {'class': 'listing_title'})
        name = title.find(text=True) if title is not None else 'N/A'

        rating = hotel_box.find('div', {'class': 'listing_rating'})
        review_span = None
        if rating is not None:
            review_span = rating.find('span', {'class': 'more review_count'})

        if review_span is None:
            # Without a review count the hotel is treated as unrated.
            log.error("No ratings for this hotel")
            reviews = "N/A"
            stars = 'N/A'
        else:
            reviews = review_span.find(text=True)
            # The star value is the first word of the image's alt text; a
            # missing image or alt text counts as "no rating" too.
            star_img = hotel_box.find("img", {"class": "sprite-ratings"})
            alt_words = (star_img.get('alt') or '').split() if star_img is not None else []
            stars = alt_words[0] if alt_words else 'N/A'

        log.info("HOTEL NAME: %s", name)
        log.info("HOTEL REVIEWS: %s", reviews)
        log.info("HOTEL STAR RATING: %s \n", stars)

    # Get next URL page if exists, else report that there is none
    div = soup.find("div", {"class": "unified pagination standard_pagination"})
    if div is None:
        log.error("No pagination block found; treating this as the last page")
        return None
    # check if last page
    if div.find('span', {'class': 'nav next ui_button disabled'}):
        log.info("We reached last page")
        return None
    # If it is not the last page there must be the Next URL
    for href in div.find_all('a', href=True):
        if href.find(text=True) == 'Next':
            log.info("Next url is %s", href['href'])
            return href['href']
    return None

In [6]:
# define a function to parse hotel list:
def parse_hotel_list(Boston_hotel_web_list):
    """ Build a {hotel name: hotel page URL} dict from hotel-list pages.

        Boston_hotel_web_list -- iterable of absolute URLs of hotel-list
                                 pages; each one is requested in turn

        Every hotel box contributes its title text (with "/" removed) as
        key and base_url + the title link's href as value. A later hotel
        with the same name overwrites an earlier one. Boxes without a
        title or title link are logged and skipped.

        Raises requests.HTTPError on an error status and requests.Timeout
        when the server does not answer in time.
    """
    hotel_dict_name_link = {}
    headers = {'User-Agent': user_agent}
    for url in Boston_hotel_web_list:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        # response.text is already a decoded str
        soup = BeautifulSoup(response.text, 'lxml')
        # Extract hotel name and link of every hotel box on the page
        hotel_boxes = soup.find_all('div', {'class': 'listing easyClear p13n_imperfect '})
        for hotel_box in hotel_boxes:
            title = hotel_box.find('div', {'class': 'listing_title'})
            name = title.find(text=True) if title is not None else None
            link = title.find('a', href=True) if title is not None else None
            if name is None or link is None:
                log.error("Hotel box without title link on %s", url)
                continue
            hotel_dict_name_link[name.replace("/", "")] = base_url + link['href']
    return hotel_dict_name_link

In [7]:
if __name__ == "__main__":
    # Create datadir if it does not exist. exist_ok avoids the
    # check-then-create race; joining with the current directory only has
    # an effect when datadir is a relative path.
    current_dir = os.getcwd()
    os.makedirs(os.path.join(current_dir, args.datadir), exist_ok=True)

    # Obtain the url of the tourism page
    tourism_url = get_tourism_page(args.city, args.state)
    # Get URL to obtain the list of hotels in a specific city
    city_url = get_city_page(tourism_url)

    # Walk the hotel-list pages: remember each page's absolute URL, fetch
    # and parse it, then continue with the "Next" link it reports.
    # parse_hotellist_page() returns None after the last page.
    c = 0
    Boston_hotel_web_list = []
    while city_url is not None:
        new_city_url = base_url + city_url
        Boston_hotel_web_list.append(new_city_url)
        c += 1
        html = get_hotellist_page(city_url, c)
        city_url = parse_hotellist_page(html)

<class 'str'>
<class 'str'>


In [8]:
# Request every collected hotel-list page and build the
# {hotel name: hotel page URL} mapping used by the cells below.
hotel_dict_name_link = parse_hotel_list(Boston_hotel_web_list)

In [9]:
# define the rating obtain function:
def parse_hotel_ratings(hotel_dict_name_link):
    """ Scrape the traveler-rating breakdown of every hotel.

        hotel_dict_name_link -- dict mapping hotel name to hotel page URL

        For each hotel the page is requested and the <li> rows inside the
        <div class="col rating "> box are read: the row label becomes the
        rating and the text of the row's fourth <span> becomes the count.

        Returns a DataFrame with the columns Hotel_name, rating, count
        (one row per hotel and rating label; values are the strings found
        in the page). Hotels without a rating box and rows without a label
        or fourth span are skipped.

        Raises requests.HTTPError on an error status and requests.Timeout
        when the server does not answer in time.
    """
    headers = {'User-Agent': user_agent}
    rows = []
    for each_hotel, url in hotel_dict_name_link.items():
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        # response.text is already a decoded str
        soup = BeautifulSoup(response.text, 'lxml')
        reviews_boxes = soup.find('div', {'class': 'col rating '})
        if reviews_boxes is None:
            log.error("No traveler ratings found for %s", each_hotel)
            continue
        for review_line in reviews_boxes.find_all('li'):
            label = review_line.find('div', {'class': 'row_label'})
            spans = review_line.find_all('span')
            # The count is read from the fourth span of the row.
            if label is None or len(spans) < 4:
                continue
            review_type = label.find(text=True)
            review_count = spans[3].find(text=True)
            rows.append([each_hotel, review_type, review_count])
    clns = ['Hotel_name', 'rating', 'count']
    df_Boston_hotels = pd.DataFrame(rows, columns=clns)
    return df_Boston_hotels
    

In [10]:
# Scrape the traveler ratings of every hotel into a DataFrame with the
# columns Hotel_name, rating and count.
df_Boston_hotels=parse_hotel_ratings (hotel_dict_name_link)

In [1]:
import pandas as pd
# Load the saved traveler ratings and show them as the cell result.
# NOTE(review): the recorded output has an "Unnamed: 0" column, which
# suggests the CSV was written with its row index -- confirm.
df_1 = pd.read_csv('traverler_ratings.csv')
df_1

Unnamed: 0,Hotel_name,rating,count
0,Hotel Boston,Excellent,130
1,Hotel Boston,Very good,182
2,Hotel Boston,Average,104
3,Hotel Boston,Poor,49
4,Hotel Boston,Terrible,32
5,Four Seasons Hotel Boston,Excellent,1133
6,Four Seasons Hotel Boston,Very good,197
7,Four Seasons Hotel Boston,Average,53
8,Four Seasons Hotel Boston,Poor,27
9,Four Seasons Hotel Boston,Terrible,16


-------

Next, scrape all the reviews of each hotel for the star ratings of the following attributes: Value, Location, Sleep Quality, Rooms, Cleanliness, Service. Note that some reviews may not have attribute ratings and some may only have some of the attributes. **(25 pts)**

![Information to be scraped](attribute_ratings.png)

Save the data in "attribute_ratings.csv" in the following format:

hotel_name, review_id, attribute, star_value

In [13]:
from bs4 import BeautifulSoup
import requests
import time
from contextlib import contextmanager
    # define a function to parse the review stars of each specific hotel
def _first_review_url(hotel_url, headers):
    """Return the absolute URL of the first review linked from a hotel page, or None."""
    response = requests.get(hotel_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    # find returns the first matching tag, find_all returns all of them
    review_box = soup.find('div', {'class': 'reviewSelector track_back'})
    if review_box is None:
        return None
    quote = review_box.find('div', {'class': 'quote isNew'})
    if quote is None:
        quote = review_box.find('div', {'class': 'quote'})
    link = quote.find('a', href=True) if quote is not None else None
    if link is None:
        return None
    return base_url + link['href']


def _parse_review_page(soup, hotel_name):
    """Return (hotel_name, review_id, attribute, star_value) tuples for one review page."""
    rows = []
    # NOTE(review): the class string with surrounding spaces is kept from
    # the original; whether it matches depends on the page markup and the
    # BeautifulSoup version -- confirm.
    for review_tag in soup.find_all('div', {'class': ' reviewSelector '}):
        # The id must come from this review's own block; looking it up on
        # the whole page would label every review with the first id.
        entry = review_tag.find('div', {'class': 'entry'})
        id_tag = entry.find('p') if entry is not None else None
        review_id = id_tag.get('id') if id_tag is not None else None
        if review_id is None:
            # presumably the review container carries an id as well -- confirm
            review_id = review_tag.get('id')
        if review_id is None:
            continue
        for star_rating in review_tag.find_all('li', {'class': 'recommend-answer'}):
            img = star_rating.find('img')
            description = star_rating.find('div', {'class': 'recommend-description'})
            if img is None or description is None or not img.get('alt'):
                continue
            rating_value = img['alt'].split(' of 5 bubbles')[0]
            rating_type = description.find(text=True)
            rows.append((hotel_name, review_id, rating_type, rating_value))
    return rows


def _next_review_page_url(soup):
    """Return the absolute URL of the next review page, or None on the last page."""
    pagination = soup.find('div', {'class': 'unified pagination '})
    if pagination is None:
        return None
    if pagination.find('span', {'class': 'nav next disabled'}):
        return None
    links = pagination.find_all('a', href=True)
    for link in links:
        if 'next' in (link.get('class') or []) or link.get_text().strip() == 'Next':
            return base_url + link['href']
    # No link is recognisably "next": fall back to the first link, as the
    # original code did. The caller's visited-URL check stops a backward loop.
    if links:
        return base_url + links[0]['href']
    return None


def parse_hotel_review_star(hotel_dict_name_link, hotel_name):
    """ Collect the attribute star ratings from all reviews of one hotel.

        hotel_dict_name_link -- dict mapping hotel name to hotel page URL
        hotel_name           -- key into hotel_dict_name_link

        Starts at the first review linked from the hotel page and follows
        the pagination from page to page. Each page contributes one list
        of (hotel_name, review_id, attribute, star_value) tuples; reviews
        without attribute ratings contribute nothing.

        Returns the list of those per-page lists (empty when the hotel
        page links no review) and prints the elapsed seconds.

        Raises KeyError for an unknown hotel_name, requests.HTTPError on
        an error status and requests.Timeout when the server stalls.
    """
    starttime = time.time()
    headers = {'User-Agent': user_agent}
    sum_total_page_list = []

    # use the first review url as starting web page
    review_url = _first_review_url(hotel_dict_name_link[hotel_name], headers)

    # Each page is parsed completely before moving on, and every URL is
    # visited at most once so the crawl always terminates.
    seen_urls = set()
    while review_url is not None and review_url not in seen_urls:
        seen_urls.add(review_url)
        response = requests.get(review_url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        sum_total_page_list.append(_parse_review_page(soup, hotel_name))
        review_url = _next_review_page_url(soup)
        if review_url is None:
            print("We reached the last page")

    print(time.time() - starttime)
    return sum_total_page_list

In [14]:
# Hotel names that the driver cell below slices and passes to
# parse_hotel_review_star() as keys into hotel_dict_name_link.
# NOTE(review): presumably a pasted snapshot of hotel_dict_name_link.keys()
# (the names contain no "/", matching parse_hotel_list) -- confirm.
hotel_names_list2 = ['Hilton Boston Downtown  Faneuil Hall',
 'Boston Marriott Copley Place',
 'Embassy Suites by Hilton Boston - at Logan Airport',
 'Hyatt Regency Boston Harbor',
 'Courtyard Boston-South Boston',
 'Kimpton Onyx Hotel',
 'Hyatt Regency Boston',
 'The Westin Boston Waterfront',
 'Holiday Inn Express Boston',
 'Chandler Inn',
 'The Westin Copley Place, Boston',
 'Hotel Boston',
 'Club Quarters Hotel in Boston',
 'Boston Hotel Buckminster',
 'Beacon Hill Hotel and Bistro',
 'Courtyard Boston Downtown',
 'DoubleTree by Hilton Hotel Boston - Downtown',
 'Renaissance Boston Waterfront Hotel',
 'Mandarin Oriental, Boston',
 'Boston Marriott Long Wharf',
 'Taj Boston',
 'Lenox Hotel',
 'Hilton Boston Logan Airport',
 'Four Seasons Hotel Boston',
 'The Envoy Hotel, Autograph Collection',
 'The Ritz-Carlton, Boston',
 'The Langham, Boston',
 'Residence Inn Boston DowntownSeaport',
 'DoubleTree Club by Hilton Hotel Boston Bayside',
 'The Boxer Boston',
 'Boston Park Plaza',
 'Comfort Inn - Boston',
 'Aloft Boston Seaport',
 'Fairmont Copley Plaza, Boston',
 'BEST WESTERN PLUS Roundhouse Suites',
 'Ames Boston Hotel, Curio Collection by Hilton',
 'Residence Inn Boston Harbor on Tudor Wharf',
 'Kimpton Nine Zero Hotel',
 'The Boston Common Hotel and Conference Center',
 'W Boston',
 'InterContinental Boston',
 'Omni Parker House',
 'Charlesmark Hotel',
 'The Inn At St Botolph',
 'Copley Square Hotel',
 'Newbury Guest House',
 'DoubleTree Suites by Hilton Boston-Cambridge',
 'Wyndham Boston Beacon Hill',
 'Boston Harbor Hotel',
 'Eliot Hotel',
 'The Bostonian Boston',
 'Marriott Vacation Club Pulse at Custom House, Boston',
 'Constitution Inn',
 'Hampton Inn & Suites Boston Crosstown Center',
 'Courtyard Boston Copley Square',
 'Milner Hotel',
 'The Inn at Longwood Medical',
 'The Godfrey Hotel Boston',
 'Hotel Commonwealth',
 'The Liberty, A Luxury Collection Hotel',
 'Hotel 140',
 'Seaport Boston Hotel',
 'Harborside Inn',
 'XV Beacon',
 'Sheraton Boston Hotel',
 'Courtyard Boston Logan Airport',
 'Element Boston Seaport',
 'Ramada Boston',
 'Hilton Boston Back Bay',
 'Hilton Garden Inn Boston Logan Airport',
 'The Verb Hotel',
 'Colonnade Hotel',
 'Copley House',
 'Revere Hotel Boston Common',
 'Americas Best Value Inn',
 'Battery Wharf Hotel, Boston Waterfront',
 'Holiday Inn Express Hotel & Suites Boston Garden',
 'enVision Hotel Boston - Longwood',
 'Loews Boston Hotel',
 'Days Hotel Boston-Harvard Fenway',
 'The Midtown Hotel',
 'Residence Inn Boston Back BayFenway']


In [25]:
if __name__ == "__main__":
    # NOTE(review): hotel_names_list and huge_final_ht_rv_star are not read
    # in this cell; they are kept in case another cell uses them -- confirm.
    hotel_names_list = list(hotel_dict_name_link.keys())
    huge_final_ht_rv_star = []
    result_columns = ['hotel_name', 'review_id', 'attribute', 'star_value']
    base_dataFrame = pd.DataFrame(columns=result_columns)
    frames = [base_dataFrame]
    try:
        # Only the hotels in this slice of hotel_names_list2 are scraped
        # (a single entry here).
        for cnt_time, hotel_name in enumerate(hotel_names_list2[33:34]):
            print(cnt_time)
            sum_total_page_list = parse_hotel_review_star(hotel_dict_name_link, hotel_name)
            # Flatten the per-page lists into one list of row tuples.
            sum_total_page_list2 = list(chain.from_iterable(sum_total_page_list))
            frames.append(pd.DataFrame(sum_total_page_list2, columns=result_columns))
    finally:
        # Concatenate once instead of once per hotel. Doing it in finally
        # keeps the hotels finished so far available in base_dataFrame
        # even when the run is interrupted.
        base_dataFrame = pd.concat(frames)
        

0


KeyboardInterrupt: 

In [24]:
# Write the collected attribute ratings to disk; index=False leaves the
# DataFrame's row index out of the file.
base_dataFrame.to_csv('cs591_problem2_0_5.csv',index = False)

# Read the CSV file that was produced by running the scraper on the lab computer

In [19]:
import pandas as pd
# Load the saved attribute ratings and show them as the cell result.
# NOTE(review): the recorded output has an "Unnamed: 0" column, which
# suggests the CSV was written with its row index -- confirm.
df_2 = pd.read_csv('attribute_ratings.csv')
df_2

Unnamed: 0,hotel_name,review_id,attribute,star_value
0,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,433426851,Location,5
1,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,433426851,Rooms,5
2,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,433426851,Service,5
3,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,433022432,Value,5
4,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,433022432,Sleep Quality,5
5,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,433022432,Service,5
6,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,431564898,Sleep Quality,5
7,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,431564898,Cleanliness,5
8,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,431564898,Service,5
9,Marriott_Vacation_Club_Pulse_at_Custom_House_B...,428185627,Location,5


-------