# Scraping code to get hotel rates and ratings for the hotel lists

I downloaded the dataset that accompanies the following publication (BibTeX fields):

title = {Opinion-Based Entity Ranking},
journal = {Information Retrieval},
year = {2011},
keywords = {adhoc multifaceted search, entity oriented search,
entity ranking, entity retrieval, product search},
doi = {10.1007/s10791-011-9174-8},
author = {Kavita Ganesan and ChengXiang Zhai}


In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

In [3]:
# Scrape Google's right-hand-side result panel for every hotel listed in the
# per-city CSV files and store the scraped name, star class and mean nightly
# price in new columns. One 'filled<city>.csv' file is written per input file.
filenames = ['san-francisco.csv','chicago.csv','new-york-city.csv','las-vegas.csv']

# XPath of the right-hand-side panel on the Google results page.
rhs_xpath = '//*[@id="rhs_block"]'

driver = webdriver.Firefox()

try:
    for file in filenames:
        currcity = pd.read_csv(file, index_col=False)
        currcity = currcity[currcity['overall_ratingsource'] > 0]
        # .copy() so the column assignments below operate on an independent
        # frame rather than on a filtered view of the original.
        currcity = currcity[currcity['zip'] != '-1'].copy()
        currcity['zip'] = currcity['zip'].str[0:5]
        currcity['zip'] = currcity['zip'].astype(int)

        # The filters above leave gaps in the index labels, so keep the
        # positional counter (for progress output) separate from the row
        # label (for writing results back with .at).
        for i, label in enumerate(currcity.index):
            hotel_name = currcity['hotel_name'].iloc[i]
            print(i, hotel_name)

            # open google; skip this hotel if the page did not load
            driver.get("http://www.google.com")
            if "Google" not in driver.title:
                print('Error loading page')
                continue

            # search for the hotel name and city
            q = driver.find_element(By.NAME, 'q')
            q.send_keys(hotel_name + ' ' + currcity['city'].iloc[i])
            q.submit()

            # The panel is rendered asynchronously; WebDriverWait raises
            # TimeoutException when it never shows up.
            try:
                panel = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.XPATH, rhs_xpath)))
                panel_lines = panel.text.split('\n')
            except TimeoutException:
                panel_lines = []

            if len(panel_lines) <= 8:
                print("Couldn't fetch data")
                continue

            # NAME: stored so it can be compared against the dataset name
            scraped_name = panel_lines[4]
            print(scraped_name)
            currcity.at[label, 'scrapedname'] = scraped_name

            # STARS/CLASS: line 9 is used when it starts with a digit, as in
            # the "4-star hotel" lines this script prints. It may be absent
            # when the panel has exactly nine lines.
            if len(panel_lines) > 9:
                class_line = panel_lines[9]
                if class_line and class_line[0].isdigit():
                    print(class_line)
                    currcity.at[label, 'class'] = int(class_line[0])

            # MEAN PRICE: average every line of the form "$<digits>"
            prices = [
                float(line[1:])
                for line in panel_lines
                if line.startswith('$') and line[1:].isdigit()
            ]
            if prices:
                mean_price = np.mean(prices)
                print('mean price = ', mean_price)
                currcity.at[label, 'price'] = mean_price
            else:
                print('mean price = NOT AVAILABLE')
                # -1 marks "no price found"
                currcity.at[label, 'price'] = -1

        # save the enriched table for this city
        currcity.to_csv('filled' + file)
finally:
    # always shut the browser down, even if scraping failed part-way
    driver.quit()

0 fairmont heritage place ghirardelli square
Save
5-star hotel
mean price =  764.0
1 hotel drisco
Hotel Drisco
4-star hotel
mean price =  439.0
2 omni san francisco hotel
Omni San Francisco Hotel
4-star hotel
mean price =  325.25
3 the inn at union square
The Inn at Union Square
3-star hotel
mean price =  219.25
4 the orchard hotel
The Orchard Hotel
4-star hotel
mean price =  204.0
5 the donatello hotel
The Donatello
4-star hotel
mean price =  278.0
6 chancellor hotel on union square
Chancellor Hotel on Union Square
3-star hotel
mean price =  224.25
7 white swan inn
White Swan Inn
3-star hotel
mean price =  199.0
8 argonaut hotel a kimpton hotel
Argonaut Hotel
4-star hotel
mean price =  212.0
9 hotel monaco san francisco a kimpton hotel
The Marker San Francisco
4-star hotel
mean price =  174.0
10 mandarin oriental
Loews Regency San Francisco
5-star hotel
mean price =  332.5
11 orchard garden hotel
The Orchard Garden Hotel
4-star hotel
mean price =  188.0
12 marines memorial club hotel


2-star hotel
mean price =  241.25
100 clift hotel san francisco
The Clift Royal Sonesta Hotel
4-star hotel
mean price =  190.0
101 the westin st francis
The Westin St. Francis San Francisco on Union Square
4-star hotel
mean price =  209.0
102 san francisco marriott marquis
San Francisco Marriott Marquis
4-star hotel
mean price =  189.0
103 marina motel
Marina Motel
2-star hotel
mean price =  189.0
104 hotel metropolis
Hotel Metropolis
3-star hotel
mean price =  196.75
105 club quarters san francisco
Club Quarters Hotel in San Francisco
3-star hotel
mean price =  153.75
106 comfort inn by the bay
Comfort Inn By the Bay
3-star hotel
mean price =  198.25
107 w san francisco
W San Francisco
4-star hotel
mean price =  266.0
108 the gaylord
Gaylord Suites
3-star hotel
mean price = NOT AVAILABLE
109 sir francis drake hotel a kimpton hotel
Kimpton Sir Francis Drake Hotel
4-star hotel
mean price =  179.0
110 king george hotel
King George Hotel
3-star hotel
mean price =  185.75
111 holiday inn s

Travelodge by Wyndham by Fisherman's Wharf
2-star hotel
mean price =  161.0
207 aida hotel
Aida Plaza Hotel
1-star hotel
mean price =  111.0
208 carl hotel
Carl Hotel
1-star hotel
mean price = NOT AVAILABLE
209 americas best value inn suites union square
Americas Best Value Inn - Downtown / Midtown Atlanta
2-star hotel
mean price =  81.75
210 mithila
Mithila Hotel
2-star hotel
mean price =  92.0
211 pontiac hotel
Minna Hotel
1-star hotel
mean price =  134.2
212 travelodge central san francisco hotel
Travelodge by Wyndham San Francisco Central
2-star hotel
mean price =  117.2
213 oasis inn
Oasis Inn
2-star hotel
mean price =  157.0
214 knights inn downtown san francisco
Couldnt fetch data
215 aldrich hotel
Aldrich Hotel
2-star hotel
mean price =  87.6
216 heritage marina hotel
Heritage Marina Hotel
mean price = NOT AVAILABLE
217 verona
Albergo Hotel Verona
2-star hotel
mean price = NOT AVAILABLE
218 beach motel
Beach Motel
2-star hotel
mean price =  139.0
219 civic center inn
Civic Cent