# Search hotel prices using web scraping with Python

## Import libraries

In [1]:
import re
import time

import numpy as np
import pandas as pd
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm

## Set variables for searching 

In [2]:
# Search parameters: check-in date, check-out date (YYYY-MM-DD) and number of adults
start  = "2021-12-28"
end    = "2021-12-29"
people = "2"

base_url = "https://www.expedia.com.hk"

# Query string of the hotel search page. Named placeholders are used because
# the dates appear twice (d1/startDate and d2/endDate); filling them by name
# avoids having to keep a positional argument list in the right order.
search_path = (
    "/Hotel-Search?adults={people}&d1={start}&d2={end}"
    "&destination=Tsim%20Sha%20Tsui%2C%20Kowloon%2C%20Hong%20Kong%20SAR"
    "&endDate={end}&latLong=22.297382439654076%2C114.17199379848498"
    "&regionId=12135&rooms=1&semdtl=&sort=RECOMMENDED&startDate={start}"
    "&theme=&useRewards=false&userIntent="
)

url = base_url + search_path.format(people=people, start=start, end=end)

url

'https://www.expedia.com.hk/Hotel-Search?adults=2&d1=2021-12-28&d2=2021-12-29&destination=Tsim%20Sha%20Tsui%2C%20Kowloon%2C%20Hong%20Kong%20SAR&endDate=2021-12-29&latLong=22.297382439654076%2C114.17199379848498&regionId=12135&rooms=1&semdtl=&sort=RECOMMENDED&startDate=2021-12-28&theme=&useRewards=false&userIntent='

## Using selenium to get all html code in the webpage

In [3]:
# Set options --> Don't open a visible browser window
options = Options()
options.add_argument("headless")
options.add_argument("--lang=en_HK")  # Set the webpage language as English


# Start the Chrome driver used to download the search results
browser = uc.Chrome(options=options)

try:
    # Open the search results page
    browser.get(url)

    # Maximize the window
    browser.maximize_window()

    # Remember the current page height so growth can be detected
    last_height = browser.execute_script("return document.body.scrollHeight")

    # Keep scrolling to the bottom until the page stops growing, i.e. until
    # no further listings are appended after a scroll
    while True:

        # Scroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the page to load more content
        time.sleep(0.5)

        # Compare the new scroll height with the previous one
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Give the last batch of listings time to finish rendering
    time.sleep(1)

    # Get the page source of the website
    html = browser.page_source

finally:
    # Always end the driver session, even if loading or scrolling failed.
    # quit() shuts the whole session down, whereas close() only closes the
    # current window.
    browser.quit()

## Use BeautifulSoup to read the html code


In [4]:
# Parse the downloaded search page into a navigable tree
soup = BeautifulSoup(markup=html, features="html.parser")

## Extract the hotel names

In [5]:
# Find every search-result entry; each one is an <li> tagged
# data-stid="property-listing" whose <h3> heading holds the hotel name
hotel_name_tags = soup.find_all("li", {"data-stid": "property-listing"})


# Create list storing the hotel names
hotel_name_list = []

# Loop over the entries to extract the hotel names
for hotel_name_tag in hotel_name_tags:
    heading = hotel_name_tag.find("h3")

    # Skip entries without a heading instead of failing on `None.text`
    if heading is None:
        continue

    # The text is kept exactly as it appears in the page
    hotel_name_list.append(heading.text)


# Convert it into a DataFrame and set the name as index
hotel_df = pd.DataFrame({"Name": hotel_name_list}).set_index("Name")

# Remove duplicated index entries, keeping the first occurrence of each name
hotel_df = hotel_df[~hotel_df.index.duplicated(keep='first')]

# Show the DataFrame
hotel_df

"The Langham, Hong Kong"
Sheraton Hong Kong Hotel & Towers
"Kowloon Shangri-La, Hong Kong"
Hotel ICON
New World Millennium Hong Kong Hotel
"Hyatt Regency Hong Kong, Tsim Sha Tsui"
Rosewood Hong Kong
The Peninsula Hong Kong
The Mira Hong Kong Hotel
Regal Kowloon Hotel
The Royal Garden


## Find all hotel ratings and sort by rating

In [6]:
# Minimum guest rating a hotel needs to stay in the shortlist
MIN_RATE = 4.5

# Collect one rating per hotel, in the same order as hotel_df.index
rates = []

for hotel_name in hotel_df.index:

    # Locate the text node holding the hotel name in the search page
    name_node = soup.find(string=hotel_name)

    # Look for the rating inside the element that encloses the name heading;
    # a name that cannot be found again simply has no rating
    rate_tag = None
    if name_node is not None:
        parent = name_node.parent.parent
        rate_tag = parent.find("span", {"data-stid": "content-hotel-reviews-rating"})

    if rate_tag is None:
        # No rating shown for this hotel
        rate = np.nan
    else:
        # The rating text is split on "/" and the part before it is the score
        try:
            rate = float(rate_tag.text.split("/")[0])
        except ValueError:
            # Unexpected rating text: treat as missing instead of aborting
            rate = np.nan

    rates.append(rate)


# Add the ratings next to the hotel names
hotel_df = hotel_df.assign(Rate=rates)

# Remove the hotels without rating
hotel_df = hotel_df.dropna(axis=0)

# Sort the hotels by rating, best first
hotel_df = hotel_df.sort_values("Rate", ascending=False)

# Only keep hotels rated MIN_RATE or above
hotel_df = hotel_df[hotel_df["Rate"] >= MIN_RATE]

# Show the DataFrame
hotel_df

Unnamed: 0_level_0,Rate
Name,Unnamed: 1_level_1
The Peninsula Hong Kong,4.7
K11 Artus,4.7
"The Langham, Hong Kong",4.6
"Kowloon Shangri-La, Hong Kong",4.6
Hotel ICON,4.6
Page148,4.6
Rosewood Hong Kong,4.6
The Otto Hotel,4.5


## Find all the hotel room type, price and area

In [7]:
# Pause (seconds) after scrolling to the room offers before the page HTML is read
ROOM_LOAD_WAIT_SECONDS = 1.5


def _fetch_hotel_page(browser, page_url):
    """Open a hotel information page, scroll to its offers and return the HTML.

    Parameters
    ----------
    browser : the running Chrome driver
    page_url : str, absolute URL of the hotel information page

    Returns
    -------
    str : page source captured after the offers section was scrolled into view
    """
    browser.get(page_url)

    # find_element(By.ID, ...) replaces find_element_by_id, which no longer
    # exists in current Selenium releases
    offer_element = browser.find_element(By.ID, "Offers")

    # Move to the offers section and scroll it into view
    ActionChains(browser).move_to_element(offer_element).perform()
    browser.execute_script("arguments[0].scrollIntoView();", offer_element)

    # Stop for loading
    time.sleep(ROOM_LOAD_WAIT_SECONDS)

    return browser.page_source


def _parse_room_offers(info_html, hotel_name):
    """Extract room type / price records from a hotel information page.

    Parameters
    ----------
    info_html : str, HTML of the hotel information page
    hotel_name : str, stored in every returned record

    Returns
    -------
    list of dict with keys "Hotel", "Room type" and "Price"; empty when the
    page has no room list or no entry carries both a name and a price
    """
    info_soup = BeautifulSoup(info_html, "html.parser")

    room_section = info_soup.find("div", {"data-stid": "section-room-list"})
    if room_section is None:
        return []

    # The room entries are the children of the section's first child element
    room_list = room_section.find(True, recursive=False)
    if room_list is None:
        return []

    records = []

    # Only element children are visited; bare text nodes cannot hold a room
    for room_info in room_list.find_all(True, recursive=False):

        room_name = room_info.find("h3", {"class": True})
        room_price = room_info.find("span", {"data-stid": "price-lockup-text"})

        # Skip entries that lack either the room name or the price
        if room_name is None or room_price is None:
            continue

        records.append({
            "Hotel": hotel_name,
            # Keep only what follows "Photos of " in the heading text
            "Room type": room_name.text.split("Photos of ")[-1],
            "Price": room_price.text,
        })

    return records


# One record per (hotel, room type); turned into a DataFrame once at the end
# instead of growing a DataFrame row by row
room_records = []

# NOTE(review): a new options object is built here rather than reusing
# `options`; undetected_chromedriver alters the object passed to uc.Chrome
# and recent releases appear to reject one that already started a browser.
detail_options = Options()
detail_options.add_argument("headless")
detail_options.add_argument("--lang=en_HK")

# A single browser serves every hotel page instead of launching Chrome per hotel
browser = uc.Chrome(options=detail_options)

try:
    for hotel_name in tqdm(hotel_df.index):

        # Find the link to the hotel information page next to the hotel name
        name_node = soup.find(string=hotel_name)
        link = None
        if name_node is not None:
            link = name_node.parent.parent.find("a", {"data-stid": "open-hotel-information"})
        href = link.get("href") if link is not None else None

        if not href:
            print(f"Skipped {hotel_name}: no link to its information page")
            continue

        # Download the information page and collect its room offers
        info_html = _fetch_hotel_page(browser, base_url + href)
        room_records.extend(_parse_room_offers(info_html, hotel_name))

        # Print the line when finish one hotel info
        print(f"Finished searching {hotel_name}")

finally:
    # Always end the driver session, even when a page fails to load
    browser.quit()


# Hotel name and room type form the index; the price is the only column
All_hotel_info = pd.DataFrame(
    room_records, columns=["Hotel", "Room type", "Price"]
).set_index(["Hotel", "Room type"])


  0%|          | 0/8 [00:00<?, ?it/s]

Finished searching The Peninsula Hong Kong
Finished searching K11 Artus
Finished searching The Langham, Hong Kong
Finished searching Kowloon Shangri-La, Hong Kong
Finished searching Hotel ICON
Finished searching Page148
Finished searching Rosewood Hong Kong
Finished searching The Otto Hotel


## Show all results

In [8]:
# Raise the display limit to the table's own length so every row is rendered
pd.options.display.max_rows = len(All_hotel_info)
All_hotel_info

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Hotel,Room type,Unnamed: 2_level_1
The Peninsula Hong Kong,Deluxe Room - King,"HK$3,880"
The Peninsula Hong Kong,Deluxe Room - Twin,"HK$4,080"
The Peninsula Hong Kong,Deluxe Room - 1 King,"HK$4,620"
The Peninsula Hong Kong,Deluxe Room - 2 Twins,"HK$4,620"
The Peninsula Hong Kong,Deluxe Courtyard Room - King,"HK$4,280"
The Peninsula Hong Kong,Deluxe Courtyard Room - Twin,"HK$4,280"
The Peninsula Hong Kong,Deluxe Courtyard Room - 1 King,"HK$4,847"
The Peninsula Hong Kong,"Kowloon View, King Room, 1 King Bed","HK$4,580"
The Peninsula Hong Kong,Kowloon View 2 Twin Bed,"HK$4,580"
The Peninsula Hong Kong,"Deluxe Room, 1 King Bed, Harbour View","HK$6,380"


## Sort the room price for each hotel

In [9]:
# Define function turning price text into numbers, used as a sort key
def sort_price(series):
    """Convert a Series of price strings into floats.

    Thousands separators (",") are removed and the first number found in each
    value, including an optional decimal part, is returned; for example
    "HK$3,880" becomes 3880.0. Values containing no digits become NaN.

    Parameters
    ----------
    series : pandas.Series of price text

    Returns
    -------
    pandas.Series of float, same index as ``series``
    """
    def _to_number(price_text):
        # str() lets non-string values (e.g. NaN) fall through to the NaN result
        match = re.search(r"[0-9]+(?:\.[0-9]+)?", str(price_text).replace(",", ""))
        return float(match.group()) if match else float("nan")

    return series.apply(_to_number)


# For each hotel: collapse room types listed more than once to their first
# price, then order the rooms from the most to the least expensive.
# first() is used because the prices are text; sum() would join the strings
# of duplicated room types together.
All_hotel_info_sorted = All_hotel_info.groupby(level=0).apply(
    lambda hotel_rooms: hotel_rooms.groupby(level=1)
    .first()
    .sort_values(by="Price", key=sort_price, ascending=False)
)

# Show the result
All_hotel_info_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Hotel,Room type,Unnamed: 2_level_1
Hotel ICON,"Club Suite, 1 King Bed","HK$4,817"
Hotel ICON,CLUB 36 - King Room Harbour View,"HK$4,533"
Hotel ICON,"Club Studio Suite, 1 King Bed","HK$4,108"
Hotel ICON,CLUB 36 - King Room City View,"HK$2,125"
Hotel ICON,CLUB 36 - Twin Room City View,"HK$2,125"
Hotel ICON,ICON 36 - King Room Harbour View,"HK$1,658"
Hotel ICON,ICON 36 - Twin Room Harbour View,"HK$1,658"
Hotel ICON,ICON 36 - Twin Room City View,"HK$1,275"
K11 Artus,HARBOUR Three,"HK$35,744"
K11 Artus,HARBOUR One Plus,"HK$7,000"
