In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
import os
import json
import time
import requests
import numpy as np
import pandas as pd
import multiprocessing as mp


def initialize_chromedriver(remote_webdriver:str="localhost")->webdriver.Chrome:
    prefs = {"profile.default_content_setting_values.geolocation": 2}
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--deny-permission-prompts")
    options.add_experimental_option("prefs", prefs)
    try:
        driver = webdriver.Remote(f'http://{remote_webdriver}:4444/wd/hub', options=options)
    except:
        raise Exception("Failed to install ChromeDriver")
    return driver

In [2]:
driver = initialize_chromedriver()

In [3]:
driver.get("https://www.booking.com/searchresults.html?ss=Ho+Chi+Minh+City%2C+Ho+Chi+Minh+Municipality%2C+Vietnam&ssne=Hue&ssne_untouched=Hue&label=gen173nr-1FCAEoggI46AdIM1gEaPQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuAKc2IunBsACAdICJDYyMmU4MGQ1LWUzYWQtNGRhZC1iNmEzLWJhOTI5OWVjZTQ1YtgCBeACAQ&aid=304142&lang=en-us&sb=1&src_elem=sb&src=searchresults&dest_id=-3730078&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=a98d21b625600091&ac_meta=GhBhOThkMjFiNjI1NjAwMDkxIAAoATICZW46C0hvIENoaSBNaW5oQABKAFAA&checkin=2024-11-21&checkout=2024-11-22&group_adults=2&no_rooms=1&group_children=0")

In [4]:
# extract hotel links
hotel_links = []
for i in range(1, 5):
	driver.execute_script(f"window.scrollTo(0, {i*1000})")
	time.sleep(1)
	soup = BeautifulSoup(driver.page_source, 'html.parser')
	for a in soup.find_all('a', href=True):
		if re.search(r'/hotel/', a['href']):
			hotel_links.append(a['href'])
            
hotel_links = list(set(hotel_links))
hotel_links

['https://www.booking.com/hotel/vn/amanaki-saigon-serviced-apartment.html',
 'https://www.booking.com/hotel/vn/garden-view-court.html',
 'https://www.booking.com/hotel/vn/the-sophia-apartment-thao-dien-central.html',
 'https://www.booking.com/hotel/vn/orchids.html',
 'https://www.booking.com/hotel/vn/silverland-benthanh.html',
 'https://www.booking.com/hotel/vn/woody-house-boutique.html',
 'https://www.booking.com/hotel/vn/majestic-saigon.html',
 'https://www.booking.com/hotel/vn/ben-thanh-retreats.html',
 'https://www.booking.com/hotel/vn/lantern.html',
 'https://www.booking.com/hotel/vn/chez-mimosa-boutique-corner.html',
 'https://www.booking.com/hotel/vn/bui-vien-staycation.html',
 'https://www.booking.com/hotel/vn/daystar-saigon.html',
 'https://www.booking.com/hotel/index.html',
 'https://www.booking.com/hotel/vn/bay-luxury-cabana-saigon-47a-nguyen-trai.html',
 'https://www.booking.com/hotel/vn/eliot-quan-1.html',
 'https://www.booking.com/hotel/vn/adventure.html',
 'https://www.b

In [5]:
# test one hotel for further function
target_url = hotel_links[0]
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"}
resp = requests.get(target_url, headers=headers)
soup = BeautifulSoup(resp.text, 'html.parser')

In [6]:
def extract_hotel_properties(target_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    resp = requests.get(target_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'html.parser')

    o = {}
    o['link'] = str(target_url)
    try:
        o["name"] = soup.find("h2", {"class": "pp-header__title"}).text
    except:
        o["name"] = np.nan
    try:
        o["address"] = soup.find("span", {"class": "hp_address_subtitle"}).text.strip("\n")
    except:
        o["address"] = np.nan
    try:
        o["content"] = soup.find("div", {"class": "hp_desc_main_content"}).text.strip()
    except:
        o["content"] = np.nan
    try:
        rs, rc = soup.find('div', {'data-testid': 'review-score-component'})
        o['review_score'] = rs.text.strip()
    except:
        o['review_score'] = np.nan
    try:
        o['review_count'] = rc.text.split("\xa0·\xa0")[-1]
    except:
        o['review_count'] = np.nan

    try:
        fac = soup.find("div", {"data-testid": "property-most-popular-facilities-wrapper"})
        fac_arr = []
        for i in range(len(fac.find_all("span"))):
            all_fac = fac.find_all("span")
            text = all_fac[i].text.strip()
            if text != "" and text not in fac_arr:
                fac_arr.append(text)
        o['most_facility'] = str(fac_arr)
    except:
        o['most_facility'] = np.nan

    try:
        all_facilities = []
        for each in soup.find_all("div", {"class": "e50d7535fa"}):
            t = each.find('div', {'class': 'd1ca9115fe'}).text
            rs = []
            for a in each.find_all("span"):
                x = a.text.replace(t, '').strip()
                if x != '' and x not in rs:
                    rs.append(a.text.replace(t, '').strip())
            all_facilities.append({'key': t, 'value': rs})
        o['all_facilities'] = all_facilities
    except:
        o['all_facilities'] = np.nan

    try:
        tr = soup.find_all("div", {"id": "rooms_table"})
    except:
        tr = np.nan

    dataset = []
    temp = {}
    
    for i in range(len(tr)):
        try:
            temp['room_type'] = tr[i].find("h3", {"class": "room_link"}).text
        except:
            temp['room_type'] = np.nan
        try:
            temp['room_price'] = tr[i].find("div", {"class": "bui-price-display__value"}).text
        except:
            temp['room_price'] = np.nan
        try:
            temp['room_capacity'] = tr[i].find("div", {"class": "room_capacity"}).text
        except:
            temp['room_capacity'] = np.nan
        try:
            temp['room_size'] = tr[i].find("div", {"class": "room_size"}).text
        except:
            temp['room_size'] = np.nan
        try:
            temp['room_facilities'] = tr[i].find("div", {"class": "room_facilities"}).text
        except:
            temp['room_facilities'] = np.nan
            
        dataset.append(temp)
    
    o['rooms'] = dataset

    return o

In [8]:
soup.find("div", {"id": "rooms_table"})

<div class="description urt rooms_table_nodates rooms_table_noavailability" id="rooms_table" style="margin: 0;">
<div style="height:1px; overflow:hidden;"></div>
<div data-et-view="OOGbIFBUbTdNDNQQAC:2 aaOYWbOTESCWIKdFHaO:1 aaOYWbOTESCWIKdFHaO:4 aaOYWbOTESCWIKdFHaO:7 "></div>
<div data-capla-component-boundary="b-property-web-property-page/RoomTableDesktop"><div></div><div></div><div></div><div></div><div></div><div></div><div id="maxotelRoomArea"><section class="roomstable"><div class="c624d7469d f034cf5568 dab7c5c6fa a8a3d245a8 a3214e5942 db150fece4 ecc4cec182" style="--bui_stack_spaced_gap--s:0"><div class="a99751df1f b817090550 e7f103ee9e"><div class="a3332d346a">Room type</div></div><div class="a99751df1f b817090550 b1e9f8a61d"><div class="a3332d346a">Number of guests</div></div><div class="a99751df1f b817090550 c44c37515e"> </div></div><div class="c624d7469d f034cf5568 dab7c5c6fa a8a3d245a8 a3214e5942 db150fece4 cdefac0453 ecc4cec182" style="--bui_stack_spaced_gap--s:0"><div clas

In [11]:
def extract_hotel_properties_with_rooms(target_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
    resp = requests.get(target_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'html.parser')

    o = {}
    o['link'] = str(target_url)
    try:
        o["name"] = soup.find("h2", {"class": "pp-header__title"}).text
    except:
        o["name"] = np.nan
    try:
        o["address"] = soup.find("span", {"class": "hp_address_subtitle"}).text.strip("\n")
    except:
        o["address"] = np.nan
    try:
        o["content"] = soup.find("div", {"class": "hp_desc_main_content"}).text.strip()
    except:
        o["content"] = np.nan
    try:
        rs, rc = soup.find('div', {'data-testid': 'review-score-component'})
        o['review_score'] = rs.text.strip()
    except:
        o['review_score'] = np.nan
    try:
        o['review_count'] = rc.text.split("\xa0·\xa0")[-1]
    except:
        o['review_count'] = np.nan

    try:
        fac = soup.find("div", {"data-testid": "property-most-popular-facilities-wrapper"})
        fac_arr = []
        for i in range(len(fac.find_all("span"))):
            all_fac = fac.find_all("span")
            text = all_fac[i].text.strip()
            if text != "" and text not in fac_arr:
                fac_arr.append(text)
        o['most_facility'] = str(fac_arr)
    except:
        o['most_facility'] = np.nan

    try:
        all_facilities = []
        for each in soup.find_all("div", {"class": "e50d7535fa"}):
            t = each.find('div', {'class': 'd1ca9115fe'}).text
            rs = []
            for a in each.find_all("span"):
                x = a.text.replace(t, '').strip()
                if x != '' and x not in rs:
                    rs.append(a.text.replace(t, '').strip())
            all_facilities.append({'key': t, 'value': rs})
        o['all_facilities'] = all_facilities
    except:
        o['all_facilities'] = np.nan

    try:
        rooms_table = soup.find_all("section", {"id": "rooms_table"})
    except:
        rooms_table = np.nan
        
    dataset = []
    
    for i in range(len(rooms_table)):
        temp = {}
        
        try:
            temp['room_type'] = rooms_table[i].find("h3", {"class": "room_link"}).text
        except:
            temp['room_type'] = np.nan
            
        try:
            temp['room_price'] = rooms_table[i].find("div", {"class": "bui-price-display__value"}).text
        except:
            temp['room_price'] = np.nan
            
        try:
            temp['room_capacity'] = rooms_table[i].find("div", {"class": "room_capacity"}).text
        except:
            temp['room_capacity'] = np.nan
            
        try:
            temp['room_size'] = rooms_table[i].find("div", {"class": "room_size"}).text
        except:
            temp['room_size'] = np.nan
        
        dataset.append(temp)
        
    o['rooms'] = dataset
    
    return o


In [12]:
extract_hotel_properties_with_rooms(target_url)

{'link': 'https://www.booking.com/hotel/vn/amanaki-saigon-serviced-apartment.html',
 'name': 'Amanaki Thao Dien',
 'address': '10 Nguyen Dang Giai, District 2, District 2, Ho Chi Minh City, Vietnam',
 'content': 'You might be eligible for a Genius discount at Amanaki Thao Dien. To check if a Genius discount is available for your selected dates sign in.\n\n\nGenius discounts at this property are subject to book dates, stay dates and other available deals.\n\nSet in Ho Chi Minh City, 5.1 km from Vietnam History Museum, Amanaki Thao Dien features rooms with city views and free WiFi. Offering a restaurant, the property also has free bikes, as well as an indoor pool and a sauna. The property is non-smoking and is situated 6.2 km from Diamond Plaza.\n\nThe hotel will provide guests with air-conditioned rooms offering a desk, a kettle, a fridge, a microwave, a safety deposit box, a flat-screen TV and a private bathroom with a bidet. At Amanaki Thao Dien the rooms are equipped with bed linen a