## 1A. Web Scraping Script for Booking.com

Last Updated: 1 Sep 2025 <br>
Description: This script scrapes reviews from Booking.com hotel pages using Selenium, handles pagination to scrape multiple pages of reviews, and saves the data as a JSON file.

In [None]:
# pip install selenium webdriver-manager pandas

In [None]:
# pip install webdriver-manager

#### Import Libraries

In [None]:
# Import Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import pandas as pd
import random
from datetime import datetime

In [None]:
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType

# webdriver-manager downloads the appropriate ChromeDriver and returns the
# path of the executable; that path is wrapped in a Service and used to
# launch the Chrome session shared by the rest of the notebook.
chromedriver_path = ChromeDriverManager().install()
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

#### File Path Config

In [None]:
# URL of the hotel page to scrape; the "#tab-reviews" fragment presumably opens
# the reviews tab directly.
# NOTE(review): the query string carries what look like session/tracking values
# (aid, label, sid, srepoch, srpvid, chal_t) captured from one browsing session -
# confirm whether they are required or can be dropped.
url = "https://www.booking.com/hotel/sg/marina-bay-sands.en-gb.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaMkBiAEBmAEJuAEHyAEM2AEB6AEB-AEMiAIBqAIDuAK4nde8BsACAdICJDgyMWZlYTcwLTNhOTYtNGQxMS04Mjg1LTM5ZmQ0NjBjYjYxONgCBuACAQ&sid=60119e564f0b522bb3477f98d054d045&dest_id=245881&dest_type=hotel&dist=0&group_adults=2&group_children=0&hapos=1&hpos=1&no_rooms=1&req_adults=2&req_children=0&room1=A%2CA&sb_price_type=total&sr_order=popularity&srepoch=1737871150&srpvid=dd982a12938101ee&type=total&ucfs=1&chal_t=1755947307950&force_referer=#tab-reviews"  
# Load the page in the Selenium-controlled browser
driver.get(url)

In [None]:
# Shared explicit-wait helper with a 15 s timeout. This line does not wait by
# itself; the scraping functions call wait.until(...) to block until a
# condition holds or the timeout expires.
wait = WebDriverWait(driver, 15)

In [None]:
# Accumulator for all scraped reviews: extract_reviews() appends one nested
# dict per review card, and the export cell dumps this list to JSON.
all_reviews = []

#### Function Definition

In [None]:
# Helper: read an optional text field from a review card
def _optional_review_text(card, selector):
    """Return the text of the element matching ``selector`` inside ``card``.

    Returns an empty string when the lookup fails, because not every review
    has both a positive and a negative comment.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        # Best effort, as before - but ``except Exception`` (unlike a bare
        # ``except``) does not swallow KeyboardInterrupt / SystemExit, so a
        # long scrape can still be interrupted.
        return ""


# Function to extract reviews from a single page
def extract_reviews():
    """Scrape every review card on the currently loaded page.

    Waits (via the module-level ``wait``) for elements matching
    ``div[data-testid='review']``, reads the reviewer, stay and review fields
    of each card, and appends one nested dict per card to the module-level
    ``all_reviews`` list.

    ``serial_no`` continues from the number of reviews already collected, so
    it stays unique across pages instead of restarting at 1 on every call.

    Errors are printed rather than raised: a card with a missing mandatory
    field is skipped, and a failure to locate any card aborts only this page.

    Returns:
        None. Results are accumulated in ``all_reviews``.
    """
    # Separator that follows the night count in the scraped text. Written as
    # an escape so the source file's encoding cannot turn it into mojibake
    # again (the previous literal was the mis-decoded form of this character).
    middle_dot = "\u00b7"

    try:
        # FIXED: Changed from 'review-card' to 'review'
        review_cards = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[data-testid='review']"))
        )
    except Exception as e:
        print(f"Error locating review cards: {e}")
        return

    for card in review_cards:
        extraction_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        try:
            # Mandatory fields: if any of these is missing the whole review is
            # skipped by the handler below.
            name = card.find_element(By.CSS_SELECTOR, "div.b08850ce41.f546354b44").text
            country = card.find_element(By.CSS_SELECTOR, "span.d838fb5f41.aea5eccb71").text
            room_type = card.find_element(By.CSS_SELECTOR, "span[data-testid='review-room-name']").text
            nights_text = card.find_element(By.CSS_SELECTOR, "span[data-testid='review-num-nights']").text
            # Keep only the part before the separator dot
            num_nights = nights_text.split(middle_dot)[0].strip()
            stay_period = card.find_element(By.CSS_SELECTOR, "span[data-testid='review-stay-date']").text
            traveler_type = card.find_element(By.CSS_SELECTOR, "span[data-testid='review-traveler-type']").text
            review_date = card.find_element(By.CSS_SELECTOR, "span[data-testid='review-date']").text.replace("Reviewed: ", "")
            review_title = card.find_element(By.CSS_SELECTOR, "h4[data-testid='review-title']").text
            review_score = card.find_element(By.CSS_SELECTOR, "div[data-testid='review-score'] div.f63b14ab7a.dff2e52086").text

            # Optional fields: default to "" when the reviewer left them out
            positive_review = _optional_review_text(
                card, "div[data-testid='review-positive-text'] div.b99b6ef58f.d14152e7c3"
            )
            negative_review = _optional_review_text(
                card, "div[data-testid='review-negative-text'] div.b99b6ef58f.d14152e7c3"
            )

            # Append extracted review
            all_reviews.append({
                # Running number over the whole scrape, not per page
                "serial_no": len(all_reviews) + 1,
                "reviewer": {"name": name, "country": country},
                "stay_details": {
                    "room_type": room_type,
                    "number_of_nights": num_nights,
                    "period_of_stay": stay_period,
                    "traveler_type": traveler_type
                },
                "review": {
                    "date": review_date,
                    "title": review_title,
                    "score": review_score,
                    "positive_text": positive_review,
                    "negative_text": negative_review
                },
                "extraction_timestamp": extraction_timestamp
            })

        except Exception as e:
            print(f"Error extracting a review: {e}")

In [None]:
# Function to click the "Next Page" button - by page number
def go_to_next_page(page_num):
    """Navigate the review list to page ``page_num`` via its pagination button.

    Waits until the pagination button for ``page_num`` is clickable, scrolls
    it into view, clicks it, pauses, and then waits until review cards are
    present again.

    Args:
        page_num: Page number shown on the pagination button to click.

    Returns:
        True if the button was clicked and review cards were found afterwards;
        False if any step failed (the error is printed, not raised).
    """
    try:
        # NOTE(review): the leading space inside the aria-label value is kept
        # from the original selector - confirm it matches the live markup.
        next_page_btn = wait.until(
            EC.element_to_be_clickable((By.XPATH, f"//button[@aria-label=' {page_num}']"))
        )
        driver.execute_script("arguments[0].scrollIntoView();", next_page_btn)  # Scroll into view

        next_page_btn.click()

        time.sleep(4)  # Wait for page to load fully

        # Wait for the same elements extract_reviews() reads. The selector
        # must stay in sync with it ('review', not the older 'review-card'),
        # otherwise this wait times out and the page is reported as failed.
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[data-testid='review']")))

    except Exception as e:
        print(f"Could not go to page {page_num}: {e}")
        return False
    return True

In [None]:
# # Function to click the "Next Page" button - by button
# def go_to_next_page():
#     """
#     Clicks the 'Next page' button instead of a specific page number.
#     Returns True if successful and a new page loads, False if on the last page or if it fails.
#     """
#     try:
#         # Wait for the 'Next page' button to be clickable.
#         # Use the exact aria-label and a combination of classes for a unique selector.
#         next_page_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Next page']")))
        
#         # Scroll it into view to ensure it's interactable
#         driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_page_btn)
#         time.sleep(3) # Small pause after scrolling

#         # Click the button using JavaScript as a more reliable alternative
#         driver.execute_script("arguments[0].click();", next_page_btn)
        
#         print("Clicked 'Next page' button.")
        
#         # Wait for the new content to load. A good strategy is to wait for the current page's content to become stale.
#         # First, get a reference to a main container that will be reloaded
#         old_content = driver.find_element(By.TAG_NAME, 'html')
        
#         # Wait for that old element to become stale, indicating a page refresh/navigation
#         WebDriverWait(driver, 15).until(EC.staleness_of(old_content))
        
#         # Now, wait for the new review cards to appear on the new page
#         wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[data-testid='review-card']")))
#         print("New page loaded successfully.")
#         return True

#     except Exception as e:
#         # If the button isn't found, we might be on the last page.
#         # If other errors occur, print them.
#         print(f"Could not find or click 'Next page' button. Likely on the final page. Error: {e}")
#         return False

In [None]:
# Function to check if the page has loaded new content
def check_page_loaded():
    """Report whether more review cards appeared during a 3-second pause.

    Counts the review cards currently in the DOM, sleeps for 3 seconds, counts
    again, and compares the two counts. The selector matches the one used by
    extract_reviews() ('review' rather than the older 'review-card') so that
    both functions look at the same elements.

    Returns:
        True if the second count is strictly greater than the first,
        otherwise False.
    """
    review_selector = "div[data-testid='review']"
    initial_reviews_count = len(driver.find_elements(By.CSS_SELECTOR, review_selector))
    time.sleep(3)  # Wait for page to load
    new_reviews_count = len(driver.find_elements(By.CSS_SELECTOR, review_selector))
    return new_reviews_count > initial_reviews_count

#### Web Scraping

In [None]:
# Scrape reviews from multiple pages
num_pages = 1894  # Set the number of pages to scrape

# The whole run is wrapped in try/finally so the browser is closed even when
# the scrape is interrupted part-way (for example with Ctrl-C). Reviews
# collected so far remain in all_reviews.
try:
    # Scrape the first page explicitly (it is already loaded by driver.get)
    print("Scraping page 1...")
    start_time = time.time()  # Record start time
    time.sleep(20)  # Add a sleep here to let the initial page load fully
    extract_reviews()  # Extract reviews from the first page
    end_time = time.time()  # Record end time
    time_taken = end_time - start_time  # Calculate time taken
    print(f"Time taken for page 1: {time_taken:.2f} seconds")  # Display time taken

    # Continue scraping other pages
    for page in range(2, num_pages + 1):
        print(f"Scraping page {page}...")
        start_time = time.time()  # Record start time

        # Try to go to the next page and extract reviews; a failed navigation
        # is reported and the loop moves on to the next page number.
        if go_to_next_page(page):
        # if go_to_next_page():
            time.sleep(5)  # Ensure page is loaded
            extract_reviews()  # Extract reviews after page loads
            print(f"Successfully scraped page {page}.")
        else:
            print(f"Failed to scrape page {page}.")

        # Measure the time taken for the current page
        end_time = time.time()  # Record end time
        time_taken = end_time - start_time  # Calculate time taken
        print(f"Time taken for page {page}: {time_taken:.2f} seconds")  # Display time taken
finally:
    # Close the browser
    driver.quit()

#### Export and Store

In [None]:
# Save data to JSON file
import os

# Create the output directory first so the export cannot fail with
# FileNotFoundError after a long scrape just because the folder is missing.
os.makedirs("../Data", exist_ok=True)

# ensure_ascii=False keeps non-ASCII review text readable in the file
with open("../Data/reviews_all_booking_com.json", "w", encoding="utf-8") as f:
    json.dump(all_reviews, f, ensure_ascii=False, indent=4)

print(f"Scraped {len(all_reviews)} reviews and saved to 'Data/reviews_all_booking_com.json'.")

In [None]:
# Read back the nested review records written by the export step
with open("../Data/reviews_all_booking_com.json", "r", encoding="utf-8") as src:
    data = json.load(src)

# CSV column name -> key path inside one nested review record.
# The order here is both the lookup order and the column order of the CSV.
_CSV_COLUMNS = (
    ("serial_no", ("serial_no",)),
    ("reviewer_name", ("reviewer", "name")),
    ("reviewer_country", ("reviewer", "country")),
    ("room_type", ("stay_details", "room_type")),
    ("number_of_nights", ("stay_details", "number_of_nights")),
    ("period_of_stay", ("stay_details", "period_of_stay")),
    ("traveler_type", ("stay_details", "traveler_type")),
    ("review_date", ("review", "date")),
    ("review_title", ("review", "title")),
    ("review_score", ("review", "score")),
    ("positive_review", ("review", "positive_text")),
    ("negative_review", ("review", "negative_text")),
    ("extraction_timestamp", ("extraction_timestamp",)),
)


def _flatten_review(record):
    """Return one flat row (column name -> value) for a nested review record."""
    row = {}
    for column, key_path in _CSV_COLUMNS:
        value = record
        for key in key_path:
            value = value[key]
        row[column] = value
    return row


# Flatten every record, then build the table
flattened_data = [_flatten_review(record) for record in data]
df = pd.DataFrame(flattened_data)

# "utf-8-sig" is kept so foreign-language text is written as UTF-8 with a BOM
df.to_csv("../Data/reviews_all_booking_com.csv", index=False, encoding="utf-8-sig")