## 1B. Web Scraping Script from Trip.com

Last Updated: 7 Sep 2025 <br>
Description: This script scrapes reviews from Trip.com hotel pages using Selenium, handles pagination to scrape multiple pages of reviews, removes duplicate reviews, and saves the data as JSON and CSV files.

In [None]:
# pip install selenium webdriver-manager pandas

In [None]:
# pip install webdriver-manager

#### Import Libraries

In [2]:
# Import Libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import pandas as pd
import random
from datetime import datetime

In [3]:
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType

# ChromeDriverManager().install() downloads a ChromeDriver build that matches the
# locally installed Chrome (if needed) and returns the path to the driver binary.
service = Service(ChromeDriverManager().install())
# Start a Chrome session through that driver; `driver` is the browser handle
# used by the scraping cells in this notebook.
driver = webdriver.Chrome(service=service)

#### Target URL Config

In [5]:
# URL of the Trip.com hotel detail page whose guest reviews are scraped
url = "https://sg.trip.com/hotels/singapore-hotel-detail-687592/marina-bay-sands-singapore/"
# Navigate the Selenium-controlled browser to that page
driver.get(url)

In [6]:
# Explicit-wait helper with a 15 s timeout. Creating it does not pause execution;
# each later wait.until(condition) call polls until the condition holds or the
# timeout expires.
wait = WebDriverWait(driver, 15)

In [7]:
# Accumulator for all scraped reviews: extract_reviews() appends one dict per
# review card, across every page visited.
all_reviews = []

#### Function Definition

In [13]:
# Open the "Guest Reviews" section of the hotel page
def click_reviews_tab():
    """Scroll to the 'Guest Reviews' heading and click it.

    Uses the module-level ``wait`` and ``driver``. Pauses 5 seconds after
    scrolling and another 5 seconds after the click. Any failure is reported
    on stdout and swallowed; the function always returns ``None``.
    """
    try:
        heading_locator = (By.XPATH, "//h2[contains(text(),'Guest Reviews')]")
        heading = wait.until(EC.element_to_be_clickable(heading_locator))

        # Bring the heading into the viewport before clicking it.
        driver.execute_script("arguments[0].scrollIntoView();", heading)
        time.sleep(5)

        heading.click()
        print("✓ Clicked Guest Reviews tab")
        time.sleep(5)
    except Exception as err:
        print(f"✗ Error clicking Reviews tab: {err}")

In [15]:
# Extract reviews from one page (entry point: extract_reviews; the _helpers are private)
def _text_or_blank(card, css_selector):
    """Return the text of the first element under *card* matching *css_selector*.

    Returns "" if the lookup fails, so one missing field never aborts the card.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, css_selector).text
    except Exception:
        return ""


def _read_stay_details(card):
    """Return ``(room_type, period_of_stay, traveler_type)`` for one review card.

    Each ``<li>`` of the stay-details list is classified by the CSS class of
    its ``<i>`` icon; the ``<span>`` text is stored in the matching field.
    Items that cannot be read are skipped, leaving the field as "".
    """
    room_type = period_of_stay = traveler_type = ""
    for li in card.find_elements(By.CSS_SELECTOR, "ul.wl5HTVzzG2JXWejYiabW > li"):
        try:
            icon_class = li.find_element(By.TAG_NAME, "i").get_attribute("class")
            text = li.find_element(By.TAG_NAME, "span").text
            if "u-icon-bed" in icon_class:
                room_type = text
            elif "u-icon-ic_new_calendar_line" in icon_class:
                period_of_stay = text
            elif "u-icon-beach" in icon_class:
                traveler_type = text
        except Exception:
            continue
    return room_type, period_of_stay, traveler_type


def _read_score(card):
    """Return ``(numerator, denominator, description)`` of the review score.

    The three parts are all-or-nothing: if any one lookup fails, all three
    are returned as "".
    """
    try:
        numerator = card.find_element(By.CSS_SELECTOR, "strong.xt_R_A70sdDRsOgExJWw").text
        denominator = card.find_element(By.CSS_SELECTOR, "span.LOAv7OqDSWlP25tdCm3b").text
        description = card.find_element(By.CSS_SELECTOR, "div.EFcLi6rDxOtvi1MITNME").text
    except Exception:
        return "", "", ""
    return numerator, denominator, description


def _expand_review_text(card):
    """Click the card's 'Show More' control if present, then pause 1 second."""
    try:
        show_more = card.find_element(By.CSS_SELECTOR, "div._4C4vyl1b7FKgXjT5ZCgx")
        driver.execute_script("arguments[0].click();", show_more)
        time.sleep(1)
    except Exception:
        # Best effort: a card without the control (or a failed click) is not an error.
        pass


def extract_reviews(page_number):
    """Scrape every review card currently displayed and append it to ``all_reviews``.

    Waits (via the module-level ``wait``) for the review cards to be present,
    then appends one dict per card to the module-level ``all_reviews`` list.
    Fields that cannot be read are stored as "". ``serial_no`` continues from
    the current length of ``all_reviews``.

    Field-level lookups catch ``Exception`` rather than using a bare
    ``except:``, so ``KeyboardInterrupt`` / ``SystemExit`` are no longer
    swallowed and a long scrape can be interrupted.

    Args:
        page_number: Page index, used only in the progress message.

    Returns:
        None. Page-level failures are printed and swallowed.
    """
    try:
        review_cards = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.yRvZgc0SICPUbmdb2L2a"))
        )
        print(f"Found {len(review_cards)} reviews on page {page_number}")

        first_serial = len(all_reviews) + 1
        for serial_number, card in enumerate(review_cards, start=first_serial):
            extraction_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            name = _text_or_blank(card, "div.yCIHzFRsP6Tzk7Kia6Qo")
            room_type, period_of_stay, traveler_type = _read_stay_details(card)
            score_numerator, score_denominator, score_description = _read_score(card)
            review_date = _text_or_blank(
                card, "div.LPPTO8g2RH0Fk19jYMOQ.nUgIw0PM47FsRYfjswPo"
            )

            # Expand the review first so the text read below is the expanded version.
            _expand_review_text(card)
            review_text = _text_or_blank(card, "div.UXjSnokalMIS5CzMtLSM")

            all_reviews.append({
                "serial_no": serial_number,
                "reviewer_name": name,
                "room_type": room_type,
                "period_of_stay": period_of_stay,
                "traveler_type": traveler_type,
                "review_score_numerator": score_numerator,
                "review_score_denominator": score_denominator,
                "review_score_description": score_description,
                "review_date": review_date,
                "review_text": review_text,
                "extraction_timestamp": extraction_timestamp
            })

    except Exception as e:
        print(f"Error extracting reviews: {e}")

In [17]:
# Advance the review list to the next page
def go_to_next_page():
    """Click the pagination 'next' link and report whether it worked.

    Uses the module-level ``wait`` and ``driver``. Scrolls the link into
    view, pauses 1 second, clicks it, then pauses 5 seconds.

    Returns:
        True if the link was found and clicked; False if waiting for it or
        clicking it raised (the error is printed).
    """
    next_link_css = "li.nF6SWkdU6FLIzjoCbLMF a.pQoxbX5l0DdjPttuVUQx"
    try:
        next_link = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, next_link_css))
        )
        driver.execute_script("arguments[0].scrollIntoView();", next_link)
        time.sleep(1)
        next_link.click()
        time.sleep(5)
    except Exception as err:
        print(f"No more pages or error going to next page: {err}")
        return False
    return True

#### Web Scraping

In [19]:
# Manual prerequisite: in the browser window, log in and deal with the calendar
# pop-up and the enable-notification prompt before running this cell.
#
# Scrape page after page until go_to_next_page() reports that it could not advance.
# `page` is kept at module level so later cells can report how many pages were visited.
page = 1
try:
    click_reviews_tab()

    while True:
        print(f"Scraping page {page}...")
        extract_reviews(page)
        if not go_to_next_page():
            break
        page += 1
finally:
    # Always shut down the browser and ChromeDriver process, even if the run is
    # interrupted or raises; everything scraped so far stays in all_reviews.
    driver.quit()

✓ Clicked Guest Reviews tab
Scraping page 1...
Found 10 reviews on page 1
Scraping page 2...
Found 10 reviews on page 2
Scraping page 3...
Found 10 reviews on page 3
Scraping page 4...
Found 10 reviews on page 4
Scraping page 5...
Found 10 reviews on page 5
Scraping page 6...
Found 10 reviews on page 6
Scraping page 7...
Found 10 reviews on page 7
Scraping page 8...
Found 10 reviews on page 8
Scraping page 9...
Found 10 reviews on page 9
Scraping page 10...
Found 10 reviews on page 10
Scraping page 11...
Found 10 reviews on page 11
Scraping page 12...
Found 10 reviews on page 12
Scraping page 13...
Found 10 reviews on page 13
Scraping page 14...
Found 10 reviews on page 14
Scraping page 15...
Found 10 reviews on page 15
Scraping page 16...
Found 10 reviews on page 16
Scraping page 17...
Found 10 reviews on page 17
Scraping page 18...
Found 10 reviews on page 18
Scraping page 19...
Found 10 reviews on page 19
Scraping page 20...
Found 10 reviews on page 20
Scraping page 21...
Found 10 r

#### Export and Store

In [29]:
# Build a DataFrame from the scraped review dicts (one row per dict in all_reviews)
df = pd.DataFrame(all_reviews)

In [31]:
# Display the column names as a quick check (notebook output only; nothing is assigned)
df.columns

Index(['serial_no', 'reviewer_name', 'room_type', 'period_of_stay',
       'traveler_type', 'review_score_numerator', 'review_score_denominator',
       'review_score_description', 'review_date', 'review_text',
       'extraction_timestamp'],
      dtype='object')

In [33]:
# Drop duplicates
# Two rows count as the same review when every content field matches; serial_no and
# extraction_timestamp are deliberately not part of the key.
dedup_keys = ['reviewer_name', 'room_type', 'period_of_stay',
              'traveler_type', 'review_score_numerator',
              'review_score_denominator', 'review_score_description',
              'review_date', 'review_text']

# If nothing was scraped the DataFrame has no columns, and drop_duplicates(subset=...)
# would raise KeyError on the missing key columns, so skip it in that case.
if not df.empty:
    df = df.drop_duplicates(subset=dedup_keys, keep='first')

# Convert back to list of dicts if needed
all_reviews = df.to_dict(orient='records')

print(f"Removed duplicates, {len(all_reviews)} unique reviews remain.")

Removed duplicates, 1817 unique reviews remain.


In [None]:
# Save data to JSON file
# The path is defined once so the confirmation message reports the location actually written.
json_path = "../Data/tripcom_reviews.json"
with open(json_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII review text as-is instead of \u escapes
    json.dump(all_reviews, f, ensure_ascii=False, indent=4)

print(f"Scraped {len(all_reviews)} reviews and saved to '{json_path}'.")

In [None]:
# Write the de-duplicated reviews to CSV without the DataFrame index column;
# the utf-8-sig codec prefixes the file with a UTF-8 byte-order mark.
df.to_csv(
    path_or_buf="../Data/tripcom_reviews.csv",
    encoding="utf-8-sig",
    index=False,
)

# Final summary: number of reviews kept and the last page number reached by the scraping loop
print(f"Scraped {len(all_reviews)} reviews across {page} pages.")