## 1D. Web Scraping Script from Trip Advisor

Last Updated: 18 Sep 2025 <br>
Description: This script scrapes reviews from Trip Advisor hotel pages using Selenium, handles pagination to scrape multiple pages of reviews, and saves the data as both a JSON file and a CSV file.

In [None]:
# pip install selenium webdriver-manager pandas

In [None]:
# pip install webdriver-manager

#### Import Libraries

In [2]:
# Import Libraries
import time
import json
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException

In [3]:
# Chrome launch options: disable the Blink "AutomationControlled" feature and
# open the browser window maximized.
options = Options()
for chrome_flag in (
    "--disable-blink-features=AutomationControlled",
    "--start-maximized",
):
    options.add_argument(chrome_flag)

In [4]:
# Launch Chrome with a chromedriver binary resolved by webdriver-manager, then
# create the shared explicit-wait helper with a 15-second timeout.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)
wait = WebDriverWait(driver, 15)

#### File Path Config

In [6]:
# URL of the reviews page
URL = "https://www.tripadvisor.com/Hotel_Review-g294265-d1770798-Reviews-Marina_Bay_Sands-Singapore.html"
driver.get(URL)
# Fixed pause after navigation, before any element lookups are attempted —
# presumably to let the page finish rendering; TODO confirm 5 s is enough.
time.sleep(5)

In [7]:
# Rebind the shared explicit-wait helper with a 10-second timeout. This replaces
# the 15-second `wait` created together with the driver, so `wait.until(...)`
# calls made after this cell time out after 10 seconds.
wait = WebDriverWait(driver, 10)

#### Function Definition

In [9]:
def _text_or_empty(parent, css_selector):
    """Return the text of the first element under *parent* matching *css_selector*, or "" if none matches."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return ""


def _parse_review_date(full_text, reviewer_name):
    """Return the date part of a "<name> wrote a review <date>" header string.

    Everything after the static phrase "wrote a review" is taken as the date.
    If the phrase is absent, only the first occurrence of the reviewer name is
    removed, so a name that also occurs inside the date (e.g. "May") does not
    corrupt it.
    """
    _, phrase, remainder = full_text.partition("wrote a review")
    if phrase:
        return remainder.strip()
    return full_text.replace(reviewer_name, "", 1).strip()


def _extract_name_and_date(card):
    """Return ``(reviewer_name, review_date)`` for a card, or ``("", "")`` if the header is missing."""
    try:
        # Name and "wrote a review ..." text live in the same div, so look it up once.
        header = card.find_element(By.CSS_SELECTOR, "div.biGQs._P.VImYz.AWdfh")
        full_text = header.text
        # The name itself is inside a nested span.
        reviewer_name = header.find_element(By.CSS_SELECTOR, "span.zEeKA a span").text
    except NoSuchElementException:
        return "", ""
    return reviewer_name, _parse_review_date(full_text, reviewer_name)


def _extract_score(card):
    """Return the content of the rating SVG's <title> element, or "" if it is not found."""
    try:
        title = card.find_element(By.CSS_SELECTOR, "svg.evwcZ title")
    except NoSuchElementException:
        return ""
    # Read innerHTML rather than .text: .text only returns rendered text, and an
    # SVG <title> is presumably not rendered — verify if scores come back empty.
    return title.get_attribute("innerHTML")


def _extract_stay_details(card):
    """Return ``(date_of_stay, trip_type)`` parsed from the card's label/value rows.

    Each field stays "" when no row with the matching label is present. A row
    that lacks its label or value span is skipped on its own, so it cannot hide
    the rows that follow it.
    """
    date_of_stay = ""
    trip_type = ""
    for row in card.find_elements(By.CSS_SELECTOR, "div.MZTIt div.CPHmk"):
        try:
            label = row.find_element(By.CSS_SELECTOR, "span.ZNjnF").text
            value = row.find_element(By.CSS_SELECTOR, "span.xENVe").text
        except NoSuchElementException:
            continue

        if "Date of stay" in label:
            date_of_stay = value
        elif "Trip type" in label:
            trip_type = value
    return date_of_stay, trip_type


def extract_reviews():
    """Extract every review card currently present on the page.

    Uses the module-level ``wait`` to block until at least one review card is
    attached to the DOM, then reads each card's fields.

    Returns:
        list[dict]: One dict per card with the keys ``reviewer_name``,
        ``review_date``, ``review_title``, ``review_text``, ``review_score``,
        ``date_of_stay``, ``trip_type`` and ``reviewer_country``. A field whose
        element cannot be located is the empty string. An empty list is
        returned (after printing the error) if waiting for the cards fails.

    Raises:
        Exceptions other than ``NoSuchElementException`` raised while reading a
        card's fields are not handled here and propagate to the caller.

    Note:
        The short class names used as selectors (``biGQs``, ``JguWG``, ...) are
        copied from the page markup — presumably build-generated and liable to
        change; verify them if fields start coming back empty.
    """
    try:
        review_cards = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div[data-test-target='HR_CC_CARD']")
        ))
    except Exception as e:
        print(f"Could not find review cards on the page. Error: {e}")
        return []

    reviews = []
    for card in review_cards:
        reviewer_name, review_date = _extract_name_and_date(card)
        date_of_stay, trip_type = _extract_stay_details(card)

        reviews.append({
            "reviewer_name": reviewer_name,
            "review_date": review_date,
            "review_title": _text_or_empty(card, "div[data-test-target='review-title']"),
            "review_text": _text_or_empty(card, "span.JguWG"),
            "review_score": _extract_score(card),
            "date_of_stay": date_of_stay,
            "trip_type": trip_type,
            # NOTE(review): this selector has the same class list as the
            # name/date header div (only the tag differs) — confirm it really
            # targets the reviewer's country/location span.
            "reviewer_country": _text_or_empty(card, "span.biGQs._P.VImYz.AWdfh"),
        })
    return reviews

#### Web Scraping

In [14]:
# Site behaviour: the "Reviews" entry only shows up in the top menu bar after
# the page has been scrolled, so page down a few times via the keyboard first.
body = driver.find_element(By.TAG_NAME, "body")
for _ in range(5):
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(1)

# Locate, scroll to and click the Reviews tab — two rounds in total.
reviews_tab_locator = (By.XPATH, "//span[contains(text(),'Reviews')]")
try:
    for _ in range(2):
        reviews_tab = wait.until(EC.element_to_be_clickable(reviews_tab_locator))
        driver.execute_script("arguments[0].scrollIntoView(true);", reviews_tab)
        time.sleep(2)
        reviews_tab.click()
        print("Clicked Reviews tab")
        time.sleep(5)
except Exception as err:
    # NOTE(review): the browser session is closed here but the error is not
    # re-raised, so any cell run afterwards will operate on a closed driver.
    print(f"Could not click Reviews tab: {err}")
    driver.quit()

Clicked Reviews tab
Clicked Reviews tab


In [15]:
# --- START ---
# Walk the paginated review list: extract the current page, click the numbered
# link for the next page, and wait for the old cards to be replaced before
# extracting again. Results accumulate in `all_reviews`.

all_reviews = []
max_pages = 165
card_locator = (By.CSS_SELECTOR, "div[data-test-target='HR_CC_CARD']")

print("--- Starting Scraper ---")

for page_num in range(1, max_pages + 1):
    print(f"\nProcessing Page {page_num}...")

    try:
        # 1. Wait for at least one review card and keep a handle on it. The
        #    handle is used after the click to detect that the page changed.
        try:
            first_card = wait.until(EC.presence_of_element_located(card_locator))
        except TimeoutException:
            print(f"No review cards appeared on page {page_num}. Stopping.")
            break

        # 2. Extract reviews from the current page
        extracted = extract_reviews()
        if not extracted:
            print("Extractor returned no reviews. Stopping.")
            break
        print(f"Extracted {len(extracted)} reviews.")
        all_reviews.extend(extracted)

        # 3. If this is the last page, don't try to click "Next"
        if page_num >= max_pages:
            print("Reached the final page. Scraping complete.")
            break

        # 4. Find and click the numbered link for the NEXT page
        next_page_num = page_num + 1
        next_page_xpath = f'//a[@aria-label="{next_page_num}"]'
        try:
            next_page_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, next_page_xpath))
            )
        except TimeoutException:
            print(f"Could not find the button for page {next_page_num}. Likely the end of reviews.")
            break
        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", next_page_button)
        print(f"Clicked button for page {next_page_num}.")

        # 5. The presence wait at the top of the loop is satisfied by the OLD
        #    page's cards as long as they are still attached, so on its own it
        #    cannot tell that the next page has loaded. Wait for the card we
        #    grabbed in step 1 to be detached from the DOM instead.
        try:
            wait.until(EC.staleness_of(first_card))
        except TimeoutException:
            # Best effort: if the old card was never detached, fall back to the
            # static pause below rather than aborting the whole run.
            print(f"Cards from page {page_num} were not replaced in time; continuing after a static pause.")

        # Small static pause before touching the next page
        time.sleep(2)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        break

print("\n--- Scraping Finished ---")
print(f"Total reviews collected: {len(all_reviews)}")

# --- END ---

--- Starting Scraper ---

Processing Page 1...
Extracted 10 reviews.
Clicked button for page 2.

Processing Page 2...
Extracted 10 reviews.
Clicked button for page 3.

Processing Page 3...
Extracted 10 reviews.
Clicked button for page 4.

Processing Page 4...
Extracted 10 reviews.
Clicked button for page 5.

Processing Page 5...
Extracted 10 reviews.
Clicked button for page 6.

Processing Page 6...
Extracted 10 reviews.
Clicked button for page 7.

Processing Page 7...
Extracted 10 reviews.
Clicked button for page 8.

Processing Page 8...
Extracted 10 reviews.
Clicked button for page 9.

Processing Page 9...
Extracted 10 reviews.
Clicked button for page 10.

Processing Page 10...
Extracted 10 reviews.
Clicked button for page 11.

Processing Page 11...
Extracted 10 reviews.
Clicked button for page 12.

Processing Page 12...
Extracted 10 reviews.
Clicked button for page 13.

Processing Page 13...
Extracted 10 reviews.
Clicked button for page 14.

Processing Page 14...
Extracted 10 reviews

#### Export and Store

In [None]:
# Persist the scraped reviews as JSON and CSV, then close the browser.
from pathlib import Path

output_dir = Path("../Data")

try:
    # Create the target folder if it is missing so the writes below cannot fail
    # on a non-existent directory after a long scraping run.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save data to JSON file (ensure_ascii=False keeps non-ASCII text readable)
    with open(output_dir / "tripadvisor_reviews.json", "w", encoding="utf-8") as f:
        json.dump(all_reviews, f, ensure_ascii=False, indent=4)

    # Save to CSV with UTF-8 encoding for foreign language support
    # ("utf-8-sig" prefixes the file with a byte-order mark).
    df = pd.DataFrame(all_reviews)
    df.to_csv(output_dir / "tripadvisor_reviews.csv", index=False, encoding="utf-8-sig")

    print(f"Exported {len(all_reviews)} reviews into JSON & CSV")
finally:
    # Always release the browser session, even if one of the writes failed.
    driver.quit()