## 1C. Web Scraping Script from Klook

Last Updated: 9 Sep 2025 <br>
Description: This script uses Playwright to scrape reviews from a Klook hotel page. It launches a visible Chromium browser configured to reduce bot detection, scrolls to load multiple pages of reviews, and saves the data as JSON and CSV files.

In [None]:
# One-time setup (run in a terminal):
#   pip install playwright
#   playwright install

#### Import Libraries

In [3]:
# Import Libraries
import asyncio
import random
import json
import pandas as pd
from datetime import datetime
from playwright.async_api import async_playwright

#### File Path Config

In [6]:
# Klook hotel page whose reviews will be scraped
URL = (
    "https://www.klook.com/en-SG/hotels/detail/575689-marina-bay-sands/"
    "?spm=SearchResult.SearchResult_LIST&clickId=34ee1b5516"
)

# Accumulator for scraped reviews (one dict per review card)
all_reviews = list()

In [11]:
def safe_text(element):
    """Strip surrounding whitespace from a text value.

    Despite the parameter name, ``element`` is the text itself (a string or
    None), not a page element. Any falsy value (None, "") yields "".
    """
    if not element:
        return ""
    return element.strip()

# Output key -> CSS selector (relative to one review card). The order here is
# the key order of every dict appended to ``all_reviews``.
_REVIEW_FIELD_SELECTORS = {
    "reviewer_name": "span.hotel-review-author",
    "room_info": "p.gray-text.hotel-room-info",
    "review_score_numerator": "p.score span.avg",
    "review_score_denominator": "p.score span.max",
    "review_score_description": "div.hotel-review-score span:not(.avg,.max)",
    "review_date": "div.gray-text.hotel-review-time",
    "review": "div.accordion-text-content",
}


async def _card_text(card, selector):
    """Return the stripped text of the first node in ``card`` matching ``selector``.

    Returns "" when nothing matches. The node is looked up once, instead of
    once for the existence check and a second time for the read.
    """
    node = await card.query_selector(selector)
    if node is None:
        return ""
    return safe_text(await node.text_content())


async def _extract_review(card):
    """Build the dict for one review card, one entry per configured field."""
    review = {}
    for field, selector in _REVIEW_FIELD_SELECTORS.items():
        review[field] = await _card_text(card, selector)
    return review


async def scrape_klook():
    """Scrape the review cards of the hotel page at ``URL`` into ``all_reviews``.

    Opens the page in a visible Chromium window, clicks the reviews element,
    scrolls to the bottom repeatedly until the page height stops growing (at
    most 50 times), then appends one dict per review card to the module-level
    ``all_reviews`` list.

    Returns:
        None. If the reviews element cannot be clicked, a message is printed
        and nothing is appended.

    Raises:
        Errors from launching the browser or loading the page propagate to
        the caller; the browser is closed first.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            # Per the notebook description this is part of the bot-detection
            # workaround (hides Chromium's automation marker from the page).
            args=['--disable-blink-features=AutomationControlled']
        )
        # try/finally so the browser is closed on every exit path, including
        # navigation failures and cancellation.
        try:
            context = await browser.new_context(
                viewport={'width': 1280, 'height': 800},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
            )
            page = await context.new_page()

            print("Opening hotel page...")
            await page.goto(URL, timeout=60000)
            # Randomized pause so actions are not fired at a fixed cadence
            await asyncio.sleep(random.uniform(2, 4))

            # Click Reviews button
            try:
                review_btn = await page.wait_for_selector("span.reviewer-desc", timeout=15000)
                await review_btn.click()
                await asyncio.sleep(random.uniform(2, 4))
                print("Clicked Reviews tab")
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt and task
                # cancellation must still be able to stop the scraper.
                print("Could not click Reviews tab")
                return

            # Scroll to load reviews; stop once a scroll no longer grows the page
            previous_height = None
            for _ in range(50):
                current_height = await page.evaluate("document.body.scrollHeight")
                if previous_height == current_height:
                    break
                previous_height = current_height
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                # Long pause kept from the original (presumably gives the page
                # time to load more reviews and paces the requests).
                await asyncio.sleep(random.uniform(30, 40))

            # Extract reviews
            review_cards = await page.query_selector_all("div.hotel-review-item-desktop")
            print(f"Found {len(review_cards)} review cards")
            for card in review_cards:
                try:
                    review = await _extract_review(card)
                except Exception as e:
                    # Best effort: one broken card must not abort the whole run
                    print(f"Error extracting a card: {e}")
                else:
                    all_reviews.append(review)
        finally:
            await browser.close()

#### Web Scraping

In [None]:
# Run the scraper.
# Top-level ``await`` only works inside a notebook/IPython session; in a plain
# .py script this line is a SyntaxError and asyncio.run(scrape_klook()) is
# needed instead. Scraped rows are appended to the module-level all_reviews list.
await scrape_klook()

#### Export and Store

In [None]:
# Tabulate the scraped reviews: one row per review dict, one column per key
df = pd.DataFrame(data=all_reviews)

In [None]:
# Show the column labels of df (rendered as this cell's output in a notebook)
df.columns

In [None]:
# De-duplicate: rows identical in every column collapse to their first occurrence
df = df.drop_duplicates()

# Mirror the cleaned frame back into the list-of-dicts form used by the JSON export
all_reviews = df.to_dict(orient="records")

print(f"Removed duplicates, {len(all_reviews)} unique reviews remain.")

In [None]:
# Save data to JSON file
import os

# Single source of truth for the destination, so the log line below always
# reports the path that was actually written.
json_path = "../Data/klook_reviews.json"

# Create ../Data if it is missing so the export does not fail with FileNotFoundError
os.makedirs(os.path.dirname(json_path), exist_ok=True)

with open(json_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII review text as-is instead of \u escapes
    json.dump(all_reviews, f, ensure_ascii=False, indent=4)

print(f"Scraped {len(all_reviews)} reviews and saved to '{json_path}'.")

In [None]:
# Also export the table as CSV. The utf-8-sig codec prepends a byte-order mark
# (presumably so spreadsheet tools detect the file as UTF-8 -- confirm).
csv_path = "../Data/klook_reviews.csv"
df.to_csv(csv_path, encoding="utf-8-sig", index=False)

print(f"Scraped {len(all_reviews)} reviews.")