In [9]:
import os
import csv
import time
import random
import re
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd

In [10]:
# Setup headless Chrome
options = Options()
options.add_argument("--headless")  # Remove this line if you want to see browser
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [11]:
# ========== STEP 1: Load all product links ==========

brand_url = "https://reviews.femaledaily.com/brands/product/cosrx"
driver.get(brand_url)
time.sleep(3)

# Click "Load More" until all products are shown
while True:
    try:
        load_more_button = driver.find_element(By.ID, "button-load-more-products")
        driver.execute_script("arguments[0].click();", load_more_button)
        time.sleep(2)
    except NoSuchElementException:
        break
    except ElementClickInterceptedException:
        time.sleep(2)
        continue

# Parse loaded product page
soup = BeautifulSoup(driver.page_source, "html.parser")
product_cards = soup.find_all("a", class_="product-card")
product_links = []

# Filter only products with >300 reviews
for card in product_cards:
    rating_span = card.find("span", class_=lambda x: x and "fd-body-sm-regular" in x and "grey" in x)
    if rating_span:
        review_text = rating_span.get_text()
        match = re.search(r"\(([\d,]+)\)", review_text)
        if match:
            review_count = int(match.group(1).replace(",", ""))
            if review_count > 1000:
                link = card.get("href")
                if link and link.startswith("/"):
                    link = "https://reviews.femaledaily.com" + link
                product_links.append(link)

print(f"🔗 Found {len(product_links)} products with >1000 reviews.")

🔗 Found 7 products with >1000 reviews.


In [12]:
# ========== STEP 2: Scrape Reviews ==========

# Checkpoint save file
checkpoint_file = "all_reviews_checkpoint.csv"
checkpoint_path = Path(checkpoint_file)

# Load previous progress if exists
if checkpoint_path.exists():
    checkpoint_df = pd.read_csv(checkpoint_file)
else:
    checkpoint_df = pd.DataFrame(columns=[
        "Product Name", "Review Date", "Review Rating", "Recommend", "Review Text",
        "Usage Period", "Purchase Point", "Product URL", "Review Page"
    ])

# Function to load page with retry
def safe_get(url, retries=3, backoff=2):
    for i in range(retries):
        try:
            driver.set_page_load_timeout(180)
            driver.get(url)
            return True
        except Exception as e:
            print(f"⚠️ Error loading {url}: {e}")
            time.sleep(backoff * (2 ** i) + random.uniform(0, 1))
    return False

# Helper to get last scraped page
def get_last_scraped_page(url):
    df = checkpoint_df[checkpoint_df["Product URL"] == url]
    return df["Review Page"].max() if not df.empty else 0

all_reviews = checkpoint_df.to_dict("records")

for product_url in product_links:
    print(f"\n📄 Scraping: {product_url}")
    if not safe_get(product_url):
        print(f"❌ Skipping {product_url} due to repeated timeouts.")
        continue

    soup = BeautifulSoup(driver.page_source, "html.parser")
    try:
        product_name = soup.find("h1").get_text(strip=True)
    except:
        product_name = "N/A"

    page = get_last_scraped_page(product_url) + 1

    while True:
        review_url = f"{product_url.split('?')[0]}?page={page}"
        print(f"🔄 Loading page {page} for {product_name}")

        if not safe_get(review_url):
            print(f"⚠️ Timeout at page {page}, moving to next product")
            break

        soup = BeautifulSoup(driver.page_source, "html.parser")
        reviews = soup.find_all("div", class_="review-content-wrapper")
        if not reviews:
            print(f"⛔ No more reviews at page {page}")
            break

        for review in reviews:
            try:
                review_date = review.find("p", class_="review-date").get_text(strip=True)
            except:
                review_date = "N/A"

            try:
                stars = review.find("div", class_="review-card-rating-wrapper").find_all("i", class_="icon-ic_big_star_full")
                review_rating = len(stars)
            except:
                review_rating = "N/A"

            try:
                recommend = review.find("p", class_="recommend").get_text(strip=True)
            except:
                recommend = "N/A"

            try:
                review_text = review.find("p", class_="text-content").get_text(strip=True)
            except:
                review_text = "N/A"

            try:
                info = review.find("div", class_="information-wrapper").find_all("p")
                usage_period = info[0].find("b").get_text(strip=True)
                purchase_point = info[1].find("b").get_text(strip=True)
            except:
                usage_period = "N/A"
                purchase_point = "N/A"

            all_reviews.append({
                "Product Name": product_name,
                "Review Date": review_date,
                "Review Rating": review_rating,
                "Recommend": recommend,
                "Review Text": review_text,
                "Usage Period": usage_period,
                "Purchase Point": purchase_point,
                "Product URL": product_url,
                "Review Page": page
            })

        # Save after each page
        pd.DataFrame(all_reviews).to_csv(checkpoint_file, index=False)
        print(f"✅ Page {page} done for {product_name}. Total: {len(all_reviews)}")
        page += 1

driver.quit()
print(f"\n🎉 ALL DONE! Scraped {len(all_reviews)} reviews in total.")


📄 Scraping: https://reviews.femaledaily.com/products/cleanser/toner/cosrx/aha-bha-clarifying-treatment-toner
🔄 Loading page 452 for AHA/BHA Clarifying Treatment Toner
⛔ No more reviews at page 452

📄 Scraping: https://reviews.femaledaily.com/products/cleanser/facial-wash/cosrx/good-morning-gel-cleanser
🔄 Loading page 442 for Low pH Good Morning Gel Cleanser
⛔ No more reviews at page 442

📄 Scraping: https://reviews.femaledaily.com/products/treatment/serum-essence/cosrx/x-advanced-snail-96-mucin-power-essence
🔄 Loading page 376 for Advanced Snail 96 Mucin Power Essence
⛔ No more reviews at page 376

📄 Scraping: https://reviews.femaledaily.com/products/treatment/acne-treatment/cosrx/acne-pimple-master-patch-1
🔄 Loading page 142 for Acne Pimple Master Patch
⛔ No more reviews at page 142

📄 Scraping: https://reviews.femaledaily.com/products/moisturizer/lotion-emulsion/cosrx/oil-free-ultra-moisturizing-lotion-with-birch-sap
🔄 Loading page 137 for Oil-free Ultra-Moisturizing Lotion (with Bi

PermissionError: [Errno 13] Permission denied: 'all_reviews_checkpoint.csv'