In [1]:
pip install selenium beautifulsoup4 pandas webdriver-manager






In [3]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
# We simulate "Continents" by searching major cities in those regions.
# Yelp is most active in North America and Europe.
LOCATIONS = {
    "North America": ["New York, NY", "Toronto, ON", "San Francisco, CA"],
    "Europe": ["London, UK", "Paris, France", "Berlin, Germany"],
    "Asia": ["Tokyo, Japan", "Singapore", "Kuala Lumpur"], 
    "Oceania": ["Melbourne, Australia", "Sydney, Australia"]
}

OUTPUT_FILE = "uniqlo_global_reviews.csv"

def get_driver():
    """Sets up the Selenium WebDriver with anti-detection options."""
    options = Options()
    # options.add_argument("--headless")  # Run in background (Comment out to see the browser working)
    options.add_argument("--disable-blink-features=AutomationControlled") 
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def scrape_reviews_from_store(driver, store_url, continent, location_name):
    """Scrapes reviews from a specific store page."""
    reviews_data = []
    driver.get(store_url)
    time.sleep(random.uniform(3, 6)) # Random sleep to mimic human behavior

    # Loop through pagination (Limit to first 3 pages per store to avoid bans)
    page_count = 0
    max_pages = 3 
    
    while page_count < max_pages:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Yelp reviews are usually in list items inside a specific ul
        # Note: Classes change dynamically, so we target generic structures where possible
        reviews = soup.select('li > div.css-1qn0b6x') # This class selector often changes!
        
        # Fallback: simple finding if the specific class above fails
        if not reviews:
            reviews = soup.find_all('div', class_=lambda x: x and 'review' in x)

        print(f"   -> Found {len(reviews)} reviews on page {page_count + 1}...")

        for review in reviews:
            try:
                # 1. Author Name
                author_tag = review.find('a', href=lambda x: x and '/user_details' in x)
                author = author_tag.text.strip() if author_tag else "Anonymous"

                # 2. Rating (Look for aria-label="5 star rating")
                rating_tag = review.find('div', role='img')
                rating = rating_tag['aria-label'].split(' ')[0] if rating_tag and 'aria-label' in rating_tag.attrs else "N/A"

                # 3. Date
                # Yelp dates are often just text spans. We look for text that looks like a date.
                date_tag = review.find('span', class_='css-chan6m') # Common date class, might need adjustment
                published_at = date_tag.text.strip() if date_tag else "N/A"

                # 4. Feedback (The actual text)
                text_tag = review.find('p', class_=lambda x: x and 'comment' in x)
                if not text_tag:
                     text_tag = review.find('span', lang='en')
                feedback = text_tag.text.strip() if text_tag else ""

                # 5. Like Count (Helpful/Funny/Cool buttons)
                # This is tricky as it varies. We look for the button text.
                likes = 0
                buttons = review.find_all('button')
                for btn in buttons:
                    if 'Useful' in btn.text or 'Helpful' in btn.text:
                        # Extract number if present (e.g. "Useful 2")
                        parts = btn.text.split()
                        if len(parts) > 1 and parts[-1].isdigit():
                            likes += int(parts[-1])

                if feedback: # Only save if there is text
                    reviews_data.append({
                        "Continent": continent,
                        "Country/Location": location_name,
                        "Author": author,
                        "Rating": rating,
                        "Published At": published_at,
                        "Like Count": likes,
                        "Feedback": feedback
                    })
            except Exception as e:
                continue # Skip broken reviews

        # Pagination Logic: Click "Next"
        try:
            next_button = driver.find_element(By.XPATH, '//a[contains(@href, "start=") and contains(., "Next")]')
            next_button.click()
            time.sleep(random.uniform(3, 5))
            page_count += 1
        except:
            break # No next button found

    return reviews_data

def main():
    driver = get_driver()
    all_reviews = []

    try:
        for continent, cities in LOCATIONS.items():
            for city in cities:
                print(f"\n--- Searching Uniqlo in {city} ({continent}) ---")
                
                # 1. Search for Uniqlo in this city
                search_url = f"https://www.yelp.com/search?find_desc=Uniqlo&find_loc={city}"
                driver.get(search_url)
                time.sleep(random.uniform(3, 5))

                # 2. Get links to the store pages from search results
                # We target links that look like business pages (ignoring ads)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                store_links = set()
                
                # Look for links containing /biz/uniqlo
                for a in soup.find_all('a', href=True):
                    if '/biz/uniqlo' in a['href'] and 'ad_business' not in a['href']:
                        full_url = "https://www.yelp.com" + a['href'].split('?')[0]
                        store_links.add(full_url)
                
                print(f"Found {len(store_links)} stores in {city}.")

                # 3. Scrape each store found
                for store_url in list(store_links)[:2]: # Limit to 2 stores per city for testing
                    print(f"Scraping store: {store_url}")
                    reviews = scrape_reviews_from_store(driver, store_url, continent, city)
                    all_reviews.extend(reviews)

    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        
        # Save to CSV
        df = pd.DataFrame(all_reviews)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"\nScraping complete! Saved {len(df)} reviews to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


--- Searching Uniqlo in New York, NY (North America) ---
Found 0 stores in New York, NY.

--- Searching Uniqlo in Toronto, ON (North America) ---
Found 0 stores in Toronto, ON.

--- Searching Uniqlo in San Francisco, CA (North America) ---
Found 0 stores in San Francisco, CA.

--- Searching Uniqlo in London, UK (Europe) ---
Found 0 stores in London, UK.

--- Searching Uniqlo in Paris, France (Europe) ---
Found 0 stores in Paris, France.

--- Searching Uniqlo in Berlin, Germany (Europe) ---
Found 0 stores in Berlin, Germany.

--- Searching Uniqlo in Tokyo, Japan (Asia) ---
Found 0 stores in Tokyo, Japan.

--- Searching Uniqlo in Singapore (Asia) ---
Found 0 stores in Singapore.

--- Searching Uniqlo in Kuala Lumpur (Asia) ---
Found 0 stores in Kuala Lumpur.

--- Searching Uniqlo in Melbourne, Australia (Oceania) ---
Found 0 stores in Melbourne, Australia.

--- Searching Uniqlo in Sydney, Australia (Oceania) ---
Found 0 stores in Sydney, Australia.

Scraping complete! Saved 0 reviews to

In [5]:
pip install undetected-chromedriver

Collecting undetected-chromedriver
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting websockets (from undetected-chromedriver)
  Downloading websockets-16.0-cp312-cp312-win_amd64.whl.metadata (7.0 kB)
Downloading websockets-16.0-cp312-cp312-win_amd64.whl (178 kB)
Building wheels for collected packages: undetected-chromedriver
  Building wheel for undetected-chromedriver (setup.py): started
  Building wheel for undetected-chromedriver (setup.py): finished with status 'done'
  Created wheel for undetected-chromedriver: filename=undetected_chromedriver-3.5.5-py3-none-any.whl size=47130 sha256=c9a61882cd085384dc8440e2080ab8547fdf2c32c8366c7a3047d9895e3154be
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\c4\f1\aa\9de6cf276210554d91e9c0526864563e850a428c5e76da4914
Successfully built undetected-chromedriver
Installing collected packages: websockets, un



In [7]:
import time
import random
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
LOCATIONS = {
    "North America": ["New York, NY", "Toronto, ON"],
    "Europe": ["London, UK", "Paris, France"],
    "Asia": ["Tokyo, Japan", "Singapore"], 
    "Oceania": ["Melbourne, Australia"]
}

OUTPUT_FILE = "uniqlo_global_reviews_stealth.csv"

def get_stealth_driver():
    """Sets up a driver that hides the fact it is a bot."""
    options = uc.ChromeOptions()
    # options.add_argument('--headless') # Do NOT use headless for Yelp, it triggers bans
    options.add_argument('--no-first-run')
    
    # Initialize the undetectable driver
    driver = uc.Chrome(options=options)
    return driver

def human_scroll(driver):
    """Scrolls down the page slowly like a human reading."""
    total_height = int(driver.execute_script("return document.body.scrollHeight"))
    for i in range(1, total_height, random.randint(300, 700)):
        driver.execute_script(f"window.scrollTo(0, {i});")
        time.sleep(random.uniform(0.1, 0.5))

def scrape_reviews(driver, store_url, continent, location_name):
    reviews_data = []
    print(f"   -> Accessing {store_url}")
    driver.get(store_url)
    
    # RANDOM WAIT: Critical to avoid blocking
    time.sleep(random.uniform(5, 10))
    human_scroll(driver)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Yelp layout changes frequently. We look for 'li' elements containing reviews.
    # If the specific class fails, we fallback to a broader search.
    reviews = soup.select('li > div.css-1qn0b6x')
    if not reviews:
        reviews = soup.find_all('div', class_=lambda x: x and 'review' in x)

    for review in reviews:
        try:
            # Extract Author
            author_tag = review.find('a', href=lambda x: x and '/user_details' in x)
            author = author_tag.text.strip() if author_tag else "Anonymous"

            # Extract Rating (looking for 'aria-label' in image div)
            rating = "N/A"
            rating_tag = review.find('div', role='img')
            if rating_tag and 'aria-label' in rating_tag.attrs:
                rating_str = rating_tag['aria-label']
                if 'star rating' in rating_str:
                    rating = rating_str.split(' ')[0]

            # Extract Date
            date_tag = review.find('span', class_='css-chan6m')
            published_at = date_tag.text.strip() if date_tag else "N/A"

            # Extract Feedback Text
            text_tag = review.find('p', class_=lambda x: x and 'comment' in x)
            if not text_tag: 
                text_tag = review.find('span', lang='en')
            feedback = text_tag.text.strip() if text_tag else ""

            # Extract Likes
            likes = 0
            buttons = review.find_all('button')
            for btn in buttons:
                text = btn.text
                # Look for numbers in buttons like "Useful 2"
                if any(k in text for k in ['Useful', 'Helpful', 'Cool']):
                    parts = text.split()
                    if parts and parts[-1].isdigit():
                        likes += int(parts[-1])

            if feedback:
                reviews_data.append({
                    "Continent": continent,
                    "Country": location_name,
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
        except Exception:
            continue

    return reviews_data

def main():
    driver = get_stealth_driver()
    all_data = []
    
    try:
        for continent, cities in LOCATIONS.items():
            for city in cities:
                print(f"--- Processing {city} ---")
                
                # 1. Search for Uniqlo in the city
                driver.get(f"https://www.yelp.com/search?find_desc=Uniqlo&find_loc={city}")
                time.sleep(random.uniform(5, 8))
                
                # 2. Find the first valid store link
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                store_link = None
                for a in soup.find_all('a', href=True):
                    if '/biz/uniqlo' in a['href'] and 'ad_business' not in a['href']:
                        store_link = "https://www.yelp.com" + a['href'].split('?')[0]
                        break # Just take the first valid store to be safe
                
                if store_link:
                    # 3. Scrape the store
                    data = scrape_reviews(driver, store_link, continent, city)
                    all_data.extend(data)
                    print(f"   -> Collected {len(data)} reviews.")
                else:
                    print("   -> No store found.")

    except Exception as e:
        print(f"Error: {e}")
    finally:
        driver.quit()
        df = pd.DataFrame(all_data)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"Done! Saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()

--- Processing New York, NY ---
   -> Accessing https://www.yelp.com/biz/uniqlo-new-york-2
   -> Collected 0 reviews.
--- Processing Toronto, ON ---
   -> Accessing https://www.yelp.com/biz/uniqlo-toronto-5
   -> Collected 0 reviews.
--- Processing London, UK ---
   -> Accessing https://www.yelp.com/biz/uniqlo-london
   -> Collected 0 reviews.
--- Processing Paris, France ---
   -> Accessing https://www.yelp.com/biz/uniqlo-paris-5
   -> Collected 0 reviews.
--- Processing Tokyo, Japan ---
   -> Accessing https://www.yelp.com/biz/uniqlo-%E5%BE%A1%E5%BE%92%E7%94%BA%E5%BA%97-%E5%8F%B0%E6%9D%B1%E5%8C%BA
   -> Collected 0 reviews.
--- Processing Singapore ---
   -> Accessing https://www.yelp.com/biz/uniqlo-singapore-2
   -> Collected 0 reviews.
--- Processing Melbourne, Australia ---
   -> Accessing https://www.yelp.com/biz/uniqlo-melbourne
   -> Collected 0 reviews.
Done! Saved to uniqlo_global_reviews_stealth.csv


In [9]:
import time
import random
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
LOCATIONS = {
    "North America": ["New York, NY", "Toronto, ON"],
    "Europe": ["London, UK", "Paris, France"],
    "Asia": ["Tokyo, Japan", "Singapore"], 
    "Oceania": ["Melbourne, Australia"]
}

OUTPUT_FILE = "uniqlo_global_reviews_fixed.csv"

def get_stealth_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # options.add_argument('--headless') # Keep headless OFF for Yelp
    driver = uc.Chrome(options=options)
    return driver

def human_scroll(driver):
    """Scrolls to trigger lazy loading."""
    total_height = int(driver.execute_script("return document.body.scrollHeight"))
    for i in range(1, total_height, random.randint(400, 800)):
        driver.execute_script(f"window.scrollTo(0, {i});")
        time.sleep(random.uniform(0.1, 0.3))

def scrape_reviews(driver, store_url, continent, location_name):
    reviews_data = []
    print(f"   -> Accessing {store_url}")
    driver.get(store_url)
    
    # 1. Wait for reviews to load (Look for the text "Reviews")
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except:
        print("   -> Page timeout.")
        return []

    time.sleep(random.uniform(3, 5))
    human_scroll(driver)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # --- ROBUST FINDING STRATEGY ---
    # Instead of looking for a specific class, we grab ALL list items (li)
    # and check if they contain review-like data (Stars + Date).
    
    potential_reviews = soup.find_all('li')
    print(f"   -> Scanning {len(potential_reviews)} items for reviews...")
    
    count = 0
    for item in potential_reviews:
        try:
            # CHECK: Is this a review? 
            # It must have a div with 'star rating' in the aria-label
            rating_div = item.find('div', role='img')
            if not rating_div or not rating_div.get('aria-label'):
                continue
            
            aria_text = rating_div.get('aria-label').lower()
            if 'star rating' not in aria_text:
                continue

            # If we passed the check, extraction begins:
            
            # 1. Rating
            rating = aria_text.split()[0] # "5 star rating" -> "5"

            # 2. Author
            author_tag = item.find('a', href=lambda x: x and '/user_details' in x)
            author = author_tag.text.strip() if author_tag else "Anonymous"

            # 3. Date (Look for text patterns common in dates)
            # Yelp dates are often in a span with distinct styling. 
            # We look for the span immediately following the rating or user info.
            published_at = "N/A"
            # Strategy: Find all text spans, look for Date-like length
            spans = item.find_all('span')
            for s in spans:
                txt = s.text.strip()
                # Simple heuristic: Dates usually contain "/" or "," and are short
                if len(txt) < 20 and (',' in txt or '/' in txt) and any(c.isdigit() for c in txt):
                    # Exclude "Useful", "Cool" text
                    if "Useful" not in txt and "Cool" not in txt:
                        published_at = txt
                        break

            # 4. Feedback
            # Review text usually has a 'lang' attribute (e.g., lang="en")
            text_tag = item.find('span', lang=True)
            if not text_tag:
                # Fallback: Look for the longest paragraph
                paragraphs = item.find_all('p')
                if paragraphs:
                    text_tag = max(paragraphs, key=lambda p: len(p.text))
            
            feedback = text_tag.text.strip() if text_tag else ""

            # 5. Like Count
            likes = 0
            buttons = item.find_all('button')
            for btn in buttons:
                btxt = btn.text
                if any(x in btxt for x in ['Useful', 'Helpful', 'Cool']):
                    parts = btxt.split()
                    if parts and parts[-1].isdigit():
                        likes += int(parts[-1])

            # Only add if we found feedback text
            if feedback:
                reviews_data.append({
                    "Continent": continent,
                    "Country": location_name,
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
                count += 1
                
        except Exception:
            continue # Skip bad items

    return reviews_data

def main():
    driver = get_stealth_driver()
    all_data = []
    
    try:
        for continent, cities in LOCATIONS.items():
            for city in cities:
                print(f"--- Processing {city} ---")
                
                # Search
                search_url = f"https://www.yelp.com/search?find_desc=Uniqlo&find_loc={city}"
                driver.get(search_url)
                time.sleep(5)
                
                # Find Link
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                store_link = None
                for a in soup.find_all('a', href=True):
                    if '/biz/uniqlo' in a['href'] and 'ad_business' not in a['href']:
                        store_link = "https://www.yelp.com" + a['href'].split('?')[0]
                        break 
                
                if store_link:
                    data = scrape_reviews(driver, store_link, continent, city)
                    print(f"   -> Successfully extracted {len(data)} reviews.")
                    all_data.extend(data)
                else:
                    print("   -> No store link found.")

    except Exception as e:
        print(f"Error: {e}")
    finally:
        driver.quit()
        if all_data:
            df = pd.DataFrame(all_data)
            df.to_csv(OUTPUT_FILE, index=False)
            print(f"SUCCESS! Saved {len(df)} reviews to {OUTPUT_FILE}")
        else:
            print("Failed to collect any data. Yelp might be blocking the page content.")

if __name__ == "__main__":
    main()

--- Processing New York, NY ---
   -> Accessing https://www.yelp.com/biz/uniqlo-new-york-2
   -> Scanning 193 items for reviews...
   -> Successfully extracted 0 reviews.
--- Processing Toronto, ON ---
   -> Accessing https://www.yelp.com/biz/uniqlo-toronto-5
   -> Scanning 147 items for reviews...
   -> Successfully extracted 1 reviews.
--- Processing London, UK ---
   -> Accessing https://www.yelp.com/biz/uniqlo-london
   -> Scanning 64 items for reviews...
   -> Successfully extracted 0 reviews.
--- Processing Paris, France ---
   -> Accessing https://www.yelp.com/biz/uniqlo-paris-5
   -> Scanning 61 items for reviews...
   -> Successfully extracted 0 reviews.
--- Processing Tokyo, Japan ---
   -> Accessing https://www.yelp.com/biz/uniqlo-%E5%BE%A1%E5%BE%92%E7%94%BA%E5%BA%97-%E5%8F%B0%E6%9D%B1%E5%8C%BA
   -> Scanning 61 items for reviews...
   -> Successfully extracted 0 reviews.
--- Processing Singapore ---
   -> Accessing https://www.yelp.com/biz/uniqlo-singapore-2
   -> Scanning 

In [11]:
pip install undetected-chromedriver






In [15]:
import time
import random
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- 1. GLOBAL SEARCH LIST (The "All Outlets" Strategy) ---
# We cover major hubs in every continent to approximate "All Outlets"
LOCATIONS = {
    "Asia": [
        "Tokyo, Japan", "Osaka, Japan", "Kyoto, Japan", "Yokohama, Japan", 
        "Singapore", "Manila, Philippines", "Bangkok, Thailand", "Seoul, South Korea"
    ],
    "North America": [
        "New York, NY", "Los Angeles, CA", "San Francisco, CA", "Chicago, IL", 
        "Toronto, Canada", "Vancouver, Canada", "Honolulu, HI"
    ],
    "Europe": [
        "London, UK", "Paris, France", "Berlin, Germany", "Milan, Italy", 
        "Barcelona, Spain", "Stockholm, Sweden"
    ],
    "Oceania": [
        "Melbourne, Australia", "Sydney, Australia", "Brisbane, Australia"
    ]
}

OUTPUT_FILE = "uniqlo_global_reviews_comprehensive.csv"
TARGET_PER_CONTINENT = 500  # Stop after getting this many reviews per continent to balance data

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # options.add_argument('--headless') # Keep Headless OFF to avoid detection
    driver = uc.Chrome(options=options)
    return driver

def get_store_links(driver, city):
    """Searches Yelp for Uniqlo in a city and returns all store URLs found."""
    print(f"   -> Searching for Uniqlo stores in {city}...")
    search_url = f"https://www.yelp.com/search?find_desc=Uniqlo&find_loc={city}"
    driver.get(search_url)
    time.sleep(random.uniform(5, 7))
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    store_links = set()
    
    # Extract links that look like business pages
    for a in soup.find_all('a', href=True):
        if '/biz/uniqlo' in a['href'] and 'ad_business' not in a['href']:
            # Clean URL
            full_url = "https://www.yelp.com" + a['href'].split('?')[0]
            store_links.add(full_url)
    
    links = list(store_links)
    print(f"      Found {len(links)} stores.")
    return links

def scrape_reviews(driver, store_url, continent, city):
    """Scrapes reviews from a single store URL."""
    reviews_data = []
    
    # Pagination: We try to scrape the first 3 pages (30 reviews) of every store found
    # This prevents getting banned by scraping 100 pages of a single store
    for start_num in [0, 10, 20]: 
        
        paginated_url = f"{store_url}?start={start_num}&sort_by=date_desc"
        driver.get(paginated_url)
        time.sleep(random.uniform(4, 7))
        
        # Check for empty page or captcha
        if "human" in driver.title.lower():
            print("      !!! Captcha Detected - Waiting 30s !!!")
            time.sleep(30)
            
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        list_items = soup.find_all('li')
        
        found_on_page = 0
        
        for item in list_items:
            try:
                # Validation: Must have star rating
                rating_div = item.find('div', role='img')
                if not rating_div or not rating_div.get('aria-label') or 'star rating' not in rating_div.get('aria-label'):
                    continue

                # Data Extraction
                rating = rating_div.get('aria-label').split()[0] # "5"
                
                # Author
                author_tag = item.find('a', href=lambda x: x and '/user_details' in x)
                author = author_tag.text.strip() if author_tag else "Anonymous"
                
                # Date
                published_at = "N/A"
                for s in item.find_all('span'):
                    txt = s.text.strip()
                    if len(txt) < 20 and any(c.isdigit() for c in txt) and (',' in txt or '/' in txt):
                        if "Useful" not in txt:
                            published_at = txt
                            break

                # Feedback Text
                text_tag = item.find('span', lang='en')
                if not text_tag:
                    # Fallback for non-English reviews (Japanese/French)
                    text_tag = item.find('span', lang=True) 
                
                feedback = text_tag.text.strip() if text_tag else ""

                # Likes
                likes = 0
                for btn in item.find_all('button'):
                    if any(x in btn.text for x in ['Useful', 'Helpful']):
                        parts = btn.text.split()
                        if parts and parts[-1].isdigit():
                            likes += int(parts[-1])

                if feedback:
                    reviews_data.append({
                        "Continent": continent,
                        "Country/City": city,
                        "Store URL": store_url,
                        "Author": author,
                        "Rating": rating,
                        "Published At": published_at,
                        "Like Count": likes,
                        "Feedback": feedback
                    })
                    found_on_page += 1
                    
            except Exception:
                continue
        
        if found_on_page == 0:
            break # Stop paging if no reviews found

    return reviews_data

def main():
    driver = get_driver()
    
    # Initialize CSV if not exists
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country/City", "Store URL", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)
    
    try:
        for continent, cities in LOCATIONS.items():
            print(f"\n=== PROCESSING CONTINENT: {continent} ===")
            continent_count = 0
            
            for city in cities:
                if continent_count >= TARGET_PER_CONTINENT:
                    print(f"--- Reached target for {continent}. Moving to next continent. ---")
                    break

                # 1. Find Stores in this City
                store_urls = get_store_links(driver, city)
                
                # 2. Scrape Each Store
                for url in store_urls:
                    print(f"   -> Scraping: {url}")
                    new_reviews = scrape_reviews(driver, url, continent, city)
                    
                    if new_reviews:
                        # Append to CSV
                        df = pd.DataFrame(new_reviews)
                        df.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)
                        continent_count += len(new_reviews)
                        print(f"      Collected {len(new_reviews)} reviews. (Continent Total: {continent_count})")
                    else:
                        print("      No reviews found (or empty text).")

    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"Script Finished. Data saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


=== PROCESSING CONTINENT: Asia ===
   -> Searching for Uniqlo stores in Tokyo, Japan...
      Found 1 stores.
   -> Scraping: https://www.yelp.com/biz/uniqlo-%E5%BE%A1%E5%BE%92%E7%94%BA%E5%BA%97-%E5%8F%B0%E6%9D%B1%E5%8C%BA
      Collected 7 reviews. (Continent Total: 7)
   -> Searching for Uniqlo stores in Osaka, Japan...
      Found 1 stores.
   -> Scraping: https://www.yelp.com/biz/uniqlo-%E5%A4%A7%E9%98%AA%E5%B8%82
      Collected 2 reviews. (Continent Total: 9)
   -> Searching for Uniqlo stores in Kyoto, Japan...
      Found 0 stores.
   -> Searching for Uniqlo stores in Yokohama, Japan...
      Found 0 stores.
   -> Searching for Uniqlo stores in Singapore...
      Found 5 stores.
   -> Scraping: https://www.yelp.com/biz/uniqlo-singapore-2
      Collected 13 reviews. (Continent Total: 22)
   -> Scraping: https://www.yelp.com/biz/uniqlo-singapore-22
      Collected 1 reviews. (Continent Total: 23)
   -> Scraping: https://www.yelp.com/biz/uniqlo-singapore-6
      Collected 5 review

In [23]:
import time
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
OUTPUT_FILE = "uniqlo_malaysia_reviews_fixed.csv"

# Add your product URLs here
PRODUCT_URLS = [
    "https://www.uniqlo.com/my/en/products/E468971-000/reviews",
    "https://www.uniqlo.com/my/en/products/E450310-000/reviews", 
    "https://www.uniqlo.com/my/en/products/E464023-000/reviews", 
    "https://www.uniqlo.com/my/en/products/E460324-000/reviews",
    "https://www.uniqlo.com/my/en/products/E424873-000/reviews",
]

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # options.add_argument('--headless') # Keep headless OFF so you can see it working
    driver = uc.Chrome(options=options)
    return driver

def close_popups(driver):
    """Closes standard Uniqlo popups."""
    try:
        # OneTrust Cookie Banner
        WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        ).click()
        print("   -> Closed Cookie Banner.")
    except:
        pass 

def click_load_more(driver):
    """
    Revised logic based on your screenshot:
    Finds the 'LOAD MORE' button and clicks it until it disappears.
    """
    print("   -> Starting 'LOAD MORE' loop...")
    
    # 1. Close popups first so they don't block clicks
    close_popups(driver)
    
    click_count = 0
    max_clicks = 100 # Safety limit
    
    while click_count < max_clicks:
        try:
            # Scroll to the bottom to make sure the button is in the viewport
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 1000);")
            time.sleep(1.5)

            # --- TARGETING THE BUTTON FROM YOUR IMAGE ---
            # We look for ANY element containing exactly "LOAD MORE"
            # We use XPath to be flexible (it works for divs, buttons, or spans)
            load_more_btn = None
            
            # Try finding the specific text "LOAD MORE"
            xpath_locators = [
                "//*[text()='LOAD MORE']",           # Exact text match
                "//*[contains(text(), 'LOAD MORE')]", # Contains text
                "//div[contains(@class, 'load-more')]", # Common class name
                "//button[contains(@class, 'load-more')]"
            ]
            
            for xpath in xpath_locators:
                try:
                    element = driver.find_element(By.XPATH, xpath)
                    if element.is_displayed():
                        load_more_btn = element
                        break
                except:
                    continue

            if load_more_btn:
                # Use JavaScript click to force it (bypasses overlapping elements)
                driver.execute_script("arguments[0].click();", load_more_btn)
                print(f"      Clicked 'LOAD MORE' ({click_count+1})...")
                
                # Wait for new reviews to load
                time.sleep(3) 
                click_count += 1
            else:
                print("   -> 'LOAD MORE' button not found (All reviews loaded).")
                break
                
        except Exception as e:
            print(f"   -> Loop ended: {e}")
            break

def scrape_page(driver_source, product_url):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    # Check if reviews exist by looking for "out of 5 stars"
    rating_elements = soup.find_all(string=lambda text: text and "out of 5 stars" in text)
    print(f"   -> Extracting from {len(rating_elements)} review blocks...")

    for rating_text in rating_elements:
        try:
            # Navigate up to the review container
            container = rating_text.find_parent('div').find_parent('div') 
            
            # 1. RATING
            rating = rating_text.strip().split()[0] 

            # 2. DATE, FEEDBACK, AUTHOR
            texts = list(container.stripped_strings)
            
            published_at = "N/A"
            body = ""
            author_info = "Anonymous"
            likes = 0
            
            # Smart Text sorting
            for t in texts:
                # Date detection (dd/mm/yyyy)
                if len(t) == 10 and t[2] == '/' and t[5] == '/':
                    published_at = t
                # Author Metadata (Age, Gender, Height)
                elif any(k in t for k in ["Male", "Female", "Height", "Weight", "Shoe size"]):
                    author_info = t
                # Like Count
                elif "Helpful" in t and "(" in t:
                     try: likes = int(t.split('(')[1].split(')')[0])
                     except: pass
            
            # The body is usually the longest text block that ISN'T the metadata
            possible_bodies = [t for t in texts if t != author_info and len(t) > 5]
            if possible_bodies:
                body = max(possible_bodies, key=len)
            
            # Cleanup
            if "out of 5 stars" in body: body = "" # unwanted capture

            if body:
                reviews_data.append({
                    "Continent": "Asia",
                    "Country": "Malaysia",
                    "Store URL": product_url,
                    "Author": author_info,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": body
                })

        except Exception:
            continue

    return reviews_data

def main():
    driver = get_driver()
    
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Store URL", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)

    try:
        for url in PRODUCT_URLS:
            print(f"\n--- Processing: {url} ---")
            driver.get(url)
            
            # WAIT FOR PAGE TO FULLY LOAD
            time.sleep(5)
            
            # 1. Expand all reviews
            click_load_more(driver)
            
            # 2. Scrape
            reviews = scrape_page(driver.page_source, url)
            
            if reviews:
                df = pd.DataFrame(reviews)
                # Remove duplicates
                df.drop_duplicates(subset=['Feedback'], inplace=True)
                
                df.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)
                print(f"   -> SUCCESS: Saved {len(reviews)} reviews.")
            else:
                print("   -> No reviews found (Check if URL has reviews).")

    except Exception as e:
        print(f"Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE. Data saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


--- Processing: https://www.uniqlo.com/my/en/products/E468971-000/reviews ---
   -> Starting 'LOAD MORE' loop...
   -> Closed Cookie Banner.
      Clicked 'LOAD MORE' (1)...
      Clicked 'LOAD MORE' (2)...
      Clicked 'LOAD MORE' (3)...
      Clicked 'LOAD MORE' (4)...
      Clicked 'LOAD MORE' (5)...
      Clicked 'LOAD MORE' (6)...
      Clicked 'LOAD MORE' (7)...
      Clicked 'LOAD MORE' (8)...
      Clicked 'LOAD MORE' (9)...
      Clicked 'LOAD MORE' (10)...
      Clicked 'LOAD MORE' (11)...
      Clicked 'LOAD MORE' (12)...
      Clicked 'LOAD MORE' (13)...
      Clicked 'LOAD MORE' (14)...
      Clicked 'LOAD MORE' (15)...
      Clicked 'LOAD MORE' (16)...
      Clicked 'LOAD MORE' (17)...
      Clicked 'LOAD MORE' (18)...
      Clicked 'LOAD MORE' (19)...
      Clicked 'LOAD MORE' (20)...
      Clicked 'LOAD MORE' (21)...
      Clicked 'LOAD MORE' (22)...
      Clicked 'LOAD MORE' (23)...
      Clicked 'LOAD MORE' (24)...
   -> Loop ended: Message: script timeout
  (Sessio

In [33]:
import pandas as pd

# Load your global Yelp data
df_global = pd.read_csv("uniqlo_global_reviews_comprehensive.csv")

# Load your new Malaysia data
df_my = pd.read_csv("uniqlo_malaysia_reviews_fixed.csv")

# Ensure columns match (The Malaysia script has an extra "User Metadata" column)
# We can rename columns if they are slightly different
df_my = df_my.rename(columns={"Store/Product": "Store URL"}) 

# Combine them
df_combined = pd.concat([df_global, df_my], ignore_index=True)

# Save the master file
df_combined.to_csv("uniqlo_FINAL_MASTER_DATASET.csv", index=False)
print(f"Merged! Total reviews: {len(df_combined)}")

Merged! Total reviews: 402


In [47]:
import time
import random
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
BASE_URL = "https://www.trustpilot.com/review/uniqlo.eu"
OUTPUT_FILE = "trustpilot_uniqlo_eu_reviews.csv"
TARGET_CONTINENT = "Europe"  # uniqlo.eu is Europe-based

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # Trustpilot blocks headless often, so we run visible
    driver = uc.Chrome(options=options)
    return driver

def scrape_trustpilot_page(driver_source):
    """Extracts reviews from the current HTML page."""
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    # Trustpilot reviews are in <article> tags or cards with specific styles
    # We look for the main review card container
    review_cards = soup.find_all('article')
    
    # Fallback if specific article tag isn't used
    if not review_cards:
        review_cards = soup.find_all('div', class_=lambda x: x and 'styles_reviewCard' in x)

    print(f"   -> Found {len(review_cards)} reviews on this page.")

    for card in review_cards:
        try:
            # 1. Author Name
            author_tag = card.find('span', attrs={'data-consumer-name-typography': 'true'})
            author = author_tag.text.strip() if author_tag else "Anonymous"

            # 2. Location (Country)
            # Trustpilot often puts this in a small gray text like "DE" or "GB"
            country = "Europe (General)"
            country_tag = card.find('div', class_=lambda x: x and 'typography_body-m' in x and 'consumer-information' in x) 
            if country_tag:
                # Sometimes it is an SVG flag or text code
                country = country_tag.text.strip()
            
            # Alternative: Check for "Reviewed in X" text if available
            
            # 3. Rating
            # Look for the star image or data attribute
            rating = "N/A"
            star_div = card.find('div', attrs={'data-service-review-rating': True})
            if star_div:
                rating = star_div['data-service-review-rating']
            else:
                # Fallback: look for img alt text "5 out of 5 stars"
                img = card.find('img', alt=lambda x: x and "stars" in x)
                if img:
                    rating = img['alt'].split()[0]

            # 4. Date
            # Trustpilot has "Date of experience" and "Date of review"
            published_at = "N/A"
            date_tag = card.find('time')
            if date_tag and date_tag.has_attr('datetime'):
                published_at = date_tag['datetime'].split('T')[0] # Get YYYY-MM-DD
            
            # 5. Feedback Text
            # Look for the paragraph tag with specific typography
            text_tag = card.find('p', attrs={'data-service-review-text-typography': 'true'})
            feedback = text_tag.text.strip() if text_tag else ""
            
            # If there's a title, prepend it
            title_tag = card.find('h2', attrs={'data-service-review-title-typography': 'true'})
            if title_tag:
                feedback = f"{title_tag.text.strip()}. {feedback}"

            # 6. Like Count (Useful)
            likes = 0
            # Look for button text "Useful 2"
            buttons = card.find_all('button')
            for btn in buttons:
                if "Useful" in btn.text:
                    parts = btn.text.split()
                    for p in parts:
                        if p.isdigit():
                            likes = int(p)
                            break

            if feedback:
                reviews_data.append({
                    "Continent": TARGET_CONTINENT,
                    "Country": country,
                    "Source": "Trustpilot",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })

        except Exception:
            continue
            
    return reviews_data

def main():
    driver = get_driver()
    
    # Initialize CSV
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)
    
    try:
        # Start at Page 1
        current_page = 1
        
        while True:
            url = f"{BASE_URL}?page={current_page}"
            print(f"\n--- Scraping Page {current_page} ---")
            driver.get(url)
            
            # Wait for load
            time.sleep(random.uniform(4, 7))
            
            # Check if page exists (Trustpilot redirects to page 1 if you go too far)
            if current_page > 1 and "page=1" in driver.current_url:
                print("   -> Redirected to Page 1. End of pages reached.")
                break
            
            # Scrape content
            reviews = scrape_trustpilot_page(driver.page_source)
            
            if not reviews:
                print("   -> No reviews found on this page. Stopping.")
                break
                
            # Save to CSV
            df = pd.DataFrame(reviews)
            df.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)
            print(f"   -> Saved {len(reviews)} reviews.")
            
            # Pagination Logic: Look for "Next page" button
            try:
                next_button = driver.find_element(By.NAME, "pagination-button-next")
                if not next_button.is_enabled():
                    print("   -> 'Next' button disabled. Finished.")
                    break
            except:
                # If button is missing, we check if we just scraped the last page
                # Trustpilot sometimes just hides the button.
                # We can also rely on the URL check at the start of the loop.
                pass
            
            current_page += 1
            
            # Safety limit (Trustpilot has thousands of pages)
            if current_page > 50: 
                print("   -> Reached safety limit of 50 pages.")
                break

    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE. Saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


--- Scraping Page 1 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 2 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 3 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 4 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 5 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 6 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 7 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 8 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 9 ---
   -> Found 67 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 10 ---
   -> Redirected to Page 1. End of pages reached.

DONE. Saved to trustpilot_uniqlo_eu_reviews.csv


In [49]:
import pandas as pd
import os

# --- FILE NAMES ---
# 1. Your existing master file (Yelp + Malaysia)
file_1 = "uniqlo_FINAL_MASTER_DATASET.csv" 

# 2. Your new Trustpilot file
file_2 = "trustpilot_uniqlo_eu_reviews.csv"

# 3. The final output file name
output_file = "UNIQLO_GLOBAL_REVIEWS_ALL.csv"

def merge_datasets():
    print("--- Starting Merge Process ---")

    # 1. Load the files
    try:
        df_master = pd.read_csv(file_1)
        print(f"Loaded Master File: {len(df_master)} reviews")
    except FileNotFoundError:
        print(f"Error: Could not find {file_1}")
        return

    try:
        df_trustpilot = pd.read_csv(file_2)
        print(f"Loaded Trustpilot File: {len(df_trustpilot)} reviews")
    except FileNotFoundError:
        print(f"Error: Could not find {file_2}")
        return

    # 2. Standardize Columns
    
    if "Source" not in df_master.columns:
        df_master["Source"] = "Official Website / Yelp"
        
    if "Source" not in df_trustpilot.columns:
        df_trustpilot["Source"] = "Trustpilot"

    # Ensure 'Like Count' is consistent (rename 'Likes' to 'Like Count' if needed)
    if "Likes" in df_master.columns:
        df_master = df_master.rename(columns={"Likes": "Like Count"})
    if "Likes" in df_trustpilot.columns:
        df_trustpilot = df_trustpilot.rename(columns={"Likes": "Like Count"})

    # 3. Combine DataFrames
    df_final = pd.concat([df_master, df_trustpilot], ignore_index=True)

    # 4. Cleanup (Optional)
    df_final = df_final.fillna("N/A")

    # 5. Save
    df_final.to_csv(output_file, index=False)
    
    print("\n--- MERGE SUCCESSFUL ---")
    print(f"Total Combined Reviews: {len(df_final)}")
    print(f"Saved to: {output_file}")
    
    # 6. Show Breakdown
    print("\nData Breakdown by Continent:")
    print(df_final['Continent'].value_counts())

if __name__ == "__main__":
    merge_datasets()

--- Starting Merge Process ---
Loaded Master File: 402 reviews
Loaded Trustpilot File: 180 reviews

--- MERGE SUCCESSFUL ---
Total Combined Reviews: 582
Saved to: UNIQLO_GLOBAL_REVIEWS_ALL.csv

Data Breakdown by Continent:
Continent
Asia             317
Europe           180
North America     85
Name: count, dtype: int64


In [53]:
import time
import random
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
BASE_URL = "https://uk.trustpilot.com/review/www.uniqlo.com"
OUTPUT_FILE = "trustpilot_uniqlo_uk_reviews.csv"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # Visible browser is safer for Trustpilot
    driver = uc.Chrome(options=options)
    return driver

def scrape_trustpilot_page(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    # Find review cards (Trustpilot structure)
    review_cards = soup.find_all('article')
    
    # Fallback if article tag fails
    if not review_cards:
        review_cards = soup.find_all('div', class_=lambda x: x and 'styles_reviewCard' in x)

    print(f"   -> Found {len(review_cards)} reviews on this page.")

    for card in review_cards:
        try:
            # 1. Author
            author_tag = card.find('span', attrs={'data-consumer-name-typography': 'true'})
            author = author_tag.text.strip() if author_tag else "Anonymous"

            # 2. Rating (Look for star image alt text)
            rating = "N/A"
            img = card.find('img', alt=lambda x: x and "stars" in x)
            if img:
                rating = img['alt'].split()[0] # "5 out of 5" -> "5"
            else:
                # Backup: Look for data attribute
                div = card.find('div', attrs={'data-service-review-rating': True})
                if div: rating = div['data-service-review-rating']

            # 3. Date
            published_at = "N/A"
            date_tag = card.find('time')
            if date_tag and date_tag.has_attr('datetime'):
                published_at = date_tag['datetime'].split('T')[0]

            # 4. Feedback
            text_tag = card.find('p', attrs={'data-service-review-text-typography': 'true'})
            feedback = text_tag.text.strip() if text_tag else ""
            
            # Add Title to feedback if exists
            title_tag = card.find('h2', attrs={'data-service-review-title-typography': 'true'})
            if title_tag:
                feedback = f"{title_tag.text.strip()}. {feedback}"

            # 5. Like Count
            likes = 0
            buttons = card.find_all('button')
            for btn in buttons:
                if "Useful" in btn.text:
                    parts = btn.text.split()
                    for p in parts:
                        if p.isdigit():
                            likes = int(p)
                            break

            # Only add if there is actual text content
            if feedback:
                reviews_data.append({
                    "Continent": "Europe",
                    "Country": "United Kingdom",
                    "Source": "Trustpilot UK",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })

        except Exception:
            continue
            
    return reviews_data

def main():
    driver = get_driver()
    
    # Initialize CSV
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)
    
    try:
        current_page = 1
        
        while True:
            url = f"{BASE_URL}?page={current_page}"
            print(f"\n--- Scraping Page {current_page} ---")
            driver.get(url)
            
            # Random wait to act like a human
            time.sleep(random.uniform(3, 6))
            
            # Check for Redirect (End of pages)
            if current_page > 1 and "page=1" in driver.current_url:
                print("   -> Redirected to Page 1. Done.")
                break
            
            # Check for "human verification" title
            if "blocked" in driver.title.lower() or "human" in driver.title.lower():
                print("   !!! Captcha Detected. Pausing 30s. Please solve it manually !!!")
                time.sleep(30)

            # Scrape
            reviews = scrape_trustpilot_page(driver.page_source)
            
            if not reviews:
                print("   -> No reviews found. Stopping.")
                break
                
            # Save
            df = pd.DataFrame(reviews)
            df.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)
            print(f"   -> Saved {len(reviews)} reviews.")
            
            # Check for Next Button
            try:
                next_btn = driver.find_element(By.NAME, "pagination-button-next")
                if not next_btn.is_enabled():
                    print("   -> Next button disabled. Finished.")
                    break
            except:
                # If we scraped data but cant find button, assume it's the last page or hidden
                pass
            
            current_page += 1
            
            # Safety Cap (Trustpilot UK has lots of pages, adjust if needed)
            if current_page > 100: 
                print("   -> Reached limit of 100 pages.")
                break

    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE. Saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


--- Scraping Page 1 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 2 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 3 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 4 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 5 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 6 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 7 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 8 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 9 ---
   -> Found 68 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 10 ---
   -> Redirected to Page 1. Done.

DONE. Saved to trustpilot_uniqlo_uk_reviews.csv


In [55]:
import pandas as pd

# 1. Load your current big file (Global + Malaysia + EU)
df_all = pd.read_csv("UNIQLO_GLOBAL_REVIEWS_ALL.csv")

# 2. Load the new UK file
df_uk = pd.read_csv("trustpilot_uniqlo_uk_reviews.csv")

# 3. Combine
df_final = pd.concat([df_all, df_uk], ignore_index=True)

# 4. Save
df_final.to_csv("UNIQLO_GLOBAL_REVIEWS_ALL_V2.csv", index=False)
print(f"Updated Dataset! Total Reviews: {len(df_final)}")

Updated Dataset! Total Reviews: 762


In [57]:
import time
import random
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
# We use the specific URL you provided
BASE_URL = "https://au.trustpilot.com/review/uniqlo.dk"
OUTPUT_FILE = "trustpilot_uniqlo_dk_reviews.csv"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    driver = uc.Chrome(options=options)
    return driver

def scrape_trustpilot_page(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    # Find review cards
    review_cards = soup.find_all('article')
    if not review_cards:
        review_cards = soup.find_all('div', class_=lambda x: x and 'styles_reviewCard' in x)

    print(f"   -> Found {len(review_cards)} reviews on this page.")

    for card in review_cards:
        try:
            # 1. Author
            author_tag = card.find('span', attrs={'data-consumer-name-typography': 'true'})
            author = author_tag.text.strip() if author_tag else "Anonymous"

            # 2. Rating
            rating = "N/A"
            img = card.find('img', alt=lambda x: x and "stars" in x)
            if img:
                rating = img['alt'].split()[0] # "5 out of 5" -> "5"
            else:
                div = card.find('div', attrs={'data-service-review-rating': True})
                if div: rating = div['data-service-review-rating']

            # 3. Date
            published_at = "N/A"
            date_tag = card.find('time')
            if date_tag and date_tag.has_attr('datetime'):
                published_at = date_tag['datetime'].split('T')[0]

            # 4. Feedback
            text_tag = card.find('p', attrs={'data-service-review-text-typography': 'true'})
            feedback = text_tag.text.strip() if text_tag else ""
            
            # Prepend Title
            title_tag = card.find('h2', attrs={'data-service-review-title-typography': 'true'})
            if title_tag:
                feedback = f"{title_tag.text.strip()}. {feedback}"

            # 5. Like Count
            likes = 0
            buttons = card.find_all('button')
            for btn in buttons:
                if "Useful" in btn.text:
                    parts = btn.text.split()
                    for p in parts:
                        if p.isdigit():
                            likes = int(p)
                            break

            if feedback:
                reviews_data.append({
                    "Continent": "Europe",
                    "Country": "Denmark",
                    "Source": "Trustpilot Denmark",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })

        except Exception:
            continue
            
    return reviews_data

def main():
    driver = get_driver()
    
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)
    
    try:
        current_page = 1
        
        while True:
            url = f"{BASE_URL}?page={current_page}"
            print(f"\n--- Scraping Page {current_page} ---")
            driver.get(url)
            
            time.sleep(random.uniform(3, 6))
            
            # Check for Redirect (End of pages)
            if current_page > 1 and "page=1" in driver.current_url:
                print("   -> Redirected to Page 1. Done.")
                break
            
            # Scrape
            reviews = scrape_trustpilot_page(driver.page_source)
            
            if not reviews:
                print("   -> No reviews found. Stopping.")
                break
                
            # Save
            df = pd.DataFrame(reviews)
            df.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)
            print(f"   -> Saved {len(reviews)} reviews.")
            
            # Check for Next Button
            try:
                next_btn = driver.find_element(By.NAME, "pagination-button-next")
                if not next_btn.is_enabled():
                    print("   -> Next button disabled. Finished.")
                    break
            except:
                pass
            
            current_page += 1
            
            # Safety limit
            if current_page > 50: 
                break

    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE. Saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


--- Scraping Page 1 ---
   -> Found 24 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 2 ---
   -> Found 24 reviews on this page.
   -> Saved 20 reviews.

--- Scraping Page 3 ---
   -> Found 5 reviews on this page.
   -> Saved 1 reviews.

--- Scraping Page 4 ---
   -> Found 0 reviews on this page.
   -> No reviews found. Stopping.

DONE. Saved to trustpilot_uniqlo_dk_reviews.csv


In [59]:
import pandas as pd

# 1. Load your current V2 dataset (Global + EU + UK)
try:
    df_all = pd.read_csv("UNIQLO_GLOBAL_REVIEWS_ALL_V2.csv")
    print(f"Loaded existing data: {len(df_all)} reviews")
except:
    print("Could not find previous file. Starting fresh.")
    df_all = pd.DataFrame()

# 2. Load the new Denmark file
df_dk = pd.read_csv("trustpilot_uniqlo_dk_reviews.csv")
print(f"Loaded Denmark data: {len(df_dk)} reviews")

# 3. Combine
df_final = pd.concat([df_all, df_dk], ignore_index=True)

# 4. Save V3
df_final.to_csv("UNIQLO_GLOBAL_REVIEWS_ALL_V3.csv", index=False)
print(f"SUCCESS! New Total: {len(df_final)}")

Loaded existing data: 762 reviews
Loaded Denmark data: 41 reviews
SUCCESS! New Total: 803


In [67]:
import time
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
OUTPUT_FILE = "uniqlo_malaysia_new_product_reviews.csv"
PRODUCT_URL = "https://www.uniqlo.com/my/en/products/E479078-000/reviews"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    options.page_load_strategy = 'normal'
    driver = uc.Chrome(options=options)
    return driver

def close_cookie_banner(driver):
    """Closes the OneTrust cookie banner if present."""
    print("   -> Checking for Cookie Banner...")
    try:
        btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        btn.click()
        print("      Banner Closed.")
        time.sleep(2)
    except:
        print("      No banner found or already closed.")

def load_all_reviews(driver):
    """Clicks 'View more' until all reviews are loaded."""
    print("   -> Starting to load reviews...")
    click_count = 0
    
    while True:
        try:
            # Scroll to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 800);")
            time.sleep(1.5)
            
            # Look for "View more" button (Based on your snippet)
            # Try multiple selectors to be safe
            load_btn = None
            xpath_list = [
                "//button[contains(text(), 'View more')]",
                "//button[contains(text(), 'View More')]", 
                "//div[contains(@class, 'load-more')]",
                "//button[contains(@class, 'bv-content-btn-load-more')]" 
            ]
            
            for xpath in xpath_list:
                try:
                    btn = driver.find_element(By.XPATH, xpath)
                    if btn.is_displayed():
                        load_btn = btn
                        break
                except:
                    continue
            
            if load_btn:
                driver.execute_script("arguments[0].click();", load_btn)
                print(f"      Clicked 'View more' ({click_count + 1})")
                time.sleep(3) # Wait for content to load
                click_count += 1
            else:
                print("   -> No more 'View more' buttons found. All loaded.")
                break
                
        except Exception as e:
            print(f"   -> Loading loop stopped: {e}")
            break

def scrape_page(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    # Uniqlo MY reviews usually contain "out of 5 stars"
    # We find these text nodes and work upwards to the container
    rating_texts = soup.find_all(string=lambda text: text and "out of 5 stars" in text)
    
    print(f"   -> Found {len(rating_texts)} review blocks.")
    
    for r_text in rating_texts:
        try:
            # Navigate to container
            container = r_text.find_parent('div').find_parent('div')
            if not container: continue
            
            texts = list(container.stripped_strings)
            
            # --- Extract Fields ---
            rating = r_text.strip().split()[0] # "5"
            
            published_at = "N/A"
            author = "Anonymous"
            feedback = ""
            likes = 0
            
            for t in texts:
                # Date (dd/mm/yyyy)
                if len(t) == 10 and t[2] == '/' and t[5] == '/':
                    published_at = t
                # Author Metadata
                if any(k in t for k in ["Male", "Female", "Height", "Weight", "Shoe size"]):
                    author = t
                # Like Count
                if "Helpful" in t and "(" in t:
                     try: likes = int(t.split('(')[1].split(')')[0])
                     except: pass
            
            # Feedback is longest text that isn't metadata
            possible_bodies = [
                t for t in texts 
                if len(t) > 10 
                and t != author 
                and "out of 5 stars" not in t 
                and "Purchased size" not in t
            ]
            if possible_bodies:
                feedback = max(possible_bodies, key=len)
            
            if feedback:
                reviews_data.append({
                    "Continent": "Asia",
                    "Country": "Malaysia",
                    "Source": "Uniqlo MY Official",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
                
        except:
            continue
            
    return reviews_data

def main():
    driver = get_driver()
    
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)
        
    try:
        print(f"--- Accessing {PRODUCT_URL} ---")
        driver.get(PRODUCT_URL)
        time.sleep(5)
        
        close_cookie_banner(driver)
        load_all_reviews(driver)
        
        reviews = scrape_page(driver.page_source)
        
        if reviews:
            df = pd.DataFrame(reviews)
            # Remove duplicates
            df.drop_duplicates(subset=['Feedback'], inplace=True)
            df.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)
            print(f"SUCCESS! Saved {len(reviews)} reviews to {OUTPUT_FILE}")
        else:
            print("No reviews found. Page structure might differ.")
            
    except Exception as e:
        print(f"Error: {e}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()

--- Accessing https://www.uniqlo.com/my/en/products/E479078-000/reviews ---
   -> Checking for Cookie Banner...
      No banner found or already closed.
   -> Starting to load reviews...
      Clicked 'View more' (1)
      Clicked 'View more' (2)
      Clicked 'View more' (3)
      Clicked 'View more' (4)
      Clicked 'View more' (5)
      Clicked 'View more' (6)
      Clicked 'View more' (7)
      Clicked 'View more' (8)
      Clicked 'View more' (9)
      Clicked 'View more' (10)
   -> No more 'View more' buttons found. All loaded.
   -> Found 60 review blocks.
SUCCESS! Saved 54 reviews to uniqlo_malaysia_new_product_reviews.csv


In [71]:
import pandas as pd

# Load your main V4 (or V5) dataset
df_main = pd.read_csv("UNIQLO_GLOBAL_REVIEWS_ALL_V3.csv") 

# Load the new Malaysia file
df_new_my = pd.read_csv("uniqlo_malaysia_new_product_reviews.csv")

# Combine
df_final = pd.concat([df_main, df_new_my], ignore_index=True)

# Save
df_final.to_csv("UNIQLO_GLOBAL_REVIEWS_ALL_FINAL.csv", index=False)
print(f"Merged! New Total: {len(df_final)}")

Merged! New Total: 857


In [9]:
import time
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
OUTPUT_FILE = "uniqlo_my_sweatshirt_reviews.csv"
PRODUCT_URL = "https://www.uniqlo.com/my/en/products/E476602-000/reviews"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    options.page_load_strategy = 'normal'
    driver = uc.Chrome(options=options)
    return driver

def close_cookie_banner(driver):
    """Closes the pop-up banner that blocks buttons."""
    print("   -> Checking for Cookie Banner...")
    try:
        # Wait up to 5 seconds for the "Accept" button
        btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        btn.click()
        print("      Banner Closed.")
        time.sleep(2)
    except:
        print("      No banner found (or already closed).")

def find_review_frame(driver):
    """
    Checks ALL iframes to find the one containing the 'Load more' button.
    Returns the index of the correct iframe.
    """
    print("   -> Hunting for the 'Load more' button inside iframes...")
    
    # 1. Check Main Page first
    if is_button_present(driver):
        print("      Found button on MAIN PAGE.")
        return -1 # -1 means "Main Page"
    
    # 2. Check Iframes
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    print(f"      Found {len(iframes)} potential iframes. Checking them...")
    
    for i, frame in enumerate(iframes):
        try:
            driver.switch_to.default_content()
            driver.switch_to.frame(frame)
            
            # Scroll to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            
            if is_button_present(driver):
                print(f"      !!! FOUND BUTTON IN IFRAME #{i} !!!")
                return i
        except:
            continue
            
    print("      Could not find button. It might be auto-loading or empty.")
    return None

def is_button_present(driver):
    """Helper to check if the button exists and is visible."""
    # List of possible text Uniqlo uses
    xpaths = [
        "//button[contains(text(), 'View more')]",
        "//button[contains(text(), 'View More')]",
        "//button[contains(text(), 'Load more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed():
                return True
        except:
            continue
    return False

def click_load_more(driver):
    """Clicks the button if found."""
    xpaths = [
        "//button[contains(text(), 'View more')]",
        "//button[contains(text(), 'View More')]",
        "//button[contains(text(), 'Load more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed():
                driver.execute_script("arguments[0].click();", btn)
                return True
        except:
            continue
    return False

def scrape_data(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    # Uniqlo reviews usually have "out of 5 stars" text
    potential_blocks = soup.find_all(string=lambda text: text and "out of 5 stars" in text)
    print(f"   -> Extracting... Found {len(potential_blocks)} review blocks.")

    for rating_text in potential_blocks:
        try:
            container = rating_text.find_parent('div').find_parent('div')
            if not container: continue

            texts = list(container.stripped_strings)
            
            # Default Values
            rating = rating_text.strip().split()[0]
            published_at = "N/A"
            author = "Anonymous"
            feedback = ""
            likes = 0
            
            for t in texts:
                # Date (dd/mm/yyyy)
                if len(t) == 10 and t[2] == '/' and t[5] == '/':
                    published_at = t
                # Author Info
                if any(k in t for k in ["Male", "Female", "Height", "Weight", "Shoe size"]):
                    author = t
                # Likes
                if "Helpful" in t and "(" in t:
                     try: likes = int(t.split('(')[1].split(')')[0])
                     except: pass
            
            # Feedback Body
            possible_bodies = [t for t in texts if len(t) > 5 and t != author and "out of 5" not in t]
            if possible_bodies:
                feedback = max(possible_bodies, key=len)

            if feedback:
                reviews_data.append({
                    "Continent": "Asia",
                    "Country": "Malaysia",
                    "Source": "Uniqlo MY Official",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
        except:
            continue
            
    return reviews_data

def process_product(driver, url):
    print(f"--- Accessing {url} ---")
    driver.get(url)
    time.sleep(5)
    
    close_cookie_banner(driver)
    
    # 1. Find where the reviews are (Main Page or Iframe)
    frame_index = find_review_frame(driver)
    
    # 2. Click Loop
    click_count = 0
    max_clicks = 100 # Adjust if there are thousands of reviews
    
    while click_count < max_clicks:
        try:
            # Ensure we are in the right frame
            if frame_index != -1:
                driver.switch_to.default_content()
                frames = driver.find_elements(By.TAG_NAME, "iframe")
                if len(frames) > frame_index:
                    driver.switch_to.frame(frames[frame_index])
            
            # Scroll to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            
            if click_load_more(driver):
                print(f"      Clicked 'Load more' ({click_count+1})")
                click_count += 1
                # Wait for new reviews to load
                time.sleep(3) 
            else:
                print("      Button gone. All reviews loaded.")
                break
        except Exception as e:
            print(f"      Loop Error: {e}")
            break
            
    # 3. Extract Data
    print("   -> Parsing final data...")
    # Refresh frame context to get full source
    if frame_index != -1:
        driver.switch_to.default_content()
        frames = driver.find_elements(By.TAG_NAME, "iframe")
        if len(frames) > frame_index:
            driver.switch_to.frame(frames[frame_index])
            
    html = driver.page_source
    data = scrape_data(html)
    
    if data:
        df = pd.DataFrame(data)
        # Remove duplicates
        df.drop_duplicates(subset=['Feedback'], inplace=True)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"SUCCESS! Saved {len(df)} reviews to {OUTPUT_FILE}")
    else:
        print("No reviews extracted. Check if selectors match.")

def main():
    driver = get_driver()
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)

    try:
        process_product(driver, PRODUCT_URL)
    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE.")

if __name__ == "__main__":
    main()

--- Accessing https://www.uniqlo.com/my/en/products/E476602-000/reviews ---
   -> Checking for Cookie Banner...
      Banner Closed.
   -> Hunting for the 'Load more' button inside iframes...
      Found button on MAIN PAGE.
      Clicked 'Load more' (1)
      Clicked 'Load more' (2)
      Clicked 'Load more' (3)
      Clicked 'Load more' (4)
      Clicked 'Load more' (5)
      Clicked 'Load more' (6)
      Clicked 'Load more' (7)
      Clicked 'Load more' (8)
      Clicked 'Load more' (9)
      Clicked 'Load more' (10)
      Clicked 'Load more' (11)
      Clicked 'Load more' (12)
      Clicked 'Load more' (13)
      Clicked 'Load more' (14)
      Clicked 'Load more' (15)
      Clicked 'Load more' (16)
      Clicked 'Load more' (17)
      Clicked 'Load more' (18)
      Clicked 'Load more' (19)
      Clicked 'Load more' (20)
      Clicked 'Load more' (21)
      Clicked 'Load more' (22)
      Button gone. All reviews loaded.
   -> Parsing final data...
   -> Extracting... Found 126 revie

In [11]:
import pandas as pd

# 1. Load your MAIN dataset (Replace with your latest filename)
df_main = pd.read_csv("UNIQLO_GLOBAL_REVIEWS_ALL_FINAL.csv") 

# 2. Load the new Sweatshirt file
df_new = pd.read_csv("uniqlo_my_sweatshirt_reviews.csv")

# 3. Combine
df_final = pd.concat([df_main, df_new], ignore_index=True)

# 4. Save
df_final.to_csv("UNIQLO_PROJECT_DATASET_UPDATED.csv", index=False)
print(f"Dataset Updated! Total Reviews: {len(df_final)}")

Dataset Updated! Total Reviews: 977


In [17]:
import time
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
OUTPUT_FILE = "uniqlo_my_E465196_reviews.csv"
PRODUCT_URL = "https://www.uniqlo.com/my/en/products/E465196-000/reviews"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # Use 'normal' to ensure basic elements load before we start
    options.page_load_strategy = 'normal' 
    driver = uc.Chrome(options=options)
    # Set long timeouts to prevent "Read timed out"
    driver.set_script_timeout(120)
    driver.set_page_load_timeout(120)
    return driver

def close_cookie_banner(driver):
    """Closes the cookie banner if it exists."""
    print("   -> Checking for Cookie Banner...")
    try:
        # Wait up to 8 seconds for the banner
        btn = WebDriverWait(driver, 8).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        btn.click()
        print("      Banner Closed.")
        time.sleep(2)
    except:
        print("      No banner found (or already closed).")

def find_review_frame(driver):
    """
    Scans for the iframe containing the 'Load more' button.
    Returns the index of the correct iframe.
    """
    print("   -> Scanning for Review Iframe...")
    
    # 1. Check Main Page
    if is_button_visible(driver):
        print("      Found button on Main Page.")
        return -1
    
    # 2. Check Iframes
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    print(f"      Found {len(iframes)} iframes. Checking content...")
    
    for i, frame in enumerate(iframes):
        try:
            driver.switch_to.default_content()
            # Refetch to avoid stale elements
            current_frames = driver.find_elements(By.TAG_NAME, "iframe")
            if i >= len(current_frames): break
            
            driver.switch_to.frame(current_frames[i])
            
            # Scroll to wake up the iframe
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            
            if is_button_visible(driver):
                print(f"      !!! Found 'Load More' in Iframe #{i} !!!")
                return i
        except:
            continue
            
    print("      Warning: Could not find 'Load More' button. (Maybe already loaded?)")
    return None

def is_button_visible(driver):
    """Checks for the existence of the button."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed(): return True
        except:
            continue
    return False

def click_load_button(driver):
    """Clicks the button safely."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed():
                # Use JavaScript click to bypass overlays
                driver.execute_script("arguments[0].click();", btn)
                return True
        except:
            continue
    return False

def scrape_data(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    potential_blocks = soup.find_all(string=lambda text: text and "out of 5 stars" in text)
    
    for rating_text in potential_blocks:
        try:
            container = rating_text.find_parent('div').find_parent('div')
            if not container: continue

            texts = list(container.stripped_strings)
            rating = rating_text.strip().split()[0]
            published_at = "N/A"
            author = "Anonymous"
            feedback = ""
            likes = 0
            
            for t in texts:
                if len(t) == 10 and t[2] == '/' and t[5] == '/': published_at = t
                if any(k in t for k in ["Male", "Female", "Height", "Weight"]): author = t
                if "Helpful" in t and "(" in t:
                     try: likes = int(t.split('(')[1].split(')')[0])
                     except: pass
            
            possible_bodies = [t for t in texts if len(t) > 5 and t != author and "out of 5" not in t]
            if possible_bodies: feedback = max(possible_bodies, key=len)

            if feedback:
                reviews_data.append({
                    "Continent": "Asia",
                    "Country": "Malaysia",
                    "Source": "Uniqlo MY Official",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
        except:
            continue
    return reviews_data

def process_product(driver, url):
    print(f"--- Accessing {url} ---")
    driver.get(url)
    time.sleep(5)
    close_cookie_banner(driver)
    
    # 1. Locate Frame
    frame_index = find_review_frame(driver)
    
    # 2. Click Loop
    click_count = 0
    # Approx 750 reviews / 6 reviews per click = ~125 clicks
    max_clicks = 150 
    
    while click_count < max_clicks:
        try:
            # Ensure we are in the correct frame
            if frame_index != -1 and frame_index is not None:
                driver.switch_to.default_content()
                frames = driver.find_elements(By.TAG_NAME, "iframe")
                if len(frames) > frame_index:
                    driver.switch_to.frame(frames[frame_index])
            
            # Scroll
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1) # Short pause before looking for button
            
            if click_load_button(driver):
                print(f"      Clicked 'Load more' ({click_count+1})")
                click_count += 1
                # WAIT 3 SECONDS - Critical for stability
                time.sleep(3)
            else:
                print("      Button gone. All reviews loaded.")
                break
                
        except Exception as e:
            print(f"      Loop Error: {e}")
            break

    # 3. Extract Final Data
    print("   -> Extracting data...")
    if frame_index != -1 and frame_index is not None:
        driver.switch_to.default_content()
        frames = driver.find_elements(By.TAG_NAME, "iframe")
        if len(frames) > frame_index:
            driver.switch_to.frame(frames[frame_index])
            
    html = driver.page_source
    data = scrape_data(html)
    
    if data:
        df = pd.DataFrame(data)
        df.drop_duplicates(subset=['Feedback'], inplace=True)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"SUCCESS! Saved {len(df)} reviews to {OUTPUT_FILE}")
    else:
        print("No reviews extracted.")

def main():
    driver = get_driver()
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)

    try:
        process_product(driver, PRODUCT_URL)
    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE.")

if __name__ == "__main__":
    main()

--- Accessing https://www.uniqlo.com/my/en/products/E465196-000/reviews ---
   -> Checking for Cookie Banner...
      Banner Closed.
   -> Scanning for Review Iframe...
      Found button on Main Page.
      Clicked 'Load more' (1)
      Clicked 'Load more' (2)
      Clicked 'Load more' (3)
      Clicked 'Load more' (4)
      Clicked 'Load more' (5)
      Clicked 'Load more' (6)
      Clicked 'Load more' (7)
      Clicked 'Load more' (8)
      Clicked 'Load more' (9)
      Clicked 'Load more' (10)
      Clicked 'Load more' (11)
      Clicked 'Load more' (12)
      Clicked 'Load more' (13)
      Clicked 'Load more' (14)
      Clicked 'Load more' (15)
      Clicked 'Load more' (16)
      Clicked 'Load more' (17)
      Clicked 'Load more' (18)
      Clicked 'Load more' (19)
      Clicked 'Load more' (20)
      Clicked 'Load more' (21)
      Clicked 'Load more' (22)
      Clicked 'Load more' (23)
      Clicked 'Load more' (24)
      Clicked 'Load more' (25)
      Clicked 'Load more' (26)
 

In [21]:
import pandas as pd

# Load your MAIN dataset (Check your file name!)
df_main = pd.read_csv("UNIQLO_PROJECT_DATASET_UPDATED.csv") 

# Load the new file
df_new = pd.read_csv("uniqlo_my_E465196_reviews.csv")

# Combine
df_final = pd.concat([df_main, df_new], ignore_index=True)

# Save
df_final.to_csv("UNIQLO_PROJECT_DATASET_ALL.csv", index=False)
print(f"Total Reviews: {len(df_final)}")

Total Reviews: 1117


In [23]:
import time
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
OUTPUT_FILE = "uniqlo_my_E447780_reviews.csv"
PRODUCT_URL = "https://www.uniqlo.com/my/en/products/E447780-000/reviews"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # Use 'normal' to ensure basic elements load before we start
    options.page_load_strategy = 'normal' 
    driver = uc.Chrome(options=options)
    # Set long timeouts to prevent "Read timed out"
    driver.set_script_timeout(120)
    driver.set_page_load_timeout(120)
    return driver

def close_cookie_banner(driver):
    """Closes the cookie banner if it exists."""
    print("   -> Checking for Cookie Banner...")
    try:
        # Wait up to 8 seconds for the banner
        btn = WebDriverWait(driver, 8).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        btn.click()
        print("      Banner Closed.")
        time.sleep(2)
    except:
        print("      No banner found (or already closed).")

def find_review_frame(driver):
    """
    Scans for the iframe containing the 'Load more' button.
    Returns the index of the correct iframe.
    """
    print("   -> Scanning for Review Iframe...")
    
    # 1. Check Main Page
    if is_button_visible(driver):
        print("      Found button on Main Page.")
        return -1
    
    # 2. Check Iframes
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    print(f"      Found {len(iframes)} iframes. Checking content...")
    
    for i, frame in enumerate(iframes):
        try:
            driver.switch_to.default_content()
            # Refetch to avoid stale elements
            current_frames = driver.find_elements(By.TAG_NAME, "iframe")
            if i >= len(current_frames): break
            
            driver.switch_to.frame(current_frames[i])
            
            # Scroll to wake up the iframe
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            
            if is_button_visible(driver):
                print(f"      !!! Found 'Load More' in Iframe #{i} !!!")
                return i
        except:
            continue
            
    print("      Warning: Could not find 'Load More' button. (Maybe already loaded?)")
    return None

def is_button_visible(driver):
    """Checks for the existence of the button."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed(): return True
        except:
            continue
    return False

def click_load_button(driver):
    """Clicks the button safely."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed():
                # Use JavaScript click to bypass overlays
                driver.execute_script("arguments[0].click();", btn)
                return True
        except:
            continue
    return False

def scrape_data(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    potential_blocks = soup.find_all(string=lambda text: text and "out of 5 stars" in text)
    
    for rating_text in potential_blocks:
        try:
            container = rating_text.find_parent('div').find_parent('div')
            if not container: continue

            texts = list(container.stripped_strings)
            rating = rating_text.strip().split()[0]
            published_at = "N/A"
            author = "Anonymous"
            feedback = ""
            likes = 0
            
            for t in texts:
                if len(t) == 10 and t[2] == '/' and t[5] == '/': published_at = t
                if any(k in t for k in ["Male", "Female", "Height", "Weight"]): author = t
                if "Helpful" in t and "(" in t:
                     try: likes = int(t.split('(')[1].split(')')[0])
                     except: pass
            
            possible_bodies = [t for t in texts if len(t) > 5 and t != author and "out of 5" not in t]
            if possible_bodies: feedback = max(possible_bodies, key=len)

            if feedback:
                reviews_data.append({
                    "Continent": "Asia",
                    "Country": "Malaysia",
                    "Source": "Uniqlo MY Official",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
        except:
            continue
    return reviews_data

def process_product(driver, url):
    print(f"--- Accessing {url} ---")
    driver.get(url)
    time.sleep(5)
    close_cookie_banner(driver)
    
    # 1. Locate Frame
    frame_index = find_review_frame(driver)
    
    # 2. Click Loop
    click_count = 0
    # Approx 750 reviews / 6 reviews per click = ~125 clicks
    max_clicks = 200 
    
    while click_count < max_clicks:
        try:
            # Ensure we are in the correct frame
            if frame_index != -1 and frame_index is not None:
                driver.switch_to.default_content()
                frames = driver.find_elements(By.TAG_NAME, "iframe")
                if len(frames) > frame_index:
                    driver.switch_to.frame(frames[frame_index])
            
            # Scroll
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1) # Short pause before looking for button
            
            if click_load_button(driver):
                print(f"      Clicked 'Load more' ({click_count+1})")
                click_count += 1
                # WAIT 3 SECONDS - Critical for stability
                time.sleep(3)
            else:
                print("      Button gone. All reviews loaded.")
                break
                
        except Exception as e:
            print(f"      Loop Error: {e}")
            break

    # 3. Extract Final Data
    print("   -> Extracting data...")
    if frame_index != -1 and frame_index is not None:
        driver.switch_to.default_content()
        frames = driver.find_elements(By.TAG_NAME, "iframe")
        if len(frames) > frame_index:
            driver.switch_to.frame(frames[frame_index])
            
    html = driver.page_source
    data = scrape_data(html)
    
    if data:
        df = pd.DataFrame(data)
        df.drop_duplicates(subset=['Feedback'], inplace=True)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"SUCCESS! Saved {len(df)} reviews to {OUTPUT_FILE}")
    else:
        print("No reviews extracted.")

def main():
    driver = get_driver()
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)

    try:
        process_product(driver, PRODUCT_URL)
    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE.")

if __name__ == "__main__":
    main()

--- Accessing https://www.uniqlo.com/my/en/products/E447780-000/reviews ---
   -> Checking for Cookie Banner...
      Banner Closed.
   -> Scanning for Review Iframe...
      Found button on Main Page.
      Clicked 'Load more' (1)
      Clicked 'Load more' (2)
      Clicked 'Load more' (3)
      Clicked 'Load more' (4)
      Clicked 'Load more' (5)
      Clicked 'Load more' (6)
      Clicked 'Load more' (7)
      Clicked 'Load more' (8)
      Clicked 'Load more' (9)
      Clicked 'Load more' (10)
      Clicked 'Load more' (11)
      Clicked 'Load more' (12)
      Clicked 'Load more' (13)
      Clicked 'Load more' (14)
      Clicked 'Load more' (15)
      Clicked 'Load more' (16)
      Clicked 'Load more' (17)
      Clicked 'Load more' (18)
      Clicked 'Load more' (19)
      Clicked 'Load more' (20)
      Clicked 'Load more' (21)
      Clicked 'Load more' (22)
      Clicked 'Load more' (23)
      Clicked 'Load more' (24)
      Clicked 'Load more' (25)
      Clicked 'Load more' (26)
 

In [25]:
import pandas as pd

# Load your MAIN dataset (Check your file name!)
df_main = pd.read_csv("UNIQLO_PROJECT_DATASET_ALL.csv") 

# Load the new file
df_new = pd.read_csv("uniqlo_my_E447780_reviews.csv")

# Combine
df_final = pd.concat([df_main, df_new], ignore_index=True)

# Save
df_final.to_csv("UNIQLO_PROJECT_DATASET_ALL2.csv", index=False)
print(f"Total Reviews: {len(df_final)}")

Total Reviews: 1272


In [31]:
import time
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
OUTPUT_FILE = "uniqlo_my_E466026_reviews.csv"
PRODUCT_URL = "https://www.uniqlo.com/my/en/products/E466026-000/reviews"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # Use 'normal' to ensure basic elements load before we start
    options.page_load_strategy = 'normal' 
    driver = uc.Chrome(options=options)
    # Set long timeouts to prevent "Read timed out"
    driver.set_script_timeout(120)
    driver.set_page_load_timeout(120)
    return driver

def close_cookie_banner(driver):
    """Closes the cookie banner if it exists."""
    print("   -> Checking for Cookie Banner...")
    try:
        # Wait up to 8 seconds for the banner
        btn = WebDriverWait(driver, 8).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        btn.click()
        print("      Banner Closed.")
        time.sleep(2)
    except:
        print("      No banner found (or already closed).")

def find_review_frame(driver):
    """
    Scans for the iframe containing the 'Load more' button.
    Returns the index of the correct iframe.
    """
    print("   -> Scanning for Review Iframe...")
    
    # 1. Check Main Page
    if is_button_visible(driver):
        print("      Found button on Main Page.")
        return -1
    
    # 2. Check Iframes
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    print(f"      Found {len(iframes)} iframes. Checking content...")
    
    for i, frame in enumerate(iframes):
        try:
            driver.switch_to.default_content()
            # Refetch to avoid stale elements
            current_frames = driver.find_elements(By.TAG_NAME, "iframe")
            if i >= len(current_frames): break
            
            driver.switch_to.frame(current_frames[i])
            
            # Scroll to wake up the iframe
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            
            if is_button_visible(driver):
                print(f"      !!! Found 'Load More' in Iframe #{i} !!!")
                return i
        except:
            continue
            
    print("      Warning: Could not find 'Load More' button. (Maybe already loaded?)")
    return None

def is_button_visible(driver):
    """Checks for the existence of the button."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed(): return True
        except:
            continue
    return False

def click_load_button(driver):
    """Clicks the button safely."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed():
                # Use JavaScript click to bypass overlays
                driver.execute_script("arguments[0].click();", btn)
                return True
        except:
            continue
    return False

def scrape_data(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    potential_blocks = soup.find_all(string=lambda text: text and "out of 5 stars" in text)
    
    for rating_text in potential_blocks:
        try:
            container = rating_text.find_parent('div').find_parent('div')
            if not container: continue

            texts = list(container.stripped_strings)
            rating = rating_text.strip().split()[0]
            published_at = "N/A"
            author = "Anonymous"
            feedback = ""
            likes = 0
            
            for t in texts:
                if len(t) == 10 and t[2] == '/' and t[5] == '/': published_at = t
                if any(k in t for k in ["Male", "Female", "Height", "Weight"]): author = t
                if "Helpful" in t and "(" in t:
                     try: likes = int(t.split('(')[1].split(')')[0])
                     except: pass
            
            possible_bodies = [t for t in texts if len(t) > 5 and t != author and "out of 5" not in t]
            if possible_bodies: feedback = max(possible_bodies, key=len)

            if feedback:
                reviews_data.append({
                    "Continent": "Asia",
                    "Country": "Malaysia",
                    "Source": "Uniqlo MY Official",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
        except:
            continue
    return reviews_data

def process_product(driver, url):
    print(f"--- Accessing {url} ---")
    driver.get(url)
    time.sleep(5)
    close_cookie_banner(driver)
    
    # 1. Locate Frame
    frame_index = find_review_frame(driver)
    
    # 2. Click Loop
    click_count = 0
    # Approx 750 reviews / 6 reviews per click = ~125 clicks
    max_clicks = 500 
    
    while click_count < max_clicks:
        try:
            # Ensure we are in the correct frame
            if frame_index != -1 and frame_index is not None:
                driver.switch_to.default_content()
                frames = driver.find_elements(By.TAG_NAME, "iframe")
                if len(frames) > frame_index:
                    driver.switch_to.frame(frames[frame_index])
            
            # Scroll
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1) # Short pause before looking for button
            
            if click_load_button(driver):
                print(f"      Clicked 'Load more' ({click_count+1})")
                click_count += 1
                # WAIT 3 SECONDS - Critical for stability
                time.sleep(3)
            else:
                print("      Button gone. All reviews loaded.")
                break
                
        except Exception as e:
            print(f"      Loop Error: {e}")
            break

    # 3. Extract Final Data
    print("   -> Extracting data...")
    if frame_index != -1 and frame_index is not None:
        driver.switch_to.default_content()
        frames = driver.find_elements(By.TAG_NAME, "iframe")
        if len(frames) > frame_index:
            driver.switch_to.frame(frames[frame_index])
            
    html = driver.page_source
    data = scrape_data(html)
    
    if data:
        df = pd.DataFrame(data)
        df.drop_duplicates(subset=['Feedback'], inplace=True)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"SUCCESS! Saved {len(df)} reviews to {OUTPUT_FILE}")
    else:
        print("No reviews extracted.")

def main():
    driver = get_driver()
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)

    try:
        process_product(driver, PRODUCT_URL)
    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE.")

if __name__ == "__main__":
    main()

--- Accessing https://www.uniqlo.com/my/en/products/E466026-000/reviews ---
   -> Checking for Cookie Banner...
      Banner Closed.
   -> Scanning for Review Iframe...
      Found button on Main Page.
      Clicked 'Load more' (1)
      Clicked 'Load more' (2)
      Clicked 'Load more' (3)
      Clicked 'Load more' (4)
      Clicked 'Load more' (5)
      Clicked 'Load more' (6)
      Clicked 'Load more' (7)
      Clicked 'Load more' (8)
      Clicked 'Load more' (9)
      Clicked 'Load more' (10)
      Clicked 'Load more' (11)
      Clicked 'Load more' (12)
      Clicked 'Load more' (13)
      Clicked 'Load more' (14)
      Clicked 'Load more' (15)
      Clicked 'Load more' (16)
      Clicked 'Load more' (17)
      Clicked 'Load more' (18)
      Clicked 'Load more' (19)
      Clicked 'Load more' (20)
      Clicked 'Load more' (21)
      Clicked 'Load more' (22)
      Clicked 'Load more' (23)
      Button gone. All reviews loaded.
   -> Extracting data...
SUCCESS! Saved 126 reviews to 

In [33]:
import pandas as pd

# Load your MAIN dataset (Check your file name!)
df_main = pd.read_csv("UNIQLO_PROJECT_DATASET_ALL2.csv") 

# Load the new file
df_new = pd.read_csv("uniqlo_my_E466026_reviews.csv")

# Combine
df_final = pd.concat([df_main, df_new], ignore_index=True)

# Save
df_final.to_csv("UNIQLO_PROJECT_DATASET_ALL3.csv", index=False)
print(f"Total Reviews: {len(df_final)}")

Total Reviews: 1398


In [35]:
import time
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os

# --- CONFIGURATION ---
OUTPUT_FILE = "uniqlo_my_E455492_reviews.csv"
PRODUCT_URL = "https://www.uniqlo.com/my/en/products/E455492-000/reviews"

def get_driver():
    options = uc.ChromeOptions()
    options.add_argument('--no-first-run')
    # Use 'normal' to ensure basic elements load before we start
    options.page_load_strategy = 'normal' 
    driver = uc.Chrome(options=options)
    # Set long timeouts to prevent "Read timed out"
    driver.set_script_timeout(120)
    driver.set_page_load_timeout(120)
    return driver

def close_cookie_banner(driver):
    """Closes the cookie banner if it exists."""
    print("   -> Checking for Cookie Banner...")
    try:
        # Wait up to 8 seconds for the banner
        btn = WebDriverWait(driver, 8).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        btn.click()
        print("      Banner Closed.")
        time.sleep(2)
    except:
        print("      No banner found (or already closed).")

def find_review_frame(driver):
    """
    Scans for the iframe containing the 'Load more' button.
    Returns the index of the correct iframe.
    """
    print("   -> Scanning for Review Iframe...")
    
    # 1. Check Main Page
    if is_button_visible(driver):
        print("      Found button on Main Page.")
        return -1
    
    # 2. Check Iframes
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    print(f"      Found {len(iframes)} iframes. Checking content...")
    
    for i, frame in enumerate(iframes):
        try:
            driver.switch_to.default_content()
            # Refetch to avoid stale elements
            current_frames = driver.find_elements(By.TAG_NAME, "iframe")
            if i >= len(current_frames): break
            
            driver.switch_to.frame(current_frames[i])
            
            # Scroll to wake up the iframe
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            
            if is_button_visible(driver):
                print(f"      !!! Found 'Load More' in Iframe #{i} !!!")
                return i
        except:
            continue
            
    print("      Warning: Could not find 'Load More' button. (Maybe already loaded?)")
    return None

def is_button_visible(driver):
    """Checks for the existence of the button."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed(): return True
        except:
            continue
    return False

def click_load_button(driver):
    """Clicks the button safely."""
    xpaths = [
        "//button[contains(text(), 'Load more')]",
        "//button[contains(text(), 'View more')]",
        "//div[contains(@class, 'load-more')]",
        "//button[contains(@class, 'bv-content-btn-load-more')]"
    ]
    for xpath in xpaths:
        try:
            btn = driver.find_element(By.XPATH, xpath)
            if btn.is_displayed():
                # Use JavaScript click to bypass overlays
                driver.execute_script("arguments[0].click();", btn)
                return True
        except:
            continue
    return False

def scrape_data(driver_source):
    soup = BeautifulSoup(driver_source, 'html.parser')
    reviews_data = []
    
    potential_blocks = soup.find_all(string=lambda text: text and "out of 5 stars" in text)
    
    for rating_text in potential_blocks:
        try:
            container = rating_text.find_parent('div').find_parent('div')
            if not container: continue

            texts = list(container.stripped_strings)
            rating = rating_text.strip().split()[0]
            published_at = "N/A"
            author = "Anonymous"
            feedback = ""
            likes = 0
            
            for t in texts:
                if len(t) == 10 and t[2] == '/' and t[5] == '/': published_at = t
                if any(k in t for k in ["Male", "Female", "Height", "Weight"]): author = t
                if "Helpful" in t and "(" in t:
                     try: likes = int(t.split('(')[1].split(')')[0])
                     except: pass
            
            possible_bodies = [t for t in texts if len(t) > 5 and t != author and "out of 5" not in t]
            if possible_bodies: feedback = max(possible_bodies, key=len)

            if feedback:
                reviews_data.append({
                    "Continent": "Asia",
                    "Country": "Malaysia",
                    "Source": "Uniqlo MY Official",
                    "Author": author,
                    "Rating": rating,
                    "Published At": published_at,
                    "Like Count": likes,
                    "Feedback": feedback
                })
        except:
            continue
    return reviews_data

def process_product(driver, url):
    print(f"--- Accessing {url} ---")
    driver.get(url)
    time.sleep(5)
    close_cookie_banner(driver)
    
    # 1. Locate Frame
    frame_index = find_review_frame(driver)
    
    # 2. Click Loop
    click_count = 0
    # Approx 750 reviews / 6 reviews per click = ~125 clicks
    max_clicks = 500 
    
    while click_count < max_clicks:
        try:
            # Ensure we are in the correct frame
            if frame_index != -1 and frame_index is not None:
                driver.switch_to.default_content()
                frames = driver.find_elements(By.TAG_NAME, "iframe")
                if len(frames) > frame_index:
                    driver.switch_to.frame(frames[frame_index])
            
            # Scroll
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1) # Short pause before looking for button
            
            if click_load_button(driver):
                print(f"      Clicked 'Load more' ({click_count+1})")
                click_count += 1
                # WAIT 3 SECONDS - Critical for stability
                time.sleep(3)
            else:
                print("      Button gone. All reviews loaded.")
                break
                
        except Exception as e:
            print(f"      Loop Error: {e}")
            break

    # 3. Extract Final Data
    print("   -> Extracting data...")
    if frame_index != -1 and frame_index is not None:
        driver.switch_to.default_content()
        frames = driver.find_elements(By.TAG_NAME, "iframe")
        if len(frames) > frame_index:
            driver.switch_to.frame(frames[frame_index])
            
    html = driver.page_source
    data = scrape_data(html)
    
    if data:
        df = pd.DataFrame(data)
        df.drop_duplicates(subset=['Feedback'], inplace=True)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"SUCCESS! Saved {len(df)} reviews to {OUTPUT_FILE}")
    else:
        print("No reviews extracted.")

def main():
    driver = get_driver()
    if not os.path.exists(OUTPUT_FILE):
        pd.DataFrame(columns=["Continent", "Country", "Source", "Author", "Rating", "Published At", "Like Count", "Feedback"]).to_csv(OUTPUT_FILE, index=False)

    try:
        process_product(driver, PRODUCT_URL)
    except Exception as e:
        print(f"Critical Error: {e}")
    finally:
        driver.quit()
        print(f"\nDONE.")

if __name__ == "__main__":
    main()

--- Accessing https://www.uniqlo.com/my/en/products/E455492-000/reviews ---
   -> Checking for Cookie Banner...
      Banner Closed.
   -> Scanning for Review Iframe...
      Found button on Main Page.
      Clicked 'Load more' (1)
      Clicked 'Load more' (2)
      Clicked 'Load more' (3)
      Clicked 'Load more' (4)
      Clicked 'Load more' (5)
      Clicked 'Load more' (6)
      Clicked 'Load more' (7)
      Clicked 'Load more' (8)
      Clicked 'Load more' (9)
      Clicked 'Load more' (10)
      Clicked 'Load more' (11)
      Clicked 'Load more' (12)
      Clicked 'Load more' (13)
      Clicked 'Load more' (14)
      Clicked 'Load more' (15)
      Clicked 'Load more' (16)
      Clicked 'Load more' (17)
      Clicked 'Load more' (18)
      Clicked 'Load more' (19)
      Clicked 'Load more' (20)
      Clicked 'Load more' (21)
      Clicked 'Load more' (22)
      Clicked 'Load more' (23)
      Clicked 'Load more' (24)
      Clicked 'Load more' (25)
      Button gone. All reviews l

In [37]:
import pandas as pd

# Load your MAIN dataset (Check your file name!)
df_main = pd.read_csv("UNIQLO_PROJECT_DATASET_ALL3.csv") 

# Load the new file
df_new = pd.read_csv("uniqlo_my_E455492_reviews.csv")

# Combine
df_final = pd.concat([df_main, df_new], ignore_index=True)

# Save
df_final.to_csv("UNIQLO_PROJECT_DATASET_ALL4.csv", index=False)
print(f"Total Reviews: {len(df_final)}")

Total Reviews: 1532
