# Meesho Web Scraping

# Task 1
Web scraping to collect a dataset of 50 products with 500 reviews each, where the reviews are written only by Indian users.

Exported to
- `./data/meesho_product_details.csv` (product details)
- `./data/meesho_product_reviews_translated.csv` (translated reviews of each product)


###

### Extract Links of 50 products from a product page

Import Libraries

In [None]:
import time
import csv
import re
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
import undetected_chromedriver as uc

Meesho Scraper to scrape product links from product listing page

In [None]:
def _card_text(card, xpath):
    """Return the stripped text of the first node under *card* matching *xpath*.

    Used for optional card fields (rating, title, price): any lookup failure
    yields an empty string instead of aborting the product.
    """
    try:
        return card.find_element(By.XPATH, xpath).text.strip()
    except Exception:
        return ""


def scrape_meesho_listing(url, max_products=50, min_reviews=2000, max_scrolls=1000, max_consecutive=50):
    """Scroll a Meesho listing page and collect product cards into a CSV.

    Args:
        url: Listing page to open.
        max_products: Stop once this many products have been collected.
        min_reviews: Products whose parsed review/rating count is below this
            value are skipped (0 disables the filter).
        max_scrolls: Upper bound on the number of scroll steps.
        max_consecutive: Stop after this many consecutive "no progress"
            signals (no new product links and/or unchanged page height).

    Returns:
        A list of dicts with keys ``product_id``, ``product_url``, ``title``,
        ``price``, ``rating`` and ``reviews_count``. The same rows are written
        to ``meesho_product_links.csv`` in the current working directory.

    The browser is always closed before returning, also on errors.
    """
    brave_path = "C:/Program Files/BraveSoftware/Brave-Browser/Application/brave.exe"
    options = uc.ChromeOptions()
    options.binary_location = brave_path
    options.add_argument("--disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")

    driver = uc.Chrome(options=options)
    product_data = []
    seen_links = set()

    try:
        driver.get(url)
        print(f"✅ Navigated to listing page: {url}")

        # Block until at least one product card is present in the DOM.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.ProductListItem__GridCol-sc-1baba2g-0'))
        )

        # Page height is compared after every scroll to notice the page bottom.
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_attempts = 0
        consecutive_no_new = 0
        collected = 0

        # Links seen on the last scroll that produced something new.
        last_seen_products = set()

        while collected < max_products and scroll_attempts < max_scrolls and consecutive_no_new < max_consecutive:
            # Scroll less than one viewport at a time so lazily loaded cards get rendered.
            driver.execute_script("window.scrollBy(0, window.innerHeight * 0.6);")

            # Randomised pause to mimic a human and let content load.
            time.sleep(random.uniform(2.0, 3.0))

            try:
                time.sleep(1)  # Additional small wait for dynamic content

                current_products = driver.find_elements(
                    By.CSS_SELECTOR,
                    'div.ProductListItem__GridCol-sc-1baba2g-0 a[href^="/"]'
                )

                current_product_links = {p.get_attribute("href") for p in current_products}
                if current_product_links - last_seen_products:
                    consecutive_no_new = 0  # Reset counter when new products found
                    last_seen_products = current_product_links.copy()
                else:
                    consecutive_no_new += 1
                    print(f"⚠️ No new products detected (attempt {consecutive_no_new}/{max_consecutive})")

                for product_link in current_products:
                    try:
                        # The enclosing card holds title, price, rating and review count.
                        product = product_link.find_element(By.XPATH, "./ancestor::div[contains(@class, 'ProductListItem__GridCol-sc-')]")

                        link = product_link.get_attribute("href")
                        if not link:
                            # get_attribute returns None when the attribute is gone;
                            # skip this anchor instead of crashing the whole batch.
                            continue
                        if not link.startswith("http"):
                            link = "https://www.meesho.com" + link

                        # Skip if already processed
                        if link in seen_links:
                            continue

                        # Product ID is the path segment after "/p/".
                        product_id = ""
                        url_match = re.search(r'/p/([a-zA-Z0-9]+)', link)
                        if url_match:
                            product_id = url_match.group(1)

                        # Review count is mandatory: without it the min_reviews
                        # filter cannot be applied, so the card is skipped for now
                        # (it is not marked as seen and is retried on later scrolls).
                        reviews = 0
                        try:
                            reviews_elem = product.find_element(
                                By.XPATH,
                                ".//span[contains(@class, 'NewProductCardstyled__RatingCount') or contains(@class, 'RatingCount')]"
                            )
                            reviews_text = reviews_elem.text
                            reviews = int(re.sub(r'\D', '', reviews_text.split()[0]))
                        except Exception as e:
                            print(f"⚠️ Couldn't extract reviews for {link}: {str(e)}")
                            continue

                        # Skip products with too few reviews if specified
                        if min_reviews > 0 and reviews < min_reviews:
                            seen_links.add(link)  # Mark as seen to avoid reprocessing
                            continue

                        # Optional fields: empty string when the node is missing.
                        rating = _card_text(product, ".//span[contains(@class, 'Rating__StyledPill')]//span")
                        title = _card_text(product, ".//p[contains(@class, 'NewProductCardstyled__StyledDesktopProductTitle')]")
                        price = _card_text(product, ".//h5[contains(@class, 'sc-eDvSVe dwCrSh')]")

                        product_data.append({
                            "product_id": product_id,
                            "product_url": link,
                            "title": title,
                            "price": price,
                            "rating": rating,
                            "reviews_count": reviews
                        })
                        seen_links.add(link)
                        collected += 1
                        print(f"✅ Collected {collected}/{max_products} - {title} - {reviews} reviews")

                        if collected >= max_products:
                            break

                    except (NoSuchElementException, StaleElementReferenceException) as e:
                        print(f"⚠️ Error processing product: {str(e)}")
                        continue

                # An unchanged page height counts as a further "no progress" signal.
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    consecutive_no_new += 1
                    print(f"⚠️ Reached bottom of page or no new height (attempt {consecutive_no_new}/{max_consecutive})")
                else:
                    last_height = new_height
                    consecutive_no_new = 0  # Reset counter when page height changes

            except Exception as e:
                # Best effort: a failed scroll step is logged and the loop goes on.
                print(f"⚠️ Error during scrolling: {str(e)}")

            scroll_attempts += 1

        # Save to CSV
        with open('meesho_product_links.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=["product_id", "product_url", "title", "price", "rating", "reviews_count"])
            writer.writeheader()
            writer.writerows(product_data)
            print(f"\n✅ Saved {len(product_data)} products to meesho_product_links.csv")

        return product_data

    finally:
        driver.quit()
        print("✅ Browser closed")


URL used: the Lipsticks category on Meesho. Only products with at least 2000 ratings are kept, since they are more likely to have 500+ reviews.

In [None]:
# Meesho listing page filtered to the "Lipsticks" category (see the Category[0][label] query parameter).
listing_url = "https://www.meesho.com/lips-makeup/pl/3jd?Category[0][id]=2663&Category[0][label]=Lipsticks&Category[0][payload]=eyJmaWVsZCI6ImxhYmVscy4xIiwib3AiOiJpbiIsInZhbHVlIjoiMjY2MyJ9"

Collecting 50 product links from the product listing

In [None]:
# Run with the default limits (50 products, at least 2000 ratings each); writes meesho_product_links.csv.
scrape_meesho_listing(listing_url)

✅ Navigated to listing page: https://www.meesho.com/lips-makeup/pl/3jd?Category[0][id]=2663&Category[0][label]=Lipsticks&Category[0][payload]=eyJmaWVsZCI6ImxhYmVscy4xIiwib3AiOiJpbiIsInZhbHVlIjoiMjY2MyJ9
✅ Collected 1/50 - Hipbrat, Red/ MatteLiquid/ LipstickPack of 1 - 19451 reviews
✅ Collected 2/50 - Proffesional Attractive Lipsticks - 10146 reviews
✅ Collected 3/50 - Mars Lipsticks - 23253 reviews
✅ Collected 4/50 - Sensational Long Stay Lipsticks - 86674 reviews
✅ Collected 5/50 - Premium Collection Lipsticks - 13710 reviews
✅ Collected 6/50 - Proffesional Long Stay Lipsticks - 12359 reviews
✅ Collected 7/50 - Seven Seas Lip Duo 2 In 1 Lipstick - 5232 reviews
✅ Collected 8/50 - Sensational Intense Lipsticks - 29103 reviews
✅ Collected 9/50 - Sensational Unique Lipsticks - 6250 reviews
✅ Collected 10/50 - Proffesional Intense Lipsticks - 16242 reviews
✅ Collected 11/50 - MILA BEAUTE Rich Matte Longwear Matte Lipstick Combo Set - 2901 reviews
✅ Collected 12/50 - NattyU, Red/ MatteLiqui

[{'product_id': '4obtb7',
  'product_url': 'https://www.meesho.com/hipbrat-red-matteliquid-lipstickpack-of-1/p/4obtb7',
  'title': 'Hipbrat, Red/ MatteLiquid/ LipstickPack of 1',
  'price': '₹103',
  'rating': '3.8',
  'reviews_count': 19451},
 {'product_id': '5k6fjf',
  'product_url': 'https://www.meesho.com/apple-lipstick-2/p/5k6fjf',
  'title': 'Proffesional Attractive Lipsticks',
  'price': '₹106',
  'rating': '3.7',
  'reviews_count': 10146},
 {'product_id': '5b49ob',
  'product_url': 'https://www.meesho.com/mars-ultra-pigmented-super-soft-ultra-matte-lipstick-long-lasting-pack-of-4-shade-f-144-g/p/5b49ob',
  'title': 'Mars Lipsticks',
  'price': '₹350',
  'rating': '4.1',
  'reviews_count': 23253},
 {'product_id': '5utebe',
  'product_url': 'https://www.meesho.com/ronzille-long-lasting-waterproof-non-transfer-liquid-matte-lipstick-set-of-10/p/5utebe',
  'title': 'Sensational Long Stay Lipsticks',
  'price': '₹257',
  'rating': '4.3',
  'reviews_count': 86674},
 {'product_id': '6j

### Scraping Product details from the given links

Import Libraries

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
import csv
import re
import os
import undetected_chromedriver as uc

Meesho Scraper to scrape product details from a product page

In [None]:
def _first_nonempty_text(driver, selectors):
    """Return the stripped text of the first CSS selector that yields non-empty text.

    Selectors are tried in order. A selector that matches nothing, or matches
    an element whose text is empty, is skipped so that later fallbacks still
    get a chance. Returns None when no selector produces text.
    """
    for selector in selectors:
        try:
            text = driver.find_element(By.CSS_SELECTOR, selector).text.strip()
        except NoSuchElementException:
            continue
        if text:
            return text
    return None


def scrape_meesho_product(product_id, product_url, driver):
    """Scrape the details of one Meesho product page.

    Args:
        product_id: Identifier stored in the result as "Product ID".
        product_url: Product page to open.
        driver: An already started Selenium driver; it is not closed here.

    Returns:
        A dict with the keys "Product ID", "Product Name", "Product Link",
        "Overall Rating", "Price", "Reviews Count", "Product Details" and
        "Seller Name", or None when the page could not be scraped (the error
        is printed, not raised).
    """
    try:
        # Navigate to the product page
        driver.get(product_url)
        print(f"✅ Navigated to {product_url}")

        print(f"✅ Product ID: {product_id}")

        # The product-name span doubles as the "page is ready" signal.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.fhfLdV"))
        )

        # Name and price are mandatory: a missing element fails the whole product.
        product_name = driver.find_element(By.CSS_SELECTOR, "span.fhfLdV").text.strip()
        print(f"✅ Product Name: {product_name}")

        price = driver.find_element(By.CSS_SELECTOR, "h4.biMVPh").text.strip()
        print(f"✅ Price: {price}")

        # Rating and review count are optional; each has several fallback selectors.
        try:
            rating = _first_nonempty_text(driver, [
                "span.eLiHuv span.laVOtN",
                "span[label] span",
                ".ShippingInfo__RatingsRow-sc-frp12n-2 span[label] span"
            ])
            if rating:
                print(f"✅ Rating: {rating}")
            else:
                rating = "Not available"
                print("⚠️ Rating not found")

            reviews_text = _first_nonempty_text(driver, [
                "span.eOvght",
                ".ShippingInfo__RatingsRow-sc-frp12n-2 span.eOvght",
                "span.ShippingInfo__OverlineStyled-sc-frp12n-4"
            ])
            if reviews_text:
                print(f"✅ Reviews: {reviews_text}")
            else:
                reviews_text = "Not available"
                print("⚠️ Reviews count not found")

        except Exception as e:
            # Anything other than "element not found" discards both values.
            print(f"⚠️ Error extracting rating/reviews: {e}")
            rating = "Not available"
            reviews_text = "Not available"

        # Product details: one paragraph per line.
        product_details_elements = driver.find_elements(By.CSS_SELECTOR, "div.eFKyvM p.guezwa")
        product_details = "\n".join([element.text.strip() for element in product_details_elements])
        print("✅ Extracted product details")

        # The seller name is taken from the first "[...]" group in the details text.
        seller_name = "Not available"
        seller_match = re.search(r'\[(.*?)\]', product_details)
        if seller_match:
            seller_name = seller_match.group(1)
        print(f"✅ Seller Name: {seller_name}")

        # Newlines are replaced by "|" so each product stays on one CSV line.
        product_data = {
            "Product ID": product_id,
            "Product Name": product_name,
            "Product Link": product_url,
            "Overall Rating": rating,
            "Price": price,
            "Reviews Count": reviews_text,
            "Product Details": product_details.replace("\n", "|"),
            "Seller Name": seller_name,
        }

        print(f"✅ Successfully scraped data for product ID: {product_id}")
        return product_data

    except Exception as e:
        print(f"❌ Error scraping {product_url}: {e}")
        return None

Function to process product links from the csv generated in previous step

In [None]:
def process_links_from_csv(input_csv, output_csv):
    """Scrape product details for every Meesho URL listed in *input_csv*.

    The input is read positionally: column 0 is the product id, column 1 the
    product URL. Rows whose second column does not contain "meesho.com"
    (including a header row) are ignored. One row per successfully scraped
    product is written to *output_csv*.

    The input is validated and read before any browser is started, so a
    missing or empty input returns immediately without launching one.

    Args:
        input_csv: Path of the CSV with product ids and URLs.
        output_csv: Path of the CSV to create (overwritten if present).

    Returns:
        None. Problems are reported with printed messages.
    """
    # Check if input file exists
    if not os.path.exists(input_csv):
        print(f"❌ Input file '{input_csv}' not found!")
        return

    # Read product IDs and URLs from CSV file
    try:
        with open(input_csv, 'r', encoding='utf-8') as file:
            product_data = [
                (row[0].strip(), row[1].strip())
                for row in csv.reader(file)
                if len(row) >= 2 and row[1].strip() and 'meesho.com' in row[1]
            ]
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"❌ Error processing URLs: {e}")
        return

    if not product_data:
        print("❌ No valid Meesho URLs found in the input file!")
        return

    print(f"✅ Found {len(product_data)} URLs to process")

    brave_path = "C:/Program Files/BraveSoftware/Brave-Browser/Application/brave.exe"
    options = uc.ChromeOptions()
    options.binary_location = brave_path
    options.add_argument("--disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")

    driver = uc.Chrome(options=options)

    try:
        # Create output file and write header
        with open(output_csv, 'w', newline='', encoding='utf-8') as output_file:
            fieldnames = [
                "Product ID", "Product Name", "Product Link", "Overall Rating",
                "Price", "Reviews Count", "Product Details", "Seller Name"
            ]
            writer = csv.DictWriter(output_file, fieldnames=fieldnames)
            writer.writeheader()

            # Process each URL
            for i, (product_id, url) in enumerate(product_data):
                print(f"\n[{i+1}/{len(product_data)}] Processing: {url}")
                product_info = scrape_meesho_product(product_id, url, driver)

                if product_info:
                    writer.writerow(product_info)
                    print(f"✅ Added product data to {output_csv}")

                # Add a delay between requests to avoid being blocked
                if i < len(product_data) - 1:  # Don't sleep after the last URL
                    delay = 3 + (2 * (i % 3))  # Cycles through 3, 5 and 7 seconds
                    print(f"⏱️ Waiting {delay} seconds before next request...")
                    time.sleep(delay)

        print(f"\n✅ All products have been processed and saved to {output_csv}")

    except Exception as e:
        print(f"❌ Error processing URLs: {e}")

    finally:
        # Single place where the browser is closed.
        driver.quit()
        print("✅ Browser closed")

File names

In [None]:
# Input: the link list written by scrape_meesho_listing; output: the per-product details.
INPUT_CSV = 'meesho_product_links.csv'
OUTPUT_CSV = 'meesho_product_details.csv'

Scraper Run

In [None]:
# Run the details scraper over every link in INPUT_CSV and write the results to OUTPUT_CSV
process_links_from_csv(INPUT_CSV, OUTPUT_CSV)

✅ Found 50 URLs to process

[1/50] Processing: https://www.meesho.com/hipbrat-red-matteliquid-lipstickpack-of-1/p/4obtb7
✅ Navigated to https://www.meesho.com/hipbrat-red-matteliquid-lipstickpack-of-1/p/4obtb7
✅ Product ID: 4obtb7
✅ Product Name: Hipbrat, Red/ MatteLiquid/ LipstickPack of 1
✅ Price: ₹103
✅ Rating: 3.8
✅ Reviews: 19451 Ratings, 6351 Reviews
✅ Extracted product details
✅ Seller Name: Not available
✅ Successfully scraped data for product ID: 4obtb7
✅ Added product data to meesho_product_details.csv
⏱️ Waiting 3 seconds before next request...

[2/50] Processing: https://www.meesho.com/apple-lipstick-2/p/5k6fjf
✅ Navigated to https://www.meesho.com/apple-lipstick-2/p/5k6fjf
✅ Product ID: 5k6fjf
✅ Product Name: Apple lipstick 2
✅ Price: ₹106
✅ Rating: 3.7
✅ Reviews: 10146 Ratings, 2525 Reviews
✅ Extracted product details
✅ Seller Name: Not available
✅ Successfully scraped data for product ID: 5k6fjf
✅ Added product data to meesho_product_details.csv
⏱️ Waiting 5 seconds befo

### Scraping 500 reviews per product

Importing Libraries

In [None]:
import time
import csv
import os
import random
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
import undetected_chromedriver as uc

Function to save reviews to a CSV file

In [None]:
def save_reviews_to_csv(reviews, filename):
    """Append review rows to a CSV file, writing the header when needed.

    The header is written when *filename* does not exist yet or exists but is
    empty (for example a zero-byte file left behind by an interrupted run),
    so the resulting file always starts with the column names.

    Args:
        reviews: Iterable of dicts with the keys ``product_id``, ``username``,
            ``rating``, ``review``, ``date`` and ``helpful_count``.
        filename: Path of the CSV file to append to.
    """
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['product_id', 'username', 'rating', 'review', 'date', 'helpful_count']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if needs_header:
            writer.writeheader()

        writer.writerows(reviews)

Function to scrape reviews from a product page

In [None]:
def scrape_reviews(product_id, product_url, max_reviews=500):
    """Scrape up to *max_reviews* unique reviews from one product page.

    A fresh browser is started for every call and always closed before
    returning. The function opens the page, clicks "View all" to open the
    review drawer, then alternates between reading the visible review cards,
    scrolling the drawer and clicking "View More" until enough reviews are
    collected, the button disappears, or too many extraction errors pile up.

    Args:
        product_id: Identifier copied into every returned review.
        product_url: Product page to open.
        max_reviews: Maximum number of reviews to return.

    Returns:
        A list of dicts with the keys ``product_id``, ``username``,
        ``rating``, ``review``, ``date`` and ``helpful_count``. An empty list
        is returned when the drawer cannot be opened or on unexpected errors.
    """
    # Initialize WebDriver inside the function
    brave_path = "C:/Program Files/BraveSoftware/Brave-Browser/Application/brave.exe"
    options = uc.ChromeOptions()
    options.binary_location = brave_path
    options.add_argument("--disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")

    driver = uc.Chrome(options=options)

    reviews = []
    scraped_texts = set()  # username/text/date keys of reviews already stored
    attempts = 0  # error counter; reset whenever a review is stored

    try:
        driver.get(product_url)

        # Step 1: Click "View All Reviews"
        try:
            view_all_btn = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, '//span[contains(text(), "View all")]'))
            )
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", view_all_btn)
            # Clicked through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", view_all_btn)
            print("✅ Clicked 'View All'")
        except Exception as e:
            print(f"❌ Failed to click 'View All': {e}")
            return []

        # Step 2: Wait for the modal to appear
        try:
            modal = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.Drawerstyled__ContentWrapper-sc-1ltfkrx-1'))
            )
            print("✅ Modal detected")
        except Exception as e:
            print(f"❌ Modal not found: {e}")
            return []

        # Step 3: Read, scroll, "View More", repeat.
        while len(reviews) < max_reviews and attempts < 200:
            try:
                # Primary selector for ALL review containers
                review_containers = driver.find_elements(By.CSS_SELECTOR, 'div.sc-iBYQkv.fCRAHG.RatingReviewDrawer__StyledCard-sc-y5ksev-1.eyMVSu > div.sc-iBYQkv.fCRAHG[color="white"]')

                # Fallback selectors, tried only when the previous one found nothing.
                if not review_containers:
                    review_containers = driver.find_elements(By.XPATH, '//div[contains(@class, "Comment__FlexRow")]/ancestor::div[contains(@class, "sc-iBYQkv fCRAHG")][1]')

                if not review_containers:
                    review_containers = driver.find_elements(By.XPATH, '//span[contains(@class, "Comment__CommentText")]/ancestor::div[contains(@class, "sc-iBYQkv fCRAHG")][1]')

                print(f"Found {len(review_containers)} review containers in the current view")

                # Skip as many leading containers as reviews already stored;
                # anything re-read by accident is filtered by scraped_texts.
                for container in review_containers[len(reviews):]:
                    try:
                        username_element = container.find_element(By.CSS_SELECTOR, 'span.dugLmN')
                        username = username_element.text.strip()

                        review_text_element = container.find_element(By.CSS_SELECTOR, 'span.cfdxfJ')
                        review_text = review_text_element.text.strip()

                        rating_element = container.find_element(By.CSS_SELECTOR, 'span[label]')
                        rating = rating_element.text.strip()

                        # Date text carries a "Posted on " prefix that is dropped.
                        date_element = container.find_element(By.CSS_SELECTOR, 'span.XndEO')
                        date = date_element.text.replace('Posted on ', '').strip()

                        # "Helpful (N)" is reduced to "N".
                        helpful_element = container.find_element(By.CSS_SELECTOR, 'p.guezwa')
                        helpful_text = helpful_element.text.strip()
                        helpful_count = helpful_text.replace('Helpful (', '').replace(')', '')

                        review_id = f"{username}_{review_text}_{date}"

                        # Only add unique, non-empty reviews
                        if review_text and review_id not in scraped_texts:
                            scraped_texts.add(review_id)
                            reviews.append({
                                'product_id': product_id,
                                'username': username,
                                'rating': rating,
                                'review': review_text,
                                'date': date,
                                'helpful_count': helpful_count
                            })
                            print(f"Added review #{len(reviews)}: {username} - {rating} - {date}")

                            attempts = 0

                            if len(reviews) >= max_reviews:
                                break
                    except Exception as e:
                        # Report what actually failed, not just the counter.
                        attempts += 1
                        print(f"⚠️ Error extracting review ({type(e).__name__}); failed attempts so far: {attempts}")
                        continue

                # Scroll within modal to load more reviews
                driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", modal)
                print("📜 Scrolled modal to bottom")
                time.sleep(random.uniform(0.6, 1.2))

                # Give up early when nothing at all could be extracted.
                if len(reviews) == 0 and attempts > 100:
                    print("⚠️ No reviews found after multiple attempts. Stopping.")
                    break

                # Click "View More" button if available; its absence ends the loop.
                try:
                    view_more_btn = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.RatingReviewDrawer__ViewMoreButton-sc-y5ksev-0'))
                    )
                    driver.execute_script("arguments[0].click();", view_more_btn)
                    print("🔄 Clicked 'View More'")
                    time.sleep(random.uniform(2.5, 3.5))
                except Exception:
                    print("✅ All reviews loaded or no 'View More' button found.")
                    break

            except Exception as e:
                print(f"⚠️ Error during scraping: {e}")
                attempts += 1

        print(f"✅ Total unique reviews scraped: {len(reviews)}")
        return reviews[:max_reviews]

    except Exception as e:
        print(f"❌ Error during scraping: {e}")
        return []

    finally:
        # Always close the browser
        driver.quit()
        print("🔚 Browser closed for this product.")

Function to run the review scraper on each link

In [None]:
# Driver cell: scrape up to 500 reviews for every product listed in
# meesho_product_details.csv and append them to meesho_product_reviews.csv.
# Any exception raised inside the block is caught at the bottom and only printed.
try:
    # Input and output file paths
    input_csv = "meesho_product_details.csv"
    output_csv = "meesho_product_reviews.csv"

    # Delete output file if it exists to start fresh
    # (save_reviews_to_csv appends, so stale rows would otherwise be kept)
    if os.path.exists(output_csv):
        os.remove(output_csv)
        print(f"Removed existing output file: {output_csv}")

    # Process each product from the input CSV
    with open(input_csv, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        header = next(reader)  # Skip header row

        # Validate header structure
        if len(header) < 3:
            raise ValueError(f"Input CSV must have at least 3 columns. Found: {len(header)}")

        print(f"Reading products from {input_csv}")

        # Track statistics
        total_products = 0
        successful_products = 0
        total_reviews = 0

        # Process each product
        for row in reader:
            if len(row) >= 3:  # Ensure the row has at least 3 columns
                product_id = row[0].strip()  # First column is product ID
                product_url = row[2].strip()  # Third column is product URL

                total_products += 1

                print(f"\n{'='*50}")
                print(f"Processing product {total_products}: ID={product_id}")
                print(f"URL: {product_url}")
                print(f"{'='*50}\n")

                # Scrape reviews for this product
                reviews = scrape_reviews(product_id, product_url, max_reviews=500)

                # Save reviews to CSV immediately after scraping,
                # so an interruption loses at most the current product
                if reviews:
                    save_reviews_to_csv(reviews, output_csv)
                    successful_products += 1
                    total_reviews += len(reviews)
                    print(f"✅ Saved {len(reviews)} reviews for product {product_id} to {output_csv}")
                    print(f"📊 Progress: {successful_products}/{total_products} products processed")
                    print(f"📊 Total reviews collected: {total_reviews}")
                else:
                    print(f"⚠️ No reviews found for product {product_id}")
            else:
                print(f"⚠️ Invalid row format, skipping: {row}")

        # Print final statistics
        print(f"\n{'='*50}")
        print(f"📊 Final Statistics:")
        print(f"Total products processed: {total_products}")
        print(f"Products with reviews: {successful_products}")
        print(f"Total reviews collected: {total_reviews}")
        print(f"Reviews saved to: {output_csv}")
        print(f"{'='*50}")

except Exception as e:
    print(f"❌ Error during execution: {e}")


Removed existing output file: meesho_product_reviews.csv
Reading products from meesho_product_details.csv

Processing product 1: ID=4obtb7
URL: https://www.meesho.com/hipbrat-red-matteliquid-lipstickpack-of-1/p/4obtb7

✅ Clicked 'View All'
✅ Modal detected
Found 2 review containers in the current view
📜 Scrolled modal to bottom
🔄 Clicked 'View More'
Found 12 review containers in the current view
Added review #1: Meesho User - 4.0 - 26 Mar 2025
Added review #2: Vikash Paswan - 4.0 - 23 Oct 2023
Added review #3: Priyanka Kashyap - 4.0 - 6 Oct 2024
Added review #4: usha jaat ghintala Ghintala - 4.0 - 21 Mar 2025
Added review #5: Pushpa Marathe - 4.0 - 25 Nov 2024
Added review #6: Nickey - 4.0 - 26 Nov 2024
Added review #7: sanjana kumari - 4.0 - 1 Apr 2025
Added review #8: Meesho User - 5.0 - 31 Mar 2025
Added review #9: Meesho User - 4.0 - 4 Oct 2024
Added review #10: Ayesha Siddiqua - 4.0 - 6 Dec 2024
Added review #11: Meesho User - 4.0 - 13 Mar 2025
Added review #12: Meesho User - 4.0 

Closing the file

In [None]:
# `f` is the input CSV handle bound by the `with open(...) as f` in the previous cell.
# NOTE(review): the `with` block already closes it, so this looks like a harmless no-op - confirm before removing.
f.close()

The run stopped midway because of network issues, so scraping resumes from product 6.

In [None]:
# Resume cell: continue the review scrape after an interrupted run. The first
# `start_from_index` products of the input CSV are skipped and new reviews are
# appended to the existing output file. Any exception raised inside the block
# is caught at the bottom and only printed.
from itertools import islice

try:
    # Input and output file paths
    input_csv = "meesho_product_details.csv"
    output_csv = "meesho_product_reviews.csv"

    # Number of leading products to skip (5 means: resume with the 6th product).
    # 0 processes every product.
    start_from_index = 5

    # Process each product from the input CSV
    with open(input_csv, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        header = next(reader)  # Skip header row

        # Validate header structure
        if len(header) < 3:
            raise ValueError(f"Input CSV must have at least 3 columns. Found: {len(header)}")

        print(f"Reading products from {input_csv}, starting from product #{start_from_index+1}")

        # Track statistics; the product counter starts at the number of
        # skipped products so the printed product numbers match the CSV order.
        total_products = start_from_index
        successful_products = 0
        total_reviews = 0

        # Count existing reviews if output file exists
        if os.path.exists(output_csv):
            with open(output_csv, "r", encoding="utf-8") as review_file:
                review_reader = csv.reader(review_file)
                next(review_reader, None)  # Skip header
                total_reviews = sum(1 for _ in review_reader)
                print(f"Found existing output file with {total_reviews} reviews")

        # islice drops exactly `start_from_index` data rows (none when it is 0)
        # and then yields the rest.
        for row in islice(reader, start_from_index, None):
            if len(row) >= 3:  # Ensure the row has at least 3 columns
                product_id = row[0].strip()  # First column is product ID
                product_url = row[2].strip()  # Third column is product URL

                total_products += 1

                print(f"\n{'='*50}")
                print(f"Processing product {total_products}: ID={product_id}")
                print(f"URL: {product_url}")
                print(f"{'='*50}\n")

                # Scrape reviews for this product
                reviews = scrape_reviews(product_id, product_url, max_reviews=500)

                # Save reviews to CSV immediately after scraping
                if reviews:
                    # Append to existing file or create new one if it doesn't exist
                    save_reviews_to_csv(reviews, output_csv)
                    successful_products += 1
                    total_reviews += len(reviews)
                    print(f"✅ Saved {len(reviews)} reviews for product {product_id} to {output_csv}")
                    print(f"📊 Progress: {successful_products}/{total_products-start_from_index} products processed since restart")
                    print(f"📊 Total reviews collected: {total_reviews}")
                else:
                    print(f"⚠️ No reviews found for product {product_id}")
            else:
                print(f"⚠️ Invalid row format, skipping: {row}")

        # Print final statistics
        print(f"\n{'='*50}")
        print(f"📊 Final Statistics:")
        print(f"Total products processed in this run: {total_products-start_from_index}")
        print(f"Products with reviews in this run: {successful_products}")
        print(f"Total reviews collected: {total_reviews}")
        print(f"Reviews saved to: {output_csv}")
        print(f"{'='*50}")

except Exception as e:
    print(f"❌ Error during execution: {e}")


Reading products from meesho_product_details.csv, starting from product #6
Found existing output file with 2500 reviews

Processing product 6: ID=4oywba
URL: https://www.meesho.com/beauty-professional-color-sensational-liquid-lipstick-combo-pack-set-of-4-edition-mini-lipsticks-matte-finish-lip-color-edition-20-ml/p/4oywba

✅ Clicked 'View All'
✅ Modal detected
Found 2 review containers in the current view
📜 Scrolled modal to bottom
🔄 Clicked 'View More'
Found 12 review containers in the current view
Added review #1: Zoya Azin Khan - 5.0 - 28 Mar 2025
Added review #2: Khushbu Khan - 5.0 - 23 Mar 2025
Added review #3: Juhita Khatun - 5.0 - 26 Mar 2025
Added review #4: Niti Dhir - 5.0 - 9 Jan 2025
Added review #5: Alishan Khan - 5.0 - 1 Mar 2025
Added review #6: Jebun Memon - 5.0 - 30 Dec 2024
Added review #7: Tahreem Ansarii Ansari - 5.0 - 2 Apr 2025
Added review #8: Jaswinder Kaur - 5.0 - 1 Apr 2025
Added review #9: Pornima Shivaji Thorat - 5.0 - 29 Mar 2025
Added review #10: Meesho Use

### Translation

Exporting reviews to an Excel File


In [None]:
import pandas as pd
# Load every scraped review (the file written by the review scraper cells).
df = pd.read_csv('meesho_product_reviews.csv')

# Select the 4th column (index 3)
# NOTE(review): presumably the `review` text column, given the field order used by save_reviews_to_csv - confirm.
reviews = df.iloc[:, 3]

# Convert to DataFrame for better Excel formatting
reviews_df = reviews.to_frame()

# Export to Excel (index=False keeps the row index out of the sheet)
reviews_df.to_excel('reviews_only.xlsx', index=False)


Translation is done via Google Translate
- uploaded the reviews_only.xlsx
- generated a new file reviews_only_translated.xlsx

Exporting the translated reviews back to the csv file by adding a new column

In [None]:
from pathlib import Path

import pandas as pd

# All translation artefacts live in the "translate" folder. Paths are built
# with pathlib so no backslash escapes are needed and the cell also works on
# systems that do not use "\" as the path separator.
translate_dir = Path('translate')
csv_file = translate_dir / 'meesho_product_reviews.csv'            # Original reviews CSV
excel_file = translate_dir / 'reviews_only_translated.xlsx'        # Excel file with translated reviews
output_file = translate_dir / 'meesho_product_reviews_translated.csv'

# Read the original CSV file
df_csv = pd.read_csv(csv_file)

# Read the Excel file (assumes translated reviews are in the first column)
df_excel = pd.read_excel(excel_file)
translated_reviews = df_excel.iloc[:, 0]  # Select the first column

# The translations are matched to the reviews purely by row position, so a
# differing row count would attach translations to the wrong reviews or leave
# gaps. Fail loudly instead of writing a corrupted dataset.
if len(translated_reviews) != len(df_csv):
    raise ValueError(
        f"Row count mismatch: {len(df_csv)} reviews in {csv_file} but "
        f"{len(translated_reviews)} translations in {excel_file}"
    )

# Append the translated reviews as a new column
df_csv['reviews_translated'] = translated_reviews

# Save the updated DataFrame to a new CSV
df_csv.to_csv(output_file, index=False)

