In [1]:
url="https://www.amazon.in/s?k=headphones+bluetooth+wireless&page=2&xpid=g4657Xv9Oy2Sy&crid=2RE95GB303GUU&qid=1758088695&sprefix=head%2Caps%2C289&ref=sr_pg_2"

In [2]:
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# --- CONFIG ---
SEARCH_KEYWORD = "zebronics headphone"
CHROME_DRIVER_PATH = "C:/Program Files/chromedriver-win64/chromedriver.exe"
OUTPUT_FILE = "amazon_products.csv"

# --- SETUP SELENIUM ---
options = Options()
options.add_argument("--start-maximized")
service = Service(CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)

# --- STEP 1: OPEN AMAZON & SEARCH ---
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- STEP 1: OPEN AMAZON & WAIT FOR SEARCH BAR ---
driver.get("https://www.amazon.in")

try:
    # Wait up to 15 seconds for search box to appear
    search_box = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, "twotabsearchtextbox"))
    )
    search_box.send_keys(SEARCH_KEYWORD)
    search_box.send_keys(Keys.RETURN)
    print("Search box found and keyword entered.")
except:
    print("Could not find the search box. Maybe page didn't load or Amazon redirected.")
    driver.quit()
    exit()


# --- STEP 2: AUTO SCROLL ---
last_height = driver.execute_script("return document.body.scrollHeight")
for _ in range(5):  # scroll multiple times
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# --- STEP 3: COLLECT PRODUCT LINKS ---
soup = BeautifulSoup(driver.page_source, "html.parser")
product_links = []
for link in soup.select("a.a-link-normal.s-no-outline"):
    href = link.get("href")
    if href and "/dp/" in href:
        product_links.append("https://www.amazon.in" + href.split("?")[0])

product_links = list(dict.fromkeys(product_links))  # remove duplicates
print(f"Found {len(product_links)} product links.")

if not product_links:
    print("No product links found. Try another keyword.")
    driver.quit()
    exit()

# --- STEP 4: SCRAPE EACH PRODUCT ---
data = []
for idx, url in enumerate(product_links, start=1):
    try:
        print(f"[{idx}/{len(product_links)}] Scraping: {url}")
        driver.get(url)

        # Random wait to look more human-like
        time.sleep(random.uniform(2.5, 5.0))

        # Detect Captcha
        if "Enter the characters you see" in driver.page_source:
            print(f"Captcha detected for {url}, skipping...")
            continue

        product_soup = BeautifulSoup(driver.page_source, "html.parser")

        title_tag = product_soup.find(id="productTitle")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        mrp_tag = product_soup.find("span", class_="a-price a-text-price")
        mrp = mrp_tag.find("span", class_="a-offscreen").get_text(strip=True) if mrp_tag else None

        discount_tag = product_soup.find(
            "span",
            class_="a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage"
        )
        discount = discount_tag.get_text(strip=True).replace("-", "").replace("%", "") if discount_tag else None

        price_tag = product_soup.find("span", class_="a-price")
        if price_tag:
            price_value = price_tag.find("span", class_="a-offscreen")
            price = price_value.get_text(strip=True) if price_value else None
        else:
            price = None

        data.append({
            "Title": title,
            "MRP": mrp,
            "Discount (%)": discount,
            "Price After Discount": price,
            "URL": url
        })

        # --- Auto-save every 10 products ---
        if idx % 10 == 0:
            df = pd.DataFrame(data)
            df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
            print(f"Progress saved after {idx} products.")

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        continue

# --- STEP 5: FINAL SAVE ---
df = pd.DataFrame(data)

# Clean & format numbers
def clean_number(x):
    if x is None:
        return None
    try:
        return float(str(x).replace("₹", "").replace(",", "").strip())
    except:
        return None

df["MRP"] = df["MRP"].apply(clean_number)
df["Price After Discount"] = df["Price After Discount"].apply(clean_number)

df["MRP"] = df["MRP"].apply(lambda x: f"₹{int(x)}" if pd.notnull(x) else "N/A")
df["Price After Discount"] = df["Price After Discount"].apply(
    lambda x: f"₹{int(x)}" if pd.notnull(x) and x.is_integer() else f"₹{x:.2f}" if pd.notnull(x) else "N/A"
)


print(f"\nFinished scraping {len(df)} products. Data saved to {OUTPUT_FILE}")

driver.quit()


Search box found and keyword entered.
Found 24 product links.
[1/24] Scraping: https://www.amazon.in/ZEBRONICS-Bluetooth-Headphones-assistant-Comfortable/dp/B07L8KNP5F/ref=sr_1_3
[2/24] Scraping: https://www.amazon.in/ZEBRONICS-Zeb-Thunder-Connectivity-Sea-Green/dp/B09B5CPV71/ref=sr_1_4
[3/24] Scraping: https://www.amazon.in/ZEBRONICS-Envy-Headphone-Detachable-Black/dp/B0DJSQ726J/ref=sxin_12_recs_zoco_stores_brand_identity_bs
[4/24] Scraping: https://www.amazon.in/ZEBRONICS-Headphone-Bluetooth-Assistant-Detachable/dp/B0DJSQYRNV/ref=sxin_12_recs_zoco_stores_brand_identity_bs
[5/24] Scraping: https://www.amazon.in/ZEBRONICS-PRO-Headphone-Bluetooth-Assistant/dp/B0CVGYY9DM/ref=sxin_12_recs_zoco_stores_brand_identity_bs
[6/24] Scraping: https://www.amazon.in/ZEBRONICS-PRO-Headphone-Bluetooth-Assistant/dp/B0CTXGDCSB/ref=sxin_12_recs_zoco_stores_brand_identity_bs
[7/24] Scraping: https://www.amazon.in/Zebronics-Zeb-Dynamic-Bluetooth-Supporting-Headphone/dp/B0829QJRTV/ref=sxin_12_recs_zoco_sto

In [3]:
import os
import pandas as pd

# Your new scraped DataFrame is stored in `df`

file_name = "amazon_products.csv"

if os.path.exists(file_name):
    # Load old data
    old_df = pd.read_csv(file_name)

    # Combine old + new
    combined_df = pd.concat([old_df, df], ignore_index=True)

    # Remove duplicates based on Product Link (or Title if link not available)
    if "Product Link" in combined_df.columns:
        combined_df.drop_duplicates(subset=["Product Link"], inplace=True)
    else:
        combined_df.drop_duplicates(subset=["Title"], inplace=True)

    # Save back to CSV
    combined_df.to_csv(file_name, index=False)
    print(f"Appended {len(df)} products. Total now: {len(combined_df)}")
else:
    # File doesn't exist, create it
    df.to_csv(file_name, index=False)
    print(f"Created new file with {len(df)} products.")


Appended 22 products. Total now: 22


In [4]:
import pandas as pd
from IPython.display import display, HTML

# Load the file
df = pd.read_csv("amazon_products.csv")

# Display as an HTML table
display(HTML(df.tail(10).to_html(index=False)))

print(f"\n Total products scraped: {len(df)}")


Title,MRP,Discount (%),Price After Discount,URL
"ZEBRONICS Thunder Over Ear Bluetooth 5.3 Wireless Headphones with 60H Backup, Gaming Mode, Dual Pairing, Enc, Aux, Micro Sd, Voice Assistant, Comfortable Earcups, Call Function (Brown)","₹1,399",57,₹599.00,https://www.amazon.in/ZEBRONICS-Bluetooth-Headphones-Assistant-Comfortable/dp/B07L8KV2KC/ref=sr_1_12
"Zebronics Zeb-Storm On Ear Wired Headphone with 3.5mm Jack, Built in Microphone for Calling, 1.5 Meter Cable, Soft Ear Cushion, Adjustable Headband, Foldable Ear Cups and Lightweight Design (Black)",₹799,50,₹399.00,https://www.amazon.in/ZEBRONICS-Zeb-Storm-Microphone-Adjustable-Lightweight/dp/B0B4G44RFS/ref=sr_1_13
"ZEBRONICS Thunder Bluetooth 5.3 Wireless Over Ear Headphones with 60H Backup, Gaming Mode, Dual Pairing, Enc, Aux, Micro Sd, Voice Assistant, Comfortable Earcups, Call Function (Teal Green)","₹1,699",65,₹599.00,https://www.amazon.in/ZEBRONICS-Zeb-Thunder-Bluetooth-Teal-Green/dp/B09B5BS6G4/ref=sr_1_14
"ZEBRONICS Bro in Ear Wired Earphones with Mic, 3.5mm Audio Jack, 10mm Drivers, Phone/Tablet Compatible (Black)",₹241.46,67,₹99.00,https://www.amazon.in/ZEBRONICS-Earphones-Drivers-Tablet-Compatible/dp/B07T5DKR5D/ref=sr_1_15
"ZEBRONICS Duke Over Ear Wireless Headphone with Up to 60h Backup,Supports Bluetooth,Dual Pairing,Gaming Mode,Environmental Noise Cancellation (ENC),LED Lights,Deep Bass,Voice Assistant Support (Blue)","₹2,499",64,₹899.00,https://www.amazon.in/Zebronics-Bluetooth-Headphone-Assistant-Multifunction/dp/B088FLHXJX/ref=sr_1_16
"ZEBRONICS Bro in Ear Wired Earphones with Mic, 3.5mm Audio Jack, 10mm Drivers, Phone/Tablet Compatible (Green)",₹707.14,75,₹99.00,https://www.amazon.in/ZEBRONICS-Zeb-Bro-Earphones-Compatible-Green/dp/B09R24JBYV/ref=sr_1_17
"Zebronics Zeb-Blitz USB Over Ear Gaming Headphone with Dolby Atmos,RGB LED,Windows Software,Simulated 7.1 Surround Sound,2.4 Meter Braided Cable,Flexible Mic,Padded Headband & Ear Cushions(Black)","₹5,999",65,"₹2,099.00",https://www.amazon.in/ZEBRONICS-Zeb-Blitz-Headphone-Software-Simulated/dp/B09P1KK9K1/ref=sr_1_18
"Zebronics Storm Wired On Ear Headphone with 3.5mm Jack, Built-in Microphone for Calling,1.5 Meter Cable, Soft Ear Cushion, Adjustable Headband,Foldable Ear Cups(White)",₹799,50,₹399.00,https://www.amazon.in/Zebronics-Headphone-Microphone-Adjustable-Headband/dp/B0CFQVLG49/ref=sr_1_19
"ZEBRONICS Zeb-200HM Wired On Ear Headphone with Mic, Dual 3.5mm Connectors, Adjustable Headband for PC Computers/Laptop (Black)",₹699,43,₹399,https://www.amazon.in/Zebronics-Zeb-200HM-Headphone-Mic/dp/B08CHPGTSF/ref=sr_1_20
"ZEBRONICS Thunder Max Bluetooth Over Ear Headphone with 120*Hrs Playback, 3 EQ Modes, 40mm Bass Driver, Fast Charging, Gaming Mode, Dual Pairing, BT v5.4, AUX, Metallic Finish (Black)",₹2499,64,₹899,https://www.amazon.in/ZEBRONICS-Max-Bluetooth-Headphone-Playback/dp/B0DJT3NS1X/ref=sr_1_21



 Total products scraped: 22
