# In [48]:
import re
import time
import random
import hashlib
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup

def init_driver(driver_path=None):
    """Start a Chrome WebDriver configured to look less like an automated browser.

    Args:
        driver_path: Path to the chromedriver executable. When omitted, the
            original hard-coded Windows location is used, so existing
            ``init_driver()`` calls behave exactly as before.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    if driver_path is None:
        driver_path = r"C:\Users\D\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    service = Service(driver_path)

    options = webdriver.ChromeOptions()
    # Drop the "enable-automation" switch and the automation extension.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    # A user agent is picked at random for every new driver.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
    ]
    options.add_argument(f"user-agent={random.choice(user_agents)}")
    options.add_argument("--start-maximized")

    driver = webdriver.Chrome(service=service, options=options)
    # Register a script through CDP that redefines navigator.webdriver so
    # that reading it yields undefined.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {
            "source": """
               Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
               })
            """
        }
    )
    return driver

def clean(raw_price):
    """Reduce a price-like string to its digits.

    Every character that is not a digit is discarded (including "," and
    "." separators). Returns the remaining digits as a string, or ``None``
    when the input is empty/``None`` or contains no digits.
    """
    if not raw_price:
        return None
    # First pass keeps digits and separators, second pass drops the separators.
    kept = re.sub(r"[^\d,.]", "", raw_price)
    digits = kept.translate(str.maketrans("", "", ",."))
    return digits if digits else None

def search_on_wb(driver, query):
    """Open the Wildberries home page and submit *query* via its search box.

    The function waits up to 10 seconds for the element with id
    ``searchInput`` to be present, types the query and presses ENTER, with
    fixed and randomized pauses between the steps. Nothing is returned; the
    driver is left on whatever page the search leads to.
    """
    driver.get("https://www.wildberries.ru/")
    time.sleep(random.uniform(3, 5))

    locator = (By.ID, "searchInput")
    waiter = WebDriverWait(driver, 10)
    search_input = waiter.until(EC.presence_of_element_located(locator))
    time.sleep(5)

    # Type the query, pause, then submit it.
    search_input.send_keys(query)
    time.sleep(random.uniform(3, 5))
    search_input.send_keys(Keys.ENTER)
    time.sleep(random.uniform(3, 6))

def _element_text(element):
    """Return the stripped text of a parsed element, or None if it is missing."""
    return element.get_text(strip=True) if element is not None else None


def _element_int(element):
    """Return the digits in an element's text as an int, or None if there are none."""
    digits = clean(_element_text(element))
    return int(digits) if digits is not None else None


def parse_wb_page(html):
    """Extract product records from the HTML of a search results page.

    Args:
        html: Page markup as a string.

    Returns:
        A list with one dict per ``div.product-card-overflow`` element.
        Each dict has the keys ``product_url``, ``product_name``, ``brand``,
        ``is_original``, ``final_price``, ``old_price``, ``discount``,
        ``rating``, ``reviews_count``, ``delivery_time`` and ``market``.
        A value is ``None`` when the corresponding element is absent or,
        for numeric fields, when its text contains no digits.
    """
    soup = BeautifulSoup(html, "html.parser")
    products_data = []

    for card in soup.select("div.product-card-overflow"):
        link_element = card.select_one("a")
        # .get() so an anchor without href gives None instead of KeyError.
        product_url = link_element.get("href") if link_element is not None else None

        product_name = _element_text(card.select_one("span[class=product-card__name]"))
        brand = _element_text(card.select_one("span[class=product-card__brand]"))

        original_element = card.select_one(
            "span[class='product-card__original-mark icon-original-check']"
        )
        is_original = original_element is not None

        old_price = _element_int(card.select_one("del"))
        discount = _element_int(card.select_one("span[class=percentage-sale]"))

        # The final price is derived from the crossed-out price and the
        # discount percentage; without both of them it cannot be computed.
        if old_price and discount is not None:
            final_price = round(old_price * (100 - discount) / 100)
        else:
            final_price = None

        reviews_count = _element_int(card.select_one("span[class='product-card__count']"))
        rating = _element_text(
            card.select_one("span[class='address-rate-mini address-rate-mini--sm']")
        )
        delivery_time = _element_text(card.select_one("span[class='btn-text']"))

        products_data.append({
            "product_url": product_url,
            "product_name": product_name,
            "brand": brand,
            "is_original": is_original,
            "final_price": final_price,
            "old_price": old_price,
            "discount": discount,
            "rating": rating,
            "reviews_count": reviews_count,
            "delivery_time": delivery_time,
            "market": 'WB',
        })

    return products_data

def parse(driver, max_scroll_count=5, pause=3):
    """Collect unique product records from the current page while scrolling down.

    The page is parsed once before the first scroll and once after every
    scroll, so the items loaded by the last scroll are collected too.
    Scrolling stops early as soon as a parse yields no previously unseen
    product.

    Args:
        driver: WebDriver positioned on a results page.
        max_scroll_count: Maximum number of scrolls to the bottom of the page.
        pause: Lower bound, in seconds, of the randomized wait after each
            scroll; the upper bound is ``pause + 2``.

    Returns:
        A list of product dicts as produced by ``parse_wb_page``, each with an
        added ``product_id`` key, without duplicates.
    """
    all_products = []
    seen_ids = set()

    for scroll_index in range(max_scroll_count + 1):
        new_items_found = False
        for prod in parse_wb_page(driver.page_source):
            # The URL identifies a product; when it is missing, fall back to
            # a hash of name, brand and price.
            product_id = prod.get("product_url")
            if not product_id:
                hash_str = f"{prod.get('product_name')}|{prod.get('brand')}|{prod.get('final_price')}"
                product_id = hashlib.md5(hash_str.encode('utf-8')).hexdigest()

            if product_id in seen_ids:
                continue
            seen_ids.add(product_id)
            prod["product_id"] = product_id
            all_products.append(prod)
            new_items_found = True

        # Stop when nothing new appeared, or when the scroll budget is used
        # up (a further scroll would load items that are never parsed).
        if not new_items_found or scroll_index == max_scroll_count:
            break

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(pause, pause + 2))

    return all_products

def save_to_csv(data, filename):
    """Write *data* to *filename* as a UTF-8 CSV file without the index column.

    Nothing is written when *data* is empty or ``None``.
    """
    if data:
        frame = pd.DataFrame(data)
        frame.to_csv(filename, index=False, encoding="utf-8")

if __name__ == "__main__":
    # Script entry point: run one search and dump the scraped cards to CSV.
    search_query = "кольцо"
    driver = init_driver()
    try:
        search_on_wb(driver, search_query)
        data = parse(driver, 10, 1)
        save_to_csv(data, "wb_data_z5.csv")
    finally:
        # Close the browser even when a step above raised.
        driver.quit()
