In [35]:
import re
import time
import random
import hashlib
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup

def init_driver():
    service = Service(r"C:\Users\D\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe")
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/110.0.5481.104 Safari/537.36"
    )
    options.add_argument("--start-maximized")

    driver = webdriver.Chrome(service=service, options=options)
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {
            "source": """
               Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
               })
            """
        }
    )
    return driver

def clean(raw_price):
    if not raw_price:
        return None
    cleaned = re.sub(r"[^\d,.]", "", raw_price)
    cleaned = cleaned.replace(",", "").replace(".", "")
    if cleaned == "":
        return None
    return cleaned

def search_on_market(driver, query):
    url = "https://market.yandex.ru/"
    driver.get(url)
    time.sleep(random.uniform(3, 5))

    search_box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, "text"))
    )
    search_box.clear()
    search_box.send_keys(query)
    search_box.send_keys(Keys.ENTER)
    time.sleep(random.uniform(3, 6))

def parse_yandex_page(html):
    soup = BeautifulSoup(html, "html.parser")
    products_data = []

    product_cards = soup.select('div[data-apiary-widget-name="@marketfront/SerpLayout"]')

    for card in product_cards:
            link_element = card.select_one("a")
            product_url = link_element["href"] if link_element else None

            title_element = card.select_one("p[role='link']")
            product_name = title_element.get_text(strip=True) if title_element else None

            brand_element = card.select_one("span[role='link']")
            brand = brand_element.get_text(strip=True) if title_element else None

            is_original = None
            original_element = card.select_one("span[class='ds-text ds-text_weight_med ds-text_color_text-invert-primary ds-text_typography_small-text-1 ds-badge__textContent ds-text_small-text-1_tight ds-text_small-text-1_med']")
            if original_element:
                if original_element.get_text(strip=True) == 'ОРИГИНАЛ':
                    is_original = True
                else:
                    is_original = None

            final_price_element = card.select_one("span[class='ds-text ds-text_weight_bold ds-text_color_price-term ds-text_typography_headline-5 ds-text_headline-5_tight ds-text_headline-5_bold']")  # Условно
            if final_price_element:
                final_price = int(clean(final_price_element.get_text(strip=True)))
            else:
                final_price = None

            discount_element = card.select_one("span[class='ds-text ds-text_weight_med ds-text_color_text-invert-primary ds-text_typography_lead-text ds-text_lead-text_tight ds-text_lead-text_med']")
            if discount_element:
                discount = int(discount_element.get_text(strip=True))
            else:
                discount = None

            try:
                old_price = round(final_price * 100 / discount)
            except Exception as e:
                old_price = None

            rating_element = card.select_one("span[class='ds-text ds-text_weight_bold ds-text_color_text-rating ds-text_proportional ds-text_typography_text Ys3yn ds-text_text_tight ds-text_text_bold']")
            if rating_element:
                rating = float(rating_element.get_text(strip=True))
            else:
                rating = None

            reviews_element = card.select_one("span[class='ds-text ds-text_lineClamp_1 ds-text_weight_reg ds-text_color_text-secondary ds-text_proportional ds-text_typography_text ds-text_text_tight ds-text_text_reg ds-text_lineClamp']")
            if reviews_element:
                reviews_count = reviews_element.get_text(strip=True)
                reviews_count = reviews_count.replace("оценки", '').replace("оценок", '').replace("оценка", '')
            else:
                reviews_count = None

            delivery_element = card.select_one("span[class='_1yLiV']")
            if delivery_element:
                delivery_time = delivery_element.get_text(strip=True)
            else:
                delivery_time = None

            product_info = {
                "product_url": product_url,
                "product_name": product_name,
                "brand": brand,
                "is_original": is_original,
                "final_price": final_price,
                "old_price": old_price,
                "discount": discount,
                "rating": rating,
                "reviews_count": reviews_count,
                "delivery_time": delivery_time,
                "market": 'YDX'
            }
            products_data.append(product_info)

    return products_data

def parse(driver, max_scroll_count=5, pause=3):

    all_products = []
    seen_ids = set()

    for _ in range(max_scroll_count):
        html = driver.page_source
        current_products = parse_yandex_page(html)

        new_items_found = False
        for prod in current_products:
            product_id = prod.get("product_url")
            if not product_id:
                hash_str = f"{prod.get('product_name')}|{prod.get('brand')}|{prod.get('final_price')}"
                product_id = hashlib.md5(hash_str.encode('utf-8')).hexdigest()
                prod["product_id"] = product_id

            if product_id not in seen_ids:
                seen_ids.add(product_id)
                all_products.append(prod)
                new_items_found = True

        if new_items_found:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(pause, pause + 2))
        else:
            break

    return all_products

def save_to_csv(data, filename="yandex_market_data.csv"):
    if not data:
        return
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False, encoding="utf-8")

if __name__ == "__main__":
    driver = init_driver()
    try:
        search_query = "кольцо"
        search_on_market(driver, search_query)
        data = parse(driver, max_scroll_count=5000, pause=3)
        save_to_csv(data, "yandex_market_data.csv")
    finally:
        driver.quit()


In [36]:
df = pd.read_csv("yandex_market_data.csv")
df

Unnamed: 0,product_url,product_name,brand,is_original,final_price,old_price,discount,rating,reviews_count,delivery_time,market
0,https://market.yandex.ru/business--fabrika-usp...,Кольцо,Queen Fair,,788,1010,78,5.0,1,11 марта,YDX
1,/product--koltso-na-dva-paltsa/599896678?hid=9...,"Кольцо, кварц",Кольцо на два пальца,,1187,1978,60,5.0,1,10 марта,YDX
2,/product--koltso-antistress-vrashchaiushcheesi...,"Кольцо, фианит, нефрит",Кольцо-механизм,,369,802,46,5.0,1,Завтра,YDX
3,/product--koltso-spin-runy-r20-5-skandinavskii...,"Кольцо обручальное, керамика",Кольцо,,802,1458,55,5.0,9,Завтра,YDX
4,/product--koltso-barashik-ruchnaia-rabota-epok...,"Кольцо, керамика","Кольцо-кулон, эпоксидная смола",,834,6950,12,5.0,4,9 марта,YDX
5,/product--koltso-iz-keramiki-znaki-schastia-ch...,"Кольцо, керамика",Noima,,932,1528,61,4.9,2185,Завтра,YDX
6,/product--koltso-s-serdtsem-ruki-s-nadpisiu/17...,Кольцо,LERO,,664,4427,15,4.3,416,Послезавтра,YDX
7,/product--koltso-drakon/19296413?hid=90401&sku...,"Кольцо на два пальца, фианит",Кольцо,,578,5255,11,5.0,1,6 марта,YDX
8,/product--zhenskoe-pozolochennoe-koltso-s-tsir...,"Кольцо помолвочное, Swarovski Zirconia, циркон...",SORONA,,2000,4000,50,5.0,1,6 марта,YDX
9,/product--bezrazmernoe-koltso-drevniaia-rus-bi...,Кольцо-кулон,OTOKODESIGN,,442,520,85,4.9,18,8 марта,YDX
