# In [1]:
import re
import time
import random
import hashlib
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import configparser
import logging
import os

# Logging configuration.
#
# Settings come from the optional ``settings.ini`` file in the current working
# directory ([logging] section, keys ``enabled`` / ``level`` / ``file``).
# Note the two different default log files: "app.log" when settings.ini exists
# but has no ``file`` key, and "YandexMarket.log" when there is no settings.ini.
cfg = configparser.ConfigParser()
if os.path.exists("settings.ini"):
    cfg.read("settings.ini")
    log_enabled = cfg.getboolean("logging", "enabled", fallback=True)
    log_level_str = cfg.get("logging", "level", fallback="INFO")
    log_file = cfg.get("logging", "file", fallback="app.log")
else:
    log_enabled = True
    log_level_str = "INFO"
    log_file = "YandexMarket.log"

if log_enabled:
    # Level constants in the logging module are upper-case, so normalise the
    # configured name ("debug", " Info ") before looking it up.  A plain
    # getattr() is not enough either: a name such as "debug" or "basicConfig"
    # resolves to a function, which basicConfig() rejects.  Anything that is
    # not an integer level therefore falls back to INFO.
    level_candidate = getattr(logging, log_level_str.strip().upper(), None)
    num_level = level_candidate if isinstance(level_candidate, int) else logging.INFO
    logging.basicConfig(level=num_level, filename=log_file, filemode='a',
                        format="%(asctime)s - %(levelname)s - %(message)s")

# Pool of desktop browser User-Agent strings (Chrome, Safari and Firefox
# variants).  init_driver() picks one at random for every new browser session.
# NOTE(review): the driver started by init_driver() is always Chrome, so the
# Safari/Firefox entries advertise a browser that differs from the real one.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.140 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.77 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.5563.64 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:110.0) Gecko/20100101 Firefox/110.0"
]

def init_driver(driver_path=None):
    """Start a Chrome WebDriver session configured to look less like automation.

    A User-Agent is drawn at random from ``user_agents``, the
    "enable-automation" switch and the automation extension are turned off,
    and a script is registered that makes ``navigator.webdriver`` read as
    ``undefined`` on every new document.

    Args:
        driver_path: Path to the chromedriver executable.  When omitted, the
            original hard-coded Windows location is used, so existing
            ``init_driver()`` calls behave as before.

    Returns:
        The ready-to-use ``webdriver.Chrome`` instance.  The caller owns it
        and must call ``quit()``.

    Raises:
        Any exception raised while starting Chrome or registering the script.
        If the browser had already started, it is closed before the exception
        propagates.
    """
    logging.info("Initializing driver with a random user-agent")
    chosen_user_agent = random.choice(user_agents)
    if driver_path is None:
        driver_path = r"C:\Users\D\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
    service = Service(driver_path)
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument(f"user-agent={chosen_user_agent}")
    options.add_argument("--start-maximized")

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": """
               Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
               })
            """
            }
        )
    except Exception:
        # The browser process is already running at this point; without this
        # cleanup a failed CDP call would leave it orphaned, because the
        # caller never receives a driver object to quit.
        logging.exception("Driver setup failed after start-up; closing the browser")
        driver.quit()
        raise
    logging.info("Driver initialized successfully")
    return driver


def clean(raw_price):
    """Reduce a raw price string to a string of digits.

    Everything except digits is discarded: currency signs, spaces and other
    symbols first, then the "," and "." separators as well.

    Args:
        raw_price: Price text as scraped from the page; may be None or empty.

    Returns:
        The remaining digits as a string, or None when the input is empty or
        contains no digits at all.
    """
    logging.debug("Cleaning raw price: %s", raw_price)
    if raw_price:
        digits = re.sub(r"[^\d,.]", "", raw_price)
        # Separators are removed outright, so "1,299.50" becomes "129950".
        for separator in (",", "."):
            digits = digits.replace(separator, "")
        if digits:
            logging.debug("Cleaned price: %s", digits)
            return digits
    return None

def search_on_market(driver, query):
    """Open the Yandex Market home page and submit *query* in its search box.

    The query is typed one character at a time, with randomised pauses
    between keystrokes and around the page loads.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        query: Text to type into the search field.

    Returns:
        None.  The driver is left on the page reached after pressing ENTER.
    """

    def pause(low, high):
        # Sleep for a random duration within [low, high] seconds.
        time.sleep(random.uniform(low, high))

    logging.info(f"Searching on Yandex Market with query: {query}")
    driver.get("https://market.yandex.ru/")
    pause(3, 5)

    # The search field is located by its name attribute; wait up to 10 s for it.
    box_locator = (By.NAME, "text")
    search_input = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(box_locator)
    )
    search_input.clear()

    for character in query:
        search_input.send_keys(character)
        pause(0.1, 0.3)

    pause(1, 2)
    search_input.send_keys(Keys.ENTER)
    pause(4, 7)
    logging.info("Search completed")

# CSS selectors used by parse_yandex_page().  The ``[class='...']`` form is an
# exact attribute-value match, so each one depends on the full class string.
_YDX_CARD_SELECTOR = "div[id='/content/page/fancyPage/searchSerpStatic/content']"
_YDX_TITLE_SELECTOR = "p[role='link']"
_YDX_BRAND_SELECTOR = "span[role='link']"
_YDX_BADGE_SELECTOR = "span[class='ds-text ds-text_weight_med ds-text_color_text-invert-primary ds-text_typography_small-text-1 ds-badge__textContent ds-text_small-text-1_tight ds-text_small-text-1_med']"
# Provisional selector (marked as tentative by the original author).
_YDX_PRICE_SELECTOR = "span[class='ds-text ds-text_weight_bold ds-text_color_price-term ds-text_typography_headline-5 ds-text_headline-5_tight ds-text_headline-5_bold']"
_YDX_DISCOUNT_SELECTOR = "span[class='ds-text ds-text_weight_med ds-text_color_text-invert-primary ds-text_typography_lead-text ds-text_lead-text_tight ds-text_lead-text_med']"
_YDX_RATING_SELECTOR = "span[class='ds-text ds-text_weight_bold ds-text_color_text-rating ds-text_proportional ds-text_typography_text Ys3yn ds-text_text_tight ds-text_text_bold']"
_YDX_REVIEWS_SELECTOR = "span[class='ds-text ds-text_lineClamp_1 ds-text_weight_reg ds-text_color_text-secondary ds-text_proportional ds-text_typography_text ds-text_text_tight ds-text_text_reg ds-text_lineClamp']"
_YDX_DELIVERY_SELECTOR = "span[class='_1yLiV']"


def _ydx_text(card, selector):
    """Return the stripped text of the first match for *selector*, or None."""
    element = card.select_one(selector)
    return element.get_text(strip=True) if element is not None else None


def _ydx_price(text):
    """Convert scraped price text to an int, or None when it has no digits."""
    if not text:
        return None
    digits = clean(text)
    return int(digits) if digits is not None else None


def _ydx_discount(text):
    """Parse a discount badge such as "25", "-25" or "−25 %" into an int.

    A leading minus (ASCII hyphen, Unicode minus or en dash) is kept as the
    sign, so text that plain ``int()`` accepted still yields the same value.
    """
    if not text:
        return None
    match = re.search(r"([-\u2212\u2013]?)\s*(\d+)", text)
    if match is None:
        logging.warning("Unrecognised discount text: %r", text)
        return None
    value = int(match.group(2))
    return -value if match.group(1) else value


def _ydx_rating(text):
    """Parse rating text ("4.8" or "4,8") into a float, or None if malformed."""
    if not text:
        return None
    try:
        return float(text.replace(",", "."))
    except ValueError:
        logging.warning("Unrecognised rating text: %r", text)
        return None


def _ydx_old_price(final_price, discount):
    """Reconstruct the pre-discount price from the final price and percent off.

    Returns None when either input is missing or the percentage is outside
    the open interval (0, 100), where the division is undefined or meaningless.
    """
    if final_price is None or discount is None:
        return None
    percent = abs(discount)
    if not 0 < percent < 100:
        return None
    # final = old * (100 - percent) / 100  =>  old = final * 100 / (100 - percent)
    return round(final_price * 100 / (100 - percent))


def parse_yandex_page(html):
    """Extract product records from the HTML of a Yandex Market results page.

    Args:
        html: Page source as a string.

    Returns:
        A list with one dict per matched card, with the keys ``product_url``,
        ``product_name``, ``brand``, ``is_original`` (True or None),
        ``final_price`` (int), ``old_price`` (int), ``discount`` (int),
        ``rating`` (float), ``reviews_count`` (str), ``delivery_time`` (str)
        and ``market`` (always ``'YDX'``).  Any field whose element is missing
        or cannot be parsed is None; a malformed field no longer aborts the
        whole page.
    """
    logging.info("Parsing Yandex Market page")
    soup = BeautifulSoup(html, "html.parser")
    products_data = []

    # NOTE(review): this selector targets one specific element id, which
    # normally identifies a single container rather than each product card, so
    # it may yield at most one "card" per page.  Confirm against live markup.
    product_cards = soup.select(_YDX_CARD_SELECTOR)

    for card in product_cards:
        link_element = card.select_one("a")
        # .get() tolerates an anchor without an href attribute.
        product_url = link_element.get("href") if link_element is not None else None

        product_name = _ydx_text(card, _YDX_TITLE_SELECTOR)
        # Guarded by its own element; the original tested the title element here.
        brand = _ydx_text(card, _YDX_BRAND_SELECTOR)

        # True only when the badge text is exactly 'ОРИГИНАЛ'; otherwise unknown.
        is_original = True if _ydx_text(card, _YDX_BADGE_SELECTOR) == 'ОРИГИНАЛ' else None

        final_price = _ydx_price(_ydx_text(card, _YDX_PRICE_SELECTOR))
        discount = _ydx_discount(_ydx_text(card, _YDX_DISCOUNT_SELECTOR))
        old_price = _ydx_old_price(final_price, discount)
        rating = _ydx_rating(_ydx_text(card, _YDX_RATING_SELECTOR))

        reviews_count = _ydx_text(card, _YDX_REVIEWS_SELECTOR)
        if reviews_count is not None:
            # Drop the Russian word for "ratings" in its three grammatical
            # forms, then the whitespace that separated it from the number.
            reviews_count = reviews_count.replace("оценки", '').replace("оценок", '').replace("оценка", '')
            reviews_count = reviews_count.strip()

        delivery_time = _ydx_text(card, _YDX_DELIVERY_SELECTOR)

        products_data.append({
            "product_url": product_url,
            "product_name": product_name,
            "brand": brand,
            "is_original": is_original,
            "final_price": final_price,
            "old_price": old_price,
            "discount": discount,
            "rating": rating,
            "reviews_count": reviews_count,
            "delivery_time": delivery_time,
            "market": 'YDX'
        })
    logging.info(f"Parsed {len(products_data)} products from Yandex Market page")
    return products_data

def parse(driver, max_scroll_count=5, pause=3):
    """Collect unique products from the current page, scrolling to load more.

    The page source is parsed, newly seen products are recorded, and the
    window is scrolled to the bottom.  This repeats until a pass yields
    nothing new or ``max_scroll_count`` scrolls have been made.  The page is
    parsed once more after the last scroll, so items loaded by that scroll
    are collected too.

    Args:
        driver: Selenium WebDriver positioned on a results page.
        max_scroll_count: Maximum number of scrolls to perform.
        pause: Lower bound, in seconds, of the random wait after each scroll;
            the upper bound is ``pause + 2``.

    Returns:
        A list of product dicts as produced by ``parse_yandex_page``, in
        first-seen order.  Products are de-duplicated by ``product_url``.
        Those without a URL are identified by an MD5 digest of
        name|brand|final_price, and only those carry a ``product_id`` key.
    """
    logging.info(f"Starting parse with max_scroll_count={max_scroll_count} and pause={pause}")
    all_products = []
    seen_ids = set()
    scrolls_done = 0

    while True:
        current_products = parse_yandex_page(driver.page_source)

        new_items_found = False
        for prod in current_products:
            product_id = prod.get("product_url")
            if not product_id:
                # No URL to key on: derive a stable id from the visible fields.
                # MD5 serves only as a fingerprint here, not for security.
                hash_str = f"{prod.get('product_name')}|{prod.get('brand')}|{prod.get('final_price')}"
                product_id = hashlib.md5(hash_str.encode('utf-8')).hexdigest()
                prod["product_id"] = product_id

            if product_id not in seen_ids:
                seen_ids.add(product_id)
                all_products.append(prod)
                new_items_found = True

        if not new_items_found:
            logging.info("No new items found, breaking loop")
            break
        if scrolls_done >= max_scroll_count:
            # Scroll budget used up.  Stopping here, after parsing, avoids a
            # final scroll and wait whose results would never be read.
            logging.info("Scroll limit reached, stopping")
            break

        logging.info("New items found, scrolling down")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(pause, pause + 2))
        scrolls_done += 1

    logging.info(f"Parsing complete, total products collected: {len(all_products)}")
    return all_products

def save_to_csv(data, filename):
    """Write the collected product records to a UTF-8 CSV file.

    Args:
        data: List of dicts, one per row.  When it is empty, nothing is
            written and a warning is logged.
        filename: Destination path of the CSV file.

    Returns:
        None.
    """
    logging.info(f"Saving data to CSV file: {filename}")
    if data:
        # Column names come from the dict keys; the row index is not written.
        frame = pd.DataFrame(data)
        frame.to_csv(filename, index=False, encoding="utf-8")
        logging.info("Data saved successfully")
    else:
        logging.warning("No data to save")

if __name__ == "__main__":
    # Script entry point: search Yandex Market for a fixed query, collect the
    # results while scrolling, and store them in a CSV file.
    logging.info("Script started")
    driver = init_driver()
    try:
        search_query = "браслет женский"
        logging.info(f"Search query: {search_query}")
        search_on_market(driver, search_query)
        data = parse(driver, max_scroll_count=10, pause=2)
        save_to_csv(data, "yandex_market_data_z5.csv")
    except Exception:
        # Record the traceback in the log file before it propagates.
        # Otherwise the log would only show the "finished" line below.
        logging.exception("Script failed")
        raise
    finally:
        # Always release the browser, whether or not the run succeeded.
        driver.quit()
        logging.info("Driver quit and script finished")
