# In [1]:  (Jupyter notebook cell marker left over from export)
import re
import pandas as pd
import time
import random
import hashlib

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import configparser
import logging
import os

def resolve_log_level(name, default=logging.INFO):
    """Translate a logging level name to its numeric value.

    The lookup is case-insensitive and ignores surrounding whitespace, so
    ``"debug"``, ``"Debug"`` and ``" DEBUG "`` all map to ``logging.DEBUG``.

    Args:
        name: Level name as read from the configuration file.
        default: Value returned when *name* is not a known level.

    Returns:
        The integer level, or *default* for unknown names.
    """
    candidate = getattr(logging, str(name).strip().upper(), None)
    # A bare getattr() on the raw string would hand back helpers such as the
    # function ``logging.info`` for the value "info"; basicConfig() rejects
    # those with a TypeError. Only integer level constants are accepted here.
    return candidate if isinstance(candidate, int) else default


# Logging settings come from the optional [logging] section of settings.ini;
# built-in defaults are used when the file does not exist.
cfg = configparser.ConfigParser()
if os.path.exists("settings.ini"):
    cfg.read("settings.ini")
    log_enabled = cfg.getboolean("logging", "enabled", fallback=True)
    log_level_str = cfg.get("logging", "level", fallback="INFO")
    # NOTE(review): this fallback ("app.log") differs from the no-config
    # default below ("OZON.log") - confirm which file name is intended.
    log_file = cfg.get("logging", "file", fallback="app.log")
else:
    log_enabled = True
    log_level_str = "INFO"
    log_file = "OZON.log"

if log_enabled:
    num_level = resolve_log_level(log_level_str)
    logging.basicConfig(level=num_level, filename=log_file, filemode='a',
                        format="%(asctime)s - %(levelname)s - %(message)s")

# Default location of the ChromeDriver executable; pass ``driver_path`` to
# init_driver() to use a different binary.
CHROMEDRIVER_PATH = r"C:\Users\D\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"

# User-agent string sent by the browser. Each product token must be separated
# by a space, including the one between "(KHTML, like Gecko)" and "Chrome/...".
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/110.0.5481.104 Safari/537.36"
)


def init_driver(driver_path=CHROMEDRIVER_PATH, user_agent=USER_AGENT):
    """Start a Chrome WebDriver session configured for scraping.

    Args:
        driver_path: Filesystem path of the ChromeDriver executable.
        user_agent: Value passed to Chrome through the ``user-agent`` switch.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    logging.info("Initializing webdriver")
    service = Service(driver_path)

    options = webdriver.ChromeOptions()
    # Drop the automation switch/extension that Chrome enables under WebDriver.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument(f"user-agent={user_agent}")
    options.add_argument("--start-maximized")

    driver = webdriver.Chrome(service=service, options=options)
    # Injected into every new document before page scripts run, so that
    # ``navigator.webdriver`` reads as undefined.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {
            "source": """
               Object.defineProperty(navigator, 'webdriver', {
                 get: () => undefined
               })
            """
        }
    )
    logging.info("Webdriver initialized")
    return driver

def clean(raw_price):
    """Reduce a displayed price to a string containing only its digits.

    Everything except digits is removed, including currency symbols, spaces,
    commas and periods. Separators are dropped rather than interpreted, so
    ``"1,234.56"`` becomes ``"123456"``.

    Args:
        raw_price: Price text as scraped from the page, or a falsy value.

    Returns:
        The digit string, or ``None`` when the input is falsy or contains no
        digits.
    """
    logging.debug(f"Cleaning raw price: {raw_price}")
    if raw_price:
        kept = re.sub(r"[^\d,.]", "", raw_price)
        # Remove the separators that the regex above let through.
        digits = kept.translate(str.maketrans("", "", ",."))
        if digits:
            logging.debug(f"Cleaned price: {digits}")
            return digits
    return None

def search_on_ozon(driver, query):
    """Open the Ozon home page and submit *query* through its search box.

    The function sleeps for a random interval after loading the page and again
    after submitting the query. It returns nothing; the driver is left on
    whatever page the submission leads to.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        query: Text typed into the search field.

    Raises:
        selenium.common.exceptions.TimeoutException: If the search field
            (element with ``name="text"``) does not appear within 10 seconds.
    """
    logging.info(f"Searching on Ozon with query: {query}")
    home_page = "https://www.ozon.ru/"
    driver.get(home_page)
    time.sleep(random.uniform(3, 5))

    search_field_locator = (By.NAME, "text")
    waiter = WebDriverWait(driver, 10)
    field = waiter.until(EC.presence_of_element_located(search_field_locator))

    field.clear()
    field.send_keys(query)
    field.send_keys(Keys.ENTER)

    time.sleep(random.uniform(3, 6))
    logging.info("Search completed")

def _text_of(node, selector):
    """Return the stripped text of the first *selector* match under *node*, or None."""
    match = node.select_one(selector)
    if not match:
        return None
    return match.get_text(strip=True)


def _leading_span_texts(node, container_selector):
    """Return stripped texts of the first two ``span.p6b17-a4`` in the container ([] if absent)."""
    container = node.select_one(container_selector)
    if not container:
        return []
    return [span.get_text(strip=True) for span in container.select('span.p6b17-a4')[:2]]


def parse_ozon_page(html):
    """Extract product records from the HTML of an Ozon search results page.

    Every ``div[data-index]`` inside the ``searchResultsV2`` widget is treated
    as one product card. A field whose element is missing from a card is
    stored as ``None`` (``is_original`` as ``False``).

    Args:
        html: Page source as a string.

    Returns:
        A list of dicts with the keys ``product_url``, ``product_name``,
        ``brand``, ``is_original``, ``final_price``, ``old_price``,
        ``discount``, ``rating``, ``reviews_count``, ``delivery_time`` and
        ``market``.
    """
    # NOTE(review): the class-based selectors below look site-generated and
    # may change between Ozon releases - verify against the live markup.
    logging.info("Parsing Ozon page")
    soup = BeautifulSoup(html, "html.parser")
    products_data = []

    for card in soup.select('div[data-widget="searchResultsV2"] div[data-index]'):
        anchor = card.select_one('a')
        product_url = anchor['href'] if anchor and anchor.has_attr('href') else None

        product_name = _text_of(card, 'span.tsBody500Medium')

        # First span holds the brand; the second one may carry the
        # "Оригинал" badge.
        brand_texts = _leading_span_texts(card, 'div.i8y_24.p6b17-a.tsBodyM')
        brand = brand_texts[0] if brand_texts else None
        is_original = len(brand_texts) >= 2 and "Оригинал" in brand_texts[1]

        final_raw = _text_of(card, 'span.c3024-a1.tsHeadline500Medium.c3024-b1.c3024-a6')
        final_price = None if final_raw is None else clean(final_raw)

        old_raw = _text_of(card, 'span.c3024-a1.tsBodyControl400Small.c3024-b.c3024-a6')
        old_price = None if old_raw is None else clean(old_raw)

        discount_str = _text_of(card, 'span.tsBodyControl400Small.c3024-a6.c3024-b4')

        # First span holds the rating; the second one the review count text.
        rating_texts = _leading_span_texts(card, 'div.i8y_24.p6b17-a.tsBodyMBold')
        rating_str = rating_texts[0] if rating_texts else None
        reviews_count = None
        if len(rating_texts) >= 2:
            digits_only = re.sub(r"[^\d]", "", rating_texts[1])
            if digits_only.isdigit():
                reviews_count = int(digits_only)

        delivery_time = _text_of(card, 'div[class="b2121-a8 tsBodyControl500Medium"]')

        products_data.append({
            "product_url": product_url,
            "product_name": product_name,
            "brand": brand,
            "is_original": is_original,
            "final_price": final_price,
            "old_price": old_price,
            "discount": discount_str,
            "rating": rating_str,
            "reviews_count": reviews_count,
            "delivery_time": delivery_time,
            "market": 'OZON'
        })

    logging.info(f"Found {len(products_data)} products on the current page")
    return products_data

def _dedup_key(prod):
    """Return the identity used to de-duplicate *prod* across scroll passes.

    The product URL is used when present. Otherwise an MD5 digest of name,
    brand and final price is used, and that digest is also stored on the
    record under ``"product_id"``.
    """
    key = prod.get("product_url")
    if not key:
        hash_str = f"{prod.get('product_name')}|{prod.get('brand')}|{prod.get('final_price')}"
        key = hashlib.md5(hash_str.encode('utf-8')).hexdigest()
        prod["product_id"] = key
    return key


def parse(driver, max_scroll_count=5, pause=2):
    """Collect unique products from the current results page while scrolling.

    The page is parsed once as loaded and then again after every scroll to the
    bottom, so items loaded by the final scroll are collected too. Scrolling
    stops as soon as a pass yields no unseen product, or after
    ``max_scroll_count`` scrolls.

    Args:
        driver: Selenium WebDriver already positioned on a results page.
        max_scroll_count: Maximum number of scroll-to-bottom actions. With
            ``0`` the page is parsed once without scrolling.
        pause: Lower bound, in seconds, of the random wait after each scroll;
            the upper bound is ``pause + 3``.

    Returns:
        A list of product dicts in first-seen order. Records without a URL
        additionally carry a ``"product_id"`` key holding their hash.
    """
    logging.info(f"Starting parse with max_scroll_count={max_scroll_count} and pause={pause} sec")
    all_products = []
    seen_ids = set()

    def harvest():
        """Parse the current DOM, keep unseen products, report whether any were new."""
        found_new = False
        for prod in parse_ozon_page(driver.page_source):
            key = _dedup_key(prod)
            if key not in seen_ids:
                seen_ids.add(key)
                all_products.append(prod)
                found_new = True
        return found_new

    new_items_found = harvest()
    scrolls_done = 0
    while new_items_found and scrolls_done < max_scroll_count:
        logging.info("New items found, scrolling down")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(pause, pause + 3))
        scrolls_done += 1
        # Parse after every scroll so the content it loaded is never dropped.
        new_items_found = harvest()

    if new_items_found:
        logging.info("Reached max_scroll_count, stopping scroll")
    else:
        logging.info("No new items found, stopping scroll")
    logging.info(f"Parsing complete. Total products collected: {len(all_products)}")
    return all_products

def save_to_csv(data, filename):
    """Write the collected records to *filename* as a UTF-8 CSV file.

    When *data* is empty, a warning is logged and no file is written.

    Args:
        data: Records to save, in any form ``pandas.DataFrame`` accepts
            (here a list of dicts).
        filename: Destination path of the CSV file.
    """
    logging.info(f"Saving data to CSV file: {filename}")
    if data:
        frame = pd.DataFrame(data)
        frame.to_csv(filename, index=False, encoding="utf-8")
        logging.info("Data saved successfully")
    else:
        logging.warning("No data to save")

# Script entry point: run one Ozon search, collect the result cards while
# scrolling, and write them to a CSV file.
if __name__ == "__main__":
    logging.info("Script started")
    driver = init_driver()
    try:
        search_query = "браслет с натуральными камнями"  # "bracelet with natural stones"
        logging.info(f"Search query: {search_query}")
        search_on_ozon(driver, search_query)
        data = parse(driver, max_scroll_count=50, pause=2)
        save_to_csv(data, "ozon_data_z5.csv")
    finally:
        # Always close the browser, even if searching, parsing or saving raised.
        driver.quit()
        logging.info("Driver quit and script finished")
