In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests as rq
import re

from selenium import webdriver
from selenium.webdriver.chromium.service import ChromiumService
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from concurrent.futures import ThreadPoolExecutor, as_completed

from fake_useragent import UserAgent
from selenium_stealth import stealth

from time import sleep

In [4]:
# SERVICE
service = Service()

In [3]:
# WILDBERRIES OPTIONS
options = ChromeOptions()
# options.add_argument("start-maximized")                                      # stealth
# options.add_experimental_option("excludeSwitches", ["enable-automation"])    # stealth
# options.add_experimental_option('useAutomationExtension', False)             # stealth
options.add_argument('--headless')
options.add_argument("--disable-extensions")
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')
options.add_argument('--remote-debugging-port=9222')
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")


In [17]:
# GET WILDBERRIES DATA 

def get_one_page(page_num, search_query, gender, start_price, end_price):
    digits = re.compile(r'[0-9]*')
    gender_number = {
        'Men': 1,
        'Women': 2,
        'Kids': 3
    }
    brands = []
    product_names = []
    prices = []

    options = ChromeOptions()
    ua = UserAgent()
    user_agent = ua.random
    # options.add_argument("start-maximized")                                      # stealth
    options.add_experimental_option("excludeSwitches", ["enable-automation"])    # stealth
    options.add_experimental_option('useAutomationExtension', False) 
    options.add_argument('--headless')
    options.add_argument("--disable-extensions")
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--no-sandbox')
    options.add_argument(f"--user-agent={user_agent}")
    options.add_argument(f'--remote-debugging-port={9222+page_num}')
    
    driver = webdriver.Chrome(service=service, options=options)

    driver.get(f"https://www.wildberries.ru/catalog/0/search.aspx?page={page_num + 1}&sort=popular&search={search_query}&priceU={start_price}00%3B{end_price}00&fkind={gender_number[gender]}")

    try:
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'product-card__middle-wrap')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        content = soup.find('div', class_='product-card-overflow')
        product_cards = content.find_all('div', class_='product-card__middle-wrap')
        for card in product_cards:
            brand = card.find('h2', class_='product-card__brand-wrap').text
            product_names.append(brand.split('/')[1].strip())
            brands.append(brand.split('/')[0].strip())
            prices.append(re.search(digits, ''.join(card.text.split('₽')[0].split()))[0])
    except Exception as e:
        print(f'Error: {e}')  
    finally:
        driver.quit()
        
    marketplace = ['wildberries'] * len(brands)
    gender_list = [gender] * len(brands)
    query = [search_query] * len(brands)
    result = pd.DataFrame({'Marketplace': marketplace, 'Brand': brands, 'Gender': gender_list, 'SearchQuery': query, 'ProductName': product_names, 'Price': prices})
    return result

def wildberries_parsing():
    result = []
    for search_query in ['Брюки', 'Рубашка', 'Джинсы', 'Свитшот', 'Худи', 'Шорты']:
        for gender in ['Men', 'Women', 'Kids']:
            for i in range(20, 30):
                result.append(get_one_page(i+1, search_query, gender, 0, 50000))
    return pd.concat(result).drop_duplicates(keep='first').reset_index(drop=True)

df_result_wb_3 = wildberries_parsing()
df_result_wb_3.count()


Marketplace    3617
Brand          3617
Gender         3617
SearchQuery    3617
ProductName    3617
Price          3617
dtype: int64

In [29]:
# CREATE LAMODA DATASET
from concurrent.futures import ThreadPoolExecutor

def pars_one_page(page_num, search_query, gender, start_price, end_price):
    brands = []
    product_names = []
    prices = []
    # gender = ['men', 'women', 'kids']
    response = rq.get(rf'''https://www.lamoda.ru/catalogsearch/result/?q="{search_query}"&submit=y&gender_section={gender.lower()}&price={start_price},{end_price}&page={page_num +1}''')
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('div', class_='grid__catalog')
        product_cards = content.find_all('div', class_='x-product-card-description')
        for card in product_cards:
            brands.append(card.find('div', class_='x-product-card-description__brand-name').text.strip())
            product_names.append(card.find('div', class_='x-product-card-description__product-name').text.strip())
            prices.append(int(card.find_all('span', class_=re.compile(r'x-product-card-description__price-[a-z]*?(?=(\b)|("))'))[-1]
                .text
                .strip()
                .replace(' ', '')
                .replace('₽', ''))
                )
    
    except Exception as e:
        print(f'Error: {e}')
    
    marketplace = ['lamoda'] * len(brands)
    gender_list = [gender.lower()] * len(brands)
    query = [search_query] * len(brands)
    result = pd.DataFrame({'Marketplace': marketplace, 'Brand': brands, 'Gender': gender_list, 'SearchQuery': query, 'ProductName': product_names, 'Price': prices})
    return result

def lamoda_parsing(search_query, gender, start_price, end_price):
    

    page_nums = [i for i in range(20)]
    with ThreadPoolExecutor(max_workers=20) as executor:
        result = list(executor.map(lambda page_num: pars_one_page(page_num, search_query, gender, start_price, end_price), page_nums))
    return pd.concat(result).reset_index(drop=True)
        

    

df_result = pd.DataFrame({'Marketplace': [], 'Brand': [], 'Gender': [], 'SearchQuery': [], 'ProductName': [], 'Price': []})
for search_query in ['Брюки', 'Рубашка', 'Джинсы', 'Свитшот', 'Худи', 'Шорты']:
    for gender in ['Men', 'Women', 'Kids']:
        df_result = pd.concat([df_result, lamoda_parsing(search_query, gender, 0, 50000)])

df_result.drop_duplicates(keep='first').reset_index(drop=True).to_csv('lamoda_test.csv', header=True, index=False)

pd.read_csv('lamoda_test.csv').count()

Marketplace    16287
Brand          16287
Gender         16287
SearchQuery    16287
ProductName    16287
Price          16287
dtype: int64

In [20]:
# GET OZON DATA

# 125165 - women
# 125166 - men
# 135513 - kids

options = ChromeOptions()
ua = UserAgent()
user_agent = ua.random
# options.add_argument("start-maximized")                                     # stealth
options.add_experimental_option("excludeSwitches", ["enable-automation"])    # stealth
options.add_experimental_option('useAutomationExtension', False) 
options.add_argument('--headless')
options.add_argument("--disable-extensions")
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')
# options.add_argument(f"--user-agent={user_agent}")
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
options.add_argument(f'--remote-debugging-port={9222}')

def get_ozon_page(search_query, gender, start_price, end_price):
    gender_dict = {
        'Men': 125166,
        'Women': 125165,
        'Kids': '135533%2C135532'
    }
    prices = []
    brands = []
    product_names = []
    
    driver = webdriver.Chrome(service=service, options=options)
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )
    driver.get(f"https://www.ozon.ru/category/odezhda-obuv-i-aksessuary-7500/?category_was_predicted=true&currency_price={start_price}.000%3B{end_price}.000&deny_category_prediction=true&from_global=true&sexmaster={gender_dict[gender]}&text={search_query}")
    try:
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, 'paginator')))
        for i in range(15):    
            for _ in range(7):
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            content = soup.find('div', id='paginator')
            product_name_span = content.find_all('span', class_='tsBody500Medium')
            for product_name in product_name_span:
                product_names.append(product_name.text)
            brands_list = content.find_all('div', class_='m1j_25')
            for brand in brands_list:
                prices.append(''.join(brand.text.split()).split('₽')[0])
                match = re.search(r"""((?<=\s)|(?<=[Ё-ё])|(?<=%))[A-Za-z]+?((\s|'|\.)[A-Za-z]+?){0,1}((?=\s)|(?=[Ё-ё0-9]))""", brand.text)
                if match:
                    brands.append(match[0])
                else:
                    match = re.search(r"""[А-ЯЁ]{3,}""", brand.text)
                    if match:
                        brands.append(match[0])
                    else:
                        brands.append(None)
      
    except Exception as e:
        print(f'Error: {e}')
    finally:
        driver.quit()

    print(len(brands), len(prices), len(product_names))
    marketplace = ['ozon'] * len(prices)
    gender_list = [gender.lower()] * len(prices)
    query = [search_query] * len(prices)
    result = pd.DataFrame({'Marketplace': marketplace, 'Brand': brands, 'Gender': gender_list, 'SearchQuery': query, 'ProductName': product_names, 'Price': prices})
    return result

def ozon_parsing():
    result = []
    for search_query in ['Брюки', 'Рубашка', 'Джинсы', 'Свитшот', 'Худи', 'Шорты']:
    # search_query = 'Свитшот'
        for gender in ['Men', 'Women', 'Kids']:
            result.append(get_ozon_page(search_query, gender, 0, 50000))
    return pd.concat(result).drop_duplicates(keep='first').reset_index(drop=True)

ozon_result_2 = ozon_parsing()

876 876 876
891 891 891
888 888 888
883 883 883
896 896 896
1009 1009 1009
882 882 882
895 895 895
1071 1071 1071
895 895 895
1074 1074 1074
888 888 888
1073 1073 1073
895 895 895
1068 1068 1068
1074 1074 1074
885 885 885
1063 1063 1063


In [21]:
ozon_result_2.count()

Marketplace    9996
Brand          8435
Gender         9996
SearchQuery    9996
ProductName    9996
Price          9996
dtype: int64

In [48]:
ozon_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13561 entries, 0 to 13573
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Marketplace  13561 non-null  object
 1   Brand        11478 non-null  object
 2   Gender       13561 non-null  object
 3   SearchQuery  13561 non-null  object
 4   ProductName  13561 non-null  object
 5   Price        13561 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 741.6+ KB


In [42]:
class BrandCompare():
    def __init__(self):
        self.__lamoda_df = pd.read_csv('marketplace_data/lamoda_03.2025.csv')
        self.__wb_df = pd.read_csv('marketplace_data/wb_03.2025.csv')
        self.__ozon_df = pd.read_csv('marketplace_data/ozon_03.2025.csv')

    def __get_brand_intersection(self, df1, df2, df3):
        return (
            list(set(df1['Brand'].str.lower()) & set(df2['Brand'].str.lower()) & set(df3['Brand'].str.lower()))
        )
    def __get_brands_df(self, df, search_query):
        return (
            df
            .query(f'SearchQuery == "{search_query}"')
            ['Brand'].str.lower()
            .value_counts()
            .reset_index()
            .assign(Percent = lambda x: (x['count'] * 100 / x['count'].sum()).round(2))
        )
    def __df_filtering(self, df, list_intersection):
        return df.query(f"(Brand.str.lower() in {list_intersection})")
    
    def get_dfs(self, query):
        lamoda_brands = self.__get_brands_df(self.__lamoda_df, query)
        wb_brands = self.__get_brands_df(self.__wb_df, query)
        ozon_brands = self.__get_brands_df(self.__ozon_df, query)
        brand_intersection = self.__get_brand_intersection(lamoda_brands, wb_brands, ozon_brands)

        lamoda_brands_gen = self.__df_filtering(lamoda_brands, brand_intersection)
        wb_brands_gen = self.__df_filtering(wb_brands, brand_intersection)
        ozon_brands_gen = self.__df_filtering(ozon_brands, brand_intersection)
        
        return [lamoda_brands_gen, wb_brands_gen, ozon_brands_gen]

brandCompare = BrandCompare()
brandCompare.get_dfs('Свитшот')[2].shape

(30, 3)

In [44]:
lamoda_df = pd.read_csv('marketplace_data/lamoda_03.2025.csv')
wb_df = pd.read_csv('marketplace_data/wb_03.2025.csv')
ozon_df = pd.read_csv('marketplace_data/ozon_03.2025.csv')

In [41]:
lamoda_brands_df = get_df_to_compare(lamoda_df, 'Брюки')
wb_brands_df = get_df_to_compare(wb_df, 'Брюки')
ozon_brands_df = get_df_to_compare(ozon_df, 'Брюки')
mp_brand_intersection = get_brand_intersection(lamoda_brands_df, wb_brands_df, ozon_brands_df)
mp_brand_intersection

['lassie',
 'zarina',
 'smena',
 'твое',
 "o'stin",
 'funday',
 'reebok',
 'acoola',
 'button blue',
 'телодвижения',
 'gloria jeans',
 'happyfox',
 'fila',
 'happy baby',
 'puma',
 'termit',
 'pelican',
 'hugo',
 'sela',
 'adidas',
 'mark formelle',
 'demix',
 'befree',
 'mango',
 'henderson',
 'kogankids',
 'kelme',
 'zolla',
 'elaria']

In [53]:
mp_brand_intersection = (list(set(lamoda_brands_df['Brand'].str.lower()) 
                        & set(wb_brands_df['Brand'].str.lower()) 
                        & set(ozon_brands_df['Brand'].str.lower()))
                        )


In [36]:
wb_brands_df.query(f"(Brand.str.lower() in {mp_brand_intersection})")

Unnamed: 0,Brand,count
0,ТВОЕ,278
1,ТЕЛОДВИЖЕНИЯ,95
2,Befree,89
3,Nike,86
4,O'STIN,73
...,...,...
4090,Nota Bene,1
4116,OUTVENTURE,1
4174,KAPPA,1
4366,MAISON DAVID,1


In [40]:
def get_df_to_compare(df, search_query):
    return (
        df
        .query(f'SearchQuery == "{search_query}"')
        [['Brand']]
        .value_counts()
        .reset_index()
        .assign(Percent = lambda x: (x['count'] * 100 / x['count'].sum()).round(2))
    )
def get_brand_intersection(df1, df2, df3):
    return (
        list(set(df1['Brand'].str.lower()) 
                        & set(df2['Brand'].str.lower()) 
                        & set(df3['Brand'].str.lower()))
    )

In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import requests as rq
import re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from concurrent.futures import ThreadPoolExecutor, as_completed

from fake_useragent import UserAgent
from selenium_stealth import stealth
from time import sleep

def ozon_parsing(service, search_query, gender, price_levels):

    options = ChromeOptions()
    ua = UserAgent()
    user_agent = ua.random
    options.add_experimental_option("excludeSwitches", ["enable-automation"])    # stealth
    options.add_experimental_option('useAutomationExtension', False) 
    options.add_argument('--headless')
    options.add_argument("--disable-extensions")
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--no-sandbox')
    options.add_argument(f"--user-agent={user_agent}")
    # options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
    options.add_argument(f'--remote-debugging-port={9222}')
    
    gender_dict = {
        'Men': 125166,
        'Women': 125165,
        'Kids': '135533%2C135532'
    }
    prices = []
    brands = []
    product_names = []
    
    try: driver.quit()
    except: pass
    driver = webdriver.Chrome(service=service, options=options)
    
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )
    driver.get(f"https://www.ozon.ru/category/odezhda-obuv-i-aksessuary-7500/?category_was_predicted=true&currency_price={price_levels[0]}.000%3B{price_levels[1]}.000&deny_category_prediction=true&from_global=true&sexmaster={gender_dict[gender]}&text={search_query}")
    try:
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, 'paginator')))
        for _ in range(5):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(1)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        content = soup.find('div', id='paginator')
        product_name_span = content.find_all('span', class_='tsBody500Medium')
        for product_name in product_name_span:
            product_names.append(product_name.text)
        brands_list = content.find_all('div', class_='m1j_25')
        for brand in brands_list:
            prices.append(''.join(brand.text.split()).split('₽')[0])
            match = re.search(r"""((?<=\s)|(?<=[Ё-ё])|(?<=%))[A-Za-z]+?((\s|'|\.)[A-Za-z]+?){0,1}((?=\s)|(?=[Ё-ё0-9]))""", brand.text)
            if match:
                brands.append(match[0])
            else:
                match = re.search(r"""[А-ЯЁ]{3,}""", brand.text)
                if match:
                    brands.append(match[0])
                else:
                    brands.append(None)
      
    except Exception as e:
        print(f'Error: {e}')
        driver.quit()
    finally:
        driver.quit()

    result = pd.DataFrame({'Brand': brands, 'ProductName': product_names, 'Price': prices})
    return result

service = Service()
search_query = 'Брюки'
gender = 'Men'
price_levels = (0, 10000)
ozon_parsing(service, search_query, gender, price_levels)



Error: Message: 
Stacktrace:
#0 0x6002bc1dda1a <unknown>
#1 0x6002bbc95390 <unknown>
#2 0x6002bbce6c85 <unknown>
#3 0x6002bbce6eb1 <unknown>
#4 0x6002bbd35d64 <unknown>
#5 0x6002bbd0cbfd <unknown>
#6 0x6002bbd3307b <unknown>
#7 0x6002bbd0c9a3 <unknown>
#8 0x6002bbcd860e <unknown>
#9 0x6002bbcd9dd1 <unknown>
#10 0x6002bc1a3ddb <unknown>
#11 0x6002bc1a7cbc <unknown>
#12 0x6002bc18b392 <unknown>
#13 0x6002bc1a8834 <unknown>
#14 0x6002bc16f1ef <unknown>
#15 0x6002bc1cc038 <unknown>
#16 0x6002bc1cc216 <unknown>
#17 0x6002bc1dc896 <unknown>
#18 0x7e5b2669caa4 <unknown>
#19 0x7e5b26729c3c <unknown>



Unnamed: 0,Brand,ProductName,Price
