В этом блокноте мы занимаемся скачиванием продуктов с сайта Winmart

Для начала нам нужно импортировать нужные библиотеки и задать константы. Это параметры запросов к сайту.

In [1]:
import requests
import json
import time
import random
import pandas as pd
from datetime import date

# define constants for requests

CATEGORIES_URL = "https://api-crownx.winmart.vn/mt/api/web/v1/category"
CHECK_DATE = date.today() # current date for reference

HEADERS = {
    'origin': 'https://winmart.vn',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
}

Так как скачивать продукты мы будем по категориям, нужно получить список категорий товаров с сайта. За это будет отвечать следующая функция, которая запросит у сайта список категорий и подкатегорий, выберет только id и названия категорий и сохранит данные в словарь.

In [2]:
def fetch_categories():
    
    response = requests.get(CATEGORIES_URL, headers=HEADERS) # get the data
    raw_categories = json.loads(response.text) # convert into a dictionary

    cleaned_categories = []
    
    # go through the raw data and select only the necessary fields
    for category in raw_categories["data"]:    # select the broad category, save its id and name
        cleaned_categories.append({
            "code": category["parent"]["code"],
            "name": category["parent"]["name"],
            "parent_id": None,
            "parent_name": None,
            "has_child": True if category["lstChild"] else False,
            "slug": category["parent"]["seoName"], # to create an items request
            "level": category["parent"]["level"] # to avoid double scanning subcategories
        })
        if category["lstChild"]:    # if category has children, go through them too
            for child in category["lstChild"]:
                cleaned_categories.append({
                    "code": child["parent"]["code"],
                    "name": child["parent"]["name"],
                    "parent_id": category["parent"]["code"],
                    "parent_name": category["parent"]["name"],
                    "has_child": False,
                    "slug": child["parent"]["seoName"], # to create an items request
                    "level": child["parent"]["level"] # to avoid double scanning subcategories
                })

    return cleaned_categories

Загружаем категории в отдельном блоке, чтобы не повторять эту операцию.

In [3]:
categories = fetch_categories()
categories_df = pd.DataFrame(categories)
categories_df.to_csv(f'categories-{CHECK_DATE}-winmart.csv', index=False)
print(categories)

[{'code': 'MENU114', 'name': 'Giá Siêu Rẻ', 'parent_id': None, 'parent_name': None, 'has_child': False, 'slug': 'gia-sieu-re--c114', 'level': 1}, {'code': 'MENU51', 'name': 'Ưu Đãi Hội Viên', 'parent_id': None, 'parent_name': None, 'has_child': False, 'slug': 'uu-dai-hoi-vien--c51', 'level': 1}, {'code': 'MENU08', 'name': 'Sữa các loại', 'parent_id': None, 'parent_name': None, 'has_child': True, 'slug': 'sua-cac-loai--c08', 'level': 1}, {'code': 'MENU0133', 'name': 'Sữa Tươi', 'parent_id': 'MENU08', 'parent_name': 'Sữa các loại', 'has_child': False, 'slug': 'sua-tuoi--c0133', 'level': 2}, {'code': 'MENU0134', 'name': 'Sữa Hạt - Sữa Đậu', 'parent_id': 'MENU08', 'parent_name': 'Sữa các loại', 'has_child': False, 'slug': 'sua-hat-sua-dau--c0134', 'level': 2}, {'code': 'MENU0135', 'name': 'Sữa Bột', 'parent_id': 'MENU08', 'parent_name': 'Sữa các loại', 'has_child': False, 'slug': 'sua-bot--c0135', 'level': 2}, {'code': 'MENU0138', 'name': 'Bơ Sữa - Phô Mai', 'parent_id': 'MENU08', 'parent_

In [12]:
categories_end = [cat for cat in categories if not cat['has_child']] # choose only childless categories
categories_end = categories_end[71:]
print(categories_end)
total_categories_count = len(categories_end)
total_categories_count

[{'code': 'MENU01133', 'name': 'Đồ dùng nhà bếp', 'parent_id': 'MENU25', 'parent_name': 'Đồ Dùng Gia Đình', 'has_child': False, 'slug': 'do-dung-nha-bep--c01133', 'level': 2}, {'code': 'MENU01136', 'name': 'Thiết bị dùng điện trong nhà', 'parent_id': 'MENU25', 'parent_name': 'Đồ Dùng Gia Đình', 'has_child': False, 'slug': 'thiet-bi-dung-dien-trong-nha--c01136', 'level': 2}, {'code': 'MENU01170', 'name': 'Vệ Sinh Nhà Cửa', 'parent_id': 'MENU25', 'parent_name': 'Đồ Dùng Gia Đình', 'has_child': False, 'slug': 've-sinh-nha-cua--c01170', 'level': 2}, {'code': 'MENU26', 'name': 'Điện Gia Dụng', 'parent_id': None, 'parent_name': None, 'has_child': False, 'slug': 'dien-gia-dung--c26', 'level': 1}, {'code': 'MENU27', 'name': 'Văn Phòng Phẩm - Đồ Chơi', 'parent_id': None, 'parent_name': None, 'has_child': False, 'slug': 'van-phong-pham-do-choi--c27', 'level': 1}]


5

In [13]:
# fetch the raw product data (the site uses an increasing offset to fetch all the products, we mimic this behavior)

def fetch_products(category):
    
    # total_products = 1 # just a random non-zero value to start with
    # offset = 0
    products = []

    ITEMS_URL = "https://api-crownx.winmart.vn/it/api/web/v3/item/category"
    PARAMS = {
            'orderByDesc': 'true',
            'pageNumber': '1',
            'pageSize': '8',
            'slug': category["slug"],
            'storeCode': '1561',
            'storeGroupCode': '1999',
        }
    
    response = requests.get(ITEMS_URL, headers=HEADERS, params=PARAMS) # get the data
    response_data = json.loads(response.text) # convert into a dictionary
    total_pages = response_data["paging"]["totalPages"]
    
    for page in range(1, total_pages + 1):
        time.sleep(random.uniform(1, 5)) # random time delay to avoid being blocked
        
        PARAMS = {
            'orderByDesc': 'true',
            'pageNumber': str(page),
            'pageSize': '8',
            'slug': category['slug'],
            'storeCode': '1561',
            'storeGroupCode': '1999',
        }
    
        response = requests.get(ITEMS_URL, headers=HEADERS, params=PARAMS) # get the data
        response_data = json.loads(response.text) # convert into a dictionary
        products += response_data["data"]["items"] # extract only products and add them to the list
        
        print(f'{len(products)}..', end='') # progress indication

    return products
    

In [6]:
raw_products = fetch_products(categories_end[0]) # fetch products

8..16..24..

In [7]:
raw_products

[{'id': 'd7cdc934-1856-4089-b26b-713ed77030af',
  'itemNo': '10333557',
  'brand': '10098',
  'seoName': 'lix-nuoc-giat-huong-nang-ha-33kg31kg--s10333557',
  'isAlcohol': False,
  'mediaUrl': 'https://hcm.fstorage.vn/images/2025/02/lix-1-1-20250207091638.png',
  'description': 'LIX Nước giặt hương nắng hạ 3,3kg/3,1kg',
  'name': 'LIX Nước giặt hương nắng hạ 3,3kg/3,1kg',
  'itemType': 'ZTRD',
  'uomId': '7b76539a-c816-45ee-9f3d-f4a58e778101',
  'uom': 'CAN',
  'uomName': 'Can',
  'quantityPerUnit': 1.0,
  'sku': '10333557CAN',
  'barcode': '8934669241455',
  'price': 199000.0,
  'salePrice': 115000.0,
  'quantity': 239.0,
  'promotionCode': '2300284810',
  'promotionType': 'ZB10',
  'scaleType': 'EQUAL',
  'scaleQuantity': 1.0,
  'mch1': '2',
  'mch1Name': 'Phi thực phẩm',
  'mch2': '202',
  'mch2Name': 'Hoá mỹ phẩm',
  'mch3': '20202',
  'mch3Name': 'Hóa phẩm',
  'mch4': '2020201',
  'mch4Name': 'Chất giặt tẩy',
  'mch5': '202020104',
  'mch5Name': 'Nước giặt',
  'brandName': 'LIX',
 

In [14]:
def clean_product_data(category, raw_products):

    cleaned_products = []

    # go through the raw data and select only the necessary fields
    for product in raw_products:
        cleaned_products.append({
            "category_code": category["code"],
            "name": product["name"],
            "price": product["price"],
            "uom": product["uom"],
            "supermarket": 'Winmart'
        })

    return cleaned_products

In [15]:
fetched_categories_count = 0    # counter for fetching progress tracker

# Create the file first with headers
products_df = pd.DataFrame(columns=["category_code", "name", "price", "uom", "supermarket"])
products_df.to_csv(f'products-{CHECK_DATE}-winmart.csv', index=False, mode='w')

for category in categories_end:
    raw_products = fetch_products(category) # fetch products
    new_products = clean_product_data(category, raw_products) # select only relevant data and add new products to the list

    products_df = pd.DataFrame(new_products)
    products_df.to_csv(f'products-{CHECK_DATE}-winmart.csv', index=False, mode='a', header=False)

    fetched_categories_count += 1
    print(f'№ {category["code"]} finished, {fetched_categories_count} out of {total_categories_count} categories fetched')
    time.sleep(random.uniform(1, 5))

print(f'Fetching complete, see the result in products-{CHECK_DATE}-winmart.csv')

8..16..24..32..40..48..56..64..72..80..88..94..№ MENU01133 finished, 1 out of 5 categories fetched
4..№ MENU01136 finished, 2 out of 5 categories fetched
8..16..24..32..40..45..№ MENU01170 finished, 3 out of 5 categories fetched
3..№ MENU26 finished, 4 out of 5 categories fetched
8..16..24..32..40..48..56..60..№ MENU27 finished, 5 out of 5 categories fetched
Fetching complete, see the result in products-2025-03-06-winmart.csv


---

Below is filtering and price normalization

---

In [133]:
import pandas as pd
import re

In [134]:
categories = pd.read_csv('categories-2025-03-06-winmart.csv')
products_original = pd.read_csv('products-2025-03-06-winmart-complete.csv')
products = products_original.drop(['category_code'], axis=1)    # drop category_code column as it's actually unnecessary

In [7]:
categories

Unnamed: 0,code,name,parent_id,parent_name,has_child,slug,level
0,MENU114,Giá Siêu Rẻ,,,False,gia-sieu-re--c114,1
1,MENU51,Ưu Đãi Hội Viên,,,False,uu-dai-hoi-vien--c51,1
2,MENU08,Sữa các loại,,,True,sua-cac-loai--c08,1
3,MENU0133,Sữa Tươi,MENU08,Sữa các loại,False,sua-tuoi--c0133,2
4,MENU0134,Sữa Hạt - Sữa Đậu,MENU08,Sữa các loại,False,sua-hat-sua-dau--c0134,2
...,...,...,...,...,...,...,...
87,MENU01133,Đồ dùng nhà bếp,MENU25,Đồ Dùng Gia Đình,False,do-dung-nha-bep--c01133,2
88,MENU01136,Thiết bị dùng điện trong nhà,MENU25,Đồ Dùng Gia Đình,False,thiet-bi-dung-dien-trong-nha--c01136,2
89,MENU01170,Vệ Sinh Nhà Cửa,MENU25,Đồ Dùng Gia Đình,False,ve-sinh-nha-cua--c01170,2
90,MENU26,Điện Gia Dụng,,,False,dien-gia-dung--c26,1


In [8]:
products

Unnamed: 0,name,price,uom,supermarket
0,"LIX Nước giặt hương nắng hạ 3,3kg/3,1kg",199000.0,CAN,Winmart
1,GREEN ACE Dầu đậu nành 2L,129000.0,CHA,Winmart
2,Nước uống sữa trái cây bổ dưỡng hương cam Nutr...,29000.0,CHA,Winmart
3,Nước uống sữa trái cây hương dâu Nutri Boost c...,29000.0,CHA,Winmart
4,Dầu đậu nành Meizan chai 2L,128900.0,CHA,Winmart
...,...,...,...,...
3507,Túi bút Deli 19x5x5cm,43000.0,CAI,Winmart
3508,Túi bút Deli 19x5x5cm,43500.0,CAI,Winmart
3509,Keo siêu dính Scotch AD113 2G,31000.0,CAI,Winmart
3510,Bao lì xì (8.5x17cm) -Hiện đại- Giao ngẫu nhiên,15300.0,XAP,Winmart


In [135]:
products = products.drop_duplicates() # remove duplicates
products['name'] = products['name'].str.strip() # remove extra spaces
products

Unnamed: 0,name,price,uom,supermarket
0,"LIX Nước giặt hương nắng hạ 3,3kg/3,1kg",199000.0,CAN,Winmart
1,GREEN ACE Dầu đậu nành 2L,129000.0,CHA,Winmart
2,Nước uống sữa trái cây bổ dưỡng hương cam Nutr...,29000.0,CHA,Winmart
3,Nước uống sữa trái cây hương dâu Nutri Boost c...,29000.0,CHA,Winmart
4,Dầu đậu nành Meizan chai 2L,128900.0,CHA,Winmart
...,...,...,...,...
3507,Túi bút Deli 19x5x5cm,43000.0,CAI,Winmart
3508,Túi bút Deli 19x5x5cm,43500.0,CAI,Winmart
3509,Keo siêu dính Scotch AD113 2G,31000.0,CAI,Winmart
3510,Bao lì xì (8.5x17cm) -Hiện đại- Giao ngẫu nhiên,15300.0,XAP,Winmart


In [140]:
product_regex_map = {
    # 'rice': r'^gạo(?!.*lứt)',
    # 'bread': r'^bánh (mì|mỳ|sandwich)(?!.*(bơ|hoa|chà|thịt))',
    # 'chicken_fillet': r'(fillet|phi lê|\bức)(?!.*đùi).*gà',
    # 'pork_leg': r'đùi.*heo',
    # 'egg': r'^trứng gà',
    # 'cucumber': r'^dưa.*(chuột|leo)(?!.*ngâm)',
    # 'carrot': r'^cà rốt',
    # 'onion': r'^hành tây',
    # 'tomato': r'^cà chua',
    # 'cabbage': r'bắp cải trắng',
    # 'eggplant': r'cà tím',
    # 'banana': r'^chuối(?!.*sấy)',
    # 'orange': r'^cam',
    # 'milk': r'^(thùng.*sữa|sữa|lốc.*sữa)(?!.*(vị|hương|có đường|ít đường|socola|sô cô la|dâu|chua|lên men|bắp|lact|yến)).*(trùng|tươi|tự nhiên)',
    # 'yogurt': r'^(sữa chua|lốc.*hộp.*sữa chua)(?!.*(uống|lên men|fristi|chai))',
    # 'condensed_milk': r'sữa đặc',
    # 'black_tea': r'^trà\b(?!.*(ml|l\b|lít|sữa|gừng|nestea|atiso|ice|xanh|ô long|nhài|tôm|thái|hòa tan|zoga))',
    # 'green_tea': r'^trà\b(?!.*(ml|l\b|lít|sữa|gừng|nestea|atiso|ice|đen|hòa tan|zoga|hoa cúc|ceylon|đào))',
    # 'ground_coffee': r'^(cà phê|café)(?!.*hòa tan).*(bột|xay|sáng|chế phin|nâu|khát)',
    # 'sugar': r'^đường.*(pure|trắng|mía)',
    # 'salt': r'^muối.*biển',
    # 'sunflower_oil': r'dầu.*hướng dương',
    # 'soybean_oil': r'dầu.*nành',
    # 'water': r'^nước (uống|khoáng|tinh)(?!.*(vị|sữa|tăng|ion))',
    # 'spaghetti': r'^mì(?!.*(kool|xốt)).*(ý|spag)',
    # 'rice_noodles': r'^bún.*safoco',
    # 'tofu': r'^(đậu|tàu) hũ(?!.*(chiên|trứng))',
    # 'water_spinach': r'^rau.*muống',
    # 'mango': r'^xoài(?!.*sấy)',
    # 'fish_sauce': r'^nước mắm(?!.*ớt)'
}
product_regex_list = '|'.join(product_regex_map.values())
product_regex_list

'^gạo(?!.*lứt)'

In [141]:
filtered_products = products.loc[products.name.str.contains(product_regex_list, case=False, regex=True)]
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(filtered_products)

Unnamed: 0,name,price,uom,supermarket
6,Gạo Ngọc Nương ST 25 đặc sản 3Kg,123000.0,G1,Winmart
65,Gạo ST25 Giống Cây Trồng TW túi 3kg,107000.0,G1,Winmart
2490,Gạo Ngọc Nương Lức đỏ túi 1kg,51000.0,G1,Winmart
2491,Gạo ST25 Ngọc Nương túi 5Kg,189000.0,G1,Winmart
2492,Gạo Lúa Tôm ST25 Ngọc Nương túi 5Kg,199000.0,G1,Winmart
2494,Gạo Japonica Neptune bịch 5kg,137600.0,G1,Winmart
2497,Gạo Ngọc Nương ST25 3 Kg,115000.0,G1,Winmart


In [130]:
def assign_product_type(row):
    name = row['name']
    for product_type, regex in product_regex_map.items():
        match = re.search(regex, name, flags=re.IGNORECASE)
        if match:
            return product_type
    return None

filtered_products = filtered_products.copy()  # recreate the dataframe
filtered_products.loc[:,'product_type'] = filtered_products.apply(assign_product_type, axis=1)
filtered_products

Unnamed: 0,name,price,uom,supermarket,product_type
1,GREEN ACE Dầu đậu nành 2L,129000.0,CHA,Winmart,soybean_oil
4,Dầu đậu nành Meizan chai 2L,128900.0,CHA,Winmart,soybean_oil
6,Gạo Ngọc Nương ST 25 đặc sản 3Kg,123000.0,G1,Winmart,rice
9,Dầu đậu nành Meizan chai 1L,55800.0,CHA,Winmart,soybean_oil
10,Nước mắm Nam Ngư Đệ nhị 900ml,26500.0,CHA,Winmart,fish_sauce
...,...,...,...,...,...
3213,Trứng Gà Sạch O'LALA hộp 10_Size L,30500.0,HOP,Winmart,egg
3216,Trứng Gà Ta O'LALA Sạch giỏ 10,43000.0,HOP,Winmart,egg
3217,Tàu hũ bổ dưỡng Tafu gói 300g,10400.0,CAI,Winmart,tofu
3221,Tàu hũ mềm Ichiban hộp 300g,12800.0,HOP,Winmart,tofu


In [131]:
def extract_weight(row):
    name, uom = row['name'], row['uom']
    
    # some products are sold in groups, so we'll check that first
    match = re.search(r'G(\d)', uom)
    if match:
        number = int(match.group(1))
    elif uom == 'T':
        match = re.search(r'(\d+)\s?(gói|hộp|túi)', name, flags=re.IGNORECASE)
        if match:
            number = int(match.group(1))
    else:
        number = 1
    
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(g\b|kg)', name, flags=re.IGNORECASE)
    if match:
        weight = float(match.group(1).replace(',', '.'))
        unit = match.group(2)
        return number * weight * 1000 if unit in ['kg','Kg'] else number * weight
    # if name doesn't contain anything, check uom
    if uom == 'KG':
        weight = 1000
        return weight
    
    return None  # if nothing matched

def extract_number_of_units(row):
    name, product_type, uom = row['name'], row['product_type'], row['uom']
    
    # first, check name
    match = re.search(r'(\d+)\s?(quả|trái)', name, flags=re.IGNORECASE)
    if match:
        number_of_units = int(match.group(1))
        return number_of_units
    # for eggs let's take any number from the name (because this is how they specify it)
    match = re.search(r'(\d+)', name)
    if match and product_type == 'egg':
        number_of_units = int(match.group(1))
        return number_of_units
    
    return None  # if nothing matched

def extract_volume(row):
    name, uom = row['name'], row['uom']
    
    # drinks are often sold in groups, so we'll check that first
    match = re.search(r'G(\d)', uom)
    if match:
        number = int(match.group(1))
    elif uom == 'T':
        match = re.search(r'(\d+)\s?(gói|hộp|túi|chai)', name, flags=re.IGNORECASE)
        if match:
            number = int(match.group(1))
        else:
            number = 0
    else:
        number = 1
    
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(ml|l\b|lít)', name, flags=re.IGNORECASE)
    if match:
        volume = float(match.group(1).replace(',', '.'))
        unit = match.group(2)
        return number * volume * 1000 if unit in ['l', 'L', 'lít'] else number * volume

    return None  # if nothing matched

filtered_products = filtered_products.copy()  # recreate the dataframe
# extract and calculate weights
filtered_products.loc[:,'weight'] = filtered_products.apply(extract_weight, axis=1)  # a column with weigths in grams
filtered_products.loc[:,'price_kg'] = filtered_products.price / filtered_products.weight * 1000   # a column with prices per kg
# # extract and calculate number of units (for products with units)
filtered_products.loc[:,'number_of_units'] = filtered_products.apply(extract_number_of_units, axis=1)  # a column with number of units
filtered_products.loc[:,'price_unit'] = filtered_products.price / filtered_products.number_of_units   # a column with prices per unit
# extract and calculate volume
filtered_products.loc[:,'volume'] = filtered_products.apply(extract_volume, axis=1)  # a column with volume in ml
filtered_products.loc[:,'price_lit'] = filtered_products.price / filtered_products.volume * 1000   # a column with prices per liter
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None, 'display.float_format', '{:.1f}'.format):
    display(filtered_products)

Unnamed: 0,name,price,uom,supermarket,product_type,weight,price_kg,number_of_units,price_unit,volume,price_lit
1,GREEN ACE Dầu đậu nành 2L,129000.0,CHA,Winmart,soybean_oil,,,,,2000.0,64500.0
4,Dầu đậu nành Meizan chai 2L,128900.0,CHA,Winmart,soybean_oil,,,,,2000.0,64450.0
6,Gạo Ngọc Nương ST 25 đặc sản 3Kg,123000.0,G1,Winmart,rice,3000.0,41000.0,,,,
9,Dầu đậu nành Meizan chai 1L,55800.0,CHA,Winmart,soybean_oil,,,,,1000.0,55800.0
10,Nước mắm Nam Ngư Đệ nhị 900ml,26500.0,CHA,Winmart,fish_sauce,,,,,900.0,29444.4
11,Nước mắm Nam Ngư chai 1.2L,65000.0,CHA,Winmart,fish_sauce,,,,,1200.0,54166.7
28,Rau muống WinEco gói 500g,16500.0,G1,Winmart,water_spinach,500.0,33000.0,,,,
41,Cà chua đỏ WinEco,32900.0,KG,Winmart,tomato,1000.0,32900.0,,,,
44,MEATDELI Thịt đùi heo (S),144900.0,KG,Winmart,pork_leg,1000.0,144900.0,,,,
48,MEATDELI Thịt Nạc đùi heo (S),151900.0,KG,Winmart,pork_leg,1000.0,151900.0,,,,


In [132]:
filtered_products.to_csv(f'filtered_products-2025-03-06-winmart.csv')