### 1. Setup

We import the necessary libraries and define key parameters for interacting with Winmart’s API: the base URL for fetching categories, the current date for reference, and request headers.

In [None]:
import requests
import json
import time
import random
import pandas as pd
from datetime import date

# Shared request configuration used by the API calls in this notebook.
# CATEGORIES_URL is the endpoint fetch_categories() queries for the category list.
CATEGORIES_URL = "https://api-crownx.winmart.vn/mt/api/web/v1/category"
# CHECK_DATE is the date the notebook is run; it is embedded in the output CSV filenames.
CHECK_DATE = date.today()

# HEADERS is sent with every request made by fetch_categories() and fetch_products().
# NOTE(review): presumably the origin/user-agent pair mimics a browser session on winmart.vn —
# confirm whether the API actually requires both.
HEADERS = {
    'origin': 'https://winmart.vn',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
}

### 2. Web Scraping

To extract product data from Winmart, we follow a multi-step process: fetch product categories, collect item data by category, clean the results, and save everything in a structured format.

#### 2.1. Fetching Categories

We start by retrieving a list of product categories from Winmart’s API. The function extracts the relevant fields — *code*, *name*, the parent category's code and name, a flag indicating whether the category has children, the *slug* (used to construct request URLs), and the hierarchy *level* — and returns a cleaned list.

Each category is identified by a numeric code and a slug. While the numeric code is not directly needed for the product requests, the slug is essential for constructing the URL to fetch product data from specific categories.

In [None]:
def fetch_categories():
    """Fetch the Winmart category list and flatten it into a list of dicts.

    Each top-level category is emitted first, immediately followed by its
    children. Only these two levels are flattened; children are always
    recorded with ``has_child`` set to False.

    Returns:
        list[dict]: one dict per category with the keys ``code``, ``name``,
        ``parent_id``, ``parent_name``, ``has_child``, ``slug`` and ``level``.
        ``parent_id`` and ``parent_name`` are None for top-level categories.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        requests.Timeout: if the API does not answer within 30 seconds.
    """
    # a timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status reports HTTP errors here instead of as a KeyError further down
    response = requests.get(CATEGORIES_URL, headers=HEADERS, timeout=30)
    response.raise_for_status()
    raw_categories = response.json()  # decoded JSON payload; categories live under "data"

    cleaned_categories = []

    for category in raw_categories["data"]:
        parent = category["parent"]
        children = category["lstChild"] or []  # tolerate a missing/empty child list

        # top-level entry: it has no parent, but the fields are kept for a uniform schema
        cleaned_categories.append({
            "code": parent["code"],
            "name": parent["name"],
            "parent_id": None,
            "parent_name": None,
            "has_child": bool(children),  # decides later whether products are fetched for it
            "slug": parent["seoName"],    # slug used in the product requests
            "level": parent["level"],
        })

        # child entries point back to the top-level category they belong to
        for child in children:
            child_info = child["parent"]
            cleaned_categories.append({
                "code": child_info["code"],
                "name": child_info["name"],
                "parent_id": parent["code"],
                "parent_name": parent["name"],
                "has_child": False,
                "slug": child_info["seoName"],
                "level": child_info["level"],
            })

    return cleaned_categories

All available categories are saved to a CSV file for transparency and reproducibility. For product scraping, we use only the lowest-level (childless) categories — these are smaller and more specific, reducing the risk of timeouts or rate-limiting during batch requests.

In [None]:
# download the flattened category list and keep a dated CSV snapshot of it
categories = fetch_categories()
categories_csv = f'categories-{CHECK_DATE}-winmart.csv'
categories_df = pd.DataFrame(categories)
categories_df.to_csv(categories_csv, index=False)

In [None]:
# optional intermediate output of the categories list
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(categories_df)

In [None]:
# products are fetched only for categories that have no children
categories_end = list(filter(lambda entry: not entry['has_child'], categories))
# categories_end = categories_end[:] # adjust this slice to resume scraping from a specific point after an interruption
total_categories_count = len(categories_end)

#### 2.2. Fetching Raw Product Data

We retrieve product listings using the site’s API, which returns data in paginated batches. Each request requires the category’s slug, page number, and other parameters like page size (usually 8 products per page), store code, and store group code.

The API fetches the first 8 products when a user visits the category page, and as the user scrolls, additional requests are made to fetch subsequent pages. The first page request includes the total number of pages, allowing us to determine how many requests to make for a given category.

We loop through all pages in a given category, adding a random delay between requests to reduce the risk of being rate-limited or blocked. The result is raw product data in JSON format, consistent with the site’s internal structure.

In [None]:
def fetch_products(category):
    """Fetch every raw product entry of one category, page by page.

    The first page is requested once and serves both as the source of the
    total page count and as the first batch of products; the remaining pages
    are requested with a random pause in between.

    Args:
        category: dict describing the category; only its ``slug`` is used.

    Returns:
        list: the raw product dicts exactly as returned by the API
        (``data.items`` of every page, concatenated in page order).

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        requests.Timeout: if a request is not answered within 30 seconds.
    """
    ITEMS_URL = "https://api-crownx.winmart.vn/it/api/web/v3/item/category"

    def fetch_page(page):
        """Request a single page of the category and return the decoded JSON."""
        params = {
            'orderByDesc': 'true',
            'pageNumber': str(page),
            'pageSize': '8',  # set to 8, consistent with the site's observed pagination behavior
            'slug': category['slug'],
            'storeCode': '1561',
            'storeGroupCode': '1999',
        }
        response = requests.get(ITEMS_URL, headers=HEADERS, params=params, timeout=30)
        response.raise_for_status()  # surface HTTP errors instead of a KeyError on the payload
        return response.json()

    products = []

    first_page = fetch_page(1)
    total_pages = first_page["paging"]["totalPages"]  # number of pages to loop over
    if total_pages < 1:
        return products  # nothing to collect for this category

    # reuse the first response instead of requesting page 1 a second time
    products += first_page["data"]["items"]
    print(f'{len(products)}..', end='')  # progress indication

    for page in range(2, total_pages + 1):
        time.sleep(random.uniform(1, 5))  # random delay to avoid rate-limiting or blocking
        products += fetch_page(page)["data"]["items"]
        print(f'{len(products)}..', end='')  # progress indication

    return products

#### 2.3. Cleaning Product Data

We define a function to extract only the most relevant fields from the raw data:

- Category code (for traceability)
- Product name
- Price
- Unit of measurement
- Supermarket name

The result is a cleaned list of product entries, ready for storage or further analysis.

In [None]:
def clean_product_data(category, raw_products):
    """Reduce raw product entries to the fields kept for analysis.

    Args:
        category: dict of the category the products were fetched for;
            its ``code`` is copied into every row.
        raw_products: iterable of raw product dicts providing ``name``,
            ``price`` and ``uom``.

    Returns:
        list[dict]: one dict per product with the keys ``category_code``,
        ``name``, ``price``, ``uom`` and ``supermarket``.
    """
    return [
        {
            "category_code": category["code"],
            "name": entry["name"],
            "price": entry["price"],
            "uom": entry["uom"],
            "supermarket": 'Winmart',  # constant label identifying the data source
        }
        for entry in raw_products
    ]

#### 2.4. Putting It All Together

The following code:

1. Iterates through the selected lowest-level categories (i.e., categories without children),
2. Fetches and cleans product data for each,
3. Appends the cleaned data to a single CSV file to build a complete dataset.

A progress tracker prints a summary after each category to monitor scraping progress.
A random delay is added between requests to reduce the risk of being blocked.
The output filename includes the scrape date (via `CHECK_DATE`) for reproducibility.

In [None]:
fetched_categories_count = 0    # progress counter; stays 0 when there is nothing to fetch

# single definition of the output file and its column layout, shared by the header and every append
product_columns = ["category_code", "name", "price", "uom", "supermarket"]
products_path = f'scraped_products-{CHECK_DATE}-winmart.csv'

# create the file with the header row first; each category is appended below,
# so the data fetched so far is already on disk if the run is interrupted
products_df = pd.DataFrame(columns=product_columns)
products_df.to_csv(products_path, index=False, mode='w')

for fetched_categories_count, category in enumerate(categories_end, start=1):

    raw_products = fetch_products(category)  # fetch products
    new_products = clean_product_data(category, raw_products)  # keep only the relevant fields

    # explicit columns keep the appended rows aligned with the header written above
    products_df = pd.DataFrame(new_products, columns=product_columns)
    products_df.to_csv(products_path, index=False, mode='a', header=False)

    print(f'Category ID: {category["code"]} finished, {fetched_categories_count} out of {total_categories_count} categories fetched')

    time.sleep(random.uniform(1, 5))  # pause between categories to reduce the risk of being blocked

print(f'Fetching complete. Results saved to {products_path}')

### 3. Filtering and Normalizing Product Data

After collecting and cleaning the raw product data, we proceed with filtering the dataset to include only the products relevant for comparison. This step involves several stages:

#### 3.1. Initial Preprocessing

We begin by loading the previously saved product and category datasets.
- The `category_code` column is dropped as it is no longer needed for the next steps.
- Duplicate product entries are removed. These may appear if a product belonged to multiple categories.
- Extra spaces in product names are stripped to ensure consistent formatting.

In [None]:
import pandas as pd
import re

# load the category list and the raw product data saved by the scraping step
# NOTE(review): the dates in these filenames are hard-coded, whereas the scraping step
# names its files with CHECK_DATE — adjust them when working with a different scrape
categories = pd.read_csv('categories-2025-04-17-winmart.csv')
products_original = pd.read_csv('scraped_products-2025-04-17-winmart.csv')

products = products_original.drop(['category_code'], axis=1)    # drop category_code column
# strip before de-duplicating, so rows that differ only by surrounding whitespace collapse into one
products['name'] = products['name'].str.strip()
products = products.drop_duplicates()

#### 3.2. Filtering Products by Type

To identify relevant products for comparison, we define a dictionary mapping product types to regular expressions. Each expression captures the base form of the product while deliberately excluding variations (e.g., flavored, processed, or pickled) that fall outside the scope of this analysis.

In [None]:
# Maps a product-type tag to the regex that recognises that product in a product name.
# The patterns are applied case-insensitively (both in the filter below and in assign_product_type).
# A leading ^ anchors a pattern to the start of the name; (?!...) lookaheads reject unwanted variants.
# Insertion order matters: assign_product_type returns the first type whose pattern matches.
product_regex_map = {
    'rice': r'^gạo(?!.*lứt)',
    'bread': r'^bánh (mì|mỳ|sandwich)(?!.*(bơ|hoa|chà|thịt))',
    'chicken_fillet': r'(fillet|phi lê|\bức)(?!.*đùi).*gà',
    'pork_leg': r'đùi.*heo',
    'egg': r'^trứng gà',
    'cucumber': r'^dưa.*(chuột|leo)(?!.*ngâm)',
    'carrot': r'^cà rốt',
    'onion': r'^hành tây',
    'tomato': r'^cà chua',
    'cabbage': r'bắp cải trắng', # matches "bắp cải trắng", but excludes others like "bắp cải tím"
    'eggplant': r'cà tím',
    'banana': r'^chuối(?!.*sấy)',
    'orange': r'^cam',
    'milk': r'^(thùng.*sữa|sữa|lốc.*sữa)(?!.*(vị|hương|có đường|ít đường|socola|sô cô la|dâu|chua|lên men|bắp|lact|yến)).*(trùng|tươi|tự nhiên)', # matches plain fresh milk, but excludes flavored, sweetened, fermented, or powdered varieties
    'yogurt': r'^(sữa chua|lốc.*hộp.*sữa chua)(?!.*(uống|lên men|fristi|chai))',
    'condensed_milk': r'sữa đặc',
    'black_tea': r'^trà\b(?!.*(ml|l\b|lít|sữa|gừng|nestea|atiso|ice|xanh|ô long|nhài|tôm|thái|hòa tan|zoga))', # matches teas, but excludes all green varieties, powdered tea and bottled drinks
    'green_tea': r'^trà\b(?!.*(ml|l\b|lít|sữa|gừng|nestea|atiso|ice|đen|hòa tan|zoga|hoa cúc|ceylon|đào))',
    'ground_coffee': r'^(cà phê|café)(?!.*hòa tan).*(bột|xay|sáng|chế phin|nâu|khát)',
    'sugar': r'^đường.*(pure|trắng|mía)',
    'salt': r'^muối.*biển', # matches "muối biển", but excludes mixed salts like "muối tôm"
    'sunflower_oil': r'dầu.*hướng dương',
    'soybean_oil': r'dầu.*nành',
    'water': r'^nước (uống|khoáng|tinh)(?!.*(vị|sữa|tăng|ion))',
    'spaghetti': r'^mì(?!.*(kool|xốt)).*(ý|spag)',
    'rice_noodles': r'^bún.*safoco',
    'tofu': r'^(đậu|tàu) hũ(?!.*(chiên|trứng))',
    'water_spinach': r'^rau.*muống',
    'mango': r'^xoài(?!.*sấy)',
    'fish_sauce': r'^nước mắm(?!.*ớt)'
}
# combine all patterns into one alternation so that a single pass can test "matches any product type"
product_regex_list = '|'.join(product_regex_map.values()) # create a single regex by joining all individual regexes with the OR operator (|)

# filter products matching any of the product types (case-insensitive)
# NOTE(review): the patterns contain capture groups, for which pandas' str.contains is known to emit
# a UserWarning; only the boolean result is used here — confirm the warning can be ignored
filtered_products = products.loc[products.name.str.contains(product_regex_list, case=False, regex=True)]

#### 3.3. Assigning Tags

Each filtered product is tagged with its corresponding product type based on regex matching.

In [None]:
def assign_product_type(row):
    """Return the tag of the first product type whose regex matches the row's name.

    The patterns of ``product_regex_map`` are tried in insertion order and
    matched case-insensitively. Returns None when no pattern matches.
    """
    product_name = row['name']
    matching_types = (
        tag
        for tag, pattern in product_regex_map.items()
        if re.search(pattern, product_name, flags=re.IGNORECASE)
    )
    return next(matching_types, None)

# work on an independent copy so that the new column is not written into a slice of `products`
filtered_products = filtered_products.copy()
product_types = filtered_products.apply(assign_product_type, axis=1)  # one tag per row
filtered_products.loc[:, 'product_type'] = product_types

# optional intermediate output of the filtered products list
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(filtered_products)

#### 3.4. Extracting and Normalizing Units

Many product listings differ in quantity, weight, or volume. To enable a fair comparison, we extract the relevant information from the product name or the unit-of-measurement (`uom`) field and calculate normalized price metrics such as price per kilogram, per liter, or per unit. The following quantities are extracted:

- Weight in grams
- Number of units (in particular, eggs)
- Volume in milliliters

Each value is extracted using pattern matching. Not all products contain all values, so some normalization columns (e.g., *price_kg*, *price_lit*, *price_unit*) may be missing depending on the item.

In [None]:
def extract_weight(row):
    """Extract the total weight in grams from the product name or uom.

    The per-package weight is read from the name (e.g. '500g', '1,5 kg') and
    multiplied by the number of packages in the listing.

    Args:
        row: mapping or DataFrame row with the keys 'name' and 'uom'.

    Returns:
        The total weight in grams when the name states a weight; 1000 when it
        does not but uom is 'KG'; None when no weight can be determined.
        For a box (uom 'T') whose package count is not stated in the name the
        count is taken as 0, the same convention extract_volume uses.
    """
    name, uom = row['name'], row['uom']
    if not isinstance(uom, str):
        uom = ''  # missing uom (e.g. NaN read back from the CSV): treat as "no grouping"

    # some products are sold in groups denoted in the uom (e.g., 'G3' means 3 units)
    match = re.search(r'G(\d)', uom)
    if match:
        number = int(match.group(1))
    elif uom == 'T':
        # 'T' (thùng - box) is a box of N packages; N is stated in the product name
        match = re.search(r'(\d+)\s?(gói|hộp|túi)', name, flags=re.IGNORECASE)
        # previously `number` stayed unassigned here when no count was found, which raised
        # UnboundLocalError further down
        number = int(match.group(1)) if match else 0
    else:
        number = 1  # no grouping information

    # check if the name specifies a weight
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(g\b|kg)', name, flags=re.IGNORECASE)
    if match:
        weight = float(match.group(1).replace(',', '.'))  # accept a decimal comma
        # the pattern is case-insensitive, so normalise the unit before comparing
        # ('KG' and 'kG' used to be mistaken for grams)
        is_kilograms = match.group(2).lower() == 'kg'
        return number * weight * 1000 if is_kilograms else number * weight

    # no weight in the name: a product priced per kilogram counts as 1000 g
    if uom == 'KG':
        return 1000

    return None  # if nothing matched

# the next two functions take a similar approach to extract_weight, but for unit counts and volume in milliliters
def extract_number_of_units(row):
    """Extract the number of units (quả, trái - piece) from the product name.

    Args:
        row: mapping or DataFrame row with the keys 'name' and 'product_type'.

    Returns:
        The count that precedes 'quả'/'trái' in the name; for eggs without
        such a marker, the first number found in the name; otherwise None.
    """
    name, product_type = row['name'], row['product_type']

    # an explicit count such as '10 quả' or '4 trái' wins for every product type
    match = re.search(r'(\d+)\s?(quả|trái)', name, flags=re.IGNORECASE)
    if match:
        return int(match.group(1))

    # for eggs, take any number from the name (because this is how Winmart specifies it)
    if product_type == 'egg':
        match = re.search(r'(\d+)', name)
        if match:
            return int(match.group(1))

    return None  # if nothing matched

def extract_volume(row):
    """Extract the total volume in milliliters from the product name or uom.

    The per-package volume is read from the name (e.g. '180ml', '1,5 l',
    '1 lít') and multiplied by the number of packages in the listing.

    Args:
        row: mapping or DataFrame row with the keys 'name' and 'uom'.

    Returns:
        The total volume in milliliters, or None when the name states no
        volume. For a box (uom 'T') whose package count is not stated in the
        name the count is taken as 0, so the result is 0.
    """
    name, uom = row['name'], row['uom']
    if not isinstance(uom, str):
        uom = ''  # missing uom (e.g. NaN read back from the CSV): treat as "no grouping"

    # check groups: 'G<n>' in the uom means a group of n units
    match = re.search(r'G(\d)', uom)
    if match:
        number = int(match.group(1))
    elif uom == 'T':
        # box of N packages; N is stated in the name (hộp - box, túi - bag, chai - bottle)
        match = re.search(r'(\d+)\s?(gói|hộp|túi|chai)', name, flags=re.IGNORECASE)
        number = int(match.group(1)) if match else 0
    else:
        number = 1

    # check if the name specifies a volume
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(ml|l\b|lít)', name, flags=re.IGNORECASE)
    if match:
        volume = float(match.group(1).replace(',', '.'))  # accept a decimal comma
        # the pattern is case-insensitive and only 'ml' denotes milliliters, so every other
        # matched unit ('l', 'L', 'lít', 'Lít', ...) is liters; 'Lít' used to be mistaken for ml
        is_liters = match.group(2).lower() != 'ml'
        return number * volume * 1000 if is_liters else number * volume

    return None  # if nothing matched

# work on an independent copy before adding the derived columns
filtered_products = filtered_products.copy()

# weight in grams -> price per kilogram
weights = filtered_products.apply(extract_weight, axis=1)
filtered_products.loc[:, 'weight'] = weights
filtered_products.loc[:, 'price_kg'] = filtered_products['price'] / filtered_products['weight'] * 1000

# number of units -> price per unit
unit_counts = filtered_products.apply(extract_number_of_units, axis=1)
filtered_products.loc[:, 'number_of_units'] = unit_counts
filtered_products.loc[:, 'price_unit'] = filtered_products['price'] / filtered_products['number_of_units']

# volume in milliliters -> price per liter
volumes = filtered_products.apply(extract_volume, axis=1)
filtered_products.loc[:, 'volume'] = volumes
filtered_products.loc[:, 'price_lit'] = filtered_products['price'] / filtered_products['volume'] * 1000

# optional intermediate output
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(filtered_products)

#### 3.5. Saving the Final Filtered Dataset

Finally, the enriched dataset is saved to a new CSV file for further analysis.

In [None]:
# persist the enriched dataset for the analysis step
# NOTE(review): unlike the other exports in this notebook, no index=False is passed here,
# so the DataFrame index is written as the first column — confirm that this is intended
filtered_products.to_csv('filtered_products-2025-04-17-winmart.csv')