In [14]:
from bs4 import BeautifulSoup
import requests
import re
import time
import random
import pandas as pd
import json
from datetime import date

CHECK_DATE = date.today()

Let's extract categories from a sample webpage code

In [112]:
with open("groups.html", "r", encoding="utf-8") as file:
    groups_page = BeautifulSoup(file, "html.parser")    # take the html page I saved before and put it into a BeautifulSoup object

categories = []
top_categories = groups_page.find_all("a", class_="clearfix", href=re.compile(r"https://cooponline.vn/groups/[^#]"))    # find all top-level categories
for category in top_categories:
    # print("cat", category.span.string)
    categories.append({                  # extract only relevant data and save into a dictionary
        "name": category.span.string,
        "level": 1,
        "hasChild": True,
        "parent": None,
        "link": category["href"]
    })
    submenu = category.find_next_sibling("div", class_="sub-menu")    # the subcategories are contained in the following div block
    subcategories = submenu.find_all("a", class_="main-menu")    # find all subcategories and loop through them too
    for subcategory in subcategories:
        # print("subcat", subcategory.string)
        # next_el = subcategory.next_sibling
        # print("next ", next_el)
        # parent = subcategory.parent
        # print("parent ", parent.name)
        subsubmenu = subcategory.find_next_sibling("ul")    # the third-level categories are contained in ul block, so if it exists we put it into a variable
        categories.append({              # add subcategories to the list of dictionaries too
            "name": subcategory.string,
            "level": 2,
            "hasChild": True if subsubmenu else False,   # subsubmenu can be either None or a list of third-level categories
            "parent": category.span.string,
            "link": subcategory["href"]
        })
        if subsubmenu:                         # if subsubcategories exists, loop through them and save too 
            subsubcategories = subsubmenu.find_all("a")
            for subsubcategory in subsubcategories:
                # print("subsubcat", subsubcategory.string)
                categories.append({
                    "name": subsubcategory.string,
                    "level": 3,
                    "hasChild": False,
                    "parent": subcategory.string,
                    "link": subsubcategory["href"]
                })

Let's make sure the names don't contain extra spaces

In [113]:
for category in categories:
    category["name"] = category["name"].strip()
    if category["parent"] is not None:
        category["parent"] = category["parent"].strip()

In [108]:
# categories_slice = categories[175:]
# print(categories_slice)

[{'name': 'Chăm sóc tóc', 'level': 2, 'hasChild': True, 'parent': 'Chăm sóc cá nhân', 'link': 'https://cooponline.vn/groups/cham-soc-toc/'}, {'name': 'Các loại dưỡng tóc, ủ tóc, thuốc nhuộm', 'level': 3, 'hasChild': False, 'parent': 'Chăm sóc tóc', 'link': 'https://cooponline.vn/groups/cac-loai-duong-toc-u-toc-thuoc-nhuom/'}, {'name': 'Dầu gội', 'level': 3, 'hasChild': False, 'parent': 'Chăm sóc tóc', 'link': 'https://cooponline.vn/groups/dau-goi/'}, {'name': 'Dầu gội nam', 'level': 3, 'hasChild': False, 'parent': 'Chăm sóc tóc', 'link': 'https://cooponline.vn/groups/dau-goi-nam/'}, {'name': 'Dầu gội trị gàu', 'level': 3, 'hasChild': False, 'parent': 'Chăm sóc tóc', 'link': 'https://cooponline.vn/groups/dau-goi-tri-gau/'}, {'name': 'Dầu xả', 'level': 3, 'hasChild': False, 'parent': 'Chăm sóc tóc', 'link': 'https://cooponline.vn/groups/dau-xa/'}, {'name': 'Mỹ phẩm, chăm sóc da', 'level': 2, 'hasChild': True, 'parent': 'Chăm sóc cá nhân', 'link': 'https://cooponline.vn/groups/my-pham-cha

Let's visit each webpage and get the lists of product codes for each category. Let's do it only for categories without children to avoid unnecessary double work

In [115]:
categories_df = pd.DataFrame(columns=["name", "term_id", "level", "hasChild", "parent", "link", "item_codes"])
categories_df.to_csv(f'categories-{CHECK_DATE}-coop.csv', index=False, mode='w')

total_categories_count = len(categories)    # number of childless categories that we will scan
fetched_categories_count = 0

for category in categories:
    print(f'{category["name"]}..', end='')
    if not category["hasChild"]:
        print('has no children, looking for codes', end='')
        current_page = requests.get(category["link"]).text
        page_bs = BeautifulSoup(current_page, "html.parser")
        products_tag = page_bs.find("module-taxonomy")
        if products_tag is not None:
            category["term_id"] = products_tag["term_id"]
            category["item_codes"] = products_tag["items"]
        
        categories_df = pd.DataFrame([category])
        categories_df.to_csv(f'categories-{CHECK_DATE}-coop.csv', index=False, mode='a', header=False)
    else:
        print('has children, skip', end='')
        category["term_id"] = None
        category["item_codes"] = None
        categories_df = pd.DataFrame([category])
        categories_df.to_csv(f'categories-{CHECK_DATE}-coop.csv', index=False, mode='a', header=False)
        
    fetched_categories_count += 1
    print(f'..fetched - {fetched_categories_count} out of {total_categories_count}')
    time.sleep(random.uniform(1, 3))

Rau củ, trái cây..has children, skip..fetched - 1 out of 270
Nước rửa rau, củ, quả..has no children, looking for codes..fetched - 2 out of 270
Rau Củ..has children, skip..fetched - 3 out of 270
Củ..has no children, looking for codes..fetched - 4 out of 270
Rau nêm, rau thơm..has no children, looking for codes..fetched - 5 out of 270
Rau xào, nấu canh..has no children, looking for codes..fetched - 6 out of 270
Xà lách..has no children, looking for codes..fetched - 7 out of 270
Trái cây..has children, skip..fetched - 8 out of 270
Cam, bưởi, quýt..has no children, looking for codes..fetched - 9 out of 270
Dưa..has no children, looking for codes..fetched - 10 out of 270
Nho, kiwi..has no children, looking for codes..fetched - 11 out of 270
Táo, lê..has no children, looking for codes..fetched - 12 out of 270
Trái cây khác..has no children, looking for codes..fetched - 13 out of 270
Thịt, trứng, hải sản..has children, skip..fetched - 14 out of 270
Thịt..has children, skip..fetched - 15 out o

---
Above is for fetching categories

Below is for products

---

In [27]:
categories = pd.read_csv('categories-2025-03-06-coop-complete.csv', dtype={"term_id": str})
categories = categories.where(pd.notna(categories), None)  # Convert NaN to None
categories = categories.to_dict('records')
categories_slice = categories[190:]
print(categories_slice)

[{'name': 'Drap, gối, mền', 'level': 3, 'hasChild': False, 'parent': 'Đồ dùng gia đình', 'link': 'https://cooponline.vn/groups/drap-goi-men/', 'term_id': '4115', 'item_codes': '103279,103274,103270,103266,103260,103256,103252,103247,103243,103239,103235,103230,103224,103218,100800,100788,100784,100780,99085,97400,97397,97390,97380,97359,97358,97353,97349,97297,97292,97282,97277,96596,96594,96592,96590,96180,96173,94683,94679,94676,91392,91386,91375,90961,90959,90126,90106,90100,90098,89979,89938,85168,83329,83323,83317,83307,83303,83299,83295,83291,81882,81874,81866,80457,79465,79438,77921,77919,77917,76491,76023,75194,75190,75186,75182,74966,74964,74957,74950,73519,73517,73463,73365,73332,73328,73326,73324,71642,71099,70341,70334,70330,70307,70297,70265,70241,70224,70205,70189,70155,70074,69926,69919,69912,69904,69891,69885,69879,69843,69823,69796,69772,69755,69749,69743,69725,69722,69714,69703,69696,69657,69645,69624,69607,69597,69590,69585,69567,69557,69545,69531,69273,69069,69067,6

Собственно код при поддержке ЧатГПТ поделен на осмысленные функции. Следующая функция делает запрос к сайту для получения данных о продуктах в указанной категории. Данные возвращаются как JSON в том формате, в котором они существуют на сайте, функция выбирает из них только данные о продуктах и сохраняет как словарь.

In [28]:
ITEMS_HEADERS = {
    # 'accept': 'application/json, text/javascript, */*; q=0.01',
    # 'accept-language': 'en-US,en;q=0.9,ru;q=0.8',
    # 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'origin': 'https://cooponline.vn',
    # 'priority': 'u=1, i',
    # 'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
    # 'sec-ch-ua-mobile': '?0',
    # 'sec-ch-ua-platform': '"Windows"',
    # 'sec-fetch-dest': 'empty',
    # 'sec-fetch-mode': 'cors',
    # 'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
    # 'x-requested-with': 'XMLHttpRequest',
    # 'cookie': 'PHPSESSID=siqpc6nsrsqmggb8vocdk7k0hv; store=xtanphong; zone=1; __zi=3000.SSZzejyD6z4acl6Zt1GQscQ5yUJUG0IKQ9sokCnA6uOgZwIjYnLTWYh5xww95rE5OS7XgyGHLOzZmUUdE30.1; cf_clearance=srcgDpRZHsajKRi3H9PS7nc50.IpUkEvddX5YQ2SGLA-1741270121-1.2.1.1-4IrXB2h09djVk.VhUNIBWTM_.74QPvtTECpCPK11xQrgRH.5VN6fBjC6QsH8j9pK2aTypxjVsKkg.PX4I2HYxwo6YYohGzj5Ck.r6P6XovYet0PlGi0oqOLxVt31vnwM6.MGzG25IEfoP35f2kc.69Tvv37MHNT0zAH3wZpDVNtvoKnJK6_GtbgOAH_SrRSXjmK96lHIxYxhaIhKXR9YfQlCaPh7JEI1wHTyMieh79lNqlyDCPydiZs5oMR5ogOlmjKWS22P0Y.44ub84rXP27gqffoy7ofSyDjWI0c96Kx66D3xQU_TEp3IBkHSskylKtVQOHxdKUFRcrbrl.7qF7gbWBT2fxPEbB_i2.iiKfS9tXluw7BtTfJDNP.fzdgsMsb9PpwCTj5c.ykoysE2hfD0oRAP4pQnQeU7AVGPujw',
}

In [29]:
# fetch the raw product data (the site uses an increasing offset to fetch all the products, we mimic this behavior)

def fetch_products(category):
    
    current_products = 24 # set to 24 in order to have at least 1 iteration of while loop
    current_page = 1
    products = []

    ITEMS_HEADERS['referer'] = category["link"]
    url = 'https://cooponline.vn/ajax/'
        
    while current_products >= 24: # loop until current page contains less than 24 products
        time.sleep(random.uniform(1, 5)) # random time delay to avoid being blocked
        
        DATA = {
            'request': 'w_getProductsTaxonomy',
            'termid': category["term_id"],
            'taxonomy': 'groups',
            'store': 'xtanphong',
            'items': category["item_codes"],
            'trang': current_page,
        }
    
        response = requests.post('https://cooponline.vn/ajax/', headers=ITEMS_HEADERS, data=DATA) # get the data
        response_data = json.loads(response.text) # convert into a dictionary
        products += response_data # extract products and add them to the list
        
        current_products = len(response_data) # update current number of products
        current_page += 1 # update page number
        
        print(f'{len(products)}..', end='') # progress indication

    return products
    

Следующая функция берет необработанные данные в виде словаря и создает DataFrame, в который отбираются только нужные мне параметры товара: название, цена, единица измерения. Пока примерно)

In [30]:
def clean_product_data(category, raw_products):

    cleaned_products = []

    # go through the raw data and select only the necessary fields
    for product in raw_products:
        cleaned_products.append({
            "category_name": category["name"],
            "name": product["name"],
            "price": product["price"],
            "uom": product["unit"],
            "supermarket": "Co.op"
        })

    return cleaned_products

Наконец, в следующей ячейке находится основное тело программы, где мы вызываем эти функции. Сначала запрашиваем список категорий, в цикле скачиваем данные о продуктах, сохраняем все в один большой DataFrame с указанием даты запроса.

In [32]:
fetched_categories_count = 0    # counter for fetching progress tracker
total_categories_count = len(categories)

# Create the file first with headers
products_df = pd.DataFrame(columns=["category_name", "name", "price", "uom", "supermarket"])
products_df.to_csv(f'products-{CHECK_DATE}-coop.csv', index=False, mode='w')

for category in categories_slice:
    if category["item_codes"] is not None:
        raw_products = fetch_products(category) # fetch products
        new_products = clean_product_data(category, raw_products) # select only relevant data and add new products to the list
    
        products_df = pd.DataFrame(new_products)
        products_df.to_csv(f'products-{CHECK_DATE}-coop.csv', index=False, mode='a', header=False)
    else:
        print('Skip..', end='')    
    
    fetched_categories_count += 1
    print(f'\n{category["name"]} finished, {fetched_categories_count} out of {total_categories_count} categories fetched')
    time.sleep(random.uniform(1, 5))

print(f'Fetching complete, see the result in products-{CHECK_DATE}-coop.csv')

24..48..72..96..120..144..168..192..214..
Drap, gối, mền finished, 1 out of 270 categories fetched
24..48..72..96..120..144..168..192..216..240..264..288..312..312..
Điện gia dụng finished, 2 out of 270 categories fetched
24..48..72..96..120..144..168..192..216..240..264..288..312..336..360..384..408..432..456..480..504..528..552..576..600..624..648..672..696..720..744..768..792..816..840..864..888..912..936..960..984..1008..1032..1056..1080..1103..
Đồ dùng bếp finished, 3 out of 270 categories fetched
24..48..72..96..120..144..168..192..216..240..264..288..312..336..360..384..408..432..456..480..504..528..552..576..600..624..648..672..696..700..
Đồ dùng gia dụng khác finished, 4 out of 270 categories fetched
24..48..72..96..120..143..
Đồ dùng văn phòng finished, 5 out of 270 categories fetched
24..48..72..96..120..144..168..192..216..240..264..288..312..336..360..374..
Đồ dùng vệ sinh finished, 6 out of 270 categories fetched
24..48..72..96..113..
Khăn finished, 7 out of 270 categorie

---

Below is filtering and price normalization

---

In [1]:
import pandas as pd
import re

In [2]:
categories = pd.read_csv('categories-2025-03-06-coop-complete.csv')
products_original = pd.read_csv('products-2025-03-07-coop-complete.csv')
products = products_original.drop(['category_name'], axis=1)    # drop category_name column as it's actually unnecessary

In [3]:
categories

Unnamed: 0,name,level,hasChild,parent,link,term_id,item_codes
0,"Rau củ, trái cây",1,True,,https://cooponline.vn/groups/rau-cu-trai-cay/,,
1,"Nước rửa rau, củ, quả",2,False,"Rau củ, trái cây",https://cooponline.vn/groups/nuoc-rua-rau-cu-qua/,7167.0,94725
2,Rau Củ,2,True,"Rau củ, trái cây",https://cooponline.vn/groups/rau-cu/,,
3,Củ,3,False,Rau Củ,https://cooponline.vn/groups/cu/,3985.0,"107042,105246,105244,105232,105121,99790,99696..."
4,"Rau nêm, rau thơm",3,False,Rau Củ,https://cooponline.vn/groups/rau-nem-rau-thom/,3986.0,"99682,93151,93149,93145,93143,93132,93096,9309..."
...,...,...,...,...,...,...,...
265,Thực phẩm đông lạnh,3,False,"Thực phẩm đông, mát",https://cooponline.vn/groups/thuc-pham-dong-la...,4203.0,"106436,105386,105385,105384,105383,10248,10230..."
266,Thực phẩm trữ mát,3,False,"Thực phẩm đông, mát",https://cooponline.vn/groups/thuc-pham-tru-mat...,4204.0,"103653,98518,87598,86945,13662,2393,83276,8327..."
267,Thức uống,2,True,Nhãn Hàng Coop,https://cooponline.vn/groups/thuc-uong-nhan-ha...,,
268,Thức uống dinh dưỡng,3,False,Thức uống,https://cooponline.vn/groups/thuc-uong-dinh-du...,4206.0,"97858,97855,97853,97850,95806,90738,86929,8691..."


In [4]:
products

Unnamed: 0,name,price,uom,supermarket
0,"Bột rửa rau củ, thịt cá Ion – Canxi Umikai 100g",195000,chai,Co.op
1,Cà chua loại so kg,26000,kg,Co.op
2,Hành tím Vĩnh Châu chùm kg,72800,kg,Co.op
3,Tỏi khô 250g-K990 – F,119000,túi,Co.op
4,Nấm hương tươi Vinamush 250g,51500,hộp,Co.op
...,...,...,...,...
14875,Trà đào Coop Select hộp 16 gói x 15g,31000,hộp,Co.op
14876,Trà xanh Thái Nguyên Coop Select gói 100g,18900,gói,Co.op
14877,Trà chanh Coop Select hộp 16 gói x 15g,31000,hộp,Co.op
14878,Trà Lài Thái Nguyên Coop Select 100g,19500,gói,Co.op


In [7]:
products = products.drop_duplicates() # remove duplicates
products.loc[:,'name'] = products['name'].str.strip() # remove extra spaces
products

Unnamed: 0,name,price,uom,supermarket
0,"Bột rửa rau củ, thịt cá Ion – Canxi Umikai 100g",195000,chai,Co.op
1,Cà chua loại so kg,26000,kg,Co.op
2,Hành tím Vĩnh Châu chùm kg,72800,kg,Co.op
3,Tỏi khô 250g-K990 – F,119000,túi,Co.op
4,Nấm hương tươi Vinamush 250g,51500,hộp,Co.op
...,...,...,...,...
14618,Mít Thái xẻ miếng kg,52500,kg,Co.op
14638,Sữa đặc Dutch Lady cao cấp lon 380g – KM,32400,lon,Co.op
14639,Ba chỉ bò Mỹ Suki Coop Select 340g,114900,vỉ,Co.op
14657,Nấm hương khô Lý Tưởng 100g,69000,gói,Co.op


To generalize step 1, we need a mapping (a dictionary) of regex for each product type

In [400]:
product_regex_map = {
    'rice': r'^gạo(?!.*(lứt|lức|dưỡng|nếp))',
    'bread': r'^bánh (mì|mỳ|sandw|bag)(?!.*(bông|thịt|bơ|kem|hoa cúc|gà|pate|xốt|sữa|floss|socola|khoai|trứng|trong|hươu|nho|smile))',
    'chicken_fillet': r'(file|phi lê|\bức)(?!.*đùi).*gà',
    'pork_leg': r'đùi.*heo',
    'egg': r'^trứng gà(?!.*(ăn liền|tiềm|nướng|cay))',
    'cucumber': r'^dưa.*leo',
    'carrot': r'^cà rốt',
    'onion': r'hành tây',
    'tomato': r'^cà chua(?!.*(puree|đặc))',
    'cabbage': r'bắp cải trắng',
    'banana': r'^chuối(?!.*sấy)',
    'orange': r'^cam\b(?!.*sấy)',
    'milk': r'^sữa (tươi|tiệt|dinh|vina)(?!.*(melon|chuối|trái cây|có đường|ít đường|soco|dâu|vani|trân châu|ngữ|choco|lacto))',
    'yogurt': r'^sữa chua(?!.*(uống|men|khô|dẻo|ml))',
    'condensed_milk': r'sữa đặc(?!.*xanh lá)',
    'black_tea': r'^(hồng trà|trà\b)(?!.*(ml|l\b|xanh|sữa|khổ|sen|atiso|hoa cúc|ô long|olong|o long|green|ice|nestea|thảo|gừng|lài|matcha|chia|sâm|thế hệ|hà thủ|thái nguyên|15g|blendy|linh chi|happy|tân cương|huế|tết|thanh nhiệt))',
    'green_tea': r'^trà\b(?!.*(ml|l\b|sữa|khổ|atiso|hoa cúc|ice|nestea|thảo|gừng|matcha|chia|thế hệ|hà thủ|blendy|linh chi|happy|huế|thanh nhiệt|dilmah|twinings|tết|tim sen|chanh|tâm sen|đen|lipton|dâu|bạc hà|hàn quốc|đào|quất))',
    'ground_coffee': r'^(cà phê|cafe)(?!.*(hòa tan|hoà tan|sữa|in1|nesca|hạt|425g|bịch|fin|cino|hương))',
    'sugar': r'^đường\s(tinh|trắng|mía|kính)',
    'salt': r'^muối(?!.*(tôm|ớt|tiêu)).*(biển|iot|tinh|sạch)',
    'sunflower_oil': r'^dầu.*hướng dương',
    'soybean_oil': r'dầu.*nành',
    'water': r'nước\s(uống đóng|khoáng|tinh)(?!.*(ion|chanh|perr))',
    'spaghetti': r'^mì(?!.*(kool|trộn|bò|omto|kem)).*(ý|spag|hair|buca)',
    'rice_noodles': r'^(bún|phở)(?!.*(lứt|đen|60g|65g|\sg$)).*(wai|minh hảo|nuffam|bình tây|sa đéc|saf|select|mikiri|hùng lô)',
    'tofu': r'^(đậu|tàu)\shũ(?!.*(chiên|trứng|cá\b|nấm|hạt|ky))',
    'water_spinach': r'^rau.*muống',
    'mango': r'^xoài(?!.*(sấy|ngâm))',
    'fish_sauce': r'^nước mắm(?!.*(ớt|me\b|gừng|chua\b|chay|tỏi|ngừ|nục|ăn liền))'
}
product_regex_list = '|'.join(product_regex_map.values())
product_regex_list

'^gạo(?!.*(lứt|lức|dưỡng|nếp))|^bánh (mì|mỳ|sandw|bag)(?!.*(bông|thịt|bơ|kem|hoa cúc|gà|pate|xốt|sữa|floss|socola|khoai|trứng|trong|hươu|nho|smile))|(file|phi lê|\\bức)(?!.*đùi).*gà|đùi.*heo|^trứng gà(?!.*(ăn liền|tiềm|nướng|cay))|^dưa.*leo|^cà rốt|hành tây|^cà chua(?!.*(puree|đặc))|bắp cải trắng|^chuối(?!.*sấy)|^cam\\b(?!.*sấy)|^sữa (tươi|tiệt|dinh|vina)(?!.*(melon|chuối|trái cây|có đường|ít đường|soco|dâu|vani|trân châu|ngữ|choco|lacto))|^sữa chua(?!.*(uống|men|khô|dẻo|ml))|sữa đặc(?!.*xanh lá)|^(hồng trà|trà\\b)(?!.*(ml|l\\b|xanh|sữa|khổ|sen|atiso|hoa cúc|ô long|olong|o long|green|ice|nestea|thảo|gừng|lài|matcha|chia|sâm|thế hệ|hà thủ|thái nguyên|15g|blendy|linh chi|happy|tân cương|huế|tết|thanh nhiệt))|^trà\\b(?!.*(ml|l\\b|sữa|khổ|atiso|hoa cúc|ice|nestea|thảo|gừng|matcha|chia|thế hệ|hà thủ|blendy|linh chi|happy|huế|thanh nhiệt|dilmah|twinings|tết|tim sen|chanh|tâm sen|đen|lipton|dâu|bạc hà|hàn quốc|đào|quất))|^(cà phê|cafe)(?!.*(hòa tan|hoà tan|sữa|in1|nesca|hạt|425g|bịch|fin|cino

In [401]:
filtered_products = products.loc[products.name.str.contains(product_regex_list, case=False, regex=True)]
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(filtered_products)

  filtered_products = products.loc[products.name.str.contains(product_regex_list, case=False, regex=True)]


Unnamed: 0,name,price,uom,supermarket
1,Cà chua loại so kg,26000,kg,Co.op
6,Cà chua Beef kg,26900,kg,Co.op
8,Cà rốt nhỏ 400g – F,29000,kg,Co.op
9,Hành tây baby 300g – F,19000,kg,Co.op
12,Cà chua trứng Coop Select kg-TNX,32900,kg,Co.op
14,Cà chua trái cây Ngọc Bích Coop Select – 300g,29000,hộp,Co.op
15,Cà rốt Coop Select kg,29900,bịch,Co.op
18,Củ hành tây Coop Select kg – TNX,24900,túi,Co.op
19,Cà chua beef Coop Select kg – Viet Farm,35900,hộp,Co.op
25,Dưa leo Coop Select kg,23900,kg,Co.op


Let's add missing tags for product type and supermarket, for that we have to match products to their types by matching regex again

In [402]:
def assign_product_type(row):
    name = row['name']
    for product_type, regex in product_regex_map.items():
        match = re.search(regex, name, flags=re.IGNORECASE)
        if match:
            return product_type
    return None

filtered_products = filtered_products.copy()  # recreate the dataframe
filtered_products.loc[:,'product_type'] = filtered_products.apply(assign_product_type, axis=1)
filtered_products

Unnamed: 0,name,price,uom,supermarket,product_type
1,Cà chua loại so kg,26000,kg,Co.op,tomato
6,Cà chua Beef kg,26900,kg,Co.op,tomato
8,Cà rốt nhỏ 400g – F,29000,kg,Co.op,carrot
9,Hành tây baby 300g – F,19000,kg,Co.op,onion
12,Cà chua trứng Coop Select kg-TNX,32900,kg,Co.op,tomato
...,...,...,...,...,...
6482,Nước mắm Nam Ngư 3in1 chai 10g/l – 750ml,52500,chai,Co.op,fish_sauce
6483,Nước mắm cá cơm Hạnh Phúc 60 độ đạm 250ml,70200,chai,Co.op,fish_sauce
13802,Gạo thơm ST25 plus lúa tôm Neptune 5kg,252000,bịch,Co.op,rice
14611,Cam sành Coop Select túi kg,24900,kg,Co.op,orange


It's going slowly, now we have 5 products, let's try to deal with them. Steps 2 are extracting weight or number, calculating normalized price and adding missing tags

In [404]:
def extract_weight(row):
    name, uom = row['name'], row['uom']
    
    # calculate weight if there're multiple portions (volume can be first or second)
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(g\b|gr\b|kg)\s?(x|gói)\s?(\d+)', name, flags=re.IGNORECASE)
    if match:
        portion = int(match.group(4))
        per_portion = float(match.group(1).replace(',', '.'))
        return portion * per_portion
        
    match = re.search(r'(\d+)(\s|\shủ\s?|\shộp\s?|\sgói\s?|\stúi\s?)?x\s?(\d+|\d+[,.]\d+)\s?g\b', name, flags=re.IGNORECASE)
    if match:
        portion = float(match.group(1).replace(',', '.'))
        per_portion = float(match.group(3).replace(',', '.'))
        return portion * per_portion
    # if there's a single weight
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(g\b|gr\b|kg)', name, flags=re.IGNORECASE)
    if match:
        weight = float(match.group(1).replace(',', '.'))
        unit = match.group(2)
        return weight * 1000 if unit in ['kg','Kg'] else weight
    # if name doesn't contain anything, check uom
    if uom == 'kg':
        weight = 1000
        return weight
    # if none of above worked but there's 'kg' in the name
    match = re.search(r'kg', name, flags=re.IGNORECASE)
    if match:
        weight = 1000
        return weight
    
    return None  # if nothing matched

def extract_number_of_units(row):
    name, product_type, uom = row['name'], row['product_type'], row['uom']
    
    # first, check name
    match = re.search(r'(\d+)\s?(túi|gói|trứng|t\b|x)', name, flags=re.IGNORECASE)
    if match:
        number_of_units = int(match.group(1))
        return number_of_units
   
def extract_volume(row):
    name, uom = row['name'], row['uom']
    
    # calculate volume if there're multiple portions (volume can be first or second)
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(ml|l\b|lít)\s?(x|thùng)\s?(\d+)', name, flags=re.IGNORECASE)
    if match:
        portion = int(match.group(4))
        per_portion = float(match.group(1).replace(',', '.'))
        unit = match.group(2)
        return portion * per_portion * 1000 if unit in ['l', 'L', 'lít'] else portion * per_portion
        
    match = re.search(r'(\d+)(\s|\sgói\s?|\sbịch\s?|\shộp\s?|\schai\s?)?[x×]\s?(\d+|\d+[,.]\d+)\s?(ml|l\b|lít)', name, flags=re.IGNORECASE)
    if match:
        portion = int(match.group(1))
        per_portion = float(match.group(3).replace(',', '.'))
        unit = match.group(4)
        return portion * per_portion * 1000 if unit in ['l', 'L', 'lít'] else portion * per_portion
    # if there's a single volume
    match = re.search(r'(\d+|\d+[,.]\d+)\s?(ml|l\b|lít)', name, flags=re.IGNORECASE)
    if match:
        volume = float(match.group(1).replace(',', '.'))
        unit = match.group(2)
        return volume * 1000 if unit in ['l', 'L', 'lít'] else volume
    
    return None  # if nothing matched

filtered_products = filtered_products.copy()  # recreate the dataframe
# extract and calculate weights
filtered_products.loc[:,'weight'] = filtered_products.apply(extract_weight, axis=1)  # a column with weigths in grams
filtered_products.loc[:,'price_kg'] = filtered_products.price / filtered_products.weight * 1000   # a column with prices per kg
# # extract and calculate number of units (for products with units)
filtered_products.loc[:,'number_of_units'] = filtered_products.apply(extract_number_of_units, axis=1)  # a column with number of units
filtered_products.loc[:,'price_unit'] = filtered_products.price / filtered_products.number_of_units   # a column with prices per unit
# extract and calculate volume
filtered_products.loc[:,'volume'] = filtered_products.apply(extract_volume, axis=1)  # a column with volume in ml
filtered_products.loc[:,'price_lit'] = filtered_products.price / filtered_products.volume * 1000   # a column with prices per liter
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None, 'display.float_format', '{:.1f}'.format):
    display(filtered_products)

Unnamed: 0,name,price,uom,supermarket,product_type,weight,price_kg,number_of_units,price_unit,volume,price_lit
1,Cà chua loại so kg,26000,kg,Co.op,tomato,1000.0,26000.0,,,,
6,Cà chua Beef kg,26900,kg,Co.op,tomato,1000.0,26900.0,,,,
8,Cà rốt nhỏ 400g – F,29000,kg,Co.op,carrot,400.0,72500.0,,,,
9,Hành tây baby 300g – F,19000,kg,Co.op,onion,300.0,63333.3,,,,
12,Cà chua trứng Coop Select kg-TNX,32900,kg,Co.op,tomato,1000.0,32900.0,,,,
14,Cà chua trái cây Ngọc Bích Coop Select – 300g,29000,hộp,Co.op,tomato,300.0,96666.7,,,,
15,Cà rốt Coop Select kg,29900,bịch,Co.op,carrot,1000.0,29900.0,,,,
18,Củ hành tây Coop Select kg – TNX,24900,túi,Co.op,onion,1000.0,24900.0,,,,
19,Cà chua beef Coop Select kg – Viet Farm,35900,hộp,Co.op,tomato,1000.0,35900.0,,,,
25,Dưa leo Coop Select kg,23900,kg,Co.op,cucumber,1000.0,23900.0,,,,


In [405]:
filtered_products.to_csv(f'filtered_products-2025-03-07-coop.csv')