### 1. Setup

We import the necessary libraries and define key parameters for interacting with Lenta’s API: the base URL for fetching categories, the current date for reference, and request headers. The `HEADERS` dictionary contains the minimal set of headers required to receive a valid response from the Lenta API. These headers were determined through testing.

In [None]:
import requests
import json
import time
import random
import pandas as pd
from datetime import date

# define base URLs and headers for making requests throughout the scraping process
CATEGORIES_URL = "https://lenta.com/api-gateway/v1/catalog/categories?timestamp="  # a millisecond timestamp is appended at request time
CHECK_DATE = date.today() # current date for reference (used in output file names)

# Headers sent with every API request.
# NOTE(review): 'DeviceID' and 'SessionToken' are hard-coded values; presumably they are tied to
# a particular session and may need to be refreshed if the API starts rejecting requests — TODO confirm.
HEADERS = {
    'DeviceID': '42b24bb8-650a-f3a7-fa8d-593e40b478d4',
    # The value is written as two adjacent string literals so that it contains no line break:
    # a raw newline inside a single-quoted literal is a syntax error, and header values must be single-line.
    'Experiments': (
        'exp_recommendation_cms.true, exp_apigw_purchase.test, exp_lentapay.test, exp_omni_price.test, exp_profile_bell.test, exp_newui_cancel_order.test, exp_newui_history_active_action.test_stars, exp_comment_picker_and_courier.test, exp_general_editing_page.test, exp_cl_omni_support.test, exp_cl_omni_authorization.test, exp_onboarding_sbp.default, exp_fullscreen.test, exp_profile_login.false, exp_new_notifications_show_unauthorized.test, exp_assembly_cost_location.cart, exp_search_bottom.default, exp_onboarding_editing_order.test, exp_cart_new_carousel.default, exp_newui_cart_cancel_editing.test, exp_newui_cart_button.test, exp_new_promov3., exp_sbp_enabled.test, exp_new_my_goods.test, exp_ui_catalog.test, exp_search_out_of_stock.default, exp_profile_settings_email.default, exp_cl_omni_refusalprintreceipts.test, exp_cl_omni_refusalprintcoupons.test, exp_accrual_history.test, exp_personal_recommendations.control, exp_newui_chips.test, exp_loyalty_categories.test, exp_growthbooks_aa.OFF, exp_test_ch_web.def, exp_search_suggestions_popular_sku.default, exp_cancel_subscription.test_2, exp_manage_subscription.control, exp_cl_new_csi.default, exp_cl_new_csat.default, exp_delivery_price_info.default, exp_personal_promo_navigation.test, exp_web_feature_test.true, exp_interval_jump.default, exp_cardOne_promo_type.test, exp_qr_cnc.test, exp_popup_about_order.test, exp_apigw_recommendations.test, exp_where_place_cnc.control, exp_editing_cnc_onboarding.default, exp_editing_cnc.default, exp_selection_carousel.test, exp_pickup_in_delivery.false, exp_feature_kpp_test.false, exp_welcome_onboarding.default, exp_cl_new_splash.default, exp_web_referral_program_type.default, exp_where_place_new.default, exp_start_page.default, exp_promocode_bd_coupon.default, exp_personal_promo_swipe_animation.default, exp_default_payment_type.default, exp_main_page_carousel_vs_banner.default, exp_start_page_onboarding.default, exp_newui_cart_check_edit.default, '
        'exp_search_new_logic.default, exp_search_ds_pers_similar.default, exp_growthbooks_aa_id_based_feature.control, exp_referral_program_type.default, exp_my_choice_search.default, exp_items_by_rating.default, exp_can_accept_early.default, exp_test_gb_value.false, exp_online_subscription.default, exp_new_nps_keyboard.test, exp_main_page_carousel_vs_banner_shop.default, exp_bathcing.default, exp_web_qr_cnc.default, exp_hide_cash_payment_for_cnc_wo_adult_items.default, exp_web_promocode_bd_coupon.default, exp_prices_per_quantum.default, exp_test.default123, exp_web_partner_coupons_separately.default, exp_web_chips_online.default'
    ),
    'SessionToken': 'E05BD3D1115FF4A5D5F0BC02D141EB5A',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
    'X-Delivery-Mode': 'pickup',
    'X-Platform': 'omniweb',
    'X-Retail-Brand': 'lo',
}

### 2. Web Scraping

To extract product data from Lenta, we follow a multi-step process: fetch product categories, collect item data by category, clean the results, and save everything in a structured format.

#### 2.1. Fetching Categories

We start by retrieving a list of product categories from Lenta’s API. The function extracts the relevant fields (*id*, *name*, parent category, *slug* — used to construct URLs — and *level*) and returns a cleaned list.

The request to the category endpoint requires a dynamic timestamp in milliseconds, which is passed as a query parameter. Each category is identified by a numeric ID and a slug; both are needed for product requests and URL construction.

In [None]:
def fetch_categories(timeout=30):
    """Fetch the category tree from the Lenta API and keep only the fields used later.

    Args:
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of dicts with the keys "id", "name", "parent_id", "parent_name",
        "slug" and "level". "parent_id" is None when the API reports 0 and
        "parent_name" is None when the API reports an empty string.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        KeyError: If the response lacks "categories" or a category lacks an expected field.
    """
    # the endpoint expects the current time in milliseconds appended to the URL
    url = f'{CATEGORIES_URL}{int(time.time() * 1000)}'

    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()  # fail loudly on an error status instead of parsing an error page
    raw_categories = response.json()

    # go through the raw data and select only the necessary fields
    return [
        {
            "id": category["id"], # id for further products fetching
            "name": category["name"], # name for reference
            "parent_id": category["parentId"] if category["parentId"] != 0 else None, # id of a parent category for reference
            "parent_name": category["parentName"] if category["parentName"] != '' else None, # empty if it's top-level and doesn't have a parent
            "slug": category["slug"], # category name used in URLs, for further products fetching
            "level": category["level"], # level of category, to avoid fetching the same categories twice
        }
        for category in raw_categories["categories"]
    ]

We fetch all available categories and save them to a CSV file for transparency and reproducibility. Then, we select only level 2 categories — they strike a balance between breadth and granularity: fewer requests than level 3 (faster and less risk of hitting rate limits), but smaller batches than level 1, which helps minimize data loss if a request fails or we get blocked.

In [None]:
# download the full category list once and keep a dated CSV snapshot of it
categories = fetch_categories()

categories_df = pd.DataFrame(data=categories)
categories_df.to_csv(path_or_buf=f'categories-{CHECK_DATE}-lenta.csv', index=False)

In [None]:
# optional intermediate output of the categories list
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(categories_df)

In [None]:
# keep only the categories whose level is exactly 2
categories_2 = list(filter(lambda entry: entry['level'] == 2, categories))
# categories_2 = categories_2[:] # this line can be used to resume scraping from a specific point in case of interruption
total_categories_count = len(categories_2)  # denominator for the progress messages

#### 2.2. Fetching Raw Product Data

We fetch product listings by mimicking the site’s API, which loads items in paginated batches (up to 40 per request) using an increasing offset. Unlike some other sites, Lenta uses a POST request to return product data ("items"), rather than a GET request. The request body includes the category ID, a fixed limit of 40, and an offset that increases by 40 with each page. A few additional fields (filters and sorting) are required but remain constant.

Optionally, a `Referer` header pointing to the category’s webpage (constructed using the base URL, slug, and ID) can be included to better mimic browser behavior — although it does not seem to be strictly required.

We loop through all pages until all products are retrieved. A random delay between requests reduces the risk of triggering rate limits.

The result is raw product data in JSON format, mirroring the structure used on the site.

In [None]:
def fetch_products(category, *, page_size=40, timeout=30):
    """Fetch all raw product entries of one category, page by page.

    Args:
        category: Dict with at least an "id" key (as produced by fetch_categories).
        page_size: Number of items requested per page; also the offset step,
            so the two can never get out of sync.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list with the raw "items" entries of every page, in the order received.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        KeyError: If a response lacks the "total" field.
    """
    total_products = 1 # initial dummy value to enter the loop; replaced by the API's "total" after the first page
    offset = 0
    products = []

    # set the Referer header to match the category being requested (may help avoid blocking)
    # NOTE: enabling this mutates the shared module-level HEADERS dict
    # HEADERS['Referer'] = f'https://lenta.com/catalog/{category["slug"]}-{category["id"]}/'
    url = 'https://lenta.com/api-gateway/v1/catalog/items'

    while offset < total_products:

        time.sleep(random.uniform(1, 5)) # random time delay to avoid being blocked

        # construct request body for POST request
        json_data = {
            'categoryId': category['id'],
            'limit': page_size,
            'offset': offset,
            'sort': {
                'type': 'popular',
                'order': 'desc',
            },
            'filters': {
                'range': [],
                'checkbox': [],
                'multicheckbox': [],
            },
        }

        # fetch product data for the current category, offset, and limit
        response = requests.post(url, headers=HEADERS, json=json_data, timeout=timeout)
        response.raise_for_status()  # stop on an error status instead of parsing an error page
        response_data = response.json()

        # NOTE(review): tolerates a missing/null "items" (presumably possible for an empty page) — confirm against the API
        products += response_data.get("items") or []
        total_products = response_data["total"] # update total number of products

        offset += page_size # move to the next page of results

        print(f'{len(products)}..', end='') # progress indicator: cumulative number of products fetched so far

    return products

#### 2.3. Cleaning Product Data

We define a function to extract only the most relevant fields from the raw data:

- Category ID (for traceability)
- Product name
- Regular price (converted from kopecks to rubles)
- Pricing clarification (e.g. net weight)
- Supermarket name

The result is a cleaned list of product entries, ready for storage or further analysis.

In [None]:
def clean_product_data(category, raw_products):
    """Reduce raw API product entries to the flat records stored in the CSV.

    Args:
        category: Dict with an "id" key; the id is copied into every record.
        raw_products: Iterable of raw product dicts, each providing "name",
            "prices"["priceRegular"] and "weight"["package"].

    Returns:
        A list of dicts with the keys "category_id", "name", "price",
        "pricing_unit" and "supermarket", one per input product.
    """
    return [
        {
            "category_id": category["id"],
            "name": item["name"],
            "price": item["prices"]["priceRegular"] / 100, # the raw value is divided by 100 (kopecks -> rubles)
            "pricing_unit": item['weight']["package"], # clarifies the unit for the price or the net weight
            "supermarket": 'Lenta',
        }
        for item in raw_products
    ]

#### 2.4. Putting It All Together

The following code:

1. Iterates through the selected level 2 categories,
2. Fetches and cleans product data for each,
3. Appends the cleaned data to a single CSV file to build a complete dataset.

A progress tracker prints feedback for each category to monitor the scraping process. A short random delay is added between iterations to avoid potential rate-limiting. The output file name includes the collection date (via `CHECK_DATE`) to record when the data was collected.

In [None]:
# Driver cell: scrape every selected category and append its cleaned products to one CSV file.
fetched_categories_count = 0    # counter for fetching progress tracker

# create the file with headers first (mode='w' overwrites any existing file of the same name)
products_df = pd.DataFrame(columns=["category_id", "name", "price", "pricing_unit", "supermarket"])
products_df.to_csv(f'scraped_products-{CHECK_DATE}-lenta.csv', index=False, mode='w')

for category in categories_2:
    
    raw_products = fetch_products(category) # fetch products
    new_products = clean_product_data(category, raw_products) # select only relevant data for this category

    # append this category's rows right away (no header), so finished categories are already on disk
    # if a later request fails
    products_df = pd.DataFrame(new_products)
    products_df.to_csv(f'scraped_products-{CHECK_DATE}-lenta.csv', index=False, mode='a', header=False)

    # the counter is increased only after the category has been written to the file
    fetched_categories_count += 1
    print(f'Category ID: {category["id"]} finished, {fetched_categories_count} out of {total_categories_count} categories fetched')
    
    time.sleep(random.uniform(1, 5)) # random pause between categories

print(f'Fetching complete. Results saved to scraped_products-{CHECK_DATE}-lenta.csv')

---

### 3. Filtering and Normalizing Product Data

After collecting and cleaning the raw product data, we proceed with filtering the dataset to include only the products relevant for comparison. This step involves several stages:

#### 3.1. Initial Preprocessing

We start by loading the previously saved product and category datasets and dropping the `category_id` column, which is no longer needed. We also remove exact duplicates, which may appear if the same item was listed in more than one category.

In [None]:
import pandas as pd
import re

# load the saved snapshots of the category list and of the scraped products
categories = pd.read_csv('categories-2025-03-04-lenta.csv')
products_original = pd.read_csv('scraped_products-2025-03-04-lenta-complete.csv')

# drop the category_id column, then remove rows that are exact duplicates
products = products_original.drop(columns=['category_id']).drop_duplicates()

#### 3.2. Filtering Products by Type

To identify relevant products for comparison, we define a dictionary mapping product types to regular expressions. Each expression captures the base form of the product while deliberately excluding variations (e.g., flavored, processed, or pickled) that fall outside the scope of this analysis.

In [None]:
# Maps a product-type tag to the regular expression that recognizes it in a product name.
# Every pattern is anchored to the start of the name (^); negative lookaheads (?!...) reject
# names that contain one of the listed fragments anywhere after the current position.
# Dict order matters to code that returns the first matching type.
product_regex_map = {
    'rice': r'^рис\b(?!.*овощ)',
    'bread': r'(^хлеб\b|^багет\b|^батон\b)(?!.*печеноч)',
    'chicken_fillet': r'^филе (кур|груд)(?!.*(копч|соус|запеч|бедр|индей|утен))',
    'pork_leg': r'^окорок (свин|из свин)(?!.*(копч|соус))',
    'egg': r'^яйцо курин',
    'cucumber': r'^огур(цы|ец)(?!.*(солен|маринован|ягод))',
    'carrot': r'^морковь(?!.*(корей|отвар))', # matches "морковь" and some of its variations, but excludes irrelevant "морковь по-корейски" or "морковь отварная"
    'onion': r'^лук репч(?!.*суш)',
    'tomato': r'^томаты(?!.*(сок|очищ|маринован|вялен|солен|измельч|кус))',
    'cabbage': r'^капуста\b.*белокоч',
    'eggplant': r'^баклажаны,',
    'banana': r'^банан(?!.*(вял|суш|куб))', # matches "банан" or "бананы", but excludes "бананы вяленые" or "бананы сушеные"
    'orange': r'^апельсин(?!.*(сахар|куб))',
    'milk': r'^молоко(?!.*(сгущ|кокос|сух|топл|коз|обогащ|витам|лактоз|печен))',
    'yogurt': r'^йогурт\b(?!.*питье)',
    'condensed_milk': r'^молоко.*сгущ(?!.*(варен|кофе|какао))',
    'green_tea': r'^чай зел(?!.*порош)',
    'black_tea': r'^чай черн',
    'ground_coffee': r'^кофе(?!.*(капсул|раствор|фильтр)).*молот',
    'sugar': r'^сахар\b(?!.*(ванил|коричн))',
    'salt': r'^соль(?!.*(посуд|ванн|розов|чесн|прян|купан|мельн))',
    'sunflower_oil': r'^масло\b(?!.*(оливк|спрей|аром|вкус|добавл)).*подсолн',
    'water': r'^вода\b(?!.*(малин|клюкв|лимон|цитр)).*негаз',
    'buckwheat': r'(^крупа\b.*гречн|^гречка\b)(?!.*зел)',
    'spaghetti': r'^макароны\b.*спагетти', # matches "макароны" and selects only the "спагетти" variety
    'rice_noodles': r'(^лапша|^вермишель).*(рис|фунчоз)(?!.*соус)',
    'tofu': r'(^продукт)(?!.*(копч|папр)).*тофу|^тофу(?!.*(гриб|томат))',
    'mango': r'^манго желт',
    'fish_sauce': r'^соус.*рыбн'
}
# create a single regex by joining all individual regexes with the OR operator (|)
product_regex_list = '|'.join(product_regex_map.values())

# filter products matching any of the product types (case-insensitive);
# na=False makes rows with a missing name count as "no match", so the mask is strictly boolean
# and .loc does not fail on NaN entries
filtered_products = products.loc[
    products.name.str.contains(product_regex_list, case=False, regex=True, na=False)
]

#### 3.3. Assigning Tags

Each filtered product is tagged with its corresponding product type based on regex matching.

In [None]:
def assign_product_type(row):
    """Return the tag of the first pattern in product_regex_map that matches the row's name.

    Patterns are tried in the dict's iteration order and matched
    case-insensitively; None is returned when no pattern matches.
    """
    product_name = row['name']
    matching_tags = (
        tag
        for tag, pattern in product_regex_map.items()
        if re.search(pattern, product_name, flags=re.IGNORECASE)
    )
    return next(matching_tags, None)

# work on an independent copy before adding columns
filtered_products = filtered_products.copy()
# tag every row with the product type derived from its name
filtered_products['product_type'] = filtered_products.apply(assign_product_type, axis=1)

# optional intermediate output of the filtered products list
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(filtered_products)

#### 3.4. Extracting and Normalizing Units

Many product listings differ in quantity, weight, or volume. To enable a fair comparison, we extract the following quantities from the product name or the pricing clarification field and use them to calculate normalized price metrics (price per kilogram, per liter, or per unit):

- Weight in grams
- Number of units (in particular, eggs)
- Volume in milliliters

Each value is extracted using pattern matching. Not all products contain all values, so some normalization columns (e.g., *price_kg*, *price_lit*, *price_unit*) may be missing depending on the item.

In [None]:
def extract_weight(row):
    """Return the total weight in grams found in a product row, or None.

    Lookup order:
      1. a multi-portion weight in the name (e.g. '5x100г') -> count * portion;
      2. a single weight in the name, in grams or kilograms;
      3. a single weight in the pricing clarification;
      4. a word starting with 'весов' in the name -> 1000 (price is per 1 kg).
    A missing (NaN/None) pricing clarification is treated as an empty string.
    """
    title = row['name']
    unit_note = row['pricing_unit']
    if pd.notna(unit_note):
        unit_note = str(unit_note)
    else:
        unit_note = ""  # handle NaN values as empty

    # multi-portion format: digits, latin or cyrillic "x", digits, then grams
    multi = re.search(r'(\d+)(x|х)(\d+|\d+[,.]\d+)\s?г\b', title)
    if multi is not None:
        count_text, _, portion_text = multi.groups()
        count = float(count_text.replace(',', '.'))
        portion = float(portion_text.replace(',', '.'))
        return count * portion

    # single weight: the name takes precedence over the pricing clarification
    single_weight = r'(\d+|\d+[,.]\d+)\s?(г\b|кг)'
    for text in (title, unit_note):
        found = re.search(single_weight, text)
        if found is None:
            continue
        grams = float(found.group(1).replace(',', '.'))
        if found.group(2) == 'кг':
            grams = grams * 1000  # convert kilograms to grams
        return grams

    # sold by weight: the price refers to 1 kg
    if re.search(r'\bвесов', title) is not None:
        return 1000

    return None

# the next two functions follow the same logic as extract_weight, but for units and milliliters
def extract_number_of_units(row):
    """Return the number of units found in a product row, or None.

    Looks for an integer followed by 'шт' (piece) or by 'пак', 'пир', 'саш'
    (for tea: bag, pyramid, sachet), first in the name and then in the pricing
    clarification. A missing (NaN/None) clarification is treated as empty.
    """
    title = row['name']
    unit_note = row['pricing_unit']
    unit_note = "" if not pd.notna(unit_note) else str(unit_note)

    units_pattern = r'(\d+)\s?(шт|пак|пир|саш)'
    # the name takes precedence over the pricing clarification
    for text in (title, unit_note):
        found = re.search(units_pattern, text)
        if found is not None:
            return int(found.group(1))

    return None

def extract_volume(row):
    """Return the total volume in milliliters found in a product row, or None.

    Lookup order:
      1. a multi-portion volume in the name (e.g. '5x100мл') -> count * portion;
      2. a single volume in the name, in milliliters or liters;
      3. a single volume in the pricing clarification.
    A missing (NaN/None) pricing clarification is treated as an empty string.
    """
    title = row['name']
    unit_note = row['pricing_unit']
    if pd.notna(unit_note):
        unit_note = str(unit_note)
    else:
        unit_note = ""

    # multi-portion format: digits, latin or cyrillic "x", digits, then milliliters
    multi = re.search(r'(\d+)(x|х)(\d+|\d+[,.]\d+)\s?мл', title)
    if multi is not None:
        count_text, _, portion_text = multi.groups()
        count = float(count_text.replace(',', '.'))
        portion = float(portion_text.replace(',', '.'))
        return count * portion

    # single volume: the name takes precedence over the pricing clarification
    single_volume = r'(\d+|\d+[,.]\d+)\s?(мл|л\b)'
    for text in (title, unit_note):
        found = re.search(single_volume, text)
        if found is None:
            continue
        milliliters = float(found.group(1).replace(',', '.'))
        if found.group(2) == 'л':
            milliliters = milliliters * 1000  # convert liters to milliliters
        return milliliters

    return None

filtered_products = filtered_products.copy()  # work on an independent copy before adding columns

# calculate normalized prices

# weight in grams -> price per kilogram
filtered_products['weight'] = filtered_products.apply(extract_weight, axis=1)
filtered_products['price_kg'] = filtered_products['price'] / filtered_products['weight'] * 1000

# number of units -> price per unit
filtered_products['number_of_units'] = filtered_products.apply(extract_number_of_units, axis=1)
filtered_products['price_unit'] = filtered_products['price'] / filtered_products['number_of_units']

# volume in milliliters -> price per liter
filtered_products['volume'] = filtered_products.apply(extract_volume, axis=1)
filtered_products['price_lit'] = filtered_products['price'] / filtered_products['volume'] * 1000

# optional intermediate output
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(filtered_products)

#### 3.5. Saving the Final Filtered Dataset

Finally, the enriched dataset is saved to a new CSV file for further analysis.

In [None]:
# write the enriched dataset to disk (the DataFrame index is written as the first column)
filtered_products.to_csv('filtered_products-2025-03-04-lenta.csv')