In this notebook we're going to integrate all the filtered data and do the final cleaning

In [1]:
import pandas as pd

First, let's import the data into corresponding variables

In [2]:
# One filtered export per supermarket; the file names carry the collection date.
pyaterochka, lenta, winmart, coop = (
    pd.read_csv(csv_path)
    for csv_path in (
        'filtered_products-2025-03-03-pyaterochka.csv',
        'filtered_products-2025-03-04-lenta.csv',
        'filtered_products-2025-03-06-winmart.csv',
        'filtered_products-2025-03-07-coop.csv',
    )
)

Next, let's drop unnecessary columns (id, 0) and unify column names

In [3]:
# Pyaterochka and Lenta call the unit-of-measure column 'pricing_unit'; rename it to 'uom'.
unit_column_rename = {'pricing_unit': 'uom'}

# Pyaterochka carries an explicit 'id' column; for the other three the first column is dropped by position.
# NOTE: re-running this cell on already-trimmed frames would drop another column (or fail on 'id').
pyaterochka = pyaterochka.drop(columns='id').rename(columns=unit_column_rename)
lenta = lenta.drop(columns=lenta.columns[0]).rename(columns=unit_column_rename)
winmart = winmart.drop(columns=winmart.columns[0])
coop = coop.drop(columns=coop.columns[0])

Next, let's integrate them all into a single DataFrame

In [4]:
# Stack the four supermarket frames and renumber the rows 0..n-1; the hard-coded row labels
# used in later cells refer to this fresh index.
supermarket_frames = [pyaterochka, lenta, winmart, coop]
all_products = pd.concat(supermarket_frames, ignore_index=True)

Let's also drop and add a few items (explanation in the comments)

In [5]:
# Rows that are wrong for their product type are removed here, once, so that later
# cells do not have to deal with them again. Keys are the reason, values the row labels.
rows_to_drop = {
    'brown rice': [684, 224],
    'wrong type of eggs': [1877],
    'oranges without weight': [1683],
    'condensed milk (not sweetened)': [1470, 1506],
    'drinking yogurt': [108, 124, 129, 114, 118, 115, 131, 130, 421, 422, 419, 420, 418, 117, 1662],
    'coffee in drip boxes': [575, 566, 565, 564, 365, 360, 359, 560, 559, 576, 506],
    'non-white sugar or liquid (2310)': [1294, 1297, 292, 289, 1304, 2310],
    'non-wheat spaghetti': [443],
    'non-rice noodles': [1499, 1495, 1493, 1498, 1489, 1497, 1478, 202],
}
labels_to_drop = [label for labels in rows_to_drop.values() for label in labels]
all_products = all_products.drop(labels_to_drop)

# Add the eggplant from Co.op by hand under a new row label (10000).
# The values are positional, so they must follow the frame's column order.
all_products.loc[10000] = ['Cà tím màng co', 25500, 'kg', 'eggplant', 'Co.op', 1000, 25500, None, None, None, None]

  all_products.loc[10000] = ['Cà tím màng co', 25500, 'kg', 'eggplant', 'Co.op', 1000, 25500, None, None, None, None] # eggplant from Co.op


In [211]:
# Ad-hoc inspection snippet, left disabled. When uncommented it displays every Lenta row whose
# name contains 'весов', without row or column-width truncation and with floats shown to two decimals.
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None, 'display.float_format', '{:.2f}'.format):
#     display(all_products.loc[(all_products['supermarket'] == 'Lenta') & (all_products['name'].str.contains('весов'))])

Now we have to clean it up. I will filter them by supermarket and product type and check any possible issues with prices and packaging
NOTE: some products have multiple normalized prices, and some of those are erroneous, but I will ignore the erroneous ones as long as the main price for the given product type is correct. For example, if a rice product has a correct price_kg and an erroneous price_unit, I will ignore price_unit, because price_kg is correct and all other rice products have price_kg.

In [6]:
# Manual per-row corrections, applied in order as (column, corrected value, row labels).
# Weights and volumes are in the same units the price recalculation expects
# (price / quantity * 1000 gives the per-kg / per-litre price).
manual_fixes = [
    # items that failed in price normalization (all NaN)
    ('weight', 100, [1660]),
    ('weight', 250, [1736]),
    ('volume', 500, [1800]),
    ('weight', 500, [1819]),

    # some "per kg" prices in Lenta turned out to be not per kg, so they have to be fixed 🙉
    ('weight', 300, [598]),
    ('weight', 400, [478]),
    ('weight', 500, [446, 453, 456, 459, 460, 461, 462, 463, 464, 467, 468, 469, 476, 488, 599, 728, 731, 733]),
    ('weight', 600, [610]),
    ('weight', 700, [734]),
    ('weight', 900, [448]),
    ('weight', 950, [736]),
    ('weight', 1200, [729]),
    ('weight', 1400, [735, 1162]),
    ('weight', 1500, [458, 1160]),
    ('weight', 1700, [730]),
    ('weight', 1900, [732]),
    ('weight', 2700, [455]),
    ('weight', 3000, [451]),

    # tea that didn't have a weight
    ('weight', 200, [737, 739, 741, 742, 743, 744, 759, 760, 764, 771, 783, 784, 1097, 1003]),
    ('weight', 45, [745, 781, 1053, 1015, 866]),
    ('weight', 50, [746, 747, 752, 755, 756, 762, 763, 765, 767, 769, 779, 780, 782, 785, 1112, 1159, 961]),
    ('weight', 180, [748, 1089]),
    ('weight', 37.5, [753, 754, 772, 1118, 1149, 1038]),
    ('weight', 36, [770, 773, 1024]),
    ('weight', 100, [1471]),
    ('weight', 40, [1133, 986, 898]),
    ('weight', 24, [1123]),
    ('weight', 42.5, [1107]),
    ('weight', 27, [1075]),
    ('weight', 150, [1064]),
    ('weight', 34, [1058]),
    ('weight', 170, [1044]),
    ('weight', 170, [1020]),

    # items that normalized incorrectly
    ('weight', 300, [454]),
    ('number_of_units', 10, [1890]),
    ('weight', 220, [9]),
    ('volume', 600, [1988]),    # in reality it's a pack of 3x200ml
    ('volume', 12000, [1714]),  # in reality it's a pack of 24
    ('volume', 600, [1789]),    # in reality it's a pack of 12
    ('volume', 750, [1792]),
]

for column, value, labels in manual_fixes:
    # A scalar `.loc[label, col] = value` on a label that is not in the index silently appends
    # a new, otherwise-empty row. Validate first so a stale label fails loudly instead.
    missing = pd.Index(labels).difference(all_products.index)
    if len(missing):
        raise KeyError(f'rows {list(missing)} not found in all_products; cannot set {column!r} = {value}')
    all_products.loc[labels, column] = value

# Recompute the normalized prices wherever the underlying quantity is known:
# per kg and per litre scale by 1000, per unit is a plain division.
for quantity_column, price_column, scale in (
    ('weight', 'price_kg', 1000),
    ('number_of_units', 'price_unit', 1),
    ('volume', 'price_lit', 1000),
):
    has_quantity = all_products[quantity_column].notna()
    all_products.loc[has_quantity, price_column] = all_products['price'] / all_products[quantity_column] * scale

NOTES: 1) All milk has to be converted to litres. 2) Drop the tea that has no weight, because there is plenty of tea anyway.

In [7]:
# Milk that has a weight but no volume yet: derive the volume from the weight.
# The divisor is the milk density, 1.03 g per ml (i.e. 1.03 kg per litre).
milk_density_g_per_ml = 1.03
milk_without_volume = (all_products['product_type'] == 'milk') & all_products['volume'].isna()
all_products.loc[milk_without_volume, 'volume'] = all_products['weight'] / milk_density_g_per_ml

# Refresh the per-litre price for every row that now has a volume.
has_volume = all_products['volume'].notna()
all_products.loc[has_volume, 'price_lit'] = all_products['price'] / all_products['volume'] * 1000

# Keep everything except black/green tea that still has no weight.
is_tea = all_products['product_type'].isin(['black_tea', 'green_tea'])
all_products = all_products.loc[~is_tea | all_products['weight'].notna()]

Finally, let's remove unnecessary and erroneous values, like weight for liquids or number of units where it's not applicable. Let's create a new DataFrame for that.

In [8]:
# Take a real copy: a plain assignment would only alias `all_products`, so the blanking below
# would also rewrite `all_products` (which is itself a filtered slice of an earlier frame).
clean_products = all_products.copy()

# Product types priced per unit and per litre; every other type is priced per kg.
unit_types = ['egg']
volume_types = ['water', 'sunflower_oil', 'soybean_oil', 'milk', 'fish_sauce']

unit_items = clean_products['product_type'].isin(unit_types)
volume_items = clean_products['product_type'].isin(volume_types)
weight_items = ~(unit_items | volume_items)

# For each group keep only the quantity/price pair that applies and blank out the other two.
clean_products.loc[unit_items, ['weight', 'price_kg', 'volume', 'price_lit']] = None
clean_products.loc[volume_items, ['weight', 'price_kg', 'number_of_units', 'price_unit']] = None
clean_products.loc[weight_items, ['volume', 'price_lit', 'number_of_units', 'price_unit']] = None

NameError: name 'egg_items' is not defined

Probably final tweak: let's convert prices into dollars

In [300]:
# Exchange rates to USD.
# Rates are a bit volatile, especially the ruble, so this uses the average for the month
# from Feb 7 to Mar 7 (last data collection).
# source: exchange-rates.org
rub_to_usd = 0.011056
vnd_to_usd = 0.000039214

usd_rate_by_supermarket = {
    'Pyaterochka': rub_to_usd,
    'Lenta': rub_to_usd,
    'Winmart': vnd_to_usd,
    'Co.op': vnd_to_usd,
}
price_columns = ['price', 'price_kg', 'price_unit', 'price_lit']

# Look up each row's rate. A supermarket without a rate would otherwise stay in its local
# currency and silently mix with the converted prices, so fail loudly instead.
usd_rate = clean_products['supermarket'].map(usd_rate_by_supermarket)
unmapped = clean_products.loc[usd_rate.isna(), 'supermarket'].unique()
if len(unmapped):
    raise ValueError(f'no exchange rate defined for supermarket(s): {list(unmapped)}')

# NOTE: this converts in place, so running the cell twice on the same frame converts twice.
# Rebuild clean_products before re-running.
clean_products[price_columns] = clean_products[price_columns].mul(usd_rate, axis=0)

In [301]:
# Show the whole cleaned table, grouped by product type and supermarket and ordered by
# the normalized prices, without truncation and with two-decimal floats.
sort_keys = ['product_type', 'supermarket', 'price_kg', 'price_unit', 'price_lit']
with pd.option_context(
    'display.max_rows', None,
    'display.max_colwidth', None,
    'display.float_format', '{:.2f}'.format,
):
    display(clean_products.sort_values(by=sort_keys))

Unnamed: 0,name,price,uom,product_type,supermarket,weight,price_kg,number_of_units,price_unit,volume,price_lit
1869,Chuối cau,0.62,kg,banana,Co.op,1000.0,0.62,,,,
1871,Chuối sứ – kg,0.75,kg,banana,Co.op,1000.0,0.75,,,,
1865,Chuối sáp sống kg,1.16,kg,banana,Co.op,1000.0,1.16,,,,
1874,Chuối Coop Select L1 kg,1.25,kg,banana,Co.op,1000.0,1.25,,,,
1873,Chuối vàng Laha giống Philippin kg – Bảo Phương,1.37,kg,banana,Co.op,1000.0,1.37,,,,
735,"Бананы, фасованные, весовые",2.61,,banana,Lenta,1400.0,1.86,,,,
729,"Бананы, весовые",2.37,,banana,Lenta,1200.0,1.98,,,,
29,Бананы Global Village,1.88,Цена за 1 кг,banana,Pyaterochka,1000.0,1.88,,,,
31,Бананы Красная Цена фасованные,1.88,Цена за 1 кг,banana,Pyaterochka,1000.0,1.88,,,,
1689,Chuối sứ,0.82,KG,banana,Winmart,1000.0,0.82,,,,


In [302]:
# Write the cleaned, USD-converted table to disk. The row index is written as the first
# (unnamed) column. Plain string literal: the name has no placeholders, so no f-string is needed.
clean_products.to_csv('clean_products-2025-03-12.csv')