In [7]:
import pandas as pd
from rapidfuzz import process, fuzz
from tqdm import tqdm

# Load data
recipes = pd.read_csv("recipe2.csv")
walmart = pd.read_csv("walmart_cleaned.csv")

# Clean text
def clean_text(s):
    s = str(s).lower().strip()
    s = ''.join(ch for ch in s if ch.isalnum() or ch.isspace())
    return s

recipes["ingredient_clean"] = recipes["ingredient"].map(clean_text)
walmart["product_clean"] = walmart["PRODUCT_NAME"].map(clean_text)

# Prepare sets
choices = walmart["product_clean"].tolist()

# Cache & batching
unique_ingredients = recipes["ingredient_clean"].unique()
print(f"ðŸ§‚ Unique ingredients to match: {len(unique_ingredients)}")

cache = {}
batch_size = 200

for start in tqdm(range(0, len(unique_ingredients), batch_size)):
    batch = unique_ingredients[start:start+batch_size]
    # vectorized distance computation
    results = process.cdist(
        batch, choices,
        scorer=fuzz.token_set_ratio,
        workers=-1,  # use all CPU cores
        score_cutoff=80
    )
    # get best match per ingredient
    for i, ing in enumerate(batch):
        if results[i].size > 0:
            best_idx = results[i].argmax()
            best_match = choices[best_idx]
            best_score = results[i][best_idx]
            cache[ing] = (best_match, best_score)
        else:
            cache[ing] = (None, None)

# Map cached matches to recipes
recipes["matched_product"] = recipes["ingredient_clean"].map(lambda x: cache[x][0])
recipes["match_score"] = recipes["ingredient_clean"].map(lambda x: cache[x][1])

# Merge
merged = recipes.merge(
    walmart,
    left_on="matched_product",
    right_on="product_clean",
    how="left",
    suffixes=("_recipe", "_walmart")
)

# Select relevant columns
final = merged[[
    "title", "ingredient", "amount", "unit",
    "matched_product", "match_score",
    "PRODUCT_NAME", "BRAND", "PRICE_RETAIL", "PRICE_CURRENT", "PRODUCT_SIZE"
]]

# Export
final.to_csv("merged_recipe_prices_fast.csv", index=False)
print("Saved merged_recipe_prices_fast.csv")


ðŸ§‚ Unique ingredients to match: 6251


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 32/32 [22:42<00:00, 42.57s/it]


âœ… Saved merged_recipe_prices_fast.csv


In [13]:
# Remove duplicates from merged dataset
merged = pd.read_csv("merged_recipe_prices_fast.csv")
print("Before deduplication:", len(merged))
merged_nodupl = merged.drop_duplicates(subset=["title", "ingredient", "amount", "unit", "matched_product"])
print("After deduplication:", len(merged_nodupl))

# Save cleaned file
final_nodupl = merged_nodupl[[
    "title", "ingredient", "amount", "unit",
    "matched_product", "match_score",
    "PRODUCT_NAME", "BRAND", "PRICE_RETAIL", "PRICE_CURRENT", "PRODUCT_SIZE"
]]

# Export final cleaned file (duplicates removed)
final_nodupl.to_csv("merged_recipe_price_nodupl.csv", index=False)
print("Saved merged_recipe_prices_nodupl.csv (duplicates removed)")

Before deduplication: 5621815
After deduplication: 139815
Saved merged_recipe_prices_nodupl.csv (duplicates removed)
