In [1]:
# Imports
import pandas as pd
from rapidfuzz import process, fuzz
from tqdm import tqdm
import re

# Load data
recipes = pd.read_csv("recipe2.csv")
walmart = pd.read_csv("walmart_cleaned.csv")

# Text cleaning helpers
def clean_text(s):
    """
    Simple cleaner for ingredient strings:
    - lowercases
    - strips leading/trailing whitespace
    - keeps only alphanumeric + spaces
    - collapses multiple spaces
    """
    s = str(s).lower().strip()
    s = ''.join(ch for ch in s if ch.isalnum() or ch.isspace())
    s = re.sub(r"\s+", " ", s).strip()
    return s

recipes["ingredient_clean"] = recipes["ingredient"].map(clean_text)

# Check for product_clean column (from WalmartDataCleaning), but if not can create a minimal version as a fallback
if "product_clean" not in walmart.columns:
    def fallback_clean_product_name(name, brand=None):
        s = str(name).lower()
        if brand:
            b = str(brand).lower()
            s = re.sub(r"\b" + re.escape(b) + r"\b", " ", s)
        s = re.sub(r"[^a-z0-9\s]", " ", s)
        s = re.sub(r"\s+", " ", s).strip()
        return s

    if "BRAND" in walmart.columns:
        walmart["product_clean"] = walmart.apply(
            lambda row: fallback_clean_product_name(row["PRODUCT_NAME"], row["BRAND"]),
            axis=1
        )
    else:
        walmart["product_clean"] = walmart["PRODUCT_NAME"].map(
            lambda x: fallback_clean_product_name(x, None)
        )
else:
    # Normalize
    walmart["product_clean"] = walmart["product_clean"].astype(str).str.lower().str.strip()

# Helper: non-purchased ingredients (skip matching)
def is_non_purchased(ing_clean: str) -> bool:
    """
    Marks ingredients that are not actually purchased as products.
    """
    ing = ing_clean.strip()
    tokens = ing.split()

    # plain water / ice cases
    non_purchased_exact = {
        "water", "ice", "cold water", "hot water", "boiling water", "ice water"
    }
    if ing in non_purchased_exact:
        return True

    # water
    if len(tokens) == 2 and tokens[1] == "water":
        return True

    return False

# Helper: semantic adjustment to fuzzy scores
def adjusted_score(ingredient_clean: str, prod_clean: str, base_score: float) -> float:
    """
    Adjust fuzzy score with simple semantic penalties for obviously bad matches
    """
    meat_tokens = {"sausage", "bacon", "ham", "beef", "pork", "turkey", "chicken", "meat"}
    candy_tokens = {"chocolate", "candy", "caramel", "toffee", "bonbon", "truffle", "snack"}
    drink_tokens = {"soda", "cola", "drink", "juice", "energy", "coffee", "tea"}

    s = float(base_score)
    ing_tokens = set(ingredient_clean.split())
    prod_tokens = set(prod_clean.split())

    # Penalize meats if ingredient doesn't mention any meat tokens
    if meat_tokens & prod_tokens and not (meat_tokens & ing_tokens):
        s -= 10

    # Penalize candy/chocolate unless ingredient mentions chocolate
    if candy_tokens & prod_tokens and "chocolate" not in ing_tokens:
        s -= 8

    # Penalize drinks unless ingredient is a common beverage
    beverage_tokens = {"juice", "coffee", "tea", "soda", "cola", "drink"}
    if drink_tokens & prod_tokens and not (beverage_tokens & ing_tokens):
        s -= 8

    return s

# Fuzzy matching: ingredient_clean -> walmart.product_clean
unique_ingredients = recipes["ingredient_clean"].unique()
product_list = walmart["product_clean"].tolist()

cache = {}  # ingredient_clean -> {"matched_product": str or None, "match_score": float or None}

tqdm.pandas(desc="Matching ingredients to Walmart products")

for ing in tqdm(unique_ingredients, desc="Ingredients"):
    if is_non_purchased(ing):
        cache[ing] = {"matched_product": None, "match_score": None}
        continue

    # Top-k fuzzy matches for this ingredient against all walmart products.
    matches = process.extract(
        ing,
        product_list,
        scorer=fuzz.token_set_ratio,
        limit=5
    )

    best_prod = None
    best_adj_score = -1.0

    for prod_clean, score, idx in matches:
        adj = adjusted_score(ing, prod_clean, score)
        if adj > best_adj_score:
            best_adj_score = adj
            best_prod = prod_clean

    # Cutoff to reject low-confidence matches
    score_cutoff = 80.0
    if best_adj_score < score_cutoff:
        cache[ing] = {"matched_product": None, "match_score": best_adj_score}
    else:
        cache[ing] = {"matched_product": best_prod, "match_score": best_adj_score}

# Map cache back onto full recipes dataframe
recipes["matched_product"] = recipes["ingredient_clean"].map(
    lambda x: cache.get(x, {}).get("matched_product", None)
)
recipes["match_score"] = recipes["ingredient_clean"].map(
    lambda x: cache.get(x, {}).get("match_score", None)
)

# Merge recipes with Walmart catalog
merged = recipes.merge(
    walmart,
    left_on="matched_product",
    right_on="product_clean",
    how="left",
    suffixes=("_recipe", "_walmart")
)

# Compute pricing fields: PRICE_USED and PRICE_PER_UNIT
# Use PRICE_CURRENT when available, else use PRICE_RETAIL
merged["PRICE_USED"] = merged["PRICE_CURRENT"]
merged.loc[merged["PRICE_USED"].isna(), "PRICE_USED"] = merged.loc[
    merged["PRICE_USED"].isna(), "PRICE_RETAIL"
]

# Guard against divide-by-zero or missing PRODUCT_SIZE
merged["PRICE_PER_UNIT"] = merged["PRICE_USED"] / merged["PRODUCT_SIZE"]

# Reliability flag for matching
merged["reliable_match"] = merged["match_score"] >= 90.0

# Select and save "fast" merged file (with duplicates)
# Keep original columns for backward compatibility, then append new ones
cols_base = [
    "title", "ingredient", "amount", "unit",
    "matched_product", "match_score",
    "PRODUCT_NAME", "BRAND", "PRICE_RETAIL", "PRICE_CURRENT", "PRODUCT_SIZE"
]

cols_extra = [
    "PRICE_USED", "PRICE_PER_UNIT",
    "ingredient_clean", "product_clean",
    "DEPARTMENT", "CATEGORY", "SUBCATEGORY",
    "reliable_match"
]

# Only keep extra columns that actually exist in the merged df
cols_extra = [c for c in cols_extra if c in merged.columns]

final = merged[cols_base + cols_extra]

final.to_csv("merged_recipe_prices_fast.csv", index=False)
print("Saved merged_recipe_prices_fast.csv")

# Remove duplicates for nodupl version
merged_nodupl = final.drop_duplicates(
    subset=["title", "ingredient", "amount", "unit", "matched_product"]
)

final_nodupl = merged_nodupl[cols_base + cols_extra]

final_nodupl.to_csv("merged_recipe_price_nodupl.csv", index=False)
print("Saved merged_recipe_price_nodupl.csv (duplicates removed)")


Ingredients: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 6245/6245 [04:48<00:00, 21.62it/s]


Saved merged_recipe_prices_fast.csv
Saved merged_recipe_price_nodupl.csv (duplicates removed)


In [2]:
# Merge Quality Diagnostics

# Match score distribution
if "match_score" in final.columns:
    print("Match Score Summary:")
    print(final["match_score"].describe())
    print()

# Count matched ingredients
matched_count = final["matched_product"].notna().sum()
total_count = len(final)
print(f"Matched Ingredients: {matched_count} / {total_count} "
      f"({matched_count/total_count*100:.1f}%)\n")

# Show top unmatched ingredients
unmatched = final[final["matched_product"].isna()]["ingredient_clean"].unique()

print("Unmatched Ingredients (first 20):")
for x in unmatched[:20]:
    print("  •", x)
print()

# Show suspicious matches (matched product exists but low score)
sus = final[
    (final["match_score"] < 85) &
    (final["match_score"].notna()) &
    (final["matched_product"].notna())
]

print(f"Suspicious Matches: {len(sus)} rows\n")

if not sus.empty:
    print("Examples of Suspicious Matches (first 10):\n")
    display(
        sus[[
            "title",
            "ingredient",
            "ingredient_clean",
            "matched_product",
            "PRODUCT_NAME",
            "CATEGORY",
            "SUBCATEGORY",
            "match_score",
            "PRICE_USED"
        ]].head(10)
    )
else:
    print("No suspicious matches found.\n")

# Show high-confidence matches
good = final[final["match_score"] >= 95].head(10)

print("\nExamples of High-Confidence Matches:\n")
display(
    good[[
        "title",
        "ingredient",
        "ingredient_clean",
        "matched_product",
        "PRODUCT_NAME",
        "CATEGORY",
        "SUBCATEGORY",
        "match_score",
        "PRICE_USED"
    ]]
)

# Summaries by category
if "CATEGORY" in final.columns:
    print("\nMatch Count by CATEGORY (top 20):")
    display(final["CATEGORY"].value_counts().head(20))

print("\n=== END OF MERGE DIAGNOSTICS ===\n")


Match Score Summary:
count    242302.000000
mean         96.974933
std           7.053796
min           0.000000
25%         100.000000
50%         100.000000
75%         100.000000
max         100.000000
Name: match_score, dtype: float64

Matched Ingredients: 235280 / 246390 (95.5%)

Unmatched Ingredients (first 20):
  • water
  • flavor gelatin
  • boiling water
  • almond extract
  • paraffin
  • cleaned strawberries
  • wesson oil
  • frango
  • crisco oil
  • yellow apples
  • purple grapes
  • pimentos
  • english peas
  • warm water
  • oleo
  • egg yolks
  • multicolored candies
  • creamstyle
  • nut meats
  • orange sliced candy

Suspicious Matches: 3864 rows

Examples of Suspicious Matches (first 10):



Unnamed: 0,title,ingredient,ingredient_clean,matched_product,PRODUCT_NAME,CATEGORY,SUBCATEGORY,match_score,PRICE_USED
5,No-Bake Nut Cookies,bite size shredded rice biscuits,bite size shredded rice biscuits,rice 2,"goya goya rice, 2 lb",international foods,shop all,80.0,4.68
71,Scalloped Corn,cream-style corn,creamstyle corn,cream style corn 14 75,"butter kernel cream style corn, 14.75 oz, can",canned vegetables,shop all,81.081081,1.58
95,Millionaire Pie,graham cracker crusts,graham cracker crusts,kellogg s baked graham cracker snacks cinnamon 11,kellogg's scooby-doo! baked graham cracker sna...,crackers,,80.0,3.78
96,Millionaire Pie,graham cracker crusts,graham cracker crusts,kellogg s baked graham cracker snacks cinnamon 11,kellogg's scooby-doo! baked graham cracker sna...,healthy snacks,,80.0,2.84
138,Fresh Strawberry Pie,cornstarch,cornstarch,corn starch 16,"great value corn starch, 16 oz",baking soda & starch,,83.333333,1.54
139,Fresh Strawberry Pie,cornstarch,cornstarch,corn starch 16,"great value corn starch, 16 oz",baking soda & starch,,83.333333,1.96
328,Cherry Pizza,ground nuts,ground nuts,nutmeg ground,"badia nutmeg ground, bottle","herbs, spices & seasonings",shop all,83.333333,3.42
362,Watermelon Rind Pickles,watermelon rind,watermelon rind,tropical mix variety with citrus lime watermel...,corona hard seltzer tropical mix variety pack ...,beer,flavored beer & hard seltzers,80.0,17.27
363,Watermelon Rind Pickles,watermelon rind,watermelon rind,tropical mix variety with citrus lime watermel...,corona hard seltzer tropical mix variety pack ...,beer,flavored beer & hard seltzers,80.0,18.48
364,Watermelon Rind Pickles,watermelon rind,watermelon rind,tropical mix variety with citrus lime watermel...,corona hard seltzer tropical mix variety pack ...,beer,flavored beer & hard seltzers,80.0,14.27



Examples of High-Confidence Matches:



Unnamed: 0,title,ingredient,ingredient_clean,matched_product,PRODUCT_NAME,CATEGORY,SUBCATEGORY,match_score,PRICE_USED
0,No-Bake Nut Cookies,brown sugar,brown sugar,mini bagels brown sugar cinnamon 17 12,"pepperidge farm mini bagels, brown sugar cinna...",breakfast breads,bagels,100.0,4.84
2,No-Bake Nut Cookies,vanilla,vanilla,cold brew vanilla cinn 4pk,cafe agave cafe agave cold brew vanilla cinn 4pk,beer,flavored beer & hard seltzers,100.0,12.98
3,No-Bake Nut Cookies,nuts,nuts,ziyad extra mixed nuts 300 gm,ziyad castania extra mixed nuts 300 gm,baking nuts & seeds,,100.0,7.98
6,Jewell Ball'S Chicken,beef,beef,amb nc beef hd lrg,amb nc beef hd lrg,"bacon, hot dogs, sausage",,100.0,5.18
7,Jewell Ball'S Chicken,chicken breasts,chicken breasts,meats country fried chicken breasts 5 with gra...,mistica ranch meats country fried chicken brea...,condiments,shop all,100.0,6.97
8,Jewell Ball'S Chicken,chicken breasts,chicken breasts,meats country fried chicken breasts 5 with gra...,mistica ranch meats country fried chicken brea...,condiments,shop all,100.0,7.22
9,Jewell Ball'S Chicken,chicken breasts,chicken breasts,meats country fried chicken breasts 5 with gra...,mistica ranch meats country fried chicken brea...,condiments,shop all,100.0,6.94
10,Jewell Ball'S Chicken,cream of mushroom soup,cream of mushroom soup,cream,puck puck cream,international foods,shop all,100.0,3.98
11,Jewell Ball'S Chicken,sour cream,sour cream,potato chips sour cream and onion kettle chips...,"cape cod potato chips, sour cream and onion ke...",chips,,100.0,3.68
12,Creamy Corn,frozen corn,frozen corn,battered sweet corn nuggets frozen 26,pecos valley farms battered sweet corn nuggets...,fresh food,plant-base alternatives,100.0,4.84



Match Count by CATEGORY (top 20):


CATEGORY
bacon, hot dogs, sausage      33600
canned goods                  27547
baking soda & starch          15854
baking nuts & seeds           15599
condiments                    12664
better for you                10831
fresh food                    10404
breakfast breads               9849
healthy snacks                 9784
beer                           8622
biscuits, cookies, doughs      7496
chocolate                      6365
flours & meals                 6350
herbs, spices & seasonings     6095
easy to make                   5147
cheese                         4263
butter & margarine             3908
international foods            3432
chips                          3381
juices                         3357
Name: count, dtype: int64


=== END OF MERGE DIAGNOSTICS ===

