# In [None]:
import re
import unicodedata

import pandas as pd
from rapidfuzz import process, fuzz

# === Load Cleaned Data ===
# Both CSVs are resolved relative to the current working directory.
recipes_csv = "ingredients_cleaned_fast.csv"
walmart_csv = "walmart_cleaned.csv"  # result of WalmartDataCleaning.ipynb
recipes = pd.read_csv(recipes_csv)
walmart = pd.read_csv(walmart_csv)

# === Text Normalization ===
def normalize(s):
    """Return a lowercase, letters-only key used for fuzzy name matching.

    Steps, in order: fold accented letters to ASCII, lowercase, drop
    apostrophes, turn every other run of non-letters into a single space,
    remove known store/brand words, and collapse whitespace.

    Args:
        s: Raw ingredient or product name. Missing values (None, NaN, pd.NA)
            and non-string scalars such as numbers are accepted.

    Returns:
        The cleaned string; "" when the input is missing or contains no
        letters.
    """
    if not isinstance(s, str):
        # Missing cells carry no name at all.
        if pd.isna(s):
            return ""
        # A numeric cell has no .lower(); treat it as text instead of crashing.
        s = str(s)

    # Decompose accented letters and drop the combining marks so that
    # "jalapeño" becomes "jalapeno" rather than losing the letter entirely.
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    s = s.lower()

    # Apostrophes are removed outright so possessives stay one token
    # ("campbell's" -> "campbells").
    s = s.replace("'", "")

    # Every other non-letter run (digits, punctuation, whitespace) becomes one
    # space. Deleting them instead would glue words together
    # ("half-and-half" -> "halfandhalf") and hide hyphenated brand names from
    # the brand filter below.
    s = re.sub(r"[^a-z]+", " ", s)

    s = re.sub(r"\b(great value|market pantry|kroger|walmart|brand|store)\b", "", s)

    # Brand removal can leave doubled or leading spaces behind.
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Add a normalized matching key next to each raw name column.
recipes["ingredient_name_clean"] = recipes["ingredient_name"].map(normalize)
walmart["product_name_clean"] = walmart["product_name"].map(normalize)

# === Build Walmart lookup list ===
# Plain list in row order: position i corresponds to walmart.iloc[i].
product_names = list(walmart["product_name_clean"])

# === Fuzzy Match Function ===
def find_best_match(ingredient):
    if not isinstance(ingredient, str) or ingredient == "":
        return pd.Series([None, None, None, None, 0])
    match, score, idx = process.extractOne(
        ingredient,
        product_names,
        scorer=fuzz.token_sort_ratio
    )
    if score < 75:  # skip weak matches
        return pd.Series([None, None, None, None, score])
    product = walmart.iloc[idx]
    return pd.Series([
        product["product_name"],
        product.get("price", None),
        product.get("size", None),
        product.get("unit", None),
        score
    ])

# === Apply Matching ===
# Series.apply calls the matcher once per row in Python (it is not
# vectorized), and every call scans the full Walmart name list. Ingredient
# names repeat across recipes, so results are remembered per distinct cleaned
# name and the expensive search runs only once for each.
match_columns = ["matched_product", "price", "size", "unit", "match_score"]
_match_cache = {}


def _cached_best_match(ingredient):
    """Return find_best_match(ingredient), reusing the result of earlier calls."""
    try:
        return _match_cache[ingredient]
    except KeyError:
        result = _match_cache[ingredient] = find_best_match(ingredient)
        return result


if recipes.empty:
    # NOTE(review): apply() on an empty Series does not expand into five
    # columns, so the multi-column assignment below would presumably fail;
    # add the result columns empty instead.
    for column in match_columns:
        recipes[column] = None
else:
    recipes[match_columns] = (
        recipes["ingredient_name_clean"].apply(_cached_best_match)
    )

# === Save merged result ===
recipes.to_csv("recipes_with_prices.csv", index=False)
print(recipes.head(10))