In [2]:
import pandas as pd
import ast
import re


def safe_literal_eval(x):
    """Parse a string like "['salt', 'pepper']" into a Python list.

    Args:
        x: The raw cell value. Normally a string holding a list literal, but
            anything is accepted (e.g. NaN for an empty CSV cell).

    Returns:
        The parsed list. Tuples and sets are converted to lists. An empty
        list is returned when ``x`` cannot be parsed as a literal or when it
        parses to something that is not a collection (a number, a bare
        string, a dict, ...), so callers can always iterate the result
        token by token.
    """
    try:
        value = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed input;
        # treat such cells as "no ingredients" rather than aborting the run.
        return []

    if isinstance(value, list):
        return value
    if isinstance(value, (tuple, set)):
        return list(value)
    # A scalar such as 5 is not iterable, and a bare string would be iterated
    # character by character; neither is a usable token list.
    return []


# Patterns and lookup tables used by clean_and_normalize(). They are built
# once at import time instead of being re-created on every call.
_LEADING_POSSESSIVE_RE = re.compile(r"^['`\"]?s\s+")
_WHITESPACE_RE = re.compile(r"\s+")
_ASCII_LETTER_RE = re.compile(r"[a-z]")
_FAVORITE_RE = re.compile(r"\bfavorite\s+(.+)$")
_TRAILING_COMMAS_RE = re.compile(r",+$")

# Characters stripped from both ends of a raw token.
_EDGE_PUNCTUATION = " ,.;:()[]{}\"'"

# Filler prefixes removed from the start of a token. Only the first match is
# applied, so "any kind " must be listed before the shorter "any ".
_FILLER_PREFIXES = (
    "additional ",
    "another ",
    "any kind ",
    "any ",
    "some ",
    "your favorite ",
    "amount ",
)

# Preparation words removed from the end of a token, tried in this order.
_DESCRIPTOR_SUFFIXES = (" washed", " drained", " prepared")

# One-off rewrites for specific tokens.
_SPECIAL_FIXES = {
    "xxxx sugar": "sugar",
    "young carrots": "carrots",
    "acorn": "acorn squash",
}

# Obvious non-ingredients / junk, matched exactly.
_BLACKLIST_EXACT = frozenset({
    "young groundhog",
    "favorite",
    "your favorite",
    "amount",
    "equal amount",
})

# Descriptor words that are not ingredients by themselves.
_DESCRIPTOR_WORDS = frozenset({
    "washed", "drained", "fresh", "cold", "hot", "warm",
    "optional", "prepared", "chopped", "sliced", "diced",
    "cooked", "uncooked", "raw", "frozen", "thawed",
    "ripe", "lean", "boneless", "skinless",
    "whole", "large", "small", "medium",
    "fine", "coarse", "thick", "thin",
    "regular", "lite", "low-fat", "nonfat", "fat-free",
})


def clean_and_normalize(token: str):
    """Run the full cleaning + normalization pipeline on a single NER token.

    The steps are applied in this order: lowercase and trim, drop a leading
    possessive ("'s applesauce" -> "applesauce"), strip surrounding
    punctuation, collapse whitespace, remove filler prefixes, remove a
    leading "with"/"without", remove preparation suffixes, keep only what
    follows "favorite", strip trailing commas, and finally apply the
    special-case rewrites and the discard filters.

    Args:
        token: The raw token. Non-string values are discarded.

    Returns:
        The final ingredient string, or None if the token should be
        discarded.
    """
    if not isinstance(token, str):
        return None

    # Basic normalize
    t = token.strip().lower()
    if not t:
        return None

    # Remove leading possessive: "'s applesauce" -> "applesauce"
    t = _LEADING_POSSESSIVE_RE.sub("", t)

    # Strip surrounding punctuation and commas
    t = t.strip(_EDGE_PUNCTUATION)
    if not t:
        return None

    # Collapse multiple spaces
    t = _WHITESPACE_RE.sub(" ", t)

    # Drop tokens with no letters at all (only symbols/numbers)
    if not _ASCII_LETTER_RE.search(t):
        return None

    # Strip the first matching filler prefix:
    #   "additional parsley" -> "parsley"
    #   "any kind blueberries" -> "blueberries"
    #   "your favorite salsa" -> "salsa"
    for prefix in _FILLER_PREFIXES:
        if t.startswith(prefix):
            t = t[len(prefix):].strip()
            break
    if not t:
        return None

    # Handle "with ..." / "without ...":
    #   "with juice" -> "juice"
    #   "without sugar" -> "sugar"
    if t.startswith(("with ", "without ")):
        t = t.partition(" ")[2].strip()
        if not t:
            return None

    # Remove descriptor suffixes like " washed", " drained", " prepared".
    # Every suffix is tried (no break), so stacked descriptors such as
    # "beans drained prepared" lose more than one.
    for suffix in _DESCRIPTOR_SUFFIXES:
        if t.endswith(suffix):
            t = t[: -len(suffix)].strip()
    if not t:
        return None

    # Handle "favorite X" anywhere in the phrase:
    #   "favorite chicken" -> "chicken"
    #   "sack favorite tortilla" -> "tortilla"
    favorite_match = _FAVORITE_RE.search(t)
    if favorite_match:
        t = favorite_match.group(1).strip()

    # Suffix removal can expose a trailing comma ("fresh, washed" ->
    # "fresh,"). Remove it BEFORE the lookups below so that such tokens are
    # still caught by the special-case, blacklist, descriptor and
    # single-letter checks instead of slipping past them.
    t = _TRAILING_COMMAS_RE.sub("", t).strip()
    if not t:
        return None

    # Special case cleanups / normalizations
    t = _SPECIAL_FIXES.get(t, t)

    # Explicitly remove the Accent brand seasoning and all its variants
    # ("accent salt", "accent seasoning", ...).
    if t == "accent" or t.startswith("accent "):
        return None

    # Hard blacklist and bare descriptor words
    if t in _BLACKLIST_EXACT or t in _DESCRIPTOR_WORDS:
        return None

    # VERY IMPORTANT: drop single-letter tokens like "a"
    if len(t) <= 1:
        return None

    return t


def main():
    """Build the deduplicated ingredient list from the recipes CSV.

    Reads the "NER" column of the recipes file, parses every cell into a
    token list, cleans each token, and writes the sorted unique results to a
    one-column CSV and to a plain-text file (one ingredient per line). A short
    summary and the first 40 ingredients are printed at the end.

    Raises:
        ValueError: If the recipes file has no "NER" column.
    """
    source_path = "../recipes_data_10k.csv"
    csv_target = "unique_ingredients_final.csv"
    txt_target = "unique_ingredients_final.txt"

    print(f"Loading recipes from: {source_path}")
    recipes = pd.read_csv(source_path)

    if "NER" not in recipes.columns:
        raise ValueError("Expected a column named 'NER' in recipes_data_10k.csv")

    # Each cell holds a stringified list of tokens; parse, clean, and keep
    # only the tokens that survive cleaning. The set removes duplicates.
    token_lists = recipes["NER"].apply(safe_literal_eval)
    candidates = (
        clean_and_normalize(raw) for tokens in token_lists for raw in tokens
    )
    ingredients = sorted({name for name in candidates if name})

    table = pd.DataFrame({"ingredient": ingredients})
    table.to_csv(csv_target, index=False)

    with open(txt_target, "w", encoding="utf-8") as handle:
        handle.writelines(name + "\n" for name in ingredients)

    report = (
        "=== FINAL INGREDIENT LIST BUILT ===",
        f"Total unique ingredients: {len(ingredients)}",
        f"Saved CSV -> {csv_target}",
        f"Saved TXT -> {txt_target}",
        "\nSample of final ingredients:",
        table.head(40).to_string(index=False),
    )
    for line in report:
        print(line)


# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading recipes from: ../recipes_data_10k.csv
=== FINAL INGREDIENT LIST BUILT ===
Total unique ingredients: 3905
Saved CSV -> unique_ingredients_final.csv
Saved TXT -> unique_ingredients_final.txt

Sample of final ingredients:
             ingredient
       achiote coloring
                  acini
           acorn squash
       active dry yeast
           active yeast
       adams wheat beer
              ajinomoto
            alaga syrup
          alfalfa honey
        alfalfa sprouts
          alfredo sauce
               all-bran
        all-bran cereal
            all-purpose
all-purpose biscuit mix
      all-purpose flour
               allspice
                 almond
            almond bark
         almond extract
       almond flavoring
          almond slices
         almond slivers
           almond-honey
                almonds
                 almost
                   alum
          aluminum foil
        american cheese
 american cheese slices
                ammonia
     

In [3]:
import pandas as pd
from rapidfuzz import process, fuzz

# Fuzzy-match every cleaned ingredient name against the nutrition table's
# "Description" column and save the macronutrients of the best match.

# Load files
ing = pd.read_csv('../ingredients.csv')
uniq = pd.read_csv('unique_ingredients_final.csv')

# Lowercase both sides so the comparison is case-insensitive. Missing values
# are dropped first: astype(str) would otherwise turn NaN into the literal
# text "nan", which partial_ratio scores as a perfect match against any name
# containing it (e.g. "banana").
descriptions = ing['Description'].dropna().astype(str).str.lower()
unique_names = uniq.iloc[:, 0].dropna().astype(str).str.lower()

# A match is accepted only when its score is strictly above this value.
MIN_SCORE = 70

# Fixed column layout so the output CSV has a header even with zero matches.
RESULT_COLUMNS = [
    'Unique Ingredient',
    'Matched Description',
    'Carbohydrate',
    'Fat.Total Lipid',
    'Protein',
    'Score',
]

matches = []
unmatched = []

for name in unique_names:
    result = process.extractOne(name, descriptions, scorer=fuzz.partial_ratio)

    # extractOne yields None when it finds no candidate at all.
    if result is None or result[1] <= MIN_SCORE:
        unmatched.append(name)
        continue

    _, score, key = result
    # For a pandas Series, rapidfuzz reports the index *label* of the matched
    # choice, so the row must be fetched by label (.loc), not by position.
    row = ing.loc[key]
    matches.append({
        'Unique Ingredient': name,
        'Matched Description': row['Description'],
        'Carbohydrate': row.get('Data.Carbohydrate', None),
        'Fat.Total Lipid': row.get('Data.Fat.Total Lipid', None),
        'Protein': row.get('Data.Protein', None),
        'Score': score
    })

# Convert results to dataframe
out_df = pd.DataFrame(matches, columns=RESULT_COLUMNS)

# Save to CSV
output_path = 'matched_ingredients.csv'
out_df.to_csv(output_path, index=False)

# Notebook display value: (matched count, unmatched count, output file).
(len(matches), len(unmatched), output_path)


(3182, 723, 'matched_ingredients.csv')