In [1]:
import pandas as pd
import ast
import re
import csv

In [2]:
df = pd.read_csv("recipes_data_10k.csv")

def safe_literal_eval(s):
    try:
        return ast.literal_eval(s)
    except Exception:
        return []

df["ing_list"] = df["ingredients"].apply(safe_literal_eval)
df["ner_list"] = df["NER"].apply(safe_literal_eval)

# ðŸ‘‰ Load the final cleaned + normalized ingredient universe
# ðŸ‘‰ Load the final cleaned + normalized ingredient universe
final_ing_df = pd.read_csv("unique_ingredients_final.csv")

# Drop obviously non-ingredient patterns like "choice", "favorite", "amount"
bad_pattern = r"(choice|favorite|equal amount)"
mask_bad = final_ing_df["ingredient"].str.contains(bad_pattern, case=False, na=False)
final_ing_df = final_ing_df[~mask_bad]

final_ingredients = final_ing_df["ingredient"].astype(str).tolist()
final_ingredient_set = set(final_ingredients)

print("Final ingredient universe after extra cleaning:", len(final_ingredients))



Final ingredient universe after extra cleaning: 3896


  mask_bad = final_ing_df["ingredient"].str.contains(bad_pattern, case=False, na=False)


In [3]:
import re

def clean_ner_token(token):
    """
    Clean and normalize a raw NER token into a canonical ingredient name.
    This should match the logic used in UniqueIngredients.ipynb.
    """
    if not isinstance(token, str):
        return None

    t = token.strip().lower()
    if not t:
        return None

    # Remove leading possessive: "'s applesauce" -> "applesauce"
    t = re.sub(r"^['`\"]?s\s+", "", t)

    # Strip surrounding punctuation and commas
    t = t.strip(" ,.;:()[]{}\"'")

    # Collapse multiple spaces
    t = re.sub(r"\s+", " ", t)

    # Drop tokens with no letters at all
    if not re.search(r"[a-z]", t):
        return None

    # Strip common prefixes:
    #   "additional parsley" -> "parsley"
    #   "any kind blueberries" -> "blueberries"
    #   "some oil" -> "oil"
    #   "your favorite salsa" -> "salsa"
    prefixes = [
        "additional ",
        "another ",
        "any kind ",
        "any ",
        "some ",
        "your favorite ",
        "amount ",
    ]
    for p in prefixes:
        if t.startswith(p):
            t = t[len(p):].strip()
            break

    if not t:
        return None

    # Handle "with ..." / "without ...":
    #   "with juice" -> "juice"
    #   "without sugar" -> "sugar"
    if t.startswith("with ") or t.startswith("without "):
        parts = t.split(maxsplit=1)
        if len(parts) == 2:
            t = parts[1].strip()
        else:
            return None

    if not t:
        return None

    # Remove descriptor suffixes like " washed", " drained", " prepared"
    for suffix in (" washed", " drained", " prepared"):
        if t.endswith(suffix):
            t = t[: -len(suffix)].strip()

    if not t:
        return None

    # Handle "favorite X" anywhere in the phrase:
    #   "favorite chicken" -> "chicken"
    #   "sack favorite tortilla" -> "tortilla"
    m = re.search(r"\bfavorite\s+(.+)$", t)
    if m:
        t = m.group(1).strip()

    # Special case cleanups / normalizations
    special_fixes = {
        "xxxx sugar": "sugar",
        "young carrots": "carrots",
        "acorn": "acorn squash",
    }
    if t in special_fixes:
        t = special_fixes[t]

    # Explicitly remove Accent brand variants
    accent_variants = {
        "accent",
        "accent salt",
        "accent seasoning",
        "accent seasonings",
    }
    if t in accent_variants or t.startswith("accent "):
        return None

    # Hard blacklist for obvious non-ingredients / junk
    blacklist_exact = {
        "young groundhog",
        "favorite",
        "your favorite",
        "amount",
        "equal amount",
    }
    if t in blacklist_exact:
        return None

    # Descriptor words that are not ingredients by themselves
    descriptor_words = {
        "washed", "drained", "fresh", "cold", "hot", "warm",
        "optional", "prepared", "chopped", "sliced", "diced",
        "cooked", "uncooked", "raw", "frozen", "thawed",
        "ripe", "lean", "boneless", "skinless",
        "whole", "large", "small", "medium",
        "fine", "coarse", "thick", "thin",
        "regular", "lite", "low-fat", "nonfat", "fat-free",
    }
    if t in descriptor_words:
        return None

    # Skip single-character garbage
    if len(t) <= 1:
        return None

    # Final trailing commas/whitespace cleanup
    t = re.sub(r",+$", "", t).strip()
    if not t:
        return None

    return t


In [4]:
unit_aliases = {
    'tsp': 'tsp', 'tsps': 'tsp', 'tsp.': 'tsp', 'teaspoon': 'tsp', 'teaspoons': 'tsp',
    'tbsp': 'tbsp', 'tbsp.': 'tbsp', 'tbsps': 'tbsp', 'tablespoon': 'tbsp', 'tablespoons': 'tbsp',
    'c': 'cup', 'c.': 'cup', 'cup': 'cup', 'cups': 'cup',
    'pt': 'pt', 'pt.': 'pt', 'pint': 'pt', 'pints': 'pt',
    'qt': 'qt', 'qt.': 'qt', 'quart': 'qt', 'quarts': 'qt',
    'oz': 'oz', 'oz.': 'oz', 'ounce': 'oz', 'ounces': 'oz',
    'lb': 'lb', 'lb.': 'lb', 'lbs': 'lb', 'lbs.': 'lb', 'pound': 'lb', 'pounds': 'lb',
    'g': 'g', 'g.': 'g', 'gram': 'g', 'grams': 'g',
    'kg': 'kg', 'kg.': 'kg', 'kilogram': 'kg', 'kilograms': 'kg',
}

In [5]:
grams_per_unit = {
    'tsp': 5.0,
    'tbsp': 15.0,
    'cup': 240.0,
    'pt': 473.0,
    'qt': 946.0,
    'oz': 28.3495,
    'lb': 453.592,
    'g': 1.0,
    'kg': 1000.0,
}

In [6]:
def parse_leading_quantity(s: str):
    """
    Extract a leading quantity like:
      - '1'
      - '1/2'
      - '1 1/2'
      - '1-1/2'
    and return (float_qty, raw_qty_str, rest_of_string)
    """
    s = s.strip().lower()
    m = re.match(r'(\d+/\d+|\d+(?:\s+\d+/\d+|\s*-\s*\d+/\d+)?)', s)
    if not m:
        return None, None, s
    qty_str = m.group(1)
    rest = s[m.end():].lstrip()

    def frac_to_float(fs: str) -> float:
        # handle things like "1 1/2" or "1-1/2"
        fs = fs.replace('-', ' ')
        parts = fs.split()
        total = 0.0
        for p in parts:
            if '/' in p:
                num, den = p.split('/')
                total += float(num) / float(den)
            else:
                total += float(p)
        return total

    try:
        qty = frac_to_float(qty_str)
    except Exception:
        qty = None

    return qty, qty_str, rest


In [7]:
def parse_grams(ing_text: str):
    """
    Approximate the weight in grams for a single ingredient line.
    Returns a float (grams) or None if completely unparseable.
    """
    s = ing_text.lower()
    qty, qty_str, rest = parse_leading_quantity(s)
    grams = None

    if qty is not None:
        # Try to read a unit immediately after the quantity
        m = re.match(r'([a-z]+\.?)', rest)
        unit_norm = None
        if m:
            unit_raw = m.group(1)
            unit_norm = unit_aliases.get(unit_raw)

        if unit_norm not in grams_per_unit:
            unit_norm = None

        # Case like "1 (12 oz.) can ..." (nested weight inside)
        if grams is None:
            m2 = re.search(
                r'(\d+(?:\.\d+)?)\s*(oz\.?|ounce[s]?|g\.?|gram[s]?|kg\.?|kilogram[s]?|lb\.?|pound[s]?)',
                rest
            )
            if m2:
                inner_qty = float(m2.group(1))
                unit_raw2 = m2.group(2).replace('.', '')
                if unit_raw2.startswith(('oz', 'ou')):
                    unit_norm2 = 'oz'
                elif unit_raw2.startswith('g'):
                    unit_norm2 = 'g'
                elif unit_raw2.startswith(('kg', 'ki')):
                    unit_norm2 = 'kg'
                elif unit_raw2.startswith(('lb', 'po')):
                    unit_norm2 = 'lb'
                else:
                    unit_norm2 = None

                if unit_norm2 and unit_norm2 in grams_per_unit:
                    grams = inner_qty * grams_per_unit[unit_norm2] * (qty if qty not in (None, 0) else 1.0)

        # Simple unit (cup, tsp, Tbsp, etc.)
        if grams is None and unit_norm in grams_per_unit:
            grams = qty * grams_per_unit[unit_norm]

        # Fallback heuristics if we still have nothing
        if grams is None:
            if 'egg' in rest:
                grams = qty * 50.0
            elif 'clove' in rest and 'garlic' in rest:
                grams = qty * 5.0
            elif 'can ' in rest or 'can of' in rest:
                grams = qty * 400.0
            elif 'package' in rest or 'pkg' in rest:
                grams = qty * 300.0
            elif 'slice' in rest:
                grams = qty * 30.0
            else:
                grams = qty * 50.0  # generic guess

    else:
        # No leading qty, but maybe "16 oz. cheese"
        m3 = re.search(
            r'(\d+(?:\.\d+)?)\s*(oz\.?|ounce[s]?|g\.?|gram[s]?|kg\.?|kilogram[s]?|lb\.?|pound[s]?)',
            s
        )
        if m3:
            q = float(m3.group(1))
            unit_raw3 = m3.group(2).strip('.')
            if unit_raw3.startswith(('oz', 'ou')):
                unit_norm3 = 'oz'
            elif unit_raw3.startswith('g'):
                unit_norm3 = 'g'
            elif unit_raw3.startswith(('kg', 'ki')):
                unit_norm3 = 'kg'
            elif unit_raw3.startswith(('lb', 'po')):
                unit_norm3 = 'lb'
            else:
                unit_norm3 = None

            if unit_norm3 and unit_norm3 in grams_per_unit:
                grams = q * grams_per_unit[unit_norm3]

    return grams

In [8]:
def compute_recipe_grams_row(ing_texts, ner_names):
    grams_map = {}
    for txt, name in zip(ing_texts, ner_names):
        cleaned_name = clean_ner_token(name)
        if cleaned_name is None:
            continue

        # Only keep ingredients that are in the final normalized universe
        if cleaned_name not in final_ingredient_set:
            continue

        g = parse_grams(txt)
        if g is None:
            continue

        grams_map[cleaned_name] = grams_map.get(cleaned_name, 0.0) + g

    return grams_map

df["grams_map"] = [
    compute_recipe_grams_row(ings, ners)
    for ings, ners in zip(df["ing_list"], df["ner_list"])
]

print("Example grams_map entry:\n", df["grams_map"].iloc[0])


Example grams_map entry:
 {'bite size shredded rice biscuits': 240.0, 'vanilla': 120.0, 'brown sugar': 2.5, 'nuts': 120.0, 'milk': 30.0, 'butter': 840.0}


In [9]:
# Use the final cleaned ingredient universe + consistent order
all_ingredients = final_ingredients

output_path = "recipes_ingredients_grams_matrix.csv"

import csv

with open(output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)

    # Header
    header = ["title"] + all_ingredients
    writer.writerow(header)

    # Rows
    for title, grams_map in zip(df["title"], df["grams_map"]):
        row = [title]
        for ing in all_ingredients:
            val = grams_map.get(ing, 0.0)

            # ðŸ”¥ rounding to whole grams
            val = round(val)
            # if you want decimals instead:
            # val = round(val, 2)

            row.append(val)
        writer.writerow(row)

print("Rounded CSV saved to:", output_path)
print("Columns:", len(all_ingredients))


Rounded CSV saved to: recipes_ingredients_grams_matrix.csv
Columns: 3896
