In [None]:
import re

# Unit words (short and long spellings). Not referenced by the functions
# visible in this section.
COMMON_UNITS = {
    "tsp",
    "teaspoon",
    "tbsp",
    "tablespoon",
    "cup",
    "oz",
    "ounce",
    "lb",
    "pound",
    "g",
    "gram",
    "kg",
    "kilogram",
    "ml",
    "l",
    "liter",
    "clove",
    "can",
    "package",
    "bunch",
    "pinch",
    "slice",
    "stick",
}

# Descriptive words that normalize_ingredient drops when building the
# aggregation key (the display name keeps them).
ADJECTIVES = {
    "large",
    "medium",
    "small",
    "fresh",
    "dried",
    "chopped",
    "sliced",
    "diced",
    "minced",
    "grated",
    "crushed",
    "whole",
    "ground",
    "fine",
    "coarse",
    "thinly",
    "thickly",
    "lean",
    "boneless",
    "skinless",
}

# Connector words and filler phrases. is_garbage_line compares the WHOLE
# lowercased line against these, so multi-word entries only match a line
# that consists of exactly that phrase.
GARBAGE_TOKENS = {
    "or",
    "and",
    "optional",
    "to taste",
    "if needed",
    "for serving",
    "plus more",
    "divided",
}

def sanitize_ingredient_text(text: str) -> str:
    """Return *text* without markdown emphasis, leading bullets or extra spaces.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise the
    ``**``, ``__`` and ``*`` markers are deleted everywhere, any leading run
    of whitespace, ``-`` and ``#`` is dropped, and every whitespace run is
    collapsed to a single space with the ends trimmed.
    """
    if not text:
        return ""

    # Emphasis markers are removed wherever they occur, not only at the edges.
    without_markup = text
    for marker in ("**", "__", "*"):
        without_markup = without_markup.replace(marker, "")

    # Bullet / heading prefix: leading whitespace, dashes and hashes.
    without_bullet = re.sub(r'^[\s\-\#]+', '', without_markup)

    # Single spaces between words, nothing at either end.
    collapsed = re.sub(r'\s+', ' ', without_bullet)
    return collapsed.strip()

def is_garbage_line(text: str) -> bool:
    """Return True when *text* carries no ingredient, only filler.

    A line counts as garbage when it is empty/None, when nothing remains
    after removing punctuation, underscores and surrounding whitespace, or
    when the remaining lowercase text is exactly one of ``GARBAGE_TOKENS``
    (for example "or", "optional", "to taste").
    """
    if not text:
        return True

    # Drop punctuation. ``\w`` also matches "_", so underscores (e.g. leftover
    # markdown emphasis in "_or_") have to be removed explicitly.
    t = re.sub(r'[^\w\s]|_', '', text.lower())

    # Removing punctuation can leave stray or doubled spaces ("or -" -> "or ",
    # "- -" -> " "); normalize them so the checks below see the bare words.
    t = " ".join(t.split())

    if not t:
        return True

    # Whole-line comparison: a token embedded in a longer name does not count.
    return t in GARBAGE_TOKENS

def normalize_ingredient(name: str, qty: float | None, unit: str | None):
    """Normalize an ingredient for grocery-list aggregation.

    Args:
        name: Raw ingredient name; may contain markdown, bullets,
            parentheticals and descriptive adjectives.
        qty: Quantity, passed through unchanged.
        unit: Raw unit string, or None.

    Returns:
        A ``(key, display, qty, unit)`` tuple.

        * ``key`` is a lowercase aggregation key: parentheticals and
          punctuation removed, words listed in ``ADJECTIVES`` dropped, and
          the last word naively singularized.
        * ``display`` is the sanitized name with only its first letter
          upper-cased (``str.capitalize``).
        * ``unit`` is lowercased, stripped, and mapped to a short form when
          a known alias exists; None when missing or blank.

        If the name is only a connector/filler line (see ``is_garbage_line``)
        all four elements are None.
    """
    # Sanitize first so markdown remnants such as "Or**" become "Or".
    clean_display = sanitize_ingredient_text(name)

    if is_garbage_line(clean_display):
        return None, None, None, None

    lowered = clean_display.lower()

    # Parentheticals ("onions (chopped)") are ignored for the KEY only.
    without_parens = re.sub(r'\([^)]*\)', '', lowered)

    # Strip punctuation BEFORE filtering adjectives; otherwise a token like
    # "chopped," would not match the adjective set. split() also collapses the
    # gaps that removed punctuation-only tokens ("&") would leave behind.
    words = re.sub(r'[^\w\s]', '', without_parens).split()
    kept = [w for w in words if w not in ADJECTIVES]

    if not kept:
        # The name consisted only of adjectives and/or a parenthetical. An
        # empty key would merge unrelated items, so fall back to the
        # unfiltered words (or, failing that, the full name's words).
        kept = words or re.sub(r'[^\w\s]', '', lowered).split()

    key = _singularize(" ".join(kept))

    # Display keeps adjectives and parentheticals, just without markdown.
    display = clean_display.capitalize()

    return key, display, qty, _normalize_unit(unit)


# Long or plural unit spellings mapped to the short form used for aggregation.
_UNIT_ALIASES = {
    "teaspoon": "tsp",
    "teaspoons": "tsp",
    "tablespoon": "tbsp",
    "tablespoons": "tbsp",
    "pound": "lb",
    "pounds": "lb",
    "lbs": "lb",
    "ounce": "oz",
    "ounces": "oz",
    "gram": "g",
    "grams": "g",
    "kilogram": "kg",
    "kilograms": "kg",
    "liter": "l",
    "liters": "l",
}


def _singularize(phrase: str) -> str:
    """Naively singularize the last word of *phrase* (heuristic, not a stemmer)."""
    # "ss" (e.g. "watercress") and "us" (e.g. "hummus", "asparagus") endings
    # are almost always singular already.
    if not phrase.endswith("s") or phrase.endswith(("ss", "us")):
        return phrase
    # "tomatoes" -> "tomato", "potatoes" -> "potato".
    if phrase.endswith("oes"):
        return phrase[:-2]
    return phrase[:-1]


def _normalize_unit(unit: str | None) -> str | None:
    """Lowercase and trim *unit*, mapping known aliases to their short form."""
    if not unit:
        return None
    # Tolerate padding and abbreviation dots such as " Tbsp. ".
    cleaned = unit.strip().rstrip(".").strip().lower()
    if not cleaned:
        return None
    return _UNIT_ALIASES.get(cleaned, cleaned)

In [None]:
# Manual smoke-test data for normalize_ingredient: (name, qty, unit) tuples.
test_inputs = [
    ("Onion", 1.0, "whole"),
    # Connector/filler words that appear in GARBAGE_TOKENS.
    ("Or", None, None),
    ("Optional", None, None),
    ("Salt", None, "pinch"),
    # Names containing words from ADJECTIVES and/or plural forms.
    ("Ground Beef", 1.0, "lb"),
    ("Fresh Parsley", 1.0, "bunch"),
    ("Chopped Onions", 1.0, "cup"),
    ("Garlic Cloves", 3.0, "clove"),
    ("Boneless Skinless Chicken Breasts", 2.0, "lb"),
    # Simulates bad upstream data where the name field holds the whole line
    # (quantity and unit included).
    ("1/2 cup sugar", 0.5, "cup"),
    # Parentheticals.
    ("Tomatoes (diced)", 1.0, "can"),
    ("Red Pepper Flakes (optional)", 1.0, "tsp"),
    ("Water", 1.0, "cup"),
    # Surrounding whitespace, bullets and markdown emphasis.
    (" Olive Oil ", 2.0, "tbsp"),
    ("* Salt", None, None),
    ("**Pepper**", None, None),
    ("- Cumin", None, None),
    ("Milk", 1.0, "cup"),
    ("Heavy Cream", 1.0, "cup"),
    ("Parmesan Cheese", 1.0, "cup"),
    ("Butter", 1.0, "stick"),
]

# Print each input next to the tuple normalize_ingredient returns so the
# results can be inspected by eye; nothing is asserted here.
for name, qty, unit in test_inputs:
    result = normalize_ingredient(name, qty, unit)
    print(f"Input: ('{name}', {qty}, '{unit}') -> Output: {result}")