# In [None]:
import pandas as pd
import numpy as np
import re

# Load the recipe table, discard every row that has at least one missing
# value, and only then remove the columns that are not used afterwards.
# Because dropna() runs before the column drop, rows with a missing value in
# 'directions', 'link' or 'source' are discarded as well.
data = (
    pd.read_csv("recipe.csv")
    .dropna()
    .drop(columns=['directions', 'link', 'source'])
)

UNITS = [
    "cup", "cups", "c", "tbsp", "tablespoon", "tsp", "teaspoon", "oz", "ounce",
    "lb", "pound", "g", "gram", "kg", "ml", "liter", "quart", "pinch", "dash"
]

# Leading quantity: mixed number ("1 1/2"), fraction, decimal, or integer.
# The mixed-number alternative has to come first, otherwise "1 1/2" is cut
# after the "1" and the "1/2" leaks into the ingredient name.
_QUANTITY_RE = re.compile(r"\d+\s+\d+/\d+|\d+/\d+|\d+\.\d+|\d+")

# Preparation words stripped from the ingredient name.  The word boundaries
# keep them from being cut out of longer words ("freshly", "unpacked").
_DESCRIPTOR_RE = re.compile(r"\b(?:firmly|packed|chopped|fresh|broken)\b")


def _match_unit(text):
    """Return ``(unit, length)`` for a unit at the start of *text*.

    ``unit`` is always an entry of ``UNITS`` and ``length`` is the number of
    characters it occupies in *text*, including an optional trailing period.
    Returns ``(None, 0)`` when *text* does not start with a known unit.
    """
    # First pass: exact spellings, in UNITS order.  Second pass: the same
    # spellings with a plural "s" ("tablespoons", "lbs"), reported as the
    # listed singular so the result is still a member of UNITS.
    for plural_suffix in ("", "s"):
        for unit in UNITS:
            pattern = r"\b" + re.escape(unit) + plural_suffix + r"\b\.?"
            found = re.match(pattern, text)
            if found:
                return unit, found.end()
    return None, 0


def clean_ingredient(raw_ingredient):
    """Split one free-text ingredient line into name, quantity and unit.

    The line is lower-cased, then a leading quantity and a following unit
    are peeled off; what remains has descriptor words and punctuation
    removed and becomes the ingredient name.

    Args:
        raw_ingredient: The ingredient text, e.g. ``"1 1/2 cups chopped nuts"``.
            Must be a ``str``.

    Returns:
        A dict with three keys:

        * ``"ingredient_name"``: the cleaned, lower-case name.
        * ``"quantity"``: the quantity as written (a string such as ``"2"``,
          ``"1/2"``, ``"2.5"`` or ``"1 1/2"``), or ``None`` if the line does
          not start with a number.
        * ``"unit"``: an entry of ``UNITS``, or ``None`` if no unit follows.
    """
    text = raw_ingredient.lower().strip()

    # Extract the leading quantity, if any.
    quantity = None
    qty_match = _QUANTITY_RE.match(text)
    if qty_match:
        # Collapse inner whitespace so "1   1/2" is reported as "1 1/2".
        quantity = " ".join(qty_match.group(0).split())
        text = text[qty_match.end():].strip()

    # Extract the unit that follows the quantity (or starts the line).
    unit, unit_length = _match_unit(text)
    if unit is not None:
        text = text[unit_length:].strip()

    # Remove filler words, but only when they appear as whole words.
    text = _DESCRIPTOR_RE.sub("", text)

    # Drop punctuation, then collapse the whitespace left behind.
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    return {
        "ingredient_name": text,
        "quantity": quantity,
        "unit": unit
    }

def process_ingredient_list(ingredient_list):
    """Clean every ingredient of one recipe.

    Args:
        ingredient_list: Either an iterable of ingredient strings, or a
            single string holding a list literal such as
            ``'["1 c. sugar", "2 eggs"]'``.  Cells read by ``pd.read_csv``
            arrive as strings; iterating such a string directly would yield
            single characters instead of ingredients.
            NOTE(review): the list-literal encoding of the ``ingredients``
            column is an assumption -- confirm against recipe.csv.

    Returns:
        A list with one ``clean_ingredient`` result dict per ingredient.
        A blank string yields an empty list; a string that is not a list
        literal is treated as a single ingredient.
    """
    import ast  # local import: only needed to decode string-encoded lists

    if isinstance(ingredient_list, str):
        if not ingredient_list.strip():
            return []
        try:
            decoded = ast.literal_eval(ingredient_list)
        except (ValueError, TypeError, SyntaxError):
            decoded = None
        if isinstance(decoded, (list, tuple)):
            ingredient_list = decoded
        else:
            # Not a list literal: keep the text as one ingredient rather
            # than splitting it into characters.
            ingredient_list = [ingredient_list]

    return [clean_ingredient(item) for item in ingredient_list]

# Run the ingredient parser over each recipe and keep the parsed result in
# a new column next to the raw one.
ingredient_column = data["ingredients"]
data["Cleaned_Ingredients"] = ingredient_column.apply(process_ingredient_list)

# Print the first rows as a quick visual check.
print(data.head())
