In [None]:
import pandas as pd
# Raise pandas' display limit to 200 characters per cell so long text values are not truncated in output.
pd.set_option('display.max_colwidth', 200) 

In [None]:
# Load the recipes table, using the CSV's first column as the row index.
df = pd.read_csv("data/recipes.csv", index_col=0)

# Show the first rows as this cell's output.
df.head()


In [None]:
import ast
import re

# Words stripped from ingredient strings: units of measure, container/size
# words, preparation descriptors, and standalone fraction glyphs.
# A period that follows one of these words (e.g. "tsp.", "oz.") is handled by
# the compiled pattern below, so no punctuation entry is needed in this list.
measurements = [
    'teaspoon', 'teaspoons', 'tsp', 'tablespoon', 'tablespoons', 'tbsp',
    'cup', 'cups', 'ounce', 'ounces', 'oz', 'pint', 'pints', 'quart', 'quarts',
    'gallon', 'gallons', 'ml', 'l', 'liter', 'liters', 'gram', 'grams', 'g',
    'kilogram', 'kilograms', 'kg', 'pound', 'pounds', 'lb', 'lbs', 'dash', 'pinch',
    'package', 'packages', 'can', 'cans', 'bottle', 'bottles', 'slice', 'slices',
    'stick', 'sticks', 'piece', 'pieces', 'whole', 'small', 'finely', 'chopped', 'melted', 'plus',
    'room', 'temperature', 'softened', 'divided', 'to', 'taste', 'optional', 'large', 'medium',
    'extra-large', 'thinly', 'sliced', 'fresh', 'ground', 'crushed', 'minced', 'grated', 'dried',
    '½', '¼', '⅓', '¾', '⅔', '⅛', 'halved', 'lengthwise', 'quartered', 'rinsed', 'drained',
    'cooked', 'uncooked', 'frozen', 'thawed', 'packed', 'lightly', 'into', 'of',
]

# Compiled once instead of being rebuilt on every call.
# - re.escape keeps each entry literal (an unescaped "." would match any character).
# - \b ... \b restricts matches to whole words, so "to" does not hit "tomato".
# - The trailing \.? also swallows the period of an abbreviation such as "tsp.".
_MEASUREMENT_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in measurements) + r')\b\.?',
    flags=re.IGNORECASE,
)


def clean_ingredient(ingredient):
    """Reduce a raw ingredient line to the ingredient name.

    Steps, in order:
      1. drop parenthesised notes, e.g. "(8 oz)";
      2. drop numbers: simple fractions ("1/2"), decimals ("1.5"), integers;
      3. drop every word listed in ``measurements`` (case-insensitive),
         together with a directly following period;
      4. collapse runs of commas/whitespace into single spaces and trim.

    Args:
        ingredient: Raw ingredient text (``str``).

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    # Remove parenthesis content
    ingredient = re.sub(r'\([^)]*\)', '', ingredient)
    # Remove numbers and fractions
    ingredient = re.sub(r'\d+\s?/\s?\d+|\d+\.\d+|\d+', '', ingredient)
    # Remove measurement / descriptor words (and an abbreviation's period)
    ingredient = _MEASUREMENT_PATTERN.sub('', ingredient)
    # Collapse leftover commas and whitespace into single spaces
    ingredient = re.sub(r'[,\s]+', ' ', ingredient).strip()
    return ingredient

def split_ingredients(lst):
    """Turn one ingredients cell into a list of cleaned ingredient names.

    Accepted inputs:
      * ``None`` / float NaN (a missing cell): returns ``[]``.
      * ``str`` holding a Python list literal, e.g. ``"['1 cup sugar', '2 eggs']"``
        (assumed to be how list columns look after a CSV round trip — confirm
        against the data): the literal is parsed and every element is cleaned.
      * any other ``str``: treated as a single ingredient.
      * ``list`` with exactly one string: that string is split on single
        quotes and each non-empty fragment is cleaned.
      * any other ``list``: every element is cleaned.

    Args:
        lst: The raw cell value.

    Returns:
        list[str]: One cleaned string per ingredient.
    """
    # NaN is the only float that is not equal to itself.
    if lst is None or (isinstance(lst, float) and lst != lst):
        return []

    if isinstance(lst, str):
        text = lst.strip()
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # Not a valid literal after all; fall through to the plain-string path.
                parsed = None
            if isinstance(parsed, list):
                return [clean_ingredient(item) for item in parsed]
        return [clean_ingredient(lst)]

    if isinstance(lst, list):
        if len(lst) == 1 and isinstance(lst[0], str):
            # Split by single quote and strip whitespace, filter out empty strings
            items = [item.strip() for item in lst[0].split("'") if item.strip()]
            return [clean_ingredient(i) for i in items]
        return [clean_ingredient(i) for i in lst]

    return [clean_ingredient(lst)]

# Run split_ingredients on every value of 'Cleaned_Ingredients'; each cell of
# the new 'ingred_cleaned' column holds the list that function returns.
df['ingred_cleaned'] = df['Cleaned_Ingredients'].apply(split_ingredients)

# Inspect the cleaned result for the first row.
display(df['ingred_cleaned'].iloc[0])


In [None]:
# First cleaned ingredient of the first row.
# `.iloc` is an indexer and is meant to be subscripted; calling it (`.iloc(0)`)
# merely selects axis 0 and only works by accident.
df['ingred_cleaned'].iloc[0][0]

In [None]:
# Preview the cleaned ingredient lists for the first 60 rows.
df['ingred_cleaned'].head(60)