In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 200) 

In [2]:
df = pd.read_csv("data/recipes.csv", index_col=0)

df.head()


Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage', '1 Tbsp. finely chopped rosemary', '6 Tbsp. uns...","Pat chicken dry with paper towels, season all over with 2 tsp. salt, and tie legs together with kitchen twine. Let sit at room temperature 1 hour.\nMeanwhile, halve squash and scoop out seeds. Run...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage', '1 Tbsp. finely chopped rosemary', '6 Tbsp. uns..."
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (about 1 inch in diameter)', '2 teaspoons kosher salt', '¾ teaspoon finely ground black pepper', '1 teaspoon finely chopped rosemary', '1 teaspoon fine...","Preheat oven to 400°F and line a rimmed baking sheet with parchment. In a large bowl, whisk the egg whites until foamy (there shouldn’t be any liquid whites in the bowl). Add the potatoes and toss...",crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (about 1 inch in diameter)', '2 teaspoons kosher salt', '¾ teaspoon finely ground black pepper', '1 teaspoon finely chopped rosemary', '1 teaspoon fine..."
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', '1 tsp. garlic powder', '1 tsp. onion powder', '1 tsp. smoked paprika', '½ tsp. freshly ground black pepper', '1 tsp. kosher salt, plus more', '2 lb. ...","Place a rack in middle of oven; preheat to 400°. Bring evaporated milk and whole milk to a bare simmer in a large saucepan over medium heat. Whisk in garlic powder, onion powder, paprika, pepper, ...",thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', '1 tsp. garlic powder', '1 tsp. onion powder', '1 tsp. smoked paprika', '½ tsp. freshly ground black pepper', '1 tsp. kosher salt, plus more', '2 lb. ..."
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut into 1-inch cubes (8 cups)', '2 tablespoons olive oil, divided', '2 pounds sweet Italian sausage, casings removed, divided', '1 stick unsalted butter, c...","Preheat oven to 350°F with rack in middle. Generously butter baking dish.\nPut bread in 2 shallow baking pans and bake, switching position of pans halfway through baking, until just dried out, abo...",italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut into 1-inch cubes (8 cups)', '2 tablespoons olive oil, divided', '2 pounds sweet Italian sausage, casings removed, divided', '1 stick unsalted butter, c..."
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon hot water', '1 ½ oz. bourbon', '½ oz. fresh lemon juice', '2 teaspoons apple butter (storebought or homemade)', 'Garnish: orange twist and freshly grate...","Stir together brown sugar and hot water in a cocktail shaker to dissolve. Let cool, then add bourbon, lemon juice, and apple butter and fill with ice. Shake until well chilled, about 15 seconds. S...",newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon hot water', '1 ½ oz. bourbon', '½ oz. fresh lemon juice', '2 teaspoons apple butter (storebought or homemade)', 'Garnish: orange twist and freshly grate..."


In [23]:
import re

# Define common measurement words
measurements = [
    'teaspoon', 'teaspoons', 'tsp', 'tablespoon', 'tablespoons', 'tbsp',
    'cup', 'cups', 'ounce', 'ounces', 'oz', 'pint', 'pints', 'quart', 'quarts',
    'gallon', 'gallons', 'ml', 'l', 'liter', 'liters', 'gram', 'grams', 'g',
    'kilogram', 'kilograms', 'kg', 'pound', 'pounds', 'lb', 'lbs', 'dash', 'pinch',
    'package', 'packages', 'can', 'cans', 'bottle', 'bottles', 'slice', 'slices',
    'stick', 'sticks', 'piece', 'pieces', 'whole', 'small', 'finely', 'chopped', 'meleted', 'plus',
    'room', 'temperature', 'softened', 'divided', 'to', 'taste', 'optional' ,'large', 'medium', 
    'extra-large', 'thinly', 'sliced', 'fresh', 'ground', 'crushed', 'minced', 'grated', 'dried', 
    '½', '¼', '⅓', '¾', '⅔', '⅛', '. ', 'halved', 'lenghtwise', 'quartered', 'rinsed', 'drained', 
    'cooked', 'uncooked', 'frozen', 'thawed', 'packed', 'lightly', 'into', 'of',
]

def clean_ingredient(ingredient):
    # Remove parenthesis content
    ingredient = re.sub(r'\([^)]*\)', '', ingredient)
    # Remove numbers and fractions
    ingredient = re.sub(r'\d+\s?/\s?\d+|\d+\.\d+|\d+', '', ingredient)
    # Remove measurement words (standalone or after numbers/fractions)
    pattern = r'\b(?:' + '|'.join(measurements) + r')\b'
    ingredient = re.sub(pattern, '', ingredient, flags=re.IGNORECASE)
    # Remove extra spaces and commas
    ingredient = re.sub(r'[,\s]+', ' ', ingredient).strip()
    return ingredient

# Flatten the list if it's a list of one string with multiple ingredients separated by quotes or spaces
def split_ingredients(lst):
    if isinstance(lst, list) and len(lst) == 1 and isinstance(lst[0], str):
        # Split by single quote and strip whitespace, filter out empty strings
        items = [item.strip() for item in lst[0].split("'") if item.strip()]
        return [clean_ingredient(i) for i in items]
    elif isinstance(lst, list):
        return [clean_ingredient(i) for i in lst]
    else:
        return [clean_ingredient(lst)]

df['ingred_cleaned'] = df['Cleaned_Ingredients'].apply(split_ingredients)

display(df['ingred_cleaned'].iloc[0])


['[\' chicken\' \' kosher salt more\' \' acorn squash \' \' sage\' \' rosemary\' \' unsalted buttermelted \' \' allspice\' \' red pepper flakes\' \'Freshly black pepper\' \' loaf good-quality sturdy white breadtorn " \' \' apples coredcut " \' \' extra-virgin olive oil\' \' red onion \' \' apple cider vinegar\' \' white miso\' \' all-purpose flour\' \' unsalted butter \' \' dry white wine\' \' unsalted chicken broth\' \' white miso\' \'Kosher salt\' \'freshly pepper\']']

In [29]:
df['ingred_cleaned'].iloc(0)[0][0]

'[\' chicken\' \' kosher salt more\' \' acorn squash \' \' sage\' \' rosemary\' \' unsalted buttermelted \' \' allspice\' \' red pepper flakes\' \'Freshly black pepper\' \' loaf good-quality sturdy white breadtorn " \' \' apples coredcut " \' \' extra-virgin olive oil\' \' red onion \' \' apple cider vinegar\' \' white miso\' \' all-purpose flour\' \' unsalted butter \' \' dry white wine\' \' unsalted chicken broth\' \' white miso\' \'Kosher salt\' \'freshly pepper\']'

In [None]:
df['ingred_cleaned'].head(60)