In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
ingredients = pd.read_csv('ingredients.csv')

In [3]:
ingredients.head()

Unnamed: 0.1,Unnamed: 0,recipe_name,ingredients
0,0,Mushroom Rarebit with Poached Eggs - Recipe - ...,3 large eggs
1,1,Mushroom Rarebit with Poached Eggs - Recipe - ...,1/3 cup shredded cheddar cheese
2,2,Mushroom Rarebit with Poached Eggs - Recipe - ...,1/2 tsp Dijon mustard
3,3,Mushroom Rarebit with Poached Eggs - Recipe - ...,2 drops Worcestershire sauce
4,4,Mushroom Rarebit with Poached Eggs - Recipe - ...,1 tbsp Greek yogurt


In [4]:
# drop the 'Unnamed: 0' column
ingredients.drop('Unnamed: 0',axis=1,inplace=True)
ingredients.head(2)

Unnamed: 0,recipe_name,ingredients
0,Mushroom Rarebit with Poached Eggs - Recipe - ...,3 large eggs
1,Mushroom Rarebit with Poached Eggs - Recipe - ...,1/3 cup shredded cheddar cheese


In [5]:
# perform same cleaning steps to recipe_name as we did for recipes dataframe
ingredients['recipe_name'] = ingredients['recipe_name'].str.replace('- Recipe - Diet Doctor','').str.replace('- Diet Doctor','').str.replace('- Recipe','').str.strip()

In [6]:
len(ingredients['recipe_name'].unique())

1131

In [7]:
def first_10_matches(pattern):
    """
    Return the first 10 ingredients that match
    the provided regular expression
    """
    ingredient_names = ingredients['ingredients']
    all_matches = ingredient_names[ingredient_names.str.contains(pattern)]
    first_10 = all_matches.head(10)
    return first_10

Ingredients are generally in the format: [quantity] [unit of measurement] [ingredient name] e.g. 1 tbsp Greek yogurt.

Let's parse out each component, and isolate the ingredient names.

Scratch:

When we scraped the ingredients, we split the ingredient list by commas. Looks like some individual ingredients included a comma and are thus split between multiple rows.

It appears that the information after the comma does not contain the actual ingredient, but is rather a supplemental description. Therefore, we will drop these rows.

In [8]:
first_10_matches(r"[0-9]/?[0-9]?[\s]?oz.?")

5     12 oz. portobello mushrooms about 1 large cap ...
16                                   1 1/2 oz. cucumber
18                 4 oz. deli turkey or smoked deli ham
23                                       5 oz. zucchini
25                              1 1/2 oz. celery stalks
28                                6 oz. deli roast beef
29                       6 oz. rotisseri chicken (meat)
30                                  3 oz. green cabbage
37                                  5 oz. tuna in water
39                                    1/2 oz. scallions
Name: ingredients, dtype: object

In [9]:
# appears to capture all rows
first_10_matches(r"[0-9]/?[0-9]?[\s]?oz.?|tsp?|tbsp?|pinch?|cups?|cup?|lb?|lbs?|drops?")

0                                         3 large eggs
1                      1/3 cup shredded cheddar cheese
2                                1/2 tsp Dijon mustard
3                         2 drops Worcestershire sauce
4                                  1 tbsp Greek yogurt
5    12 oz. portobello mushrooms about 1 large cap ...
6                                        brushed clean
7                                  and stalks cut away
8                                      2 tsp olive oil
9                                      salt and pepper
Name: ingredients, dtype: object

In [10]:
first_10_matches(r"[0-9]/?[0-9]?")

0                                          3 large eggs
1                       1/3 cup shredded cheddar cheese
2                                 1/2 tsp Dijon mustard
3                          2 drops Worcestershire sauce
4                                   1 tbsp Greek yogurt
5     12 oz. portobello mushrooms about 1 large cap ...
8                                       2 tsp olive oil
11                               1/2 cup cottage cheese
12                                   1 tbsp green pesto
13      1 tbsp finely chopped sun-dried tomatoes in oil
Name: ingredients, dtype: object

In [11]:
ingredients.head(10)

Unnamed: 0,recipe_name,ingredients
0,Mushroom Rarebit with Poached Eggs,3 large eggs
1,Mushroom Rarebit with Poached Eggs,1/3 cup shredded cheddar cheese
2,Mushroom Rarebit with Poached Eggs,1/2 tsp Dijon mustard
3,Mushroom Rarebit with Poached Eggs,2 drops Worcestershire sauce
4,Mushroom Rarebit with Poached Eggs,1 tbsp Greek yogurt
5,Mushroom Rarebit with Poached Eggs,12 oz. portobello mushrooms about 1 large cap ...
6,Mushroom Rarebit with Poached Eggs,brushed clean
7,Mushroom Rarebit with Poached Eggs,and stalks cut away
8,Mushroom Rarebit with Poached Eggs,2 tsp olive oil
9,Mushroom Rarebit with Poached Eggs,salt and pepper


In [12]:
# ingredients['ingredients'].str.extract(r"([0-9]/?[0-9]?)",flags=re.I)
# ingredients['ingredients'].str.extract(r"([\s]?oz.?|tsp?|tbsp?|pinch?|cups?|cup?|lb?|lbs?|drops?)",flags=re.I)
# ingredient_measure = ingredients['ingredients'].str.contains(r"[0-9]/?[0-9]?[\s]?oz.?|tsp?|tbsp?|pinch?|cups?|cup?|lb?|lbs?|drops?",flags=re.I)
# ingredient_measure.sum()
# len(ingredients['ingredients'])

In [None]:
pattern = r"(?P<measure>[0-9]/?[0-9]?[\s]?oz.?|tsp?|tbsp?|pinch?|cups?|cup?|lb?|lbs?|drops?)\s(?P<name>.+)"
ingredients_v2 = ingredients['ingredients'].str.extract(pattern,flags=re.I)
ingredients_v2
# not capturing fractions with '/' e.g. '1/3 cup'
# also not capturing amounts (digits) without a measure/unit e.g. '3 large eggs'

In [None]:
pattern = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?oz.?|tsp?|tbsp?|pinch?|cups?|cup?|lb?|lbs?|drops?)\s(?P<name>.+)"
ingredients_v3 = ingredients['ingredients'].str.extractall(pattern,flags=re.I)
ingredients_v3

In [None]:
ingredients_v2.isnull().sum()
ingredients_v3.isnull().sum()
ingredients_v3['measure'].value_counts()

In [13]:
pattern = r"(?P<measure>[0-9]/?[0-9]?[\s]?oz.?|tsp?|tbsp?|pinch?|cups?|cup?|lb?|lbs?|drops?)\s(?P<name>.+)"
ingredients_extracted = ingredients['ingredients'].str.extract(pattern,flags=re.I)
ingredients_extracted

Unnamed: 0,measure,name
0,,
1,cup,shredded cheddar cheese
2,tsp,Dijon mustard
3,drops,Worcestershire sauce
4,tbsp,Greek yogurt
...,...,...
18428,cup,sour cream
18429,cup,mayonnaise
18430,cup,ajvar relish
18431,,


In [47]:
pattern = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?oz.?|tsp?|tbsp?|pinch?|cups?|cup?|lb?|lbs?|drops?)\s(?P<name>.+)"
ingredients_extracted_v2 = ingredients['ingredients'].str.extractall(pattern,flags=re.I)
ingredients_extracted_v2

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,measure,name
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,0,12,oz.,portobello mushrooms about 1 large cap per ser...
16,0,1/2,oz.,cucumber
18,0,4,oz.,deli turkey or smoked deli ham
23,0,5,oz.,zucchini
25,0,1/2,oz.,celery stalks
...,...,...,...,...
18402,0,2,oz.,fresh raspberries
18408,0,1/2,oz.,butter
18412,0,7,oz.,fresh spinach
18416,0,6,oz.,goat cheese


In [16]:
pattern = r"(?P<name>.+)"
ingredients_extracted_v3 = ingredients['ingredients'].str.extract(pattern,flags=re.I)
ingredients_extracted_v3

Unnamed: 0,name
0,3 large eggs
1,1/3 cup shredded cheddar cheese
2,1/2 tsp Dijon mustard
3,2 drops Worcestershire sauce
4,1 tbsp Greek yogurt
...,...
18428,1/4 cup sour cream
18429,1/4 cup mayonnaise
18430,1/4 cup ajvar relish
18431,poppy seeds


In [17]:
ingredients_extracted_v3.isnull().sum()

name    0
dtype: int64

In [49]:
pattern_oz = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?oz.?)\s(?P<name>.+)"
ingredients_extracted_oz = ingredients['ingredients'].str.extractall(pattern,flags=re.I)
ingredients_extracted_oz

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,measure,name
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,0,12,oz.,portobello mushrooms about 1 large cap per ser...
16,0,1/2,oz.,cucumber
18,0,4,oz.,deli turkey or smoked deli ham
23,0,5,oz.,zucchini
25,0,1/2,oz.,celery stalks
...,...,...,...,...
18402,0,2,oz.,fresh raspberries
18408,0,1/2,oz.,butter
18412,0,7,oz.,fresh spinach
18416,0,6,oz.,goat cheese


In [29]:
measures = ['oz.', 'tsp', 'tbsp', 'pinch', 'cups', 'cup', 'lb', 'lbs', 'drops']

ingredients_extracted = pd.DataFrame()
for measure in measures:
    pattern = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?{measure}?)\s(?P<name>.+)".format(measure)
    extract_measure = ingredients['ingredients'].str.extractall(pattern,flags=re.I)
    ingredients_extracted = pd.concat([ingredients_extracted, extract_measure])
    
ingredients_extracted

KeyError: 'measure'

In [32]:
pattern_oz = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?oz.?)\s(?P<name>.+)"
pattern_tsp = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?tsp?)\s(?P<name>.+)"
pattern_tbsp = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?tbsp?)\s(?P<name>.+)"
pattern_pinch = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?pinch?)\s(?P<name>.+)"
pattern_cups = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?cups?)\s(?P<name>.+)"
pattern_cup = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?cup?)\s(?P<name>.+)"
pattern_lb = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?lb?)\s(?P<name>.+)"
pattern_lbs = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?lbs?)\s(?P<name>.+)"
pattern_drops = r"(?P<amount>[0-9]/?[0-9]?)(?P<measure>[\s]?drops?)\s(?P<name>.+)"

patterns = [pattern_oz, pattern_tsp, pattern_tbsp, pattern_pinch, pattern_cups, 
            pattern_cup, pattern_lb, pattern_lbs, pattern_drops]

ingredients_extracted = pd.DataFrame()
for pattern in patterns:
    extract_measure = ingredients['ingredients'].str.extractall(pattern,flags=re.I)
    ingredients_extracted = pd.concat([ingredients_extracted, extract_measure])
    
ingredients_extracted

Unnamed: 0_level_0,Unnamed: 1_level_0,amount,measure,name
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,0,12,oz.,portobello mushrooms about 1 large cap per ser...
16,0,1/2,oz.,cucumber
18,0,4,oz.,deli turkey or smoked deli ham
23,0,5,oz.,zucchini
25,0,1/2,oz.,celery stalks
...,...,...,...,...
18386,0,2,lbs,boneless chicken thighs
18422,0,1,lb,rutabaga
3,0,2,drops,Worcestershire sauce
4055,0,10,drops,liquid sweetener (optional)


In [33]:
ingredients_extracted.isnull().sum()

amount     0
measure    0
name       0
dtype: int64

In [41]:
ingredients_extracted.columns

Index(['amount', 'measure', 'name'], dtype='object')

In [42]:
ingredients_extracted.index

MultiIndex([(    5, 0),
            (   16, 0),
            (   18, 0),
            (   23, 0),
            (   25, 0),
            (   28, 0),
            (   29, 0),
            (   30, 0),
            (   37, 0),
            (   39, 0),
            ...
            (18300, 0),
            (18308, 0),
            (18321, 0),
            (18332, 0),
            (18356, 0),
            (18386, 0),
            (18422, 0),
            (    3, 0),
            ( 4055, 0),
            ( 4458, 0)],
           names=[None, 'match'], length=10436)

In [43]:
ingredients_extracted.reset_index()

Unnamed: 0,level_0,match,amount,measure,name
0,5,0,12,oz.,portobello mushrooms about 1 large cap per ser...
1,16,0,1/2,oz.,cucumber
2,18,0,4,oz.,deli turkey or smoked deli ham
3,23,0,5,oz.,zucchini
4,25,0,1/2,oz.,celery stalks
...,...,...,...,...,...
10431,18386,0,2,lbs,boneless chicken thighs
10432,18422,0,1,lb,rutabaga
10433,3,0,2,drops,Worcestershire sauce
10434,4055,0,10,drops,liquid sweetener (optional)


In [67]:
# find instances where measure is missing
measures = ['oz.', 'tsp', 'tbsp', 'pinch', 'cups', 'cup', 'lb', 'lbs', 'drops']
ingredients_no_measure = []
for ingredient in ingredients['ingredients']:
    if not any(x in ingredient for x in measures):
        ingredients_no_measure.append(ingredient)
#     for measure in measures:
#         if not any(measure):
#             ingredients_no_measure.append(ingredient)
#         break
        
ingredients_no_measure

['3 large eggs',
 ' brushed clean',
 ' and stalks cut away',
 'salt and pepper',
 ' for seasoning',
 'salt or ground black pepper to taste',
 ' diced',
 ' diced',
 ' diced',
 'salt and ground black pepper',
 ' shredded',
 'salt to taste',
 ' drained',
 ' finely chopped ',
 '1/4 lime',
 ' the juice',
 'salt to taste',
 '4 small lettuce leafs',
 '1 red chili pepper',
 '1 garlic clove',
 'pepper',
 ' to taste',
 '1 white onion',
 ' for greasing',
 ' portobello mushrooms',
 ' large (preferably 2 per serving)',
 ' roughly chopped (See tip)',
 ' or ground beef',
 ' or ground pork',
 'salt and pepper',
 '2 large eggs',
 ' fresh cilantro',
 ' or flat-leaf parsley',
 '1 green chili pepper',
 ' seeded',
 ' finely sliced',
 ' cheddar cheese',
 ' roughly chopped',
 'salt and pepper',
 ' for seasoning',
 ' frozen',
 ' or fresh',
 ' divided',
 '1 yellow onion',
 ' finely chopped',
 ' roughly chopped ',
 '5 large eggs',
 ' peeled and cut into strips',
 '2 garlic cloves',
 ' crushed',
 ' 1 per portion

In [73]:
# list contains some non-ingredients that were the result of the split on comma, which appear to have a leading space
# let's remove any ingredients with a leading space
ingredients_no_measure_clean = []
for ingredient in ingredients_no_measure:
    if not ingredient.startswith(' '):
        ingredients_no_measure_clean.append(ingredient)
        
ingredients_no_measure_clean

['3 large eggs',
 'salt and pepper',
 'salt or ground black pepper to taste',
 'salt and ground black pepper',
 'salt to taste',
 '1/4 lime',
 'salt to taste',
 '4 small lettuce leafs',
 '1 red chili pepper',
 '1 garlic clove',
 'pepper',
 '1 white onion',
 'salt and pepper',
 '2 large eggs',
 '1 green chili pepper',
 'salt and pepper',
 '1 yellow onion',
 '5 large eggs',
 '2 garlic cloves',
 'salt and pepper',
 '1 garlic clove',
 '1/2 yellow onion',
 '1 large egg',
 '4 fresh rosemary',
 '2 red onions',
 '1 red bell pepper',
 'salt',
 '1 lemon',
 'pepper',
 '1 red chili pepper',
 '2 garlic cloves',
 '1 garlic clove',
 'salt and pepper',
 'salt to taste',
 '8 large eggs',
 'salt to taste',
 'pepper to taste',
 '1 red bell pepper',
 '1 yellow onion',
 'salt and pepper',
 '1 avocado mashed',
 'salt and pepper',
 'salt or ground black pepper',
 '1 red bell pepper',
 '1 eggplant',
 '1 yellow onion',
 '4 garlic cloves (optional)',
 'coarse salt',
 '2 large eggs',
 '2 large eggs',
 'salt or g

In [75]:
ingredients_no_measure_clean = pd.Series(ingredients_no_measure_clean)
ingredients_no_measure_clean

0                               3 large eggs
1                            salt and pepper
2       salt or ground black pepper to taste
3               salt and ground black pepper
4                              salt to taste
                        ...                 
7109                         iceberg lettuce
7110                                1 tomato
7111                               1 avocado
7112                             poppy seeds
7113                                sea salt
Length: 7114, dtype: object

In [78]:
ingredients_no_measure_clean.str.extract(r"(?P<amount>[0-9]/?[0-9]?)?[\s]?(?P<name>.+)",flags=re.I)

Unnamed: 0,amount,name
0,3,large eggs
1,,salt and pepper
2,,salt or ground black pepper to taste
3,,salt and ground black pepper
4,,salt to taste
...,...,...
7109,,iceberg lettuce
7110,1,tomato
7111,1,avocado
7112,,poppy seeds
