<a href="https://colab.research.google.com/github/lail-lei/nlp-cupcakes/blob/main/adding_labels_to_ingredients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [159]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
import math
import re
from textblob import TextBlob as tb
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from google.colab import files

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [160]:
# Load the cupcake recipe dataset (cupcakes_with_frosting.csv) straight from
# the project's GitHub repository into a DataFrame. Requires network access.
df = pd.read_csv('https://raw.githubusercontent.com/lail-lei/nlp-cupcakes/main/cupcakes_with_frosting.csv')

In [161]:
# Keep only the first six columns (positional slice); any columns after
# position 5 are dropped. The bare `df` on the last line displays the result.
df = df.iloc[:, 0:6]
df

Unnamed: 0,Title,Yield,Ingredients,Steps,URL,Tags
0,Coconut Cupcakes,18 to 20 cupcakes,[{'frosting': ['1 pound cream cheese at room t...,['Preheat the oven to 325 degrees F. In the bo...,https://www.foodnetwork.com/recipes/ina-garten...,"['Baking', 'Dessert', 'Mixer Recipes', 'Cupcak..."
1,Go-To Vanilla Cupcakes,12 cupcakes (or about 48 mini cupcakes),"[{'main': ['1 1/2 cups all purpose flour', '1 ...",['Preheat the oven to 350 degrees F and positi...,https://www.foodnetwork.com/recipes/food-netwo...,"['Baking', 'Dessert', 'Cupcake']"
2,Vanilla Cupcakes,12 cupcakes,"[{'frosting': ['4 large egg whites', '3/4 cup ...",['Preheat the oven to 350 degrees F. Line a 12...,https://www.foodnetwork.com/recipes/food-netwo...,"['Easy Baking', 'Cupcake']"
3,Magnolia's Vanilla Cupcake,24 cupcakes,"[{'main': ['1 1/2 cups self rising flour', '1 ...",['Preheat oven to 350 degrees F. Line 2 muffin...,https://www.foodnetwork.com/recipes/magnolias-...,"['Easy Dessert Recipes', 'Dessert', 'Easy Baki..."
4,Lemon Lime Cupcakes,48 mini cupcakes,"[{'main': ['1 3/4 cups all purpose flour', '1/...",['For the batter Preheat the oven to 350 degre...,https://www.foodnetwork.com/recipes/ree-drummo...,"['Cupcake', 'Dessert']"
...,...,...,...,...,...,...
750,Buttercream by Billington's,12,"[{'main': ['50g Unsalted butter ', '200g Billi...",['Beat the butter and sugar together until smo...,https://www.bakingmad.com/recipes/basic-butter...,
751,Red Velvet Cake,9,"[{'main': ['120g Unsalted butter', '300g Silve...","['Preheat your oven to 180 C .', 'Line 2 x 20c...",https://www.bakingmad.com/recipes/red-velvet-cake,
752,Whoopie Pies by Lorraine Pascale,6,"[{'main': ['120ml Milk ', '190g Unrefined deme...","['Preheat the oven to 170 C .', 'Line 2 baking...",https://www.bakingmad.com/recipes/whoopie-pies,
753,Toffee Sauce,40,"[{'sauce': ['50ml Water', '300g Billington s U...",['To make the sauce use a large heavy bottomed...,https://www.bakingmad.com/recipes/toffee-sauce,


In [162]:
from ast import literal_eval
from collections import defaultdict
# this is some spaghetti code
def convert_to_dict(string):
  """Parse a stringified ingredient list and merge it by sub-recipe name.

  `string` is the text of a Python literal: a list of dicts, each mapping a
  sub-recipe name (e.g. 'main', 'frosting') to a list of ingredient strings.
  Ingredients that share a name across several dicts are concatenated in the
  order they appear.

  Returns a defaultdict(list) mapping sub-recipe name -> list of ingredients.
  A name whose ingredient list is empty does not appear in the result.
  """
  grouped = defaultdict(list)
  # Flatten the nested structure lazily into (name, ingredient) pairs.
  pairs = (
    (name, ingredient)
    for section in literal_eval(string)
    for name in section
    for ingredient in section[name]
  )
  for name, ingredient in pairs:
    grouped[name].append(ingredient)
  return grouped

In [163]:
# Parse each recipe's stringified Ingredients value into a mapping of
# sub-recipe name -> list of ingredient strings (see convert_to_dict).
processed = df['Ingredients'].apply(convert_to_dict)

In [164]:
# Store the parsed mappings in a new column next to the raw Ingredients text.
df["processed_ingredients"] = processed

In [165]:
# contains a structural ingredient
def is_structural(string):
  """Return True if the ingredient text mentions flour or egg.

  The test is a case-insensitive substring match, so "eggs" and
  "All Purpose Flour" both count.
  """
  lowered = string.lower()
  return any(keyword in lowered for keyword in ("flour", "egg"))

# contains a sweetening ingredient
def is_sweetener(string):
  """Return True if the ingredient text mentions a known sweetener.

  Case-insensitive substring match against the sweetener names below.
  """
  sweeteners = ("sugar", "stevia", "agave", "honey", "maple syrup", "erythritol", "xylitol")
  text = string.lower()
  for name in sweeteners:
    if name in text:
      return True
  return False

# contains a moistening (liquid) ingredient
def is_moistening(string):
  """Return True if the ingredient text mentions a liquid/moistening ingredient.

  Matching is a case-insensitive substring test. Plain "milk" is matched only
  as a whole space-delimited word so that it does not fire on unrelated words
  that merely contain "milk"; "buttermilk" is listed separately.

  Anything mentioning "milk chocolate" is rejected outright, because there
  "milk" describes the chocolate rather than a liquid.
  """
  lowered = string.lower()
  # Compare against the lowercased text so "Milk Chocolate" is rejected too.
  if "milk chocolate" in lowered:
    return False
  # Each keyword needs its own comma: adjacent string literals are silently
  # concatenated, which previously fused " milk " and "heavy cream" into one
  # keyword that never matched.
  keywords = ["buttermilk", " milk ", "heavy cream", "half and half", "half n half", "yogurt", "sour cream", "water"]
  # Pad with spaces so " milk " also matches at the very start or end of the
  # text (e.g. "1 cup whole milk"); the other keywords are unaffected.
  padded = " " + lowered + " "
  return any(keyword in padded for keyword in keywords)
# contains a pan-preparation item (spray, liners, pans) rather than a food ingredient
def is_prep(string):
  """Return True if the text mentions a pan-preparation item.

  Case-insensitive substring match. Note that " pam " is matched only when it
  has a space on both sides.
  """
  prep_items = ["cooking spray", "baking spray", "vegetable oil spray", " pam ", "cupcake liners", "cake pans", "muffin tins"]
  haystack = string.lower()
  hits = [item for item in prep_items if item in haystack]
  return len(hits) > 0

# contains a fat ingredient
def is_fat(string):
  """Return True if the ingredient text mentions a fat.

  Case-insensitive substring match against the keywords below. Because it is
  a plain substring test, a short keyword such as "oil" also matches inside
  longer words (e.g. "boiling").
  """
  # Keywords must be lowercase: they are compared against the lowercased text,
  # so a capitalised entry such as "Crisco" could never match.
  keywords = ["butter", "oil", "shortening", "crisco", "lard", "cream cheese"]
  lowered = string.lower()
  return any(keyword in lowered for keyword in keywords)

# contains a leavener ingredient
def is_leavener(string):
  """Return True if the text mentions baking soda or baking powder (case-insensitive)."""
  text = string.lower()
  return "baking soda" in text or "baking powder" in text

# contains a premade component (the text mentions "cupcakes" or "frosting")
def is_premade(string):
  """Return True if the ingredient text mentions "cupcakes" or "frosting".

  Case-insensitive substring match. Presumably this flags already-prepared
  components used as an ingredient of another recipe -- confirm against the data.
  """
  normalized = string.lower()
  for marker in ("cupcakes", "frosting"):
    if marker in normalized:
      return True
  return False



In [166]:
def classify_ingredient(string):
  """Label one ingredient string with its ingredient type.

  Returns {"ingredient": <the original string>, "type": <label>}, where the
  label belongs to the first predicate below that matches, or "flavoring"
  when none does.
  """
  # Order matters: the first matching predicate wins. For example "buttermilk"
  # contains "butter", but is labelled "moistening" because that check runs
  # before the fat check.
  checks = (
    (is_structural, "structural"),
    (is_sweetener, "sweetener"),
    (is_moistening, "moistening"),
    (is_prep, "prep"),
    (is_fat, "fat"),
    (is_leavener, "leavener"),
    (is_premade, "premade"),
  )
  label = next((name for predicate, name in checks if predicate(string)), "flavoring")
  return {"ingredient": string, "type": label}

In [167]:
def classify_sub_recipe_ingredients(ings):
  """Classify every ingredient string of one sub-recipe.

  `ings` is an iterable of ingredient strings; the result is a list of the
  dicts produced by classify_ingredient, in the same order.
  """
  return [classify_ingredient(entry) for entry in ings]
    


In [168]:
def classify_ingredients(ings):
  """Classify the ingredients of every sub-recipe in one recipe.

  `ings` maps sub-recipe name -> list of ingredient strings (the output of
  convert_to_dict). Returns a defaultdict(list) with the same names.

  NOTE: each value is a one-element list wrapping the list of classified
  ingredients, i.e. {'main': [[{...}, {...}]]}, because the classified list is
  appended as a single item. The nesting is kept as-is since saved output
  already has this shape.
  """
  labelled = defaultdict(list)
  for section, raw_items in ings.items():
    labelled[section].append(classify_sub_recipe_ingredients(raw_items))
  return labelled
    


In [169]:
# Label every ingredient of every sub-recipe with a type, then preview the
# result for the first recipe.
# NOTE: this rebinds `processed`, which until now held the unlabelled mappings.
processed = df["processed_ingredients"].apply(classify_ingredients)
processed[0]

defaultdict(list,
            {'frosting': [[{'ingredient': '1 pound cream cheese at room temperature',
                'type': 'fat'},
               {'ingredient': '3/4 pound unsalted butter room temperature',
                'type': 'fat'},
               {'ingredient': '1 teaspoon pure vanilla extract',
                'type': 'flavoring'},
               {'ingredient': '1/2 teaspoon pure almond extract',
                'type': 'flavoring'},
               {'ingredient': '1 1/2 pounds confectioners sugar sifted',
                'type': 'sweetener'}]],
             'main': [[{'ingredient': '14 ounces sweetened shredded coconut',
                'type': 'flavoring'},
               {'ingredient': '1 cup buttermilk', 'type': 'moistening'},
               {'ingredient': '1/2 teaspoon kosher salt', 'type': 'flavoring'},
               {'ingredient': '1/2 teaspoon baking soda', 'type': 'leavener'},
               {'ingredient': '1 teaspoon baking powder', 'type': 'leavener'},
         

In [170]:
# Replace the parsed-only column with the labelled version.
# NOTE: after this runs, re-executing the classification cell above would feed
# already-labelled data back into classify_ingredients and fail; restart and
# run all cells in order instead.
df["processed_ingredients"] = processed

In [171]:
# Write the labelled dataset to a CSV in the working directory, then trigger a
# browser download of it (files comes from google.colab, so this only works
# when running inside Colab).
# NOTE(review): processed_ingredients holds defaultdict objects, so the CSV
# will presumably contain their `defaultdict(<class 'list'>, {...})` text,
# which ast.literal_eval cannot parse back -- confirm downstream readers
# handle this. The DataFrame index is also written as an extra unnamed column.
df.to_csv('cupcakes_with_frosting_processed.csv') 
files.download('cupcakes_with_frosting_processed.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>