# Exploring Natural Language Processing using Cooking Recipes

## Imports and Dataset Load

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm.auto import tqdm

from utils import *

# For Colab: run this cell

In [3]:
# Colab only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load cleaned dataset

In [35]:
# Load the data locally (path is relative to the notebook's working directory)
df = pd.read_csv('dataset/dataset.csv')

# Colab alternative: load the data from the mounted Google Drive instead
# df = pd.read_csv('/content/drive/MyDrive/datasets/dataset.csv')

## Named Entity Recognition to Extract the Ingredients

In the following, we use NER models to extract all the ingredients from the recipes, so that we can train a network on them afterwards.

In [24]:
def get_ingredients(recipe: str, ner_result: list) -> list:
    """Rebuild whole ingredient strings from token-level NER output.

    Args:
        recipe: The text that was passed to the NER pipeline. Ingredient
            text is sliced out of this string using each token's offsets.
        ner_result: Token dictionaries with at least the keys 'entity',
            'word', 'start' and 'end'. Only tokens labelled 'B-FOOD' or
            'I-FOOD' are used; every other label is ignored.

    Returns:
        A list of ingredient strings in order of appearance. Sub-word
        fragments (tokens whose 'word' starts with '#') are glued onto the
        previous ingredient without a space; 'I-FOOD' whole words are
        appended to the previous ingredient with a single space.
    """
    b_word = 'B-FOOD'
    i_word = 'I-FOOD'
    ingredients = []

    for i, token in enumerate(ner_result):
        entity = token['entity']
        if entity != b_word and entity != i_word:
            continue

        # the ingredient text is taken from the recipe given the token position
        text = recipe[token['start']:token['end']]
        is_segment = token['word'].startswith('#')

        if entity == b_word:
            if is_segment and i == 0:
                # A fragment as the very first token has nothing to attach to.
                # Checked explicitly: ner_result[i - 1] would otherwise wrap
                # around to the LAST token and make the outcome depend on it.
                continue
            if is_segment and ner_result[i - 1]['entity'] == b_word:
                # continuation of the word that opened the current ingredient
                if ingredients:
                    ingredients[-1] += text
            else:
                ingredients.append(text)
        else:
            if is_segment:
                # fragment of the current word; dropped if nothing precedes it
                if ingredients:
                    ingredients[-1] += text
            elif ingredients:
                ingredients[-1] += ' ' + text
            else:
                # an I-FOOD token with no opener starts the first ingredient
                ingredients.append(text)

    return ingredients

In [25]:
# reduce the dataset to its first row only (head(1)), e.g. for a quick test run
df = df.head(1)


In [39]:
# get the cooking method stored under index label 31 (not the first recipe)
# NOTE(review): if `df` was truncated with `df.head(1)` beforehand, label 31 is
# presumably no longer present and this lookup fails — confirm the intended row.
recipe = df['cooking_method'][31]
recipe

"['Remove the chicken legs, wings and thighs from the rotisserie chicken and set aside for another meal.', 'Remove the skin from the 2 breasts and remove the meat from the bones. Break the chicken into chunks with your hands or a knife and place into a large bowl. Add the fresh dill, lemon juice, lemon zest, olive oil and salt.', 'Refrigerate until ready to eat.']"

In [None]:
# This variant works poorly: it leaves the ingredients split into '#'-prefixed pieces
# NOTE(review): `grouped_entities` appears to be deprecated in recent transformers
# releases in favour of `aggregation_strategy` — confirm against the installed version.

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
ner_result = pipe(recipe)

ner_result

In [40]:
# This variant works as well as our own get_ingredients, but it returns
# dictionaries (one per entity) instead of plain ingredient strings

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="MAX")
ner_result = pipe(recipe)

ner_result

[{'entity_group': 'FOOD',
  'score': 0.9934605,
  'word': 'chicken legs',
  'start': 13,
  'end': 25},
 {'entity_group': 'FOOD',
  'score': 0.99471396,
  'word': 'wings',
  'start': 27,
  'end': 32},
 {'entity_group': 'FOOD',
  'score': 0.9796107,
  'word': 'thighs',
  'start': 37,
  'end': 43},
 {'entity_group': 'FOOD',
  'score': 0.9848013,
  'word': 'rotisserie chicken',
  'start': 53,
  'end': 71},
 {'entity_group': 'FOOD',
  'score': 0.9872923,
  'word': 'skin',
  'start': 118,
  'end': 122},
 {'entity_group': 'FOOD',
  'score': 0.84520423,
  'word': 'breasts',
  'start': 134,
  'end': 141},
 {'entity_group': 'FOOD',
  'score': 0.9972771,
  'word': 'meat',
  'start': 157,
  'end': 161},
 {'entity_group': 'FOOD',
  'score': 0.99037725,
  'word': 'bones',
  'start': 171,
  'end': 176},
 {'entity_group': 'FOOD',
  'score': 0.99775213,
  'word': 'chicken',
  'start': 188,
  'end': 195},
 {'entity_group': 'FOOD',
  'score': 0.9954136,
  'word': 'fresh dill',
  'start': 272,
  'end': 28

In [41]:
# Our own approach works well and returns a plain list of ingredient strings:
# run the pipeline without aggregation and merge the tokens with get_ingredients

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

pipe = pipeline("ner", model=model, tokenizer=tokenizer)
ner_result = pipe(recipe)
ingredients = get_ingredients(recipe, ner_result)

ingredients

['chicken legs',
 'wings',
 'thighs',
 'rotisserie chicken',
 'skin',
 'breasts',
 'meat',
 'bones',
 'chicken',
 'fresh dill',
 'lemon juice',
 'lemon zest',
 'olive oil',
 'salt']

In [31]:
# Inspect whatever `ner_result` currently holds (depends on which cell ran last)
ner_result

[{'entity_group': 'FOOD',
  'score': 0.7495046,
  'word': 'extra',
  'start': 16,
  'end': 21},
 {'entity_group': 'FOOD',
  'score': 0.9697037,
  'word': 'virgin olive oil',
  'start': 22,
  'end': 38},
 {'entity_group': 'FOOD',
  'score': 0.978496,
  'word': 'ch',
  'start': 48,
  'end': 50},
 {'entity_group': 'FOOD',
  'score': 0.99201655,
  'word': '##opped yellow onion',
  'start': 50,
  'end': 68},
 {'entity_group': 'FOOD',
  'score': 0.98822474,
  'word': 'c',
  'start': 74,
  'end': 75},
 {'entity_group': 'FOOD',
  'score': 0.9753904,
  'word': '##love',
  'start': 75,
  'end': 79},
 {'entity_group': 'FOOD',
  'score': 0.9716952,
  'word': '##s garlic',
  'start': 79,
  'end': 87},
 {'entity_group': 'FOOD',
  'score': 0.6193733,
  'word': 'min',
  'start': 89,
  'end': 92},
 {'entity_group': 'FOOD',
  'score': 0.5813063,
  'word': '##ced',
  'start': 92,
  'end': 95},
 {'entity_group': 'FOOD',
  'score': 0.9946885,
  'word': 'crushed red pepper flakes',
  'start': 113,
  'end': 

In [29]:
# Vocabulary of distinct ingredient strings found across all rows of df.ingredients
ingredients_v = set()

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

# Move the model to the GPU when one is available and report which device is used
if torch.cuda.is_available():
    model = model.to('cuda')
    print("Model moved to GPU.")
else:
    print("CUDA is not available. Model will run on CPU.")

# No aggregation here: token-level output is merged by get_ingredients below
pipe = pipeline("ner", model=model, tokenizer=tokenizer)

# Run NER on every ingredients entry and collect the merged ingredient names.
# NOTE: this rebinds the notebook-level names `recipe`, `ner_result` and `ingredients`.
for i, recipe in tqdm(enumerate(df.ingredients), total=len(df.ingredients)):
    ner_result = pipe(recipe)
    ingredients = get_ingredients(recipe, ner_result)
    for ingredient in ingredients:
        ingredients_v.add(ingredient)



Model moved to GPU.


100%|██████████| 1/1 [00:00<00:00,  2.35it/s]


In [9]:
# Use the tokenizer to merge tokens
# NOTE(review): neither `clean_entities` nor `results` is defined in the visible
# cells — presumably they come from `utils` (star import) and an earlier run; confirm.


cleaned_results = clean_entities(results, tokenizer)
print(cleaned_results)

['poppy seeds',
 'skin',
 'cream',
 'fresh dill',
 'scallions',
 'brown sugar',
 'onion powder',
 'enne bulb',
 'minced',
 'garnish',
 'chopped steamed broccoli',
 'liquid',
 'cloves garlic',
 'bagel seasoning',
 'ham',
 'beefsteak tomato',
 'gluten',
 'dry white wine',
 'diced ham steak',
 '-',
 'cod',
 'green beans beans',
 'chopped kale leaves',
 'grated Parmesan',
 'balsamic vinegar',
 'milk',
 'fresh cilantro',
 'Kosher salt',
 'minced dried onion flakes',
 '%',
 'low fat sour cream',
 'ground pepper',
 'buttermilk',
 'garlic powder',
 'chopped fennel',
 'juice',
 'sodium vegetable broth',
 'white whole wheat flour',
 'halibut',
 'crushed red pepper flakes',
 'fresh black pepper',
 'dried parsley',
 'sliced black olives',
 'fat cream cheese',
 'rated zest',
 'cooking spray',
 'ground black pepper',
 'dried basil',
 'fat',
 'Organic olive oil cooking spray salt',
 'nova lox',
 'fresh grated Swiss cheese',
 'pitted',
 'heat oil',
 'sea salt',
 'garlic head',
 'sesame seeds',
 'nutme

In [7]:
# Check whether the loaded tokenizer reports itself as a "fast" tokenizer
tokenizer.is_fast

True

In [None]:
import csv

PATH = '/content/drive/MyDrive/datasets/ingredients_v.csv'

# Persist the ingredient vocabulary, one ingredient per CSV row.
# The set is sorted before writing: set iteration order is arbitrary, so
# writing it directly would give a differently ordered file from run to run.
with open(PATH, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for ingredient in sorted(ingredients_v):
        writer.writerow([ingredient])

print(f"Set saved to '{PATH}'.")

-----------
## This takes too much time so we want to work on a smaller, balanced dataset

In [4]:
# Fixed seed so the sampled subset (and therefore the saved CSV) is the same on every run
SEED = 42

df_main = pd.read_csv("dataset/recipes_df.csv")

# Split into vegetarian / non-vegetarian rows.
# NOTE(review): the rows are taken from `df` while the boolean masks come from
# `df_main`; this only works if both frames share the same index. Confirm whether
# `df_main[...]` was intended here.
df_veg = df[df_main.Vegetarian == True]
df_notveg = df[df_main.Vegetarian == False]

# Draw 2500 recipes per class so the two classes are balanced
df_veg = df_veg.sample(n=2500, random_state=SEED)
df_notveg = df_notveg.sample(n=2500, random_state=SEED)
df = pd.concat([df_veg, df_notveg])
# shuffle the dataframe
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# save the new dataframe
df.to_csv('dataset/recipes_df_r.csv', index=False)


----
# once we have the balanced dataset we can start from here

## We want to preprocess the dataset a bit; first of all, we process the ingredients with a regex

In [2]:
# Path of the balanced dataset, relative to the notebook's working directory
PATH = './dataset/recipes_df_r.csv'


df = pd.read_csv(PATH)

In [27]:
def get_nouns_verbs(ingredient):
    """Return the words of `ingredient` tagged NOUN or VERB, joined by single spaces.

    Relies on a `pos_tagging` helper returning (word, tag) pairs; that helper
    must be defined before this function is called.
    """
    tagged = pos_tagging(ingredient)
    return ' '.join([word for word, pos in tagged if pos in ['NOUN', 'VERB']])


def clean_text(s):
    """Lower-case `s` and remove everything except the letters a-z and whitespace.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is stripped. Removed characters are deleted, not replaced by a
    space, so e.g. 'extra-virgin' becomes 'extravirgin'.
    """
    s = s.lower()
    s1 = re.sub(r'[^a-z\s]', '', s)
    # remove multiple spaces and starting and ending spaces
    s2 = re.sub(r'\s+', ' ', s1).strip()
    return s2


# quick check; kept as the last expression so the result is shown as the cell output
clean_text('1 cup of flour')




'cup of flour'

In [38]:
# use spacy to apply pos tagging to the ingredients
import spacy

nlp = spacy.load('en_core_web_sm')

def pos_tagging(ingredient):
    """Run the spaCy pipeline `nlp` on `ingredient` and return a list of (text, pos_) pairs, one per token."""
    doc = nlp(ingredient)
    return [(token.text, token.pos_) for token in doc]

# Disabled experiment, kept as an inert string literal (it is never executed):
# a spaCy pipeline with a NER component fine-tuned on ingredients
"""nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('ner', source=tokenizer)

def pos_tagging(ingredient):
    doc = nlp(ingredient)
    return [(token.text, token.pos_) for token in doc]

"""

# Smoke-test call; its result is not displayed because it is not the cell's last expression
pos_tagging('gour blend mushrooms sliced')

def get_nouns_verbs(ingredient):
    """Return only the words of `ingredient` tagged NOUN, VERB or PROPN.

    The (word, tag) pairs come from `pos_tagging`; the kept words are joined
    with single spaces in their original order.
    """
    wanted_tags = ['NOUN', 'VERB', 'PROPN']
    kept_words = []
    for text, tag in pos_tagging(ingredient):
        if tag in wanted_tags:
            kept_words.append(text)
    return ' '.join(kept_words)
# Sanity check on a sample ingredient: print the (word, tag) pairs, then show the filtered string.
# NOTE(review): with 'PROPN' kept by get_nouns_verbs, the printed tags would yield
# 'pinot grigio'; the recorded 'grigio' output looks stale — re-run to confirm.
print(pos_tagging('italian pinot grigio'))
get_nouns_verbs('italian pinot grigio')




[('italian', 'ADJ'), ('pinot', 'PROPN'), ('grigio', 'NOUN')]


'grigio'

In [None]:

# Per-recipe results: ingredients_list[k] holds the ingredients extracted from
# the k-th entry of df.ingredients.
ingredients_list = []

print("Loading model and tokenizer...")

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

print("Model and tokenizer loaded.")

if torch.cuda.is_available():
    model = model.to('cuda')
    print("Model moved to GPU.")
else:
    print("CUDA is not available. Model will run on CPU.")

# No aggregation: token-level output is merged by get_ingredients below
pipe = pipeline("ner", model=model, tokenizer=tokenizer)

print("Pipeline created.")

# Progress is reported by tqdm only; printing every recipe here would flood the
# cell output and break up the progress bar.
for recipe in tqdm(df.ingredients, total=len(df.ingredients)):
    # Drop punctuation (anything that is neither a word character nor whitespace)
    recipe_p = re.sub(r'[^\w\s]', '', recipe)

    ner_result = pipe(recipe_p)
    # The token offsets refer to recipe_p, so the cleaned text is the one sliced
    ingredients = get_ingredients(recipe_p, ner_result)
    ingredients_list.append(ingredients)



In [29]:

# Load the FoodBaseBERT tokenizer and token-classification model, then build the NER pipeline
print("Loading model and tokenizer...")

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

print("Model and tokenizer loaded.")

# Move the model to the GPU when one is available and report which device is used
if torch.cuda.is_available():
    model = model.to('cuda')
    print("Model moved to GPU.")
else:
    print("CUDA is not available. Model will run on CPU.")

# No aggregation strategy is passed, so the pipeline output is token-level
pipe = pipeline("ner", model=model, tokenizer=tokenizer)


# Leftover commented-out line (punctuation-stripping regex), not executed:
#return re.sub(r'[^\w\s]', '', text)

print("Pipeline created.")

Loading model and tokenizer...
Model and tokenizer loaded.
CUDA is not available. Model will run on CPU.
Pipeline created.
