# Exploring Natural Language Processing using Cooking Recipes

## Imports and Dataset Load

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm.auto import tqdm

from utils import *

# Run this cell only when working on Colab

In [3]:
# Mount Google Drive when running on Colab. The `google.colab` package only
# exists inside Colab, so skip the mount elsewhere instead of failing: this
# keeps "Restart & Run All" usable on a local kernel.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available: skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


## Load cleaned dataset

In [2]:
# Load the cleaned dataset from the local working copy into `df`
df = pd.read_csv('dataset/dataset.csv')

# Alternative for Colab: load the same file from the mounted Google Drive
# df = pd.read_csv('/content/drive/MyDrive/datasets/dataset.csv')

## Named Entity Recognition to Extract the Ingredients

In the following we'll use some NER models to extract all the ingredients from the recipes in order to train a network with them afterwards.

In [18]:
def get_ingredients(recipe: str, ner_result: list):
    """Rebuild ingredient strings from token-level NER output.

    Args:
        recipe: The text that was tagged. Ingredient text is sliced out of it
            using each token's ``'start'``/``'end'`` character offsets.
        ner_result: Sequence of token dicts providing the keys ``'entity'``,
            ``'word'``, ``'start'`` and ``'end'``. Only ``'B-FOOD'`` and
            ``'I-FOOD'`` tokens are used; any other label is skipped.

    Returns:
        The ingredients as a list of strings, in order of appearance.

    Merging rules:
        * A ``B-FOOD`` token starts a new ingredient, unless it is a word-piece
          (its ``'word'`` starts with ``'#'``) that directly follows another
          ``B-FOOD`` token in ``ner_result``; then it is glued to the current
          ingredient without a space.
        * An ``I-FOOD`` word-piece is glued to the current ingredient without a
          space; a whole-word ``I-FOOD`` token is appended after a single space.
        * An ``I-FOOD`` token seen before any ingredient exists is dropped when
          it is a word-piece and starts the first ingredient otherwise.
    """
    b_word = 'B-FOOD'
    i_word = 'I-FOOD'
    ingredients = []

    for i, token in enumerate(ner_result):
        entity = token['entity']
        if entity != b_word and entity != i_word:
            continue

        # get the token text from the recipe given its position
        text = recipe[token['start']:token['end']]
        is_word_piece = token['word'].startswith('#')

        if entity == b_word:
            # `i > 0` matters: for the first token, index i-1 would wrap around
            # to the LAST token, making the result depend on an unrelated label.
            follows_b_token = i > 0 and ner_result[i - 1]['entity'] == b_word
            if is_word_piece and follows_b_token:
                # segmentation of the previous word: extend it in place
                if ingredients:
                    ingredients[-1] += text
            else:
                ingredients.append(text)
        elif is_word_piece:
            # a leading segment has nothing to attach to, so it is ignored
            if ingredients:
                ingredients[-1] += text
        elif ingredients:
            ingredients[-1] += ' ' + text
        else:
            ingredients.append(text)

    return ingredients

In [4]:
# Collect the set of distinct ingredients found by the NER model over all recipes.
ingredients_v = set()

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

if torch.cuda.is_available():
    model = model.to('cuda')
    print("Model moved to GPU.")
    ner_device = 0
else:
    print("CUDA is not available. Model will run on CPU.")
    ner_device = -1

# Tell the pipeline which device to use (0 = first GPU, -1 = CPU) so that the
# inputs it builds end up on the same device as the model.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=ner_device)

for recipe in tqdm(df.ingredients, total=len(df.ingredients)):
    ner_result = pipe(recipe)
    ingredients = get_ingredients(recipe, ner_result)
    ingredients_v.update(ingredients)

CUDA is not available. Model will run on CPU.


  0%|          | 0/81957 [00:00<?, ?it/s]

NameError: name 'get_ingredients' is not defined

In [5]:
# Display whether the loaded tokenizer is a "fast" tokenizer.
# NOTE(review): get_ingredients slices the recipe with each token's 'start'/'end';
# presumably those offsets are only populated by a fast tokenizer — confirm.
tokenizer.is_fast

True

In [None]:
import csv

PATH = '/content/drive/MyDrive/datasets/ingredients_v.csv'

# Persist the ingredient vocabulary, one ingredient per row.
# A set has no stable iteration order, so the rows are written sorted: the
# saved file is then identical from one run to the next for the same set.
with open(PATH, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows([ingredient] for ingredient in sorted(ingredients_v))

print(f"Set saved to '{PATH}'.")

-----------
## This takes too much time so we want to work on a smaller, balanced dataset

In [4]:
# Build a smaller, class-balanced dataset: 2500 vegetarian and 2500
# non-vegetarian recipes, shuffled and saved to disk.
SAMPLE_SEED = 42  # fixed seed so the sampled subset is the same on every run

df_main = pd.read_csv("dataset/recipes_df.csv")
# NOTE(review): the boolean masks come from df_main but are applied to df, which
# assumes both frames share the same row index — confirm.
df_veg = df[df_main.Vegetarian == True]
df_notveg = df[df_main.Vegetarian == False]

df_veg = df_veg.sample(n=2500, random_state=SAMPLE_SEED)
df_notveg = df_notveg.sample(n=2500, random_state=SAMPLE_SEED)
df = pd.concat([df_veg, df_notveg])
# shuffle the dataframe
df = df.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)

# save the new dataframe
df.to_csv('dataset/recipes_df_r.csv', index=False)


----
# once we have the balanced dataset we can start from here

## We want to preprocess the dataset a bit; first of all, we process the ingredients with a regex

In [2]:
# Location of the balanced subset CSV (same relative file the sampling cell writes
# as 'dataset/recipes_df_r.csv'). Note that PATH is re-bound here.
PATH = './dataset/recipes_df_r.csv'


# Reload `df` from the balanced subset so the notebook can be resumed from this point
df = pd.read_csv(PATH)

In [27]:
# text-cleaning helpers: a POS-based filter (get_nouns_verbs) and a regex cleaner that keeps only letters and spaces (clean_text)
def get_nouns_verbs(ingredient):
    """Return the words of ``ingredient`` tagged NOUN or VERB, joined by single spaces.

    The (word, tag) pairs come from ``pos_tagging``, which must be defined
    before this function is called.
    """
    kept_tags = ['NOUN', 'VERB']
    kept_words = []
    for word, pos in pos_tagging(ingredient):
        if pos in kept_tags:
            kept_words.append(word)
    return ' '.join(kept_words)

def clean_text(s):
    """Normalise ``s`` to lowercase ``a``-``z`` words separated by single spaces.

    After lowercasing, every character that is neither ``a``-``z`` nor
    whitespace is removed (digits, punctuation, accented letters, ...). Runs of
    whitespace are then collapsed to one space and both ends are trimmed.
    """
    letters_only = re.sub(r'[^a-z\s]', '', s.lower())
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()


clean_text('1 cup of flour')


def get_nouns_verbs(ingredient):
    """Filter ``ingredient`` down to its NOUN and VERB words.

    Uses the (word, tag) pairs produced by ``pos_tagging`` and returns the kept
    words joined by single spaces. This re-binds a name already defined earlier
    in the same cell with the same behaviour.
    """
    selected = filter(lambda pair: pair[1] in ['NOUN', 'VERB'], pos_tagging(ingredient))
    return ' '.join([word for word, _ in selected])




'cup of flour'

In [38]:
# use spaCy to apply POS tagging to the ingredients
import spacy

# Load the 'en_core_web_sm' pipeline once; pos_tagging reuses it on every call.
# NOTE(review): presumably the model package has to be installed beforehand — confirm the setup step.
nlp = spacy.load('en_core_web_sm')

def pos_tagging(ingredient):
    """Run the module-level ``nlp`` pipeline on ``ingredient``.

    Returns a list with one ``(token.text, token.pos_)`` tuple per token, in
    document order.
    """
    tagged = []
    for token in nlp(ingredient):
        tagged.append((token.text, token.pos_))
    return tagged

# Disabled experiment: use a spaCy pipeline fine-tuned on ingredients.
# The triple-quoted block below is a bare string expression, so none of the
# code inside it is executed.
"""nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('ner', source=tokenizer)

def pos_tagging(ingredient):
    doc = nlp(ingredient)
    return [(token.text, token.pos_) for token in doc]

"""

# Try the tagger on a sample ingredient string (the value is not displayed
# because this is not the last expression of the cell).
pos_tagging('gour blend mushrooms sliced')

# keep only the nouns, verbs and proper nouns of an ingredient string
def get_nouns_verbs(ingredient):
    """Return the words of ``ingredient`` tagged NOUN, VERB or PROPN.

    Tags are taken from ``pos_tagging``; the kept words are joined by single
    spaces in their original order.
    """
    kept = []
    for text, tag in pos_tagging(ingredient):
        if tag not in ['NOUN', 'VERB', 'PROPN']:
            continue
        kept.append(text)
    return ' '.join(kept)
print(pos_tagging('italian pinot grigio'))
get_nouns_verbs('italian pinot grigio')




[('italian', 'ADJ'), ('pinot', 'PROPN'), ('grigio', 'NOUN')]


'grigio'

In [None]:

# For every recipe of the dataset, collect the list of ingredients found by the
# NER model; ingredients_list[k] holds the ingredients of the k-th recipe.
ingredients_list = []

print("Loading model and tokenizer...")

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

print("Model and tokenizer loaded.")

if torch.cuda.is_available():
    model = model.to('cuda')
    print("Model moved to GPU.")
    ner_device = 0
else:
    print("CUDA is not available. Model will run on CPU.")
    ner_device = -1

# Tell the pipeline which device to use (0 = first GPU, -1 = CPU) so that the
# inputs it builds end up on the same device as the model.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=ner_device)

print("Pipeline created.")

# Progress is reported by tqdm only; printing every recipe would flood the
# cell output and break the progress bar.
for recipe in tqdm(df.ingredients, total=len(df.ingredients)):
    # strip punctuation before tagging; get_ingredients must slice the same
    # cleaned string that was tagged, because the offsets refer to it
    recipe_p = re.sub(r'[^\w\s]', '', recipe)

    ner_result = pipe(recipe_p)
    ingredients = get_ingredients(recipe_p, ner_result)
    ingredients_list.append(list(ingredients))



In [29]:

# (Re)create the FoodBaseBERT NER pipeline without running the extraction loop.
print("Loading model and tokenizer...")

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

print("Model and tokenizer loaded.")

if torch.cuda.is_available():
    model = model.to('cuda')
    print("Model moved to GPU.")
    ner_device = 0
else:
    print("CUDA is not available. Model will run on CPU.")
    ner_device = -1

# Tell the pipeline which device to use (0 = first GPU, -1 = CPU) so that the
# inputs it builds end up on the same device as the model.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=ner_device)

print("Pipeline created.")

Loading model and tokenizer...
Model and tokenizer loaded.
CUDA is not available. Model will run on CPU.
Pipeline created.
