In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm.auto import tqdm

from utils import *

"""
# To use Google Colab:
from google.colab import drive
drive.mount('/content/drive')
"""

"\n# To use Google Colab:\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

## Load cleaned dataset
We use the dataset obtained in the notebook `Data_Understanding.ipynb`

In [7]:
# Load the cleaned dataset from the local copy of the CSV.
df = pd.read_csv('dataset/dataset.csv')

# Alternative for Google Colab: read the CSV from the mounted Drive instead.
# df = pd.read_csv('/content/drive/MyDrive/datasets/dataset.csv')

# Named Entity Recognition to Extract the Ingredients
In the following we'll use some NER models to extract all the ingredients from the recipes in order to train a network with them afterwards. 

## FoodBaseBERT
We start considering a fine-tuned version of the `bert-base-cased` architecture, which can be found at the following [link](https://huggingface.co/Dizex/FoodBaseBERT-NER).

Let's compare different pipelines on the same recipe: the first of the DataFrame. For the First and Second pipelines we refer to this [documentation](https://huggingface.co/transformers/v4.10.1/_modules/transformers/pipelines/token_classification.html), while for the Third we implemented by hand the construction of the tokens.

In [10]:
# Load the tokenizer and the token-classification model from the
# "Dizex/FoodBaseBERT" checkpoint; both are reused by every pipeline below.
tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")

Here we check whether the tokenizer is a fast one, so that we can rely on the built-in features of the Hugging Face pipeline. In particular, we want to use the `aggregation_strategy` option to merge the split tokens.

In [None]:
# Displays True when the loaded tokenizer is a "fast" tokenizer.
tokenizer.is_fast

In [11]:
# Use the `cooking_method` text of the row labelled 0 as the sample recipe
# on which the three pipelines are compared.
recipe = df['cooking_method'][0]
# Display the raw text of the sample recipe.
recipe

"['Set the racks in the middle and upper thirds of the oven and preheat the oven to 425 F', 'In a large skillet over medium heat, heat the olive oil until shimmering. Add the onion, garlic and red pepper flakes and cook until golden, stirring occasionally, about 5 minutes.', 'Add the fennel and cook until the vegetables are soft and translucent, an additional 3 to 5 minutes.', 'Reduce the heat to medium and add the tomatoes with their juices. Using the back of a wooden spoon, smash the tomatoes and cook for 5 minutes.', 'Add the basil, wine, olives, 1 teaspoon salt, and 1/8 teaspoon black pepper.', 'Reduce to low and simmer for 15 minutes, or until the sauce is slightly thickened, while you prepare the fish.', 'Pat the fillets dry, lightly spray them with cooking spray, and season with salt and pepper.', 'In a heavy ovenproof skillet over high heat, heat the olive oil until shimmering. Add the fillets, rounded-side down, and cook for 2 minutes.', 'Carefully flip the fillets with a meta

#### 1. First Pipeline
We can notice that the performance isn't good; for example, `on` `##ion` and `f` `##enne` `##l` are returned as separate pieces instead of whole words.

In [12]:
# First attempt: NER pipeline with the `grouped_entities=True` flag.
# NOTE(review): recent transformers releases document this flag as deprecated
# in favour of `aggregation_strategy` — confirm against the installed version.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
ner_result = pipe(recipe)

# Display the grouped entities found in the sample recipe.
ner_result

[{'entity_group': 'FOOD',
  'score': 0.99878013,
  'word': 'olive oil',
  'start': 138,
  'end': 147},
 {'entity_group': 'FOOD',
  'score': 0.9983912,
  'word': 'on',
  'start': 174,
  'end': 176},
 {'entity_group': 'FOOD',
  'score': 0.9979122,
  'word': '##ion',
  'start': 176,
  'end': 179},
 {'entity_group': 'FOOD',
  'score': 0.9981351,
  'word': 'garlic',
  'start': 181,
  'end': 187},
 {'entity_group': 'FOOD',
  'score': 0.99817646,
  'word': 'red pepper flakes',
  'start': 192,
  'end': 209},
 {'entity_group': 'FOOD',
  'score': 0.99844223,
  'word': 'f',
  'start': 284,
  'end': 285},
 {'entity_group': 'FOOD',
  'score': 0.9983156,
  'word': '##enne',
  'start': 285,
  'end': 289},
 {'entity_group': 'FOOD',
  'score': 0.9981218,
  'word': '##l',
  'start': 289,
  'end': 290},
 {'entity_group': 'FOOD',
  'score': 0.9989028,
  'word': 'vegetables',
  'start': 310,
  'end': 320},
 {'entity_group': 'FOOD',
  'score': 0.9990877,
  'word': 'tomato',
  'start': 418,
  'end': 424},
 {

#### 2. Second Pipeline
These parameters perform much better! For example `onion` and `fennel` are now kept intact.

In [13]:
# Second attempt: same model and tokenizer, but entities are now merged with
# the "MAX" aggregation strategy.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="MAX")
ner_result = pipe(recipe)

# Display the aggregated entities found in the sample recipe.
ner_result

[{'entity_group': 'FOOD',
  'score': 0.99878013,
  'word': 'olive oil',
  'start': 138,
  'end': 147},
 {'entity_group': 'FOOD',
  'score': 0.9983912,
  'word': 'onion',
  'start': 174,
  'end': 179},
 {'entity_group': 'FOOD',
  'score': 0.9981351,
  'word': 'garlic',
  'start': 181,
  'end': 187},
 {'entity_group': 'FOOD',
  'score': 0.9982384,
  'word': 'red pepper flakes',
  'start': 192,
  'end': 209},
 {'entity_group': 'FOOD',
  'score': 0.99844223,
  'word': 'fennel',
  'start': 284,
  'end': 290},
 {'entity_group': 'FOOD',
  'score': 0.9989028,
  'word': 'vegetables',
  'start': 310,
  'end': 320},
 {'entity_group': 'FOOD',
  'score': 0.9992041,
  'word': 'tomatoes',
  'start': 418,
  'end': 426},
 {'entity_group': 'FOOD',
  'score': 0.99859935,
  'word': 'juices',
  'start': 438,
  'end': 444},
 {'entity_group': 'FOOD',
  'score': 0.9990983,
  'word': 'tomatoes',
  'start': 490,
  'end': 498},
 {'entity_group': 'FOOD',
  'score': 0.9987276,
  'word': 'basil',
  'start': 534,
  

#### 3. Third Pipeline
Now let's define by hand what we would like to happen: 
- if there are two adjacent B-words, and the second one starts with '#', join them
- if an I-word starts with '#', join it to the preceding ingredient

We can notice that our pipeline works as well as the one above; the difference is that the Hugging Face pipeline returns a list of dictionaries, while our pipeline returns a plain list of ingredient strings.

In [14]:
def get_ingredients(recipe: str, ner_result: list) -> list:
    """Rebuild ingredient strings from raw (non-aggregated) NER token predictions.

    Every entry of ``ner_result`` is expected to be a dict with the keys
    ``'entity'`` (``'B-FOOD'`` / ``'I-FOOD'``), ``'word'`` (the token, with a
    leading ``'#'`` when it is a word-piece continuation) and ``'start'`` /
    ``'end'`` (character offsets into ``recipe``).

    Merging rules:
      * a ``B-FOOD`` word piece that directly follows another ``B-FOOD`` token
        is glued to the last ingredient (``on`` + ``##ion`` -> ``onion``);
      * an ``I-FOOD`` word piece is glued to the last ingredient;
      * a plain ``I-FOOD`` token is appended to the last ingredient after a
        space (``olive`` + ``oil`` -> ``olive oil``), or starts a new
        ingredient when none has been collected yet;
      * a word piece met before any ingredient has been collected is ignored.

    Args:
        recipe: the text that was fed to the NER pipeline.
        ner_result: the per-token predictions returned for ``recipe``.

    Returns:
        The ingredient strings, in order of appearance (duplicates are kept).
    """
    ingredients = []
    b_word = 'B-FOOD'
    i_word = 'I-FOOD'
    # Label of the token preceding the current one. It is None for the first
    # token: indexing ``ner_result[i-1]`` there would silently wrap around to
    # the LAST token and make the outcome depend on an unrelated prediction.
    previous_entity = None

    for token in ner_result:
        entity = token['entity']
        # Always slice the original text, so casing/characters are preserved.
        text = recipe[token['start'] : token['end']]
        is_segment = token['word'].startswith('#')

        if entity == b_word:
            if is_segment and previous_entity in (None, b_word):
                # Continuation of the previous B-word; a segment with nothing
                # collected before it (it happens) is ignored.
                if ingredients:
                    ingredients[-1] += text
            else:
                ingredients.append(text)

        elif entity == i_word:
            if is_segment:
                # Word piece: glue it to the last ingredient, or ignore it when
                # there is nothing to glue it to.
                if ingredients:
                    ingredients[-1] += text
            elif ingredients:
                ingredients[-1] += ' ' + text
            else:
                ingredients.append(text)

        previous_entity = entity

    return ingredients

In [15]:
# Third attempt: no grouping/aggregation option is passed, so the pipeline
# output is merged by our own `get_ingredients` helper instead.
pipe = pipeline("ner", model=model, tokenizer=tokenizer)
ner_result = pipe(recipe)

# Rebuild the ingredient strings from the raw predictions and display them.
ingredients = get_ingredients(recipe, ner_result)
ingredients

['olive oil',
 'onion',
 'garlic',
 'red pepper flakes',
 'fennel',
 'vegetables',
 'tomatoes',
 'juices',
 'tomatoes',
 'basil',
 'wine',
 'olives',
 'salt',
 'black pepper',
 'sauce',
 'fish',
 'fillets',
 'cooking spray',
 'salt',
 'pepper',
 'olive oil',
 'fillets',
 'fillets',
 'fish',
 'fillets',
 'fillets',
 'sauce']

In [None]:
# TODO: investigate whether there are substantial timing differences between the second and the third pipeline

### Creation of the Vocabulary with FoodBaseBERT
Let's use the Third pipeline to obtain the vocabulary of ingredients.

In [29]:
# Build both the vocabulary (unique ingredient strings) and the list of
# ingredients extracted from each recipe.
ingredients_v = set()
ingredients_list = []

# Run on the GPU when one is available. The device is also passed to the
# pipeline explicitly: moving only the model is not enough, because the
# pipeline decides on its own where to place the model and the tokenized
# inputs, and that default differs between transformers versions.
if torch.cuda.is_available():
    model = model.to('cuda')
    pipe_device = 0
    print("Model moved to GPU.")
else:
    pipe_device = -1
    print("CUDA is not available. Model will run on CPU.")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=pipe_device)

for recipe in tqdm(df.ingredients, total=len(df.ingredients)):
    ner_result = pipe(recipe)
    ingredients = get_ingredients(recipe, ner_result)

    # add every extracted ingredient to the vocabulary
    ingredients_v.update(ingredients)
    # keep the ingredients of the current recipe, aligned with the rows of `df`
    ingredients_list.append(list(ingredients))





Model moved to GPU.


100%|██████████| 1/1 [00:00<00:00,  2.35it/s]


### Dataset Loading

We read the csv file obtained from `Data_Understanding.ipynb` which is perfectly balanced between `Vegetarian` and `Meat&Fish` 

In [3]:
# Location of the balanced CSV produced by `Data_Understanding.ipynb`.
PATH = './dataset/dataset_balanced_10k.csv'

# NOTE(review): this rebinds `df`, which an earlier cell loaded from
# 'dataset/dataset.csv' — confirm which file the cells above are meant to use.
df = pd.read_csv(PATH)

In [38]:
# use spacy to apply pos tagging to the ingredients
import spacy

nlp = spacy.load('en_core_web_sm')

def pos_tagging(ingredient):
    """Run the spaCy pipeline `nlp` on `ingredient` and return one
    (token text, POS tag) tuple per token, in order."""
    pairs = []
    for tok in nlp(ingredient):
        pairs.append((tok.text, tok.pos_))
    return pairs

# use spaCy fine-tuned on ingredients
"""nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('ner', source=tokenizer)

def pos_tagging(ingredient):
    doc = nlp(ingredient)
    return [(token.text, token.pos_) for token in doc]

"""

pos_tagging('gour blend mushrooms sliced')

# return the string with only the nouns, verbs and proper nouns
def get_nouns_verbs(ingredient):
    """Keep only the tokens of `ingredient` tagged NOUN, VERB or PROPN and
    join them back with single spaces."""
    kept_tags = ('NOUN', 'VERB', 'PROPN')
    kept_words = []
    for word, pos in pos_tagging(ingredient):
        if pos in kept_tags:
            kept_words.append(word)
    return ' '.join(kept_words)
print(pos_tagging('italian pinot grigio'))
get_nouns_verbs('italian pinot grigio')




[('italian', 'ADJ'), ('pinot', 'PROPN'), ('grigio', 'NOUN')]


'grigio'

In [27]:
# NOTE(review): the spaCy cell defines `get_nouns_verbs` with 'PROPN' kept as
# well; whichever of the two cells runs last decides the active behaviour.
def get_nouns_verbs(ingredient):
    """Keep only the tokens of `ingredient` tagged NOUN or VERB and join them
    back with single spaces."""
    tagged = pos_tagging(ingredient)
    return ' '.join([word for word, pos in tagged if pos in ['NOUN', 'VERB']])


# regex-based helper that, given a string, removes everything except letters and spaces
def clean_text(s):
    """Lower-case `s`, drop every character that is not a-z or whitespace,
    then collapse whitespace runs to one space and trim both ends.

    Example: '1 cup of flour' -> 'cup of flour'.
    """
    s = s.lower()
    s1 = re.sub(r'[^a-z\s]', '', s)
    # remove multiple spaces and starting and ending spaces
    s2 = re.sub(r'\s+', ' ', s1).strip()
    return s2


# Quick check; kept as the last expression so that the cell displays the result.
clean_text('1 cup of flour')


'cup of flour'

### Save the vocabulary and ingredients list

In [None]:
import csv

# NOTE(review): the vocabulary is written to the Colab Drive mount while the
# ingredient table below is written to the local ./dataset folder — confirm
# that mixing the two locations is intended.
PATH = '/content/drive/MyDrive/datasets/vocabulary_10k.csv'

with open(PATH, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # A set has no stable iteration order: sort the vocabulary so that the
    # saved file is identical from one run to the next. One ingredient per row.
    writer.writerows([ingredient] for ingredient in sorted(ingredients_v))

print(f"Set saved to '{PATH}'.")

# One row per recipe, one column per extracted ingredient position.
ingredients_df = pd.DataFrame(ingredients_list)

# save the per-recipe ingredient lists
ingredients_df.to_csv('./dataset/ingredient_list_10k.csv', index=False)