In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm.auto import tqdm

"""
# To use Google Colab:
from google.colab import drive
drive.mount('/content/drive')
"""

"\n# To use Google Colab:\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

In [2]:
#read the csv file in dataset/vocabulary_spacy_clean_10k.csv as a set
df = pd.read_csv('dataset/vocabulary_spacy_clean_10k.csv')
vocab = set(df['ingredient'])


In [3]:
# count the number of token for each ingredient in the vocabulary
df['token_count'] = df['ingredient'].apply(lambda x: len(x.split()))
len(df['token_count'])


6490

In [10]:
# plot the averge token count for each ingredient
df['token_count'].mean()

1.8842835130970723

## Load cleaned dataset
We use the dataset obtained in the notebook `Data_Understanding.ipynb`

In [20]:
# Load the data locally
df = pd.read_csv('./dataset/dataset_balanced_10k.csv')

# Load the data from GDrive
# df = pd.read_csv('/content/drive/MyDrive/datasets/dataset.csv')

# Named Entity Recognition to Extract the Ingredients
In the following we'll use some NER models to extract all the ingredients from the recipes in order to train a network with them afterwards.

## FoodBaseBERT
We start considering a fine-tuned version of the `bert-base-cased` architecture, which can be found at the following [link](https://huggingface.co/Dizex/FoodBaseBERT-NER).

Let's compare different pipelines on the same recipe: the first of the DataFrame. For the First and Second pipelines we refer to this [documentation](https://huggingface.co/transformers/v4.10.1/_modules/transformers/pipelines/token_classification.html), while for the Third we implemented by hand the construction of the tokens.

In [21]:
tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")



Here we check if we can use the hugging face superpowers. In particular we want to use the `aggregation_strategy` for the split tokens

In [22]:
tokenizer.is_fast

True

In [23]:
# get the first recipe
recipe = df['cooking_method'][0]
recipe

"['To bone turkey, place on a work surface, breast-side down. Slice skin along backbone from neck to tail. Cut and pull flesh and skin away from carcass. Cut flesh from saber-shaped bone near wing, and remove bone. Sever ball-and-socket joints so that they are separated from carcass but still attached to skin. Continue cutting breast meat away from bone until reaching the ridge of breastbone. Turn turkey around and repeat on other side. Pull gently to separate breastbone and carcass flesh. Cut off wing tip and middle section, leaving largest wing bone. Holding outside of wing bone, cut through tendons and scrape meat from bone. Pull out bone, using knife to free it. Holding inside end of leg bone, cut through tendons attaching the flesh to the bone. Use knife to scrape meat from bone, pushing it away from end of bone. Cut bone free of skin. Cut out any sinews still remaining on leg. Repeat on other side, then push leg and wing skin-side out. Butterfly breast so that meat completely cov

#### 1. First Pipeline
We can notice that the performances aren't good; for example `on` `##ion`, `f` `##enne` `##l` are split.

In [6]:
pipe = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
ner_result = pipe(recipe)

ner_result



[{'entity_group': 'FOOD',
  'score': 0.99878013,
  'word': 'olive oil',
  'start': 138,
  'end': 147},
 {'entity_group': 'FOOD',
  'score': 0.9983912,
  'word': 'on',
  'start': 174,
  'end': 176},
 {'entity_group': 'FOOD',
  'score': 0.9979122,
  'word': '##ion',
  'start': 176,
  'end': 179},
 {'entity_group': 'FOOD',
  'score': 0.9981351,
  'word': 'garlic',
  'start': 181,
  'end': 187},
 {'entity_group': 'FOOD',
  'score': 0.99817646,
  'word': 'red pepper flakes',
  'start': 192,
  'end': 209},
 {'entity_group': 'FOOD',
  'score': 0.99844223,
  'word': 'f',
  'start': 284,
  'end': 285},
 {'entity_group': 'FOOD',
  'score': 0.9983156,
  'word': '##enne',
  'start': 285,
  'end': 289},
 {'entity_group': 'FOOD',
  'score': 0.9981218,
  'word': '##l',
  'start': 289,
  'end': 290},
 {'entity_group': 'FOOD',
  'score': 0.9989028,
  'word': 'vegetables',
  'start': 310,
  'end': 320},
 {'entity_group': 'FOOD',
  'score': 0.9990877,
  'word': 'tomato',
  'start': 418,
  'end': 424},
 {

#### 2. Second Pipeline
These parameters perform much better! For example `onion` and `fennel` are now kept intact.

In [7]:
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="MAX")
ner_result = pipe(recipe)

ner_result

[{'entity_group': 'FOOD',
  'score': 0.99878013,
  'word': 'olive oil',
  'start': 138,
  'end': 147},
 {'entity_group': 'FOOD',
  'score': 0.9983912,
  'word': 'onion',
  'start': 174,
  'end': 179},
 {'entity_group': 'FOOD',
  'score': 0.9981351,
  'word': 'garlic',
  'start': 181,
  'end': 187},
 {'entity_group': 'FOOD',
  'score': 0.9982384,
  'word': 'red pepper flakes',
  'start': 192,
  'end': 209},
 {'entity_group': 'FOOD',
  'score': 0.99844223,
  'word': 'fennel',
  'start': 284,
  'end': 290},
 {'entity_group': 'FOOD',
  'score': 0.9989028,
  'word': 'vegetables',
  'start': 310,
  'end': 320},
 {'entity_group': 'FOOD',
  'score': 0.9992041,
  'word': 'tomatoes',
  'start': 418,
  'end': 426},
 {'entity_group': 'FOOD',
  'score': 0.99859935,
  'word': 'juices',
  'start': 438,
  'end': 444},
 {'entity_group': 'FOOD',
  'score': 0.9990983,
  'word': 'tomatoes',
  'start': 490,
  'end': 498},
 {'entity_group': 'FOOD',
  'score': 0.9987276,
  'word': 'basil',
  'start': 534,
  

#### 3. Third Pipeline
Now let's define by hand what we would like to happen:
- if there are two adjacent B-words, and the second one starts with '#', join them
- if an I-word starts with '#', join it to the precedent ingredient

We can notice that our pipeline works as well as the one above, the difference is that the HugginFace pipeline returns a dictionary while our pipeline returns a list.

In [24]:
def get_ingredients(recipe: str, ner_result: list):
    ingredients = []
    last_added = 0
    b_word = 'B-FOOD'
    i_word = 'I-FOOD'

    for i in range(len(ner_result)):

        if ner_result[i]['entity'] == b_word:
            # check if previous word was a segmentation of the same one
            if ner_result[i]['word'].startswith('#') and ner_result[i-1]['entity'] == b_word:
                # if (for any reason (it happens)) the first word is a segment, we ignore it
                if last_added == 0:
                    continue
                ingredients[last_added-1] += recipe[ner_result[i]['start'] : ner_result[i]['end']]
            else:
                # get the ingredient from the recipe given its position
                ingredients.append(recipe[ner_result[i]['start'] : ner_result[i]['end']])
                last_added += 1

        elif ner_result[i]['entity'] == i_word:
            # check if segmentation is occurring
            if ner_result[i]['word'].startswith('#'):
                # if (for any reason (it happens)) the first word is a segment, we ignore it
                if last_added == 0:
                    continue
                ingredients[last_added-1] += recipe[ner_result[i]['start'] : ner_result[i]['end']]
            elif last_added == 0:
                ingredients.append(recipe[ner_result[i]['start'] : ner_result[i]['end']])
                last_added += 1
            else:
                ingredients[last_added-1] = ingredients[last_added-1] + ' ' + recipe[ner_result[i]['start'] : ner_result[i]['end']]

    return ingredients

In [9]:
pipe = pipeline("ner", model=model, tokenizer=tokenizer)
ner_result = pipe(recipe)

ingredients = get_ingredients(recipe, ner_result)
ingredients

['olive oil',
 'onion',
 'garlic',
 'red pepper flakes',
 'fennel',
 'vegetables',
 'tomatoes',
 'juices',
 'tomatoes',
 'basil',
 'wine',
 'olives',
 'salt',
 'black pepper',
 'sauce',
 'fish',
 'fillets',
 'cooking spray',
 'salt',
 'pepper',
 'olive oil',
 'fillets',
 'fillets',
 'fish',
 'fillets',
 'fillets',
 'sauce']

In [None]:
# TODO: indagare se ci sono differenze sostanziali a livello di tempistiche tra la second e la third pipeline

### Dataset Loading

We read the csv file obtained from `Data_Understanding.ipynb` which is perfectly balanced between `Vegetarian` and `Meat&Fish`

In [None]:
PATH = './dataset/dataset_balanced_10k.csv'

df = pd.read_csv(PATH)

### Creation of the Vocabulary with FoodBaseBERT
Let's use the Third pipeline to to obtain the dictionary of ingredients.

In [25]:
# we create both the vocabulary and the list of ingredients for each recipe
ingredients_v = set()
ingredients_list = []

if torch.cuda.is_available():
    model = model.to('cuda')
    print("Model moved to GPU.")
else:
    print("CUDA is not available. Model will run on CPU.")

pipe = pipeline("ner", model=model, tokenizer=tokenizer)

for i, recipe in tqdm(enumerate(df.ingredients), total=len(df.ingredients)):

    ner_result = pipe(recipe)
    ingredients = get_ingredients(recipe, ner_result)

    ingredients_list.append([])
    for ingredient in ingredients:
        # we add the ingredient to the vocabulary
        ingredients_v.add(ingredient)
        # we append the ingredient to the list of ingredients for the current recipe
        ingredients_list[i].append(ingredient)



Model moved to GPU.


  0%|          | 0/10000 [00:00<?, ?it/s]

### PoS tagging and lemmatization with spaCy

to use spaCy we need to install the model with the following command:

python -m spacy download en_core_web_sm

In [26]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load('en_core_web_sm')

# apply part-of-speech tagging to the ingredient
def pos_tagging(ingredient):
    doc = nlp(ingredient)
    return [(token.text, token.pos_) for token in doc]

# return the string without adjectives, verbs and proper nouns
def get_nouns_verbs(ingredient):
    tagged = pos_tagging(ingredient)
    return ' '.join([word for word, pos in tagged if pos not in  ['ADJ', 'VERB', 'PROPN']])

# lemmatize the ingredient
def lemmatize(ingredient):
    doc = nlp(ingredient)
    return ' '.join([token.lemma_ for token in doc])


def clean_text(s):
    s = s.lower()
    s1 = re.sub(r'[^a-z\s]', '', s)
    # remove multiple spaces and starting and ending spaces
    s2 = re.sub(r'\s+', ' ', s1).strip()
    return s2


now we test these functions on the ingredient ` 5 calamari rings `

In [None]:
ingredient = ' 5 calamari rings'

clean_ingredient = clean_text(ingredient)
print(f'Ingredient: \t\t\t{ingredient}')
print(f'Cleaned ingredient: \t\t{clean_ingredient}')
print(f'PoS tagging: \t\t\t{pos_tagging(clean_ingredient)}')
print(f'Without ADJ, PROPN, VERB: \t{get_nouns_verbs(clean_ingredient)}')
print(f'Lemmatized: \t\t\t{lemmatize(get_nouns_verbs(clean_ingredient))}')

Ingredient: 			 5 calamari rings
Cleaned ingredient: 		calamari rings
PoS tagging: 			[('calamari', 'PROPN'), ('rings', 'NOUN')]
Without ADJ, PROPN, VERB: 	rings
Lemmatized: 			ring


Now we apply these function to both the vocabulary and the list of ingredients.

In [27]:
# create the vocabulary
vocabulary = set()
for ingredient in tqdm(ingredients_v):
    clean_ingredient = clean_text(ingredient)
    vocabulary.add(lemmatize(get_nouns_verbs(clean_ingredient)))

# create the list of ingredients for each recipe
ingredients_list_clean = []
for recipe in tqdm(ingredients_list):
    ingredients_list_clean.append([])
    for ingredient in recipe:
        clean_ingredient = clean_text(ingredient)
        ingredients_list_clean[-1].append(lemmatize(get_nouns_verbs(clean_ingredient)))

  0%|          | 0/21530 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [29]:
print(len(vocabulary))
print(len(ingredients_list_clean))

6491
10000


In [32]:
# remove empty ingredients from the vocabulary
vocabulary.discard('')
print(len(vocabulary))

6490


### Save the vocabulary and ingredients list

In [33]:
import csv

# save the vocabulary
PATH = './dataset/vocabulary_spacy_clean_10k.csv'

with open(PATH, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for ingredient in vocabulary:
        writer.writerow([ingredient])

print(f"Set saved to '{PATH}'.")


#save the ingredients list
ingredients_df = pd.DataFrame(ingredients_list_clean)

ingredients_df.to_csv('./dataset/ingredient_list_spacy_clean_10k.csv', index=False)

Set saved to './dataset/vocabulary_spacy_clean_10k.csv'.


---

## DistilBert


Now we consider a fine-tuned version of the distilbert-base-cased architecture, which can be found at the following [link](https://github.com/chambliss/foodbert).

We chose to work with the feature `ingredients` because the examples provided in the GitHub repository for the fine-tuned model were more similar to this feature than the `cooking_method` column.

NOTE: The model requires a conda environment to be executed. To get the required setup, run the following cell.

In [None]:
# Install Miniconda
!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!bash Miniconda3-latest-Linux-x86_64.sh -bfp /usr/local

# Update PATH
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages')

# Clone your repo
!git clone https://github.com/chambliss/foodbert.git

# Change directory to the repo
%cd foodbert

# Create the conda environment
!conda env create -f environment.yml


In [None]:
%%bash
source /usr/local/etc/profile.d/conda.sh && conda activate hf-nlp
pip install -e .

In [None]:
from food_extractor.food_model import FoodModel
model = FoodModel("chambliss/distilbert-for-food-extraction")

#### Load the dataset

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
df = pd.read_csv('/content/drive/MyDrive/datasets/dataset_balanced_10k.csv')

In [9]:
df.head()

Unnamed: 0,cooking_method,ingredients,tags,Vegetarian,Dairy Free,Gluten Free,Low Carb,Low Fat,Low Sodium,Dessert,Meat,Fish,Dairy,Vegetarian&Dessert,Meat&Fish
0,"['To bone turkey, place on a work surface, bre...","['1 (12 to 14 pound) turkey', '3 tablespoons e...","Poultry,Turkey Recipes,Main Dish",0,0,0,0,0,0,0,1,0,0,0,1
1,['Combine all the ingredients in a blender and...,"['2 1/4 cups freshly squeezed orange juice', '...","Liquor Recipes,Tequila Recipes,Fruit,Pureeing ...",1,0,1,0,1,0,0,0,0,0,1,0
2,['Pulse the black peppercorns in a spice grind...,"['1 tablespoon black peppercorns', '3/4 cup ci...","Sauce Recipes,Barbecue Restaurants,Gluten Free...",1,0,1,0,1,0,0,0,0,0,1,0
3,"['Pour water, lemon juice, and simple syrup in...","['8 cups cold water', '2 cups freshly squeezed...","Make Ahead,American,Lemonade Recipes,Tea Recip...",1,0,1,0,1,1,0,0,0,1,1,0
4,['Toss all ingredients together and season wit...,"['2 roasted red and yellow bell peppers, peele...","Easy Main Dish,Easy,Main Dish,American,Southwe...",1,0,1,0,0,0,0,0,0,0,1,0


In [10]:
len(df)

10000

In [11]:
# we associate each recipe with its ingredients
ingredients_list = df.ingredients

In [12]:
# we save the labels for label
labels = df[df.columns[3:]]

In [13]:
# for each recipe we extract the ingredients and save them in a list
ner_list = []
for i, recipe in tqdm(enumerate(ingredients_list), total=len(ingredients_list)):
    try:
        ner_list.append([ingredient['text'] for ingredient in model.extract_foods(recipe)[0]['Ingredient']])
    except Exception as e:
        print(i)
        continue

  0%|          | 0/10000 [00:00<?, ?it/s]

In [14]:
# now we create the vocabulary by extracting the ingredients from the list of ingredients
ner_ingredients = set()
for el in ner_list:
    for ingredient in el:
        ner_ingredients.add(ingredient)

#### save the vocabulary and list of ingredients

In [None]:
import csv

with open('/content/drive/MyDrive/ner_ingredients_balanced.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ingredients'])  # write the column name
    for item in list(ner_ingredients):
        writer.writerow([item])

In [27]:
recipes_df = pd.DataFrame(ner_list)
recipes_df.to_csv('/content/drive/MyDrive/ner_recipes_balanced.csv', index=False)

### Cleaning with Stanza Pipeline

In [62]:
import stanza

df_vocabulary = pd.read_csv('./dataset/ner_ingredients_balanced.csv')

In [63]:
df_vocabulary.head()

Unnamed: 0,ingredients
0,red currant jelly
1,picholine
2,apple juice
3,savoy
4,White truffle oil


We define the fuction to process the vocabulary.

In particular we used the Stanza pipeline: Tokenization, Multi-Word-Token, PoS, Lemmatization.

Furthermore we want to remove the token tagged as VERB, PROPN, ADJ to keep the ingredient as significant as possibile.

This pipeline successfully removes plurals, extra infos and errors carried by the tag PROPN

In [68]:
# remove words with less than 3 characters, special characters and numbers, and fix the blank spaces.
def clean_text_stanza(s):
    s = s.lower()
    s1 = re.sub(r'[^a-z\s]', ' ', s)
    s1 = ' '.join([w for w in s1.split() if len(w) > 2])
    s2 = re.sub(r'\s+', ' ', s1).strip()
    return s2

In [69]:
def clean_vocabulary(df):
    df['ingredients'] = df['ingredients'].apply(clean_text_stanza)
    df = df.drop_duplicates()
    df = df[df['ingredients'] != '']

    nlp = stanza.Pipeline(lang='en', processors='tokenize, mwt, pos, lemma', use_gpu=True)
    
    # definition of new dictionary
    cleaned_ingredients = []

    with tqdm(total=len(df)) as pbar:
        for ingredient in  df['ingredients']:
            # Process ingredient through the pipeline
            doc = nlp(ingredient)
            
            # Extract tokenized forms, part-of-speech tags, and lemmatized forms
            tokens = [word.text for sent in doc.sentences for word in sent.words]
            pos_tags = [word.upos for sent in doc.sentences for word in sent.words]
            lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
            
            ### NOTICE THAT WE ARE USING `lemmas` INSTEAD OF `tokens`, so we will define our clean dictionary with the pure form of the words (their lemmatization!!!) ###
            # eliminate the tokens in `tokens` that are ADJ in `pos_tags`
            tokens = [lemmas[i] for i in range(len(tokens)) if pos_tags[i] != 'ADJ' and pos_tags[i] != 'PROPN' and pos_tags[i] != 'VERB']

            # reconvert tokens to a string
            cleaned_ingredient = ' '.join(tokens)

            # append to the list
            cleaned_ingredients.append(cleaned_ingredient)
            pbar.update(1)

    cleaned_df = pd.DataFrame(cleaned_ingredients, columns=['ingredients'])
    cleaned_df = cleaned_df.drop_duplicates()
    cleaned_df = cleaned_df[cleaned_df['ingredients'] != '']
    return cleaned_df

In [70]:
df_vocabulary = clean_vocabulary(df_vocabulary)

2024-05-26 17:06:25 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-26 17:06:25 INFO: Downloaded file to C:\Users\chucki\stanza_resources\resources.json
2024-05-26 17:06:26 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-05-26 17:06:26 INFO: Using device: cuda
2024-05-26 17:06:26 INFO: Loading: tokenize
2024-05-26 17:06:26 INFO: Loading: mwt
2024-05-26 17:06:26 INFO: Loading: pos
2024-05-26 17:06:26 INFO: Loading: lemma
2024-05-26 17:06:27 INFO: Done loading processors!


  0%|          | 0/8033 [00:00<?, ?it/s]

In [71]:
df_vocabulary.head()

Unnamed: 0,ingredients
0,currant jelly
1,picholine
2,apple juice
4,truffle oil
5,mason jar


In [75]:
# save the cleaned dataset
df_vocabulary.to_csv('./dataset/vocabulary_stanza_10k.csv', index=False)

---

Now for the list of ingredients of each recipes

In [135]:
df = pd.read_csv('./dataset/ner_recipes_balanced.csv')

In [136]:
def clean_ingredients_list(df):
    # replace NaN values with empty strings
    df = df.fillna('')
    df = df.map(clean_text_stanza)
    
    #pipeline
    nlp = stanza.Pipeline(lang='en', processors='tokenize, mwt, pos, lemma', use_gpu=True)
   
    ingredients_l = df.values.tolist()
    # removes ''
    ingredients_l = [[x for x in row if x != ''] for row in ingredients_l]
    cleaned_ingredients = []

    
    for row in tqdm(ingredients_l):
        cleaned_row = []
        for ingredient in row:
            # Process ingredient through the pipeline
            doc = nlp(ingredient)
            
            # Extract tokenized forms, part-of-speech tags, and lemmatized forms
            tokens = [word.text for sent in doc.sentences for word in sent.words]
            pos_tags = [word.upos for sent in doc.sentences for word in sent.words]
            lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
            
            ### NOTICE THAT WE ARE USING `lemmas` INSTEAD OF `tokens`, so we will define our clean dictionary with the pure form of the words (their lemmatization!!!) ###
            # eliminate the tokens in `tokens` that are ADJ in `pos_tags`
            tokens = [lemmas[i] for i in range(len(tokens)) if pos_tags[i] != 'ADJ' and pos_tags[i] != 'PROPN' and pos_tags[i] != 'VERB']

            # reconvert tokens to a string
            cleaned_ingredient = ' '.join(tokens)

            # append to the list
            cleaned_row.append(cleaned_ingredient)

        cleaned_ingredients.append(cleaned_row)

    # remove duplicates
    for i in range(len(cleaned_ingredients)):
        cleaned_ingredients[i] = list(set(cleaned_ingredients[i]))

    # remove empty strings
    cleaned_ingredients = [[x for x in row if x != ''] for row in cleaned_ingredients]

    cleaned_df = pd.DataFrame(cleaned_ingredients)

    return cleaned_df

In [137]:
df = clean_ingredients_list(df)

2024-05-26 17:52:13 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-26 17:52:13 INFO: Downloaded file to C:\Users\chucki\stanza_resources\resources.json
2024-05-26 17:52:15 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-05-26 17:52:15 INFO: Using device: cuda
2024-05-26 17:52:15 INFO: Loading: tokenize
2024-05-26 17:52:15 INFO: Loading: mwt
2024-05-26 17:52:15 INFO: Loading: pos
2024-05-26 17:52:15 INFO: Loading: lemma
2024-05-26 17:52:15 INFO: Done loading processors!


  0%|          | 0/10000 [00:00<?, ?it/s]

In [138]:
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,maple syrup,garlic,ginger,cayenne pepper,raisin,sage leave,vinegar,mustard,olive oil,clove,...,,,,,,,,,,
1,lime juice,grenadine syrup,cayenne pepper,juice,salt,,,,,,...,,,,,,,,,,
2,garlic,cider vinegar,pepper flake,sugar,kosher salt,peppercorn,vinegar,,,,...,,,,,,,,,,
3,lemon juice,tea,syrup,water,,,,,,,...,,,,,,,,,,
4,olive oil,cilantro leave,bell pepper,poblano pepper,plum tomato,lime,salt,,,,...,,,,,,,,,,
5,egg,vanilla,bread cube,cinnamon,cheddar cheese,sugar,milk,apple,,,...,,,,,,,,,,
6,apple cider,,,,,,,,,,...,,,,,,,,,,
7,egg,garlic,bread crumb,steak sauce,worcestershire sauce,thyme,ground beef,steak seasoning,,,...,,,,,,,,,,
8,olive oil,oregano,pepper,lemon,feta cheese,oregano sprig,sea salt,snapper,kalamata olive,salt,...,,,,,,,,,,
9,sugar,cashew,sweetener,almond,,,,,,,...,,,,,,,,,,


In [139]:
# save the cleaned dataset
df.to_csv('./dataset/ingredients_list_stanza_10k.csv', index=False)

In [144]:
# number of rows with all empty strings or nan
df.isnull().all(axis=1).sum()

45

In [145]:
# print the idexes
df[df.isnull().all(axis=1)].index

Index([ 211,  221,  300,  404, 1074, 1422, 1997, 2122, 2172, 2324, 2635, 2813,
       2831, 3380, 3457, 3858, 4058, 4327, 4379, 4642, 4727, 4736, 4870, 4899,
       4935, 5286, 5668, 5839, 6128, 6880, 7159, 7257, 7317, 7595, 7800, 7953,
       8192, 8276, 8350, 8820, 9199, 9221, 9616, 9663, 9705],
      dtype='int64')

NOTE: these recipes don't have any ingredient after the processing, when using `ingredients_list_stanza_10k.csv` remember to take this in consideration

---

count voc avg ingredient number of tokens

In [4]:
# load the dataset
df = pd.read_csv('./dataset/vocabulary_stanza_10k.csv')


In [5]:
# count the number of token for each ingredient in the vocabulary
df['token_count'] = df['ingredients'].apply(lambda x: len(x.split()))
len(df['token_count'])


5211

In [6]:
# plot the averge token count for each ingredient
df['token_count'].mean()

1.8422567645365573