In [15]:
import stanza
import pandas as pd
import re

In [16]:
# df = pd.read_csv('./dataset/recipes_82k.csv')
# Load the ingredient list used by the rest of the notebook.
# NOTE(review): the following cells rename a column called 'red chicory' and
# then append 'red chicory' back as a row, which suggests this CSV has no
# header row and its first value is being read as the column name. Confirm,
# and consider read_csv(..., header=None, names=['ingredients']) instead.
df = pd.read_csv('./dataset/ner_ingredients.csv')

In [17]:
# change the column name to `ingredients`
# rename() returns a new frame; rebinding it (instead of inplace=True) keeps
# the cell free of in-place mutation and lets it be chained later on.
df = df.rename(columns={'red chicory': 'ingredients'})
# summary of the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9820 entries, 0 to 9819
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ingredients  9820 non-null   object
dtypes: object(1)
memory usage: 76.8+ KB


In [18]:
# add a new instance to df
# 'red chicory' was the column name before the rename above, so it is added
# back here as a regular data row.
# pd.concat is the public API for this; DataFrame._append is a private pandas
# method and may change or disappear without notice.
df = pd.concat(
    [df, pd.DataFrame({'ingredients': ['red chicory']})],
    ignore_index=True,  # keep a contiguous 0..n-1 index, as before
)

In [19]:
# display the last rows of the frame (tail() shows 5 rows by default)
df.tail()

Unnamed: 0,ingredients
9816,Potato chips
9817,fish flavor
9818,dry beef
9819,cauliflower head
9820,red chicory


We start processing this dataframe: lowercase the text, replace non-letter characters with spaces, eliminate words of 2 characters or fewer, etc.

In [20]:
def clean_text(s, min_len=3):
    """Normalize a raw ingredient string to lowercase ASCII words.

    The text is lowercased, accented letters are folded to their base letter,
    every character that is not ``a-z`` or whitespace is replaced by a space,
    words shorter than ``min_len`` characters are dropped, and the remaining
    words are joined with single spaces.

    Parameters
    ----------
    s : str
        Raw ingredient text. Any non-string value (e.g. ``None`` or a NaN
        coming from a missing cell) is treated as empty.
    min_len : int, default 3
        Minimum number of characters a word needs in order to be kept.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives the cleaning.
    """
    import unicodedata  # local import so the cell does not depend on the import cell

    # Missing values arrive as float NaN / None; there is nothing to clean.
    if not isinstance(s, str):
        return ''

    s = s.lower()

    # Fold accents before the ASCII filter: otherwise 'jalapeño' would be
    # split at the 'ñ' into 'jalape' + 'o' and lose its last syllable.
    decomposed = unicodedata.normalize('NFKD', s)
    s = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # if you encounter a - or ' (or something else) in the text, replace it with a space
    # TODO: confirm this is the desired handling for apostrophes/hyphens
    s1 = re.sub(r'[^a-z\s]', ' ', s)

    # split() already discards leading, trailing and repeated whitespace, so
    # joining the kept words with one space needs no further normalization.
    return ' '.join(w for w in s1.split() if len(w) >= min_len)

In [21]:
# run clean_text over every value of the ingredients column
cleaned_column = df['ingredients'].map(clean_text)
df['ingredients'] = cleaned_column
# preview the first rows after cleaning
df.head()

Unnamed: 0,ingredients
0,mango juice
1,pinch salt
2,tooth coriander
3,sticks
4,vegetable stock


In [27]:
# how many ingredients became an empty string after cleaning
is_empty = df['ingredients'] == ''
print(df[is_empty].shape)

(59, 1)


In [28]:
# rows that repeat an earlier row (the first occurrence is not counted)
repeated_rows = df[df.duplicated()]
print(repeated_rows.shape)

(1240, 1)


In [32]:
# (rows, columns) of the frame at this point, before the filtering below
df.shape

(9821, 1)

In [33]:
# remove repeated rows first, then rows whose ingredient is the empty string
unique_rows = df.drop_duplicates()
has_text = unique_rows['ingredients'] != ''
df = unique_rows[has_text]
# (rows, columns) left after filtering
df.shape

(8580, 1)

In [37]:
# Build the English Stanza pipeline with the tokenize, pos and lemma
# processors requested below; `nlp(text)` is used later to get the words of
# each ingredient together with their UPOS tag and lemma.
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, lemma')
# bare expression: display the pipeline object as the cell output
nlp

2024-04-29 21:26:30 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-29 21:26:30 INFO: Downloaded file to /Users/irene/stanza_resources/resources.json


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/mwt/combined.pt:   0%|         …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/pos/combined_charlm.pt:   0%|  …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/lemma/combined_nocharlm.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/backward_charlm/1billion.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/pretrain/conll17.pt:   0%|     …

2024-04-29 21:27:15 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-04-29 21:27:15 INFO: Using device: cpu
2024-04-29 21:27:15 INFO: Loading: tokenize
2024-04-29 21:27:15 INFO: Loading: mwt
2024-04-29 21:27:15 INFO: Loading: pos
2024-04-29 21:27:16 INFO: Loading: lemma
2024-04-29 21:27:16 INFO: Done loading processors!


<stanza.pipeline.core.Pipeline at 0x31f231ad0>

Let's visualize how these tools work!!!

In [47]:
# Show what the pipeline produces for a small slice of the ingredients
sample_ingredients = df[7000:7010]['ingredients']
for ingredient in sample_ingredients:
    # Run the ingredient through the Stanza pipeline
    doc = nlp(ingredient)

    # Flatten the words of every sentence once, then read each attribute off them
    words = [word for sent in doc.sentences for word in sent.words]
    tokens = [w.text for w in words]
    pos_tags = [w.upos for w in words]
    lemmas = [w.lemma for w in words]

    # Report the three parallel views of this ingredient
    print("Original Ingredient:", ingredient)
    print("Tokens:", tokens)
    print("POS Tags:", pos_tags)
    print("Lemmas:", lemmas)
    print()

Original Ingredient: bones fish bones
Tokens: ['bones', 'fish', 'bones']
POS Tags: ['NOUN', 'NOUN', 'NOUN']
Lemmas: ['bone', 'fish', 'bone']

Original Ingredient: eyed peas
Tokens: ['eyed', 'peas']
POS Tags: ['ADJ', 'NOUN']
Lemmas: ['eye', 'peas']

Original Ingredient: sliced pepperoni
Tokens: ['sliced', 'pepperoni']
POS Tags: ['VERB', 'NOUN']
Lemmas: ['slice', 'pepperoni']

Original Ingredient: carrot fronds
Tokens: ['carrot', 'fronds']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['carrot', 'frond']

Original Ingredient: chocolate sprinkles
Tokens: ['chocolate', 'sprinkles']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['chocolate', 'sprinkle']

Original Ingredient: lash milk
Tokens: ['lash', 'milk']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['lash', 'milk']

Original Ingredient: goose
Tokens: ['goose']
POS Tags: ['NOUN']
Lemmas: ['goose']

Original Ingredient: bucatini pasta
Tokens: ['bucatini', 'pasta']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['bucatini', 'pasta']

Original Ingredient: lowfat yogurt
Tokens: ['low

In [51]:
# INTERNAL PROCESSING: ELIMINATE ADJ AND PROPN

for ingredient in df[7000:7010]['ingredients']:
    # Run the ingredient through the Stanza pipeline
    doc = nlp(ingredient)

    # Collect every word once; tags and lemmas are read from the same list
    words = [word for sent in doc.sentences for word in sent.words]
    pos_tags = [w.upos for w in words]
    lemmas = [w.lemma for w in words]

    # The kept "tokens" are LEMMAS, not surface forms: the clean vocabulary is
    # built from the lemmatized words. A word is kept unless its tag is ADJ or PROPN.
    tokens = [
        lemma
        for lemma, tag in zip(lemmas, pos_tags)
        if tag not in ('ADJ', 'PROPN')
    ]

    # Print the filtered lemmas next to the full tag and lemma lists
    print("Original Ingredient:", ingredient)
    print("Tokens:", tokens)
    print("POS Tags:", pos_tags)
    print("Lemmas:", lemmas)
    print()

Original Ingredient: bones fish bones
Tokens: ['bone', 'fish', 'bone']
POS Tags: ['NOUN', 'NOUN', 'NOUN']
Lemmas: ['bone', 'fish', 'bone']

Original Ingredient: eyed peas
Tokens: ['peas']
POS Tags: ['ADJ', 'NOUN']
Lemmas: ['eye', 'peas']

Original Ingredient: sliced pepperoni
Tokens: ['slice', 'pepperoni']
POS Tags: ['VERB', 'NOUN']
Lemmas: ['slice', 'pepperoni']

Original Ingredient: carrot fronds
Tokens: ['carrot', 'frond']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['carrot', 'frond']

Original Ingredient: chocolate sprinkles
Tokens: ['chocolate', 'sprinkle']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['chocolate', 'sprinkle']

Original Ingredient: lash milk
Tokens: ['lash', 'milk']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['lash', 'milk']

Original Ingredient: goose
Tokens: ['goose']
POS Tags: ['NOUN']
Lemmas: ['goose']

Original Ingredient: bucatini pasta
Tokens: ['bucatini', 'pasta']
POS Tags: ['NOUN', 'NOUN']
Lemmas: ['bucatini', 'pasta']

Original Ingredient: lowfat yogurt
Tokens: ['lowfat', 'yogurt

In [53]:
#TODO: evaluate whether VERB tokens should be removed as well

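As we can see above, dropping ADJ and PROPN tokens can leave an ingredient with no tokens at all, so this step will produce new empty strings (and new duplicates) that must be removed again afterwards.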

In [52]:
# Build the cleaned ingredient list: one string per row of df['ingredients'],
# made of the lemmas of the words that are kept.

# Universal POS tags whose words are removed from every ingredient.
EXCLUDED_UPOS = frozenset({'ADJ', 'PROPN'})


def _kept_lemmas(doc, excluded_upos=EXCLUDED_UPOS):
    """Return the lemmas of the words in `doc` whose UPOS tag is not excluded.

    Parameters
    ----------
    doc
        Object returned by `nlp(text)`; only `doc.sentences`, `sent.words`
        and each word's `upos`, `lemma` and `text` attributes are read.
    excluded_upos
        Collection of UPOS tags to drop.

    Returns
    -------
    list of str
        Lemmas in their original order; empty when every word is excluded.
    """
    return [
        # Defensive fallback: if a word has no lemma (None or empty), use its
        # surface text so that ' '.join below never receives a non-string.
        word.lemma or word.text
        for sent in doc.sentences
        for word in sent.words
        if word.upos not in excluded_upos
    ]


cleaned_ingredients = []

for ingredient in df['ingredients']:
    # Process ingredient through the pipeline
    doc = nlp(ingredient)

    # Lemmas (not surface tokens) are kept, so the cleaned vocabulary holds
    # the base form of each word; the result may be '' if nothing is kept.
    cleaned_ingredient = ' '.join(_kept_lemmas(doc))

    # append to the list
    cleaned_ingredients.append(cleaned_ingredient)

In [61]:
# wrap the cleaned strings in a one-column DataFrame
cleaned_df = pd.DataFrame({'ingredients': cleaned_ingredients})
# preview the first rows
cleaned_df.head()

Unnamed: 0,ingredients
0,mango juice
1,pinch salt
2,tooth coriander
3,stick
4,vegetable stock


In [62]:
# how many cleaned ingredients ended up as an empty string
is_empty_cleaned = cleaned_df['ingredients'] == ''
print(cleaned_df[is_empty_cleaned].shape)

(498, 1)


In [63]:
# how many cleaned rows repeat an earlier row
repeated_cleaned = cleaned_df[cleaned_df.duplicated()]
print(repeated_cleaned.shape)

(2517, 1)


In [66]:
# inspect the first 30 repeated rows (empty strings show up as blanks)
duplicate_mask = cleaned_df.duplicated()
cleaned_df[duplicate_mask].head(30)

Unnamed: 0,ingredients
33,
42,clam
44,
60,
74,
80,rice
92,
95,
105,
110,


In [67]:
# remove repeated rows first, then rows whose ingredient is the empty string
unique_cleaned = cleaned_df.drop_duplicates()
has_text_cleaned = unique_cleaned['ingredients'] != ''
cleaned_df = unique_cleaned[has_text_cleaned]
# (rows, columns) of the final cleaned vocabulary
cleaned_df.shape

(6062, 1)