## PREPROCESSING

Le ricette sono definite in un formato semistrutturato. In questa sezione gli step e gli ingredienti vengono raggruppati in un'unica stringa e vengono definite alcune funzioni di utilità.

### Funzioni util e import

In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display
from ast import literal_eval
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.sem import relextract
import spacy
from spacy import displacy
from nltk.corpus import conll2000

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('conll2000')


!python -m spacy download en_core_web_md

NLP = spacy.load('en_core_web_md')


def string_recipe(i):
    """Return recipe ``i`` as one text: title, ingredients and steps.

    The three fields are read from row ``i`` (positional) of the module-level
    ``dataset`` frame and separated by a blank line. All three must be strings
    when the function is called.
    """
    recipe = dataset.iloc[i]
    return "\n\n".join([recipe['title'], recipe['ingredients'], recipe['step']])

### Caricamento del dataset

In [None]:
# Load the recipes; the first CSV column becomes the frame index.
dataset = pd.read_csv(
    Path("../data/test_dataset.csv").resolve(),
    index_col=[0],
    names=["index", "title", "ingredients", "step"],
    usecols=[0, 1, 2, 3],
)

# 'ingredients' and 'step' hold Python list literals stored as text.
# Whole columns are assigned instead of `dataset.iloc[i][col] = ...`: that
# chained form writes to a temporary row object and is not guaranteed to
# reach the frame (it is silently ignored under pandas Copy-on-Write).
dataset['ingredients'] = dataset['ingredients'].apply(
    lambda cell: ".\n".join(literal_eval(cell))
)
dataset['step'] = dataset['step'].apply(
    lambda cell: " ".join(literal_eval(cell))
)

display(dataset.head())
print(string_recipe(10))


### Estrazione delle abbreviazioni

Fase iniziale di ritrovamento del set di abbreviazioni

In [None]:
# Independent copy of the raw data, used only to discover abbreviations.
abbrv_dataset = pd.read_csv(
    Path("../data/test_dataset.csv").resolve(),
    index_col=[0],
    names=["index", "title", "ingredients", "step"],
    usecols=[0, 1, 2, 3],
)

# Flatten each ingredient list into one string. The column is assigned as a
# whole (no chained `.iloc[i][col] = ...`) and the iteration below runs over
# abbrv_dataset itself rather than borrowing its length from `dataset`.
abbrv_dataset['ingredients'] = abbrv_dataset['ingredients'].apply(
    lambda cell: " ".join(literal_eval(cell))
)

# Collect every run of letters that is immediately followed by a period.
abbrv = set()
for ingredients_text in abbrv_dataset['ingredients']:
    abbrv.update(re.findall(r"[A-Za-z]*\.", ingredients_text))

print(abbrv)

Espansione delle abbreviazioni, in quanto possono essere dannose per il processo di tokenizzazione. Es.: pkg. ---> package

In [None]:
# Abbreviated units (lower-case, trailing period included) and their expansion.
_ABBREVIATIONS = {
    'pkg.': 'package',
    'tsb.': 'tablespoon',
    'no.': 'number',
    'pt.': 'pint',
    'gal.': 'gallon',
    'tbsp.': 'tablespoon',
    'sq.': 'square',
    'oz.': 'ounce',
    'lb.': 'pound',
    'qt.': 'quart',
    'c.': 'cup',
    'tsp.': 'teaspoon',
}

# Single alternation over all abbreviations. The look-behind refuses a match
# that starts right after a letter, so the period closing "onion." or
# "garlic." is not mistaken for "no." / "c.". A preceding digit is still
# accepted ("1c." -> "1cup").
_ABBREVIATION_RE = re.compile(
    r"(?<![^\W\d_])(?:"
    + "|".join(re.escape(key) for key in _ABBREVIATIONS)
    + r")"
)


def expand_abbreviations(ingredients_string):
    """Lower-case the text and expand the known unit abbreviations.

    Args:
        ingredients_string: ingredient text, possibly spanning several lines.

    Returns:
        The lower-cased text in which each abbreviation (together with its
        trailing period) is replaced by the full unit name, e.g.
        ``"1 C. sugar"`` becomes ``"1 cup sugar"``. Abbreviation-like endings
        of longer words are left untouched.
    """
    lowered = ingredients_string.lower()
    return _ABBREVIATION_RE.sub(
        lambda match: _ABBREVIATIONS[match.group(0)], lowered
    )


# Expand abbreviations in every recipe. The column is assigned as a whole:
# `dataset.iloc[i][col] = ...` writes to a temporary row object and is not
# guaranteed to update the frame.
dataset['ingredients'] = dataset['ingredients'].apply(expand_abbreviations)

### Sentence splitting
Il contenuto delle colonne 'ingredients' e 'step' verrà suddiviso in frasi. In precedenza i periodi contenuti nelle singole celle sono stati formattati in modo tale da renderli riconoscibili e facilmente suddivisibili in frasi ben separate.

In [None]:
# Split each recipe's ingredient text into a list of sentences. Assigning the
# whole column avoids the chained `.iloc[i][col] = ...` write, which is not
# guaranteed to reach the frame.
dataset['ingredients'] = dataset['ingredients'].apply(sent_tokenize)
print(dataset.iloc[10]['ingredients'])

In [None]:
# Split each recipe's step text into a list of sentences. Assigning the whole
# column avoids the chained `.iloc[i][col] = ...` write, which is not
# guaranteed to reach the frame.
dataset['step'] = dataset['step'].apply(sent_tokenize)
print(dataset.iloc[10]['step'])

### Rimozione quantità doppie

In [None]:
# An optional leading count (plus the spaces after it) followed by ONE
# parenthesised group, e.g. "1 (8 ounce)".
#  - [^()]* keeps the match inside a single pair of parentheses, so a sentence
#    with two groups is no longer swallowed from the first "(" to the last ")".
#  - the spaces are consumed only together with a count, so "cheese (softened)"
#    becomes "cheese softened" instead of "cheesesoftened".
# NOTE(review): for a fraction such as "1/2 (4 ounce)" only the trailing
# digits are dropped - confirm whether the data contains such cases.
dr_reg = r'(?:\d+\s*)?\(([^()]*)\)'

# Replace each match by the stripped text found inside the parentheses.
dataset['ingredients'] = dataset['ingredients'].apply(
    lambda sentences: [
        re.sub(dr_reg, lambda match: match.group(1).strip(), sentence)
        for sentence in sentences
    ]
)

## ANALISI

### Stop words removal
Nella colonna 'step' troviamo una serie di passaggi da compiere per creare la ricetta. Questi passaggi sono scritti in linguaggio naturale e possono essere semplificati rimuovendo delle parole dette stop words.

In [None]:
# OPTIONAL -> results in swr_dataset

stop_words = set(stopwords.words('english'))
swr_dataset = dataset.copy(deep=True)


def tk(x, st):
    """Join with single spaces the tokens of ``x`` that are not stop words.

    Tokens are lower-cased before the lookup because the NLTK English list is
    lower-case; otherwise capitalised words such as "The" or "In" at the
    start of a sentence would be kept. Surviving tokens keep their case.
    """
    return ' '.join([w for w in x if w.lower() not in st])


# Whole-column assignment: the chained `.iloc[i][col] = ...` form is not
# guaranteed to write into the frame.
swr_dataset['step'] = dataset['step'].apply(
    lambda sentences: [tk(word_tokenize(sent), stop_words) for sent in sentences]
)

### Stemming e Lemmatizzazione
Questi due processi potrebbero portare valore all'analisi del dominio. Il codice per entrambi viene proposto qui.

In [None]:
# OPTIONAL -> results in lem_dataset

lemmatizer = WordNetLemmatizer()
lem_dataset = swr_dataset.copy(deep=True)

# Lemmatize every token of every step sentence and re-join with spaces.
# Whole-column assignment: the chained `.iloc[i][col] = ...` form is not
# guaranteed to write into the frame.
lem_dataset['step'] = swr_dataset['step'].apply(
    lambda sentences: [
        ' '.join([lemmatizer.lemmatize(w) for w in word_tokenize(sent)])
        for sent in sentences
    ]
)

In [None]:
# OPTIONAL -> results in stm_dataset

stemmer = PorterStemmer()
stm_dataset = lem_dataset.copy(deep=True)

# Stem every token of every (already lemmatized) step sentence and re-join
# with spaces. Whole-column assignment: the chained `.iloc[i][col] = ...`
# form is not guaranteed to write into the frame.
stm_dataset['step'] = lem_dataset['step'].apply(
    lambda sentences: [
        ' '.join([stemmer.stem(w) for w in word_tokenize(sent)])
        for sent in sentences
    ]
)

In [None]:
# Show the steps of recipe 10 after each preprocessing stage, one heading
# followed by the corresponding list of sentences.
stages = (
    ("Frasi originali\n", dataset),
    ("\nStop word removal\n", swr_dataset),
    ("\nStop word e lemming\n", lem_dataset),
    ("\nStop word lemming e stemming\n", stm_dataset),
)
for heading, frame in stages:
    print(heading)
    print(frame.iloc[10]['step'])

### Entity Extraction and relation extraction and POS-Tagging
Questa operazione ci permette di riconoscere le parti del testo. Nota: attualmente vengono solo stampate; in seguito verranno utilizzate nell'analisi.

In [None]:
spacy_dataset = dataset.copy(deep=True)

# Parse every ingredient and step sentence with the spaCy pipeline, storing a
# list of parsed documents per cell. Columns are assigned as a whole: the
# chained `.iloc[i][col] = ...` form is not guaranteed to write into the frame.
for column in ('ingredients', 'step'):
    spacy_dataset[column] = dataset[column].apply(
        lambda sentences: [NLP(element) for element in sentences]
    )
        

In [None]:
def __pprintsp__(dataset, index, column, value, extend=False):
    """Render one parsed sentence with displaCy and print its text.

    The sentence is ``dataset.iloc[index][column][value]``. It is rendered
    with the 'dep' style, printed, then rendered with the 'ent' style. When
    ``extend`` is true, one line of attributes is also printed per token.
    """
    doc = dataset.iloc[index][column][value]

    displacy.render(doc, style='dep')
    print(doc)
    displacy.render(doc, style='ent')

    if not extend:
        return
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

def pprint_spacyd_all(dataset, index, column, extend=False):
    """Pretty-print every parsed sentence stored in ``dataset.iloc[index][column]``.

    Each position of the cell is forwarded, in order, to ``__pprintsp__``
    together with ``extend``.
    """
    for position, _ in enumerate(dataset.iloc[index][column]):
        __pprintsp__(dataset, index, column, position, extend)
            
def pprint_spacyd(dataset, index, column, value, extend=False):
    """Pretty-print the single parsed sentence at position ``value``.

    Forwards to ``__pprintsp__`` for ``dataset.iloc[index][column][value]``.
    The ``value`` argument is now the one passed on; previously an unrelated
    name ``v`` was used, which either raised NameError or picked up a global.
    """
    __pprintsp__(dataset, index, column, value, extend)

In [None]:
# Use the frame of parsed sentences built above; `spacy_dataset_lem` is not
# defined anywhere in this notebook.
pprint_spacyd_all(spacy_dataset, 0, 'ingredients')

In [None]:
from spacy.symbols import X, NUM, VERB, NOUN
from nltk.corpus import wordnet as wn

def is_food(word):
    """Tell whether ``word`` has a WordNet noun sense filed under food.

    ``word`` is converted with ``str`` and looked up as a noun; the result is
    True as soon as one synset has 'food' in its lexname, False otherwise.
    """
    noun_synsets = wn.synsets(str(word), pos=wn.NOUN)
    return any('food' in synset.lexname() for synset in noun_synsets)


#Root generation

# Unit / container words (lower-case) used for membership tests when parsing
# ingredients. Duplicated entries were dropped; the order is unchanged.
__SIZES__ = [
    'package',
    'tablespoon',
    'number',
    'pint',
    'gallon',
    'square',
    'ounce',
    'pound',
    'quart',
    'cup',
    'teaspoon',
    'can',
]


class Ingredient:
    """Structured view of one ingredient sentence.

    Every field except ``original`` starts empty and is filled in by the
    caller.
    """

    def __init__(self, full_text):
        """Create an empty ingredient that remembers ``full_text`` as its source."""
        self.name = ""
        self.size = ""
        self.quantity = ""
        # A fresh list per instance, so adjectives are never shared.
        self.adj = []
        self.original = full_text

    def __str__(self):
        """Return a four-line summary: name, adjectives, quantity with size, original."""
        return (
            f"Name: {self.name}\n"
            f"Adjectives: {self.adj}\n"
            f"Quantity: {self.quantity} {self.size}\n"
            f"Original: {self.original}"
        )

# Demo: derive an Ingredient (name, adjectives, quantity, size) from the
# dependency tree of the first four ingredients of recipes 6-9. Both ranges
# are clamped so a short dataset or a short ingredient list does not raise
# IndexError.
for index in range(6, min(10, len(spacy_dataset))):
    recipe_ingredients = spacy_dataset.iloc[index]['ingredients']
    for v in range(min(4, len(recipe_ingredients))):

        ingredient = recipe_ingredients[v]
        pprint_spacyd(spacy_dataset, index, 'ingredients', v)

        IGN = Ingredient(ingredient.text)
        roots = [token for token in ingredient if token.dep_ == 'ROOT']
        if not roots:
            # No ROOT token in the parse: there is no tree to walk.
            print(IGN)
            continue
        ROOT_NODE = roots[0]

        # A verbal root is kept as a descriptor; any other root seeds the name.
        if ROOT_NODE.pos == VERB:
            IGN.adj.append(ROOT_NODE.text)
        else:
            IGN.name = ROOT_NODE.text

        # Depth-first walk over all descendants of the root.
        stack = list(ROOT_NODE.children)

        while stack:
            CURRENT_NODE = stack.pop()

            if (CURRENT_NODE.pos == X) or (CURRENT_NODE.pos == NUM):
                IGN.quantity += " " + CURRENT_NODE.text
            else:
                if CURRENT_NODE.text.lower() in __SIZES__:
                    IGN.size += " " + CURRENT_NODE.text

                # Food nouns attached as compound / object are prepended to
                # the name, unless the word is a unit.
                if (
                    CURRENT_NODE.dep_ in ('compound', 'dobj', 'pobj')
                    and is_food(CURRENT_NODE.text)
                    and CURRENT_NODE.text.lower() not in __SIZES__
                ):
                    IGN.name = CURRENT_NODE.text + " " + IGN.name

                # Modifiers extend the name when they are food words and
                # are recorded as adjectives otherwise.
                if CURRENT_NODE.dep_ in ('amod', 'appos'):
                    if is_food(CURRENT_NODE.text):
                        IGN.name = CURRENT_NODE.text + " " + IGN.name
                    else:
                        IGN.adj.append(CURRENT_NODE.text)

            stack.extend(CURRENT_NODE.children)

        print(IGN)

In [None]:
# Build one phrase per ROOT token: the root text, with compound / amod
# children prepended and dobj children appended.
ingredients = set()
for parsed_sentences in spacy_dataset['ingredients']:
    for doc in parsed_sentences:
        for token in doc:
            if token.dep_ != 'ROOT':
                # Only roots produce a phrase; adding the empty string for
                # every other token polluted the set with ''.
                continue
            string = token.text + ' '
            for child in token.children:
                if child.dep_ in ('compound', 'amod'):
                    string = child.text + ' ' + string
                elif child.dep_ == 'dobj':
                    string = string + ' ' + child.text
            ingredients.add(string)

clean_ingredients = ingredients.copy()

# Unit markers, tested in this order. Only the first marker found in a phrase
# is blanked out (every occurrence of it is replaced by a single space).
unit_markers = (
    'teaspoon',
    'tablespoon ',
    'cup',
    'pound',
    'ounce',
    'box',
    'package',
    'cans',
)

for ing in ingredients:
    matched = next((marker for marker in unit_markers if marker in ing), None)
    if matched is not None:
        # Swap the phrase for its version without the unit word.
        clean_ingredients.remove(ing)
        ing = ing.replace(matched, ' ')
        clean_ingredients.add(ing)

print(clean_ingredients)