## PREPROCESSING

Ricette definite in un formato semistrutturato, raggruppamento degli step e ingredienti in un'unica stringa, funzioni di utilità

### Funzioni util e import

In [None]:
from pathlib import Path
import pandas as pd
from IPython.display import display
from ast import literal_eval
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.sem import relextract
import spacy
from spacy import displacy
from nltk.corpus import conll2000
from spacy.symbols import X, NUM, VERB, NOUN
from nltk.corpus import wordnet as wn

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('conll2000')


!python -m spacy download en_core_web_md

NLP = spacy.load('en_core_web_md')


def string_recipe(i):
    """Return recipe ``i`` of the global ``dataset`` as one printable string.

    The title, ingredients and steps of the row at position ``i`` are joined
    in that order, separated by blank lines.
    """
    row = dataset.iloc[i]
    return "\n\n".join([row['title'], row['ingredients'], row['step']])

### Caricamento del dataset

In [None]:
# Load the recipes: CSV column 0 becomes the index, columns 1-3 are the title
# and the ingredient / step lists stored as Python list literals.
dataset = pd.read_csv(
    Path("../data/test_dataset.csv").resolve(),
    index_col=[0],
    names=["index", "title", "ingredients", "step"],
    usecols=[0, 1, 2, 3],
)

# Assign whole columns: writing through ``dataset.iloc[i]['col'] = ...`` is
# chained assignment, which pandas does not guarantee to write back to the frame.
# Ingredients are joined with ".\n" and steps with " " so that they can be
# split into sentences later on.
dataset['ingredients'] = dataset['ingredients'].map(
    lambda raw: ".\n".join(literal_eval(raw))
)
dataset['step'] = dataset['step'].map(lambda raw: " ".join(literal_eval(raw)))

display(dataset.head())
print(string_recipe(10))


### Estrazione delle abbreviazioni

Fase iniziale di ritrovamento del set di abbreviazioni

In [None]:
# Work on a fresh copy of the raw CSV so the main ``dataset`` is left untouched.
abbrv_dataset = pd.read_csv(
    Path("../data/test_dataset.csv").resolve(),
    index_col=[0],
    names=["index", "title", "ingredients", "step"],
    usecols=[0, 1, 2, 3],
)

# Join each ingredient list into one string. The assignment is column-wise
# because ``abbrv_dataset.iloc[i]['ingredients'] = ...`` is chained assignment
# and is not guaranteed to update the frame.
abbrv_dataset['ingredients'] = abbrv_dataset['ingredients'].map(
    lambda raw: " ".join(literal_eval(raw))
)

# Collect every (possibly empty) run of ASCII letters directly followed by a
# period; the printed set is inspected by hand to pick the real abbreviations.
abbrv = set()
for ingredients_text in abbrv_dataset['ingredients']:
    abbrv.update(re.findall(r"[A-Za-z]*\.", ingredients_text))

print(abbrv)

Espansione delle abbreviazioni, in quanto possono essere dannose per il processo di tokenizzazione. Es.: pkg. ---> package

In [None]:
def expand_abbreviations(ingredients_string):
    """Lower-case ``ingredients_string`` and expand known unit abbreviations.

    An abbreviation is only expanded when it is not glued to a preceding
    letter, so a word that merely ends a sentence with the same characters
    ("garlic.", "jalapeno.") is left alone. The trailing period of the
    abbreviation is consumed by the expansion ("1 c. sugar" -> "1 cup sugar").

    Parameters
    ----------
    ingredients_string : str
        Ingredient text, possibly containing abbreviations such as "c." or "oz.".

    Returns
    -------
    str
        The lower-cased text with every recognised abbreviation expanded.
    """
    __ABBREVIATIONS__ = {
        'pkg.'  :   'package',
        'tsb.'  :   'tablespoon',
        'no.'   :   'number',
        'pt.'   :   'pint',
        'gal.'  :   'gallon',
        'tbsp.' :   'tablespoon',
        'sq.'   :   'square',
        'oz.'   :   'ounce',
        'lb.'   :   'pound',
        'qt.'   :   'quart',
        'c.'    :   'cup',
        'tsp.'  :   'teaspoon'
    }
    # The negative look-behind rejects matches that continue a word; digits and
    # punctuation are still allowed in front of the abbreviation ("2c." -> "2cup").
    pattern = re.compile(
        r"(?<![a-z])(?:"
        + "|".join(re.escape(abbreviation) for abbreviation in __ABBREVIATIONS__)
        + r")"
    )
    return pattern.sub(
        lambda match: __ABBREVIATIONS__[match.group(0)],
        ingredients_string.lower(),
    )


# Expand abbreviations for every recipe. The column is reassigned as a whole
# because ``dataset.iloc[i]['ingredients'] = ...`` is chained assignment and is
# not guaranteed to update the frame.
dataset['ingredients'] = dataset['ingredients'].map(expand_abbreviations)

### Sentence splitting
Il contenuto delle colonne 'ingredients' e 'step' verrà suddiviso in frasi. In precedenza i periodi contenuti nelle singole celle sono stati formattati in modo tale da renderli riconoscibili e facilmente suddivisibili in frasi ben separate.

In [None]:
# Replace each ingredient string with its list of sentences. The assignment is
# column-wise because writing through ``dataset.iloc[i][...]`` is chained
# assignment and may not reach the frame.
dataset['ingredients'] = dataset['ingredients'].map(sent_tokenize)
print(dataset.iloc[10]['ingredients'])

In [None]:
# Replace each step string with its list of sentences. The assignment is
# column-wise because writing through ``dataset.iloc[i][...]`` is chained
# assignment and may not reach the frame.
dataset['step'] = dataset['step'].map(sent_tokenize)
print(dataset.iloc[10]['step'])

### Rimozione quantità doppie

In [None]:
# Optional digits and whitespace followed by a parenthesised group.
dr_reg = r'\d*\s*\(.*\)'

# Each match is replaced, inside the stored sentence list, by the stripped text
# found between its first "(" and first ")".
for ingredient_sentences in dataset['ingredients']:
    for position, sentence in enumerate(ingredient_sentences):
        for match in re.findall(dr_reg, sentence):
            inner_text = match[match.find("(") + 1: match.find(")")].strip()
            ingredient_sentences[position] = ingredient_sentences[position].replace(match, inner_text)

## ANALISI

### Stop words removal
Nella colonna 'step' troviamo una serie di passaggi da compiere per creare la ricetta. Questi passaggi sono scritti in linguaggio naturale e possono essere semplificati rimuovendo delle parole dette stop words.

In [None]:
# OPTIONAL -> results are stored in swr_dataset

stop_words = set(stopwords.words('english'))
swr_dataset = dataset.copy(deep=True)


def tk(x, st):
    """Join with single spaces the tokens of ``x`` that are not in ``st``."""
    return ' '.join([w for w in x if w not in st])


# Column-wise assignment: ``swr_dataset.iloc[i]['step'] = ...`` is chained
# assignment and is not guaranteed to update the frame.
swr_dataset['step'] = dataset['step'].map(
    lambda sentences: [tk(word_tokenize(sent), stop_words) for sent in sentences]
)

### Stemming e Lemmatizzazione
Questi due processi potrebbero portare valore all'analisi del dominio. Il codice per entrambi viene proposto qui.

In [None]:
# OPTIONAL -> results are stored in lem_dataset

lemmatizer = WordNetLemmatizer()
lem_dataset = swr_dataset.copy(deep=True)

# Lemmatize every token of every stop-word-filtered sentence. The column is
# assigned as a whole because ``lem_dataset.iloc[i]['step'] = ...`` is chained
# assignment and is not guaranteed to update the frame.
lem_dataset['step'] = swr_dataset['step'].map(
    lambda sentences: [
        ' '.join([lemmatizer.lemmatize(w) for w in word_tokenize(sent)])
        for sent in sentences
    ]
)

In [None]:
# OPTIONAL -> results are stored in stm_dataset

stemmer = PorterStemmer()
stm_dataset = lem_dataset.copy(deep=True)

# Stem every token of every lemmatized sentence. The column is assigned as a
# whole because ``stm_dataset.iloc[i]['step'] = ...`` is chained assignment and
# is not guaranteed to update the frame.
stm_dataset['step'] = lem_dataset['step'].map(
    lambda sentences: [
        ' '.join([stemmer.stem(w) for w in word_tokenize(sent)])
        for sent in sentences
    ]
)

In [None]:
# Print the steps of the recipe at position 10 as they look in each frame:
# original, stop words removed, lemmatized, stemmed.
for heading, frame in (
    ("Frasi originali\n", dataset),
    ("\nStop word removal\n", swr_dataset),
    ("\nStop word e lemming\n", lem_dataset),
    ("\nStop word lemming e stemming\n", stm_dataset),
):
    print(heading)
    print(frame.iloc[10]['step'])

### Entity Extraction, Relation Extraction e POS tagging con SpaCy

In [None]:
spacy_dataset = dataset.copy(deep=True)

# Run the spaCy pipeline on every ingredient and step sentence. Columns are
# assigned as a whole because ``spacy_dataset.iloc[i][col] = ...`` is chained
# assignment and is not guaranteed to update the frame.
for column in ('ingredients', 'step'):
    spacy_dataset[column] = dataset[column].map(
        lambda sentences: [NLP(sentence) for sentence in sentences]
    )

In [None]:
def __pprintsp__(dataset, index, column, value, extend=False):
    """Render one parsed sentence with displaCy and print its text.

    The sentence is ``dataset.iloc[index][column][value]``. It is rendered in
    the 'dep' style, printed, then rendered in the 'ent' style. With
    ``extend=True`` one line of attributes is printed for every token.
    """
    doc = dataset.iloc[index][column][value]
    displacy.render(doc, style='dep')
    print(doc)
    displacy.render(doc, style='ent')
    if not extend:
        return
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

def pprint_spacyd_all(dataset, index, column, extend=False):
    """Pretty-print every parsed sentence stored in ``dataset.iloc[index][column]``."""
    for position, _ in enumerate(dataset.iloc[index][column]):
        __pprintsp__(dataset, index, column, position, extend)
            
def pprint_spacyd(dataset, index, column, value, extend=False):
    """Pretty-print the single parsed sentence ``dataset.iloc[index][column][value]``."""
    __pprintsp__(dataset, index, column, value, extend=extend)

### Analisi degli ingredienti

In [None]:
class Recipe:
    """One recipe: title, position, raw text and the parsed ingredients/steps."""

    def __init__(self, title, index, ingredients_str, steps_str):
        # Identification and raw text as received from the caller.
        self.title = title
        self.idx = index
        self.ing_str = ingredients_str
        self.stp_str = steps_str
        # Filled in afterwards by the analysis code.
        self.ingredients = []
        self.steps = []

    def add_ing(self, ing):
        """Append ``ing`` to the recipe's ingredient list."""
        self.ingredients.append(ing)

    def __str__(self):
        """Title, one line per ingredient (short form), then one line per step."""
        ingredient_lines = [ing.short_srt() for ing in self.ingredients]
        step_lines = [str(step) for step in self.steps]
        return "\n".join([
            f"Title: {self.title}",
            "\n".join(ingredient_lines),
            "Steps:",
            "\n".join(step_lines),
        ])
    
    
    
class Step:
    """One preparation step: its number, action, instruments and source text."""

    def __init__(self, step_str, step_no, action, ins):
        self.step_no = step_no
        self.step_str = step_str
        self.action = action
        self.ins = ins
        self.des = ""

    def __str__(self):
        """Fixed-width columns: number, action, [instruments], (original text)."""
        instruments = ','.join(self.ins)
        return "{:<3d} {:<15s} [{:20s}] ({})".format(
            self.step_no, self.action, instruments, self.step_str
        )

    
    
class Ingredient:
    """Parsed ingredient: name, adjectives, quantity, size and the source text."""

    def __init__(self, full_text):
        self.original = full_text
        self.name = ""
        self.adj = []
        self.quantity = ""
        self.size = ""
        self.ing_cat_id = None

    def __str__(self):
        """Multi-line description listing every parsed field."""
        return "\n".join([
            f"Name: {self.name}",
            f"Ajdectives: {self.adj}",
            f"Quantity: {self.quantity} {self.size}",
            f"Original: {self.original}",
        ])

    def short_srt(self):
        """One-line summary: name, adjectives, quantity and size."""
        return "{} ({}) {} {}".format(self.name, self.adj, self.quantity, self.size)

    def set_id(self, identifier):
        """Store ``identifier`` as the ingredient category id."""
        self.ing_cat_id = identifier

In [None]:

def is_food(word):
    """Return True when any WordNet noun synset of ``word`` has 'food' in its lexname."""
    return any(
        'food' in synset.lexname()
        for synset in wn.synsets(str(word), pos=wn.NOUN)
    )


# Root generation: build one Ingredient per parsed ingredient sentence by
# walking its dependency tree, starting from the ROOT token.

# Words treated as the "size" (unit/container) of an ingredient. Only used for
# membership tests against lower-cased token text, so repeated entries are harmless.
__SIZES__ = [
       'package',
       'tablespoon',
       'number',
       'pint',
       'number',
       'gallon',
       'tablespoon',
       'square',
       'ounce',
       'pound',
       'quart',
       'cup',
       'teaspoon',
       'can'
]


# One Recipe per row of spacy_dataset, appended in row order.
all_recipes = []
    
for index in range(len(spacy_dataset)):
    
    # Title comes from the parsed frame; the raw ingredient/step text is rebuilt
    # by joining the sentence lists stored in ``dataset``.
    recipe = Recipe(
        spacy_dataset.iloc[index]['title'], 
        index, 
        " ".join(dataset.iloc[index]['ingredients']),
        " ".join(dataset.iloc[index]['step'])       
    )
    
    for element in spacy_dataset.iloc[index]['ingredients']:

        # ``element`` is one parsed ingredient sentence.
        ingredient = element

        IGN = Ingredient(ingredient.text)
        # First token labelled ROOT; raises IndexError if the sentence has none.
        ROOT_NODE = [token for token in ingredient if token.dep_ == 'ROOT'][0]
        

        # A verbal root is kept as an adjective, any other root becomes the name.
        if (ROOT_NODE.pos == VERB):
                IGN.adj.append(ROOT_NODE.text)
        else:
                IGN.name = ROOT_NODE.text
                
        # Explicit stack for a depth-first walk over all descendants of the root.
        stack = [element for element in ROOT_NODE.children]

        while len(stack)!=0:
            CURRENT_NODE = stack.pop()

            # Tokens tagged X or NUM are accumulated as the quantity.
            if (CURRENT_NODE.pos == X) or (CURRENT_NODE.pos == NUM):
                    IGN.quantity += " " + CURRENT_NODE.text
            else:
                # Known unit/container words are accumulated as the size.
                if CURRENT_NODE.text.lower() in  __SIZES__:
                    IGN.size += " " + CURRENT_NODE.text

                # compound / dobj / pobj tokens that is_food accepts (and that are
                # not size words) are prepended to the name.
                if ((CURRENT_NODE.dep_ == 'compound') or (CURRENT_NODE.dep_ == 'dobj') or (CURRENT_NODE.dep_ == 'pobj')) and is_food(CURRENT_NODE.text) and (CURRENT_NODE.text.lower() not in  __SIZES__):
                    IGN.name = CURRENT_NODE.text + " " + IGN.name

                # amod / appos tokens extend the name when is_food accepts them,
                # otherwise they are recorded as adjectives.
                if (CURRENT_NODE.dep_ == 'amod') or (CURRENT_NODE.dep_ == 'appos'):
                    if is_food(CURRENT_NODE.text):
                        IGN.name = CURRENT_NODE.text + " " + IGN.name
                    else:
                        IGN.adj.append(CURRENT_NODE.text)         

            # Children are pushed for every node, whichever branch handled it.
            stack += [element for element in CURRENT_NODE.children]
            
        recipe.add_ing(IGN)
    all_recipes.append(recipe)
        


### Analisi degli step

In [None]:
PREPS = ['with', 'in', 'on', 'of', 'using', 'an']


# This function could be improved in many ways; it is good enough for now.
def find_instrument(child):
    """Return the instrument introduced by the preposition ``child``, or False.

    ``child`` must be a 'prep' token whose text is in ``PREPS``. Its first
    non-food 'pobj' child is examined:

    * with no children of its own ("in bowl") it is the instrument;
    * when one of its children is itself a preposition in ``PREPS``
      ("on top of stove") the non-food object of that inner preposition is
      returned;
    * otherwise the object itself is returned.

    Compound modifiers of the chosen noun are prefixed to its text.

    Returns
    -------
    str or bool
        The instrument text, or False when none is found.
    """
    def _with_compounds(noun):
        # Prefix the noun with its 'compound' children, in iteration order.
        name = noun.text
        for conn in noun.children:
            if conn.dep_ == 'compound':
                name = conn.text + ' ' + name
        return name

    if child.dep_ != 'prep' or child.text not in PREPS:
        return False

    for subchild in child.children:
        if subchild.dep_ != 'pobj' or is_food(subchild.text):
            continue
        modifiers = list(subchild.children)
        if not modifiers:
            # A bare object has nothing to iterate over but is still an instrument.
            return _with_compounds(subchild)
        for subsubchild in modifiers:
            if subsubchild.dep_ == 'prep' and subsubchild.text in PREPS:
                # Nested preposition: look at ITS objects. Testing the
                # preposition token itself for 'pobj' can never succeed.
                for nested in subsubchild.children:
                    if nested.dep_ == 'pobj' and not is_food(nested.text):
                        return _with_compounds(nested)
            else:
                return _with_compounds(subchild)
    # Always a bool on failure, never an implicit None.
    return False

# This one turned out slightly better and works reasonably well;
# some refinements are still possible but it does its job.
def find_ingredient(child):
    """Return the ingredient named by ``child`` plus every conjoined ingredient.

    The ingredient text is ``child.text`` prefixed by each of its 'amod'
    children that ``is_food`` accepts. Every 'conj' child accepted by
    ``is_food`` is processed recursively and ALL of its results are kept, so a
    chain such as "salt, pepper and sugar" yields three entries.

    Returns
    -------
    list of str
        The ingredient for ``child`` first, followed by its conjuncts.
    """
    ing = child.text
    for subchild in child.children:
        if subchild.dep_ == 'amod' and is_food(subchild.text):
            ing = subchild.text + ' ' + ing
    ing_list = [ing]
    for subchild in child.children:
        if subchild.dep_ == 'conj' and is_food(subchild.text):
            # extend, not pop(): the recursive call returns the conjunct followed
            # by its own conjuncts, and keeping only the last one loses the rest.
            ing_list.extend(find_ingredient(subchild))
    return ing_list

def hidden_verb(word):
    """Return True when WordNet lists at least one verb synset for ``word``."""
    return any(synset.pos() == 'v' for synset in wn.synsets(word))
                
actions = set()
instruments = set()
ingredients_conn = set()

for index in range(len(spacy_dataset)):
    print(index)
    for doc in spacy_dataset.iloc[index]['step']:
        print("\n\n")
        print(doc)
        # Every token tagged VERB, or usable as a verb according to WordNet,
        # counts as an action for now (no further filtering).
        verb_tokens = (
            token for token in doc
            if token.pos_ == 'VERB' or hidden_verb(token.text)
        )
        for token in verb_tokens:
            actions.add(token.text)

            for child in token.children:
                # Instrument attached to the verb, if find_instrument finds one.
                ins = find_instrument(child)
                if ins:
                    print(ins)
                    instruments.add(ins)
                # A food direct object is an ingredient; find_ingredient returns
                # a list and each entry is added to the set individually.
                if child.dep_ == 'dobj' and is_food(child.text):
                    for ing in find_ingredient(child):
                        print(ing)
                        ingredients_conn.add(ing)
print(ingredients_conn)

In [None]:
def hidden_verb(word):
    """Return True when WordNet lists at least one verb synset for ``word``."""
    return any(synset.pos() == 'v' for synset in wn.synsets(word))

def combine_action(action_token):
    """Return the token text followed by the text of its 'prt'/'compound' children.

    The pieces are separated by single spaces and the children keep their
    iteration order.
    """
    pieces = [action_token.text]
    pieces.extend(
        child.text
        for child in action_token.children
        if child.dep_ in ('prt', 'compound')
    )
    return " ".join(pieces)

def find_instrument(token):
    """Return the instrument found among the descendants of ``token``, or "".

    The descendants are visited depth-first. The first NOUN whose text is in
    neither the global ``ingredients_all_words`` nor the global
    ``forbidden_instruments`` becomes the instrument; 'compound' tokens whose
    head is the most recently accepted node are prefixed to it.
    """
    pending = list(token.children)
    instrument = ""
    found_main = False
    main_node = None

    while pending:
        node = pending.pop()
        if node.pos == NOUN and (node.text not in ingredients_all_words) and (not found_main) and (node.text not in forbidden_instruments):
            # First acceptable noun: the head of the instrument phrase.
            instrument += node.text
            found_main = True
            main_node = node
        elif node.dep_ == 'compound' and main_node == node.head:
            # Compound attached to the last accepted node: prefix it.
            instrument = node.text + " " + instrument
            main_node = node

        pending.extend(node.children)
    return instrument


forbidden_actions = ['done', 'will', 'be']
forbidden_instruments = ['top']
PREPS = ['with', 'in', 'on', 'of', 'using', 'an', 'a']

# find_instrument() rejects nouns contained in ``ingredients_all_words``. Build
# the set here, from the ingredients already parsed into ``all_recipes``, so the
# cell runs on a fresh kernel without relying on a cell further down.
ingredients_all_words = {
    word
    for recipe in all_recipes
    for ingredient in recipe.ingredients
    for word in ingredient.name.split(" ")
}

for index in range(len(dataset)):
    # Start from an empty list so re-running the cell does not duplicate steps.
    all_recipes[index].steps = []
    step_counter = 1
    for value in range(len(dataset.iloc[index]['step'])):
        # A sentence may hold several instructions separated by ';'.
        for element in dataset.iloc[index]['step'][value].split(";"):
            step = NLP(element.strip())

            FINAL_ACTION = ""
            _hidden_verbs = []
            _found_action = False
            found_instruments = []

            for TOKEN in step:
                # Action candidates: tokens that are not forbidden and contain
                # neither "ed" nor "ing" (presumably meant to skip participles
                # and gerunds). The first token tagged VERB wins; tokens that
                # WordNet knows as verbs are remembered as a fallback.
                if "ed" not in (TOKEN.text) and TOKEN.text not in forbidden_actions and "ing" not in (TOKEN.text):
                    if not _found_action:
                        if (TOKEN.pos == VERB):
                            FINAL_ACTION = combine_action(TOKEN)
                            _found_action = True
                        elif hidden_verb(TOKEN.text):
                            _hidden_verbs.append(combine_action(TOKEN))

                # Instruments are searched below every token listed in PREPS.
                if TOKEN.text in PREPS:
                    instrument = find_instrument(TOKEN)
                    if instrument != "":
                        found_instruments.append(instrument)

            # No tagged verb: fall back to the first WordNet verb candidate.
            if FINAL_ACTION == "" and _hidden_verbs:
                FINAL_ACTION = _hidden_verbs[0]

            # Fragments without any action are not recorded as steps.
            if FINAL_ACTION != "":
                found_step = Step(element, step_counter, FINAL_ACTION, found_instruments)
                all_recipes[index].steps.append(found_step)
                step_counter += 1

            

In [None]:
# Print a slice of the analysed recipes, each followed by an empty line.
for recipe in all_recipes[11:100]:
    print(recipe, '\n')
    

## GENERAZIONE DATABASE

Creazione indici per ricette e ingredienti

In [None]:
# Occurrence count of every ingredient name across all recipes.
macro_ingredients = {}
for recipe in all_recipes:
    for ingredient in recipe.ingredients:
        macro_ingredients[ingredient.name] = macro_ingredients.get(ingredient.name, 0) + 1

# Numeric id per ingredient name, in first-seen order.
ingredients_idx = {
    name: position for position, name in enumerate(macro_ingredients)
}

for name in ingredients_idx:
    print(name.split(" "))

# Every single word that appears in any ingredient name.
ingredients_all_words = set()
for name in ingredients_idx:
    ingredients_all_words.update(name.split(" "))
print(ingredients_all_words)

print('potato' in ingredients_all_words)

In [None]:
# Raw ingredient and step text of the first recipe.
first_recipe = all_recipes[0]
print(first_recipe.ing_str)
print(first_recipe.stp_str)

Creazione DDL ingredienti

In [None]:
ING_DB_CREATE = """
USE recipe_analysis;
DROP TABLE IF EXISTS ingredients;
CREATE TABLE ingredients(
id INT PRIMARY KEY,
name VARCHAR(100) NOT NULL
);
"""

print(ING_DB_CREATE)


# Build every INSERT first and join once (no repeated string concatenation),
# so the file is only opened when the whole script is ready.
# Names are written as double-quoted literals; embedded double quotes are
# doubled and backslashes are escaped.
# NOTE(review): backslash escaping assumes the MySQL default SQL mode - confirm.
_ing_inserts = []
for ing, ing_id in ingredients_idx.items():
    escaped_name = ing.replace("\\", "\\\\").replace('"', '""')
    _ing_inserts.append(f"INSERT INTO ingredients VALUES ({ing_id},\"{escaped_name}\");\n")
ING_DB_CREATE = ING_DB_CREATE + "".join(_ing_inserts)

with open(Path("../database/create_ingredients.sql").resolve(), "w", encoding="utf-8") as ddl_file:
    ddl_file.write(ING_DB_CREATE)

In [None]:
# Show the dataset as it stands after the cells above have run.
display(dataset)

Creazione DDL ricette

In [None]:
# NOTE(review): the two text columns are VARCHAR(500); longer recipe texts may
# be rejected or truncated by the server - confirm against the real data.
REC_DB_CREATE = """
USE recipe_analysis;
DROP TABLE IF EXISTS recipes;
CREATE TABLE recipes(
id INT PRIMARY KEY,
titolo VARCHAR(100) NOT NULL,
preparazione_nstr VARCHAR(500),
ingredienti_nstr VARCHAR(500)
);
"""

print(REC_DB_CREATE)


# Build every INSERT first and join once (no repeated string concatenation),
# so the file is only opened when the whole script is ready.
# Text values are single-quoted; embedded single quotes are doubled and
# backslashes are escaped.
# NOTE(review): backslash escaping assumes the MySQL default SQL mode - confirm.
_rec_inserts = []
for rec in all_recipes:
    corrected_tit, corrected_ing, corrected_stp = (
        text.replace("\\", "\\\\").replace("'", "''")
        for text in (rec.title, rec.ing_str, rec.stp_str)
    )
    _rec_inserts.append(
        "INSERT INTO recipes (id, titolo, ingredienti_nstr, preparazione_nstr) "
        f"VALUES ({rec.idx},'{corrected_tit}','{corrected_ing}','{corrected_stp}');\n"
    )
REC_DB_CREATE = REC_DB_CREATE + "".join(_rec_inserts)

with open(Path("../database/create_recipes.sql").resolve(), "w", encoding="utf-8") as ddl_file:
    ddl_file.write(REC_DB_CREATE)

Creazione DDL tabella relazione Ricette <--> Ingredienti

In [None]:
CON_DB_CREATE = """
USE recipe_analysis;
DROP TABLE IF EXISTS contains;
CREATE TABLE contains(
id INT NOT NULL AUTO_INCREMENT,
misura VARCHAR(50),
quantita VARCHAR(50),
proprieta VARCHAR(100),
recipeID int NOT NULL,
ingredientID int NOT NULL,
PRIMARY KEY (id),
FOREIGN KEY (recipeID) REFERENCES recipes(id),
FOREIGN KEY (ingredientID) REFERENCES ingredients(id)
);
"""

print(CON_DB_CREATE)


# One row per (recipe, named ingredient); ingredients without a name are skipped.
# Every INSERT is built first and joined once, so the file is only opened when
# the whole script is ready.
# Text values are single-quoted; embedded single quotes are doubled and
# backslashes are escaped, as a stray quote would otherwise break the statement.
# NOTE(review): backslash escaping assumes the MySQL default SQL mode - confirm.
_con_inserts = []
for r in all_recipes:
    for i in r.ingredients:
        if i.name.strip() == "":
            continue
        misura, quantita, proprieta = (
            text.replace("\\", "\\\\").replace("'", "''")
            for text in (i.size, i.quantity, ', '.join(i.adj))
        )
        data = f"('{misura}','{quantita}','{proprieta}',{r.idx}, {ingredients_idx[i.name]})"
        _con_inserts.append(
            f"INSERT INTO contains (misura, quantita, proprieta, recipeID, ingredientID) VALUES {data};\n"
        )
CON_DB_CREATE = CON_DB_CREATE + "".join(_con_inserts)

with open(Path("../database/create_contains.sql").resolve(), "w", encoding="utf-8") as ddl_file:
    ddl_file.write(CON_DB_CREATE)