# Importing Libraries

In [167]:
import requests
import pandas as pd
import difflib
import re
from tqdm.auto import tqdm
import workers2
import sqlalchemy as db
from tqdm.auto import tqdm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from unidecode import unidecode
import spacy
import pt_core_news_sm
from multiprocessing import Pool

# Reading recipes dataset

In [80]:
recipes_df = pd.read_csv('./data/recipes_df.csv')

In [18]:
recipes_df.shape

(195351, 8)

In [19]:
recipes_df.head()

Unnamed: 0,id_recipes_df,nome,tempo_preparo,rendimento,href,modo_preparo,categoria,ingredientes
0,1945,Pavê de limão,15MIN,8 porções,https://www.tudogostoso.com.br/receita/166511-...,Bata no liquidificador o leite condensado e o...,Doces e sobremesas,"['1 lata de leite condensado', '1 lata de crem..."
1,1946,Pudim de goiabada,40MIN,4 porções,https://www.tudogostoso.com.br/receita/306360-...,Forre todo o interior de uma tigela refratári...,Doces e sobremesas,"['500 g de goiabada', '500 g de requeijão', '3..."
2,1947,Petit gateau,20MIN,1 porção,https://www.tudogostoso.com.br/receita/191668-...,Misture todos os ingredientes e deixe a massa...,Bolos e tortas doces,"['1 ovo', '1 colher de manteiga', '2 colheres ..."
3,1948,Bolo de pudim,90MIN,8 porções,https://www.tudogostoso.com.br/receita/178924-...,Bata todos os ingredientes no liquidificador ...,Doces e sobremesas,"['3 ovos', '1 lata de leite condensado', '2 la..."
4,1949,Pão de queijo fit,20MIN,6 porções,https://www.tudogostoso.com.br/receita/306341-...,Misture todos ingredientes num bowl até que f...,Alimentação Saudável,"['1 xícara de polvilho azedo', '1 xícarade que..."


## Listing all food categories

In [20]:
recipes_df['categoria'].unique()

array(['Doces e sobremesas', 'Bolos e tortas doces',
       'Alimentação Saudável', 'Massas',
       'Saladas, molhos e acompanhamentos', 'Peixes e frutos do mar',
       'Sopas', 'Lanches', 'Prato Único', 'Aves', 'Carnes', 'Bebidas',
       'Peixes', 'Light'], dtype=object)

# Creating a unique list of ingredients

Cleaning the ingredient

In [81]:
recipes_df['ingredientes'] = recipes_df['ingredientes'].apply(lambda x : unidecode(x))

In [82]:
recipes_df['ingredientes'] = recipes_df['ingredientes'].apply(lambda x : x.replace('[','').replace(']','').replace("'",'').split(', '))

Removing all words that are between parentheses

In [83]:
pattern = '\(.+\)'

In [84]:
recipes_df['ingredientes'] = recipes_df['ingredientes'].apply(lambda x: [re.sub(pattern, '', ingredients) for ingredients in x])

Using spacy library

In [25]:
nlp = pt_core_news_sm.load()

Adding some words in STOP_WORDS list to remove them from the ingredients

In [26]:
customize_stop_words = [
    'lata', 'copo', 'colher', 'pacote', 'caixa', 'caixas', 'caixinha', 'caixinhas', 'ml', 'g', 'gramas', 'l',
    'litros', 'ml', 'colheres', 'vidrinho', 'xicaras', 'xicara', 'fatias', 'pitada', 'xicarade', 'bandeja',
    'bandejas', 'kg', 'cubo', 'cubos', 'barra', 'rasas', 'rasa', 'raso', 'rasos', 'cheia', 'cheias', 'cheio',
    'cheios', 'litro', 'medio', 'grande', 'pequeno', 'dentes', 'pequena', 'pequenas', 'pequenos', 'media',
    'medias', 'medios', 'folha', 'folhas', 'medida', 'inteiro', 'inteiros', 'inteiras', 'inteira', 'e', 'saches',
    'morna', 'morno' 'cortada', 'cortadas', 'cortado', 'cortados', 'maco', 'tira', 'tirinha', 'tiras', 'tirinhas',
    'sache', 'picada', 'picadas', 'picado', 'picados', 'picadinho', 'picadinhos', 'picadinha', 'picadinhas', 'ralado',
    'ralada', 'ralados', 'raladas', 'raladinho', 'raladinhos', 'raladinha', 'raladinhas', 'misturada', 'misturado',
    'cozido', 'cozidos', 'cozida', 'cozidas', 'a', 'gosto', 'tablete', 'tabletes', 'pacotes', 'pele', 'ate', 'o',
    'copos', 'bolo', 'fina', 'flocos', 'grossas', 'grossa', 'grosso', 'grossos', 'casca', 'cascas', 'descascada',
    'descascadas', 'descascado', 'descascados', 'cascas', 'aproximadamente', 'sal', 'tempero', 'quente', 'sabor',
    'temperos', 'sopa', 'desfiado', 'desfiados', 'desfiada', 'desfiadas', 'cha', 'opcional', 'saquinho', 'saco',
    'preferencia', 'usei', 'madura', 'maduras', 'maduro', 'maduros', 'amassada', 'amassadas', 'amassado', 'amassados',
    'latas', 'rasgadas', 'rasgada', 'temperada', 'temperadas', 'temperado', 'temperados', 'vidro', 'vidrinho', 'vidros',
    'vidrinhos', 'panela', 'escolheres', 'pote', 'potes', 'potinho', 'potinhos', 'americano', 'prato', 'refratario',
    'forno', 'preaquecido', 'pronta', 'banda', 'tradicional', 'fio', 'unidade', 'unidades', 'dissolvida', 'sobra',
    'sobras', 'servir', 'separado', 'separados', 'separada', 'separadas', 'semente', 'sementes', 'cubinho',
    'cubinhos', 'ja', 'refogada', 'refogar', 'refogadas', 'refogados', 'refogado', 'frio', 'fria', 'congelada',
    'congeladas', 'congelado', 'congelados', 'fresca', 'frescas', 'fresco', 'frescos', 'diluir', 'escorrido',
    'escorrida', 'envelope', 'envelopes', 'forma', 'formas', 'forminhas', 'redondo', 'redondos', 'refinado',
    'ramo', 'ramos', 'raminhos', 'desidratada', 'desidratadas', 'desidratado', 'desidratados', 'cozinho', 'dente',
    'pressao', 'hora', 'gelado', 'gelada', 'gelados', 'geladas', 'geladinhos', 'geladinhas', 'fica', 'criterio',
    'industrializado', 'industrializada', 'industrializados', 'industrializadas', 'quantidade', 'rodelas','laminas',
    'maos', 'diferentes', 'pedaco', 'pedacos', 'grosseiros', 'suficiente', 'cobrir'
]

for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

Creating a function to clean the ingredients

In [107]:
def clean_ingredient(ingredient:str):
    '''
    The function receives an ingredient and returns a clean string of that ingredient.
    
    Example: input > '1 lata de leite condensado'
             output > 'leite condensado'
    '''
    
    ingredient = ingredient.lower()
    doc = nlp(ingredient)
    tokens = []
    for token in doc:
        if token.is_stop == False and token.is_punct == False and token.is_alpha == True:
            if token.is_digit == False:
                tokens.append(token.text)
    ingredient = ' '.join(tokens)
    if len(ingredient) > 0:
        if ingredient[-1] == 's':
            ingredient = ingredient[:-1]      ## this prevents words in plural
            
    ingredient = re.sub('gemas|claras|ovos|gema|clara', 'ovo', ingredient)
    ingredient = re.sub('fermento po', 'fermento', ingredient)
    ingredient = re.sub('azeite oliva', 'azeite', ingredient)
    ingredient = re.sub('queijo mussarela', 'mussarela', ingredient)
    ingredient = re.sub('maisena', 'maizena', ingredient)
    
    return ingredient

Creating a list with all cleaned ingredients

In [None]:
lst_ing = []
for row in tqdm(recipes_df['ingredientes']):
    if len(row) > 0:
        for ing in row:
            ingredient = clean_ingredient(ing)
            lst_ing.append(ingredient)

Now, I'm counting how many times each ingredient appears in the list and deleting the duplicated ones

In [29]:
df_ing = pd.DataFrame(lst_ing, columns=['ingrediente'])
count_ingredients_df = pd.DataFrame(df_ing['ingrediente'].value_counts()).reset_index()
count_ingredients_df.columns = ['ingrediente', 'quantidade']

In [None]:
count_ingredients_df = count_ingredients_df.drop_duplicates()

In [32]:
# remove unique ingredient that only appear 2 times or less

mask = count_ingredients_df['quantidade'] > 2
count_ingredients_df = count_ingredients_df.loc[mask, :]

In [35]:
# Removing ingredients that has only 2, or less, letters

for ingrediente in count_ingredients_df['ingrediente']:
    if len(ingrediente) < 2:
        count_ingredients_df.drop(count_ingredients_df.index[count_ingredients_df['ingrediente'] == f'{ingrediente}'], inplace=True)

# Send dataframe to database

The Dataframe of unique ingredients is now being sent to the database as table 'ingredient'

In [123]:
engine = db.create_engine(f'{DATABASE_URL}')
conn = engine.connect()
ingredient_df[['ingrediente']].to_sql('ingredient', con=conn)

# Create recipe_ingredient table in database

I'm creating a table with the relation between the recipes and ingredients. This will facilitate the search for the recipes based on the ingredients

In [149]:
engine = db.create_engine(f'{DATABASE_URL}')
conn = engine.connect()
ingredient_df = pd.read_sql('SELECT /*+ MAX_EXECUTION_TIME(1000000) */ * from recipes.ingredient', con=conn)

In [156]:
ingredient_df.head()

Unnamed: 0,id_ingrediente,ingrediente
0,1,ovo
1,2,acucar
2,3,leite
3,4,cebola
4,5,farinha trigo


In [152]:
def get_id_ingredient(name_ingredient):
    '''
    The function returns the ingredient id in the database given the name of the ingredient
    '''
    try:
        results = results = int(ingredient_df[ingredient_df['ingrediente'] == f'{name_ingredient}']['id_ingrediente'].values[0])
    except:
        results = []
    
    return results

## Create the dataframe of the relation recipe/ingredient

In [177]:
recipe_ingredient = pd.DataFrame(columns=['id_recipe', 'id_ingredient'])

In [178]:
for i in tqdm(range(recipes_df[['id_recipes_df', 'ingredientes']][130000:].shape[0])):
    id_recipe = (recipes_df[['id_recipes_df', 'ingredientes']][130000:].iloc[i, :][0])
    list_ingredient = (recipes_df[['id_recipes_df', 'ingredientes']][130000:].iloc[i, :][1])
    for ingrediente in list_ingredient:
        ing = clean_ingredient(ingrediente)
        id_ingredient = get_id_ingredient(ing)
        if type(id_ingredient) == int:
            recipe_ingredient = recipe_ingredient.append({'id_recipe' : id_recipe , 'id_ingredient' : id_ingredient} , ignore_index=True)
        else:
            for unique_ingredient in ingredient_df['ingrediente']:
                partial_ratio = fuzz.partial_ratio(ingrediente, unique_ingredient)
                if partial_ratio > 80:
                    id_ingredient = get_id_ingredient(unique_ingredient)
                    recipe_ingredient = recipe_ingredient.append({'id_recipe' : id_recipe , 'id_ingredient' : id_ingredient} , ignore_index=True)
                    break

HBox(children=(FloatProgress(value=0.0, max=65351.0), HTML(value='')))




In [179]:
recipe_ingredient

Unnamed: 0,id_recipe,id_ingredient
0,142808,71
1,142808,14625
2,142808,1139
3,142808,8
4,142808,2
...,...,...
535173,227439,3905
535174,227439,49
535175,227439,3
535176,227439,13


# Drop duplicates

In [183]:
recipe_ingredient.duplicated().sum()

17460

In [None]:
recipe_ingredient = recipe_ingredient.drop_duplicated()

# Send DataFrame to Database

In [211]:
engine = db.create_engine(f'{DATABASE_URL}')
conn = engine.connect()

In [212]:
qty_lines = recipe_ingredient.shape[0]

while qty_lines > 0:
    df_first_4000 = recipe_ingredient.iloc[:4000,:]
    conn = engine.connect()
    df_first_4000.to_sql('recipe_ingredient', con=conn, if_exists='append', index = False)
    conn.close()
    recipe_ingredient = recipe_ingredient.drop(index=list(df_first_4000.index))
    qty_lines = recipe_ingredient.shape[0]

In [80]:
recipe_ingredient = pd.read_csv('./data/recipe_ingredient.csv')

In [96]:
recipe_ingredient.to_csv('./data/recipe_ingredient.csv', index=False)

In [210]:
engine = db.create_engine(f'{DATABASE_URL}')
conn = engine.connect()
recipe_ingredient = pd.read_sql('SELECT /*+ MAX_EXECUTION_TIME(1000000) */ * from recipes.recipe_ingredient', con=conn)

In [204]:
recipe_ingredient.head()

Unnamed: 0,id_recipe_ingredient,id_recipe,id_ingredient
0,2518445,1945,7
1,2518446,1945,6
2,2518447,1945,3278
3,2518448,1945,857
4,2518449,1945,1267
