# Recipes dataset preprocessing

## Importing dependencies and datasets

In [None]:
import numpy as np
import pandas as pd
from annoy import AnnoyIndex

# Read the two CSV inputs in order: the recipe details table, then the table
# of aliased ingredient names per recipe.
db_recipes, db_details = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/01_Recipe_Details.csv',
        'data/04_Recipe-Ingredients_Aliases.csv',
    )
)

In [None]:
# Build the ingredient vocabulary: one entry per distinct ingredient name.
# Names are de-duplicated *after* stripping whitespace, so two raw values that
# differ only by surrounding spaces share a single entry instead of producing
# duplicate vocabulary slots. dict.fromkeys keeps first-seen order.
ingredients = list(dict.fromkeys(
    name.strip() for name in db_details['Aliased Ingredient Name'].unique()
))
num_dims = len(ingredients)  # size of the vocabulary
# Ingredient name -> its position in `ingredients`.
dict_ingredients = {name: idx for idx, name in enumerate(ingredients)}

## Creating a vector encoding for each recipe

In [None]:
num_recipes = len(db_recipes)

# Collect each recipe's stripped ingredient names in a single pass over
# db_details, instead of re-filtering the whole frame once per recipe.
# Sets give constant-time membership tests when the vectors are built below.
ingr_by_recipe = {}
for recipe_id, name in zip(db_details['Recipe ID'],
                           db_details['Aliased Ingredient Name']):
    ingr_by_recipe.setdefault(recipe_id, set()).add(name.strip())

# One 0/1 vector per recipe, with entries ordered like `ingredients`.
# Position i is matched to 'Recipe ID' == i + 1, i.e. IDs are assumed to run
# from 1 to num_recipes; a position with no matching rows gets an all-zero
# vector.
no_ingredients = frozenset()
vectors_ingr = []
for recipe_idx in range(num_recipes):
    recipe_ingr = ingr_by_recipe.get(recipe_idx + 1, no_ingredients)
    vectors_ingr.append([1 if ingr in recipe_ingr else 0 for ingr in ingredients])

## Building the Annoy nearest-neighbour index

In [None]:
# Euclidean-metric Annoy index over the recipe vectors. Item ids are the
# positions of the vectors in vectors_ingr.
t = AnnoyIndex(num_dims, 'euclidean')
for item_id, vector in zip(range(num_recipes), vectors_ingr):
    t.add_item(item_id, vector)
t.build(40)
# Persist the built index to disk.
t.save('base.tree')

## Testing code

In [None]:
# Sanity check: list the nearest neighbours of one recipe together with how
# many ingredients each of them shares with it.
rec_id = 1  # Annoy item id; the matching 'Recipe ID' value is rec_id + 1
details_names = db_details['Aliased Ingredient Name']
# Names are stripped here so the comparison below is not thrown off by
# stray surrounding whitespace in the raw column values.
rec_ingr = [
    name.strip()
    for name in details_names[db_details['Recipe ID'] == rec_id + 1]
]

neighbors = t.get_nns_by_item(rec_id, 10)

# Distance to every neighbour other than the query itself, computed once and
# reused for both the mean and the report below.
neighbor_dist = {
    item: t.get_distance(rec_id, item) for item in neighbors if item != rec_id
}

# Guard the empty case so np.mean is never called on an empty list.
meand = np.mean(list(neighbor_dist.values())) if neighbor_dist else 0.0

# Report only the neighbours that are no farther away than the mean distance.
for item, dist in neighbor_dist.items():
    if dist <= meand:
        name = list(db_recipes[db_recipes['Recipe ID'] == item + 1]['Title'])[0]
        ingr = [
            other.strip()
            for other in details_names[db_details['Recipe ID'] == item + 1]
        ]
        ingr_set = set(ingr)
        # Count the query recipe's ingredient rows that also occur in this
        # neighbour's ingredient list.
        n_common = sum(1 for own in rec_ingr if own in ingr_set)
        print(name, '({}) ingr'.format(len(ingr)), '| Common Ingredients:', n_common, '| Dist:', dist)