In [1]:
from cocktaildb import (
    load_recipes_from_db,
)
from utils.distance import (
    build_ingredient_tree,
    build_ingredient_distance_matrix,
    build_recipe_volume_matrix,
    compute_emd,
    emd_matrix,
    knn_matrix,
)
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the recipe/ingredient rows and drop those with no volume fraction.
recipes = load_recipes_from_db()
recipes = recipes.dropna(subset=["volume_fraction"])

# Rows whose ingredient path is a single numeric segment (e.g. "/12/").
# na=False treats a missing path as "not a match"; without it the mask would
# contain NaN and boolean indexing would raise.
is_single_level = recipes["ingredient_path"].str.match(r"^/\d+/$", na=False)
# Kept as its own frame in case later cells refer to it.
recipes_single_level = recipes[is_single_level]

# Patch those single-level (parent) rows to substitution level 0. Assigning via
# the boolean mask touches exactly the matching rows, even if index labels repeat.
recipes.loc[is_single_level, "substitution_level"] = 0
# Any remaining missing substitution level is also treated as 0.
recipes["substitution_level"] = recipes["substitution_level"].fillna(0).astype(int)

# Weight passed to build_ingredient_tree via weight_col: 10 for level 0, else 1.
recipes["weight"] = recipes["substitution_level"].eq(0).map({True: 10, False: 1})


In [20]:
# Build the ingredient tree, the ingredient-to-ingredient distance matrix and
# the per-recipe volume matrix from the cleaned recipe rows.
tree, parent_map = build_ingredient_tree(recipes, weight_col="weight")
ing_dist, ing_id_to_idx = build_ingredient_distance_matrix(parent_map)
volume_matrix, recipe_id_to_idx = build_recipe_volume_matrix(
    recipes,
    ing_id_to_idx,
    recipe_id_col="recipe_id",
    ingredient_id_col="ingredient_id",
    volume_col="volume_fraction",
)

# Reverse lookups: matrix index -> id.
ing_idx_to_id = {idx: ing_id for ing_id, idx in ing_id_to_idx.items()}
recipe_idx_to_id = {idx: recipe_id for recipe_id, idx in recipe_id_to_idx.items()}

# id -> display name. Zipping the two columns avoids DataFrame.iterrows(), which
# builds a Series per row (slow) and coerces each row to a common dtype.
# Later rows still overwrite earlier ones for a repeated id, as before.
ing_id_to_name = dict(zip(recipes["ingredient_id"], recipes["ingredient_name"]))
# Recipe ids are stringified so the keys line up with the string ids used to
# index recipe_id_to_idx elsewhere in this notebook (e.g. "21", "113").
recipe_id_to_name = {
    str(recipe_id): recipe_name
    for recipe_id, recipe_name in zip(recipes["recipe_id"], recipes["recipe_name"])
}


In [4]:
# Compare two individual recipes: ids "21" and "254" (listed as 'Daiquiri' and
# 'Daiquiri #2' in the name lookup output recorded in this notebook).
source_volumes = volume_matrix[recipe_id_to_idx["21"]]
target_volumes = volume_matrix[recipe_id_to_idx["254"]]

# return_plan=True also returns the transport plan alongside the distance.
dist, plan = compute_emd(source_volumes, target_volumes, ing_dist, return_plan=True)
plan

[(10, 45, 0.09740256805982199, 2.142856497316084),
 (10, 184, 0.04545457479732086, 1.0000006455410588),
 (12, 12, 0.13636363217181133, 0.0),
 (12, 45, 0.012987021370663057, 0.2857144701545873),
 (12, 90, 0.13636363217181133, 2.7272726434362267),
 (57, 45, 0.025974042741326087, 0.8051953249811087),
 (57, 134, 0.5454545286872453, 10.909090573744907)]

In [5]:
# EMD distance matrix over the whole volume matrix. This is the slow step: the
# recorded progress bar shows 783 iterations taking about 37 s.
distmat = emd_matrix(volume_matrix, ing_dist)

# k=10 neighbours per recipe; the result unpacks into (indices, distances).
knn_result = knn_matrix(distmat, k=10)
knn_idx, knn_dist = knn_result

Computing EMD matrix:   0%|          | 0/783 [00:00<?, ?it/s]

Computing EMD matrix: 100%|██████████| 783/783 [00:37<00:00, 20.70it/s] 


In [22]:
# Neighbour row (matrix indices) for recipe id "113".
q = knn_idx[recipe_id_to_idx["113"]]
# Translate each neighbour: matrix index -> recipe id -> recipe name.
neighbor_ids = (recipe_idx_to_id[int(ii)] for ii in q)
[recipe_id_to_name[recipe_id] for recipe_id in neighbor_ids]

['Regal Daiquiri',
 'John De Piper’s\xa0Mojito',
 'La Bomba Daiquiri',
 "I Don't Mind You Shooting Me, Frank, But Take It Easy on the\xa0Rum",
 "Erick Castro's Hemingway Daiquiri",
 'Daiquiri #4',
 'Jelani Johnson’s\xa0Mojito',
 "Dan Sabo's Mojito",
 'Hole in the\xa0Fence',
 'Hemingway Daiquiri']

In [7]:
# Look up the ids of every recipe whose name mentions "daiquiri" (case-insensitive).
is_daiquiri = recipes["recipe_name"].str.lower().str.contains("daiquiri")
# One row per distinct (recipe_id, recipe_name) pair.
recipes.loc[is_daiquiri, ["recipe_id", "recipe_name"]].drop_duplicates()

Unnamed: 0,recipe_id,recipe_name
26,8,Brucato Nuclear Daiquiri
88,21,Daiquiri
460,113,Alex Day’s Daiquiri
1067,254,Daiquiri #2
1072,255,Daiquiri #4
1076,256,Daiquiri Clasico
1217,286,Dry Daiquiri
1297,301,Corduroy Daiquiri
1345,310,Don's Special Daiquiri
1351,311,Dry Pornstar Daiquiri


In [8]:
# Select the rows of every recipe whose lower-cased name contains any of the
# keywords: daiquiri, margarita, sour, bird or collins.
lowered_names = recipes["recipe_name"].str.lower()
mask = lowered_names.str.contains("daiquiri|margarita|sour|bird|collins", regex=True)
# Copy so later edits to `sours` do not touch `recipes`.
sours = recipes[mask].copy()


In [9]:
# Size of the filtered subset: distinct recipes and distinct ingredients.
# dropna=False counts a missing id as its own value, matching len(unique()).
n_recipes = sours["recipe_id"].nunique(dropna=False)
n_ingredients = sours["ingredient_id"].nunique(dropna=False)
print(f"n_recipes: {n_recipes}; n_ingredients: {n_ingredients}")

n_recipes: 64; n_ingredients: 89
