In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import re

import numpy as np
import pandas as pd
from barcart import (
    build_ingredient_distance_matrix,
    build_ingredient_tree,
    build_recipe_volume_matrix,
    emd_matrix,
    expected_ingredient_match_matrix,
    m_step_blosum,
    report_neighbors,
    em_fit,
)
from utils.db import load_ingredients_from_db, load_recipes_from_db

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load both tables, then keep only recipe rows that carry a volume fraction.
recipes = load_recipes_from_db()
ingredients = load_ingredients_from_db()
recipes = recipes.dropna(subset=["volume_fraction"])

# Parent nodes are the rows whose path is a single "/<digits>/" segment;
# their substitution level is forced to 0.
is_top_level = ingredients["path"].str.match(r"^/\d+/$")
ingredients_single_level = ingredients[is_top_level]
ingredients.loc[ingredients_single_level.index, "substitution_level"] = 0

# Any remaining missing level becomes 0, and the column is cast to int.
ingredients["substitution_level"] = (
    ingredients["substitution_level"].fillna(0).astype(int)
)

# Level-0 ingredients get weight 10; every other level gets weight 1.
ingredients["weight"] = np.where(ingredients["substitution_level"] == 0, 10, 1)


In [4]:
# Step 1: Build the ingredient tree and parent map, passing the weight column
tree, parent_map = build_ingredient_tree(
    ingredients, id_col="id", name_col="name", path_col="path", weight_col="weight"
)

# Step 2: Map every ingredient id to its name
id_to_name = {
    ingredient_id: ingredient_name
    for ingredient_id, ingredient_name in zip(ingredients["id"], ingredients["name"])
}

# Step 3: Get the cost matrix and its registry from a single call so the two
# cannot drift apart
cost_matrix, registry = build_ingredient_distance_matrix(parent_map, id_to_name)

# Step 4: From here on, address ingredients through the registry
neighbors_df = report_neighbors(cost_matrix, registry, k=5)

In [5]:
# Turn the per-recipe ingredient rows into a volume matrix, reusing the
# ingredient registry that was built together with cost_matrix. A separate
# recipe registry is returned alongside the matrix.
# NOTE(review): presumably matrix columns follow the registry's ingredient
# order and rows follow recipe_registry — confirm in barcart.
volume_matrix, recipe_registry = build_recipe_volume_matrix(
    recipes,
    registry,
    recipe_id_col="recipe_id",
    ingredient_id_col="ingredient_id",
    volume_col="volume_fraction",
)


In [6]:
# Recipe-to-recipe EMD matrix (presumably earth mover's distance — confirm in
# barcart) computed from the volume matrix and the tree-derived ingredient
# costs. return_plans=True also hands back the plans, which the
# expected_ingredient_match_matrix cell reuses instead of recomputing them.
# Slow: the recorded run took about 1m17s.
distance_matrix, plans = emd_matrix(volume_matrix, cost_matrix, return_plans=True)

Computing EMD matrix:   0%|          | 0/783 [00:00<?, ?it/s]

Computing EMD matrix: 100%|██████████| 783/783 [01:17<00:00, 10.15it/s]


In [7]:
# Neighbor report (k=5) for recipes, based on the EMD distance matrix.
recipe_nn = report_neighbors(distance_matrix, recipe_registry, k=5)

In [8]:
# Build ingredient-level counts without recomputing plans
T_sum, _ = expected_ingredient_match_matrix(
    distance_matrix,
    plans,
    len(registry),
    k=10,
    beta=1.0,
    plan_topk=3,
    plan_minfrac=0.05,
    symmetrize=True,
)


In [9]:
# Derive a new ingredient-level matrix from the match counts; it is used as
# the cost matrix for the neighbor report that follows.
# NOTE(review): presumably a BLOSUM-style update with smoothing blosum_alpha —
# confirm in barcart.
C_new = m_step_blosum(T_sum, blosum_alpha=1.0)

In [10]:
# Neighbor report (k=20) for ingredients under the re-estimated matrix C_new.
new_neighbors_df = report_neighbors(C_new, registry, k=20)

In [11]:
# Preview the first rows of the re-estimated ingredient neighbor table.
new_neighbors_df.head()

Unnamed: 0,id,name,neighbor_id,neighbor_name,distance
0,1,Whiskey,392,Abasolo Whiskey,0.910902
1,1,Whiskey,112,Scotch,0.921748
2,1,Whiskey,382,Ransom WhipperSnapper,0.92255
3,1,Whiskey,161,Japanese Whisky,0.943188
4,1,Whiskey,435,Bushmills,0.964189


In [12]:
# Run the EM fit with iters=5, starting from the tree-derived cost matrix.
# In the recorded run it printed "Converged." after iteration 3, so 5 acts as
# an upper bound there.
final_dist, final_cost, log = em_fit(volume_matrix, cost_matrix, len(registry), iters=5)

EM fit:   0%|          | 0/5 [00:00<?, ?it/s]

EM fit:  20%|██        | 1/5 [01:11<04:47, 71.76s/it]

[iter 01] pairs=7830 delta=9.7574e-01


EM fit:  40%|████      | 2/5 [02:00<02:54, 58.25s/it]

[iter 02] pairs=7830 delta=3.0433e-03


                                                     

[iter 03] pairs=7830 delta=7.5796e-04
Converged.


In [32]:
# Largest single entry of the fitted ingredient cost matrix.
np.asarray(final_cost).max()

np.float64(1.0404659370416327)

In [17]:
# Neighbor reports (k=5) from the EM-fitted matrices: ingredients from
# final_cost, recipes from final_dist.
ingredient_neighbors_df = report_neighbors(final_cost, registry, k=5)
recipe_neighbors_df = report_neighbors(final_dist, recipe_registry, k=5)

In [25]:
# Closest recipe pairs first: the 50 rows with the smallest neighbor distance.
recipe_neighbors_df.sort_values(by="distance").iloc[:50]

Unnamed: 0,id,name,neighbor_id,neighbor_name,distance
2980,602,Naked & Famous,70,Naked And Famous,0.0
2000,405,Gin Fizz,411,Gin Sour,0.0
3170,640,The St-Germain Spritz,744,St-Germain Spritz,0.0
335,70,Naked And Famous,602,Naked & Famous,0.0
2030,411,Gin Sour,405,Gin Fizz,0.0
2060,417,Giuseppe González’s Jungle Bird,347,Ezra Star's Jungle Bird,0.0
310,65,Last Word,668,"Read, No Reply",0.0
1715,347,Ezra Star's Jungle Bird,417,Giuseppe González’s Jungle Bird,0.0
3310,668,"Read, No Reply",65,Last Word,0.0
3690,744,St-Germain Spritz,640,The St-Germain Spritz,0.0


In [26]:
# Weakest matches first: the 50 rows with the largest neighbor distance.
ingredient_neighbors_df.sort_values(by="distance", ascending=False).iloc[:50]

Unnamed: 0,id,name,neighbor_id,neighbor_name,distance
913,173,Blackberries,469,Lemon Curd,0.999872
914,173,Blackberries,476,Sprig Rosemary,0.999872
2154,435,Bushmills,452,Guava Liqueur,0.999849
1759,356,Hibiscus Syrup,449,Guava Puree,0.999843
1758,356,Hibiscus Syrup,470,Sprig Thyme,0.999843
787,146,Rum Fire,114,Herbal Liqueur,0.999832
788,146,Rum Fire,476,Sprig Rosemary,0.999832
789,146,Rum Fire,154,Fruit / Vegetable,0.999832
2113,427,Foursquare Probitas,450,Mango Puree,0.999824
2114,427,Foursquare Probitas,272,Single Malt Scotch,0.999824
