In [1]:
import pandas as pd
from sklearn.cluster import SpectralCoclustering, SpectralBiclustering
import numpy as np
import umap
from sklearn.metrics import pairwise_distances

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_parquet("../rationalized_matrix_20250705_152129.parquet")

In [12]:
rdf = df.loc["Zombie Panda", :]
rdf[rdf>0]

category  specific_type  brand  
juice     lemon juice    generic    22.180125
Name: Zombie Panda, dtype: float64

In [3]:
grouped_cols = df.groupby(level=[0, 1], axis=1)
df_collapsed = grouped_cols.sum()
# Create normalized dataframe where each row sums to 1
df_normalized = df_collapsed.div(df_collapsed.sum(axis=1), axis=0)
# Drop columns that are all 0 or NaN
df_normalized = df_normalized.fillna(0)
df_normalized = df_normalized.loc[:, (df_normalized != 0).any(axis=0)]
df_normalized = df_normalized.loc[(df_normalized != 0).any(axis=1),:]


  grouped_cols = df.groupby(level=[0, 1], axis=1)


In [4]:


distance_matrix_sklearn = pairwise_distances(df_normalized, metric='manhattan')

distance_df_sklearn = pd.DataFrame(
    distance_matrix_sklearn, 
    index=df_normalized.index, 
    columns=df_normalized.index
)


In [6]:
import altair as alt

# Create UMAP embedding from the distance matrix
reducer: umap.UMAP = umap.UMAP(
    n_neighbors=15, 
    n_components=2, 
    metric='precomputed', 
    random_state=42
)
embedding: np.ndarray = reducer.fit_transform(distance_matrix_sklearn)

# Create a DataFrame with the embedding coordinates and names
embedding_df: pd.DataFrame = pd.DataFrame(
    embedding,
    columns=['UMAP1', 'UMAP2'],
    index=distance_df_sklearn.index
)
embedding_df['name'] = embedding_df.index

# Add recipe information for each cocktail
def get_recipe_string(cocktail_name: str) -> str:
    """Get the recipe as a formatted string for a given cocktail."""
    recipe = df_normalized.loc[cocktail_name, :]
    ingredients = recipe[recipe > 0]
    
    if len(ingredients) == 0:
        return "No ingredients found"
    
    # Format ingredients with proportions
    recipe_parts = []
    for ingredient, proportion in ingredients.items():
        # Handle multi-level column names if they exist
        if isinstance(ingredient, tuple):
            ingredient_name = ' - '.join(str(part) for part in ingredient)
        else:
            ingredient_name = str(ingredient)
        recipe_parts.append(f"{ingredient_name}: {proportion:.3f}")
    
    return " | ".join(recipe_parts)

# Add recipe strings to the embedding dataframe
embedding_df['recipe'] = embedding_df['name'].apply(get_recipe_string)

# Create the Altair chart
chart: alt.Chart = alt.Chart(embedding_df.reset_index()).mark_circle(
    size=100, 
    opacity=0.7
).encode(
    x=alt.X('UMAP1:Q', title='UMAP Dimension 1'),
    y=alt.Y('UMAP2:Q', title='UMAP Dimension 2'),
    tooltip=['name:N', 'recipe:N'],
    color=alt.value('steelblue'),
    stroke=alt.value('white'),
    strokeWidth=alt.value(1)
).properties(
    width=600,
    height=500,
    title='UMAP Embedding of Cocktail Recipes'
).interactive()

chart.show()

  warn("using precomputed metric; inverse_transform will be unavailable")
  warn(


In [19]:
clusters = SpectralCoclustering(n_clusters=20, svd_method="arpack")

In [20]:
clusters.fit(df_normalized.fillna(0))

0,1,2
,n_clusters,20
,svd_method,'arpack'
,n_svd_vecs,
,mini_batch,False
,init,'k-means++'
,n_init,10
,random_state,


In [40]:
df_normalized.index[clusters.row_labels_ == 1]

Index([], dtype='object', name='recipe_name')

In [41]:
c = SpectralBiclustering(n_clusters=10)

In [42]:
c.fit(df_normalized)

0,1,2
,n_clusters,10
,method,'bistochastic'
,n_components,6
,n_best,3
,svd_method,'randomized'
,n_svd_vecs,
,mini_batch,False
,init,'k-means++'
,n_init,10
,random_state,


In [44]:
df_normalized.columns[c.column_labels_==1]

MultiIndex([('juice', 'lime juice')],
           names=['category', 'specific_type'])

In [60]:
df.columns[c.column_labels_==4]

Index(['vodka'], dtype='object')

In [64]:
df.index[c.row_labels_==0]

Index(['Rhubarbra Streisand', 'Songs About Keri', 'Green Gazoo',
       'Gradda’s Tamal', 'Gin Coco No. 2', 'Beton', 'Coronation No. 1',
       'Santo Libre', 'Kangaroo', 'Strega Drop',
       ...
       'Fast Canoe', 'Leeward Negroni', 'Pandan-quiri', 'Rosé Piscine',
       'Butter Martini', 'Queen Bee', 'Garret Richard’s Mai Tai',
       'Groove is in the Heart', 'Garibaldi Spritz',
       'Fermented Citrus Garibaldi'],
      dtype='object', length=983)

In [None]:
import altair as alt

# Create UMAP embedding from the distance matrix
reducer: umap.UMAP = umap.UMAP(
    n_neighbors=15, 
    n_components=2, 
    metric='precomputed', 
    random_state=42
)
embedding: np.ndarray = reducer.fit_transform(distance_matrix_sklearn)

# Create a DataFrame with the embedding coordinates and names
embedding_df: pd.DataFrame = pd.DataFrame(
    embedding,
    columns=['UMAP1', 'UMAP2'],
    index=distance_df_sklearn.index
)
embedding_df['name'] = embedding_df.index

# Create the Altair chart
chart: alt.Chart = alt.Chart(embedding_df.reset_index()).mark_circle(
    size=100, 
    opacity=0.7
).encode(
    x=alt.X('UMAP1:Q', title='UMAP Dimension 1'),
    y=alt.Y('UMAP2:Q', title='UMAP Dimension 2'),
    tooltip=['name:N', 'UMAP1:Q', 'UMAP2:Q'],
    color=alt.value('steelblue'),
    stroke=alt.value('white'),
    strokeWidth=alt.value(1)
).properties(
    width=600,
    height=500,
    title='UMAP Embedding of Cocktail Recipes'
).interactive()

chart.show()
