In [1]:
import torch
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
from torch_geometric.nn import Node2Vec
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
import umap
from utils import load_data, get_recipe_string
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import itertools


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_normalized, df_bool_collapsed = load_data()


  grouped_cols = df.groupby(level=[0, 1], axis=1)
  grouped_cols = df_bool.groupby(level=[0, 1], axis=1)


In [3]:
# Create early stopping function
def train_with_early_stopping_v1(model, loader, optimizer, device, patience=10, min_delta=1e-4, max_epochs=100):
    """Early stopping based on loss plateau"""
    model.train()
    
    best_loss = float('inf')
    epochs_without_improvement = 0
    loss_history = []
    
    for epoch in range(max_epochs):
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(loader)
        loss_history.append(avg_loss)
        
        print(f'Epoch {epoch:03d}, Loss: {avg_loss:.4f}')
        
        if avg_loss < best_loss - min_delta:
            best_loss = avg_loss
            epochs_without_improvement = 0
            print(f'    -> New best loss: {best_loss:.4f}')
        else:
            epochs_without_improvement += 1
            print(f'    -> No improvement for {epochs_without_improvement} epochs')
        
        if epochs_without_improvement >= patience:
            print(f'Early stopping triggered after {epoch + 1} epochs')
            print(f'Best loss: {best_loss:.4f}')
            break
    
    return loss_history


In [4]:
# Create bipartite graph
recipes, ingredients = df_bool_collapsed.shape
print(f"Recipes: {recipes}, Ingredients: {ingredients}")

# Create edge list for bipartite graph
rows, cols = np.nonzero(df_bool_collapsed)
# Map ingredient indices to start after recipe indices
edge_index = torch.tensor(
    [
        np.concatenate([rows, cols + recipes]),  # source nodes
        np.concatenate([cols + recipes, rows]),  # target nodes (bidirectional)
    ],
    dtype=torch.long,
)

print(f"Total nodes: {recipes + ingredients}")
print(f"Total edges: {edge_index.shape[1]}")

# Create PyTorch Geometric data object
data = Data(edge_index=edge_index, num_nodes=recipes + ingredients)

splitter = RandomLinkSplit(
    num_val=0,          # No validation edges
    num_test=0.1,         # 10% test edges
    is_undirected=True,   # recipe–ingredient graph is undirected
    neg_sampling_ratio=1, # 1 negative edge per positive
    add_negative_train_samples=False,  # Node2Vec does its own negative sampling
    split_labels=True,
    key="edge_label",
)
train_data, _, test_data = splitter(data)

print(f"Train edges: {train_data.edge_index.shape[1]}")
print(f"Test positive edges: {test_data.pos_edge_label_index.shape[1]}")
print(f"Test negative edges: {test_data.neg_edge_label_index.shape[1]}")


Recipes: 1517, Ingredients: 303
Total nodes: 1820
Total edges: 12918
Train edges: 11628
Test positive edges: 645
Test negative edges: 645


  edge_index = torch.tensor(


In [5]:
def edge_operator(u, v):                    # simple Hadamard product
    return u * v


def get_edge_features_bipartite(embeddings, test_data, recipes):
    """Get edge features for bipartite graph, handling both recipe->ingredient and ingredient->recipe edges"""
    
    recipe_embeddings = embeddings[:recipes]
    ingredient_embeddings = embeddings[recipes:]
    
    def get_features_for_edges(edge_index):
        """Extract features for a set of edges, handling both directions"""
        source_nodes = edge_index[0]
        target_nodes = edge_index[1]
        
        features = []
        
        for i in range(len(source_nodes)):
            src = source_nodes[i].item()
            tgt = target_nodes[i].item()
            
            # Determine node types
            src_is_recipe = src < recipes
            tgt_is_recipe = tgt < recipes
            
            if src_is_recipe and not tgt_is_recipe:
                # Recipe -> Ingredient
                recipe_emb = recipe_embeddings[src]
                ingredient_emb = ingredient_embeddings[tgt - recipes]
            elif not src_is_recipe and tgt_is_recipe:
                # Ingredient -> Recipe  
                ingredient_emb = ingredient_embeddings[src - recipes]
                recipe_emb = recipe_embeddings[tgt]
            else:
                # Skip same-type edges (shouldn't happen in bipartite graph)
                print(f"Warning: Same-type edge found: {src} -> {tgt}")
                continue
                
            # Apply edge operator (always recipe × ingredient)
            feature = edge_operator(recipe_emb, ingredient_emb)
            features.append(feature)
        
        return np.array(features)
    
    # Get features for positive and negative edges
    X_pos = get_features_for_edges(test_data.pos_edge_label_index)
    X_neg = get_features_for_edges(test_data.neg_edge_label_index)
    
    print(f"Positive edge features: {X_pos.shape}")
    print(f"Negative edge features: {X_neg.shape}")
    
    return X_pos, X_neg


def train_and_evaluate_node2vec(data: Data, embedding_dim=64, p=1.0, q=0.5, walk_length=20, context_size=10, walks_per_node=10, seed=42) -> tuple[float, object]:
    """Train and evaluate Node2Vec model with proper parameter handling"""
    # Set random seed for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # FIXED: Use the passed data parameter instead of global train_data
    model = Node2Vec(
        data.edge_index,  # Use the passed data parameter
        embedding_dim=embedding_dim,
        walk_length=walk_length,
        context_size=context_size,
        walks_per_node=walks_per_node,
        num_negative_samples=1,
        p=p,  # return parameter
        q=q,  # in-out parameter
        sparse=True,
    ).to(device)

    # Create data loader
    loader = model.loader(batch_size=128, shuffle=True, num_workers=4)

    # Optimizer
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
    train_with_early_stopping_v1(
        model, loader, optimizer, device, patience=10, min_delta=1e-4, max_epochs=100
    )
    model.eval()
    with torch.no_grad():
        embeddings = model().cpu().numpy()

    # FIXED: Properly handle bipartite edge prediction
    X_pos, X_neg = get_edge_features_bipartite(embeddings, test_data, recipes)
    
    # Combine positive and negative features
    X_val = np.vstack([X_pos, X_neg])

    # Create labels: 1 for positive edges, 0 for negative edges
    y_pos = np.ones(X_pos.shape[0])
    y_neg = np.zeros(X_neg.shape[0])
    y_val = np.concatenate([y_pos, y_neg])

    # Train classifier and compute AUC
    clf = LogisticRegression(max_iter=1000).fit(X_val, y_val)
    auc = roc_auc_score(y_val, clf.predict_proba(X_val)[:,1])

    print(f"Parameters: p={p:.2f}, q={q:.2f}, dim={embedding_dim}, AUC={auc:.4f}")
    return auc, model


In [6]:
# Test with 2D embeddings to investigate the high AUC
print("Testing 2D embeddings...")
auc_2d, model_2d = train_and_evaluate_node2vec(
    data=train_data, embedding_dim=2, p=1.0, q=1.0, seed=42
)


Testing 2D embeddings...
Epoch 000, Loss: 1.7064
    -> New best loss: 1.7064
Epoch 001, Loss: 1.5982
    -> New best loss: 1.5982
Epoch 002, Loss: 1.5253
    -> New best loss: 1.5253
Epoch 003, Loss: 1.4819
    -> New best loss: 1.4819
Epoch 004, Loss: 1.4512
    -> New best loss: 1.4512
Epoch 005, Loss: 1.4292
    -> New best loss: 1.4292
Epoch 006, Loss: 1.4130
    -> New best loss: 1.4130
Epoch 007, Loss: 1.3959
    -> New best loss: 1.3959
Epoch 008, Loss: 1.3760
    -> New best loss: 1.3760
Epoch 009, Loss: 1.3517
    -> New best loss: 1.3517
Epoch 010, Loss: 1.3254
    -> New best loss: 1.3254
Epoch 011, Loss: 1.2966
    -> New best loss: 1.2966
Epoch 012, Loss: 1.2697
    -> New best loss: 1.2697
Epoch 013, Loss: 1.2477
    -> New best loss: 1.2477
Epoch 014, Loss: 1.2294
    -> New best loss: 1.2294
Epoch 015, Loss: 1.2164
    -> New best loss: 1.2164
Epoch 016, Loss: 1.2045
    -> New best loss: 1.2045
Epoch 017, Loss: 1.1952
    -> New best loss: 1.1952
Epoch 018, Loss: 1.18

In [7]:
# Analyze the 2D embeddings to understand why AUC is so high
def analyze_2d_embeddings_fixed(model, auc_score, recipes):
    """Analyze the 2D embeddings to understand why AUC is so high - FIXED VERSION"""
    
    print(f"\n=== ANALYZING 2D EMBEDDINGS (AUC: {auc_score:.4f}) ===\n")
    
    # Get 2D embeddings
    model.eval()
    with torch.no_grad():
        embeddings_2d = model().cpu().numpy()
    
    print(f"Embedding shape: {embeddings_2d.shape}")
    print(f"Embedding range: [{embeddings_2d.min():.3f}, {embeddings_2d.max():.3f}]")
    print(f"Embedding mean: {embeddings_2d.mean():.3f}")
    print(f"Embedding std: {embeddings_2d.std():.3f}")
    
    # Split into recipe and ingredient embeddings
    recipe_embeddings_2d = embeddings_2d[:recipes]
    ingredient_embeddings_2d = embeddings_2d[recipes:]
    
    print(f"\nRecipe embeddings: {recipe_embeddings_2d.shape}")
    print(f"Ingredient embeddings: {ingredient_embeddings_2d.shape}")
    
    # Get edge features using the fixed function
    print("\n=== EDGE PREDICTION ANALYSIS ===")
    X_pos, X_neg = get_edge_features_bipartite(embeddings_2d, test_data, recipes)
    
    print(f"\nPositive edge features stats:")
    print(f"  Mean: {X_pos.mean(axis=0)}")
    print(f"  Std: {X_pos.std(axis=0)}")
    print(f"  Min: {X_pos.min(axis=0)}")
    print(f"  Max: {X_pos.max(axis=0)}")
    
    print(f"\nNegative edge features stats:")
    print(f"  Mean: {X_neg.mean(axis=0)}")
    print(f"  Std: {X_neg.std(axis=0)}")
    print(f"  Min: {X_neg.min(axis=0)}")
    print(f"  Max: {X_neg.max(axis=0)}")
    
    # Check separability
    print(f"\n=== SEPARABILITY ANALYSIS ===")
    pos_norms = np.linalg.norm(X_pos, axis=1)
    neg_norms = np.linalg.norm(X_neg, axis=1)
    
    print(f"Positive edge feature norms: mean={pos_norms.mean():.4f}, std={pos_norms.std():.4f}")
    print(f"Negative edge feature norms: mean={neg_norms.mean():.4f}, std={neg_norms.std():.4f}")
    
    # Simple threshold test
    threshold = (pos_norms.mean() + neg_norms.mean()) / 2
    pos_correct = (pos_norms > threshold).sum()
    neg_correct = (neg_norms < threshold).sum()
    simple_accuracy = (pos_correct + neg_correct) / (len(pos_norms) + len(neg_norms))
    
    print(f"\nSimple threshold classification:")
    print(f"  Threshold: {threshold:.4f}")
    print(f"  Accuracy: {simple_accuracy:.4f}")
    
    if simple_accuracy > 0.8:
        print(f"  ⚠️  Features are very separable - this explains the high AUC!")
    
    return embeddings_2d, X_pos, X_neg

# Analyze the 2D embeddings
embeddings_2d, X_pos, X_neg = analyze_2d_embeddings_fixed(model_2d, auc_2d, recipes)



=== ANALYZING 2D EMBEDDINGS (AUC: 0.7962) ===

Embedding shape: (1820, 2)
Embedding range: [-6.215, 7.092]
Embedding mean: 0.057
Embedding std: 0.712

Recipe embeddings: (1517, 2)
Ingredient embeddings: (303, 2)

=== EDGE PREDICTION ANALYSIS ===
Positive edge features: (645, 2)
Negative edge features: (189, 2)

Positive edge features stats:
  Mean: [0.3812508 0.7055534]
  Std: [0.9396719 1.2872707]
  Min: [ -4.4554725 -18.749989 ]
  Max: [3.639674  4.3399587]

Negative edge features stats:
  Mean: [0.08633984 0.19066608]
  Std: [0.40767556 0.5024476 ]
  Min: [-1.8413185 -1.8431976]
  Max: [1.8894529 2.5693538]

=== SEPARABILITY ANALYSIS ===
Positive edge feature norms: mean=1.3262, std=1.1935
Negative edge feature norms: mean=0.4106, std=0.5421

Simple threshold classification:
  Threshold: 0.8684
  Accuracy: 0.6631


In [8]:
# Quick baseline check
def random_baseline_check_fixed():
    """Check what AUC we get with random predictions"""
    
    print("=== RANDOM BASELINE CHECK ===\n")
    
    # Get the number of positive and negative test edges
    n_pos = len(X_pos)
    n_neg = len(X_neg)
    n_total = n_pos + n_neg
    
    print(f"Test set size: {n_total} edges ({n_pos} positive, {n_neg} negative)")
    
    # Create random predictions
    np.random.seed(42)
    random_predictions = np.random.random(n_total)
    true_labels = np.concatenate([np.ones(n_pos), np.zeros(n_neg)])
    
    # Calculate AUC for random predictions
    random_auc = roc_auc_score(true_labels, random_predictions)
    print(f"Random baseline AUC: {random_auc:.4f}")
    print(f"Expected random AUC: ~0.5000")
    
    if random_auc > 0.6 or random_auc < 0.4:
        print("⚠️  Random baseline is far from 0.5 - this suggests data imbalance issues!")
    else:
        print("✅ Random baseline looks normal")
    
    # Compare with your 2D model
    print(f"\nYour 2D model AUC: {auc_2d:.4f}")
    print(f"Improvement over random: {(auc_2d - 0.5) / 0.5 * 100:.1f}%")
    
    return random_auc

random_auc = random_baseline_check_fixed()


=== RANDOM BASELINE CHECK ===

Test set size: 834 edges (645 positive, 189 negative)
Random baseline AUC: 0.4908
Expected random AUC: ~0.5000
✅ Random baseline looks normal

Your 2D model AUC: 0.7962
Improvement over random: 59.2%


In [9]:
# Analyze the same-type edge warnings
def analyze_same_type_edges():
    """Analyze what the same-type edge warnings mean"""
    
    print("=== ANALYZING SAME-TYPE EDGE WARNINGS ===\n")
    
    print("Your node ranges:")
    print(f"  Recipes: 0 to {recipes-1} (total: {recipes})")
    print(f"  Ingredients: {recipes} to {recipes + ingredients - 1} (total: {ingredients})")
    print()
    
    # Analyze test edges
    print("Analyzing test edges...")
    
    # Check positive edges
    pos_src = test_data.pos_edge_label_index[0].numpy()
    pos_tgt = test_data.pos_edge_label_index[1].numpy()
    
    pos_recipe_to_recipe = 0
    pos_ingredient_to_ingredient = 0
    pos_valid_bipartite = 0
    
    for src, tgt in zip(pos_src, pos_tgt):
        src_is_recipe = src < recipes
        tgt_is_recipe = tgt < recipes
        
        if src_is_recipe and tgt_is_recipe:
            pos_recipe_to_recipe += 1
        elif not src_is_recipe and not tgt_is_recipe:
            pos_ingredient_to_ingredient += 1
        else:
            pos_valid_bipartite += 1
    
    print(f"POSITIVE test edges:")
    print(f"  Recipe -> Recipe: {pos_recipe_to_recipe}")
    print(f"  Ingredient -> Ingredient: {pos_ingredient_to_ingredient}")
    print(f"  Valid bipartite: {pos_valid_bipartite}")
    print(f"  Total: {len(pos_src)}")
    print()
    
    # Check negative edges
    neg_src = test_data.neg_edge_label_index[0].numpy()
    neg_tgt = test_data.neg_edge_label_index[1].numpy()
    
    neg_recipe_to_recipe = 0
    neg_ingredient_to_ingredient = 0
    neg_valid_bipartite = 0
    
    for src, tgt in zip(neg_src, neg_tgt):
        src_is_recipe = src < recipes
        tgt_is_recipe = tgt < recipes
        
        if src_is_recipe and tgt_is_recipe:
            neg_recipe_to_recipe += 1
        elif not src_is_recipe and not tgt_is_recipe:
            neg_ingredient_to_ingredient += 1
        else:
            neg_valid_bipartite += 1
    
    print(f"NEGATIVE test edges:")
    print(f"  Recipe -> Recipe: {neg_recipe_to_recipe}")
    print(f"  Ingredient -> Ingredient: {neg_ingredient_to_ingredient}")
    print(f"  Valid bipartite: {neg_valid_bipartite}")
    print(f"  Total: {len(neg_src)}")
    print()
    
    # Explain the problem
    total_same_type = pos_recipe_to_recipe + pos_ingredient_to_ingredient + neg_recipe_to_recipe + neg_ingredient_to_ingredient
    
    print("=== WHAT THIS MEANS ===")
    print(f"🚨 Found {total_same_type} same-type edges in test set!")
    print()
    print("❌ PROBLEM: In a bipartite graph, edges should ONLY exist between different node types:")
    print("   ✅ Recipe ↔ Ingredient (valid)")
    print("   ❌ Recipe ↔ Recipe (invalid)")
    print("   ❌ Ingredient ↔ Ingredient (invalid)")
    print()
    print("🔍 WHY THIS HAPPENS:")
    print("   The RandomLinkSplit negative sampling is creating invalid edges")
    print("   that violate the bipartite constraint!")
    print()
    print("📊 IMPACT ON RESULTS:")
    print("   - Your model correctly can't predict these invalid edges")
    print("   - This explains the lower accuracy (0.66 instead of ~0.9)")
    print("   - These edges should be filtered out of the evaluation")
    
    return (pos_recipe_to_recipe, pos_ingredient_to_ingredient, pos_valid_bipartite,
            neg_recipe_to_recipe, neg_ingredient_to_ingredient, neg_valid_bipartite)

same_type_stats = analyze_same_type_edges()



Your node ranges:
  Recipes: 0 to 1516 (total: 1517)
  Ingredients: 1517 to 1819 (total: 303)

Analyzing test edges...
POSITIVE test edges:
  Recipe -> Recipe: 0
  Ingredient -> Ingredient: 0
  Valid bipartite: 645
  Total: 645

NEGATIVE test edges:
  Recipe -> Recipe: 437
  Ingredient -> Ingredient: 19
  Valid bipartite: 189
  Total: 645

=== WHAT THIS MEANS ===
🚨 Found 456 same-type edges in test set!

❌ PROBLEM: In a bipartite graph, edges should ONLY exist between different node types:
   ✅ Recipe ↔ Ingredient (valid)
   ❌ Recipe ↔ Recipe (invalid)
   ❌ Ingredient ↔ Ingredient (invalid)

🔍 WHY THIS HAPPENS:
   The RandomLinkSplit negative sampling is creating invalid edges
   that violate the bipartite constraint!

📊 IMPACT ON RESULTS:
   - Your model correctly can't predict these invalid edges
   - This explains the lower accuracy (0.66 instead of ~0.9)
   - These edges should be filtered out of the evaluation


In [14]:
recipe_embeddings_2d = embeddings_2d[:recipes]
ingredient_embeddings_2d = embeddings_2d[recipes:]

In [15]:
reducer = umap.UMAP(
    n_neighbors=5,
    n_components=2,
    metric="cosine",  # Cosine distance works well for factor matrices
    random_state=42,
)
recipe_umap = reducer.fit_transform(recipe_embeddings_2d)

# Create DataFrame
recipe_embedding_df = pd.DataFrame(
    {
        "UMAP1": recipe_umap[:, 0],
        "UMAP2": recipe_umap[:, 1],
        "recipe_name": df_normalized.index,
    }
)
recipe_embedding_df["ingredients"] = recipe_embedding_df["recipe_name"].apply(
    lambda x: get_recipe_string(df_normalized, x)
)

ingredient_umap = reducer.fit_transform(ingredient_embeddings_2d)
ingredient_embedding_df = pd.DataFrame(
    {
        "UMAP1": ingredient_umap[:, 0],
        "UMAP2": ingredient_umap[:, 1],
        "ingredient_name": df_normalized.columns,
    }
)


  warn(


In [16]:

# Create interactive plot with Altair
chart = alt.Chart(recipe_embedding_df).mark_circle().add_params(
    alt.selection_point()
).encode(
    x=alt.X('UMAP1:Q', title='UMAP Dimension 1'),
    y=alt.Y('UMAP2:Q', title='UMAP Dimension 2'),
    tooltip=['recipe_name:N', 'ingredients:N']
).properties(
    width=800,
    height=600,
    title='Recipe Clusters - UMAP Visualization'
).interactive()

# Display the chart
chart.show()
