In [4]:
# Imports
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack


# Load merged dataset
df = pd.read_csv("merged_recipe_price_nodupl.csv")

# Resolve the price used downstream: keep an existing PRICE_USED value,
# otherwise fall back to PRICE_CURRENT, then to PRICE_RETAIL.
if "PRICE_USED" not in df.columns:
    df["PRICE_USED"] = df["PRICE_CURRENT"].fillna(df["PRICE_RETAIL"])
else:
    df["PRICE_USED"] = df["PRICE_USED"].fillna(df["PRICE_CURRENT"]).fillna(df["PRICE_RETAIL"])

# Drop rows without essential info
df = df.dropna(subset=["PRICE_USED", "ingredient", "title"])

# Ensure numeric columns exist and are filled.
# The median of an all-NaN column is itself NaN, so filling with it would
# leave the column unfilled and the NaNs would reach the clustering step.
# Fall back to 0.0 in that case so the feature matrix stays finite.
# NOTE: a match_score of 0.0 is below the default min_match_score of
# optimize_recipe_cost, so such rows are still treated as poor matches.
if "match_score" not in df.columns:
    df["match_score"] = np.nan
match_score_median = df["match_score"].median()
df["match_score"] = df["match_score"].fillna(
    0.0 if pd.isna(match_score_median) else match_score_median
)

# Non-numeric sizes are coerced to NaN and then filled like missing values.
df["PRODUCT_SIZE"] = pd.to_numeric(df.get("PRODUCT_SIZE", np.nan), errors="coerce")
product_size_median = df["PRODUCT_SIZE"].median()
df["PRODUCT_SIZE"] = df["PRODUCT_SIZE"].fillna(
    0.0 if pd.isna(product_size_median) else product_size_median
)

# GMM Clustering
# Builds one feature row per row of df (bag-of-words counts of the ingredient
# text plus three standardized numeric columns), fits a Gaussian mixture on
# it, and stores the predicted component index in df["cluster"].

# Text representation: prefer ingredient_clean if present
text_col = "ingredient_clean" if "ingredient_clean" in df.columns else "ingredient"
# astype(str) makes every value a string; a missing value becomes "nan".
text_data = df[text_col].astype(str)

# Token counts limited to the 400 most frequent terms, English stop words removed.
vectorizer = CountVectorizer(max_features=400, stop_words="english")
text_features = vectorizer.fit_transform(text_data)

# Numeric features: match_score, PRICE_USED, PRODUCT_SIZE
num_features = df[["match_score", "PRICE_USED", "PRODUCT_SIZE"]].values

# Scale numeric features
scaler = StandardScaler()
num_scaled = scaler.fit_transform(num_features)

# Combine sparse text + dense numeric
# (text columns first, then the three scaled numeric columns)
X = hstack([text_features, num_scaled])

# GaussianMixture expects dense array
# NOTE: this materializes the full rows x (vocabulary + 3) matrix in memory.
X_dense = X.toarray()

# Fit GMM
n_components = 12  # number of mixture components, i.e. clusters
gmm = GaussianMixture(
    n_components=n_components,
    covariance_type="diag",
    random_state=42
)
gmm.fit(X_dense)

# Assign cluster labels
# Rows of X_dense are in the same order as df, so the labels line up positionally.
df["cluster"] = gmm.predict(X_dense)

# LDA classifier for clusters
# The targets are the GMM labels assigned above, so the accuracies printed
# below measure how well LDA reproduces the GMM partition from the same
# features, not agreement with any external ground truth.
# 70/30 split, stratified so each cluster keeps its proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_dense,
    df["cluster"],
    test_size=0.3,
    random_state=42,
    stratify=df["cluster"]
)

lda = LDA()
lda.fit(X_train, y_train)
preds_train = lda.predict(X_train)
preds_test = lda.predict(X_test)

# Held-out accuracy first, then accuracy on the data the model was fitted on.
print(f"LDA classification accuracy (on testing set): {accuracy_score(y_test, preds_test):.2f}")
print(f"LDA classification accuracy (on training set): {accuracy_score(y_train, preds_train):.2f}")

# Cost Optimization
def optimize_recipe_cost(recipe_name, df, min_match_score=80.0):
    """
    For a given recipe title, select one product per ingredient.

    Steps:
    - Keep rows whose "title" equals recipe_name, ignoring case. Every
      matching row is pooled, so if several recipes share the same title
      their ingredients are combined into one result.
    - Drop rows without a PRICE_USED value.
    - If a "match_score" column exists, keep rows with
      match_score >= min_match_score.
    - Within each ingredient, if a "cluster" column exists and the rows span
      more than one cluster, keep only the most frequent cluster.
    - Pick the row with the lowest PRICE_USED for that ingredient.

    Args:
        recipe_name: recipe title to look up (string).
        df: DataFrame with at least "title", "ingredient" and "PRICE_USED".
        min_match_score: lowest match_score a row may have to be considered.

    Returns:
        (result_dataframe, total_cost). The result holds whichever of
        ingredient / PRODUCT_NAME / BRAND / PRICE_USED / cluster are present,
        with PRICE_USED renamed to Selected_Price. total_cost is the sum of
        the selected prices. Returns (None, None), after printing a message,
        when the recipe is not found or no row survives the filters.
    """

    # Filter to this recipe
    mask = df["title"].str.lower() == recipe_name.lower()
    recipe_df = df[mask].copy()

    if recipe_df.empty:
        print(f"No recipe found with title: {recipe_name}")
        return None, None

    # Remove rows without usable price
    recipe_df = recipe_df.dropna(subset=["PRICE_USED"])

    # Restrict to good fuzzy matches
    if "match_score" in recipe_df.columns:
        recipe_df = recipe_df[recipe_df["match_score"] >= min_match_score]

    if recipe_df.empty:
        print(f"No usable ingredient matches for recipe: {recipe_name}")
        return None, None

    chosen_rows = []

    # Group by ingredient so we keep one product per ingredient
    for _, group in recipe_df.groupby("ingredient"):
        # If multiple clusters for this ingredient, keep the dominant cluster
        if "cluster" in group.columns and group["cluster"].nunique() > 1:
            dominant_cluster = group["cluster"].value_counts().idxmax()
            group = group[group["cluster"] == dominant_cluster]

        # Pick the product with minimum package price.
        # Select by position rather than by label: with repeated index
        # labels, idxmin() followed by .loc[label] returns several rows
        # instead of one. NaN prices were already dropped above.
        cheapest_pos = group["PRICE_USED"].to_numpy().argmin()
        chosen_rows.append(group.iloc[cheapest_pos])

    # Reachable when every "ingredient" value is missing (groupby skips NaN keys)
    if not chosen_rows:
        print(f"No ingredients could be optimized for recipe: {recipe_name}")
        return None, None

    result = pd.DataFrame(chosen_rows)

    total_cost = result["PRICE_USED"].sum()

    # Clean output table
    cols_out = ["ingredient", "PRODUCT_NAME", "BRAND", "PRICE_USED", "cluster"]
    cols_out = [c for c in cols_out if c in result.columns]
    result = result[cols_out].rename(columns={"PRICE_USED": "Selected_Price"})

    print(f"Optimized cost for '{recipe_name}': ${total_cost:.2f}")
    return result, total_cost

# EXAMPLES
# Each call prints the optimized total itself; the ingredient table is shown
# only when the optimizer returned one (it returns (None, None) otherwise).

# Example 1: Pasta
optimized1, total1 = optimize_recipe_cost("Pasta", df)
if optimized1 is not None:
    print("Optimized Ingredient List for Pasta:", optimized1, sep="\n")

# Example 2: Cookies
optimized2, total2 = optimize_recipe_cost("Cookies", df)
if optimized2 is not None:
    print("Optimized Ingredient List for Cookies:", optimized2, sep="\n")

# Example 3: Pizza
optimized3, total3 = optimize_recipe_cost("Pizza", df)
if optimized3 is not None:
    print("Optimized Ingredient List for Pizza:", optimized3, sep="\n")


LDA classification accuracy (on testing set): 0.96
LDA classification accuracy (on training set): 0.96
Optimized cost for 'Pasta': $47.16
Optimized Ingredient List for Pasta:
                       ingredient  \
99804              Cheddar cheese   
98458           Mozzarella cheese   
98459             Parmesan cheese   
99806  Wish-Bone Italian dressing   
98455                       basil   
99800                black olives   
98453                      garlic   
99802                green onions   
99801                green pepper   
98454                      olives   
98451                       penne   
98456                  red pepper   
99799                 shell pasta   
98457                tomato sauce   
98452            virgin olive oil   

                                            PRODUCT_NAME             BRAND  \
99804                    cheddar cheese & smoked sausage  long john snacks   
98458  baker low moisture part skim strings mozzarell...             baker  

In [5]:
# Cluster Analysis

import numpy as np
import pandas as pd
from collections import Counter

def print_top_tokens(text_list, vectorizer, top_k=15):
    """
    Return the most frequent vocabulary terms in text_list.

    Despite its name this function prints nothing; the caller prints the
    returned terms.

    Args:
        text_list: texts to count tokens in.
        vectorizer: fitted vectorizer providing transform() and
            get_feature_names_out().
        top_k: maximum number of terms to return.

    Returns:
        Array of at most top_k terms, most frequent first. Terms that never
        occur in text_list are left out, so fewer than top_k terms may be
        returned. Ties keep the vocabulary's own order.
    """
    # Per-term totals over all texts (column sums of the document-term matrix)
    X = vectorizer.transform(text_list)
    term_counts = np.asarray(X.sum(axis=0), dtype=float).ravel()
    vocab = np.asarray(vectorizer.get_feature_names_out())

    # Stable descending sort keeps tie order deterministic
    order = np.argsort(-term_counts, kind="stable")
    # Drop terms with zero occurrences before truncating to top_k
    order = order[term_counts[order] > 0][:top_k]
    return vocab[order]


def _print_top_values(sub, column, heading, top_n):
    """Print heading, the top_n most frequent values of sub[column] as bullets, then a blank line."""
    print(heading)
    for value in sub[column].value_counts().head(top_n).index.tolist():
        print("  •", value)
    print()


def detailed_cluster_report(df, vectorizer, ingredient_col="ingredient_clean", top_n=10):
    """
    Print a summary for each cluster in df.

    For every distinct value of df["cluster"], in sorted order, prints the
    most frequent ingredients, then (for each column that exists) the most
    frequent product names, brands, categories and subcategories, then price
    statistics, and finally the top text tokens from print_top_tokens.

    Args:
        df: DataFrame with a "cluster" column and an ingredient text column.
        vectorizer: fitted vectorizer, passed through to print_top_tokens.
        ingredient_col: ingredient text column; falls back to "ingredient"
            when this column is not in df.
        top_n: number of most frequent values listed per section.

    Returns:
        None. The report is written to standard output.
    """

    if ingredient_col not in df.columns:
        ingredient_col = "ingredient"

    # Sections that are printed only when their column exists, in print order
    optional_sections = (
        ("PRODUCT_NAME", "Top Product Names:"),
        ("BRAND", "Top Brands:"),
        ("CATEGORY", "Category Breakdown:"),
        ("SUBCATEGORY", "Subcategory Breakdown:"),
    )

    clusters = sorted(df["cluster"].unique())
    print(f"\n=== Cluster Report ({len(clusters)} clusters) ===\n")

    for c in clusters:
        sub = df[df["cluster"] == c]
        print("\n============================================================")
        print(f"CLUSTER {c} — {len(sub)} rows")
        print("============================================================\n")

        # Ingredients (always printed)
        _print_top_values(sub, ingredient_col, "Top Ingredients:", top_n)

        # Product names, brands, category, subcategory
        for column, heading in optional_sections:
            if column in sub.columns:
                _print_top_values(sub, column, heading, top_n)

        # Price
        if "PRICE_USED" in sub.columns:
            prices = sub["PRICE_USED"]
            print("Price Statistics:")
            print(f"  Mean Price: ${prices.mean():.2f}")
            print(f"  Median:     ${prices.median():.2f}")
            print(f"  Min:        ${prices.min():.2f}")
            print(f"  Max:        ${prices.max():.2f}")
            print()

        # Tokens
        print("Top Text Tokens:")
        top_tokens = print_top_tokens(sub[ingredient_col].astype(str).tolist(), vectorizer, top_k=15)
        print("  ", ", ".join(top_tokens))
        print("\n\n")

# Generate report:
# `vectorizer` is the CountVectorizer fitted in the clustering cell. If
# "ingredient_clean" is not a column of df, the report falls back to
# "ingredient".
detailed_cluster_report(df, vectorizer, ingredient_col="ingredient_clean", top_n=10)



=== Detailed Cluster Report (12 clusters) ===


CLUSTER 0 — 21112 rows

Top Ingredients:
  • onion
  • eggs
  • nuts
  • cream cheese
  • pecans
  • lemon juice
  • raisins
  • bacon
  • mustard
  • cream of mushroom soup

Top Product Names:
  • bubba's onion bagels, 6 ct
  • 6 count, kinder joy eggs, great for halloween treats, 4.2 oz
  • ziyad castania extra mixed nuts 300 gm
  • great value cinnamon rolls with cream cheese icing, 8 count
  • diamond of californiachopped pecans, 8oz
  • hawaiian punch lemon berry squeeze, juice drink, 1 gal bottle
  • gary's quicksteak chicken
  • puck puck cream
  • pepperidge farm swirl 100% whole wheat cinnamon with raisins breakfast bread, 16 oz loaf
  • al fresco original uncured chicken bacon, 3 oz.

Top Brands:
  • bubba’s
  • kinder joy
  • great value
  • castania
  • diamond
  • hawaiian punch
  • gary's quicksteak
  • puck
  • al fresco
  • pepperidge farm

Category Breakdown:
  • breakfast breads
  • chocolate
  • baking nuts & seeds
  •