In [11]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

# Load the merged recipe/price table.
df = pd.read_csv("merged_recipe_price_nodupl.csv")

# Use the current price, falling back to the retail price when the current
# price is missing. Rows with neither price are dropped.
# .copy() makes df an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
df["PRICE_USED"] = df["PRICE_CURRENT"].fillna(df["PRICE_RETAIL"])
df = df.dropna(subset=["PRICE_USED"]).copy()

# Numeric features; any remaining gaps are filled with 0.
num_features = df[["match_score", "PRICE_USED", "PRODUCT_SIZE"]].fillna(0)

# Text features: bag-of-words counts over ingredient names, limited to the
# 400 most frequent terms. Missing names are mapped to the empty string
# first; casting NaN with astype(str) directly would inject the literal
# token "nan" into the vocabulary.
vectorizer = CountVectorizer(max_features=400, stop_words="english")
ingredient_text = df["ingredient"].fillna("").astype(str).str.lower()
text_features = vectorizer.fit_transform(ingredient_text)

# Combine sparse text features + numeric features (text columns first).
X = hstack([text_features, num_features])


# GMM CLUSTERING
print("Fitting GMM")

# Both GaussianMixture and LDA are fed a dense array here. Densify once and
# reuse it; calling X.toarray() at each use site allocates the full dense
# matrix a second time.
X_dense = X.toarray()

# Assign a GMM cluster (0–11) to each row of df.
gmm = GaussianMixture(n_components=12, covariance_type="diag", random_state=42)
clusters = gmm.fit_predict(X_dense)
df["cluster"] = clusters

# Inspect the first few clusters to sanity-check the grouping.
for c in range(3):
    print(f"Cluster {c}: {df[df['cluster']==c]['ingredient'].head(5).tolist()}")

# Train LDA classifier on the GMM labels. The reported accuracy therefore
# measures how well LDA reproduces the GMM assignment on held-out rows, not
# agreement with any external ground truth.
X_train, X_test, y_train, y_test = train_test_split(
    X_dense, df["cluster"], test_size=0.2, random_state=42, stratify=df["cluster"]
)

lda = LDA()
lda.fit(X_train, y_train)
preds = lda.predict(X_test)
print(f"LDA classification accuracy: {accuracy_score(y_test, preds):.2f}")

# Cost Optimization
def optimize_recipe_cost(recipe_name, df):
    """Pick the cheapest row in each cluster for one recipe.

    Rows whose ``title`` equals ``recipe_name`` (case-insensitive) are
    grouped by ``cluster``; from each group the row with the lowest
    ``PRICE_USED`` is kept, and the kept prices are summed.

    Args:
        recipe_name: Recipe title to look up.
        df: DataFrame with at least the columns ``title``, ``cluster``,
            ``PRICE_USED``, ``ingredient``, ``PRODUCT_NAME`` and ``BRAND``.
            Its index is assumed to be unique.

    Returns:
        A ``(result, total_cost)`` tuple. ``result`` is a DataFrame with the
        columns ``ingredient``, ``PRODUCT_NAME``, ``BRAND``,
        ``Selected_Price`` and ``cluster``, one row per cluster ordered by
        cluster id. ``total_cost`` is the sum of ``Selected_Price``.
        When no row matches ``recipe_name`` the function returns
        ``(None, None)``, so callers can always unpack two values.
    """
    recipe_df = df[df["title"].str.lower() == recipe_name.lower()]
    if recipe_df.empty:
        print(f"No recipe found: {recipe_name}")
        # Keep the 2-tuple shape of the success path: returning a bare None
        # makes `result, total = optimize_recipe_cost(...)` raise TypeError.
        return None, None

    # Index label of the lowest-priced row in each cluster; groupby sorts
    # the groups by cluster id, which fixes the order of the output rows.
    cheapest_labels = recipe_df.groupby("cluster")["PRICE_USED"].idxmin()

    result = recipe_df.loc[
        cheapest_labels,
        ["ingredient", "PRODUCT_NAME", "BRAND", "PRICE_USED", "cluster"],
    ].rename(columns={"PRICE_USED": "Selected_Price"})
    total_cost = result["Selected_Price"].sum()

    print(f"Optimized cost for '{recipe_name}': ${total_cost:.2f}")
    return result, total_cost

# Each example looks up one recipe, prints the selected products and writes
# them to a CSV file. The call result is checked before unpacking so that a
# missing recipe is skipped instead of raising TypeError on tuple-unpacking
# a None result.

# Example
pasta_outcome = optimize_recipe_cost("Pasta", df)
optimized, total = pasta_outcome if pasta_outcome is not None else (None, None)

if optimized is not None:
    print("Optimized Ingredient List for Pasta:")
    print(optimized)
    optimized.to_csv("optimized_pasta_clustered.csv", index=False)

# Example 2
cookies_outcome = optimize_recipe_cost("Cookies", df)
optimized2, total2 = cookies_outcome if cookies_outcome is not None else (None, None)

if optimized2 is not None:
    print("Optimized Ingredient List for Cookies:")
    print(optimized2)
    optimized2.to_csv("optimized_cookies_clustered.csv", index=False)

# Example 3
pizza_outcome = optimize_recipe_cost("Pizza", df)
optimized3, total3 = pizza_outcome if pizza_outcome is not None else (None, None)

if optimized3 is not None:
    print("Optimized Ingredient List for Pizza:")
    print(optimized3)
    optimized3.to_csv("optimized_pizza_clustered.csv", index=False)

Fitting GMM
Cluster 0: ['nuts', 'cream of mushroom soup', 'garlic powder', 'salt', 'cream of mushroom soup']
Cluster 1: ['bite size shredded rice biscuits', 'boiling water', 'almond extract', 'paraffin', 'cleaned strawberries']
Cluster 2: ['Kahlua', 'Grand Marnier', 'Kahlua', 'Grand Marnier', 'Kahlua']
LDA classification accuracy: 0.96
Optimized cost for 'Pasta': $30.18
Optimized Ingredient List for Pasta:
                       ingredient  \
98458           Mozzarella cheese   
99803                    pimentos   
98451                       penne   
98457                tomato sauce   
99802                green onions   
98459             Parmesan cheese   
99806  Wish-Bone Italian dressing   

                                            PRODUCT_NAME              BRAND  \
98458  Great Value Finely Shredded Low-Moisture Part-...        great value   
99803        Marketside Roasted Red Pepper Hummus, 10 Oz         marketside   
98451  Banquet Mega Bowls Frozen Meal, Dynamite Penne...