In [1]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from tqdm import tqdm 

print("Loading data...")
# Build the working price column in one chain: take PRICE_CURRENT where it is
# present, fall back to PRICE_RETAIL otherwise, then discard rows where both
# are missing so every remaining row has a usable price.
df = pd.read_csv("merged_recipe_price_nodupl.csv")
df = df.assign(
    PRICE_USED=df["PRICE_CURRENT"].fillna(df["PRICE_RETAIL"])
).dropna(subset=["PRICE_USED"])
print(f"Total rows: {df.shape[0]}")

print("Vectorizing text...")
# The vocabulary is learned from the distinct lower-cased ingredient strings,
# so each distinct string contributes exactly once to the fit regardless of
# how many rows it appears in.
unique_ingredients = df["ingredient"].astype(str).str.lower().unique()
vectorizer = CountVectorizer(stop_words="english", max_features=400).fit(
    unique_ingredients
)

print("Fitting GMM on a sample...")
sample_size = min(50000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# Single source of truth for the numeric columns appended to the text
# features, so the fit-time and predict-time matrices cannot drift apart.
NUMERIC_FEATURE_COLS = ["match_score", "PRICE_USED", "PRODUCT_SIZE"]


def build_feature_matrix(frame, text_vectorizer):
    """Return the dense feature matrix used by the GMM for ``frame``.

    Columns are the bag-of-words counts of the lower-cased ``ingredient``
    text (as produced by ``text_vectorizer.transform``) followed by the
    columns in ``NUMERIC_FEATURE_COLS`` with missing values replaced by 0.
    The same function is used for fitting and for prediction so both see
    identically constructed features.
    """
    text_part = text_vectorizer.transform(frame["ingredient"].astype(str).str.lower())
    numeric_part = frame[NUMERIC_FEATURE_COLS].fillna(0)
    return hstack([text_part, numeric_part]).toarray()


X_sample = build_feature_matrix(df_sample, vectorizer)

gmm = GaussianMixture(n_components=12, covariance_type="diag", random_state=42)
gmm.fit(X_sample)
print("GMM Training complete.")

print("Predicting clusters for full dataset in batches...")
# Features are densified one batch at a time so the full dataset never has to
# be held in memory as a dense matrix.
batch_size = 50000
all_clusters = []

for i in tqdm(range(0, len(df), batch_size)):
    df_batch = df.iloc[i : i + batch_size]
    all_clusters.append(gmm.predict(build_feature_matrix(df_batch, vectorizer)))

# Batches were taken in positional (iloc) order, so the concatenated labels
# line up row-for-row with df when assigned as a plain array.
df["cluster"] = np.concatenate(all_clusters)
print("--- Cluster Inspection ---")
# Iterate over the fitted model's own component count rather than a second
# hard-coded literal, so changing n_components on the GMM cannot leave
# clusters uninspected (or loop over clusters that do not exist).
for c in range(gmm.n_components):
    cluster_data = df[df['cluster'] == c]
    # Five most frequent raw ingredient strings assigned to this cluster.
    top_ingredients = cluster_data['ingredient'].value_counts().head(5).index.tolist()

    print(f"Cluster {c} (Size: {len(cluster_data)}): {top_ingredients}")


print("Training LDA for validation (on sample)...")
# The "labels" are the GMM's own cluster assignments for the sample; the LDA
# score therefore measures how well a linear classifier can reproduce those
# assignments on held-out sample rows.
y_sample_pred = gmm.predict(X_sample)

X_train, X_test, y_train, y_test = train_test_split(
    X_sample,
    y_sample_pred,
    test_size=0.2,
    random_state=42,
    stratify=y_sample_pred,
)

lda = LDA().fit(X_train, y_train)
preds = lda.predict(X_test)
lda_accuracy = accuracy_score(y_test, preds)
print(f"LDA classification accuracy (on sample): {lda_accuracy:.2f}")


Loading data...
Total rows: 2631005
Vectorizing text...
Fitting GMM on a sample...
GMM Training complete.
Predicting clusters for full dataset in batches...


100%|██████████| 53/53 [00:06<00:00,  8.30it/s]


--- Cluster Inspection ---
Cluster 0 (Size: 38376): ['chicken', 'chicken breasts', 'beef', 'pork', 'pork chops']
Cluster 1 (Size: 81312): ['boiling water', 'clove garlic', 'catsup', 'stalks celery', 'bay leaf']
Cluster 2 (Size: 227292): ['Cheddar cheese', 'mayonnaise', 'bread crumbs', 'bacon', 'olive oil']
Cluster 3 (Size: 505902): ['sugar', 'eggs', 'milk', 'pecans', 'nutmeg']
Cluster 4 (Size: 186294): ['nuts', 'onions', 'raisins', 'green onions', 'oregano']
Cluster 5 (Size: 309993): ['flour', 'baking powder', 'egg', 'baking soda', 'soda']
Cluster 6 (Size: 103598): ['oil', 'vinegar', 'Worcestershire sauce', 'soy sauce', 'corn']
Cluster 7 (Size: 130): ['cognac', 'Triple Sec', 'Zinfandel', 'Estates Seven Oaks Cabernet', 'citron']
Cluster 8 (Size: 258186): ['vanilla', 'cream cheese', 'pineapple', 'lemon juice', 'parsley']
Cluster 9 (Size: 502729): ['salt', 'onion', 'pepper', 'celery', 'cinnamon']
Cluster 10 (Size: 308362): ['butter', 'water', 'shortening', 'potatoes', 'powdered sugar']
Cl

In [2]:
def calculate_distance_vectorized(lat1, lon1, lat2_series, lon2_series):
    """Haversine great-circle distance in kilometres from one point to many.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the origin, in decimal degrees.
    lat2_series, lon2_series : pandas.Series, numpy.ndarray or float
        Latitudes and longitudes of the destination(s), in decimal degrees.

    Returns
    -------
    Distances in kilometres, using an Earth radius of 6371 km. The result has
    the same container type as the destination inputs (element-wise numpy
    operations are used throughout). NaN coordinates produce NaN distances.
    """
    R = 6371  # Earth radius in km
    phi1, phi2 = np.radians(lat1), np.radians(lat2_series)
    dphi = np.radians(lat2_series - lat1)
    dlambda = np.radians(lon2_series - lon1)
    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    # Clamp it; genuine NaN inputs still propagate through unchanged.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def optimize_recipe_cost(recipe_name, df, user_lat, user_lon, shipping_rate=0.5):
    """Select one low-cost product per cluster for a recipe and price the basket.

    Parameters
    ----------
    recipe_name : str
        Recipe title to look up; matched case-insensitively against ``df["title"]``.
    df : pandas.DataFrame
        Product/recipe table. Must contain ``title``, ``PRICE_USED``, ``cluster``
        and ``SHIPPING_LOCATION``. ``latitude`` and ``longitude`` are optional;
        they are used only when BOTH columns are present.
    user_lat, user_lon : float
        User location in decimal degrees.
    shipping_rate : float, default 0.5
        Shipping cost in dollars per kilometre ($0.5/km by default).

    Returns
    -------
    (result_display, final_total_cost)
        ``result_display`` is a DataFrame of the chosen rows with ``PRICE_USED``
        renamed to ``Product_Price``; ``final_total_cost`` is the product total
        plus shipping. Returns ``(None, 0)`` when the recipe is not found or no
        candidate rows remain after the location filter.

    Notes
    -----
    Selection is a greedy heuristic: each row is ranked by its price plus the
    shipping cost of its own distance, and the cheapest row per cluster is
    kept. The reported shipping cost, however, is charged once per distinct
    ``SHIPPING_LOCATION`` among the chosen rows.
    """
    recipe_df = df[df["title"].str.lower() == recipe_name.lower()].copy()

    if recipe_df.empty:
        print(f"No recipe found: {recipe_name}")
        return None, 0

    # Coordinates are usable only when both columns exist. Checking them
    # together keeps the bounding-box filter and the distance computation in
    # agreement (a frame with only 'latitude' previously raised KeyError).
    has_coords = 'latitude' in recipe_df.columns and 'longitude' in recipe_df.columns

    # Half-width of the search box around the user, in degrees.
    lat_diff = 5.0
    lon_diff = 5.0

    if has_coords:
        # Cheap rectangular pre-filter; rows with missing coordinates fail
        # every comparison and are therefore dropped here.
        recipe_df = recipe_df[
            (recipe_df['latitude'] > user_lat - lat_diff) &
            (recipe_df['latitude'] < user_lat + lat_diff) &
            (recipe_df['longitude'] > user_lon - lon_diff) &
            (recipe_df['longitude'] < user_lon + lon_diff)
        ].copy()

    if recipe_df.empty:
        print(f"Warning: No relevant products found for '{recipe_name}' within approximately 500km.")
        return None, 0

    if has_coords:
        recipe_df['distance_km'] = calculate_distance_vectorized(
            user_lat, user_lon, recipe_df['latitude'], recipe_df['longitude']
        )
    else:
        # No location data: treat every product as local (no shipping cost).
        recipe_df['distance_km'] = 0

    # Defensive: any distance that could not be computed is ranked last.
    recipe_df['distance_km'] = recipe_df['distance_km'].fillna(9999)

    # Rank rows by price plus their individual shipping cost, then keep the
    # cheapest row in each cluster.
    temp_shipping_cost = recipe_df['distance_km'] * shipping_rate
    recipe_df['sort_cost'] = recipe_df['PRICE_USED'] + temp_shipping_cost
    result = recipe_df.sort_values('sort_cost').drop_duplicates(subset=['cluster'], keep='first')

    product_total_price = result["PRICE_USED"].sum()

    # Shipping is charged once per distinct shipping location in the basket.
    unique_stores = result[['SHIPPING_LOCATION', 'distance_km']].drop_duplicates(subset=['SHIPPING_LOCATION'])
    basket_shipping_cost = (unique_stores['distance_km'] * shipping_rate).sum()

    final_total_cost = product_total_price + basket_shipping_cost

    cols_to_show = [
        "ingredient", "PRODUCT_NAME", "BRAND", "PRICE_USED",
        "distance_km", "cluster", "SHIPPING_LOCATION"
    ]

    # Only display columns that are actually present in the input frame.
    cols_to_show = [c for c in cols_to_show if c in result.columns]

    result_display = result[cols_to_show].rename(columns={"PRICE_USED": "Product_Price"})

    print(f"--- Optimization for '{recipe_name}' ---")
    print(f"Items Count: {len(result)}")
    print(f"Stores Involved: {len(unique_stores)} (Zips: {unique_stores['SHIPPING_LOCATION'].tolist()})")
    print(f"Product Cost:   ${product_total_price:.2f}")
    print(f"Shipping Cost:  ${basket_shipping_cost:.2f}")
    print("==================================")
    print(f"TOTAL COST:     ${final_total_cost:.2f}")
    print("==================================\n")

    return result_display, final_total_cost

# --- Example Execution ---

# Reference location: Tech Square, Atlanta, GA
user_latitude, user_longitude = 33.7769, -84.3915

print(f"User Location: {user_latitude}, {user_longitude}\n")

# Run the optimizer once per example recipe (Pasta, Cookies, Pizza), printing
# each basket immediately after its summary.
example_results = {}
for example_recipe in ("Pasta", "Cookies", "Pizza"):
    example_basket, example_total = optimize_recipe_cost(
        example_recipe, df, user_latitude, user_longitude
    )
    example_results[example_recipe] = (example_basket, example_total)
    if example_basket is not None:
        print(example_basket)
        # example_basket.to_csv(f"optimized_{example_recipe.lower()}_basket.csv", index=False)

# Keep the per-recipe names bound for any later cells that refer to them.
optimized_pasta, total_pasta = example_results["Pasta"]
optimized_cookies, total_cookies = example_results["Cookies"]
optimized_pizza, total_pizza = example_results["Pizza"]

User Location: 33.7769, -84.3915

--- Optimization for 'Pasta' ---
Items Count: 7
Stores Involved: 1 (Zips: [30044])
Product Cost:   $32.26
Shipping Cost:  $16.93
TOTAL COST:     $49.19

                         ingredient  \
1857967           Mozzarella cheese   
1883057  Wish-Bone Italian dressing   
1883010                    pimentos   
1883054       fresh chopped parsley   
1882931                 shell pasta   
1857832            virgin olive oil   
1857940                tomato sauce   

                                              PRODUCT_NAME          BRAND  \
1857967  Great Value Finely Shredded Low-Moisture Part-...    great value   
1883057         Wish-Bone Italian Salad Dressing, 15 fl oz      wish-bone   
1883010        Marketside Roasted Red Pepper Hummus, 10 Oz     marketside   
1883054        Marketside Roasted Red Pepper Hummus, 10 Oz     marketside   
1882931  Velveeta Shells and Cheese with Bacon Shell Pa...       velveeta   
1857832       Great Value Extra Virgin