# Recipe Reccommendation System

##### Install & Import Libraries

In [None]:
# Install packages
# !pip install nltk 
# !pip install spacy 
# !pip install datasketch 
# !pip install unidecode 
# !pip install simhash 
# !pip install flask 
# !pip install sumy
# !pip install gensim
# !pip install pyLDAvis
# !pip install swifter
# !pip install sentence_transformers
# !pip install recommendation_engine

In [None]:
# ===============================
# Standard Library
# ===============================
import os
import sys
import re
import random
import logging
import warnings
import subprocess
import multiprocessing
import socket
import ast
from collections import defaultdict
from fractions import Fraction

# ===============================
# Data Processing & Analysis
# ===============================
import numpy as np
import pandas as pd
import swifter

# ===============================
# Text Processing
# ===============================
import nltk
import spacy
import unidecode
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

# Spellchecker (optional)
try:
    from pyspellchecker import SpellChecker
    spell_checker = SpellChecker()
    SPELLCHECK = True
except ImportError:
    SPELLCHECK = False

# NLTK Setup & Downloads
_nltk_resources = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "stopwords": "corpora/stopwords", 
    "wordnet": "corpora/wordnet",
    "omw-1.4": "corpora/omw-1.4",
}

for pkg, path in _nltk_resources.items():
    try:
        nltk.data.find(path)
    except LookupError:
        print(f"Downloading {pkg}...")
        nltk.download(pkg)

# ===============================
# Hashing
# ===============================
from simhash import Simhash
from datasketch import MinHash, MinHashLSH

# ===============================
# Machine Learning
# ===============================
from sklearn.metrics.pairwise import cosine_similarity

# ===============================
# Embeddings & Transformers
# ===============================
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


# ===============================
# Topic Modeling & LDA
# ===============================
import gensim
import gensim.corpora as corpora
from gensim.models import Word2Vec
from gensim.models import LdaMulticore
from gensim.models.ldamulticore import LdaMulticore
from gensim import models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


# ===============================
# Summarization
# ===============================
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer as SumyTokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer


# ===============================
# Progress & Parallel Processing
# ===============================
from tqdm import tqdm
from tqdm.auto import tqdm as auto_tqdm
from joblib import Parallel, delayed

# ===============================
# Evaluation Metrics
# ===============================
from rouge import Rouge
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from textstat import flesch_reading_ease, flesch_kincaid_grade

# ===============================
# Web Framework
# ===============================
from flask import Flask, render_template_string, request, redirect, url_for, session

# ===============================
# Utilities
# ===============================
from scipy.spatial.distance import cdist


# ===============================
# Custom Modules (Optional)
# ===============================
try:
    from recommendation_engine import get_recommendations_as_records
    print("Custom recommendation engine imported")
except ImportError:
    print("Custom recommendation_engine module not found - skipping")
    get_recommendations_as_records = None

# ===============================
# Global Config
# ===============================
# Suppress warnings and configure logging
warnings.filterwarnings("ignore")
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)

# Progress bar setup
auto_tqdm.pandas(disable=False)

# Set seeds
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
try:
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
        torch.cuda.manual_seed_all(SEED)
except Exception:
    pass

  from .autonotebook import tqdm as notebook_tqdm


Downloading wordnet...
Downloading omw-1.4...


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/celinewidjaja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/celinewidjaja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Custom recommendation_engine module not found - skipping


### ===================== Part 1 - Text Preprocessing =====================

In [None]:
# Load raw dataset
raw_df = pd.read_csv("/Users/celinewidjaja/Documents/recipe-reccomender/recipe_dataset.csv")

# View the shape of the dataset
print("Original shape:", raw_df.shape)

Original shape: (2231142, 7)


In [None]:
# Fill missing values for ingredients
raw_df['ingredients'] = raw_df['ingredients'].fillna("").astype(str)

# Fill missing values for directions
raw_df['directions'] = raw_df['directions'].fillna("").astype(str)

# Define text cleaning function
def simple_clean(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    text = re.sub(r'[^a-z\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Apply for ingredients
raw_df['clean_ingredients'] = raw_df['ingredients'].apply(simple_clean)

# Apply for directions
raw_df['clean_directions'] = raw_df['directions'].apply(simple_clean)

# Concatenate ingredients and direction for deduplication
raw_df['clean_text'] = raw_df['ingredients'] + ' ' + raw_df['clean_directions']

### SimHash Deduplication

In [None]:
# Define simhash function
def simhash_text(text, max_words=100):
    tokens = text.split()[:max_words]
    return Simhash(' '.join(tokens)).value

# Apply Simhash with progress bars since dataset is big
print(f"Processing {len(raw_df)} recipes...")

# The progress bar application lines
tqdm.pandas(desc="Computing SimHash values")
raw_df['simhash'] = raw_df['clean_text'].progress_apply(simhash_text)

# Make an initial count to compare after deduplication
initial_count = len(raw_df)

# Remove duplicates from Simhash results
raw_df = raw_df.drop_duplicates(subset='simhash').reset_index(drop=True)

# Count recipes after deduplication
final_count = len(raw_df)

# Output completion update
print(f"Removed {initial_count - final_count:,} duplicates")
print(f"Final dataset: {final_count:,} recipes")

Processing 2231142 recipes...


Computing SimHash values: 100%|██████████| 2231142/2231142 [14:07<00:00, 2632.43it/s]


Removed 833 duplicates
Final dataset: 2,230,309 recipes


### MinHash Deduplication

In [None]:
# Define minhash function
def minhash_signature(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for word in text.split()[:100]:
        m.update(word.encode('utf8'))
    return m

# Apply MinHash with data bars as well
print(f"Processing {len(raw_df)} recipes with threshold=0.9...")

# Initialize LSH
lsh = MinHashLSH(threshold=0.9, num_perm=128)

# initialize minhash disctionary
minhashes = {}

# Progress comments for easy tracking
print("Step 1/3: Computing MinHash signatures...")

# Create MinHash signatures with progress bar
for i, row in tqdm(raw_df.iterrows(), total=len(raw_df), desc="Creating MinHashes"):
    m = minhash_signature(row['clean_text'])
    minhashes[i] = m
    lsh.insert(i, m)

# Report completion of Minhash signature creation
print(f"Created {len(minhashes):,} MinHash signatures")

# Find duplicates with progress bar
print("Step 2/3: Finding similar recipes...")

# Initialize temporary sets and dictionaries
to_drop = set()
duplicate_pairs = []

# Deduplicate with minhash
for i in tqdm(minhashes, desc="Finding duplicates"):
    results = lsh.query(minhashes[i])
    for r in results:
        if i != r and r not in to_drop:
            to_drop.add(r)
            duplicate_pairs.append((i, r))

# Show how many extra duplicates Minhash found
print(f"Found {len(duplicate_pairs):,} duplicate pairs")

# Update that we're already on the third step
print("Step 3/3: Removing duplicates...")

# Count before deduplication
initial_count = len(raw_df)

# Remove duplicates
cleaned_df = raw_df.drop(index=to_drop).reset_index(drop=True)

# Count after deduplication
final_count = len(raw_df)

# Output completion update
print(f"   Before: {initial_count:,} recipes")
print(f"   After:  {final_count:,} recipes") 
print(f"   Removed: {initial_count - final_count:,} duplicates")

Processing 2230309 recipes with threshold=0.9...
Step 1/3: Computing MinHash signatures...


Creating MinHashes: 100%|██████████| 2230309/2230309 [30:45<00:00, 1208.50it/s] 


Created 2,230,309 MinHash signatures
Step 2/3: Finding similar recipes...


Finding duplicates: 100%|██████████| 2230309/2230309 [00:41<00:00, 54146.03it/s]


Found 5,540 duplicate pairs
Step 3/3: Removing duplicates...
   Before: 2,230,309 recipes
   After:  2,230,309 recipes
   Removed: 0 duplicates


Save to CSV

In [None]:
# Save deduplicated data after SimHash and MinHash
cleaned_df.to_csv("/Users/celinewidjaja/Documents/recipe-reccomender/preprocessed_recipes.csv", index=False)

### ===================== Part 2 - Feature Engineering =====================

In [None]:
# Load the dataset
df_preprocessed = pd.read_csv("/Users/celinewidjaja/Documents/recipe-reccomender/preprocessed_recipes.csv")

# View shape
print("Original shape:", df_preprocessed.shape)

Original shape: (2224769, 11)


In [None]:
# Define corpus creation function
class RecipeCorpus:
    def __init__(self, df, batch_size=100000):
        self.df = df
        self.batch_size = batch_size

    def __iter__(self):
        for start in range(0, len(self.df), self.batch_size):
            end = start + self.batch_size
            batch = self.df.iloc[start:end]
            for _, row in batch.iterrows():
                ingredients = str(row['ingredients'])
                directions = str(row['directions'])
                yield (ingredients + ' ' + directions).split()

# Update step 1
print("Step 1/3: Creating corpus...")

# Apply function on df
corpus = RecipeCorpus(df_preprocessed)

# Update step 2
print("Step 2/3: Building vocabulary...")

# Build vocabulary
w2v_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=4, seed=42)
w2v_model.build_vocab(corpus)

# Update step 3
print("Step 3/3: Training Word2Vec model...")

# Train one epoch at a time for progress tracking
for epoch in tqdm(range(10), desc="Training epochs"):
    w2v_model.train(
        corpus, 
        total_examples=w2v_model.corpus_count, 
        epochs=1
    )
    print(f"  Completed epoch {epoch + 1}/10")

2025-08-15 12:09:56,082 | INFO | Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2025-08-15T12:09:56.078529', 'gensim': '4.3.3', 'python': '3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]', 'platform': 'macOS-15.6-arm64-arm-64bit', 'event': 'created'}
2025-08-15 12:09:56,091 | INFO | collecting all words and their counts
2025-08-15 12:09:56,148 | INFO | PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Step 1/3: Creating corpus...
Step 2/3: Building vocabulary...


2025-08-15 12:09:56,430 | INFO | PROGRESS: at sentence #10000, processed 741573 words, keeping 21069 word types
2025-08-15 12:09:56,609 | INFO | PROGRESS: at sentence #20000, processed 1484390 words, keeping 29347 word types
2025-08-15 12:09:56,787 | INFO | PROGRESS: at sentence #30000, processed 2225220 words, keeping 35653 word types
2025-08-15 12:09:56,966 | INFO | PROGRESS: at sentence #40000, processed 2965258 words, keeping 41085 word types
2025-08-15 12:09:57,144 | INFO | PROGRESS: at sentence #50000, processed 3705513 words, keeping 45559 word types
2025-08-15 12:09:57,319 | INFO | PROGRESS: at sentence #60000, processed 4446326 words, keeping 49872 word types
2025-08-15 12:09:57,493 | INFO | PROGRESS: at sentence #70000, processed 5181310 words, keeping 53881 word types
2025-08-15 12:09:57,672 | INFO | PROGRESS: at sentence #80000, processed 5923000 words, keeping 57499 word types
2025-08-15 12:09:57,849 | INFO | PROGRESS: at sentence #90000, processed 6666270 words, keeping 6

Step 3/3: Training Word2Vec model...


Training epochs:   0%|          | 0/10 [00:00<?, ?it/s]2025-08-15 12:10:51,579 | INFO | Word2Vec lifecycle event {'msg': 'training model with 4 workers on 410788 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-08-15T12:10:51.579746', 'gensim': '4.3.3', 'python': '3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]', 'platform': 'macOS-15.6-arm64-arm-64bit', 'event': 'train'}
2025-08-15 12:10:52,586 | INFO | EPOCH 0 - PROGRESS: at 1.58% examples, 2153050 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:10:53,591 | INFO | EPOCH 0 - PROGRESS: at 3.11% examples, 2120625 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:10:54,592 | INFO | EPOCH 0 - PROGRESS: at 4.56% examples, 2073862 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:10:55,597 | INFO | EPOCH 0 - PROGRESS: at 6.08% examples, 2072995 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:10:56,601 | INFO | EPOCH 0 - PROGRESS: at 7.63% examp

  Completed epoch 1/10


2025-08-15 12:12:35,548 | INFO | EPOCH 0 - PROGRESS: at 1.38% examples, 1890352 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:12:36,554 | INFO | EPOCH 0 - PROGRESS: at 2.81% examples, 1913761 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:12:37,556 | INFO | EPOCH 0 - PROGRESS: at 4.27% examples, 1940864 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:12:38,560 | INFO | EPOCH 0 - PROGRESS: at 5.73% examples, 1951296 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:12:39,560 | INFO | EPOCH 0 - PROGRESS: at 7.27% examples, 1983727 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:12:40,560 | INFO | EPOCH 0 - PROGRESS: at 8.73% examples, 1983301 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:12:41,561 | INFO | EPOCH 0 - PROGRESS: at 10.13% examples, 1970823 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:12:42,563 | INFO | EPOCH 0 - PROGRESS: at 11.57% examples, 1967671 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:12:43,571 | INFO | EPOCH 0 - PROGRESS: at 13.00% examples, 1963664 words/s, in_q

  Completed epoch 2/10


2025-08-15 12:14:42,476 | INFO | EPOCH 0 - PROGRESS: at 1.04% examples, 1426139 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:14:43,478 | INFO | EPOCH 0 - PROGRESS: at 2.11% examples, 1437937 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:14:44,484 | INFO | EPOCH 0 - PROGRESS: at 3.28% examples, 1489473 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:14:45,520 | INFO | EPOCH 0 - PROGRESS: at 4.44% examples, 1498117 words/s, in_qsize 3, out_qsize 2
2025-08-15 12:14:46,526 | INFO | EPOCH 0 - PROGRESS: at 5.61% examples, 1515302 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:14:47,530 | INFO | EPOCH 0 - PROGRESS: at 6.77% examples, 1528408 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:14:48,533 | INFO | EPOCH 0 - PROGRESS: at 7.92% examples, 1533603 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:14:49,539 | INFO | EPOCH 0 - PROGRESS: at 9.05% examples, 1532824 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:14:50,541 | INFO | EPOCH 0 - PROGRESS: at 10.21% examples, 1537133 words/s, in_qsi

  Completed epoch 3/10


2025-08-15 12:16:55,329 | INFO | EPOCH 0 - PROGRESS: at 0.97% examples, 1320935 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:16:56,330 | INFO | EPOCH 0 - PROGRESS: at 2.23% examples, 1518303 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:16:57,330 | INFO | EPOCH 0 - PROGRESS: at 3.47% examples, 1576172 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:16:58,331 | INFO | EPOCH 0 - PROGRESS: at 4.60% examples, 1568041 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:16:59,332 | INFO | EPOCH 0 - PROGRESS: at 5.83% examples, 1590896 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:17:00,335 | INFO | EPOCH 0 - PROGRESS: at 7.03% examples, 1597429 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:17:01,339 | INFO | EPOCH 0 - PROGRESS: at 8.23% examples, 1603101 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:17:02,340 | INFO | EPOCH 0 - PROGRESS: at 9.34% examples, 1591319 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:17:03,344 | INFO | EPOCH 0 - PROGRESS: at 10.56% examples, 1596245 words/s, in_qsi

  Completed epoch 4/10


2025-08-15 12:19:07,064 | INFO | EPOCH 0 - PROGRESS: at 1.10% examples, 1502722 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:19:08,071 | INFO | EPOCH 0 - PROGRESS: at 2.35% examples, 1605289 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:19:09,077 | INFO | EPOCH 0 - PROGRESS: at 3.59% examples, 1625767 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:19:10,080 | INFO | EPOCH 0 - PROGRESS: at 4.72% examples, 1606069 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:19:11,081 | INFO | EPOCH 0 - PROGRESS: at 5.92% examples, 1611508 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:19:12,084 | INFO | EPOCH 0 - PROGRESS: at 7.13% examples, 1618764 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:19:13,086 | INFO | EPOCH 0 - PROGRESS: at 8.34% examples, 1621911 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:19:14,093 | INFO | EPOCH 0 - PROGRESS: at 9.45% examples, 1607779 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:19:15,095 | INFO | EPOCH 0 - PROGRESS: at 10.66% examples, 1611345 words/s, in_qsi

  Completed epoch 5/10


2025-08-15 12:21:24,713 | INFO | EPOCH 0 - PROGRESS: at 1.06% examples, 1444966 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:21:25,716 | INFO | EPOCH 0 - PROGRESS: at 2.29% examples, 1563034 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:21:26,722 | INFO | EPOCH 0 - PROGRESS: at 3.45% examples, 1564541 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:21:27,728 | INFO | EPOCH 0 - PROGRESS: at 4.56% examples, 1551260 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:21:28,734 | INFO | EPOCH 0 - PROGRESS: at 5.75% examples, 1566007 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:21:29,735 | INFO | EPOCH 0 - PROGRESS: at 6.95% examples, 1577357 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:21:30,736 | INFO | EPOCH 0 - PROGRESS: at 8.14% examples, 1584122 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:21:31,746 | INFO | EPOCH 0 - PROGRESS: at 9.25% examples, 1572942 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:21:32,754 | INFO | EPOCH 0 - PROGRESS: at 10.43% examples, 1574767 words/s, in_qsi

  Completed epoch 6/10


2025-08-15 12:23:39,054 | INFO | EPOCH 0 - PROGRESS: at 1.13% examples, 1534205 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:23:40,058 | INFO | EPOCH 0 - PROGRESS: at 2.37% examples, 1610680 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:23:41,060 | INFO | EPOCH 0 - PROGRESS: at 3.60% examples, 1634447 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:23:42,063 | INFO | EPOCH 0 - PROGRESS: at 4.77% examples, 1623021 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:23:43,065 | INFO | EPOCH 0 - PROGRESS: at 5.95% examples, 1621584 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:23:44,071 | INFO | EPOCH 0 - PROGRESS: at 7.15% examples, 1623642 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:23:45,072 | INFO | EPOCH 0 - PROGRESS: at 8.35% examples, 1623825 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:23:46,074 | INFO | EPOCH 0 - PROGRESS: at 9.50% examples, 1615601 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:23:47,077 | INFO | EPOCH 0 - PROGRESS: at 10.71% examples, 1618006 words/s, in_qsi

  Completed epoch 7/10


2025-08-15 12:25:55,695 | INFO | EPOCH 0 - PROGRESS: at 0.89% examples, 1216451 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:25:56,701 | INFO | EPOCH 0 - PROGRESS: at 1.99% examples, 1351095 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:25:57,711 | INFO | EPOCH 0 - PROGRESS: at 3.07% examples, 1388843 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:25:58,717 | INFO | EPOCH 0 - PROGRESS: at 4.16% examples, 1411332 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:25:59,721 | INFO | EPOCH 0 - PROGRESS: at 5.17% examples, 1403798 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:26:00,730 | INFO | EPOCH 0 - PROGRESS: at 6.10% examples, 1379861 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:26:01,738 | INFO | EPOCH 0 - PROGRESS: at 7.16% examples, 1388581 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:26:02,740 | INFO | EPOCH 0 - PROGRESS: at 8.25% examples, 1400279 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:26:03,746 | INFO | EPOCH 0 - PROGRESS: at 9.18% examples, 1384346 words/s, in_qsiz

  Completed epoch 8/10


2025-08-15 12:28:24,091 | INFO | EPOCH 0 - PROGRESS: at 0.97% examples, 1326160 words/s, in_qsize 8, out_qsize 0
2025-08-15 12:28:25,092 | INFO | EPOCH 0 - PROGRESS: at 2.12% examples, 1446817 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:28:26,093 | INFO | EPOCH 0 - PROGRESS: at 3.26% examples, 1481778 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:28:27,095 | INFO | EPOCH 0 - PROGRESS: at 4.39% examples, 1498482 words/s, in_qsize 6, out_qsize 1
2025-08-15 12:28:28,095 | INFO | EPOCH 0 - PROGRESS: at 5.44% examples, 1482939 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:28:29,096 | INFO | EPOCH 0 - PROGRESS: at 6.54% examples, 1488785 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:28:30,099 | INFO | EPOCH 0 - PROGRESS: at 7.66% examples, 1492634 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:28:31,107 | INFO | EPOCH 0 - PROGRESS: at 8.74% examples, 1489396 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:28:32,110 | INFO | EPOCH 0 - PROGRESS: at 9.77% examples, 1479398 words/s, in_qsiz

  Completed epoch 9/10


2025-08-15 12:30:50,029 | INFO | EPOCH 0 - PROGRESS: at 1.45% examples, 1989233 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:30:51,030 | INFO | EPOCH 0 - PROGRESS: at 3.04% examples, 2074011 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:30:52,032 | INFO | EPOCH 0 - PROGRESS: at 4.53% examples, 2058927 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:30:53,038 | INFO | EPOCH 0 - PROGRESS: at 6.11% examples, 2083493 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:30:54,038 | INFO | EPOCH 0 - PROGRESS: at 7.67% examples, 2091162 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:30:55,045 | INFO | EPOCH 0 - PROGRESS: at 8.08% examples, 1834514 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:30:56,045 | INFO | EPOCH 0 - PROGRESS: at 8.65% examples, 1683723 words/s, in_qsize 7, out_qsize 0
2025-08-15 12:30:57,057 | INFO | EPOCH 0 - PROGRESS: at 8.94% examples, 1519751 words/s, in_qsize 4, out_qsize 0
2025-08-15 12:30:58,060 | INFO | EPOCH 0 - PROGRESS: at 10.32% examples, 1558390 words/s, in_qsi

  Completed epoch 10/10





In [None]:
# Define function for cooking time extraction
def extract_cook_time(text):
    matches = re.findall(r'(\d+)\s*(minutes|min|hours|hrs)', text.lower())
    total_minutes = 0
    for num, unit in matches:
        num = int(num)
        if 'hour' in unit or 'hr' in unit:
            total_minutes += num * 60
        else:
            total_minutes += num

    # Set to max 6 hours (360 min)
    total_minutes = min(total_minutes, 360)

    # Use a default if no time was found
    if total_minutes == 0:
        total_minutes = 30
    hours = total_minutes // 60
    minutes = total_minutes % 60
    return f"{hours} hr {minutes} min" if hours > 0 else f"{minutes} min"

# Extract cooking time with progress tracking
print("Extracting cooking times from recipes...")
print(f"Processing {len(df_preprocessed):,} recipes...")

# Pandas progress bar
tqdm.pandas(desc="Extracting cooking times")

# Apply with progress bar
df_preprocessed['cooking_time'] = df_preprocessed['directions'].progress_apply(extract_cook_time)

# Completion update and display statistics
print(f"\nCooking time statistics:")
print(f"Average cooking time: {df_preprocessed['cooking_time'].value_counts().head()}")

Extracting cooking times from recipes...
Processing 2,224,769 recipes...


Extracting cooking times: 100%|██████████| 2224769/2224769 [00:12<00:00, 174361.66it/s]



Cooking time statistics:
Average cooking time: cooking_time
30 min        819324
20 min         91675
15 min         85074
6 hr 0 min     84494
10 min         81001
Name: count, dtype: int64


In [None]:
# Define some keywords for each of the 11 cuisines. 
# Keywords are optimized to be as mutually exclusive as possible
cuisine_keywords = {
    "Italian": {
        "parmesan", "parmigiano", "mozzarella", "pecorino", "ricotta", "gorgonzola",
        "basil", "oregano", "rosemary", "sage", "arugula", "prosciutto",
        "pasta", "spaghetti", "penne", "linguine", "fettuccine", "gnocchi",
        "risotto", "lasagna", "carbonara", "bolognese", "marinara", "pesto",
        "pizza", "bruschetta", "tiramisu", "gelato", "osso buco", "pancetta"
    },
    
    "Mexican": {
        "tortilla", "masa", "salsa", "guacamole", "pico de gallo", "mole",
        "jalapeno", "poblano", "habanero", "chipotle", "serrano", "anaheim",
        "queso", "oaxaca", "monterey jack", "avocado", "lime", "cilantro",
        "enchilada", "tamales", "quesadilla", "tacos", "burritos", "carnitas",
        "chorizo", "pozole", "ceviche", "elote", "tres leches", "horchata"
    },
    
    "Indian": {
        "haldi", "cumin", "jeera", "coriander", "dhania",
        "masala", "garam masala", "tandoori", "tikka", "paneer", "ghee",
        "dal", "chana", "basmati", "naan", "chapati",
        "cardamom", "cloves", "fenugreek", "asafoetida", "hing",
        "biryani", "samosa", "dosa", "idli", "vindaloo", "korma", "raita"
    },
    
    "Chinese": {
        "oyster sauce", "hoisin", "chow mein", "lo mein", "wonton", "dumpling", "dim sum",
        "tofu", "bok choy", "chinese broccoli", "water chestnut", "bamboo shoots",
        "five spice", "szechuan", "kung pao", "sweet and sour", "hot pot", "chinese", "fried rice"
    },
    
    "Japanese": {
        "miso", "white miso", "red miso", "soy sauce", "shoyu", "tamari",
        "sushi", "sashimi", "nori", "wakame", "kombu", "dashi", "bonito",
        "sake", "mirin", "rice vinegar", "wasabi", "ginger", "pickled ginger",
        "teriyaki", "tempura", "ramen", "udon", "soba", "mochi", "matcha",
        "edamame", "shiitake", "enoki", "daikon", "katsu", "yakitori"
    },
    
    "Middle Eastern": {
        "couscous", "bulgur", "freekeh", "hummus", "baba ganoush", "tabbouleh",
        "tahini", "harissa", "zaatar", "za'atar", "sumac", "pomegranate molasses",
        "falafel", "shawarma", "kebab", "kibbeh", "labneh", "halloumi",
        "pita", "lavash", "baklava", "rose water", "orange blossom", "cardamom",
        "lamb", "goat", "dates", "figs", "almonds", "pistachios"
    },
    
    "Thai": {
        "lemongrass", "galangal", "kaffir lime", "thai basil", "holy basil",
        "fish sauce", "nam pla", "oyster sauce", "coconut milk", "coconut cream",
        "curry paste", "red curry", "green curry", "yellow curry", "massaman",
        "pad thai", "tom yum", "tom kha", "som tam", "larb", "sticky rice",
        "tamarind", "bird's eye chili", "thai chili", "jasmine rice"
    },
    
    "American": {
        "steak", "ribeye", "t-bone", "sirloin", "grill", "grilled", "barbecue", "bbq",
        "cheddar", "american cheese", "monterey jack", "ranch", "thousand island",
        "macaroni", "mac and cheese", "sloppy joe", "brisket", "pulled pork",
        "meatloaf", "meatballs", "cornbread", "biscuits", "gravy", "bacon",
        "buttermilk", "burger", "hamburger", "cheeseburger", "fried chicken",
        "coleslaw", "potato salad", "hotdog", "hot dog", "fries", "french fries",
        "ketchup", "mustard", "apple pie", "chocolate chip", "peanut butter"
    },
    
    "French": {
        "thyme", "herbes de provence", "tarragon", "chervil", "chives",
        "brie", "camembert", "roquefort", "gruyere", "emmental",
        "wine", "red wine", "white wine", "cognac", "brandy", "champagne",
        "butter", "clarified butter", "cream", "heavy cream", "creme fraiche",
        "shallot", "leek", "ratatouille", "bouillabaisse", "coq au vin",
        "beef bourguignon", "cassoulet", "confit", "foie gras", "baguette",
        "croissant", "brioche", "escargot", "souffle", "creme brulee"
    },
    
    "Indonesian": {
        "tempeh", "tempe", "sambal", "sambal oelek", "sambal terasi",
        "gula", "santan",
        "rendang", "satay", "sate", "gado gado", "nasi goreng",
        "ketumbar", "kemiri", "candlenut", "galangal", "lengkuas",
        "kecap", "kecap manis", "sweet soy", "tamarind", "asam", "pandan",
        "medan", "padang", "javanese", "balinese", "rijsttafel", "gudeg"
    },
    
    "German": {
        "sauerkraut", "red cabbage", "strudel", "apfelstrudel", "pumpernickel",
        "rye bread", "pretzel", "brezel", "bratwurst", "knockwurst", "weisswurst",
        "spätzle", "spaetzle", "schnitzel", "sauerbraten", "rouladen",
        "beer", "lager", "weissbier", "oktoberfest", "mustard", "senf",
        "black forest", "stollen", "lebkuchen", "kartoffel", "potato", "dumpling"
    }
}

In [None]:
# Ensure all text fields are string and fill NaNs
df_preprocessed['ingredients'] = df_preprocessed['ingredients'].fillna("").astype(str)

# Function to calculate average word vector
def get_avg_vector(words):
    vectors = [w2v_model.wv[w] for w in words if w in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)

# Calculate cuisine keyword vectors
cuisine_vectors = {
    cuisine: get_avg_vector(list(keywords))
    for cuisine, keywords in cuisine_keywords.items()
}

In [None]:
# Define function to tag cuisine based on cosine similarity
def tag_cuisine(text):
    text = str(text) 
    tokens = text.split()
    vec = get_avg_vector(tokens)

    similarities = {
        cuisine: np.dot(vec, cuisine_vectors[cuisine]) /
                 (np.linalg.norm(vec) * np.linalg.norm(cuisine_vectors[cuisine]) + 1e-10)
        for cuisine in cuisine_vectors
    }

    return max(similarities, key=similarities.get)

# Apply function with progress tracking
print(f"Processing {len(df_preprocessed):,} recipes...")

# Enable pandas progress bar
tqdm.pandas(desc="Tagging cuisines")

# Create combined text for better cuisine detection
print("Creating combined text for cuisine analysis...")

df_preprocessed['combined_text'] = (
    df_preprocessed['title'].fillna('') + ' ' + 
    df_preprocessed['ingredients'].fillna('')
    ).str.strip()

# Apply cuisine tagging with progress bar using combined text
df_preprocessed['cuisine_tag'] = df_preprocessed['combined_text'].progress_apply(tag_cuisine)

# Update completion
print(f"\nCuisine distribution:")
cuisine_counts = df_preprocessed['cuisine_tag'].value_counts()
total_recipes = len(df_preprocessed)

print(f"Total recipes processed: {total_recipes:,}")
print(f"Recipes with cuisine tags: {(cuisine_counts != 'Unknown').sum():,}")
print(f"Unknown/untagged recipes: {cuisine_counts.get('Unknown', 0):,}")

# Print cuisine distribution
print(f"\nCuisines:")
for cuisine, count in cuisine_counts.items():
    percentage = count/total_recipes*100
    print(f"  {cuisine}: {count:,} recipes ({percentage:.1f}%)")

Processing 2,224,769 recipes...
Creating combined text for cuisine analysis...


Tagging cuisines: 100%|██████████| 2224769/2224769 [02:37<00:00, 14135.13it/s]



Cuisine distribution:
Total recipes processed: 2,224,769
Recipes with cuisine tags: 11
Unknown/untagged recipes: 0

Cuisines:
  Indian: 661,303 recipes (29.7%)
  Indonesian: 644,019 recipes (28.9%)
  Mexican: 349,553 recipes (15.7%)
  Thai: 160,501 recipes (7.2%)
  Italian: 135,178 recipes (6.1%)
  Chinese: 101,145 recipes (4.5%)
  Japanese: 64,404 recipes (2.9%)
  French: 64,166 recipes (2.9%)
  American: 35,720 recipes (1.6%)
  German: 8,230 recipes (0.4%)
  Middle Eastern: 550 recipes (0.0%)


In [None]:
# We're still spotting mistakes after keyword optimization, so opt for overriding some cuisine classifications
cuisine_overrides = {
    # American
    "steak": "American",
    "brisket": "American",
    "sloppy joe": "American",
    "bbq": "American",
    "barbecue": "American",
    "pie crust": "American",
    "ranch": "American",
    "bacon": "American",
    "meatloaf": "American",
    "macaroni": "American",
    "cheddar": "American",
    "hotdog": "American",
    "cornbread": "American",
    "buttermilk": "American",
    "fried chicken": "American",
    "chili": "American",
    "ketchup": "American",
    "coleslaw": "American",
    "graham cracker": "American",
    "cranberry sauce": "American",
    "peanut butter": "American",
    "crisco": "American",

    # Italian
    "pasta": "Italian",
    "lasagna": "Italian",
    "risotto": "Italian",
    "bolognese": "Italian",
    "mozzarella": "Italian",
    "parmesan": "Italian",
    "oregano": "Italian",
    "basil": "Italian",
    "anchovy": "Italian",
    "fettuccine": "Italian",
    "gnocchi": "Italian",
    "prosciutto": "Italian",
    "carbonara": "Italian",
    "caprese": "Italian",
    "marinara": "Italian",
    "penne": "Italian",
    "ziti": "Italian",

    # French
    "tart": "French",
    "quiche": "French",
    "ratatouille": "French",
    "vinaigrette": "French",
    "brie": "French",
    "croissant": "French",
    "thyme": "French",
    "shallot": "French",
    "crepe": "French",
    "bouillabaisse": "French",
    "bechamel": "French",
    "duxelles": "French",
    "souffle": "French",

    # German
    "pretzel": "German",
    "portzelky": "German",
    "strudel": "German",
    "sauerkraut": "German",
    "bratwurst": "German",
    "spätzle": "German",
    "wiener": "German",

    # Indonesian
    "rendang": "Indonesian",
    "sambal": "Indonesian",
    "medan": "Indonesian",
    "tempeh": "Indonesian",
    "gula jawa": "Indonesian",
    "kecap": "Indonesian",
    "nasi goreng": "Indonesian",
    "bumbu": "Indonesian",
    "ikan bakar": "Indonesian",
    "balado": "Indonesian",

    # Mexican
    "tortilla": "Mexican",
    "quesadilla": "Mexican",
    "ranchero": "Mexican",
    "enchilada": "Mexican",
    "salsa": "Mexican",
    "jalapeno": "Mexican",
    "chipotle": "Mexican",
    "avocado": "Mexican",
    "taco": "Mexican",
    "guacamole": "Mexican",
    "refried beans": "Mexican",
    "queso": "Mexican",

    # Japanese
    "nori": "Japanese",
    "miso": "Japanese",
    "sushi": "Japanese",
    "teriyaki": "Japanese",
    "sashimi": "Japanese",
    "mirin": "Japanese",
    "wasabi": "Japanese",
    "dashi": "Japanese",
    "udon": "Japanese",
    "tempura": "Japanese",

    # Middle Eastern
    "falafel": "Middle Eastern",
    "tahini": "Middle Eastern",
    "zaatar": "Middle Eastern",
    "sumac": "Middle Eastern",
    "hummus": "Middle Eastern",
    "couscous": "Middle Eastern",
    "shawarma": "Middle Eastern",
    "pita": "Middle Eastern",
    "lamb kebab": "Middle Eastern",

    # Chinese
    "soy sauce": "Chinese",
    "hoisin": "Chinese",
    "bok choy": "Chinese",
    "scallion": "Chinese",
    "dumpling": "Chinese",
    "wonton": "Chinese",
    "chow mein": "Chinese",
    "szechuan": "Chinese",
    "five-spice": "Chinese",
    "ginger": "Chinese",

    # Thai
    "lemongrass": "Thai",
    "galangal": "Thai",
    "kaffir": "Thai",
    "coconut milk": "Thai",
    "fish sauce": "Thai",
    "green curry": "Thai",
    "red curry": "Thai",
    "thai basil": "Thai",

    # Indian
    "masala": "Indian",
    "garam": "Indian",
    "paneer": "Indian",
    "dal": "Indian",
    "turmeric": "Indian",
    "curry": "Indian",
    "tikka": "Indian",
    "ghee": "Indian",
    "biryani": "Indian"
}

In [None]:
# Define function to override
def tag_cuisine_with_overrides(text):
    text = str(text).lower()
    for keyword, cuisine in cuisine_overrides.items():
        if keyword in text:
            return cuisine
    tokens = text.split()
    vec = get_avg_vector(tokens)
    similarities = {
        cuisine: np.dot(vec, cuisine_vectors[cuisine]) /
                 (np.linalg.norm(vec) * np.linalg.norm(cuisine_vectors[cuisine]) + 1e-10)
        for cuisine in cuisine_vectors
    }
    return max(similarities, key=similarities.get)

# Apply function with progress updates
print(f"Processing {len(df_preprocessed):,} recipes...")

# Enable pandas progress bar
tqdm.pandas(desc="Applying cuisine overrides")

# Apply cuisine tagging with overrides and progress bar
df_preprocessed['cuisine_tag'] = df_preprocessed['ingredients'].progress_apply(tag_cuisine_with_overrides)

# Update completion and show updated cuisine distribution
print(f"\nFinal cuisine distribution:")
cuisine_counts = df_preprocessed['cuisine_tag'].value_counts().head(10)
for cuisine, count in cuisine_counts.items():
    print(f"  {cuisine}: {count:,} recipes ({count/len(df_preprocessed)*100:.1f}%)")

Processing 2,224,769 recipes...


Applying cuisine overrides: 100%|██████████| 2224769/2224769 [01:39<00:00, 22439.38it/s]


Final cuisine distribution:
  American: 567,696 recipes (25.5%)
  Indonesian: 469,220 recipes (21.1%)
  Indian: 362,140 recipes (16.3%)
  Italian: 234,014 recipes (10.5%)
  Chinese: 154,852 recipes (7.0%)
  Mexican: 138,719 recipes (6.2%)
  Thai: 121,179 recipes (5.4%)
  French: 113,417 recipes (5.1%)
  Japanese: 37,640 recipes (1.7%)
  German: 15,731 recipes (0.7%)





In [None]:
# Function to tag diet preferences
def tag_diet(text):

    if not isinstance(text, str) or not text.strip():
        return "Unknown"
    
    text = text.lower()
    
    # Define positive keywords
    keywords = {
        "meat": {
            "high": ["beef", "chicken", "pork", "bacon", "sausage", "lamb", "turkey", "ham", "steak", "ground beef", "oyster"],
            "medium": ["meat", "poultry", "pepperoni", "salami", "chorizo"],
            "hidden": ["gelatin", "lard", "chicken stock", "beef broth"]
        },
        "seafood": {
            "high": ["fish", "salmon", "tuna", "cod", "shrimp", "crab", "lobster", "scallops"],
            "medium": ["seafood", "anchovy", "sardines", "mackerel", "prawns"],
            "hidden": ["fish sauce", "worcestershire sauce", "oyster sauce"]
        },
        "dairy": {
            "high": ["milk", "cheese", "butter", "cream", "yogurt", "egg", "eggs"],
            "medium": ["dairy", "mozzarella", "cheddar", "parmesan"],
            "hidden": ["whey", "casein", "lactose", "ghee"]
        },
        "gluten": {
            "high": ["wheat flour", "all purpose flour", "bread", "pasta", "noodle", "wheat"],
            "medium": ["barley", "rye", "semolina", "couscous"],
            "hidden": ["soy sauce", "malt", "seitan"] 
        },
        "nuts": {
            "high": ["almond", "walnut", "pecan", "cashew", "pistachio", "peanut"],
            "medium": ["nuts", "nut", "hazelnut", "macadamia"],
            "hidden": ["almond flour", "peanut butter", "tahini"]
        }
    }
    
    # Define negative keywords
    negatives = {
        "vegan": ["vegan", "plant-based", "plant based", "100% plant"],
        "vegetarian": ["vegetarian", "veggie", "meat-free", "meat free"],
        "dairy_free": ["dairy-free", "dairy free", "lactose-free", "non-dairy"],
        "gluten_free": ["gluten-free", "gluten free", "gf", "wheat-free"],
        "nut_free": ["nut-free", "nut free"]
    }
    
    # Define fake ingredients
    fake_ingredients = {
        "fake_meat": [
            "vegan chicken", "fake chicken", "plant-based chicken", "mock chicken",
            "vegan beef", "fake beef", "plant-based beef", "mock beef",
            "vegan sausage", "fake sausage", "plant-based sausage",
            "beyond meat", "impossible", "seitan", "jackfruit", "mushroom burger",
            "plant-based meat", "mock meat", "meat substitute", "fake bacon",
            "vegan bacon", "tempeh bacon", "coconut bacon"
        ],
        "fake_dairy": [
            "vegan cheese", "fake cheese", "plant-based cheese", "cashew cheese",
            "almond cheese", "nutritional yeast", "dairy-free cheese",
            "almond milk", "soy milk", "coconut milk", "oat milk", "rice milk",
            "plant milk", "dairy-free milk", "vegan butter", "plant-based butter"
        ]
    }
    
    # Detect fake ingredients 
    has_fake_meat = any(fake in text for fake in fake_ingredients["fake_meat"])
    has_fake_dairy = any(fake in text for fake in fake_ingredients["fake_dairy"])
    
    # Check for explicit vegan/vegetarian labels
    is_labeled_vegan = any(label in text for label in negatives["vegan"])
    is_labeled_vegetarian = any(label in text for label in negatives["vegetarian"])
    is_labeled_dairy_free = any(label in text for label in negatives["dairy_free"])
    is_labeled_gluten_free = any(label in text for label in negatives["gluten_free"])
    is_labeled_nut_free = any(label in text for label in negatives["nut_free"])
    
    # Look for REAL animal products (but be smart about context)
    def get_real_ingredient_score(category):
        score = 0.0
        confidence_weights = {"high": 1.0, "medium": 0.7, "hidden": 0.5}
        
        for confidence, word_list in keywords[category].items():
            for word in word_list:
                if word in text:
                    is_fake = False
                    
                    if category in ["meat", "dairy"]:
                        fake_list = fake_ingredients.get(f"fake_{category}", [])
                        for fake_phrase in fake_list:
                            if word in fake_phrase and fake_phrase in text:
                                is_fake = True
                                break
                    
                    if not is_fake:
                        weight = confidence_weights[confidence]
                        score += weight
        
        return min(score, 2.0)
    
    # Calculate scores for real ingredients only
    real_meat_score = get_real_ingredient_score("meat")
    real_seafood_score = get_real_ingredient_score("seafood") 
    real_dairy_score = get_real_ingredient_score("dairy")
    gluten_score = get_real_ingredient_score("gluten")
    nuts_score = get_real_ingredient_score("nuts")
    
    # Fix false nut detection for specific cases
    if "beyond meat" in text or "impossible" in text:
        nuts_score = max(0, nuts_score - 0.5) 
    
    if "almond flour" in text and "dairy" not in text and "egg" not in text:
        pass 
    
    # Apply label overrides (but verify against ingredients)
    if is_labeled_vegan:
        if real_meat_score > 0.3 or real_seafood_score > 0.3 or real_dairy_score > 0.3:
            pass
        else:
            real_meat_score = real_seafood_score = real_dairy_score = 0.0
    elif is_labeled_vegetarian:
        if real_meat_score > 0.3 or real_seafood_score > 0.3:
            pass 
        else:
            real_meat_score = real_seafood_score = 0.0
    
    # Handle fake ingredients properly
    if has_fake_meat and real_meat_score < 0.5:
        real_meat_score = 0.0 
    
    if has_fake_dairy and real_dairy_score < 0.5:
        real_dairy_score = 0.0 
    
    # Final classification
    threshold = 0.3
    
    if real_meat_score > threshold:
        diet = "Non-Vegetarian"
    elif real_seafood_score > threshold:
        diet = "Pescatarian"  
    elif real_dairy_score > threshold:
        diet = "Vegetarian"
    else:
        diet = "Vegan"
    
    # Add dietary restrictions (be more selective)
    restrictions = []
    
    # Only add gluten warning if significant gluten content
    if gluten_score > 0.5 and not is_labeled_gluten_free:
        restrictions.append("Contains Gluten")
    elif is_labeled_gluten_free:
        restrictions.append("Gluten-Free")
        
    # Only add nut warning if significant nut content
    if nuts_score > 0.5 and not is_labeled_nut_free:
        restrictions.append("Contains Nuts")
    elif is_labeled_nut_free:
        restrictions.append("Nut-Free")
    
    # Combine diet and restrictions
    if restrictions:
        return f"{diet}, {', '.join(restrictions[:2])}"
    else:
        return diet

# 50 Random Recipe Cases to Test
test_cases = [
    # Italian recipes
    ("spaghetti, tomato sauce, garlic, basil, olive oil", "Should be Vegan"),
    ("fettuccine, heavy cream, parmesan cheese, butter, black pepper", "Should be Vegetarian"),
    ("pizza dough, mozzarella cheese, pepperoni, tomato sauce", "Should be Non-Vegetarian"),
    ("risotto rice, mushrooms, white wine, vegetable broth, parmesan", "Should be Vegetarian"),
    ("lasagna noodles, ground beef, ricotta cheese, marinara sauce", "Should be Non-Vegetarian"),
    
    # Asian recipes
    ("tofu, soy sauce, ginger, garlic, sesame oil, vegetables", "Should be Vegan"),
    ("chicken breast, teriyaki sauce, rice, broccoli", "Should be Non-Vegetarian"),
    ("salmon, miso paste, sake, mirin, green onions", "Should be Pescatarian"),
    ("rice noodles, bean sprouts, peanuts, lime, fish sauce", "Should be Pescatarian"),
    ("tempeh, coconut milk, lemongrass, chili, lime leaves", "Should be Vegan"),
    
    # Mexican recipes
    ("black beans, corn tortillas, avocado, lime, cilantro", "Should be Vegan"),
    ("ground beef, cheddar cheese, lettuce, tomato, sour cream", "Should be Non-Vegetarian"),
    ("shrimp, corn, bell peppers, onions, cumin", "Should be Pescatarian"),
    ("refried beans, flour tortillas, monterey jack cheese", "Should be Vegetarian"),
    ("chicken thighs, chipotle peppers, adobo sauce, onions", "Should be Non-Vegetarian"),
    
    # American recipes
    ("quinoa, kale, cranberries, almonds, olive oil dressing", "Should be Vegan"),
    ("bacon, eggs, cheddar cheese, hash browns, butter", "Should be Non-Vegetarian"),
    ("tuna, mayonnaise, celery, bread, lettuce", "Should be Pescatarian"),
    ("mac and cheese, milk, butter, flour, cheddar", "Should be Vegetarian"),
    ("turkey, stuffing, cranberry sauce, gravy", "Should be Non-Vegetarian"),
    
    # Indian recipes
    ("lentils, turmeric, cumin, garam masala, coconut oil", "Should be Vegan"),
    ("paneer, tomatoes, cream, garam masala, onions", "Should be Vegetarian"),
    ("chicken, yogurt, tandoori spice, garlic, ginger", "Should be Non-Vegetarian"),
    ("chickpeas, spinach, ginger, garlic, coconut milk", "Should be Vegan"),
    ("lamb, basmati rice, saffron, almonds, ghee", "Should be Non-Vegetarian"),
    
    # Mediterranean recipes
    ("olives, tomatoes, cucumber, feta cheese, olive oil", "Should be Vegetarian"),
    ("hummus, tahini, chickpeas, lemon, garlic", "Should be Vegan"),
    ("grilled fish, lemon, oregano, olive oil, capers", "Should be Pescatarian"),
    ("eggplant, zucchini, bell peppers, onions, herbs", "Should be Vegan"),
    ("lamb kebabs, yogurt sauce, pita bread, onions", "Should be Non-Vegetarian"),
    
    # Breakfast recipes
    ("oatmeal, banana, almond milk, maple syrup, walnuts", "Should be Vegan"),
    ("scrambled eggs, butter, milk, chives, salt", "Should be Vegetarian"),
    ("pancakes, eggs, milk, flour, butter, syrup", "Should be Vegetarian"),
    ("yogurt, granola, berries, honey", "Should be Vegetarian"),
    ("sausage, eggs, hash browns, cheese, toast", "Should be Non-Vegetarian"),
    
    # Soup recipes
    ("vegetable broth, carrots, celery, onions, herbs", "Should be Vegan"),
    ("chicken stock, noodles, carrots, celery, chicken", "Should be Non-Vegetarian"),
    ("coconut milk, red curry paste, vegetables, tofu", "Should be Vegan"),
    ("cream, potatoes, leeks, butter, herbs", "Should be Vegetarian"),
    ("clam chowder, heavy cream, potatoes, bacon", "Should be Non-Vegetarian"),
    
    # Salad recipes
    ("mixed greens, tomatoes, cucumber, balsamic vinegar", "Should be Vegan"),
    ("caesar dressing, romaine lettuce, parmesan, croutons", "Should be Vegetarian"),
    ("spinach, strawberries, goat cheese, pecans, vinaigrette", "Should be Vegetarian"),
    ("arugula, grilled chicken, cherry tomatoes, mozzarella", "Should be Non-Vegetarian"),
    ("kale, quinoa, avocado, lemon dressing, seeds", "Should be Vegan"),
    
    # Dessert recipes
    ("flour, sugar, vanilla, eggs, butter, chocolate chips", "Should be Vegetarian"),
    ("coconut cream, dates, cacao powder, almonds", "Should be Vegan"),
    ("cream cheese, sugar, eggs, vanilla, graham crackers", "Should be Vegetarian"),
    ("dark chocolate, coconut oil, agave, vanilla", "Should be Vegan"),
    ("heavy cream, sugar, gelatin, vanilla, berries", "Should be Non-Vegetarian"),
    
    # Snack recipes
    ("peanut butter, oats, honey, chocolate chips", "Should be Vegetarian"),
    ("cashews, nutritional yeast, garlic powder, salt", "Should be Vegan"),
    ("cheese, crackers, grapes, nuts", "Should be Vegetarian"),
    ("jerky, beef, salt, spices, preservatives", "Should be Non-Vegetarian"),
    ("popcorn, nutritional yeast, olive oil, salt", "Should be Vegan")
]

# Classification counters
results = {
    "Vegan": 0,
    "Vegetarian": 0, 
    "Pescatarian": 0,
    "Non-Vegetarian": 0,
    "Other": 0
}

# Error tracking
unexpected_results = []
critical_errors = []

for i, (ingredients, expected) in enumerate(test_cases, 1):
    result = tag_diet(ingredients)
    
    # Count results
    main_diet = result.split(',')[0].strip()
    if main_diet in results:
        results[main_diet] += 1
    else:
        results["Other"] += 1
    
    # Check if result matches expectation
    expected_diet = expected.replace("Should be ", "").split(" ")[0]
    actual_diet = main_diet
    
    matches = expected_diet.lower() in actual_diet.lower()
    
    # Check for critical errors (veg marked as non-veg)
    is_critical = False
    if ("Vegan" in expected or "Vegetarian" in expected) and "Non-Vegetarian" in result:
        is_critical = True
        critical_errors.append((ingredients, result, expected))
    
    # Mark unexpected results
    if not matches:
        unexpected_results.append((ingredients, result, expected))
    
    # Display result
    print(f"{i:2d}. '{ingredients[:50]}{'...' if len(ingredients) > 50 else ''}'")
    print(f"     Result: {result}")
    print(f"     Expected: {expected}")
    print()

# Summary statistics
print("=" * 70)
print("CLASSIFICATION DISTRIBUTION")
print("=" * 70)
total = len(test_cases)
for diet, count in results.items():
    if count > 0:
        percentage = (count / total) * 100
        print(f"{diet:15}: {count:2d} recipes ({percentage:4.1f}%)")

print(f"\nTotal recipes tested: {total}")

# Error analysis
print("\n" + "=" * 70)
print("ERROR ANALYSIS")
print("=" * 70)

print(f"Critical errors (veg→non-veg): {len(critical_errors)}")
print(f"Unexpected classifications: {len(unexpected_results)}")
print(f"Accuracy rate: {((total - len(unexpected_results)) / total * 100):.1f}%")

if critical_errors:
    print(f"\nCRITICAL ERRORS:")
    for ingredients, result, expected in critical_errors:
        print(f"   '{ingredients}' → {result}")
        print(f"   Expected: {expected}")

if unexpected_results and len(unexpected_results) <= 10:
    print(f"\nUNEXPECTED RESULTS:")
    for ingredients, result, expected in unexpected_results[:10]:
        print(f"   '{ingredients}' → {result}")
        print(f"   Expected: {expected}")
elif len(unexpected_results) > 10:
    print(f"\nUNEXPECTED RESULTS (showing first 10 of {len(unexpected_results)}):")
    for ingredients, result, expected in unexpected_results[:10]:
        print(f"   '{ingredients}' → {result}")

print("\n" + "=" * 70)

# Run the diet tagging function to create diet_tag column
df_preprocessed['diet_tag'] = df_preprocessed['ingredients'].progress_apply(tag_diet)

 1. 'spaghetti, tomato sauce, garlic, basil, olive oil'
     Result: Vegan
     Expected: Should be Vegan

 2. 'fettuccine, heavy cream, parmesan cheese, butter, ...'
     Result: Vegetarian
     Expected: Should be Vegetarian

 3. 'pizza dough, mozzarella cheese, pepperoni, tomato ...'
     Result: Non-Vegetarian
     Expected: Should be Non-Vegetarian

 4. 'risotto rice, mushrooms, white wine, vegetable bro...'
     Result: Vegetarian
     Expected: Should be Vegetarian

 5. 'lasagna noodles, ground beef, ricotta cheese, mari...'
     Result: Non-Vegetarian, Contains Gluten
     Expected: Should be Non-Vegetarian

 6. 'tofu, soy sauce, ginger, garlic, sesame oil, veget...'
     Result: Vegan
     Expected: Should be Vegan

 7. 'chicken breast, teriyaki sauce, rice, broccoli'
     Result: Non-Vegetarian
     Expected: Should be Non-Vegetarian

 8. 'salmon, miso paste, sake, mirin, green onions'
     Result: Pescatarian
     Expected: Should be Pescatarian

 9. 'rice noodles, bean spro

Applying cuisine overrides: 100%|██████████| 2224769/2224769 [00:58<00:00, 37708.52it/s]


In [None]:
# Preview
df_preprocessed[['title', 'ingredients', 'cuisine_tag', 'diet_tag', 'cooking_time']].head().style.set_properties(**{'white-space': 'pre-wrap'})

Unnamed: 0,title,ingredients,cuisine_tag,diet_tag,cooking_time
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. evaporated milk"", ""1/2 tsp. vanilla"", ""1/2 c. broken nuts (pecans)"", ""2 Tbsp. butter or margarine"", ""3 1/2 c. bite size shredded rice biscuits""]",Indonesian,"Vegetarian, Contains Nuts",35 min
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned chicken breasts"", ""1 can cream of mushroom soup"", ""1 carton sour cream""]",Mexican,Non-Vegetarian,3 hr 0 min
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg. cream cheese, cubed"", ""1/3 c. butter, cubed"", ""1/2 tsp. garlic powder"", ""1/2 tsp. salt"", ""1/4 tsp. pepper""]",Indian,Vegetarian,4 hr 0 min
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans chicken gravy"", ""1 (10 1/2 oz.) can cream of mushroom soup"", ""1 (6 oz.) box Stove Top stuffing"", ""4 oz. shredded cheese""]",Chinese,Non-Vegetarian,20 min
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker crumbs"", ""1 c. melted butter"", ""1 lb. (3 1/2 c.) powdered sugar"", ""1 large pkg. chocolate chips""]",American,"Non-Vegetarian, Contains Nuts",20 min


In [None]:
# Save the result
df_preprocessed.to_csv("/Users/celinewidjaja/Documents/recipe-reccomender/tagged_recipes.csv", index=False)

### =====================Part 3 - LDA Topic Modeling =====================


### Preprocess Data & Build LDA Topics

#### Load Dataset

In [None]:
# Load tagged recipes
file = pd.read_csv("/Users/celinewidjaja/Documents/recipe-reccomender/tagged_recipes.csv")

# Peak at format
file.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,clean_ingredients,clean_directions,clean_text,simhash,cooking_time,combined_text,cuisine_tag,diet_tag
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",c firmly packed brown sugar c evaporated milk ...,in a heavy quart saucepan mix brown sugar nuts...,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...",4629474778827133560,35 min,"No-Bake Nut Cookies [""1 c. firmly packed brown...",Indonesian,"Vegetarian, Contains Nuts"
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom...",small jar chipped beef cut up boned chicken br...,place chipped beef on bottom of baking dish pl...,"[""1 small jar chipped beef, cut up"", ""4 boned ...",6036693287934039245,3 hr 0 min,"Jewell Ball'S Chicken [""1 small jar chipped be...",Mexican,Non-Vegetarian
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",oz pkg frozen corn oz pkg cream cheese cubed c...,in a slow cooker combine all ingredients cover...,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...",9759286286531696208,4 hr 0 min,"Creamy Corn [""2 (16 oz.) pkg. frozen corn"", ""1...",Indian,Vegetarian
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",large whole chicken oz cans chicken gravy oz c...,boil and debone chicken put bite size pieces i...,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...",15288822227484078784,20 min,"Chicken Funny [""1 large whole chicken"", ""2 (10...",Chinese,Non-Vegetarian
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu...",c peanut butter c graham cracker crumbs c melt...,combine first four ingredients and press in x ...,"[""1 c. peanut butter"", ""3/4 c. graham cracker ...",12022098804620604003,20 min,"Reeses Cups(Candy) [""1 c. peanut butter"", ""3...",American,"Non-Vegetarian, Contains Nuts"


In [None]:
# Make a copy of dataframe
tagged_df = file

In [None]:
# Print columns
tagged_df.columns

Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER', 'clean_ingredients', 'clean_directions', 'clean_text', 'simhash',
       'cooking_time', 'combined_text', 'cuisine_tag', 'diet_tag'],
      dtype='object')

#### convert cooking_time to minutes for all recipes

In [None]:

# Function to convert cooking time to minutes
def cooking_time_to_minutes(time_str):
    if pd.isna(time_str):
        return 0
    time_str = time_str.lower()
    hours = 0
    minutes = 0
    hr_match = re.search(r'(\d+)\s*hr', time_str)
    if hr_match:
        hours = int(hr_match.group(1))
    min_match = re.search(r'(\d+)\s*min', time_str)
    if min_match:
        minutes = int(min_match.group(1))
    return hours * 60 + minutes

# Apply function
tagged_df['cooking_time_mins'] = tagged_df['cooking_time'].apply(cooking_time_to_minutes)

#### Fill missing values and prepare full text field

In [None]:
# Fill missing values and prepare full text field
tagged_df['full_text'] = tagged_df['clean_ingredients'].fillna('') + ' ' + tagged_df['clean_directions'].fillna('')

#### Parallel Text Preprocessing / Tokenization

In [None]:
# Extend NLTK's stopwords with domain-specific cooking terms
custom_stopwords = set([
    "add", "cook", "heat", "bake", "boil", "fry", "stir", "mix", "remove", "place","pepper","bring","teaspoon","like","top",
    "pan", "pot", "oil", "water", "oven", "serve", "grill", "preheat", "use", "tsp", "salt","inch","two","cup","tbsp","bottom",
    "set", "let", "make", "prepare", "cut", "minutes", "cook", "temperature", "degrees","roll","bowl","one","tablespoon","turn",
    "take","get","hard", "side","put","surface","get","little","slow","dont", "mixture","medium","together","whole"
])

# Set stopwords
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Initiate lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
num_cores = multiprocessing.cpu_count()

# Define preprocessing function
def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    tokens = word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stop_words and len(w) > 2
    ]
    return tokens


# Apply preprocessing with progress bar
tokens_list = Parallel(n_jobs=num_cores-1)(
    delayed(preprocess)(text) for text in tqdm(tagged_df['full_text'], desc="Preprocessing")
)

# Safe tokens_list
tagged_df['tokens'] = tokens_list

Preprocessing: 100%|██████████| 75000/75000 [00:07<00:00, 9489.98it/s] /s]


In [None]:
# Inspect tokens
tagged_df['tokens'].head()

1857564    [cooking, spray, poblano, pepper, pepper, per,...
1447591    [ounce, ground, round, ounce, ground, veal, fr...
747168     [big, cherry, pie, filling, crushed, pineapple...
243615     [baked, pie, shell, milk, mushroom, soup, undi...
1013754    [minced, onion, teaspoon, dried, parsley, flak...
Name: tokens, dtype: object

#### Build Dictionary & Corpus for LDA

In [None]:
# Create dictionary of tokens
dictionary = corpora.Dictionary(tagged_df['tokens'])

# Filter out extreme tokens
dictionary.filter_extremes(no_below=20, no_above=0.8, keep_n=10000)

# Create Bag-of-Words corpus
corpus = [dictionary.doc2bow(text) for text in tqdm(tagged_df['tokens'], desc="Creating BoW Corpus")]

2025-08-15 12:52:18,596 | INFO | adding document #0 to Dictionary<0 unique tokens: []>
2025-08-15 12:52:18,784 | INFO | adding document #10000 to Dictionary<11833 unique tokens: ['arrange', 'arranging', 'bater', 'batter', 'beat']...>
2025-08-15 12:52:18,967 | INFO | adding document #20000 to Dictionary<16708 unique tokens: ['arrange', 'arranging', 'bater', 'batter', 'beat']...>
2025-08-15 12:52:19,161 | INFO | adding document #30000 to Dictionary<20777 unique tokens: ['arrange', 'arranging', 'bater', 'batter', 'beat']...>
2025-08-15 12:52:19,351 | INFO | adding document #40000 to Dictionary<24576 unique tokens: ['arrange', 'arranging', 'bater', 'batter', 'beat']...>
2025-08-15 12:52:19,537 | INFO | adding document #50000 to Dictionary<27961 unique tokens: ['arrange', 'arranging', 'bater', 'batter', 'beat']...>
2025-08-15 12:52:19,721 | INFO | adding document #60000 to Dictionary<31091 unique tokens: ['arrange', 'arranging', 'bater', 'batter', 'beat']...>
2025-08-15 12:52:19,907 | INFO 

#### Train LDA Model (Parallel)

In [None]:
# Configure logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Set number of CPU cores
num_cores = multiprocessing.cpu_count()

# Set number of topics
NUM_TOPICS = 10
print(f"Training LDA model with {NUM_TOPICS} topics using {num_cores-1} workers...")

# Train LDA model
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    workers=num_cores - 1,
    passes=5,
    chunksize=1000,  # default is 2000
    random_state=42,
    per_word_topics=True
)

# Save the dictionary
dictionary.save('/Users/celinewidjaja/Documents/recipe-reccomender/dictionary.dict')


2025-08-15 12:52:21,056 | INFO | using symmetric alpha at 0.1
2025-08-15 12:52:21,057 | INFO | using symmetric eta at 0.1
2025-08-15 12:52:21,059 | INFO | using serial LDA version on this node
2025-08-15 12:52:21,063 | INFO | running online LDA training, 10 topics, 5 passes over the supplied corpus of 75000 documents, updating every 7000 documents, evaluating every ~70000 documents, iterating 50x with a convergence threshold of 0.001000
2025-08-15 12:52:21,065 | INFO | training LDA model using 7 processes


Training LDA model with 10 topics using 7 workers...


2025-08-15 12:52:24,692 | INFO | PROGRESS: pass 0, dispatched chunk #0 = documents up to #1000/75000, outstanding queue size 1
2025-08-15 12:52:24,698 | INFO | PROGRESS: pass 0, dispatched chunk #1 = documents up to #2000/75000, outstanding queue size 2
2025-08-15 12:52:24,701 | INFO | PROGRESS: pass 0, dispatched chunk #2 = documents up to #3000/75000, outstanding queue size 3
2025-08-15 12:52:24,704 | INFO | PROGRESS: pass 0, dispatched chunk #3 = documents up to #4000/75000, outstanding queue size 4
2025-08-15 12:52:24,708 | INFO | PROGRESS: pass 0, dispatched chunk #4 = documents up to #5000/75000, outstanding queue size 5
2025-08-15 12:52:24,718 | INFO | PROGRESS: pass 0, dispatched chunk #5 = documents up to #6000/75000, outstanding queue size 6
2025-08-15 12:52:24,722 | INFO | PROGRESS: pass 0, dispatched chunk #6 = documents up to #7000/75000, outstanding queue size 7
2025-08-15 12:52:24,732 | INFO | PROGRESS: pass 0, dispatched chunk #7 = documents up to #8000/75000, outstandi

#### Show Top Words per Topic

In [None]:
# Get topic keywords for each topic
top_words_per_topic = []
for i in range(NUM_TOPICS):
    words = lda_model.show_topic(i, topn=10)
    topic_keywords = ", ".join([word for word, _ in words])
    top_words_per_topic.append((i, topic_keywords))

# Store in dataframe for more organized display
topic_df = pd.DataFrame(top_words_per_topic, columns=["Topic ID", "Top Keywords"])

# Display top keywords per topic
display(topic_df)

Unnamed: 0,Topic ID,Top Keywords
0,0,"flour, dough, sugar, butter, egg, baking, cup,..."
1,1,"juice, sugar, orange, lemon, cup, ice, apple, ..."
2,2,"onion, cheese, chopped, chicken, soup, sauce, ..."
3,3,"olive, garlic, tomato, chopped, fresh, tablesp..."
4,4,"sugar, egg, flour, butter, vanilla, baking, mi..."
5,5,"onion, chopped, garlic, bean, simmer, large, c..."
6,6,"sugar, cake, egg, cream, chocolate, cup, vanil..."
7,7,"chicken, sauce, tablespoon, garlic, juice, oni..."
8,8,"slice, cheese, bread, potato, egg, onion, half..."
9,9,"cream, pineapple, chopped, cracker, pkg, chees..."


#### Assign Topics to Each Recipe

In [None]:
# Assign dominant topic to each recipe
def get_dominant_topic(bow):
    topics = lda_model.get_document_topics(bow)
    if topics:
        return max(topics, key=lambda x: x[1])[0]
    return None

# Apply function and store in df as a new column
tagged_df['dominant_topic'] = [get_dominant_topic(doc) for doc in corpus]


#### Add topic name labels

In [None]:
# Add topic name labels for better UX
def get_topic_name(topic_id):
    # Just using top 3 words
    return ", ".join([word for word, _ in lda_model.show_topic(topic_id, topn=3)])

# Apply function and store as a new column
tagged_df['topic_name'] = tagged_df['dominant_topic'].apply(get_topic_name)


#### Visualize with pyLDAvis

In [None]:
# Initiate pyLDAvis
pyLDAvis.enable_notebook()

# Visualize topic distribution and top words
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis

### Save dataset to drive

In [None]:
# Save LDA model
lda_model.save('/Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model')

2025-08-15 12:53:03,100 | INFO | LdaState lifecycle event {'fname_or_handle': '/Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model.state', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-08-15T12:53:03.100128', 'gensim': '4.3.3', 'python': '3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]', 'platform': 'macOS-15.6-arm64-arm-64bit', 'event': 'saving'}
2025-08-15 12:53:03,102 | INFO | saved /Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model.state
2025-08-15 12:53:03,104 | INFO | LdaMulticore lifecycle event {'fname_or_handle': '/Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model', 'separately': "['expElogbeta', 'sstats']", 'sep_limit': 10485760, 'ignore': ['state', 'dispatcher', 'id2word'], 'datetime': '2025-08-15T12:53:03.104490', 'gensim': '4.3.3', 'python': '3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]', 'platform': 'macOS-15.6-ar

In [None]:
# Save DataFrame
tagged_df.to_csv('/Users/celinewidjaja/Documents/recipe-reccomender/LDA_recipes.csv', index=False)

## ===================== Part 4 - Recommendation System =====================

In [None]:
# Import latest dataset with LDA topic tags
lda_df = pd.read_csv("/Users/celinewidjaja/Documents/recipe-reccomender/LDA_recipes.csv")

# Inspect
lda_df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,clean_ingredients,clean_directions,clean_text,simhash,cooking_time,combined_text,cuisine_tag,diet_tag,cooking_time_mins,full_text,tokens,dominant_topic,topic_name
0,1862699,Chili-Crusted Chorizo Quiche,"[""Cooking spray"", ""1 can poblano peppers (6-8 ...","[""Preheat oven to 375 degrees F. Spray a 9-inc...",www.foodgeeks.com/recipes/21147,Recipes1M,"[""Cooking spray"", ""peppers"", ""chorizo"", ""hot I...",cooking spray can poblano peppers peppers per ...,preheat oven to degrees f spray a inch glass p...,"[""Cooking spray"", ""1 can poblano peppers (6-8 ...",2238965210148526785,55 min,"Chili-Crusted Chorizo Quiche [""Cooking spray"",...",American,Non-Vegetarian,55,cooking spray can poblano peppers peppers per ...,"['cooking', 'spray', 'poblano', 'pepper', 'pep...",8,"slice, cheese, bread"
1,1451453,Individual Meat Loaves,"[""5 ounces ground round"", ""4 ounces ground vea...","[""Combine first 10 ingredients in a medium bow...",www.myrecipes.com/recipe/individual-meat-loaves,Gathered,"[""ground round"", ""ground veal"", ""frozen egg su...",ounces ground round ounces ground veal cup fro...,combine first ingredients in a medium bowl sti...,"[""5 ounces ground round"", ""4 ounces ground vea...",11671750152439083674,5 min,"Individual Meat Loaves [""5 ounces ground round...",American,"Pescatarian, Contains Gluten",5,ounces ground round ounces ground veal cup fro...,"['ounce', 'ground', 'round', 'ounce', 'ground'...",8,"slice, cheese, bread"
2,749204,Cherry Dump Cake,"[""1 big can cherry pie filling"", ""1 medium can...","[""Mix together cherry pie filling and crushed ...",www.cookbooks.com/Recipe-Details.aspx?id=826178,Gathered,"[""cherry pie filling"", ""pineapple"", ""yellow ca...",big can cherry pie filling medium can crushed ...,mix together cherry pie filling and crushed pi...,"[""1 big can cherry pie filling"", ""1 medium can...",6537681170507377488,30 min,"Cherry Dump Cake [""1 big can cherry pie fillin...",Indonesian,"Vegetarian, Contains Nuts",30,big can cherry pie filling medium can crushed ...,"['big', 'cherry', 'pie', 'filling', 'crushed',...",9,"cream, pineapple, chopped"
3,244426,Mushroom Quiche,"[""baked pie shell"", ""1/2 c. milk"", ""1 can mush...","[""Mix soup, eggs, milk and spices."", ""Add rema...",www.cookbooks.com/Recipe-Details.aspx?id=665433,Gathered,"[""pie shell"", ""milk"", ""mushroom soup"", ""mushro...",baked pie shell c milk can mushroom soup undil...,mix soup eggs milk and spices add remaining in...,"[""baked pie shell"", ""1/2 c. milk"", ""1 can mush...",5414583236597724786,1 hr 0 min,"Mushroom Quiche [""baked pie shell"", ""1/2 c. mi...",Italian,Non-Vegetarian,60,baked pie shell c milk can mushroom soup undil...,"['baked', 'pie', 'shell', 'milk', 'mushroom', ...",2,"onion, cheese, chopped"
4,1016321,Ranch-Flavored Brussel Sprouts And Chicken Sau...,"[""1 tablespoon of minced onion"", ""2 teaspoons ...","[""Place a large skillet over medium-high heat....",www.food.com/recipe/ranch-flavored-brussel-spr...,Gathered,"[""onion"", ""parsley flakes"", ""salt"", ""ground pe...",tablespoon of minced onion teaspoons dried par...,place a large skillet over mediumhigh heat whe...,"[""1 tablespoon of minced onion"", ""2 teaspoons ...",16393498101522897562,7 min,Ranch-Flavored Brussel Sprouts And Chicken Sau...,American,Non-Vegetarian,7,tablespoon of minced onion teaspoons dried par...,"['minced', 'onion', 'teaspoon', 'dried', 'pars...",5,"onion, chopped, garlic"


## Sentence-BERT Embeddings

#### Format recipe to Generate Full Recipe Text (for Embedding)
- Combines key parts of each recipe into a single string used as input for Sentence-BERT.
- Format ensures the embedding captures meaningful semantic information.

In [None]:
# Define function to format recipe to Generate Full Recipe Text (for Embedding)
def format_recipe(row):
    return f"Recipe: {row['title']}. Ingredients: {row['ingredients']}. Directions: {row['directions']}. Cooking time: {row['cooking_time']} minutes. Cuisine: {row['cuisine_tag']}. Diet: {row['diet_tag']}."

# Apply function and save text as new column
lda_df['recipe_text'] = lda_df.swifter.apply(format_recipe, axis=1)

Pandas Apply: 100%|██████████| 75000/75000 [00:00<00:00, 162262.42it/s]


#### Encode Recipes & User Query with Sentence-BERT Model
- Transforms each recipe's recipe_text into a dense vector (embedding).
- Converts the user's query into a vector.

In [None]:
# Loads lightweight sentence transformer model
# model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')  # Forces use of GPU
model = SentenceTransformer('all-MiniLM-L6-v2')

2025-08-15 12:53:09,866 | INFO | Use pytorch device_name: mps
2025-08-15 12:53:09,866 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [None]:
# Convert recipe text to list save as texts
texts = lda_df['recipe_text'].tolist()

# Encodes each recipe's text into a vector (embedding) that captures its meaning  with Sentence-BERT
recipe_embeddings = model.encode(
    texts,
    batch_size=256,
    show_progress_bar=True,
    convert_to_tensor=True
)

Batches: 100%|██████████| 293/293 [08:00<00:00,  1.64s/it]


In [None]:
# Save recipe embeddings
torch.save(recipe_embeddings, '/Users/celinewidjaja/Documents/recipe-reccomender/recipe_embeddings.pt')

# Remove duplicates based on title
lda_df= lda_df.drop_duplicates(subset=['title'], keep='first')

# Save df as well
lda_df.to_csv('/Users/celinewidjaja/Documents/recipe-reccomender/processed_recipes.csv', index=False)

## Load models and Data

In [None]:
# Define preprocessing function
def preprocess(text):
    # Extend NLTK's stopwords with domain-specific cooking terms
    custom_stopwords = set([
        "add", "cook", "heat", "bake", "boil", "fry", "stir", "mix", "remove", "place","pepper","bring","teaspoon","like","top",
        "pan", "pot", "oil", "water", "oven", "serve", "grill", "preheat", "use", "tsp", "salt","inch","two","cup","tbsp","bottom",
        "set", "let", "make", "prepare", "cut", "minutes", "cook", "temperature", "degrees","roll","bowl","one","tablespoon","turn",
        "take","get","hard", "side","put","surface","get","little","slow","dont", "mixture","medium","together","whole"
    ])

    # Initialize stopwords and lemmatizer within the function
    stop_words = set(stopwords.words('english')).union(custom_stopwords)
    lemmatizer = WordNetLemmatizer()

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    tokens = word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stop_words and len(w) > 2
    ]
    return tokens

# Define function to calculate ingredient overla
def ingredient_overlap(row_ingredients, user_ingredients):
    recipe_ings = set(str(row_ingredients).lower().split(', '))
    user_ings = set(user_ingredients)
    return len(recipe_ings & user_ings) / max(len(recipe_ings | user_ings), 1)

# Get User Preferences
def get_user_preferences():
    print("Enter diet preference (e.g., Vegetarian, Non-Vegetarian): ")
    diet = input().strip()

    print("Enter cuisine preference (e.g., Italian, French, Mexican): ")
    cuisine = input().strip()

    print("Enter max cooking time in minutes (e.g., 30): ")
    try:
        max_time = int(input().strip())
    except:
        max_time = 60  # fallback

    print("Enter your food idea / cravings (e.g., 'chocolate chip cookie', 'spicy tofu'): ")
    query_text = input().strip()

    print("Enter ingredients you want to include (comma-separated): ")
    ingredients = input().strip().split(',')

    return {
        "diet_tag": diet,
        "cuisine_tag": cuisine,
        "max_cooking_time_minutes": max_time,
        "query_text": query_text,
        "ingredients": [i.strip().lower() for i in ingredients if i.strip()]
    }

# Import models
def get_model():
  # Determine the device to use
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  print(f"Using device: {device}")

  # Load model and move to the determined device
  model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

  # Load recipe embeddings and move to the determined device
  recipe_embeddings = torch.load('/Users/celinewidjaja/Documents/recipe-reccomender/recipe_embeddings.pt',
                                 map_location=torch.device(device))

  # load recipe data
  processed_df = pd.read_csv('/Users/celinewidjaja/Documents/recipe-reccomender/processed_recipes.csv')

  #load lda model
  lda_model =  models.LdaModel.load('/Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model')

  # Load the dictionary
  dictionary = corpora.Dictionary.load('/Users/celinewidjaja/Documents/recipe-reccomender/dictionary.dict')


  return model, recipe_embeddings, processed_df, lda_model, dictionary


# Define reccomendation function
def reccomendations(user_input, rc_model, df, recipe_embeddings,lda_model, dictionary):

  # Ensure user_embedding is on the same device as recipe_embeddings
  device = recipe_embeddings.device

  # Encode User Query with Sentence-BERT and move to the same device
  user_embedding = rc_model.encode(user_input["query_text"], convert_to_tensor=True).to(device)

  # Computes cosine similarity between the user query and each recipe
  cos_sim = util.cos_sim(user_embedding, recipe_embeddings).cpu().numpy().flatten()


  # Calculate Ingredient Overlap iteratively
  processed_df['ingredient_overlap'] = processed_df['clean_ingredients'].apply(lambda x: ingredient_overlap(x, user_input["ingredients"]))

  # Calculate Tag Match (diet & cuisine)
  processed_df['tag_match'] = (
      (processed_df['diet_tag'].str.lower() == user_input['diet_tag'].lower()).astype(int) +
      (processed_df['cuisine_tag'].str.lower() == user_input['cuisine_tag'].lower()).astype(int)
  ) / 2

  # Process df['full_text'] iteratively instead of in parallel
  tokens_list = [preprocess(text) for text in tqdm(processed_df['full_text'], desc="Preprocessing")]

  processed_df['tokens'] = tokens_list


  # Calculate Topic Match
  query_tokens = preprocess(user_input['query_text'])
  # Filter query tokens based on the dictionary vocabulary
  query_tokens_filtered = [token for token in query_tokens if token in dictionary.token2id]
  query_bow = dictionary.doc2bow(query_tokens_filtered)
  query_topic_dist = lda_model.get_document_topics(query_bow)
  query_dominant_topic = max(query_topic_dist, key=lambda x: x[1])[0]
  processed_df['topic_match'] = (processed_df['dominant_topic'] == query_dominant_topic).astype(int)

  # Filter by cooking time
  df_filtered = processed_df[processed_df['cooking_time_mins'] <= user_input['max_cooking_time_minutes']].copy()

  # Normalize cooking time so that faster recipes score higher
  max_time = user_input['max_cooking_time_minutes']
  df_filtered['cooking_time_score'] = 1 - (df_filtered['cooking_time_mins'] / max_time)
  df_filtered['cooking_time_score'] = df_filtered['cooking_time_score'].clip(0, 1)

  ## Final Score
  # Ensure cos_sim is aligned with df_filtered index
  cos_sim_filtered = cos_sim[df_filtered.index]
  df_filtered['score'] = (
      0.45 * cos_sim_filtered +
      0.2 * df_filtered['ingredient_overlap'] +
      0.15 * df_filtered['tag_match'] +
      0.1 * df_filtered['topic_match'] +
      0.1  * df_filtered['cooking_time_score']
  )

  # Top 21 Recommendations
  top_21 = df_filtered.sort_values('score', ascending=False).head(21)

  return top_21

In [None]:
# Load models
rc_model, recipe_embeddings, processed_df, lda_model, dictionary = get_model()

2025-08-15 13:13:28,419 | INFO | Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Using device: cpu


2025-08-15 13:13:34,016 | INFO | loading LdaModel object from /Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model
2025-08-15 13:13:34,018 | INFO | loading expElogbeta from /Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model.expElogbeta.npy with mmap=None
2025-08-15 13:13:34,019 | INFO | setting ignored attribute state to None
2025-08-15 13:13:34,019 | INFO | setting ignored attribute dispatcher to None
2025-08-15 13:13:34,020 | INFO | setting ignored attribute id2word to None
2025-08-15 13:13:34,020 | INFO | LdaMulticore lifecycle event {'fname': '/Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model', 'datetime': '2025-08-15T13:13:34.020128', 'gensim': '4.3.3', 'python': '3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 08:03:38) [Clang 14.0.6 ]', 'platform': 'macOS-15.6-arm64-arm-64bit', 'event': 'loaded'}
2025-08-15 13:13:34,020 | INFO | loading LdaState object from /Users/celinewidjaja/Documents/recipe-reccomender/lda_model.model.st

## Get Reccomendation from user

In [None]:
# Store user preferences into user_input
user_input = get_user_preferences()

# Store reccommendation results in top_df
top_df = reccomendations(user_input, rc_model, processed_df, recipe_embeddings, lda_model, dictionary)

Enter diet preference (e.g., Vegetarian, Non-Vegetarian): 
Enter cuisine preference (e.g., Italian, French, Mexican): 
Enter max cooking time in minutes (e.g., 30): 
Enter your food idea / cravings (e.g., 'chocolate chip cookie', 'spicy tofu'): 
Enter ingredients you want to include (comma-separated): 


Batches: 100%|██████████| 1/1 [00:00<00:00, 10.63it/s]
Preprocessing:  44%|████▍     | 989204/2224769 [24:27<30:33, 674.02it/s]  
Preprocessing: 100%|██████████| 60194/60194 [01:08<00:00, 880.64it/s] 


In [None]:
# Display recommendations
print(f"========================\n Our reccomendation :\n")
for index, row in top_df.iterrows():
    print(f"Recipe: {row['title']}")
    print(f"Cuisine: {row['cuisine_tag']}")
    print(f"Diet restrictions: {row['diet_tag']}")
    print(f"Cooking time: {row['cooking_time_mins']}\n\n")

 Our reccomendation :

Recipe: Real Southern Fried Green Tomatoes
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 2


Recipe: Zucchini Rounds
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 3


Recipe: Cheeses Pizza
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 8


Recipe: Ratatouille Pancakes
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 9


Recipe: Rosemary Parmesan Biscuits
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 13


Recipe: Calzone Recipes
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 15


Recipe: Lemon-And-Black-Pepper Cheese Straws
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 15


Recipe: Carbonara Pizza
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 12


Recipe: Savory Parmesan And Fennel Cookies
Cuisine: Italian
Diet restrictions: Vegetarian
Cooking time: 15


Recipe: Cheese-Stuffed Colombian-Style Arepas Recipe
Cuisine: Italian
Diet restrictions: Vegetaria

# **Model Evaluation**

## Create Evaluation Dataset

In [54]:
diet_tags = [
    'Non-Vegetarian, Gluten-Free',
    'Vegetarian, Gluten-Free',
    'Vegetarian, Contains Gluten',
    'Vegan, Gluten-Free',
    'Non-Vegetarian, Contains Gluten',
    'Vegan, Contains Gluten'
]

cuisine_tags = [
    'American', 'French', 'German', 'Chinese', 'Mexican', 'Italian',
    'Thai', 'Indian', 'Japanese', 'Middle Eastern', 'Indonesian'
]

sample_ingredients = [
    # Proteins
    "chicken", "beef", "pork", "lamb", "goat", "duck", "turkey", "fish", "salmon", "tuna", "cod",
    "shrimp", "prawn", "crab", "lobster", "scallop", "oyster", "squid", "tofu", "tempeh", "paneer",
    "egg", "egg white", "egg yolk", "bacon", "sausage", "ham", "chorizo",

    # Vegetables
    "onion", "red onion", "green onion", "garlic", "ginger", "tomato", "cherry tomato", "spinach",
    "kale", "arugula", "lettuce", "romaine", "carrot", "celery", "broccoli", "cauliflower",
    "brussels sprout", "zucchini", "eggplant", "bell pepper", "red pepper", "green pepper",
    "yellow pepper", "chili pepper", "jalapeno", "habanero", "potato", "sweet potato", "yam",
    "radish", "beetroot", "mushroom", "shiitake mushroom", "portobello mushroom", "peas", "corn",

    # Fresh Herbs
    "cilantro", "parsley", "basil", "mint", "dill", "chives", "rosemary", "thyme", "oregano",
    "sage", "tarragon", "lemongrass",

    # Spices & Seasonings
    "black pepper", "white pepper", "sea salt", "cumin", "coriander", "paprika", "smoked paprika",
    "turmeric", "chili powder", "curry powder", "garam masala", "five spice", "nutmeg", "cinnamon",
    "clove", "cardamom", "anise", "fennel seed", "fenugreek", "mustard seed",

    # Citrus & Acids
    "lemon", "lime", "orange", "grapefruit", "vinegar", "balsamic vinegar", "apple cider vinegar",
    "rice vinegar", "white vinegar", "tamarind", "pomegranate molasses",

    # Dairy & Alternatives
    "milk", "whole milk", "cream", "heavy cream", "sour cream", "yogurt", "greek yogurt",
    "buttermilk", "cheddar cheese", "mozzarella cheese", "parmesan cheese", "feta cheese",
    "goat cheese", "butter", "ghee",

    # Oils & Fats
    "olive oil", "extra virgin olive oil", "canola oil", "vegetable oil", "coconut oil",
    "sesame oil", "peanut oil", "sunflower oil",

    # Condiments & Sauces
    "soy sauce", "tamari", "fish sauce", "oyster sauce", "worcestershire sauce", "hot sauce",
    "sriracha", "hoisin sauce", "teriyaki sauce", "barbecue sauce", "mustard", "ketchup",
    "mayonnaise", "harissa", "pesto", "chimichurri",

    # Grains, Legumes & Nuts
    "rice", "white rice", "brown rice", "basmati rice", "jasmine rice", "quinoa", "bulgur",
    "couscous", "oats", "barley", "wheat flour", "cornmeal", "tortilla", "pasta", "spaghetti",
    "penne", "macaroni", "lentils", "red lentils", "green lentils", "black beans", "kidney beans",
    "chickpeas", "peanuts", "almonds", "cashews", "walnuts", "pistachios", "sesame seeds",

    # Sweeteners
    "sugar", "brown sugar", "powdered sugar", "honey", "maple syrup", "molasses", "agave syrup",

    # Miscellaneous
    "stock", "chicken stock", "beef stock", "vegetable stock", "broth", "gelatin", "cornstarch",
    "baking powder", "baking soda", "cocoa powder", "vanilla extract", "chocolate chips"
]


# Cuisine-specific cravings (query_text options)
cuisine_queries = {
    "American": ["cheeseburger", "fried chicken", "BBQ ribs", "mac and cheese", "apple pie"],
    "French": ["coq au vin", "ratatouille", "beef bourguignon", "quiche lorraine", "crepes"],
    "German": ["bratwurst", "sauerbraten", "pretzel with sausage", "schnitzel", "potato salad"],
    "Chinese": ["kung pao chicken", "sweet and sour pork", "mapo tofu", "chow mein", "dumplings"],
    "Mexican": ["tacos al pastor", "chicken enchiladas", "beef burritos", "chile relleno", "tamales"],
    "Italian": ["spaghetti carbonara", "margherita pizza", "lasagna", "risotto", "fettuccine alfredo"],
    "Thai": ["pad thai", "green curry", "tom yum soup", "massaman curry", "pineapple fried rice"],
    "Indian": ["butter chicken", "paneer tikka", "biryani", "chole bhature", "dal makhani"],
    "Japanese": ["sushi", "ramen", "teriyaki chicken", "okonomiyaki", "gyoza"],
    "Middle Eastern": ["shawarma", "falafel wrap", "kebab platter", "hummus with pita", "baba ganoush"],
    "Indonesian": ["nasi goreng", "satay chicken", "gado gado", "rendang", "soto ayam"]
}



In [55]:
# Generate random N user inputs
N=3
user_inputs = []
for _ in range(N):
    cuisine = random.choice(cuisine_tags)
    query = random.choice(cuisine_queries[cuisine])
    ingredients_str = ", ".join(random.sample(sample_ingredients, random.randint(4, 8)))

    entry = {
        "diet_tag": random.choice(diet_tags),
        "cuisine_tag": cuisine,
        "max_cooking_time_minutes": random.randint(10, 120),
        "query_text": query,
        "ingredients": ingredients_str
    }
    user_inputs.append(entry)

file_path = "/Users/celinewidjaja/Documents/recipe-reccomender/main_df.csv"

try:
    if os.path.exists(file_path):
        main_df = pd.read_csv(file_path)
        print(f"Loaded existing data from {file_path}")
    else:
        raise FileNotFoundError
except Exception as e:
    print(f"Could not load data: {e}")

    # Define expected columns to create an empty DataFrame
    expected_columns = ["user_id","user_cuisine","user_diet","user_cook_time",
                        "user_food_pref", "user_ingredients", "recommended_rank",
        'title', 'ingredients', 'directions','cooking_time',
                     'cuisine_tag', 'diet_tag', 'cooking_time_mins',
                     'dominant_topic','tag_match', 'topic_name','score'
    ]
    main_df = pd.DataFrame(columns=expected_columns)
    print("Created empty DataFrame with expected columns.")

#######################################################################

if 'main_df' in locals() or 'main_df' in globals():
  print("main_df exists")
  if not main_df.empty:
    i= main_df.user_id.iloc[-1]+1
  else:
    i=0
else:
   main_df = pd.DataFrame()
   i=0



#######################################################################

for user_input in user_inputs:
    top_df = reccomendations(user_input, rc_model, processed_df, recipe_embeddings, lda_model, dictionary)
    top_df = top_df[['title', 'ingredients', 'directions','cooking_time',
                     'cuisine_tag', 'diet_tag', 'cooking_time_mins',
                     'dominant_topic','tag_match', 'topic_name','score']]
    top_df["user_id"]=i
    top_df["user_cuisine"]= user_input["cuisine_tag"]
    top_df["user_diet"]= user_input["diet_tag"]
    top_df["user_cook_time"]= user_input["max_cooking_time_minutes"]
    top_df["user_food_pref"]= user_input["query_text"]
    top_df["user_ingredients"]= user_input["ingredients"]
    # Rank by score (highest = rank 1)
    top_df['recommended_rank'] = top_df['score'].rank(method='first',
                                                      ascending=False).astype(int)
    top_df = top_df.sort_values('recommended_rank')
    i+=1
    main_df = pd.concat([main_df, top_df],axis=0, ignore_index=True)



Loaded existing data from /Users/celinewidjaja/Documents/recipe-reccomender/main_df.csv
main_df exists


Batches: 100%|██████████| 1/1 [00:00<00:00, 29.43it/s]
Preprocessing: 100%|██████████| 60194/60194 [00:17<00:00, 3415.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 108.12it/s]
Preprocessing: 100%|██████████| 60194/60194 [00:17<00:00, 3416.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 166.75it/s]
Preprocessing: 100%|██████████| 60194/60194 [00:17<00:00, 3450.55it/s]


## Metrics Calculation

**Check reccomendation result against the sythentics ground truth(manually created)**



In [None]:
# Compute Precision@K, Recall@K, MRR, MAP, and Mean Average Recall (MAR)
def evaluate_ranking_metrics(main_df, k=5):

    precision_at_k_list = []
    recall_at_k_list = []
    reciprocal_ranks = []
    average_precisions = []
    average_recalls = []

    for user_id, group in main_df.groupby('user_id'):
        # Sort by rank
        group_sorted = group.sort_values('recommended_rank')
        relevances = group_sorted['is_relevant'].values

        total_relevant = relevances.sum()
        if total_relevant == 0:
            continue

        # Precision@K
        top_k = relevances[:k]
        precision_at_k = top_k.sum() / k
        precision_at_k_list.append(precision_at_k)

        # Recall@K
        recall_at_k = top_k.sum() / total_relevant
        recall_at_k_list.append(recall_at_k)

        # Reciprocal Rank for MRR
        relevant_indices = (relevances == 1).nonzero()[0]
        rr = 1 / (relevant_indices[0] + 1) if len(relevant_indices) > 0 else 0
        reciprocal_ranks.append(rr)

        # Average Precision for MAP
        num_relevant_found = 0
        precisions = []
        for i, rel in enumerate(relevances, start=1):
            if rel == 1:
                num_relevant_found += 1
                precisions.append(num_relevant_found / i)
        ap = sum(precisions) / total_relevant if total_relevant > 0 else 0
        average_precisions.append(ap)

        # Average Recall for MAR
        num_relevant_found = 0
        recalls = []
        for i, rel in enumerate(relevances, start=1):
            if rel == 1:
                num_relevant_found += 1
                recalls.append(num_relevant_found / total_relevant)
        ar = np.mean(recalls) if recalls else 0
        average_recalls.append(ar)

    results = {
        f'Precision@{k}': np.mean(precision_at_k_list) if precision_at_k_list else 0,
        f'Recall@{k}': np.mean(recall_at_k_list) if recall_at_k_list else 0,
        'MRR (Mean Relevance Rank)': np.mean(reciprocal_ranks) if reciprocal_ranks else 0,
        'MAP(Mean Average precision)': np.mean(average_precisions) if average_precisions else 0,
        'MAR (Mean average recall)': np.mean(average_recalls) if average_recalls else 0
    }
    return results


Read the manually tagged (ground truth) dataset

In [59]:
file_path = "/Users/celinewidjaja/Documents/recipe-reccomender/ground_truth_tagged.csv"
main_df = pd.read_csv(file_path)

In [60]:
metrics = evaluate_ranking_metrics(main_df, k=10)
print(f"=========================================")
for metric, value in metrics.items():
    print(f"{metric}: {np.round(value, 3)}")

Precision@10: 0.688
Recall@10: 0.453
MRR (Mean Relevance Rank): 0.779
MAP(Mean Average precision): 0.722
MAR (Mean average recall): 0.557


**Mean Average Precision**

*   Precision at k is calculated at each rank position where a relevant recipe appears.
*   Average Precision (AP) for a single user is the mean of these precisions across all relevant recipes for that user.
*   Mean Average Precision (MAP) is the mean of the AP values across all users.

**Mean Average Recall**

Mean Average Recall (MAR) is the mean of the average recall values computed for each user_id in the dataset.

## ===================== Part 5 - Shopping List Aggregator =====================

Create Cleaned Ingredient List

In [61]:
# Import previous reccomended instance
recipes = pd.read_csv('/Users/celinewidjaja/Documents/top_21_recommended_recipes.csv')

Clean Shopping List

In [62]:
# Load Spacy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "lemmatizer"])
stop_words = set(stopwords.words("english"))

# Essential constants that will help with data cleaning
instruction_words = {
    "add", "chopped", "sliced", "diced", "minced", "crushed", "ground", "grated", 
    "packed", "frozen", "fresh", "large", "small", "medium", "optional", "about",
    "garnish", "serve", "cooked", "raw", "whole", "halved", "pieces", "chunks"
}

unit_conversions = {
    'pound': 16, 'lb': 16, 'ounce': 1, 'oz': 1,
    'gram': 0.035, 'g': 0.035, 'kg': 35.27,
    'tablespoon': 3, 'tbsp': 3, 'teaspoon': 1, 'tsp': 1
}

unit_mapping = {
    "c": "cup", "c.": "cup", "cups": "cup", "tbsp": "tablespoon", "tbsps": "tablespoon",
    "tsp": "teaspoon", "tsps": "teaspoon", "lbs": "pound", "pounds": "pound",
    "oz": "ounce", "ounces": "ounce", "g": "gram", "grams": "gram", "lb": "pound"
}

non_ingredients = {
    'amount', 'taste', 'cook', 'bake', 'mix', 'serve', 'dish', 'recipe', 'food',
    'sauce', 'water', 'oil', 'salt', 'pepper', 'season', 'flavor', 'time'
}

# Define function to clean code
def clean_text(text):
    text = unidecode.unidecode(str(text))
    text = re.sub(r"\([^)]*\)", "", text)
    for word in instruction_words:
        text = re.sub(rf"\b{word}\b", "", text, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", text).strip()

# Define function to extract quantity unit
def extract_quantity_unit(text):
    # Extract quantity
    match = re.match(r"^\s*(\d+\s\d+/\d+|\d+/\d+|\d+)", text)
    if match:
        try:
            qty = float(sum(Fraction(part) for part in match.group(1).split()))
        except:
            qty = 1.0
        text = text[match.end():].strip()
    else:
        qty = 1.0
    
    # Extract unit
    unit = "unit"
    for u in unit_mapping:
        if re.search(rf"\b{u}\b", text, flags=re.IGNORECASE):
            unit = unit_mapping[u]
            text = re.sub(rf"\b{u}\b", "", text, flags=re.IGNORECASE).strip()
            break
    
    return qty, unit, text

# Define function to process ingredient name
def process_ingredient_name(text):
    if SPELLCHECK:
        tokens = word_tokenize(text.lower())
        tokens = [spell_checker.correction(token) if token in spell_checker else token for token in tokens]
    else:
        tokens = word_tokenize(text.lower())
    
    filtered = [t for t in tokens if t.isalpha() and t not in stop_words and t not in non_ingredients]
    name = " ".join(filtered).strip()
    
    # Singularize
    if name.endswith("s") and not name.endswith("ss"):
        name = name[:-1]
    
    return name

# Define function to parse ingredients
def parse_ingredients(ingredients_str):
    if pd.isna(ingredients_str):
        return []
    if isinstance(ingredients_str, list):
        return ingredients_str
    try:
        parsed = ast.literal_eval(ingredients_str)
        if isinstance(parsed, list):
            return parsed
    except:
        pass
    return [item.strip() for item in str(ingredients_str).split(',')]

# Define function to process ingredients and cseparate ingredients, units, and quantity
def process_ingredients_from_dataframe(df, ingredients_column='ingredients'):
    all_details = []
    
    for _, row in df.iterrows():
        ingredients = parse_ingredients(row[ingredients_column])
        for ingredient in ingredients:
            if not str(ingredient).strip():
                continue
                
            original = str(ingredient).strip()
            cleaned = clean_text(original)
            qty, original_unit, rest = extract_quantity_unit(original)
            final_name = process_ingredient_name(cleaned)
            
            if not final_name:
                continue
            
            # Convert units
            unit = original_unit
            if original_unit in unit_conversions:
                qty *= unit_conversions[original_unit]
                unit = "ounce" if original_unit in {'pound', 'lb', 'ounce', 'oz', 'gram', 'g', 'kg'} else "teaspoon"
            
            all_details.append({
                'ingredient': final_name,
                'unit': unit,
                'quantity': qty,
                'ingredient_cleaned': cleaned,
                'unit_in_text': original_unit
            })
    
    # Aggregate
    details_df = pd.DataFrame(all_details)
    result = details_df.groupby(['ingredient', 'unit']).agg({
        'quantity': 'sum',
        'ingredient_cleaned': 'first',
        'unit_in_text': 'first'
    }).reset_index()
    
    recipe_counts = details_df.groupby(['ingredient', 'unit']).size().reset_index(name='used_in_recipes')
    result = result.merge(recipe_counts, on=['ingredient', 'unit'])
    
    # Final formatting
    result['total_quantity'] = result['quantity'].round(2)
    result = result.sort_values('total_quantity', ascending=False).reset_index(drop=True)
    result['_id'] = range(len(result))
    
    return result[['_id', 'ingredient', 'unit', 'total_quantity', 'used_in_recipes', 'ingredient_cleaned', 'unit_in_text']]

# Define function to process from file
def process_from_file(file_path, ingredients_column='ingredients'):
    if file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)
    return process_ingredients_from_dataframe(df, ingredients_column)

# Identify input and output paths
if __name__ == "__main__":
    # Your file paths
    input_file = '/Users/celinewidjaja/Documents/recipe-reccomender/processed_recipes.csv'
    output_file = '/Users/celinewidjaja/Documents/recipe-reccomender/ingredient_shoppinglist.csv'
    
    result_df = process_from_file(input_file, 'ingredients')
    
    # Save to CSV
    result_df.to_csv(output_file, index=False)
    print(result_df.head(10)[['ingredient', 'unit', 'total_quantity']].to_string(index=False))

        ingredient  unit  total_quantity
                ml  unit        48267.00
          teaspoon  unit        27929.12
           ml milk  unit        27114.00
               egg  unit        25877.50
           lb beef ounce        23634.88
         lb carrot ounce        13016.00
ml vegetable stock  unit         9920.00
         cup sugar  unit         9426.06
      garlic clove  unit         8813.75
          lb onion ounce         8400.00


In [None]:
# Rename result df to shopping list
result_df = shopping_list

# Load top 21 recommended titles from df_filtered
top_21 = top_df.sort_values('score', ascending=False).head(21)

# Convert to list
recommended_titles = top_21["title"].tolist()

In [75]:
# Initiate cleaned ingredients set
cleaned_ingredients = set()

# Initiate ingredient units dictionary
ingredient_units = {}

# Build cleaned ingredients and units from shopping list collection
for _, item in shopping_list.iterrows():
    name = item.get("ingredient")
    if not isinstance(name, str):
        continue
    name = name.strip().lower()
    unit = (item.get("unit") or "unit").strip().lower()
    cleaned_ingredients.add(name)
    ingredient_units[name] = unit

In [76]:
# Define function to match ingredients 
def match_ingredient(name: str) -> str:
    name = name.lower()
    for clean in cleaned_ingredients:
        if clean in name or name in clean:
            return clean
    return name

# Define function to normalize ingredients
def normalize_ingredient(raw: str):
    raw = raw.replace(",", "").replace(".", "").strip()
    pattern = r"""^\s*
        (\d+\s\d+/\d+|\d+/\d+|\d+)?       # quantity
        \s*
        (cup|cups|tbsp|tablespoon|tsp|teaspoon|oz|ounce|pound|lb|
         clove|can|slice|handful|stalk|medium|large|small|unit|g|kg|ml|l)?
        \s+
        (.+)$                             # ingredient name
    """
    match = re.match(pattern, raw, re.IGNORECASE | re.VERBOSE)

    if not match:
        return raw.lower(), "unit", 1.0

    qty_str, unit, name = match.groups()
    name = name.strip().lower()
    unit = (unit or "unit").lower()

    try:
        qty = float(sum(Fraction(part) for part in qty_str.split())) if qty_str else 1.0
    except Exception:
        qty = 1.0

    matched_name = match_ingredient(name)
    final_unit = ingredient_units.get(matched_name, unit)

    return matched_name, final_unit, qty

# Define metrics aggregator function
def aggregate_ingredients(titles, recipes_df):
    totals = defaultdict(lambda: {
        "quantity": 0.0,
        "unit": "",
        "raw": [],
        "recipes": set()
    })

    for title in titles:
        recipe_rows = recipes_df[recipes_df['title'] == title]
        if recipe_rows.empty:
            continue
        
        ingredients = recipe_rows.iloc[0]['ingredients']
        
        # Handle string lists
        if isinstance(ingredients, str):
            try:
                import ast
                ingredients = ast.literal_eval(ingredients)
            except:
                ingredients = ingredients.split(',')

        for raw in ingredients:
            name, unit, qty = normalize_ingredient(raw)
            totals[name]["quantity"] += qty
            totals[name]["unit"] = unit
            totals[name]["raw"].append(raw)
            totals[name]["recipes"].add(title)

    return totals

In [79]:
# Aggregate ingredients
result = aggregate_ingredients(recommended_titles, recipes)

## ===================== Part 6 - Recipe Summarizer =====================

In [83]:
# Define device for model loading
DEVICE = 0 if torch.cuda.is_available() else -1  # GPU if available, else CPU

# For FP16
FP16 = torch.cuda.is_available()

# Initialize the abstractive model
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=DEVICE,
    torch_dtype="auto" if FP16 else None
)

Device set to use cpu


Load CSV

In [84]:
# Load csv
csv_path = "/Users/celinewidjaja/Documents/top_21_recommended_recipes.csv"

# Save as df
df = pd.read_csv(csv_path)

# Columns to use
title_col = "title"

# Preview
pd.set_option("display.max_colwidth", None)
df[[title_col]].head(21)

Unnamed: 0,title
0,Vegetable-Lentil Chowder
1,Wild Rice With Mushrooms And Onions
2,Fish Fry Batter
3,Sweet-Sour Carrots(Serves 8 To 10)
4,Sour Cream Cucumbers(Polish)
5,Beer Batter(For Fish Or Onion Rings)
6,Carp-A-La-Shingle
7,Sesame Brown Rice
8,Campers' Hot Chocolate Mix
9,Beef Tips In Mushroom Sauce


In [85]:
# Create concise content for AI analysis 
def prepare_recipe_content_for_summarization(df):

    content_parts = []
    
    # Add recipe details as natural text
    for _, recipe in df.iterrows():
        recipe_text = f"Recipe: {recipe['title']}. "
        
        # Add cuisine information
        if 'cuisine_tag' in recipe and pd.notna(recipe['cuisine_tag']):
            recipe_text += f"Cuisine: {recipe['cuisine_tag']}. "
        
        # Add diet information  
        if 'diet_tag' in recipe and pd.notna(recipe['diet_tag']):
            recipe_text += f"Diet: {recipe['diet_tag']}. "
        
        # Add cooking time (using actual column name)
        if 'cooking_time' in recipe and pd.notna(recipe['cooking_time']):
            recipe_text += f"Cooking time: {recipe['cooking_time']}. "
        
        # Add recipe source information
        if 'source' in recipe and pd.notna(recipe['source']):
            recipe_text += f"Source: {recipe['source']}. "
        
        content_parts.append(recipe_text)
    
    # Combine all recipe information
    full_content = " ".join(content_parts)
    
    # Add contextual information for the AI
    context = f"""
    The following collection represents personalized meal recommendations for a user's weekly meal plan. 
    This selection of {len(df)} recipes has been curated based on the user's dietary preferences, 
    cooking time constraints, and flavor preferences. Analyze the culinary themes, cuisine diversity, 
    cooking complexity balance, and how this collection works as a cohesive weekly meal plan.
    
    Recipe Collection:
    {full_content}
    
    Please provide insights about the culinary themes, cuisine balance, cooking time variety, 
    and how this collection creates an engaging weekly meal experience.
    """
    
    return context

# Generate proper abstractive summary with simpler, clearer prompting
def generate_abstractive_weekly_preview(df):

    try:
        # Prepare simple, clean data
        cuisines = df['cuisine_tag'].value_counts()
        dominant_cuisine = cuisines.index[0] if len(cuisines) > 0 else "International"
        cuisine_count = len(cuisines)
        total_recipes = len(df)
        
        # Simple prompt so AI doesn't get confused
        analysis_prompt = f"""
        This meal plan contains {total_recipes} recipes, with {dominant_cuisine} cuisine being dominant. 
        The collection includes recipes like {', '.join(df['title'].head(4).tolist())}.
        
        Describe what kind of cooking experience this meal plan offers and how the recipes work together as a cohesive weekly plan.
        """
        
        # Generate with conservative parameters
        summary_result = summarizer(
            analysis_prompt,
            max_length=80,
            min_length=40,
            do_sample=True,
            temperature=0.7,
            no_repeat_ngram_size=2
        )
        
        generated_text = summary_result[0]["summary_text"]
        
        # Clean the output 
        sentences = generated_text.split('.')
        clean_sentences = []
        
        for sentence in sentences:
            sentence = sentence.strip()
            # Skip sentences that look like they're from the prompt
            if (len(sentence) > 20 and 
                'meal plan contains' not in sentence.lower() and
                'describe what kind' not in sentence.lower() and
                'recipes like' not in sentence.lower()):
                clean_sentences.append(sentence)
        
        if clean_sentences:
            return '. '.join(clean_sentences) + '.'
        else:
            raise Exception("Generated text was mostly prompt repetition")
        
    except Exception as e:
        print(f"Falling back to template-based summary due to: {e}")
        
        # Add fallback with abstractive language
        cuisines = df['cuisine_tag'].value_counts()
        if len(cuisines) > 0:
            dominant_cuisine = cuisines.index[0]
            cuisine_ratio = cuisines.iloc[0] / len(df)
            
            if cuisine_ratio > 0.8:
                return f"This meal plan creates an immersive {dominant_cuisine} cooking experience, emphasizing traditional flavors through {len(df)} carefully selected recipes that balance authenticity with practical preparation methods for modern home cooking."
            else:
                return f"This internationally diverse collection offers a global culinary journey across {len(cuisines)} cooking traditions, providing varied flavor profiles while maintaining consistent preparation approaches throughout the week."
        else:
            return f"This diverse meal collection offers a balanced cooking experience with {len(df)} recipes designed to provide culinary variety while maintaining practical preparation methods."

# Define function to create clean text-based meal plan summary
def create_user_meal_summary(df):

    # Get the weekly preview
    weekly_preview = df['weekly_preview'].iloc[0] if 'weekly_preview' in df.columns else "A curated collection of recipes tailored to your preferences."
    
    # Cuisine analysis
    if 'cuisine_tag' in df.columns:
        cuisine_counts = df['cuisine_tag'].value_counts()
        top_cuisine = cuisine_counts.index[0] if len(cuisine_counts) > 0 else "International"
        cuisine_summary = f"Features {len(cuisine_counts)} cuisines, predominantly {top_cuisine}"
    else:
        cuisine_summary = "Features diverse international cuisines"
    
    # Diet analysis
    diet_summary = ""
    if 'diet_tag' in df.columns:
        diet_counts = df['diet_tag'].value_counts().head(3)
        if len(diet_counts) > 0:
            diet_list = [f"{diet} ({count})" for diet, count in diet_counts.items()]
            diet_summary = " • ".join(diet_list)
    
    # Cooking time analysis
    time_summary = ""
    if 'cooking_time' in df.columns:
        cooking_times = df['cooking_time'].dropna()
        if len(cooking_times) > 0:
            time_summary = f"{cooking_times.min()} - {cooking_times.max()}"
    
    # Create simple text summary
    summary = f"""
Your Weekly Meal Plan 

We've curated {len(df)} recipes perfectly matched to your preferences!

Our Curation:
"{weekly_preview}"

Plan Overview:
• Cuisines: {cuisine_summary}
• Cooking Time: {time_summary if time_summary else 'Varies'}
• Diet Options: {diet_summary if diet_summary else 'Mixed dietary styles'}

Featured Recipes:
"""
    
    # Add featured recipes
    for i, (_, recipe) in enumerate(df.head(8).iterrows(), 1):
        title = recipe['title']
        cuisine = recipe.get('cuisine_tag', 'International')
        diet = recipe.get('diet_tag', '').split(',')[0].strip()
        cooking_time = recipe.get('cooking_time', 'N/A')
        
        summary += f"{i:2d}. {title}\n"
        summary += f"    🌍 {cuisine} • 🥗 {diet} • ⏰ {cooking_time}\n"
    
    return summary

# Generate abstractive analysis
print(f"Generating abstractive analysis for {len(df)} recommended recipes...")
print("Analyzing recipe patterns, nutritional balance, and culinary themes...")

# Generate truly abstractive weekly preview
weekly_preview = generate_abstractive_weekly_preview(df)
user_summary = create_user_meal_summary(df)

# Save results using actual column names
df['weekly_preview'] = weekly_preview
out_path = "weekly_meal_preview.csv"

# Use available columns for output
output_columns = ['title', 'weekly_preview']
if 'cuisine_tag' in df.columns:
    output_columns.insert(1, 'cuisine_tag')

df[output_columns].to_csv(out_path, index=False)

print(f"Saved → {out_path}")
print("\n" + "🍽️ " + "="*60 + " 🍽️")
print("                    YOUR WEEKLY MEAL PLAN")
print("="*70)

print("\nOUR CURATION:")
print(f'"{weekly_preview}"')

print(f"\nFEATURED RECIPES ({len(df)} total):")
for i, (_, recipe) in enumerate(df.head(8).iterrows(), 1):
    cuisine = recipe.get('cuisine_tag', 'International')
    diet = recipe.get('diet_tag', '').split(',')[0].strip()
    time = recipe.get('cooking_time', 'N/A')
    
    print(f"{i:2d}. {recipe['title']}")
    print(f"    🌍 {cuisine} • 🥗 {diet} • ⏰ {time}")

# Stats summary
if 'cuisine_tag' in df.columns:
    cuisine_counts = df['cuisine_tag'].value_counts()
    print(f"\n📊 CUISINE BREAKDOWN:")
    for cuisine, count in cuisine_counts.items():
        percentage = (count / len(df)) * 100
        bar = "█" * min(int(percentage / 5), 20)  # Visual bar
        print(f"   {cuisine:15} {count:2d} recipes {bar} {percentage:.1f}%")

if 'diet_tag' in df.columns:
    diet_counts = df['diet_tag'].value_counts().head(5)
    print(f"\nDIET DISTRIBUTION:")
    for diet, count in diet_counts.items():
        percentage = (count / len(df)) * 100
        print(f"   {diet:25} {count:2d} recipes ({percentage:.1f}%)")

print("\n" + "="*70)

Generating abstractive analysis for 21 recommended recipes...
Analyzing recipe patterns, nutritional balance, and culinary themes...
Falling back to template-based summary due to: Generated text was mostly prompt repetition
Saved → weekly_meal_preview.csv

                    YOUR WEEKLY MEAL PLAN

OUR CURATION:
"This internationally diverse collection offers a global culinary journey across 6 cooking traditions, providing varied flavor profiles while maintaining consistent preparation approaches throughout the week."

FEATURED RECIPES (21 total):
 1. Vegetable-Lentil Chowder
    🌍 Indian • 🥗 Vegetarian • ⏰ 1 hr 0 min
 2. Wild Rice With Mushrooms And Onions
    🌍 Indian • 🥗 Vegetarian • ⏰ 40 min
 3. Fish Fry Batter
    🌍 Indian • 🥗 Vegetarian • ⏰ 45 min
 4. Sweet-Sour Carrots(Serves 8 To 10)  
    🌍 Indian • 🥗 Vegetarian • ⏰ 30 min
 5. Sour Cream Cucumbers(Polish)  
    🌍 Thai • 🥗 Vegetarian • ⏰ 45 min
 6. Beer Batter(For Fish Or Onion Rings)  
    🌍 Indian • 🥗 Vegetarian • ⏰ 30 min
 7

## **Summarizer Evaluation**

In [None]:
# Define functions for executing summary evaluator
class SummarizationEvaluator:
    def __init__(self):
        self.rouge = Rouge()

    # Generate weekly preview summary for a user's reccomended recipes  
    def generate_weekly_preview(self, user_data):
        if len(user_data) == 0:
            return "No recipes available for meal planning."
        
        try:
            # Extract recipe information
            recipe_titles = user_data['title'].tolist()
            cuisines = user_data['cuisine_tag'].value_counts() if 'cuisine_tag' in user_data.columns else {}
            cooking_times = user_data['cooking_time_mins'].dropna() if 'cooking_time_mins' in user_data.columns else []
            
            # Create input text for summarization
            if cuisines.empty:
                dominant_cuisine = "International"
            else:
                dominant_cuisine = cuisines.index[0]
            
            total_recipes = len(user_data)
            avg_time = cooking_times.mean() if len(cooking_times) > 0 else 30
            
            # Simple input text (not a prompt)
            input_text = f"Weekly meal plan with {total_recipes} recipes. Main cuisine is {dominant_cuisine}. " \
                        f"Featured dishes: {', '.join(recipe_titles[:3])}. " \
                        f"Average cooking time {avg_time:.0f} minutes. " \
                        f"Recipes provide diverse flavors and cooking techniques."
            
            # Try BART summarization
            if torch.backends.mps.is_available():
                device = "mps"
            elif torch.cuda.is_available():
                device = 0
            else:
                device = "cpu"
            
            summarizer = pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                device=device,
                torch_dtype="auto" if device != "cpu" else None
            )
            
            result = summarizer(
                input_text,
                max_length=50,
                min_length=20,
                do_sample=False,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3
            )
            
            generated_text = result[0]["summary_text"]
            
            # Validate output
            if len(generated_text.strip()) < 15 or input_text.lower() in generated_text.lower():
                raise Exception("Generated text was not useful")
            
            return generated_text.strip()
            
        except Exception as e:
            print(f"Falling back to template-based summary: {e}")
            
            # Template fallback
            cuisines_dict = user_data['cuisine_tag'].value_counts().to_dict() if 'cuisine_tag' in user_data.columns else {}
            total_recipes = len(user_data)
            
            if cuisines_dict:
                dominant_cuisine = max(cuisines_dict, key=cuisines_dict.get)
                if cuisines_dict[dominant_cuisine] / total_recipes > 0.7:
                    return f"This {dominant_cuisine} meal plan offers {total_recipes} authentic recipes with traditional flavors and modern preparation methods."
                else:
                    return f"This diverse meal plan spans {len(cuisines_dict)} culinary traditions with {total_recipes} carefully selected recipes."
            else:
                return f"This curated meal collection features {total_recipes} diverse recipes for a balanced weekly cooking experience."
    
    # Create reference summaries for evaluation
    def create_reference_summaries(self, main_df):
        reference_summaries = []
        
        for user_id, group in main_df.groupby('user_id'):
            user_row = group.iloc[0]
            
            # Extract information
            cuisines = group['cuisine_tag'].value_counts() if 'cuisine_tag' in group.columns else pd.Series()
            total_recipes = len(group)
            cooking_times = group['cooking_time_mins'].dropna() if 'cooking_time_mins' in group.columns else pd.Series()
            
            # Create structured reference summary
            if not cuisines.empty:
                dominant_cuisine = cuisines.index[0]
                cuisine_count = len(cuisines)
            else:
                dominant_cuisine = "International"
                cuisine_count = 1
            
            time_range = f"{cooking_times.min():.0f}-{cooking_times.max():.0f}" if len(cooking_times) > 1 else "30"
            
            reference = f"This meal plan features {total_recipes} recipes with {dominant_cuisine} cuisine predominant. " \
                       f"The collection spans {cuisine_count} cooking traditions with preparation times of {time_range} minutes. " \
                       f"Recipes provide variety while maintaining coherent flavor profiles."
            
            reference_summaries.append({
                'user_id': user_id,
                'reference_summary': reference,
                'recipe_count': total_recipes,
                'dominant_cuisine': dominant_cuisine
            })
        
        return pd.DataFrame(reference_summaries)
    
    # Calculate rouge scores
    def calculate_rouge_scores(self, generated_summary, reference_summary):
        try:
            scores = self.rouge.get_scores(generated_summary, reference_summary)
            return {
                'rouge-1-f': scores[0]['rouge-1']['f'],
                'rouge-2-f': scores[0]['rouge-2']['f'],
                'rouge-l-f': scores[0]['rouge-l']['f']
            }
        except Exception as e:
            print(f"ROUGE calculation failed: {e}")
            return {'rouge-1-f': 0.0, 'rouge-2-f': 0.0, 'rouge-l-f': 0.0}
    
    def calculate_bert_scores(self, generated_summaries, reference_summaries):
        """Calculate BERTScore for all summaries."""
        try:
            P, R, F1 = bert_score(generated_summaries, reference_summaries, 
                                lang="en", verbose=False)
            return {
                'bert_precision': P.mean().item(),
                'bert_recall': R.mean().item(),
                'bert_f1': F1.mean().item()
            }
        except Exception as e:
            print(f"BERTScore calculation failed: {e}")
            return {'bert_precision': 0.0, 'bert_recall': 0.0, 'bert_f1': 0.0}
    
    def evaluate_summary_quality(self, main_df):
        """Comprehensive evaluation of summary quality."""
        print("Evaluating summarization quality...")
        
        # Create reference summaries
        reference_df = self.create_reference_summaries(main_df)
        
        evaluation_results = []
        generated_summaries = []
        reference_summaries = []
        
        for _, ref_row in reference_df.iterrows():
            user_id = ref_row['user_id']
            reference_summary = ref_row['reference_summary']
            
            # Get user's recipe data
            user_data = main_df[main_df['user_id'] == user_id]
            
            # Generate summary
            generated_summary = self.generate_weekly_preview(user_data)
            
            # Calculate ROUGE scores
            rouge_scores = self.calculate_rouge_scores(generated_summary, reference_summary)
            
            # Collect for BERTScore
            generated_summaries.append(generated_summary)
            reference_summaries.append(reference_summary)
            
            result = {
                'user_id': user_id,
                'generated_summary': generated_summary,
                'reference_summary': reference_summary,
                **rouge_scores
            }
            evaluation_results.append(result)
        
        # Calculate BERTScore for all summaries
        if generated_summaries and reference_summaries:
            bert_scores = self.calculate_bert_scores(generated_summaries, reference_summaries)
            # Add BERTScore to each result
            for result in evaluation_results:
                result.update(bert_scores)
        
        return pd.DataFrame(evaluation_results)

In [87]:
# Evaluation metrics output
evaluator = SummarizationEvaluator()
results = evaluator.evaluate_summary_quality(main_df)

# Display basic metrics
print("SUMMARIZATION EVALUATION RESULTS")
print("=" * 50)

# Overall averages
print(f"Average ROUGE-1 F1: {results['rouge-1-f'].mean():.4f}")
print(f"Average ROUGE-2 F1: {results['rouge-2-f'].mean():.4f}")
print(f"Average ROUGE-L F1: {results['rouge-l-f'].mean():.4f}")
print(f"Average BERT F1: {results['bert_f1'].mean():.4f}")

# Individual user scores
print(f"\nIndividual User Scores:")
for _, row in results.iterrows():
    print(f"User {row['user_id']:2d}: ROUGE-1={row['rouge-1-f']:.3f}, BERT={row['bert_f1']:.3f}")

# Best and worst performers
best_user = results.loc[results['rouge-1-f'].idxmax()]
worst_user = results.loc[results['rouge-1-f'].idxmin()]
print(f"\n🏆 Best: User {best_user['user_id']} (ROUGE-1: {best_user['rouge-1-f']:.3f})")
print(f"📉 Worst: User {worst_user['user_id']} (ROUGE-1: {worst_user['rouge-1-f']:.3f})")

# Show sample summaries
print(f"\nSample Summaries (User {results.iloc[0]['user_id']}):")
print(f"Generated: {results.iloc[0]['generated_summary']}")
print(f"Reference: {results.iloc[0]['reference_summary']}")

# Save results
results.to_csv('evaluation_results.csv', index=False)

Evaluating summarization quality...


Device set to use mps
Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Device set to use mps
Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Device set to use mps
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Device set to use mps
Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might

SUMMARIZATION EVALUATION RESULTS
Average ROUGE-1 F1: 0.3407
Average ROUGE-2 F1: 0.0967
Average ROUGE-L F1: 0.3016
Average BERT F1: 0.8847

Individual User Scores:
User  0: ROUGE-1=0.440, BERT=0.885
User  1: ROUGE-1=0.286, BERT=0.885
User  2: ROUGE-1=0.286, BERT=0.885
User  3: ROUGE-1=0.440, BERT=0.885
User  4: ROUGE-1=0.292, BERT=0.885
User  5: ROUGE-1=0.269, BERT=0.885
User  6: ROUGE-1=0.154, BERT=0.885
User  7: ROUGE-1=0.213, BERT=0.885
User  8: ROUGE-1=0.440, BERT=0.885
User  9: ROUGE-1=0.286, BERT=0.885
User 10: ROUGE-1=0.409, BERT=0.885
User 11: ROUGE-1=0.255, BERT=0.885
User 12: ROUGE-1=0.264, BERT=0.885
User 13: ROUGE-1=0.440, BERT=0.885
User 14: ROUGE-1=0.409, BERT=0.885
User 15: ROUGE-1=0.160, BERT=0.885
User 16: ROUGE-1=0.440, BERT=0.885
User 17: ROUGE-1=0.440, BERT=0.885
User 18: ROUGE-1=0.440, BERT=0.885
User 19: ROUGE-1=0.346, BERT=0.885
User 20: ROUGE-1=0.435, BERT=0.885
User 21: ROUGE-1=0.280, BERT=0.885
User 22: ROUGE-1=0.298, BERT=0.885
User 23: ROUGE-1=0.440, BERT=0.8

### =====================Part 7 - Flask Interface =====================

#### Due to port clashes, we had to run the code below in a separate file and save the algorithms that power the interface in one combined separate file as well

Saved the code below as recommendations_engine.py

In [88]:
# Imports
import pandas as pd
import numpy as np
import torch
import re
import multiprocessing
from tqdm import tqdm
from sentence_transformers import util
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim.corpora as corpora

# Copied over the preprocessing code
def preprocess(text):

    # Extend NLTK's stopwords with domain-specific cooking terms
    custom_stopwords = set([
        "add", "cook", "heat", "bake", "boil", "fry", "stir", "mix", "remove", "place","pepper","bring","teaspoon","like","top",
        "pan", "pot", "oil", "water", "oven", "serve", "grill", "preheat", "use", "tsp", "salt","inch","two","cup","tbsp","bottom",
        "set", "let", "make", "prepare", "cut", "minutes", "cook", "temperature", "degrees","roll","bowl","one","tablespoon","turn",
        "take","get","hard", "side","put","surface","get","little","slow","dont", "mixture","medium","together","whole"
    ])

    stop_words = set(stopwords.words('english')).union(custom_stopwords)
    lemmatizer = WordNetLemmatizer()
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    tokens = word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stop_words and len(w) > 2
    ]
    return tokens

# Ingredient overlap code
def ingredient_overlap(row_ingredients, user_input):

    recipe_ings = set(str(row_ingredients).lower().split(', '))
    user_ings = set(user_input["ingredients"])
    return len(recipe_ings & user_ings) / max(len(recipe_ings | user_ings), 1)

# Hard cuisine filter
def apply_hard_cuisine_filter(df, cuisine_preference):
    if cuisine_preference and cuisine_preference.strip():
        # First filter by cuisine, then apply similarity ranking
        cuisine_filtered = df[df['cuisine_tag'].str.lower() == cuisine_preference.lower()]
        print(f"🔒 Hard cuisine filter: {len(cuisine_filtered)} {cuisine_preference} recipes available")
        
        # If we get too few results, show warning but proceed
        if len(cuisine_filtered) < 5:
            print(f"⚠️  Warning: Only {len(cuisine_filtered)} {cuisine_preference} recipes found")
            if len(cuisine_filtered) == 0:
                print(f"❌ No {cuisine_preference} recipes available, showing all cuisines")
                return df
        
        return cuisine_filtered
    return df

# Grab related cuisines for fallback
def get_related_cuisines(requested_cuisine):
    cuisine_families = {
        'Italian': ['Italian', 'Mediterranean', 'French'],
        'Indian': ['Indian', 'Middle Eastern', 'Thai'],
        'Chinese': ['Chinese', 'Japanese', 'Thai'],
        'Mexican': ['Mexican', 'American'],
        'Thai': ['Thai', 'Chinese', 'Indian'],
        'French': ['French', 'Italian', 'Mediterranean'],
        'Japanese': ['Japanese', 'Chinese', 'Thai'],
        'American': ['American', 'Mexican'],
        'German': ['German', 'French', 'American'],
        'Indonesian': ['Indonesian', 'Thai', 'Indian'],
        'Mediterranean': ['Mediterranean', 'Italian', 'French'],
        'Middle Eastern': ['Middle Eastern', 'Indian', 'Mediterranean']
    }
    
    return cuisine_families.get(requested_cuisine, [requested_cuisine])

# Copied over reccommendations function as well
def reccomendations(user_input, rc_model, df, recipe_embeddings, lda_model):
    
    print(f"\n RECOMMENDATION ENGINE DEBUG:")
    print(f"   Original dataset size: {len(df)}")
    print(f"   Requested cuisine: '{user_input.get('cuisine_tag', 'Any')}'")
    print(f"   Requested diet: '{user_input.get('diet_tag', 'Any')}'")
    print(f"   Max cooking time: {user_input.get('max_cooking_time_minutes', 60)} minutes")
    
    # Ensure user_embedding is on the same device as recipe_embeddings
    device = recipe_embeddings.device

    # Encode User Query with Sentence-BERT and move to the same device
    user_embedding = rc_model.encode(user_input["query_text"], convert_to_tensor=True)

    # Computes cosine similarity between the user query and each recipe
    # Ensure both tensors are on CPU for compatibility
    user_embedding_cpu = user_embedding.cpu()
    recipe_embeddings_cpu = recipe_embeddings.cpu()
    cos_sim = util.cos_sim(user_embedding_cpu, recipe_embeddings_cpu).cpu().numpy().flatten()

    # Calculate ingredient overlap
    df['ingredient_overlap'] = df['ingredients'].apply(lambda x: ingredient_overlap(x, user_input))

    # ===== HARD CUISINE FILTERING FIRST (NEW!) =====
    df_filtered = df.copy()
    requested_cuisine = user_input.get('cuisine_tag', '').strip()
    
    if requested_cuisine:
        # Step 1: Try exact cuisine match
        df_filtered = apply_hard_cuisine_filter(df_filtered, requested_cuisine)
        
        # Step 2: If too few results, try related cuisines
        if len(df_filtered) < 3:
            print(f" Expanding to related cuisines...")
            related_cuisines = get_related_cuisines(requested_cuisine)
            df_filtered = df[df['cuisine_tag'].isin(related_cuisines)]
            print(f" Using related cuisines {related_cuisines}: {len(df_filtered)} recipes")
        
        print(f" Cuisine filtering result: {len(df_filtered)} recipes")
    
    # Filter by cooking time
    time_before = len(df_filtered)
    df_filtered = df_filtered[df_filtered['cooking_time_mins'] <= user_input['max_cooking_time_minutes']].copy()
    print(f" Time filtering: {time_before} → {len(df_filtered)} recipes")

    # STRICT DIET FILTERING
    user_diet = user_input.get('diet_tag', '').strip()
    if user_diet:
        print(f" Applying strict {user_diet} filter")
        before_count = len(df_filtered)
        
        df_filtered = df_filtered[df_filtered['diet_tag'].str.lower() == user_diet.lower()].copy()
        
        after_count = len(df_filtered)
        print(f" Diet filtering: {before_count} → {after_count} recipes for {user_diet}")
        
        if len(df_filtered) == 0:
            print(" No recipes found for this diet combination, showing all recipes")
            df_filtered = df[df['cooking_time_mins'] <= user_input['max_cooking_time_minutes']].copy()

    # Calculate Tag Match (now just for diet since cuisine is hard-filtered)
    df_filtered['tag_match'] = (
        (df_filtered['diet_tag'].str.lower() == user_input.get('diet_tag', '').lower()).astype(int)
    )

    num_cores = multiprocessing.cpu_count()

    # Preprocess text for topic modeling
    tokens_list = [preprocess(text) for text in tqdm(df_filtered['full_text'], desc="Preprocessing")]
    df_filtered['tokens'] = tokens_list

    # Create dictionary of tokens
    dictionary = corpora.Dictionary(df_filtered['tokens'])

    # Filter out extreme tokens
    dictionary.filter_extremes(no_below=20, no_above=0.8, keep_n=10000)

    # Calculate Topic Match
    query_tokens = preprocess(user_input['query_text'])
    query_bow = dictionary.doc2bow(query_tokens)
    query_topic_dist = lda_model.get_document_topics(query_bow)
    
    if query_topic_dist:  # Check if topic distribution exists
        query_dominant_topic = max(query_topic_dist, key=lambda x: x[1])[0]
        df_filtered['topic_match'] = (df_filtered['dominant_topic'] == query_dominant_topic).astype(int)
    else:
        df_filtered['topic_match'] = 0  # Default if no topics found

    # BALANCED TIME SCORING
    max_time = user_input['max_cooking_time_minutes']
    
    if max_time >= 180:  # "No time limit" case
        df_filtered['cooking_time_score'] = 0.5  # Neutral score for all
        print(" No time preference - all cooking times treated equally")
    else:
        # Preference for recipes around 90% of max time (sweet spot)
        optimal_time = max_time * 0.9
        time_deviation = abs(df_filtered['cooking_time_mins'] - optimal_time) / max_time
        df_filtered['cooking_time_score'] = 1 - time_deviation
        df_filtered['cooking_time_score'] = df_filtered['cooking_time_score'].clip(0, 1)
        print(f" Optimal cooking time: {optimal_time:.0f} minutes")

    ## Final Score
    # Ensure cos_sim is aligned with df_filtered index
    cos_sim_filtered = cos_sim[df_filtered.index]
    
    # Adjust scoring weights since cuisine is now hard-filtered
    df_filtered['score'] = (
        0.5 * cos_sim_filtered +           # Increased semantic similarity weight
        0.2 * df_filtered['ingredient_overlap'] +
        0.15 * df_filtered['tag_match'] +   # Now just diet match
        0.1 * df_filtered['topic_match'] +
        0.05 * df_filtered['cooking_time_score']  # Reduced since it's already filtered
    )

    # Top 21 Recommendations
    top_21 = df_filtered.sort_values('score', ascending=False).head(21)

    # Final validation
    if requested_cuisine:
        cuisine_matches = sum(1 for _, row in top_21.iterrows() 
                            if row['cuisine_tag'].lower() == requested_cuisine.lower())
        match_rate = cuisine_matches / len(top_21) if len(top_21) > 0 else 0
        print(f"\n FINAL CUISINE ACCURACY:")
        print(f"   Requested: {requested_cuisine}")
        print(f"   Delivered: {cuisine_matches}/{len(top_21)} recipes ({match_rate:.1%})")
        
        if match_rate < 0.8:  # Less than 80% match
            print(f"  Low match rate - consider expanding dataset for {requested_cuisine}")

    return top_21


# Wrapper function for different number of recommendations
def get_recommendations(user_input, rc_model, df, recipe_embeddings, lda_model, num_recommendations=21):

    # Use the exact same algorithm
    result = reccomendations(user_input, rc_model, df, recipe_embeddings, lda_model)
    
    # Just change the number returned
    if num_recommendations != 21:
        result = result.head(num_recommendations)
    
    return result


# For Flask usage - convert to records format
def get_recommendations_as_records(user_input, rc_model, df, recipe_embeddings, lda_model, num_recommendations=21):

    result_df = get_recommendations(user_input, rc_model, df, recipe_embeddings, lda_model, num_recommendations)
    return result_df.to_dict('records')

Load the code below and save it as flask_app.py

In [None]:
# Core imports for the Flask web application
import pandas as pd
import numpy as np
import torch
import re
import socket
import os
from collections import defaultdict
from fractions import Fraction
from sentence_transformers import SentenceTransformer, util
from flask import Flask, render_template_string, request, redirect, url_for, session
from gensim import models

# Base path where all files and models are stored
BASE_PATH = '/Users/celinewidjaja/Documents/recipe-reccomender'

# Initialize Flask app with static folder for images and CSS
app = Flask(__name__, 
           static_folder=os.path.join(BASE_PATH, 'static'),
           static_url_path='/static')

print(f"Flask static folder configured to: {app.static_folder}")

app.secret_key = 'recipe-recommender-secret-key-2025'

# Global variables for ML models and data
flask_model = None                  # SentenceTransformer model for embeddings
flask_embeddings = None             # Pre-computed recipe embeddings
flask_df = None                     # Main recipe dataset
flask_lda_model = None              # LDA topic model
flask_shopping_df = None            # Pre-aggregated shopping list data
temp_full_recommendations = []      # Temporary storage for full recommendation data

def load_flask_models():
    """Load all ML models and datasets needed for the Flask application"""
    global flask_model, flask_embeddings, flask_df, flask_lda_model
    global flask_shopping_df
    
    try:
        print("Loading models for Flask...")
        
        # Force CPU usage to avoid GPU compatibility issues
        flask_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        flask_model = flask_model.cpu() 
        
        # Load pre-computed embeddings
        flask_embeddings = torch.load(os.path.join(BASE_PATH, 'recipe_embeddings.pt'),
                                     map_location=torch.device('cpu'))
        
        # Load main recipe dataset
        flask_df = pd.read_csv(os.path.join(BASE_PATH, 'processed_recipes.csv'))
        
        # Load LDA topic model
        flask_lda_model = models.LdaModel.load(os.path.join(BASE_PATH, 'lda.model'))
        
        # Load pre-aggregated shopping list data
        flask_shopping_df = pd.read_csv(os.path.join(BASE_PATH, 'ingredient_shoppinglist.csv'))
        
        # Debug the shopping list structure
        print(f"Shopping list columns: {list(flask_shopping_df.columns)}")
        print(f"used_in_recipes column info:")
        print(f"   Data type: {flask_shopping_df['used_in_recipes'].dtype}")
        print(f"   Null values: {flask_shopping_df['used_in_recipes'].isnull().sum()}")
        print(f"   Sample values: {flask_shopping_df['used_in_recipes'].head().tolist()}")
        
        # Clean and validate the used_in_recipes column
        if 'used_in_recipes' in flask_shopping_df.columns:
            flask_shopping_df['used_in_recipes'] = flask_shopping_df['used_in_recipes'].fillna('').astype(str)
        else:
            print(f"Warning: 'used_in_recipes' column not found in shopping list data")
            print(f"   Available columns: {list(flask_shopping_df.columns)}")
            # Try to find similar column names
            possible_columns = [col for col in flask_shopping_df.columns if 'recipe' in col.lower()]
            if possible_columns:
                print(f"   Possible recipe columns: {possible_columns}")
        
        return True
        
    except Exception as e:
        print(f"Error loading models: {e}")
        return False

# Generate shopping list by aggregating from pre-cleaned shopping_list.csv
def optimized_flask_aggregate_ingredients(recipe_titles):
    
    print(f"\nOPTIMIZED AGGREGATION: Processing {len(recipe_titles)} recipes")
    print(f"Shopping database contains {len(flask_shopping_df)} ingredient entries")
    
    # Initialize aggregation dictionary
    totals = defaultdict(lambda: {
        "quantity": 0.0,
        "unit": "",
        "raw": [],
        "recipes": set()
    })
    
    # Track processing statistics
    found_recipes = set()
    missing_recipes = set()
    total_ingredients_processed = 0
    
    # Check what recipe columns are available
    recipe_columns = [col for col in flask_shopping_df.columns if 'recipe' in col.lower()]
    print(f"Available recipe-related columns: {recipe_columns}")
    
    # Process each recipe in the meal plan
    for recipe_title in recipe_titles:
        print(f"\nProcessing: '{recipe_title}'")
        
        recipe_ingredients = pd.DataFrame()
        
        # Try multiple approaches to find the recipe
        if 'used_in_recipes' in flask_shopping_df.columns:
            try:
                recipe_mask = flask_shopping_df['used_in_recipes'].astype(str).str.contains(
                    recipe_title, case=False, na=False
                )
                recipe_ingredients = flask_shopping_df[recipe_mask]
                print(f"  Approach 1 (used_in_recipes): Found {len(recipe_ingredients)} entries")
            except Exception as e:
                print(f"  Approach 1 failed: {e}")
        
        # Try exact title match if there's a title column
        if recipe_ingredients.empty and 'title' in flask_shopping_df.columns:
            try:
                recipe_ingredients = flask_shopping_df[
                    flask_shopping_df['title'].astype(str).str.contains(recipe_title, case=False, na=False)
                ]
                print(f"  Approach 2 (title column): Found {len(recipe_ingredients)} entries")
            except Exception as e:
                print(f"  Approach 2 failed: {e}")
        
        # Try any column that might contain recipe names
        if recipe_ingredients.empty:
            for col in recipe_columns:
                if col not in ['used_in_recipes', 'title']:
                    try:
                        recipe_mask = flask_shopping_df[col].astype(str).str.contains(
                            recipe_title, case=False, na=False
                        )
                        recipe_ingredients = flask_shopping_df[recipe_mask]
                        if not recipe_ingredients.empty:
                            print(f"  Approach 3 ({col} column): Found {len(recipe_ingredients)} entries")
                            break
                    except Exception as e:
                        continue
        
        # Fallback to original parsing method if optimized approach fails
        if recipe_ingredients.empty:
            print(f"  Recipe '{recipe_title}' not found in shopping database")
            print(f"  Falling back to original ingredient parsing method")
            missing_recipes.add(recipe_title)
            
            # Use original parsing as fallback
            try:
                recipe_rows = flask_df[flask_df['title'] == recipe_title]
                if not recipe_rows.empty:
                    ingredients_raw = recipe_rows.iloc[0]['ingredients']
                    
                    # Handle string lists
                    if isinstance(ingredients_raw, str):
                        try:
                            import ast
                            ingredients = ast.literal_eval(ingredients_raw)
                        except:
                            ingredients = ingredients_raw.split(',')
                    else:
                        ingredients = ingredients_raw if ingredients_raw else []
                    
                    # Process with enhanced parsing
                    for raw_ingredient in ingredients:
                        if isinstance(raw_ingredient, str) and raw_ingredient.strip():
                            raw = raw_ingredient.replace(",", "").replace(".", "").strip()
                            
                            # Regex pattern to extract quantity, unit, and ingredient name
                            pattern = r"""^\s*
                                (\d+\s\d+/\d+|\d+/\d+|\d+)?                    # Quantity
                                \s*
                                (cup|cups|tbsp|tablespoon|tsp|teaspoon|oz|ounce|pound|lb|
                                clove|can|slice|handful|stalk|medium|large|small|unit|g|kg|ml|l)?  # Unit
                                \s+
                                (.+)$                                          # Ingredient name
                            """
                            
                            import re
                            from fractions import Fraction
                            match = re.match(pattern, raw, re.IGNORECASE | re.VERBOSE)

                            if not match:
                                ingredient_name = raw_ingredient.strip().lower()
                                quantity = 1.0
                                unit = "unit"
                            else:
                                qty_str, unit_match, name = match.groups()
                                ingredient_name = name.strip().lower() if name else raw_ingredient.strip().lower()
                                unit = (unit_match or "unit").lower()

                                # Parse quantity (handle fractions)
                                try:
                                    if qty_str:
                                        quantity = float(sum(Fraction(part) for part in qty_str.split()))
                                    else:
                                        quantity = 1.0
                                except Exception:
                                    quantity = 1.0
                            
                            # Clean ingredient name
                            for word in ['chopped', 'diced', 'minced', 'sliced', 'fresh', 'large', 'medium', 'small']:
                                ingredient_name = ingredient_name.replace(word, '').strip()
                            
                            if ingredient_name and len(ingredient_name) > 2:
                                totals[ingredient_name]["quantity"] += quantity
                                totals[ingredient_name]["unit"] = unit
                                totals[ingredient_name]["raw"].append(raw_ingredient.strip())
                                totals[ingredient_name]["recipes"].add(recipe_title)
                                total_ingredients_processed += 1
                    
                    found_recipes.add(recipe_title)
                    print(f"  Fallback processing successful: {len(ingredients)} ingredients with proper quantities")
                
            except Exception as e:
                print(f"  Fallback processing failed: {e}")
            
            continue
        
        print(f"  Found {len(recipe_ingredients)} pre-processed ingredients")
        found_recipes.add(recipe_title)
        
        # Aggregate each ingredient for this recipe
        for _, ingredient_row in recipe_ingredients.iterrows():
            ingredient_key = str(ingredient_row.get('ingredient_cleaned', '')).strip().lower()
            
            # Skip empty or invalid entries
            if not ingredient_key or ingredient_key == 'nan' or pd.isna(ingredient_key):
                ingredient_key = str(ingredient_row.get('ingredient', '')).strip().lower()
                if not ingredient_key or ingredient_key == 'nan' or pd.isna(ingredient_key):
                    continue
            
            # Get pre-processed data from the shopping database
            quantity = ingredient_row.get('total_quantity', 0)
            unit = str(ingredient_row.get('unit', 'unit')).strip()
            original_ingredient = str(ingredient_row.get('ingredient', ingredient_key))
            
            # Handle different quantity data types
            if pd.isna(quantity) or quantity == '':
                quantity = 1.0
            else:
                try:
                    quantity = float(quantity)
                except (ValueError, TypeError):
                    print(f"     Invalid quantity '{quantity}' for {ingredient_key}, using 1.0")
                    quantity = 1.0
            
            # Handle unit standardization
            if pd.isna(unit) or unit == 'nan':
                unit = 'unit'
            
            # Aggregate the ingredient
            totals[ingredient_key]["quantity"] += quantity
            totals[ingredient_key]["unit"] = unit
            totals[ingredient_key]["raw"].append(original_ingredient)
            totals[ingredient_key]["recipes"].add(recipe_title)
            
            total_ingredients_processed += 1
            print(f"    + {ingredient_key}: {quantity} {unit} (from '{original_ingredient}')")
    
    # Print summary statistics
    print(f"\nAGGREGATION SUMMARY:")
    print(f"  Recipes successfully processed: {len(found_recipes)}")
    print(f"  Recipes not found in shopping DB: {len(missing_recipes)}")
    print(f"  Total ingredient entries processed: {total_ingredients_processed}")
    print(f"  Unique ingredients in final shopping list: {len(totals)}")
    
    if missing_recipes:
        print(f"    Missing recipes: {', '.join(missing_recipes)}")
        print(f"    These recipes used fallback parsing method")
    
    if found_recipes:
        print(f"  Successfully processed: {', '.join(found_recipes)}")
    
    return dict(totals)

# Get recipe recommendations from the ML pipeline with debugging
def flask_get_recommendations(user_input, num_recommendations=12):
    try:
        print(f"\nUSER INPUT DEBUG:")
        print(f"   Diet requested: '{user_input.get('diet_tag', 'None')}'")
        print(f"   Cuisine requested: '{user_input.get('cuisine_tag', 'None')}'")
        print(f"   Max time: {user_input.get('max_cooking_time_minutes', 'None')} minutes")
        print(f"   Query text: '{user_input.get('query_text', 'None')}'")
        print(f"   Ingredients: {user_input.get('ingredients', [])}")
        
        # Show dataset cuisine distribution
        print(f"\nDATASET CUISINE DISTRIBUTION:")
        cuisine_counts = flask_df['cuisine_tag'].value_counts()
        for cuisine, count in cuisine_counts.head(10).items():
            print(f"   {cuisine}: {count} recipes available")
        
        # Call the main recommendation engine
        recommendations = get_recommendations_as_records(
            user_input, 
            flask_model, 
            flask_df, 
            flask_embeddings, 
            flask_lda_model, 
            num_recommendations
        )
        
        print(f"\nRESULTS DEBUG:")
        print(f"   Generated {len(recommendations)} recommendations")
        
        if recommendations:
            # Analyze cuisine distribution in results
            print(f"   Cuisine distribution in results:")
            result_cuisines = {}
            for i, rec in enumerate(recommendations):
                cuisine = rec.get('cuisine_tag', 'Unknown')
                result_cuisines[cuisine] = result_cuisines.get(cuisine, 0) + 1
                print(f"     {i+1}. {cuisine} - '{rec.get('title', 'No title')}' (Score: {rec.get('score', 0):.3f})")
            
            print(f"\nCUISINE SUMMARY:")
            for cuisine, count in result_cuisines.items():
                print(f"   {cuisine}: {count} recipes")
            
            # Check if requested cuisine is being honored
            requested_cuisine = user_input.get('cuisine_tag', '')
            if requested_cuisine:
                matching_count = result_cuisines.get(requested_cuisine, 0)
                print(f"\n CUISINE REQUEST CHECK:")
                print(f"   Requested: {requested_cuisine}")
                print(f"   Got {matching_count} out of {len(recommendations)} recommendations")
                if matching_count < len(recommendations) * 0.7:
                    print(f"  WARNING: Low cuisine match rate!")
        
        return recommendations
        
    except Exception as e:
        print(f" Error: {e}")
        import traceback
        traceback.print_exc()
        return []

# Generate weekly meal preview summary using BART or fallback template
def generate_weekly_preview(recipe_titles):
    if not recipe_titles:
        return "No recipes selected for preview."
    
    try:
        # Create DataFrame from recipe titles for analysis
        preview_data = []
        for title in recipe_titles:
            recipe_row = flask_df[flask_df['title'] == title]
            if not recipe_row.empty:
                preview_data.append({
                    'title': title,
                    'cuisine_tag': recipe_row.iloc[0]['cuisine_tag'],
                    'diet_tag': recipe_row.iloc[0]['diet_tag'],
                    'cooking_time_mins': recipe_row.iloc[0]['cooking_time_mins']
                })
        
        if not preview_data:
            return "A curated collection of recipes tailored to your preferences."
        
        temp_df = pd.DataFrame(preview_data)
        
        # Prepare analysis prompt for BART model
        cuisines = temp_df['cuisine_tag'].value_counts()
        dominant_cuisine = cuisines.index[0] if len(cuisines) > 0 else "International"
        total_recipes = len(temp_df)
        
        analysis_prompt = f"""
        This meal plan contains {total_recipes} recipes, with {dominant_cuisine} cuisine being dominant. 
        The collection includes recipes like {', '.join(temp_df['title'].head(4).tolist())}.
        
        Describe what kind of cooking experience this meal plan offers and how the recipes work together as a cohesive weekly plan.
        """
        
        # Try to use BART summarization model
        from transformers import pipeline
        import torch
        
        # Initialize summarizer
        device = 0 if torch.cuda.is_available() else -1
        summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=device,
            torch_dtype="auto" if torch.cuda.is_available() else None
        )
        
        # Generate summary
        summary_result = summarizer(
            analysis_prompt,
            max_length=80,
            min_length=40,
            do_sample=True,
            temperature=0.7,
            no_repeat_ngram_size=2
        )
        
        generated_text = summary_result[0]["summary_text"]
        
        # Clean the output to remove prompt remnants
        sentences = generated_text.split('.')
        clean_sentences = []
        
        for sentence in sentences:
            sentence = sentence.strip()
            # Skip sentences that look like they're from the prompt
            if (len(sentence) > 20 and 
                'meal plan contains' not in sentence.lower() and
                'describe what kind' not in sentence.lower() and
                'recipes like' not in sentence.lower()):
                clean_sentences.append(sentence)
        
        if clean_sentences:
            return '. '.join(clean_sentences) + '.'
        else:
            raise Exception("Generated text was mostly prompt repetition")
        
    except Exception as e:
        print(f"Falling back to template-based summary due to: {e}")
        
        # Fallback template-based summary generation
        cuisines = {}
        cooking_times = []
        
        for title in recipe_titles:
            recipe_row = flask_df[flask_df['title'] == title]
            if not recipe_row.empty:
                cuisine = recipe_row.iloc[0]['cuisine_tag']
                time_mins = recipe_row.iloc[0]['cooking_time_mins']
                
                cuisines[cuisine] = cuisines.get(cuisine, 0) + 1
                cooking_times.append(time_mins)
        
        if cuisines:
            dominant_cuisine = max(cuisines, key=cuisines.get)
            cuisine_ratio = cuisines[dominant_cuisine] / len(recipe_titles)
            
            if cuisine_ratio > 0.8:
                return f"This meal plan creates an immersive {dominant_cuisine} cooking experience, emphasizing traditional flavors through {len(recipe_titles)} carefully selected recipes that balance authenticity with practical preparation methods for modern home cooking."
            else:
                return f"This internationally diverse collection offers a global culinary journey across {len(cuisines)} cooking traditions, providing varied flavor profiles while maintaining consistent preparation approaches throughout the week."
        else:
            return f"This diverse meal collection offers a balanced cooking experience with {len(recipe_titles)} recipes designed to provide culinary variety while maintaining practical preparation methods."

# Clean up Unicode escape sequences in recipe text for proper display
def clean_unicode_text(text):
    if not isinstance(text, str):
        return text
    
    # Replace common Unicode issues found in recipe data
    text = text.replace('\\u00b0', '°') 
    text = text.replace('\\u2013', '–')  
    text = text.replace('\\u2014', '—') 
    text = text.replace('\\u2019', "'") 
    text = text.replace('\\u201c', '"')  
    text = text.replace('\\u201d', '"')  
    text = text.replace('\\u00bd', '½')  
    text = text.replace('\\u00bc', '¼') 
    text = text.replace('\\u00be', '¾') 
    text = text.replace('\\u2153', '⅓')  
    text = text.replace('\\u2154', '⅔')  
    
    return text

# Get flag emoji and food icon for each cuisine type
def get_cuisine_flag_and_icon(cuisine_tag):
    cuisine_data = {
        'Italian': {'flag': '🇮🇹', 'icon': 'fas fa-pizza-slice'},
        'French': {'flag': '🇫🇷', 'icon': 'fas fa-wine-glass-alt'},
        'Chinese': {'flag': '🇨🇳', 'icon': 'fas fa-drumstick-bite'},
        'Indian': {'flag': '🇮🇳', 'icon': 'fas fa-pepper-hot'},
        'Mexican': {'flag': '🇲🇽', 'icon': 'fas fa-pepper-hot'},
        'American': {'flag': '🇺🇸', 'icon': 'fas fa-hamburger'},
        'Thai': {'flag': '🇹🇭', 'icon': 'fas fa-leaf'},
        'Middle Eastern': {'flag': '🌍', 'icon': 'fas fa-bread-slice'},
        'Japanese': {'flag': '🇯🇵', 'icon': 'fas fa-fish'},
        'German': {'flag': '🇩🇪', 'icon': 'fas fa-beer'},
        'Indonesian': {'flag': '🇮🇩', 'icon': 'fas fa-seedling'},
        'Mediterranean': {'flag': '🌊', 'icon': 'fas fa-olive-branch'}
    }
    
    return cuisine_data.get(cuisine_tag, {'flag': '🍽️', 'icon': 'fas fa-utensils'})

# Get distinct color scheme for each cuisine type for UI consistency
def get_cuisine_color(cuisine_tag):
    cuisine_colors = {
        'Italian': '#c8102e',   
        'French': '#002654',       
        'Chinese': '#ff6b35',      
        'Indian': '#ff9933',      
        'Mexican': '#006341',      
        'American': '#1f4788',   
        'Thai': '#9b59b6',        
        'Middle Eastern': '#8b4513', 
        'Japanese': '#d63384',   
        'German': '#28a745',     
        'Indonesian': '#fd7e14',  
        'Mediterranean': '#20c997' 
    }
    
    return cuisine_colors.get(cuisine_tag, '#6c757d')

# Remove 'C ' prefix and other unwanted prefixes from ingredient names
def clean_ingredient_name(ingredient_name):
    if not isinstance(ingredient_name, str):
        return str(ingredient_name)
    
    # Remove "C " prefix (case insensitive)
    cleaned = ingredient_name.strip()
    if cleaned.lower().startswith('c '):
        cleaned = cleaned[2:].strip()
    
    # Remove other common prefixes that might appear
    prefixes_to_remove = ['cup ', 'cups ', 'tbsp ', 'tsp ']
    for prefix in prefixes_to_remove:
        if cleaned.lower().startswith(prefix):
            cleaned = cleaned[len(prefix):].strip()
            break
    
    # Capitalize first letter for display
    return cleaned.capitalize() if cleaned else ingredient_name

# HTML Templates with styling
BASE_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Culinary Collection</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://fonts.googleapis.com/css2?family=Playfair+Display:ital,wght@0,400;0,500;0,600;0,700;1,400&family=Source+Sans+Pro:wght@300;400;500;600&display=swap" rel="stylesheet">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
    <style>
        :root {
            --cb-white: #ffffff;
            --cb-cream: #faf8f5;
            --cb-warm-gray: #f5f4f2;
            --cb-medium-gray: #8b8680;
            --cb-charcoal: #2d2926;
            --cb-sage: #a8b5a0;
            --cb-accent-gold: #d4af37;
        }
        
        body {
            font-family: 'Source Sans Pro', sans-serif;
            background: var(--cb-white);
            color: var(--cb-charcoal);
            line-height: 1.7;
        }
        
        h1, h2, h3, h4, h5, h6 {
            font-family: 'Playfair Display', serif;
            font-weight: 500;
            line-height: 1.3;
        }
        
        .navbar {
            background: var(--cb-white) !important;
            border-bottom: 1px solid var(--cb-warm-gray);
            padding: 1.5rem 0;
        }
        
        .navbar-brand {
            font-family: 'Playfair Display', serif;
            color: var(--cb-charcoal) !important;
            font-weight: 600;
            font-size: 1.8rem;
        }
        
        .nav-link {
            color: var(--cb-charcoal) !important;
            font-weight: 400;
            text-transform: uppercase;
            letter-spacing: 1px;
            font-size: 0.9rem;
            margin: 0 1rem;
        }
        
        .hero-section {
            background: linear-gradient(135deg, var(--cb-warm-gray) 0%, var(--cb-cream) 100%);
            padding: 6rem 0 4rem;
        }
        
        .hero-content {
            background: rgba(255,255,255,0.95);
            padding: 3rem;
            border-radius: 2px;
            box-shadow: 0 20px 60px rgba(0,0,0,0.1);
        }
        
        .hero-title {
            font-size: 3rem;
            line-height: 1.2;
            color: var(--cb-charcoal);
            margin-bottom: 1.5rem;
            font-style: italic;
        }
        
        .btn-primary {
            background: var(--cb-charcoal);
            border: 2px solid var(--cb-charcoal);
            color: var(--cb-white);
            padding: 0.8rem 2.5rem;
            font-weight: 500;
            text-transform: uppercase;
            letter-spacing: 1px;
            border-radius: 0;
        }
        
        .btn-primary:hover {
            background: transparent;
            color: var(--cb-charcoal);
            border-color: var(--cb-charcoal);
        }
        
        .recipe-card {
            background: var(--cb-white);
            border: none;
            transition: all 0.4s ease;
            margin-bottom: 2rem;
            overflow: hidden;
            box-shadow: 0 8px 30px rgba(0,0,0,0.08);
            border-radius: 0;
        }
        
        .recipe-card:hover {
            transform: translateY(-5px);
            box-shadow: 0 15px 50px rgba(0,0,0,0.12);
        }
        
        .cuisine-badge {
            position: absolute;
            top: 1rem;
            left: 1rem;
            color: white;
            padding: 0.5rem 1rem;
            font-size: 0.85rem;
            font-weight: 600;
            text-transform: uppercase;
            letter-spacing: 1px;
            border-radius: 20px;
            text-shadow: 0 1px 2px rgba(0,0,0,0.3);
        }
        
        .section-title {
            font-family: 'Playfair Display', serif;
            font-size: 2.5rem;
            font-weight: 400;
            color: var(--cb-charcoal);
            margin-bottom: 1rem;
            font-style: italic;
            text-align: center;
        }
        
        .form-label {
            font-family: 'Playfair Display', serif;
            font-size: 1.1rem;
            color: var(--cb-charcoal);
            margin-bottom: 0.8rem;
            font-weight: 500;
        }
        
        .form-control, .form-select {
            border: 1px solid var(--cb-warm-gray);
            padding: 1rem;
            font-size: 0.95rem;
            color: var(--cb-charcoal);
            background: var(--cb-white);
            transition: border-color 0.3s ease;
            border-radius: 0;
        }
        
        .form-control:focus, .form-select:focus {
            border-color: var(--cb-sage);
            box-shadow: 0 0 0 0.2rem rgba(168, 181, 160, 0.15);
        }
        
        .cuisine-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 1rem;
            margin-bottom: 2rem;
        }
        
        .cuisine-option {
            background: white;
            border: 2px solid var(--cb-warm-gray);
            padding: 1.5rem;
            text-align: center;
            cursor: pointer;
            transition: all 0.3s ease;
            border-radius: 8px;
        }
        
        .cuisine-option:hover {
            border-color: var(--cb-sage);
            transform: translateY(-2px);
            box-shadow: 0 8px 25px rgba(0,0,0,0.1);
        }
        
        .shopping-item {
            display: flex;
            justify-content: space-between;
            align-items: center;
            padding: 1rem 0;
            border-bottom: 1px solid var(--cb-warm-gray);
            transition: background-color 0.2s ease;
        }
        
        .shopping-item:hover {
            background-color: var(--cb-cream);
            margin: 0 -1rem;
            padding-left: 1rem;
            padding-right: 1rem;
            border-radius: 4px;
        }
        
        .shopping-item:last-child {
            border-bottom: none;
        }
        
        .ingredient-name {
            font-weight: 600;
            font-size: 1.1rem;
            color: var(--cb-charcoal);
        }
        
        .ingredient-details {
            color: var(--cb-medium-gray);
            font-size: 0.9rem;
            margin-top: 0.2rem;
        }
        
        .ingredient-quantity {
            font-weight: 600;
            color: var(--cb-sage);
            font-size: 1.1rem;
        }
        
        .shopping-stats {
            background: linear-gradient(135deg, var(--cb-sage), #c3d69b);
            color: white;
            padding: 1.5rem;
            border-radius: 8px;
            text-align: center;
            margin-bottom: 2rem;
        }
    </style>
</head>
<body>
    <nav class="navbar navbar-expand-lg">
        <div class="container">
            <a class="navbar-brand" href="/">Culinary Collection</a>
            <div class="navbar-nav ms-auto">
                <a class="nav-link" href="/">Home</a>
                <a class="nav-link" href="/preferences">Discover</a>
            </div>
        </div>
    </nav>
    <main>
        {% block content %}{% endblock %}
    </main>
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
    {% block scripts %}{% endblock %}
</body>
</html>
"""

HOME_TEMPLATE = BASE_TEMPLATE.replace('{% block content %}{% endblock %}', """
<div class="hero-section">
    <div class="container">
        <div class="row justify-content-center">
            <div class="col-lg-8">
                <div class="hero-content text-center">
                    <h1 class="hero-title">
                        Discover Your Next<br>
                        <span style="color: var(--cb-sage);">Culinary Adventure</span>
                    </h1>
                    <p class="lead mb-4">
                        Effortless meal planning with AI-powered recipe recommendations from global cuisines, using advanced NLP, topic modeling, and your personal taste preferences
                    </p>
                    <a href="/preferences" class="btn btn-primary">Plan My Meals</a>
                </div>
            </div>
        </div>
    </div>
</div>

<div class="container my-5">
    <div class="row">
        <div class="col-md-4 text-center">
            <div style="width: 80px; height: 80px; background: var(--cb-sage); border-radius: 50%; display: flex; align-items: center; justify-content: center; margin: 0 auto 2rem; color: white; font-size: 1.8rem;">
                <i class="fas fa-brain"></i>
            </div>
            <h5 style="font-family: 'Playfair Display', serif;">Intelligent Curation</h5>
            <p style="color: var(--cb-medium-gray);">Advanced machine learning understands your taste preferences and dietary needs.</p>
        </div>
        <div class="col-md-4 text-center">
            <div style="width: 80px; height: 80px; background: var(--cb-sage); border-radius: 50%; display: flex; align-items: center; justify-content: center; margin: 0 auto 2rem; color: white; font-size: 1.8rem;">
                <i class="fas fa-shopping-cart"></i>
            </div>
            <h5 style="font-family: 'Playfair Display', serif;">Optimized Shopping Lists</h5>
            <p style="color: var(--cb-medium-gray);">Pre-aggregated ingredient database generates shopping lists so grocery shopping becomes stress free.</p>
        </div>
        <div class="col-md-4 text-center">
            <div style="width: 80px; height: 80px; background: var(--cb-sage); border-radius: 50%; display: flex; align-items: center; justify-content: center; margin: 0 auto 2rem; color: white; font-size: 1.8rem;">
                <i class="fas fa-tags"></i>
            </div>
            <h5 style="font-family: 'Playfair Display', serif;">Topic Modeling</h5>
            <p style="color: var(--cb-medium-gray);">LDA topic analysis groups similar recipes for cohesive recommendations.</p>
        </div>
    </div>
</div>
""")

PREFERENCES_TEMPLATE = BASE_TEMPLATE.replace('{% block content %}{% endblock %}', """
<div class="container my-5">
    <div class="row justify-content-center">
        <div class="col-lg-8">
            <div class="text-center mb-5">
                <h2 class="section-title">Personalize Your Experience</h2>
                <div style="width: 60px; height: 1px; background: var(--cb-accent-gold); margin: 2rem auto;"></div>
                <p style="color: var(--cb-medium-gray); font-size: 1.1rem; max-width: 600px; margin: 0 auto;">
                    Tell us about your culinary preferences and dietary needs to receive recommendations tailored specifically for you
                </p>
            </div>
            
            <div style="background: var(--cb-white); padding: 3rem; box-shadow: 0 10px 40px rgba(0,0,0,0.08);">
                <form method="POST">
                    <div class="row">
                        <div class="col-md-6 mb-4">
                            <label class="form-label">Cuisine Preference</label>
                            <select class="form-select" name="cuisine_tag">
                                <option value="">Any Cuisine (Diverse Selection)</option>
                                <option value="Italian">🇮🇹 Italian</option>
                                <option value="Indian">🇮🇳 Indian</option>
                                <option value="Mexican">🇲🇽 Mexican</option>
                                <option value="Chinese">🇨🇳 Chinese</option>
                                <option value="American">🇺🇸 American</option>
                                <option value="French">🇫🇷 French</option>
                                <option value="Thai">🇹🇭 Thai</option>
                                <option value="Japanese">🇯🇵 Japanese</option>
                                <option value="German">🇩🇪 German</option>
                                <option value="Indonesian">🇮🇩 Indonesian</option>
                            </select>
                            <small style="color: var(--cb-medium-gray);">Select "Any Cuisine" for maximum variety across all 10 cuisines</small>
                        </div>
                        <div class="col-md-6 mb-4">
                            <label class="form-label">Dietary Lifestyle</label>
                            <select class="form-select" name="diet_tag">
                                <option value="">Any Diet</option>
                                <option value="Non-Vegetarian">Non-Vegetarian</option>
                                <option value="Vegetarian">Vegetarian</option>
                                <option value="Vegan">Vegan</option>
                                <option value="Pescatarian">Pescatarian</option>
                            </select>
                        </div>
                    </div>
                    
                    <div class="row">
                        <div class="col-md-6 mb-4">
                            <label class="form-label">Number of Recipes</label>
                            <select class="form-select" name="num_recipes">
                                <option value="3">3 recipes (Quick picks)</option>
                                <option value="5" selected>5 recipes (Balanced selection)</option>
                                <option value="7">7 recipes (Weekly variety)</option>
                                <option value="10">10 recipes (Maximum diversity)</option>
                            </select>
                            <small style="color: var(--cb-medium-gray);">More recipes = greater cuisine diversity</small>
                        </div>
                        <div class="col-md-6 mb-4">
                            <label class="form-label">Maximum Cooking Time</label>
                            <input type="range" class="form-range" name="max_cooking_time" 
                                   min="15" max="180" value="60" 
                                   style="accent-color: var(--cb-sage);"
                                   oninput="updateTimeDisplay(this.value)">
                            <div class="d-flex justify-content-between mt-2" style="font-size: 0.85rem; text-transform: uppercase; letter-spacing: 1px; color: var(--cb-medium-gray);">
                                <span>Quick Prep</span>
                                <span id="time-display">60 minutes</span>
                                <span>Leisurely Cooking</span>
                            </div>
                        </div>
                    </div>
                    
                    <div class="mb-4">
                        <label class="form-label">What Inspires You Today?</label>
                        <input type="text" class="form-control" name="query_text" 
                               placeholder="Describe your mood, cravings, or cooking inspiration...">
                        <small style="color: var(--cb-medium-gray);">Examples: "comfort food", "spicy and warm", "light and fresh", "celebratory dinner"</small>
                    </div>
                    
                    <div class="mb-4">
                        <label class="form-label">Available Ingredients (optional)</label>
                        <input type="text" class="form-control" name="ingredients" 
                               placeholder="tomatoes, chicken, garlic (comma-separated)">
                        <small style="color: var(--cb-medium-gray);">List ingredients you already have or want to use</small>
                    </div>
                    
                    <div class="text-center">
                        <button type="submit" class="btn btn-primary">Find My Perfect Recipes</button>
                    </div>
                </form>
            </div>
        </div>
    </div>
</div>
""").replace('{% block scripts %}{% endblock %}', """
<script>
function updateTimeDisplay(value) {
    const hours = Math.floor(value / 60);
    const minutes = value % 60;
    let display = '';
    
    if (hours > 0) {
        display = hours + 'h';
        if (minutes > 0) display += ' ' + minutes + 'm';
    } else {
        display = minutes + ' minutes';
    }
    
    document.getElementById('time-display').textContent = display;
}
</script>
""")

# Flask Routes
@app.route('/')
def index():
    return render_template_string(HOME_TEMPLATE)

@app.route('/preferences', methods=['GET', 'POST'])
def preferences():
    if request.method == 'POST':
        num_recipes = int(request.form.get('num_recipes', 5))
        
        # Build user input dictionary for recommendation engine
        user_input = {
            'diet_tag': request.form.get('diet_tag', ''),
            'cuisine_tag': request.form.get('cuisine_tag', ''),
            'max_cooking_time_minutes': int(request.form.get('max_cooking_time', 60)),
            'query_text': request.form.get('query_text', ''),
            'ingredients': [ing.strip() for ing in request.form.get('ingredients', '').split(',') if ing.strip()],
            'num_recipes': num_recipes
        }
        
        # Clear any existing session data
        session.pop('recommendations', None)
        session.pop('meal_plan', None)
        
        session['user_input'] = user_input
        print(f"Generating {num_recipes} fresh recommendations")
        
        # Generate recommendations using ML pipeline
        recommendations = flask_get_recommendations(user_input, num_recipes)
        
        # Store FULL recommendations in global variable for detailed access
        global temp_full_recommendations
        temp_full_recommendations = recommendations
        
        # Generate AI-powered weekly meal preview summary
        recipe_titles = [rec.get('title', '') for rec in recommendations]
        weekly_preview = generate_weekly_preview(recipe_titles)
        
        # Store minimal recommendation data in session for UI display
        recommendations_lite = []
        for i, rec in enumerate(recommendations):
            recommendations_lite.append({
                'title': rec.get('title', ''),
                'cuisine_tag': rec.get('cuisine_tag', ''),
                'diet_tag': rec.get('diet_tag', ''),
                'cooking_time_mins': rec.get('cooking_time_mins', 30),
                'score': rec.get('score', 0)
            })
        
        session['recommendations'] = recommendations_lite
        session['weekly_preview'] = weekly_preview
        
        return redirect(url_for('recommendations'))
    
    return render_template_string(PREFERENCES_TEMPLATE)

@app.route('/recommendations')
def recommendations():
    user_input = session.get('user_input')
    recommendations = session.get('recommendations', [])
    weekly_preview = session.get('weekly_preview', '')
    
    if not user_input or not recommendations:
        return redirect(url_for('preferences'))
    
    # Build the recommendations page HTML
    rec_html = f"""
    <div class="container my-5">
        <div class="d-flex justify-content-between align-items-center mb-4">
            <div>
                <h2 class="section-title" style="text-align: left; margin-bottom: 0;">Your Recipe Recommendations</h2>
                <p style="color: var(--cb-medium-gray);">Curated from global cuisines using optimized ML pipeline</p>
            </div>
            <a href="/meal_plan" class="btn btn-primary">Create Meal Plan</a>
        </div>
        
        <div class="row">
    """
    
    # Generate individual recipe cards
    for i, rec in enumerate(recommendations):
        cuisine_color = get_cuisine_color(rec.get('cuisine_tag', 'International'))
        cuisine_info = get_cuisine_flag_and_icon(rec.get('cuisine_tag', 'International'))
        
        print(f"Creating card {i} for recipe: {rec.get('title', 'No title')}")
        
        rec_html += f"""
            <div class="col-md-6 col-lg-4 mb-4">
                <div class="recipe-card" style="position: relative; overflow: hidden;">
                    <div style="background: {cuisine_color}; color: white; padding: 0.8rem 1rem; text-align: center; font-size: 0.9rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">
                        {cuisine_info['flag']} {rec.get('cuisine_tag', 'International')}
                    </div>
                    
                    <div style="padding: 1.5rem;">
                        <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem;">
                            <span style="color: var(--cb-medium-gray); font-size: 0.9rem;">
                                <i class="fas fa-clock"></i> {rec.get('cooking_time_mins', 30)} min
                            </span>
                            <span style="color: var(--cb-medium-gray); font-size: 0.85rem;">
                                {rec.get('diet_tag', 'Any').split(',')[0]}
                            </span>
                        </div>
                        
                        <h5 style="font-family: 'Playfair Display', serif; margin-bottom: 1rem; line-height: 1.3; color: var(--cb-charcoal);">
                            {clean_unicode_text(rec['title'])}
                        </h5>
                        
                        <a href="/recipe/{i}" class="btn btn-primary btn-sm w-100" onclick="console.log('Clicked recipe {i}'); return true;">
                            <i class="fas fa-eye"></i> View Recipe
                        </a>
                    </div>
                </div>
            </div>
        """

    rec_html += """
        </div>
        
        <div class="text-center mt-4">
            <a href="/preferences" class="btn" style="border: 1px solid var(--cb-charcoal); color: var(--cb-charcoal); margin-right: 1rem;">
                <i class="fas fa-redo"></i> New Recommendations
            </a>
            <a href="/meal_plan" class="btn btn-primary">
                <i class="fas fa-calendar-plus"></i> Create Meal Plan
            </a>
        </div>
    </div>
    """
    
    return render_template_string(BASE_TEMPLATE.replace('{% block content %}{% endblock %}', rec_html))

@app.route('/recipe/<int:recipe_index>')
def recipe_detail(recipe_index):
    global temp_full_recommendations
    
    print(f"Recipe detail requested for index {recipe_index}")
    print(f"Available recommendations: {len(temp_full_recommendations)}")
    
    if not temp_full_recommendations:
        print("ERROR: No recommendations available!")
        return redirect(url_for('recommendations'))
    
    if recipe_index >= len(temp_full_recommendations):
        print(f"ERROR: Index {recipe_index} out of range!")
        return redirect(url_for('recommendations'))
    
    # Get the specific recipe data
    recipe = temp_full_recommendations[recipe_index]
    print(f"Found recipe: {recipe.get('title', 'No title')}")
    
    # Parse ingredients with Unicode cleaning
    ingredients_raw = recipe.get('ingredients', '')
    ingredients = []
    
    if isinstance(ingredients_raw, str):
        ingredients_raw = clean_unicode_text(ingredients_raw)
        
        if ingredients_raw.startswith('[') and ingredients_raw.endswith(']'):
            clean_str = ingredients_raw.strip('[]')
            ingredients = [clean_unicode_text(item.strip().strip('\'"')) for item in clean_str.split('", "') if item.strip()]
        else:
            ingredients = [clean_unicode_text(item.strip().strip('\'"')) for item in ingredients_raw.split(',') if item.strip()]
    
    # Parse directions with Unicode cleaning
    directions_raw = recipe.get('directions', '')
    directions = []
    
    if isinstance(directions_raw, str):
        directions_raw = clean_unicode_text(directions_raw)
        
        if directions_raw.startswith('[') and directions_raw.endswith(']'):
            clean_str = directions_raw.strip('[]')
            directions = [clean_unicode_text(item.strip().strip('\'"')) for item in clean_str.split('", "') if item.strip()]
        else:
            directions = [clean_unicode_text(item.strip().strip('\'"')) for item in directions_raw.split('.') if item.strip() and len(item.strip()) > 10]
    
    # Filter out empty or very short items
    ingredients = [ing for ing in ingredients if ing and len(ing.strip()) > 2]
    directions = [dir for dir in directions if dir and len(dir.strip()) > 5]
    
    cuisine_color = get_cuisine_color(recipe.get('cuisine_tag', 'International'))
    cuisine_info = get_cuisine_flag_and_icon(recipe.get('cuisine_tag', 'International'))
    
    print(f"Parsed {len(ingredients)} ingredients and {len(directions)} directions")
    
    # Build the detailed recipe page HTML
    detail_html = f"""
    <div class="container my-5">
        <div class="row">
            <div class="col-lg-8 mx-auto">
                <a href="/recommendations" class="btn btn-secondary mb-3">← Back to Recommendations</a>
                
                <div class="recipe-card" style="position: relative; overflow: hidden;">
                    <div style="background: {cuisine_color}; color: white; padding: 1.2rem 2rem; text-align: center; font-size: 1.1rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.8px;">
                        {cuisine_info['flag']} {recipe.get('cuisine_tag', 'International')}
                    </div>
                    
                    <div style="padding: 3rem;">
                        <h2 style="color: var(--cb-charcoal); text-align: center; margin-bottom: 2rem; font-family: 'Playfair Display', serif; line-height: 1.2;">
                            {clean_unicode_text(recipe['title'])}
                        </h2>
                        
                        <div class="row mb-4">
                            <div class="col-md-4">
                                <strong>Cuisine:</strong><br>
                                <span style="color: #8b8680;">{recipe.get('cuisine_tag', 'International')}</span>
                            </div>
                            <div class="col-md-4">
                                <strong>Diet:</strong><br>
                                <span style="color: #8b8680;">{recipe.get('diet_tag', 'Any').split(',')[0]}</span>
                            </div>
                            <div class="col-md-4">
                                <strong>Cooking Time:</strong><br>
                                <span style="color: #8b8680;">{recipe.get('cooking_time_mins', 30)} minutes</span>
                            </div>
                        </div>
                        
                        <hr style="border-color: #f5f4f2; margin: 2rem 0;">
                        
                        <div class="row">
                            <div class="col-md-5">
                                <h3 style="font-family: 'Playfair Display', serif; color: #2d2926; margin-bottom: 1.5rem;">
                                    <i class="fas fa-list-ul"></i> Ingredients
                                </h3>
                                <ul style="list-style: none; padding: 0;">
    """
    
    # Add each ingredient as a list item
    for ingredient in ingredients:
        if ingredient.strip():
            detail_html += f"""
                                    <li style="padding: 0.8rem 0; border-bottom: 1px solid #f5f4f2; display: flex; align-items: flex-start;">
                                        <i class="fas fa-check-circle" style="color: #a8b5a0; margin-right: 1rem; margin-top: 0.2rem; font-size: 0.9rem; flex-shrink: 0;"></i>
                                        <span>{ingredient.strip()}</span>
                                    </li>
            """
    
    detail_html += """
                                </ul>
                            </div>
                            <div class="col-md-7">
                                <h3 style="font-family: 'Playfair Display', serif; color: #2d2926; margin-bottom: 1.5rem;">
                                    <i class="fas fa-clipboard-list"></i> Directions
                                </h3>
                                <div style="padding-left: 0;">
    """
    
    # Add each direction as a numbered step
    for i, direction in enumerate(directions, 1):
        if direction.strip():
            detail_html += f"""
                                    <div style="margin-bottom: 1.5rem; padding: 1rem; background: #faf8f5; border-left: 4px solid #a8b5a0;">
                                        <strong style="color: #a8b5a0;">Step {i}</strong>
                                        <p style="margin: 0.5rem 0 0 0; line-height: 1.7;">{direction.strip()}</p>
                                    </div>
            """
    
    detail_html += f"""
                                </div>
                                
                                <div style="margin-top: 2rem; padding: 1.5rem; background: linear-gradient(135deg, var(--cb-sage), #c3d69b); border-radius: 8px; text-align: center; color: white;">
                                    <i class="fas fa-check-circle" style="font-size: 2rem; margin-bottom: 0.5rem;"></i>
                                    <h4 style="margin: 0; font-family: 'Playfair Display', serif; font-weight: 600;">You're All Done!</h4>
                                    <p style="margin: 0.5rem 0 0 0; opacity: 0.9;">Enjoy your delicious {recipe.get('cuisine_tag', 'International')} creation! 🍽️</p>
                                </div>
                            </div>
                        </div>
                        
                        <hr style="border-color: #f5f4f2; margin: 2rem 0;">
                        
                        <div class="text-center">
                            <form method="POST" action="/add_to_meal_plan" style="display: inline;">
                                <input type="hidden" name="recipe_index" value="{recipe_index}">
                                <button type="submit" class="btn btn-primary me-3">
                                    <i class="fas fa-plus"></i> Add to Meal Plan
                                </button>
                            </form>
                            <a href="/recommendations" class="btn" style="border: 1px solid #2d2926; color: #2d2926;">
                                <i class="fas fa-arrow-left"></i> Browse More Recipes
                            </a>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
    """
    
    print("Recipe detail page HTML generated successfully")
    return render_template_string(BASE_TEMPLATE.replace('{% block content %}{% endblock %}', detail_html))

# Add recipe to meal plan route
@app.route('/add_to_meal_plan', methods=['POST'])
def add_to_meal_plan():
    global temp_full_recommendations
    
    recipe_index = int(request.form.get('recipe_index', 0))
    
    if recipe_index < len(temp_full_recommendations):
        meal_plan = session.get('meal_plan', [])
        recipe = temp_full_recommendations[recipe_index]
        
        # Avoid duplicates by checking recipe titles
        if not any(r['title'] == recipe['title'] for r in meal_plan):
            meal_plan.append(recipe)
            session['meal_plan'] = meal_plan
            print(f"Added '{recipe['title']}' to meal plan (now {len(meal_plan)} recipes)")
    
    return redirect(url_for('meal_plan'))

# Meal plan display route
@app.route('/meal_plan')
def meal_plan():
    meal_plan = session.get('meal_plan', [])
    
    if not meal_plan:
        plan_html = """
        <div class="container my-5">
            <div class="text-center">
                <h2 class="section-title">Your Meal Plan</h2>
                <p style="color: var(--cb-medium-gray); font-size: 1.1rem;">Your meal plan is empty.</p>
                <a href="/recommendations" class="btn btn-primary">Add Some Recipes!</a>
            </div>
        </div>
        """
    else:
        # Analyze cuisine distribution in meal plan
        cuisines = {}
        for recipe in meal_plan:
            cuisine = recipe.get('cuisine_tag', 'Unknown')
            cuisines[cuisine] = cuisines.get(cuisine, 0) + 1
        
        plan_html = f"""
        <div class="container my-5">
            <div class="d-flex justify-content-between align-items-center mb-4">
                <div>
                    <h2 class="section-title" style="text-align: left; margin-bottom: 0;">Your Meal Plan</h2>
                    <p style="color: var(--cb-medium-gray);">{len(meal_plan)} recipes from {len(cuisines)} cuisines</p>
                </div>
                <a href="/shopping_list" class="btn btn-primary">Generate Optimized Shopping List</a>
            </div>
            
            <div class="row">
        """
        
        # Generate compact cards for each recipe in meal plan
        for i, recipe in enumerate(meal_plan):
            cuisine_color = get_cuisine_color(recipe.get('cuisine_tag', 'International'))
            cuisine_info = get_cuisine_flag_and_icon(recipe.get('cuisine_tag', 'International'))
            
            plan_html += f"""
                <div class="col-md-4 col-lg-3 mb-3">
                    <div class="recipe-card" style="position: relative; overflow: hidden;">
                        <div style="background: {cuisine_color}; color: white; padding: 0.6rem 0.8rem; text-align: center; font-size: 0.75rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.3px;">
                            {cuisine_info['flag']} {recipe.get('cuisine_tag', 'International')}
                        </div>
                        
                        <div style="padding: 1rem;">
                            <h6 style="font-family: 'Playfair Display', serif; margin-bottom: 0.5rem; line-height: 1.2; font-size: 0.95rem;">
                                {clean_unicode_text(recipe['title'])}
                            </h6>
                            <p style="color: var(--cb-medium-gray); font-size: 0.8rem; margin: 0;">
                                <i class="fas fa-clock"></i> {recipe.get('cooking_time_mins', 30)} min
                            </p>
                        </div>
                    </div>
                </div>
            """
        
        plan_html += """
            </div>
            <div class="text-center mt-4">
                <a href="/recommendations" class="btn" style="border: 1px solid var(--cb-charcoal); color: var(--cb-charcoal); margin-right: 1rem;">
                    <i class="fas fa-plus"></i> Add More Recipes
                </a>
                <a href="/shopping_list" class="btn btn-primary">
                    <i class="fas fa-shopping-cart"></i> Generate Optimized Shopping List
                </a>
            </div>
        </div>
        """
    
    return render_template_string(BASE_TEMPLATE.replace('{% block content %}{% endblock %}', plan_html))

# Shopping list generation route using optimized pre-aggregated data
@app.route('/shopping_list')
def shopping_list():
    meal_plan = session.get('meal_plan', [])
    if not meal_plan:
        return redirect(url_for('meal_plan'))
    
    recipe_titles = [recipe['title'] for recipe in meal_plan]
    print(f"\nGENERATING OPTIMIZED SHOPPING LIST")
    print(f"Processing meal plan with {len(recipe_titles)} recipes")
    
    shopping_items = optimized_flask_aggregate_ingredients(recipe_titles)
    
    shop_html = f"""
    <div class="container my-5">
        <div class="d-flex justify-content-between align-items-center mb-4">
            <div>
                <h2 class="section-title" style="text-align: left; margin-bottom: 0;">Shopping List</h2>
                <p style="color: var(--cb-medium-gray);">Ingredients for {len(meal_plan)} recipes</p>
            </div>
            <a href="/meal_plan" class="btn btn-primary">Back to Meal Plan</a>
        </div>
        
        <div class="row">
            <div class="col-lg-8">
                <div style="background: white; padding: 2rem; box-shadow: 0 10px 40px rgba(0,0,0,0.08);">
                    <h4 style="margin-bottom: 2rem;">
                        <i class="fas fa-shopping-cart"></i> Shopping List ({len(shopping_items)} items)
                    </h4>
    """
    
    # Add each aggregated ingredient to the list
    for ingredient, details in shopping_items.items():
        shop_html += f"""
            <div style="display: flex; justify-content: space-between; align-items: center; padding: 1rem 0; border-bottom: 1px solid #eee;">
                <div>
                    <strong style="font-size: 1.1rem;">{clean_ingredient_name(ingredient)}</strong><br>
                    <small style="color: var(--cb-medium-gray);">Used in: {', '.join(list(details['recipes'])[:2])}</small>
                </div>
                <div style="text-align: right;">
                    <strong style="color: var(--cb-charcoal);">{details['quantity']:.1f} {details['unit']}</strong>
                </div>
            </div>
        """
    
    shop_html += f"""
                </div>
            </div>
            <div class="col-lg-4">
                <div style="background: var(--cb-warm-gray); padding: 2rem; border-radius: 4px;">
                    <h5 style="margin-bottom: 1.5rem;"><i class="fas fa-info-circle"></i> Shopping Tips</h5>
                    <ul style="color: var(--cb-medium-gray); line-height: 1.8;">
                        <li>Check your pantry for basic ingredients first</li>
                        <li>Buy fresh herbs and vegetables last</li>
                        <li>Consider buying in bulk for frequently used items</li>
                        <li>Check expiration dates, especially for dairy</li>
                    </ul>
                </div>
            </div>
        </div>
    </div>
    """
    
    return render_template_string(BASE_TEMPLATE.replace('{% block content %}{% endblock %}', shop_html))

# Find an available port between 7000-9000 for Flask development server
def find_free_port():
    for port in range(7000, 9000):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(('localhost', port))
                return port
            except OSError:
                continue
    return 8888

# Initialize and launch the Flask application
def run_flask_app():
    
    print("Starting OPTIMIZED Recipe Recommendation Flask App...")
    print("=" * 60)
    print("Supporting 10 Global Cuisines with AI-Powered Recommendations")
    print("   Italian  Indian  Mexican  Chinese  American")
    print("   French   Thai   Japanese German  Indonesian")
    print()
    print("NEW FEATURES:")
    print("   Optimized Shopping Lists - Zero parsing errors!")
    print("   Pre-aggregated ingredient database")
    print("   Clean, accurate ingredient quantities")
    print("   Enhanced performance with direct data lookup")
    print("=" * 60)
    
    if load_flask_models():
        print("All models loaded successfully!")
        
        port = find_free_port()
        print(f"Starting Flask development server on port {port}...")
        print(f"Open your browser to: http://localhost:{port}")
        print("Debug mode enabled for development")
        print("=" * 60)
        
        app.run(debug=True, port=port, host='127.0.0.1')
    else:
        print("Failed to load models. Please check file paths:")
        print(f"   - {os.path.join(BASE_PATH, 'recipe_embeddings.pt')}")
        print(f"   - {os.path.join(BASE_PATH, 'processed_recipes.csv')}")
        print(f"   - {os.path.join(BASE_PATH, 'lda.model')}")
        print(f"   - {os.path.join(BASE_PATH, 'ingredient_shoppinglist.csv')}")
        print("\nEnsure all model files are present in the BASE_PATH directory")

if __name__ == '__main__': 
    run_flask_app()