# **Training Model**

## =====================Part 3 - LDA Topic Modeling =====================


### Preprocess Data & Build LDA Topics

#### Import packages

In [None]:
!pip install gensim
!pip install pyLDAvis
!pip install swifter
!pip install sentence_transformers



In [None]:
import pandas as pd
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gensim.corpora as corpora
from gensim.models.ldamulticore import LdaMulticore

from joblib import Parallel, delayed
import multiprocessing

import warnings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


#### Setup & Downloads

In [None]:
# Setup & Downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

#### Load Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load tagged recipes from the mounted Google Drive.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, which look like row
# indexes written by earlier to_csv calls — confirm and consider dropping them.
file = pd.read_csv("/content/drive/MyDrive/NLP_group_project/tagged_recipes.csv")
file.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,clean_ingredients,clean_directions,clean_text,simhash,cooking_time,cuisine_tag,diet_tag
0,2015528,Marinated Flank Steak Recipe,"[""1 1/2 pound flank steak"", ""1/2 c. finely min...","[""Remove tenderloin from steak."", ""Score meat....",cookeatshare.com/recipes/marinated-flank-steak...,Recipes1M,"[""flank steak"", ""green onions"", ""red wine"", ""s...",pound flank steak c finely minced green onions...,remove tenderloin from steak score meat combin...,pound flank steak c finely minced green onions...,2984177662168074987,6 hr 0 min,American,"Non-Vegetarian, Gluten-Free"
1,1608734,French Chicken Stew,"[""1 tablespoon rosemary"", ""1 teaspoon thyme"", ...","[""combine all ingredients in slow cooker (6 qu...",www.yummly.com/recipe/French-Chicken-Stew-1433580,Gathered,"[""rosemary"", ""thyme"", ""bay leaves"", ""paprika"",...",tablespoon rosemary teaspoon thyme bay leaves ...,combine all ingredients in slow cooker quarts ...,tablespoon rosemary teaspoon thyme bay leaves ...,16962350100183742163,6 hr 0 min,French,"Non-Vegetarian, Gluten-Free"
2,778500,Glazed Carrots,"[""3 to 4 carrots"", ""1 1/2 Tbsp. butter"", ""1/3 ...","[""Cook 3 to 4 carrots; cut crosswise in 1-inch...",www.cookbooks.com/Recipe-Details.aspx?id=1011892,Gathered,"[""carrots"", ""butter"", ""brown sugar"", ""lemon ri...",to carrots tbsp butter c brown sugar grated le...,cook to carrots cut crosswise in inch pieces a...,to carrots tbsp butter c brown sugar grated le...,13631088686507211408,15 min,French,"Vegetarian, Gluten-Free"
3,1334975,Moms Pie Dough,"[""4.5 Cups Flour"", ""1.5 Tsp Salt"", ""Pinch Baki...","[""Mix all dry ingredients in a bowl."", """", ""Ad...",www.epicurious.com/recipes/member/views/moms-p...,Gathered,"[""Flour"", ""Salt"", ""Baking Powder"", ""Sugar"", ""C...",cups flour tsp salt pinch baking powder tbls s...,mix all dry ingredients in a bowl add crisco a...,cups flour tsp salt pinch baking powder tbls s...,9758474554084259533,30 min,American,"Vegetarian, Contains Gluten"
4,116562,Pretzel Salad Or Dessert,"[""2 c. crushed small thin pretzels (sticks)"", ...","[""Mix and press in baking pan, approximately 1...",www.cookbooks.com/Recipe-Details.aspx?id=106723,Gathered,"[""thin pretzels"", ""margarine""]",c crushed small thin pretzels sticks c margarine,mix and press in baking pan approximately x in...,c crushed small thin pretzels sticks c margari...,12570825253960956135,8 min,German,"Vegan, Gluten-Free"


In [None]:
# Make a real copy of the dataframe: plain assignment would only alias `file`,
# so every later column assignment on `df` would also modify `file`.
df = file.copy()

In [None]:
df.columns

Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER', 'clean_ingredients', 'clean_directions', 'clean_text', 'simhash',
       'cooking_time', 'cuisine_tag', 'diet_tag'],
      dtype='object')

In [None]:
# Keep the first 50,000 recipes. .copy() detaches the subset from its parent frame so the
# column assignments in the following cells no longer raise SettingWithCopyWarning.
df = df.head(50000).copy()

#### convert cooking_time to minutes for all recipes

In [None]:
import re

def cooking_time_to_minutes(time_str):
    """Convert a cooking-time string such as "6 hr 0 min" into total minutes.

    Args:
        time_str: Text containing an optional "<n> hr" part and an optional
            "<n> min" part (case-insensitive), or a missing value.

    Returns:
        int: hours * 60 + minutes. Missing values and strings with neither
        part yield 0.
    """
    if pd.isna(time_str):
        return 0

    lowered = time_str.lower()
    total = 0
    # Each unit contributes its first match only, weighted by minutes-per-unit.
    for pattern, minutes_per_unit in ((r'(\d+)\s*hr', 60), (r'(\d+)\s*min', 1)):
        found = re.search(pattern, lowered)
        if found:
            total += int(found.group(1)) * minutes_per_unit
    return total

# Derive the numeric minutes column once, up front, from the raw cooking_time strings
df['cooking_time_mins'] = df['cooking_time'].map(cooking_time_to_minutes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cooking_time_mins'] = df['cooking_time'].apply(cooking_time_to_minutes)


#### Fill missing values and prepare full text field

In [None]:
# Treat missing ingredient/direction text as empty, then join both parts with a single space
ingredients_text = df['clean_ingredients'].fillna('')
directions_text = df['clean_directions'].fillna('')
df['full_text'] = ingredients_text + ' ' + directions_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['full_text'] = df['clean_ingredients'].fillna('') + ' ' + df['clean_directions'].fillna('')


#### Parallel Text Preprocessing / Tokenization

In [None]:
# Extend NLTK's stopwords with domain-specific cooking terms.
# Written as a set literal; the duplicated "cook" and "get" entries were dropped
# (duplicates have no effect inside a set).
custom_stopwords = {
    "add", "cook", "heat", "bake", "boil", "fry", "stir", "mix", "remove", "place","pepper","bring","teaspoon","like","top",
    "pan", "pot", "oil", "water", "oven", "serve", "grill", "preheat", "use", "tsp", "salt","inch","two","cup","tbsp","bottom",
    "set", "let", "make", "prepare", "cut", "minutes", "temperature", "degrees","roll","bowl","one","tablespoon","turn",
    "take","get","hard", "side","put","surface","little","slow","dont", "mixture","medium","together","whole"
}

# Module-level objects shared by preprocess()
stop_words = set(stopwords.words('english')) | custom_stopwords
lemmatizer = WordNetLemmatizer()

In [None]:
# checking number of cores
# num_cores is reused below to size the joblib pool and the LDA worker count (both use num_cores - 1)
num_cores = multiprocessing.cpu_count()
print(f"Using {num_cores} CPU cores for tokenization...")

Using 2 CPU cores for tokenization...


In [None]:
from joblib import Parallel, delayed
from tqdm import tqdm
import re
from nltk.tokenize import word_tokenize

def preprocess(text):
    """Turn raw recipe text into a list of lemmatized tokens.

    Steps: lowercase, strip every character that is not a letter or a space,
    tokenize with NLTK, drop stop words and tokens of two characters or fewer,
    then lemmatize what is left.

    NOTE(review): stop words are filtered *before* lemmatization, so an
    inflected form that is not itself in the stop list can still lemmatize
    to a stop word (the token preview below still contains "cup").
    Changing the order would require retraining the LDA model.

    Args:
        text: Any value; it is converted with str() first.

    Returns:
        list[str]: The surviving lemmas, in document order.
    """
    letters_only = re.sub(r'[^a-zA-Z ]', '', str(text).lower())
    kept = []
    for word in word_tokenize(letters_only):
        if len(word) <= 2 or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept


# Tokenize every recipe in parallel, leaving one core free. max(1, ...) keeps n_jobs valid
# on a single-core machine, where num_cores - 1 would be 0 (joblib rejects n_jobs=0).
# tqdm wraps the input iterable to show progress.
tokens_list = Parallel(n_jobs=max(1, num_cores - 1))(
    delayed(preprocess)(text) for text in tqdm(df['full_text'], desc="Preprocessing")
)

df['tokens'] = tokens_list

Preprocessing: 100%|██████████| 50000/50000 [00:38<00:00, 1305.57it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = tokens_list


In [None]:
df['tokens'].head()

Unnamed: 0,tokens
0,"[pound, flank, steak, finely, minced, green, o..."
1,"[rosemary, thyme, bay, leaf, smoked, paprika, ..."
2,"[carrot, butter, brown, sugar, grated, lemon, ..."
3,"[cup, flour, pinch, baking, powder, tbls, suga..."
4,"[crushed, small, thin, pretzel, stick, margari..."


#### Build Dictionary & Corpus for LDA

In [None]:
from tqdm import tqdm

# Create dictionary of tokens (maps each distinct token to an integer id)
dictionary = corpora.Dictionary(df['tokens'])

# Filter out extreme tokens:
#   no_below=20  -> drop tokens found in fewer than 20 recipes
#   no_above=0.8 -> drop tokens found in more than 80% of recipes
#   keep_n=10000 -> of what remains, keep only the 10,000 most frequent tokens
dictionary.filter_extremes(no_below=20, no_above=0.8, keep_n=10000)

# Create Bag-of-Words corpus: one list of (token_id, count) pairs per recipe
corpus = [dictionary.doc2bow(text) for text in tqdm(df['tokens'], desc="Creating BoW Corpus")]

Creating BoW Corpus: 100%|██████████| 50000/50000 [00:01<00:00, 26489.68it/s]


#### Train LDA Model (Parallel)

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Set number of topics
NUM_TOPICS = 10

# Leave one core free for the main process, but never ask for fewer than one worker:
# on a single-core machine num_cores - 1 is 0, which is not a valid worker count.
lda_workers = max(1, num_cores - 1)
print(f"Training LDA model with {NUM_TOPICS} topics using {lda_workers} workers...")


# Train LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dictionary,
                         num_topics=NUM_TOPICS,
                         workers=lda_workers,
                         passes=5,
                         chunksize=1000,  # default is 2000
                         random_state=42,  # fixed seed so topics are reproducible
                         per_word_topics=True)

Training LDA model with 10 topics using 1 workers...


In [None]:
# Persist the trained model so the recommendation section can load it with
# models.LdaModel.load(...). gensim stores the id2word dictionary next to the model file.
# (This cell previously held stray keystrokes that raised NameError on Run All.)
lda_model.save('/content/drive/MyDrive/NLP_group_project/lda.model')

#### Show Top Words per Topic

In [None]:
# Summarise every topic by its ten highest-weighted words
top_words_per_topic = [
    (
        topic_id,
        ", ".join(term for term, _weight in lda_model.show_topic(topic_id, topn=10)),
    )
    for topic_id in range(NUM_TOPICS)
]

topic_df = pd.DataFrame(top_words_per_topic, columns=["Topic ID", "Top Keywords"])
display(topic_df)

Unnamed: 0,Topic ID,Top Keywords
0,0,"onion, chopped, tomato, green, bean, garlic, g..."
1,1,"cream, chocolate, cake, cool, sugar, milk, but..."
2,2,"olive, garlic, tablespoon, fresh, chopped, lar..."
3,3,"sugar, egg, flour, butter, baking, vanilla, mi..."
4,4,"sauce, tablespoon, soy, pork, fish, garlic, st..."
5,5,"dough, sheet, flour, baking, butter, ball, edg..."
6,6,"cup, sugar, ounce, tablespoon, ice, cream, syr..."
7,7,"juice, lemon, orange, apple, dressing, sugar, ..."
8,8,"chicken, onion, rice, broth, butter, potato, s..."
9,9,"cheese, cream, onion, bread, egg, dish, butter..."


#### Assign Topics to Each Recipe

In [None]:
# Assign dominant topic to each recipe
def get_dominant_topic(bow):
    """Return the id of the most probable topic for one bag-of-words document.

    Args:
        bow: A document as produced by dictionary.doc2bow.

    Returns:
        The topic id with the highest probability among the pairs returned by
        lda_model.get_document_topics, or None when that list is empty.
    """
    topics = lda_model.get_document_topics(bow)
    if not topics:
        return None
    best_topic, _probability = max(topics, key=lambda pair: pair[1])
    return best_topic

df['dominant_topic'] = list(map(get_dominant_topic, corpus))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dominant_topic'] = [get_dominant_topic(doc) for doc in corpus]


#### Add topic name labels

In [None]:
# Add topic name labels for better UX
def get_topic_name(topic_id):
    """Return a short label for a topic: its three highest-weighted words joined by ', '."""
    return ", ".join([word for word, _ in lda_model.show_topic(topic_id, topn=3)])

# Build each label once per distinct topic rather than once per recipe, then map it onto
# the column. Recipes without a dominant topic get a missing label instead of raising.
topic_names = {
    topic_id: get_topic_name(int(topic_id))
    for topic_id in df['dominant_topic'].dropna().unique()
}
df['topic_name'] = df['dominant_topic'].map(topic_names)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['topic_name'] = df['dominant_topic'].apply(get_topic_name)


#### Visualize with pyLDAvis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Enable inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()
# Prepare the interactive topic view from the trained model, the BoW corpus and the dictionary
vis = gensimvis.prepare(lda_model, corpus, dictionary)
# Last expression of the cell, so the visualisation is displayed
vis

### save dataset to drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Save DataFrame (index=False keeps the row index out of the CSV).
# NOTE(review): the 'tokens' column holds Python lists; after reloading, the preview shows
# them as strings such as "['pound', 'flank', ...]", so they are not lists any more.
df.to_csv('/content/drive/MyDrive/NLP_group_project/LDA_recipes.csv', index=False)
print("File saved to Google Drive")

File saved to Google Drive


## ===================== Part 4 - Recommendation System 1 =====================

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load dataset
import pandas as pd

# Reload the recipes saved at the end of Part 3 (includes dominant_topic and topic_name)
df = pd.read_csv("/content/drive/MyDrive/NLP_group_project/LDA_recipes.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,clean_ingredients,clean_directions,clean_text,simhash,cooking_time,cuisine_tag,diet_tag,cooking_time_mins,full_text,tokens,dominant_topic,topic_name
0,2015528,Marinated Flank Steak Recipe,"[""1 1/2 pound flank steak"", ""1/2 c. finely min...","[""Remove tenderloin from steak."", ""Score meat....",cookeatshare.com/recipes/marinated-flank-steak...,Recipes1M,"[""flank steak"", ""green onions"", ""red wine"", ""s...",pound flank steak c finely minced green onions...,remove tenderloin from steak score meat combin...,pound flank steak c finely minced green onions...,2984177662168074987,6 hr 0 min,American,"Non-Vegetarian, Gluten-Free",360,pound flank steak c finely minced green onions...,"['pound', 'flank', 'steak', 'finely', 'minced'...",4,"sauce, tablespoon, soy"
1,1608734,French Chicken Stew,"[""1 tablespoon rosemary"", ""1 teaspoon thyme"", ...","[""combine all ingredients in slow cooker (6 qu...",www.yummly.com/recipe/French-Chicken-Stew-1433580,Gathered,"[""rosemary"", ""thyme"", ""bay leaves"", ""paprika"",...",tablespoon rosemary teaspoon thyme bay leaves ...,combine all ingredients in slow cooker quarts ...,tablespoon rosemary teaspoon thyme bay leaves ...,16962350100183742163,6 hr 0 min,French,"Non-Vegetarian, Gluten-Free",360,tablespoon rosemary teaspoon thyme bay leaves ...,"['rosemary', 'thyme', 'bay', 'leaf', 'smoked',...",8,"chicken, onion, rice"
2,778500,Glazed Carrots,"[""3 to 4 carrots"", ""1 1/2 Tbsp. butter"", ""1/3 ...","[""Cook 3 to 4 carrots; cut crosswise in 1-inch...",www.cookbooks.com/Recipe-Details.aspx?id=1011892,Gathered,"[""carrots"", ""butter"", ""brown sugar"", ""lemon ri...",to carrots tbsp butter c brown sugar grated le...,cook to carrots cut crosswise in inch pieces a...,to carrots tbsp butter c brown sugar grated le...,13631088686507211408,15 min,French,"Vegetarian, Gluten-Free",15,to carrots tbsp butter c brown sugar grated le...,"['carrot', 'butter', 'brown', 'sugar', 'grated...",7,"juice, lemon, orange"
3,1334975,Moms Pie Dough,"[""4.5 Cups Flour"", ""1.5 Tsp Salt"", ""Pinch Baki...","[""Mix all dry ingredients in a bowl."", """", ""Ad...",www.epicurious.com/recipes/member/views/moms-p...,Gathered,"[""Flour"", ""Salt"", ""Baking Powder"", ""Sugar"", ""C...",cups flour tsp salt pinch baking powder tbls s...,mix all dry ingredients in a bowl add crisco a...,cups flour tsp salt pinch baking powder tbls s...,9758474554084259533,30 min,American,"Vegetarian, Contains Gluten",30,cups flour tsp salt pinch baking powder tbls s...,"['cup', 'flour', 'pinch', 'baking', 'powder', ...",3,"sugar, egg, flour"
4,116562,Pretzel Salad Or Dessert,"[""2 c. crushed small thin pretzels (sticks)"", ...","[""Mix and press in baking pan, approximately 1...",www.cookbooks.com/Recipe-Details.aspx?id=106723,Gathered,"[""thin pretzels"", ""margarine""]",c crushed small thin pretzels sticks c margarine,mix and press in baking pan approximately x in...,c crushed small thin pretzels sticks c margari...,12570825253960956135,8 min,German,"Vegan, Gluten-Free",8,c crushed small thin pretzels sticks c margari...,"['crushed', 'small', 'thin', 'pretzel', 'stick...",1,"cream, chocolate, cake"


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import swifter

## Sentence-BERT Embeddings

#### Format recipe to Generate Full Recipe Text (for Embedding)
- Combines key parts of each recipe into a single string used as input for Sentence-BERT.
- Format ensures the embedding captures meaningful semantic information.

In [None]:
# Format recipe to Generate Full Recipe Text (for Embedding)
def format_recipe(row):
    """Build the single text string that is embedded for one recipe.

    Args:
        row: Mapping-like recipe record (DataFrame row or dict) with the keys
            'title', 'ingredients', 'directions', 'cooking_time',
            'cuisine_tag' and 'diet_tag'.

    Returns:
        str: "Recipe: ... Ingredients: ... Directions: ... Cooking time: ...
        Cuisine: ... Diet: ...".
    """
    # cooking_time is already formatted text such as "6 hr 0 min"; appending a literal
    # " minutes" (as before) produced "6 hr 0 min minutes", so it is emitted unchanged.
    return (
        f"Recipe: {row['title']}. "
        f"Ingredients: {row['ingredients']}. "
        f"Directions: {row['directions']}. "
        f"Cooking time: {row['cooking_time']}. "
        f"Cuisine: {row['cuisine_tag']}. "
        f"Diet: {row['diet_tag']}."
    )

# Build the embedding input for every recipe (row-wise apply, run through swifter)
df['recipe_text'] = df.swifter.apply(format_recipe, axis=1)

Pandas Apply:   0%|          | 0/50000 [00:00<?, ?it/s]

#### Encode Recipes & User Query with Sentence-BERT Model
- Transforms each recipe's recipe_text into a dense vector (embedding).
- Converts the user's query into a vector.

In [None]:
# Load the lightweight all-MiniLM-L6-v2 sentence-transformer model.
# To force the GPU explicitly instead: SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import torch
print("GPU available:", torch.cuda.is_available())   # use GPU to execute fast

GPU available: True


In [None]:
# Encode each recipe's text into a vector (embedding) with Sentence-BERT
texts = df['recipe_text'].tolist()

# Encoded in batches of 256 with a progress bar; convert_to_tensor=True returns a tensor
# rather than a list of arrays, which is what torch.save / util.cos_sim are given later.
recipe_embeddings = model.encode(
    texts,
    batch_size=256,
    show_progress_bar=True,
    convert_to_tensor=True
)

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

In [None]:
import torch

# Save recipe embeddings and the matching recipe table; get_model() reads both files back.
# Row i of the CSV corresponds to row i of the embedding tensor, so they must be saved together.
torch.save(recipe_embeddings, '/content/drive/MyDrive/NLP_group_project/recipe_embeddings.pt')
df.to_csv('/content/drive/MyDrive/NLP_group_project/processed_recipes.csv', index=False)

# **Recommendation Function - This is all you need**

## Load models and Data

In [None]:
!pip install gensim
!pip install pyLDAvis
!pip install swifter
!pip install sentence_transformers

import pandas as pd
import re

import gensim.corpora as corpora
from gensim.models.ldamulticore import LdaMulticore

from joblib import Parallel, delayed
import multiprocessing

import warnings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
from sentence_transformers import SentenceTransformer, util
import swifter
from gensim import corpora, models, similarities

from joblib import Parallel, delayed
from tqdm import tqdm
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import multiprocessing


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

from google.colab import drive
drive.mount('/content/drive')

Collecting pyLDAvis
  Using cached pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Using cached funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1
Collecting swifter
  Using cached swifter-1.4.0.tar.gz (1.2 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16505 sha256=6f8c715eb9c294462f6982c0460d285084054637c93194e0d1a3976376d7f747
  Stored in directory: /root/.cache/pip/wheels/ef/7f/bd/9bed48f078f3ee1fa75e0b29b6e0335ce1cb03a38d3443b3a3
Successfull

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Mounted at /content/drive


In [None]:
# Domain-specific cooking terms removed in addition to NLTK's English stop words.
_CUSTOM_STOPWORDS = frozenset([
    "add", "cook", "heat", "bake", "boil", "fry", "stir", "mix", "remove", "place","pepper","bring","teaspoon","like","top",
    "pan", "pot", "oil", "water", "oven", "serve", "grill", "preheat", "use", "tsp", "salt","inch","two","cup","tbsp","bottom",
    "set", "let", "make", "prepare", "cut", "minutes", "cook", "temperature", "degrees","roll","bowl","one","tablespoon","turn",
    "take","get","hard", "side","put","surface","get","little","slow","dont", "mixture","medium","together","whole"
])

# Built once when the cell runs instead of on every preprocess() call: the previous version
# re-read the NLTK stop-word list and created a new lemmatizer for each document.
_STOP_WORDS = frozenset(stopwords.words('english')) | _CUSTOM_STOPWORDS
_LEMMATIZER = WordNetLemmatizer()


def preprocess(text):
    """Turn raw text into a list of lemmatized tokens.

    Steps: lowercase, strip every character that is not a letter or a space,
    tokenize with NLTK, drop stop words and tokens of two characters or fewer,
    then lemmatize what is left. The token output is the same as before; only
    the per-call setup cost was removed.

    Args:
        text: Any value; it is converted with str() first.

    Returns:
        list[str]: The surviving lemmas, in document order.
    """
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    return [
        _LEMMATIZER.lemmatize(w)
        for w in word_tokenize(text)
        if w not in _STOP_WORDS and len(w) > 2
    ]

# Calculate Ingredient Overlap
def ingredient_overlap(row_ingredients, user_ingredients=None):
    """Return the fraction of requested ingredients that occur in a recipe's ingredient text.

    The recipe text (clean_ingredients) is space-delimited free text, so each
    requested ingredient is searched for as a whole word or phrase, with an
    optional plural "s"/"es" ("onion" matches "green onions", "egg" does not
    match "eggplant"). The previous version split the text on ', ', which never
    splits this column, so the score was effectively always 0.

    Args:
        row_ingredients: Ingredient text of one recipe; converted with str().
        user_ingredients: Iterable of ingredient names. When omitted, the
            module-level ``user_input["ingredients"]`` is used, which keeps
            existing one-argument calls working.

    Returns:
        float: Matched ingredients / requested ingredients, in [0, 1];
        0.0 when nothing was requested.
    """
    if user_ingredients is None:
        user_ingredients = user_input["ingredients"]

    wanted = {str(item).strip().lower() for item in user_ingredients}
    wanted.discard('')
    if not wanted:
        return 0.0

    recipe_text = str(row_ingredients).lower()
    found = sum(
        1 for item in wanted
        if re.search(rf"\b{re.escape(item)}(?:es|s)?\b", recipe_text)
    )
    return found / len(wanted)


# Get User Preferences
def get_user_preferences():
    """Interactively ask for the user's preferences.

    Returns:
        dict with keys 'diet_tag', 'cuisine_tag', 'max_cooking_time_minutes'
        (int, 60 when the answer is not a whole number), 'query_text' and
        'ingredients' (list of lower-cased, stripped, non-empty names).
    """
    print("Enter diet preference (e.g., Vegetarian, Non-Vegetarian): ")
    diet = input().strip()

    print("Enter cuisine preference (e.g., Italian, French, Mexican): ")
    cuisine = input().strip()

    print("Enter max cooking time in minutes (e.g., 30): ")
    try:
        max_time = int(input().strip())
    except ValueError:
        # Only a non-numeric answer falls back; a bare except would also have
        # swallowed KeyboardInterrupt and made the prompt impossible to cancel.
        max_time = 60  # fallback

    print("Enter your food idea / cravings (e.g., 'chocolate chip cookie', 'spicy tofu'): ")
    query_text = input().strip()

    print("Enter ingredients you want to include (comma-separated): ")
    ingredients = input().strip().split(',')

    return {
        "diet_tag": diet,
        "cuisine_tag": cuisine,
        "max_cooking_time_minutes": max_time,
        "query_text": query_text,
        "ingredients": [i.strip().lower() for i in ingredients if i.strip()]
    }


def get_model(base_dir='/content/drive/MyDrive/NLP_group_project'):
    """Load everything the recommender needs.

    Args:
        base_dir: Directory holding recipe_embeddings.pt, processed_recipes.csv
            and lda.model. Defaults to the project folder on Google Drive.

    Returns:
        tuple: (sentence-transformer model, recipe embedding tensor,
        recipe DataFrame, LDA model).
    """
    # Determine the device to use
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load model on the determined device
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

    # Load recipe embeddings and map them onto the same device
    recipe_embeddings = torch.load(f"{base_dir}/recipe_embeddings.pt",
                                   map_location=torch.device(device))

    # Load recipe data (row i corresponds to embedding row i)
    df = pd.read_csv(f"{base_dir}/processed_recipes.csv")

    # Load LDA model
    lda_model = models.LdaModel.load(f"{base_dir}/lda.model")

    return model, recipe_embeddings, df, lda_model


def _tag_matches(tag_series, wanted):
    """Return a 0/1 Series: 1 where every comma-separated part of `wanted` is one of the row's tag parts."""
    wanted_parts = {part.strip().lower() for part in str(wanted).split(',') if part.strip()}
    if not wanted_parts:
        return pd.Series(0, index=tag_series.index)

    def row_matches(tag):
        row_parts = {part.strip().lower() for part in str(tag).split(',')}
        return int(wanted_parts <= row_parts)

    return tag_series.fillna('').apply(row_matches)


def reccomendations(user_input, rc_model, df, recipe_embeddings, lda_model, top_n=21):
    """Rank recipes for one set of user preferences.

    Score = 0.45 * query/recipe cosine similarity + 0.2 * ingredient overlap
    + 0.15 * tag match + 0.1 * topic match + 0.1 * cooking-time score, computed
    for recipes whose cooking time is within the user's limit.

    Args:
        user_input: Preferences dict as returned by get_user_preferences().
        rc_model: Sentence-transformer used to embed the query text.
        df: Recipe table aligned row-for-row with `recipe_embeddings`. The
            columns 'ingredient_overlap', 'tag_match' and 'topic_match' are
            (re)written on it, as before.
        recipe_embeddings: Tensor of recipe embeddings.
        lda_model: Trained gensim LDA model; its id2word dictionary maps the
            query tokens to ids.
        top_n: Number of recipes to return (default 21).

    Returns:
        DataFrame: The `top_n` best-scoring recipes, highest score first.

    Raises:
        ValueError: If `df` and `recipe_embeddings` have different lengths.
    """
    # Ensure user_embedding is on the same device as recipe_embeddings
    device = recipe_embeddings.device

    # Encode User Query with Sentence-BERT and move to the same device
    user_embedding = rc_model.encode(user_input["query_text"], convert_to_tensor=True).to(device)

    # Cosine similarity between the user query and each recipe
    cos_sim = util.cos_sim(user_embedding, recipe_embeddings).cpu().numpy().flatten()
    if len(cos_sim) != len(df):
        raise ValueError(
            f"df has {len(df)} rows but recipe_embeddings has {len(cos_sim)}; they must be aligned"
        )

    # Ingredient overlap against the preferences passed in (not a global)
    wanted_ingredients = user_input["ingredients"]
    df['ingredient_overlap'] = df['clean_ingredients'].apply(
        lambda text: ingredient_overlap(text, wanted_ingredients)
    )

    # Tag match (diet & cuisine). Dataset tags are composite ("Vegetarian, Gluten-Free"),
    # so a preference matches when each of its parts is one of the recipe's parts;
    # "Vegetarian" therefore does not match "Non-Vegetarian".
    df['tag_match'] = (
        _tag_matches(df['diet_tag'], user_input['diet_tag']) +
        _tag_matches(df['cuisine_tag'], user_input['cuisine_tag'])
    ) / 2

    # Topic match: use the dictionary stored with the LDA model instead of re-tokenizing all
    # recipes and rebuilding a dictionary on every call. This is much faster and guarantees
    # the token ids are the ones the model was trained with.
    query_tokens = preprocess(user_input['query_text'])
    query_bow = lda_model.id2word.doc2bow(query_tokens)
    query_topic_dist = lda_model.get_document_topics(query_bow) if query_bow else []
    if query_topic_dist:
        query_dominant_topic = max(query_topic_dist, key=lambda x: x[1])[0]
        df['topic_match'] = (df['dominant_topic'] == query_dominant_topic).astype(int)
    else:
        # No query word is in the vocabulary: there is no meaningful topic to reward
        df['topic_match'] = 0

    # Filter by cooking time; the boolean mask is positional, so it also selects the right
    # similarity values whatever index df carries
    max_time = user_input['max_cooking_time_minutes']
    within_time = (df['cooking_time_mins'] <= max_time).to_numpy()
    df_filtered = df[within_time].copy()

    # Normalize cooking time so that faster recipes score higher
    if max_time > 0:
        df_filtered['cooking_time_score'] = (
            1 - (df_filtered['cooking_time_mins'] / max_time)
        ).clip(0, 1)
    else:
        # A zero/negative limit would divide by zero; every remaining recipe ties
        df_filtered['cooking_time_score'] = 1.0

    ## Final Score
    cos_sim_filtered = cos_sim[within_time]
    df_filtered['score'] = (
        0.45 * cos_sim_filtered +
        0.2 * df_filtered['ingredient_overlap'] +
        0.15 * df_filtered['tag_match'] +
        0.1 * df_filtered['topic_match'] +
        0.1 * df_filtered['cooking_time_score']
    )

    # Top recommendations
    return df_filtered.sort_values('score', ascending=False).head(top_n)

In [None]:
# Load the sentence-transformer, recipe embeddings, recipe table and LDA model once per session
rc_model, recipe_embeddings, df,lda_model = get_model()

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Get Recommendations from User Input

In [None]:
# Ask for preferences interactively; the dict is kept in the module-level name `user_input`
user_input = get_user_preferences()

# Score and rank the recipes for those preferences
top_df = reccomendations(user_input, rc_model, df, recipe_embeddings, lda_model)


In [None]:
# Print a header, then one four-line summary per recommended recipe
print("========================\n Our reccomendation :\n")
for _, recipe in top_df.iterrows():
    summary_lines = [
        f"Recipe: {recipe['title']}",
        f"Cuisine: {recipe['cuisine_tag']}",
        f"Diet restrictions: {recipe['diet_tag']}",
        f"Cooking time: {recipe['cooking_time_mins']}\n\n",
    ]
    print("\n".join(summary_lines))


# **Model Evaluation**

## Create Evaluation Dataset

In [None]:
import random
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist
import re
import os


In [None]:
# Diet labels a synthetic evaluation user can ask for. Each label is one string
# combining a diet and a gluten status, the same form as the `diet_tag` value
# visible in the recipe table preview (e.g. "Non-Vegetarian, Gluten-Free").
diet_tags = [
    'Non-Vegetarian, Gluten-Free',
    'Vegetarian, Gluten-Free',
    'Vegetarian, Contains Gluten',
    'Vegan, Gluten-Free',
    'Non-Vegetarian, Contains Gluten',
    'Vegan, Contains Gluten'
]

# Cuisines a synthetic evaluation user can ask for. Every entry is also a key
# of `cuisine_queries`, which is indexed by the sampled cuisine.
cuisine_tags = [
    'American', 'French', 'German', 'Chinese', 'Mexican', 'Italian',
    'Thai', 'Indian', 'Japanese', 'Middle Eastern', 'Indonesian'
]

# Pool of ingredient names from which each synthetic user's "ingredients"
# string is sampled (a random subset, joined with ", "). The category comments
# below are for readers only; the list itself is flat.
sample_ingredients = [
    # Proteins
    "chicken", "beef", "pork", "lamb", "goat", "duck", "turkey", "fish", "salmon", "tuna", "cod",
    "shrimp", "prawn", "crab", "lobster", "scallop", "oyster", "squid", "tofu", "tempeh", "paneer",
    "egg", "egg white", "egg yolk", "bacon", "sausage", "ham", "chorizo",

    # Vegetables
    "onion", "red onion", "green onion", "garlic", "ginger", "tomato", "cherry tomato", "spinach",
    "kale", "arugula", "lettuce", "romaine", "carrot", "celery", "broccoli", "cauliflower",
    "brussels sprout", "zucchini", "eggplant", "bell pepper", "red pepper", "green pepper",
    "yellow pepper", "chili pepper", "jalapeno", "habanero", "potato", "sweet potato", "yam",
    "radish", "beetroot", "mushroom", "shiitake mushroom", "portobello mushroom", "peas", "corn",

    # Fresh Herbs
    "cilantro", "parsley", "basil", "mint", "dill", "chives", "rosemary", "thyme", "oregano",
    "sage", "tarragon", "lemongrass",

    # Spices & Seasonings
    "black pepper", "white pepper", "sea salt", "cumin", "coriander", "paprika", "smoked paprika",
    "turmeric", "chili powder", "curry powder", "garam masala", "five spice", "nutmeg", "cinnamon",
    "clove", "cardamom", "anise", "fennel seed", "fenugreek", "mustard seed",

    # Citrus & Acids
    "lemon", "lime", "orange", "grapefruit", "vinegar", "balsamic vinegar", "apple cider vinegar",
    "rice vinegar", "white vinegar", "tamarind", "pomegranate molasses",

    # Dairy & Alternatives
    "milk", "whole milk", "cream", "heavy cream", "sour cream", "yogurt", "greek yogurt",
    "buttermilk", "cheddar cheese", "mozzarella cheese", "parmesan cheese", "feta cheese",
    "goat cheese", "butter", "ghee",

    # Oils & Fats
    "olive oil", "extra virgin olive oil", "canola oil", "vegetable oil", "coconut oil",
    "sesame oil", "peanut oil", "sunflower oil",

    # Condiments & Sauces
    "soy sauce", "tamari", "fish sauce", "oyster sauce", "worcestershire sauce", "hot sauce",
    "sriracha", "hoisin sauce", "teriyaki sauce", "barbecue sauce", "mustard", "ketchup",
    "mayonnaise", "harissa", "pesto", "chimichurri",

    # Grains, Legumes & Nuts
    "rice", "white rice", "brown rice", "basmati rice", "jasmine rice", "quinoa", "bulgur",
    "couscous", "oats", "barley", "wheat flour", "cornmeal", "tortilla", "pasta", "spaghetti",
    "penne", "macaroni", "lentils", "red lentils", "green lentils", "black beans", "kidney beans",
    "chickpeas", "peanuts", "almonds", "cashews", "walnuts", "pistachios", "sesame seeds",

    # Sweeteners
    "sugar", "brown sugar", "powdered sugar", "honey", "maple syrup", "molasses", "agave syrup",

    # Miscellaneous
    "stock", "chicken stock", "beef stock", "vegetable stock", "broth", "gelatin", "cornstarch",
    "baking powder", "baking soda", "cocoa powder", "vanilla extract", "chocolate chips"
]


# Cuisine-specific cravings: for each cuisine in `cuisine_tags`, the dish names
# from which a synthetic user's free-text "query_text" is drawn. Keying by
# cuisine keeps the sampled query consistent with the sampled cuisine.
cuisine_queries = {
    "American": ["cheeseburger", "fried chicken", "BBQ ribs", "mac and cheese", "apple pie"],
    "French": ["coq au vin", "ratatouille", "beef bourguignon", "quiche lorraine", "crepes"],
    "German": ["bratwurst", "sauerbraten", "pretzel with sausage", "schnitzel", "potato salad"],
    "Chinese": ["kung pao chicken", "sweet and sour pork", "mapo tofu", "chow mein", "dumplings"],
    "Mexican": ["tacos al pastor", "chicken enchiladas", "beef burritos", "chile relleno", "tamales"],
    "Italian": ["spaghetti carbonara", "margherita pizza", "lasagna", "risotto", "fettuccine alfredo"],
    "Thai": ["pad thai", "green curry", "tom yum soup", "massaman curry", "pineapple fried rice"],
    "Indian": ["butter chicken", "paneer tikka", "biryani", "chole bhature", "dal makhani"],
    "Japanese": ["sushi", "ramen", "teriyaki chicken", "okonomiyaki", "gyoza"],
    "Middle Eastern": ["shawarma", "falafel wrap", "kebab platter", "hummus with pita", "baba ganoush"],
    "Indonesian": ["nasi goreng", "satay chicken", "gado gado", "rendang", "soto ayam"]
}




In [None]:
# Generate N random user inputs
N = 10
# Set to an integer to regenerate the exact same batch of synthetic users on
# every run; None keeps the previous behaviour (a fresh random batch each run).
USER_INPUT_SEED = None


def generate_user_inputs(n, seed=None):
    """Build ``n`` synthetic user-preference dictionaries for evaluation.

    Each dictionary has the keys ``diet_tag``, ``cuisine_tag``,
    ``max_cooking_time_minutes``, ``query_text`` and ``ingredients``.
    The query is drawn from the dishes listed for the sampled cuisine, so the
    two always agree; the diet label is drawn independently of both.

    Args:
        n: Number of users to generate.
        seed: Seed for a private random generator. ``None`` seeds it from
            system entropy, i.e. a different batch on every call.

    Returns:
        A list of ``n`` dictionaries.
    """
    # A private generator keeps the batch reproducible without touching the
    # global `random` state used elsewhere.
    rng = random.Random(seed)
    generated = []
    for _ in range(n):
        cuisine = rng.choice(cuisine_tags)
        query = rng.choice(cuisine_queries[cuisine])
        n_ingredients = rng.randint(4, 8)
        ingredients_str = ", ".join(rng.sample(sample_ingredients, n_ingredients))
        generated.append({
            "diet_tag": rng.choice(diet_tags),
            "cuisine_tag": cuisine,
            "max_cooking_time_minutes": rng.randint(10, 120),
            "query_text": query,
            "ingredients": ingredients_str,
        })
    return generated


user_inputs = generate_user_inputs(N, seed=USER_INPUT_SEED)

In [None]:

# CSV that accumulates the evaluation table across notebook runs: one row per
# (synthetic user, recommended recipe).
file_path = "/content/drive/MyDrive/NLP_group_project/main_df.csv"

# Recipe columns copied from each recommendation frame.
recipe_columns = ['title', 'ingredients', 'directions', 'cooking_time',
                  'cuisine_tag', 'diet_tag', 'cooking_time_mins',
                  'dominant_topic', 'tag_match', 'topic_name', 'score']
# Full schema of main_df: the user's request first, then the recipe columns.
expected_columns = ["user_id", "user_cuisine", "user_diet", "user_cook_time",
                    "user_food_pref", "user_ingredients", "recommended_rank",
                    *recipe_columns]

main_df = None
if os.path.exists(file_path):
    try:
        main_df = pd.read_csv(file_path)
        print(f"Loaded existing data from {file_path}")
    except Exception as e:
        # Still best-effort (fall back to an empty table), but report what
        # went wrong instead of printing an empty message.
        print(f"Could not load data: {type(e).__name__}: {e}")
else:
    print(f"Could not load data: {file_path} not found")

if main_df is None:
    main_df = pd.DataFrame(columns=expected_columns)
    print("Created empty DataFrame with expected columns.")

# New users continue after the highest id already stored. Starting at the last
# stored id itself would give the first new user the same id as the previous
# one, and every later groupby('user_id') would merge the two.
next_user_id = 0 if main_df.empty else int(main_df["user_id"].max()) + 1

new_frames = []
for user_input in user_inputs:
    top_df = reccomendations(user_input, rc_model, df, recipe_embeddings, lda_model)
    # .copy(): the column assignments below must write to an independent frame,
    # not to a slice of the recommender's output (SettingWithCopyWarning).
    top_df = top_df[recipe_columns].copy()
    top_df["user_id"] = next_user_id
    top_df["user_cuisine"] = user_input["cuisine_tag"]
    top_df["user_diet"] = user_input["diet_tag"]
    top_df["user_cook_time"] = user_input["max_cooking_time_minutes"]
    top_df["user_food_pref"] = user_input["query_text"]
    top_df["user_ingredients"] = user_input["ingredients"]
    # Rank by score (highest = rank 1); method='first' breaks ties by row order.
    top_df['recommended_rank'] = top_df['score'].rank(method='first',
                                                      ascending=False).astype(int)
    # Reorder to the shared schema so the column order of main_df does not
    # depend on which frame happens to come first in the concat.
    top_df = top_df.sort_values('recommended_rank')[expected_columns]
    new_frames.append(top_df)
    next_user_id += 1

# Concatenate once, after the loop, and leave out empty frames (the empty
# placeholder table or a user with no recommendations).
frames = [frame for frame in [main_df, *new_frames] if not frame.empty]
if frames:
    main_df = pd.concat(frames, axis=0, ignore_index=True)



Could not load data: 
Created empty DataFrame with expected columns.
main_df exists
0



Preprocessing:   0%|          | 0/50000 [00:00<?, ?it/s][A
Preprocessing:   0%|          | 12/50000 [00:00<07:12, 115.48it/s][A
Preprocessing:   0%|          | 24/50000 [00:00<08:27, 98.55it/s] [A
Preprocessing:   0%|          | 45/50000 [00:00<05:59, 139.02it/s][A
Preprocessing:   0%|          | 65/50000 [00:00<05:14, 158.89it/s][A
Preprocessing:   0%|          | 82/50000 [00:00<05:50, 142.28it/s][A
Preprocessing:   0%|          | 97/50000 [00:00<06:44, 123.25it/s][A
Preprocessing:   0%|          | 110/50000 [00:00<07:16, 114.42it/s][A
Preprocessing:   0%|          | 122/50000 [00:01<08:13, 101.10it/s][A
Preprocessing:   0%|          | 154/50000 [00:01<05:29, 151.22it/s][A
Preprocessing:   0%|          | 176/50000 [00:01<04:56, 168.17it/s][A
Preprocessing:   0%|          | 198/50000 [00:01<04:36, 179.83it/s][A
Preprocessing:   0%|          | 218/50000 [00:01<04:40, 177.52it/s][A
Preprocessing:   0%|          | 237/50000 [00:01<05:17, 156.91it/s][A
Preprocessing:   1%|  

## Metrics Calculation

Build a semantic user preference corpus and check how close each recommended recipe is to that corpus. This similarity acts as a proxy relevance score.



In [None]:
class RecipeRelevanceScorer:
    """Proxy relevance scoring for recommended recipes.

    For one user, a reference corpus is built from the recipes that match the
    requested cuisine and diet and mention at least one keyword from the
    request. Each recommended recipe is then scored by the cosine similarity
    between its sentence embedding and the mean embedding of that corpus.
    """

    # Columns tried, in this order, for a recipe's free-text body.
    _BODY_COLUMNS = ('recipe_text', 'clean_text')

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-transformer model named ``model_name``."""
        print("Loading sentence transformer model...")
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _extract_keywords(text):
        """Return the set of lower-cased keywords found in ``text``.

        The text is split on whitespace and commas and non-word characters are
        stripped from every token. Tokens that end up empty (e.g. "&") are
        dropped: an empty keyword is a substring of every text and would make
        the keyword filter match every recipe.
        """
        if not isinstance(text, str) or not text.strip():
            return set()
        stripped = (re.sub(r'\W+', '', token)
                    for token in re.split(r'[\s,]+', text.lower()))
        return {token for token in stripped if token}

    @classmethod
    def _recipe_body(cls, row):
        """Return the first usable body text of ``row`` ('' when none).

        Missing columns, empty strings and null values (None/NaN) are skipped,
        so a NaN ``recipe_text`` falls back to ``clean_text`` instead of being
        embedded as the literal string "nan".
        """
        for column in cls._BODY_COLUMNS:
            value = row.get(column)
            if isinstance(value, str):
                if value:
                    return value
            elif value is not None and pd.notna(value):
                return str(value)
        return ''

    def _keyword_match_filter(self, user_food_pref, user_ingredients, df):
        """Keep the rows of ``df`` that mention any keyword of the request.

        Keywords come from ``user_food_pref`` and ``user_ingredients``. A row
        matches when any keyword is a substring of its lower-cased title,
        ingredients and body text. With no keywords, or an empty ``df``, a
        copy of ``df`` is returned unchanged.

        NOTE(review): matching is by substring, so short or common words
        ("egg", "and", "with") match very broadly.
        """
        keywords = (self._extract_keywords(user_food_pref)
                    | self._extract_keywords(user_ingredients))

        if not keywords or df.empty:
            return df.copy()

        def recipe_matches_keywords(row):
            combined_text = " ".join([
                str(row.get('title', '')),
                str(row.get('ingredients', '')),
                self._recipe_body(row),
            ]).lower()
            return any(kw in combined_text for kw in keywords)

        mask = df.apply(recipe_matches_keywords, axis=1).astype(bool)
        return df[mask].copy()

    def build_user_corpus(self, user_food_pref, user_cuisine, user_diet, user_ingredients, df):
        """Return the reference texts for one user as a list of strings.

        ``df`` is narrowed to rows whose ``cuisine_tag`` / ``diet_tag`` equal
        the requested values (case-insensitive; a null request skips that
        filter) and then to rows matching the request's keywords. Each
        remaining row contributes "<title> <body text>". ``df`` itself is not
        modified. Returns ``[]`` when nothing matches.
        """
        # Boolean indexing already yields new frames, so no upfront copy of
        # the full recipe table is needed.
        filtered = df
        if pd.notna(user_cuisine):
            filtered = filtered[filtered['cuisine_tag'].str.lower() == user_cuisine.lower()]
        if pd.notna(user_diet):
            filtered = filtered[filtered['diet_tag'].str.lower() == user_diet.lower()]

        filtered = self._keyword_match_filter(user_food_pref, user_ingredients, filtered)

        if filtered.empty:
            return []

        return [
            " ".join([str(row.get('title', '')), self._recipe_body(row)]).strip()
            for _, row in filtered.iterrows()
        ]

    def embed_texts(self, texts):
        """Encode ``texts`` with the loaded model and return a NumPy array."""
        return self.model.encode(texts, convert_to_numpy=True)

    def compute_relevance_scores(self, user_main_df, corpus):
        """Add a ``relevance_score`` column to ``user_main_df`` and return it.

        The score is the cosine similarity between each row's embedded text
        (title, ingredients and directions, skipping null values) and the mean
        embedding of ``corpus``. With an empty corpus, or no rows to score,
        the column is set to 0.0 and no embedding is computed.

        ``user_main_df`` is modified in place; pass a copy to keep the
        original untouched.
        """
        if len(corpus) == 0 or user_main_df.empty:
            user_main_df['relevance_score'] = 0.0
            return user_main_df

        corpus_embeddings = self.embed_texts(corpus)
        # keepdims keeps a (1, dim) shape, which cdist needs for its 2-D input.
        corpus_mean_embedding = np.mean(corpus_embeddings, axis=0, keepdims=True)

        def combine_recipe_text(row):
            parts = []
            for col in ['title', 'ingredients', 'directions']:
                val = row.get(col)
                if pd.notna(val):
                    parts.append(str(val))
            return " ".join(parts).strip()

        rec_texts = user_main_df.apply(combine_recipe_text, axis=1).tolist()
        rec_embeddings = self.embed_texts(rec_texts)

        # cdist returns cosine distance; similarity = 1 - distance.
        distances = cdist(rec_embeddings, corpus_mean_embedding, metric='cosine').flatten()
        user_main_df['relevance_score'] = 1 - distances
        return user_main_df


def evaluator(df, main_df):
    """Attach a proxy ``relevance_score`` to every recommendation in ``main_df``.

    Rows are processed per ``user_id``. The user's request (food preference,
    cuisine, diet, ingredients) is read from the first row of the group, a
    reference corpus is built from ``df`` for that request, and every
    recommended recipe of the user is scored against it.

    Args:
        df: Recipe table the reference corpus is drawn from.
        main_df: Recommendation table with a ``user_id`` column and the
            ``user_*`` request columns.

    Returns:
        A new DataFrame with the rows of the scored groups, a fresh 0..n-1
        index and a ``relevance_score`` column. When there is nothing to
        score (no rows, or no non-null ``user_id``), a copy of ``main_df``
        with a float ``relevance_score`` column holding no scores is returned
        and the embedding model is not loaded.
    """
    # Without this guard pd.concat([]) would raise, after the model had
    # already been loaded for nothing.
    if main_df.empty or not main_df['user_id'].notna().any():
        unscored = main_df.assign(relevance_score=pd.Series(dtype=float))
        return unscored.reset_index(drop=True)

    scorer = RecipeRelevanceScorer()

    results = []
    for _, user_group in main_df.groupby('user_id'):
        # The request columns are constant within a user, so the first row
        # is representative.
        user_row = user_group.iloc[0]
        user_food_pref = user_row.get('user_food_pref', '')
        user_cuisine = user_row.get('user_cuisine', None)
        user_diet = user_row.get('user_diet', None)
        user_ingredients = user_row.get('user_ingredients', '')

        corpus = scorer.build_user_corpus(user_food_pref, user_cuisine, user_diet, user_ingredients, df)
        # Score a copy: compute_relevance_scores writes into the frame it is given.
        scored_user_df = scorer.compute_relevance_scores(user_group.copy(), corpus)

        results.append(scored_user_df)

    combined_df = pd.concat(results).reset_index(drop=True)
    print("Relevance scores computed for all users.")
    return combined_df


def evaluate_ranking_metrics(main_df, k=5, include_users_without_relevant=False):
    """
    Compute Precision@K, Recall@K, MRR, and MAP for recommendations in main_df.

    main_df must have columns: ['user_id', 'recommended_rank', 'is_relevant'].
    recommended_rank should start at 1 for top recommendation. An item counts
    as relevant when its 'is_relevant' value equals 1/True; missing values
    count as not relevant.

    Recall and AP are relative to the relevant items inside the user's own
    recommendation list: main_df carries no information about relevant
    recipes that were never recommended.

    Args:
        main_df: Recommendation table as described above.
        k: Cut-off for Precision@K and Recall@K; must be positive.
        include_users_without_relevant: When False (default), users with no
            relevant recommendation are left out of every average, which
            makes the averages optimistic. When True, such users contribute
            0 to all four metrics.

    Returns dict with metrics averaged over the users taken into account
    (each metric is 0 when no user is taken into account).

    Raises:
        ValueError: If k is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    precision_at_k_list = []
    recall_at_k_list = []
    reciprocal_ranks = []
    average_precisions = []

    for _, group in main_df.groupby('user_id'):
        ranked = group.sort_values('recommended_rank')
        # .eq(1) gives a clean boolean array: True/1 -> True, NaN/0 -> False.
        relevant = ranked['is_relevant'].eq(1).to_numpy()
        hit_positions = np.flatnonzero(relevant)  # zero-based ranks of the hits
        total_relevant = hit_positions.size

        if total_relevant == 0:
            if include_users_without_relevant:
                precision_at_k_list.append(0.0)
                recall_at_k_list.append(0.0)
                reciprocal_ranks.append(0.0)
                average_precisions.append(0.0)
            continue

        # Precision@K divides by k even when the list is shorter than k.
        hits_in_top_k = int(relevant[:k].sum())
        precision_at_k_list.append(hits_in_top_k / k)
        recall_at_k_list.append(hits_in_top_k / total_relevant)

        # MRR: reciprocal rank of the first relevant item.
        reciprocal_ranks.append(1.0 / (hit_positions[0] + 1))

        # AP: mean of precision@rank taken at the rank of every hit; the
        # j-th hit (1-based) at rank r contributes j / r.
        precision_at_hits = np.arange(1, total_relevant + 1) / (hit_positions + 1)
        average_precisions.append(precision_at_hits.sum() / total_relevant)

    results = {
        f'Precision@{k}': np.mean(precision_at_k_list) if precision_at_k_list else 0,
        f'Recall@{k}': np.mean(recall_at_k_list) if recall_at_k_list else 0,
        'MRR': np.mean(reciprocal_ranks) if reciprocal_ranks else 0,
        'MAP': np.mean(average_precisions) if average_precisions else 0,
    }
    return results


In [None]:
# Score every stored recommendation, label the relevant ones and report the
# ranking metrics.
RELEVANCE_THRESHOLD = 0.6  # a relevance_score strictly above this counts as relevant
TOP_K = 10                 # cut-off for Precision@K / Recall@K

main_df = evaluator(df, main_df)
main_df["is_relevant"] = main_df["relevance_score"].gt(RELEVANCE_THRESHOLD)
metrics = evaluate_ranking_metrics(main_df, k=TOP_K)

print("=========================================")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {np.round(metric_value, 3)}")

Loading sentence transformer model...
Relevance scores computed for all users.
Precision@10: 0.9
Recall@10: 0.447
MRR: 0.867
MAP: 0.882


In [None]:
# Columns written to disk. The scoring columns added during evaluation
# (relevance_score, is_relevant) are not in this list, so they are dropped
# before saving.
persisted_columns = [
    'user_id', 'user_cuisine', 'user_diet',
    'user_cook_time', 'user_food_pref', 'user_ingredients',
    'recommended_rank', 'title', 'ingredients', 'directions', 'cooking_time',
    'cuisine_tag', 'diet_tag', 'cooking_time_mins', 'dominant_topic',
    'tag_match', 'topic_name', 'score',
]
main_df = main_df.loc[:, persisted_columns]

main_df.to_csv("/content/drive/MyDrive/NLP_group_project/main_df.csv", index=False)