In [1]:
# ! pip install nltk
# ! pip install sent2vec
# ! pip install update pydantic
# ! pip install update spacy
# ! pip install sentence-transformers==2.5.0

### Importing libraries

In [2]:
import re
import nltk
import json
import warnings
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sent2vec.vectorizer import Vectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

  from pandas.core import (


In [3]:
warnings.filterwarnings("ignore")
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ambujupadhyay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
#List of common kitchen words to remove
#keep it updated
common_kitchen_words = [
    'pound', 'gallon', 'teaspoon', 'tablespoon', 'cup','ounce', 'oz', 'gram', 'kg', 'kilogram', 
    'liter', 'milliliter', 'ml', 'pinch', 'slice', 'bunch', 'piece', 'pack', 'jar', 'can', 'bottle', 
    'quart', 'pint', 'bag', 'box', 'tsp', 'tbsp', 'g', 'l', 'diced', 'chopped', 'sliced', 'crushed', 
    'minced', 'peeled', 'grated', 'ground', 'water']

# Data Cleaning and Text Normalization
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # Tokenization
    tokens = text.split()
    # Remove stopwords
    nltk.download('stopwords', quiet=True)
    stop_words = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization
    nltk.download('wordnet', quiet=True)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove common kitchen words
    tokens = [token for token in tokens if token not in common_kitchen_words]
    # Join tokens back to string
    text = ' '.join(tokens)
    return text


def extract_cooking_time(text):
    # Text to Lower
    text = text.lower()
    
    # First, try to capture combined patterns like "1 hour and 30 minutes" or "1 and a half hours"
    combined_pattern = re.compile(r'(\d+)\s*hour(?:s)?\s*and\s*(\d+)\s*minute(?:s)?', re.IGNORECASE)
    half_hour_pattern = re.compile(r'(\d+)\s*and\s*a\s*half\s*hours?', re.IGNORECASE)
    
    # Capture ranges and specific times
    range_pattern = re.compile(r'(\d+)(?:\s*to\s*(\d+))?\s*(minutes?|hours?|mins?|hrs?)', re.IGNORECASE)
    
    total_minutes = 0
    
    # Check for combined patterns
    combined_matches = combined_pattern.findall(text)
    for hours, minutes in combined_matches:
        total_minutes += int(hours) * 60 + int(minutes)

    half_hour_matches = half_hour_pattern.findall(text)
    for hours in half_hour_matches:
        total_minutes += int(hours[0]) * 60 + 30  # Adding 30 minutes for the "half" part

    if total_minutes > 0:
        return total_minutes

    # Fallback to range pattern if no combined pattern matched
    matches = range_pattern.findall(text)
    for start, end, unit in matches:
        start = int(start)
        end = int(end) if end else start
        if 'hour' in unit:
            start, end = start * 60, end * 60
        if start == end:
            total_minutes += start
        else:
            # Average the range
            total_minutes += (start + end) / 2

    return round(total_minutes) if total_minutes > 0 else np.nan

def get_combined_text(title, ingredients, instructions):
    return 'Title : {} \n Ingredients : {} \n Instructions : {}'.format(title, ingredients, instructions)

### Reading data

In [5]:
f_ar = open('recipes_raw/recipes_raw_nosource_ar.json')
f_epi = open('recipes_raw/recipes_raw_nosource_epi.json')
f_fn = open('recipes_raw/recipes_raw_nosource_fn.json')

In [6]:
recipes_ar = json.load(f_ar)
recipes_epi = json.load(f_epi)
recipes_fn = json.load(f_fn)

In [7]:
# Convert Dictionary to DataFrame
recipes_ar_df = pd.DataFrame.from_dict(recipes_ar, orient='index').reset_index()
recipes_epi_df = pd.DataFrame.from_dict(recipes_epi, orient='index').reset_index()
recipes_fn_df = pd.DataFrame.from_dict(recipes_fn, orient='index').reset_index()

# Concatenate Recipes
recipes =  pd.concat([recipes_ar_df,recipes_epi_df, recipes_fn_df])
recipes = recipes.reset_index()
recipes = recipes.drop(columns=['picture_link', 'level_0'])

In [8]:
recipes.head(5)

Unnamed: 0,index,title,ingredients,instructions
0,rmK12Uau.ntP510KeImX506H6Mr6jTu,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,5ZpZE8hSVdPk2ZXo1mZTyoPWJRSCPSm,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,clyYQv.CplpwJtjNaFGhx0VilNYqRxu,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,BmqFAmCrDHiKNwX.IQzb0U/v0mLlxFu,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,N.jCksRjB4MFwbgPFQU8Kg.yF.XCtOi,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...


### Data cleaning

In [9]:
#Removing the word 'ADVERTISEMENT' from the dataset
recipes['ingredients'] = recipes['ingredients'].apply(lambda x: [i.replace(' ADVERTISEMENT','') for i in x])
recipes['ingredients'] = recipes['ingredients'].apply(lambda x: [i for i in x if i not in ['ADVERTISEMENT']])

In [10]:
# Removing null values
recipes.dropna(inplace=True)
recipes.isnull().sum()

index           0
title           0
ingredients     0
instructions    0
dtype: int64

In [11]:
# Create String of Ingredients
recipes['ingredients_str'] = recipes['ingredients'].apply(lambda x: '; '.join(x))

# Clean Ingredients and Instructions
recipes['cleaned_ingredients'] = recipes['ingredients_str'].apply(clean_text)
recipes['cleaned_instructions'] = recipes['instructions'].apply(clean_text)
recipes['cleaned_title'] = recipes['title'].apply(clean_text)

In [23]:
recipes.head(3)

Unnamed: 0,index,title,ingredients,instructions,ingredients_str,cleaned_ingredients,cleaned_instructions,cleaned_title,ingredients_instructions,title_ingredients,cooking_time_mins,combined_text
0,rmK12Uau.ntP510KeImX506H6Mr6jTu,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves, 2...","Place the chicken, butter, soup, and onion in ...","4 skinless, boneless chicken breast halves; 2 ...",skinless boneless chicken breast half butter c...,place chicken butter soup onion slow cooker fi...,slow cooker chicken dumpling,skinless boneless chicken breast half butter c...,skinless soup cooker chicken dumpling package ...,360.0,Title : Slow Cooker Chicken and Dumplings \n I...
1,5ZpZE8hSVdPk2ZXo1mZTyoPWJRSCPSm,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",2 (10.75 ounce) cans condensed cream of mushro...,condensed cream mushroom soup package dry onio...,slow cooker mix cream mushroom soup dry onion ...,awesome slow cooker pot roast,condensed cream mushroom soup package dry onio...,awesome condensed soup cooker pot slow package...,720.0,Title : Awesome Slow Cooker Pot Roast \n Ingre...
2,clyYQv.CplpwJtjNaFGhx0VilNYqRxu,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar, 1/2 cup ketchup, ...",Preheat oven to 350 degrees F (175 degrees C)....,1/2 cup packed brown sugar; 1/2 cup ketchup; 1...,packed brown sugar ketchup lean beef milk egg ...,preheat oven degree f degree c lightly grease ...,brown sugar meatloaf,packed brown sugar ketchup lean beef milk egg ...,meatloaf brown salt pepper milk ginger packed ...,60.0,Title : Brown Sugar Meatloaf \n Ingredients : ...


In [12]:
# Combining ingredients and instructions for a comprehensive feature set
recipes['ingredients_instructions'] = recipes['cleaned_ingredients'] + " " + recipes['cleaned_instructions']
recipes['title_ingredients'] = recipes['cleaned_title'] + " " + recipes['cleaned_ingredients']
recipes['title_ingredients'] = recipes['title_ingredients'].apply(lambda x: ' '.join(list(set(x.split(' ')))))

In [13]:
# Extract Cooking Time
recipes['cooking_time_mins'] = recipes['instructions'].apply(lambda x: extract_cooking_time(x))

In [14]:
# Get Combined Text for Sentence Embeddings
recipes['combined_text'] = recipes[['title', 'ingredients_str', 'instructions']].apply(lambda x: get_combined_text(*x), axis=1)

In [36]:
recipes['combined_text'][2]

'Title : Brown Sugar Meatloaf \n Ingredients : 1/2 cup packed brown sugar; 1/2 cup ketchup; 1 1/2 pounds lean ground beef; 3/4 cup milk; 2 eggs; 1 1/2 teaspoons salt; 1/4 teaspoon ground black pepper; 1 small onion, chopped; 1/4 teaspoon ground ginger; 3/4 cup finely crushed saltine cracker crumbs \n Instructions : Preheat oven to 350 degrees F (175 degrees C). Lightly grease a 5x9 inch loaf pan.\nPress the brown sugar in the bottom of the prepared loaf pan and spread the ketchup over the sugar.\nIn a mixing bowl, mix thoroughly all remaining ingredients and shape into a loaf. Place on top of the ketchup.\nBake in preheated oven for 1 hour or until juices are clear.\n'

### Vectorization

In [15]:
# TF-IDF Vectorization of Title + Ingredients
tfidf_vectorizer_combined = TfidfVectorizer()
tfidf_matrix_combined = tfidf_vectorizer_combined.fit_transform(recipes['title_ingredients'])

# TF-IDF Vectorization of Title
tfidf_matrix_title = tfidf_vectorizer_combined.transform(recipes['cleaned_title'])

# TF-IDF Vectorization of Ingredients
tfidf_matrix_ingredients = tfidf_vectorizer_combined.transform(recipes['cleaned_ingredients'])

In [16]:
# Create a list of the combined text for all recipes
cooking_instructions = recipes['combined_text'].tolist()

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [17]:
# Generate embeddings
instruction_embeddings = model.encode(cooking_instructions, batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/1945 [00:00<?, ?it/s]

### Recommender function

In [88]:
# Function to recommend recipes based on user input
def recommend_recipes(query):
    print(query)
    
    # Extract Time
    query_cooking_time = extract_cooking_time(query)
    
    # Process the query the same way as your dataset features
    query_processed = clean_text(query)
    print(query_processed)
    
    # Transform the query to match the dataset's feature space
    query_tfidf = tfidf_vectorizer_combined.transform([query_processed])
    query_embedding = model.encode([query])
    
    # Calculate cosine similarity between query and dataset
    cosine_sim_combined = cosine_similarity(query_tfidf, tfidf_matrix_combined).flatten()
    cosine_sim_title = cosine_similarity(query_tfidf, tfidf_matrix_title).flatten()
    cosine_sim_ingredients = cosine_similarity(query_tfidf, tfidf_matrix_ingredients).flatten()
    cosine_sim_instructions = cosine_similarity(query_embedding, instruction_embeddings).flatten()
    
    # Save Scores in DataFrame
    recipes['cos_sim_combined'] = cosine_sim_combined
    recipes['cos_sim_title'] = cosine_sim_title
    recipes['cos_sim_ingredients'] = cosine_sim_ingredients
    recipes['cos_sim_instructions'] = cosine_sim_instructions
    recipes['query_cooking_time'] = query_cooking_time
    recipes['aggregate_score'] = 0.25 * recipes['cos_sim_title'] + 0.35 * recipes[
    'cos_sim_ingredients'] + 0.4 * recipes['cos_sim_instructions']
    
    filter_cols = ['index','title',
    'ingredients', 
    'instructions', 
    'cos_sim_title',
    'cos_sim_ingredients',
    'cos_sim_instructions',
    'aggregate_score',
    'query_cooking_time',
    'cooking_time_mins']
    
    return recipes[filter_cols]

In [339]:
# Test Queries
test_queries_df = pd.read_csv('test_queries_v1.csv')

In [379]:
# Example User Query
user_query = test_queries_df['User_Queries'].values[5]
recommendations = recommend_recipes(user_query)

# Sort Recommendations by Aggregate Scores and Time
recommendations = recommendations.sort_values(by = ['aggregate_score', 
                                                    'cooking_time_mins'], ascending=[False, True])

# Sort Rrecommendations by Aggregate Scores and Time after applying time filter
recommendations_time = recommendations[
    recommendations['cooking_time_mins']<=recommendations['query_cooking_time']].sort_values(
    by = ['aggregate_score', 'cooking_time_mins'], 
    ascending=[False, True])

sweet made of coconut and bananas and ready in 30 minutes
sweet made coconut banana ready minute


In [380]:
recommendations.head(5)

Unnamed: 0,index,title,ingredients,instructions,cos_sim_title,cos_sim_ingredients,cos_sim_instructions,aggregate_score,query_cooking_time,cooking_time_mins
11533,9sqp0u5GYXpP9W4DKtuRd9U3t2VXO..,Banana Nut Coconut Cake,"[Banana Nut Coconut Cake:, 1 1/2 cups white su...",Preheat oven to 350 degrees F (175 degrees C)....,0.33759,0.305847,0.607319,0.434371,30,45.0
84079,IHBeJ2hTe7ukVGw9jQi9pKOg9FlUPPe,"Sweet Potato Toast with Almond Butter, Banana ...",[2 medium sweet potatoes (about 1 pound total)...,Preheat the oven to 450 degrees F.\nSlice off ...,0.326365,0.325233,0.584873,0.429372,30,15.0
89139,sqfFCr5lc3M64ZJnaUAs0luqacdzCZi,Banana Coconut Cake,"[7 ounces unsalted butter, 1 1/2 cups granulat...",Cake:\nPreheat the oven to 350 degrees. Butter...,0.400227,0.222916,0.616075,0.424507,30,30.0
115296,ZQRMw5WR5OjxsruRxWYgFd0y2xFWUgK,Banana and Coconut Shake,"[1 cup crushed ice, 3 frozen ripe bananas, 1/2...",Add the ice and the frozen bananas to a blende...,0.310649,0.257151,0.633458,0.421049,30,
114381,oko9oA6G6u9CzuprUGGN4O5ivB7bfaa,Coconut Banana Pudding,"[5 ripe bananas, 1/2 cup plus 2 tablespoons ag...",Special equipment: four 10-ounce ramekins\nIn ...,0.369313,0.233918,0.605007,0.416202,30,130.0


In [None]:
recommendations

In [375]:
recommendations['instructions'][0]

'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.\n'

In [354]:
recommendations_time.head(5)

Unnamed: 0,index,title,ingredients,instructions,cos_sim_title,cos_sim_ingredients,cos_sim_instructions,aggregate_score,query_cooking_time,cooking_time_mins


### Model Evaluation

In [340]:
def calculate_precision_recall(recommended_recipes, relevant_recipes):
    # Convert lists to sets for easier manipulation
    recommended_recipes_set = set(test_queries_df['predicted_index'][1:31])
    relevant_recipes_set = set(test_queries_df['index'][1:31])
    
    # Calculate the number of relevant items that are recommended
    true_positives = len(recommended_recipes_set.intersection(relevant_recipes_set))
    
    # Calculate precision and recall
    precision = true_positives / len(recommended_recipes_set) if recommended_recipes_set else 0
    recall = true_positives / len(relevant_recipes_set) if relevant_recipes_set else 0
    
    return precision, recall

In [341]:
recommended_recipes = ['Recipe A', 'Recipe B', 'Recipe C']  # Recipes recommended by your system
relevant_recipes = ['Recipe A', 'Recipe D', 'Recipe E']  # Recipes that are known to be relevant
precision, recall = calculate_precision_recall(recommended_recipes, relevant_recipes)
print(f"Precision: {precision}\nRecall: {recall}")

Precision: 0.7333333333333333
Recall: 0.7333333333333333
