In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import json
import faiss
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
data=pd.read_csv('data/recipes.csv')
features_drop = ['recipe_url', 'input_db_index', 'food_kg_locator', 'food_com_unique_id', 'submit_date', 'last_changed_date', 'author_id', 'rating', 'recipe_id', 'serves', 'units' ]
data.drop(features_drop, axis = 1, inplace = True)

In [None]:
data["directions"].iloc[1].strip('{'""'}').replace('","', ' ')

In [None]:
def get_ingredients_list(ingredients:str) -> list: 
    """Clean the ingredients string to get a list. Each element of a list is a specific ingredient with the quantity

    Args:
        ingredients (str): string of ingredients

    Returns:
        list: list of ingredients (each elem in the lsit is an ingredient.)
    """a
    sample_list = ingredients.split(')')
    clean_sample_list = ''
    pattern = r'[^A-Za-z0-9/ -]'
    for elem in sample_list : 
        cleaned_elem = re.sub(pattern, ' ', elem)
        cleaned_elem = re.sub(r'\s+', ' ', cleaned_elem).strip(' ')
        if len(cleaned_elem) > 1: 
            clean_sample_list=clean_sample_list+" "+cleaned_elem

    return clean_sample_list

def get_directions_list(directions:str): 
    """Clean the directions string to get a list of directions

    Args:
        directions (str): badly formated directions string

    Returns:
        list: contains the different directions to follow for a given recipe. 
    """
    return directions.strip('{'""'}').replace('","', ' ')

data['new_ingredients'] = ''
data['new_directions'] = ''
data.new_ingredients = data.ingredients.apply(lambda x : get_ingredients_list(x))
data.new_directions = data.directions.apply(lambda x : get_directions_list(x))
data.drop(['ingredients', 'directions'], axis = 1, inplace = True)
data.rename(columns = {'new_ingredients': 'ingredients', 'new_directions' : 'directions'}, inplace = True)

In [None]:
data["text"] =data["title"] +". Ingredients:" + data["ingredients"] + ". Instructions:" +data["directions"]
documents = data["text"].tolist()


In [2]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_model.encode(documents[:1000], show_progress_bar=True)


In [43]:
len(documents)

507335

In [None]:
### 1️⃣ Load Data and Create FAISS Index ###
with open("data/documents.pkl", "rb") as f:
    documents = pickle.load(f)

with open("data/embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)


def normalize_embeddings(embeddings):
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

embeddings = normalize_embeddings(embeddings)

def index_documents(method="faiss", index_name="recipes", es_host="http://localhost:9200"):
    if method == "faiss":
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)
        faiss.write_index(index, "data/recipe_faiss.index")
        print("FAISS index saved.")
        return index
    elif method == "elasticsearch":
        es = Elasticsearch(es_host)
        mapping = {"mappings": {"properties": {"text": {"type": "text"}, "vector": {"type": "dense_vector", "dims": embeddings.shape[1]}}}}
        es.indices.create(index=index_name, body=mapping, ignore=400)
        for i, (text, vector) in enumerate(zip(documents, embeddings)):
            es.index(index=index_name, id=i, body={"text": text, "vector": vector.tolist()})
        print("Elasticsearch index created.")
        return es

# Choose indexing method

### 2️⃣ Retrieval Function ###
faiss_index = faiss.read_index("data/recipe_faiss.index")

# Load the same embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_documents(query, k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_embedding = normalize_embeddings(query_embedding.reshape(1, -1))
    scores, indices = faiss_index.search(query_embedding, k)
    return [documents[i] for i in indices[0]], scores

### 3️⃣ Query RAG Pipeline ###
def query_rag(query, retrieved_docs, k):
    if k:
        retrieved_text = "\n".join(retrieved_docs)

        prompt = f"Using only the following list of recipes, answer the question about a recipe. \nList of recipes:{retrieved_text} \n If you can not find a recipe from the documents provided, then just answer -I do not have this recipe. Do not skip the details in the instruction.\n Question: {query}. Answer:"
    else:
        prompt= f'You are a foodchat bot who gives recipes. Given the query provide a recipe. Query: {query}. Answer:'
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    data = {"model": "llama3.3", "prompt": prompt}
    
    response = requests.post(url, json=data, stream=True)
    
    full_response = ""
    
    for line in response.iter_lines():
        if line:
            try:
                json_data = json.loads(line.decode("utf-8"))
                full_response += json_data.get("response", "")
                
                # If done, exit early
                if json_data.get("done", False):
                    break
            except json.JSONDecodeError as e:
                print("JSON Decode Error:", e)
                continue

    if k:
        return full_response
    else:
        return full_response

FAISS index saved.


In [111]:
query="Give a recipe with beef or lamb which takes minimal effort"

In [None]:
resp, docs, scores=query_rag(query, k=10)

In [116]:
for a, b in zip(scores[0], docs):
    print(f"{a}\t{b[:200]}")

0.6258053183555603	Lamb (Or Beef) Entree. Ingredients: 1 lb lamb stew meat or beef 3 roma tomatoes 4 ounces portabella mushrooms 14 ounces quartered artichokes 2 garlic cloves chopped 2 tablespoons olive oil 6 ounces fe
0.6486079692840576	Easy Meat Patties. Ingredients: 1 lb ground lamb 1 small onion finely chopped 3 green chilies finely chopped 1 teaspoon gingerroot grated 6 -7 garlic cloves crushed 1/4 cup coriander leaves chopped 1/
0.6753540635108948	Old-Fashioned Irish Stew. Ingredients: 3 lbs lamb necks slices fat trimmed and reserved 1-inch thick 4 medium onions thinly sliced 1 medium onion chopped 1 lb medium carrot peeled halved crosswise and
0.6849710941314697	Lamb Stew. Ingredients: 2 lbs lamb shoulder cut into cubes 2 teaspoons flour 3 teaspoons shortening salt and pepper 1 cup chopped onion hot water 3 carrots cut into 1 inch chunks 6 potatoes cut into 1
0.6867395639419556	Lamb Stew. Ingredients: 6 medium peeled potatoes cut in chunks 4 carrots peeled and cut in chunks 2 l

In [117]:
print(resp)

Based on the provided recipes, I found one that requires minimal effort and uses lamb. Here is the recipe:

**Lamb Burgers**

Ingredients:
- 1/4 cup minced fresh cilantro
- 3 tablespoons crumbled feta cheese
- 2 teaspoons minced red onions
- 1/4 teaspoon salt
- 1/4 teaspoon ground coriander
- 1/4 teaspoon ground red pepper
- 1/4 teaspoon black pepper
- 1 lb lean ground lamb

Instructions:
1. Prepare broiler OR grill pan.
2. Combine all 8 ingredients.
3. Divide lamb mixture into 4 equal portions, shaping each into a 3/4-inch thick patty.
4. Place patties on broiler or grillpan coated with cooking spray.
5. Cook 4 minutes on each side or until done.
6. Serve it on a bun with your favorites (tomato, lettuce).

This recipe requires minimal effort as it involves only mixing the ingredients, shaping the patties, and cooking them. It's a simple and quick recipe that can be prepared in about 20-30 minutes.


In [None]:
import itertools
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity


def compute_shapley_values(retrieved_docs):
    """
    Compute exact Shapley values for retrieved documents based on cosine similarity.
    """
    n = len(retrieved_docs)
    shapley_values = np.zeros(n)
    
    # Generate full response
    full_response = query_rag(query, k=n)
    full_embedding = normalize_embeddings(model.encode(full_response).reshape(1, -1))
    

    # Iterate over all subsets
    for subset in itertools.chain.from_iterable(itertools.combinations(range(n), r) for r in range(n)):
        if not subset:
            continue
        
        subset_docs = [retrieved_docs[i] for i in subset]
        subset_response = query_rag("\n".join(subset_docs))
        subset_embedding = model.encode(subset_response).reshape(1, -1)
        
        for i in subset:
            subset_minus_i = [retrieved_docs[j] for j in subset if j != i]
            if not subset_minus_i:
                continue
            subset_minus_i_response = query_rag("\n".join(subset_minus_i))
            subset_minus_i_embedding = model.encode(subset_minus_i_response).reshape(1, -1)
            
            # Compute cosine similarities
            cos_full_subset = cosine_similarity(full_embedding, subset_embedding)[0, 0]
            cos_full_subset_minus_i = cosine_similarity(full_embedding, subset_minus_i_embedding)[0, 0]
            
            # Marginal contribution
            marginal_contrib = cos_full_subset - cos_full_subset_minus_i
            shapley_values[i] += marginal_contrib / len(subset)
    
    # Normalize by number of permutations
    shapley_values /= n
    
    return shapley_values

In [129]:
shv=compute_shapley_values(docs)

IndexError: index 1 is out of bounds for axis 0 with size 1