In [2]:
import pandas as pd
import numpy as np
import re
import pickle
import json
import faiss
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer
import itertools
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
data=pd.read_csv('data/recipes.csv')
features_drop = ['recipe_url', 'input_db_index', 'food_kg_locator', 'food_com_unique_id', 'submit_date', 'last_changed_date', 'author_id', 'rating', 'recipe_id', 'serves', 'units' ]
data.drop(features_drop, axis = 1, inplace = True)
data["directions"].iloc[1].strip('{'""'}').replace('","', ' ')

In [None]:
def get_ingredients_list(ingredients:str) -> list: 
    """Clean the ingredients string to get a list. Each element of a list is a specific ingredient with the quantity

    Args:
        ingredients (str): string of ingredients

    Returns:
        list: list of ingredients (each elem in the lsit is an ingredient.)
    """
    sample_list = ingredients.split(')')
    clean_sample_list = ''
    pattern = r'[^A-Za-z0-9/ -]'
    for elem in sample_list : 
        cleaned_elem = re.sub(pattern, ' ', elem)
        cleaned_elem = re.sub(r'\s+', ' ', cleaned_elem).strip(' ')
        if len(cleaned_elem) > 1: 
            clean_sample_list=clean_sample_list+" "+cleaned_elem

    return clean_sample_list

def get_directions_list(directions:str): 
    """Clean the directions string to get a list of directions

    Args:
        directions (str): badly formated directions string

    Returns:
        list: contains the different directions to follow for a given recipe. 
    """
    return directions.strip('{'""'}').replace('","', ' ')

data['new_ingredients'] = ''
data['new_directions'] = ''
data.new_ingredients = data.ingredients.apply(lambda x : get_ingredients_list(x))
data.new_directions = data.directions.apply(lambda x : get_directions_list(x))
data.drop(['ingredients', 'directions'], axis = 1, inplace = True)
data.rename(columns = {'new_ingredients': 'ingredients', 'new_directions' : 'directions'}, inplace = True)
data["text"] =data["title"] +". Ingredients:" + data["ingredients"] + ". Instructions:" +data["directions"]
documents = data["text"].tolist()

In [None]:
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_gen_embs(documents[:1000], show_progress_bar=True)
# embeddings = embed.text(documents, model="nomic-embed-text-v1")

In [14]:
### 1️⃣ Load Data and Create FAISS Index ###
with open("data/documents.pkl", "rb") as f:
    documents = pickle.load(f)

with open("data/embeddings_nomic.pkl", "rb") as f:
    embeddings = pickle.load(f)

def gen_embs(qtext, model="nomic"):
    if model=="nomic":
    
        data = {
            "model": "nomic-embed-text",
            "prompt": qtext
        }
        return np.array(requests.post('http://localhost:11434/api/embeddings', json=data).json()['embedding'])
    else:
        return SentenceTransformer("all-MiniLM-L6-v2").encode([qtext], convert_to_numpy=True)
    


def normalize_embeddings(embeddings):
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

embeddings = normalize_embeddings(embeddings)

def index_documents(method="faiss", index_name="recipes_nomic", es_host="http://localhost:9200"):
    if method == "faiss":
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)
        faiss.write_index(index, "data/recipe_nomic_faiss.index")
        print("FAISS index saved.")
        return index
    elif method == "elasticsearch":
        es = Elasticsearch(es_host)
        mapping = {"mappings": {"properties": {"text": {"type": "text"}, "vector": {"type": "dense_vector", "dims": embeddings.shape[1]}}}}
        es.indices.create(index=index_name, body=mapping, ignore=400)
        for i, (text, vector) in enumerate(zip(documents, embeddings)):
            es.index(index=index_name, id=i, body={"text": text, "vector": vector.tolist()})
        print("Elasticsearch index created.")
        return es

# Choose indexing method
# index_documents(method="faiss", index_name="recipes_nomic", es_host="http://localhost:9200")
# faiss_index = faiss.read_index("data/recipe_faiss.index")
faiss_index = faiss.read_index("data/recipe_nomic_faiss.index")

# Load the same embedding model
# model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_documents(query, k=5):
    query_embedding = gen_embs(query)
    query_embedding = normalize_embeddings(query_embedding.reshape(1, -1))
    scores, indices = faiss_index.search(query_embedding, k)
    return [documents[i] for i in indices[0]], scores

### 3️⃣ Query RAG Pipeline ###
def query_rag(query, retrieved_docs=None):
    """Query the Ollama API with a prompt based on provided documents."""
    if retrieved_docs:
        retrieved_text = "\n".join(retrieved_docs)
        prompt = f"Using only the following list of recipes, answer the question about a recipe. \nList of recipes:{retrieved_text} \n If you can not find a recipe from the documents provided, then just answer -I do not have this recipe. Do not skip the details in the instruction.\n Question: {query}. Answer:"
    else:
        prompt= f'You are a foodchat bot who gives recipes. Given the query provide a recipe. Query: {query}. Answer:'
    HEADERS = {"Content-Type": "application/json"}
    url = "http://localhost:11434/api/generate"
    data = {"model": "llama2", "prompt": prompt, 'stream': False}

    response = requests.post(url, headers=HEADERS, json=data)

    return response.json()['response']

In [9]:
res=query_rag(query='Give me the recipe for pizza')

In [10]:
res

"Here's a simple and delicious recipe for a classic margherita pizza:\n\n**Ingredients:**\n\nFor the dough:\n- 2 cups of warm water\n- 1 tablespoon sugar\n- 2 teaspoons active dry yeast\n- 3 1/2 cups all-purpose flour\n- 1 teaspoon salt\n- 2 tablespoons olive oil\n\nFor the sauce:\n- 1 can (28 oz) crushed tomatoes\n- 4 cloves garlic, minced\n- 1 teaspoon dried oregano\n- 1 teaspoon dried basil\n- Salt and pepper to taste\n- 2 tablespoons olive oil\n\nFor the topping:\n- 8 ounces fresh mozzarella cheese, sliced\n- Fresh basil leaves\n\n**Instructions:**\n\n1. **Make the Dough:** In a large mixing bowl, combine warm water, sugar, and yeast. Let it sit for about 5 minutes until the mixture becomes frothy. Add flour, salt, and olive oil to the bowl. Mix everything together until a dough forms. Knead the dough on a floured surface for about 10 minutes until it becomes smooth and elastic. Place the dough in a greased bowl, cover it with plastic wrap, and let it rise in a warm place for about

In [4]:
query="Give me a recipe for fastest pizza preparation?"
docs, scores=retrieve_documents(query=query, k=5)

In [5]:
for a, b in zip(scores[0], docs):
    print(f"{a}\t{b[:230]}\n")

0.4916580319404602	Fast and Easy Pita Pizza in Less Than 10 Minutes!. Ingredients: 1 piece pita bread white or whole wheat store-bought jar your favorite sauce spaghetti or pizza shredded mozzarella cheese part-skim or whole milk your favorite Itali

0.4942934513092041	Super Fast and Easy Pizza Dough/ Crust. Ingredients: 3 -3 1/2 cups flour divided 1 1/4 ounce package fast rising yeast about 2 1/4 tsp 3/4 teaspoon salt 1 cup very warm water 2 tablespoons olive oil 2 2 teaspoons template2 optiona

0.5022497773170471	THE Easiest Pizza Crust. Ingredients: 1 tablespoon yeast 1 1/4 cups all-purpose flour 1/2 teaspoon salt 1/2 teaspoon garlic powder 1 teaspoon italian seasoning 2/3 cup water 110 degrees F 1/2 teaspoon sugar 1 tablespoon olive oil.

0.5047016739845276	quick 'n' easy pizza. Ingredients: 1 cup all-purpose flour 1/2 cup milk 2 tablespoons vegetable oil 1 teaspoon baking powder 1/2 teaspoon salt toppings pizza sauce toppings shredded mozzarella cheese toppings chopped onion toppi

In [20]:
resp=query_rag(query=query, retrieved_docs=docs)

In [21]:
print(resp)

Based on the recipes provided, the fastest pizza preparation would be the "Super Fast and Easy Pizza Dough/Crust" recipe. This recipe only requires 5 minutes of kneading time and can be prepared in under 10 minutes, including rising time. The ingredients are also basic and easily found in most kitchens.

Here is the recipe:

Ingredients:

* 3-3 1/2 cups flour
* 1 1/4 ounce package fast rising yeast
* 3/4 teaspoon salt
* 1 cup very warm water
* 2 tablespoons olive oil
* 2-2 1/2 teaspoons template (optional)
* 1 minced garlic clove (optional)

Instructions:

1. In a large bowl, combine 2 cups flour, yeast, and salt. Add any other spices or herbs you like.
2. Stir in the very warm water and oil. Mix well.
3. Add enough of the remaining flour to make a soft dough. Knead on a lightly floured surface until smooth and elastic, about 5 minutes.
4. Cover and let rise about 10 minutes. Lightly oil one 14" or two 12" round pizza pans.
5. Sprinkle with cornmeal. Form dough into smooth ball(s).
6. 

In [19]:
import random
random.shuffle(docs)

In [22]:
docs

['Fast and Easy Pita Pizza in Less Than 10 Minutes!. Ingredients: 1 piece pita bread white or whole wheat store-bought jar your favorite sauce spaghetti or pizza shredded mozzarella cheese part-skim or whole milk your favorite Italian spices garlic powder oregano basil parsley etc. Instructions:"ok ready?  here\'s the \\"hard\\" part.\r Pre-heat oven to 350 degrees.\r Split open the pita bread to make two flat round halves.\r Spread spaghetti/pizza sauce with a spoon.  Drop the sauce with the spoon and use the flat back-end of the spoon to spread.\r Spread mozzarella cheese on top.\r Dash of your favorite spices.\r Cook 5-7 minutes, depending on how toasty you want the bread.\r",Enjoy!',
 'Speed-Cooking Spinach & Feta Pizza for One. Ingredients: 8 inches pizza crusts 3 tablespoons tomato sauce 1 1/2 cups fresh spinach leaves 1/4 cup reduced-fat mozzarella cheese shredded 2 tablespoons feta cheese shredded or crumbled 1 teaspoon oregano. Instructions:"Thaw out, prebake, etc. the pizza c

In [6]:
import numpy as np
from itertools import combinations
import math

def F(subset, full_set_embedding):
    """
    Cost function: Cosine similarity between the LLM's response for the subset and the full set.
    """
    if not subset:
        return 0.0  # Empty subset has no contribution
    
    # Query the LLM with the subset
    response = query_rag(query, subset)
    
    # Generate and normalize embeddings for the subset's response
    subset_embedding = normalize_embeddings(gen_embs(response).reshape(1, -1))
    
    # Compute cosine similarity with the full set's embedding
    return cosine_similarity(subset_embedding, full_set_embedding)

def shapley_values(S):
    """
    Compute Shapley values for a set of textual queries S.
    """
    S = list(S)
    n = len(S)
    
    # Query the LLM with the full set to get the reference embedding
    full_set_response = query_rag(query, S)
    full_set_embedding = normalize_embeddings(gen_embs(full_set_response).reshape(1, -1))
    
    # Precompute the cost for all subsets
    F_cache = {}
    for bitmask in tqdm(range(0, 1 << n), desc="Calculating cosine to full response"):
        subset = [S[i] for i in range(n) if (bitmask & (1 << i))]
        if len(subset)==n:
            F_cache[bitmask] = 1
        else:
            F_cache[bitmask] = F(subset, full_set_embedding)
    
    # Initialize Shapley values
    shap = {element: 0.0 for element in S}
    
    # Calculate contributions for each subset
    for bitmask in tqdm(range(0, 1 << n), desc="Calculating shap"):
        subset_size = bin(bitmask).count('1')
        if subset_size == 0:
            continue  # Skip empty subsets
        
        for i in range(n):
            if not (bitmask & (1 << i)):
                continue  # Skip subsets without the current element
            
            # Compute subset without the current element
            subset_without_i = bitmask ^ (1 << i)
            
            # Compute Shapley weight
            k = bin(subset_without_i).count('1')
            weight = (math.factorial(k) * math.factorial(n - k - 1)) / math.factorial(n)
            
            # Compute marginal contribution
            marginal = F_cache[bitmask] - F_cache[subset_without_i]
            shap[S[i]] += marginal * weight
    
    return shap

In [23]:
shap=shapley_values(docs)

Calculating cosine to full response: 100%|██████████| 32/32 [01:39<00:00,  3.10s/it]
Calculating shap: 100%|██████████| 32/32 [00:00<00:00, 48877.54it/s]


In [25]:
for element, value in shap.items():
    print(f"Contribution of '{element[:25]}': {value}")

Contribution of 'Fast and Easy Pita Pizza ': [[0.17313253]]
Contribution of 'Speed-Cooking Spinach & F': [[0.19128451]]
Contribution of 'THE Easiest Pizza Crust. ': [[0.1926467]]
Contribution of 'quick 'n' easy pizza. Ing': [[0.20487761]]
Contribution of 'Super Fast and Easy Pizza': [[0.23805865]]
