In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import json
import faiss
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer
import itertools
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
data=pd.read_csv('data/recipes.csv')
features_drop = ['recipe_url', 'input_db_index', 'food_kg_locator', 'food_com_unique_id', 'submit_date', 'last_changed_date', 'author_id', 'rating', 'recipe_id', 'serves', 'units' ]
data.drop(features_drop, axis = 1, inplace = True)

In [None]:
data["directions"].iloc[1].strip('{'""'}').replace('","', ' ')

In [None]:
def get_ingredients_list(ingredients:str) -> list: 
    """Clean the ingredients string to get a list. Each element of a list is a specific ingredient with the quantity

    Args:
        ingredients (str): string of ingredients

    Returns:
        list: list of ingredients (each elem in the lsit is an ingredient.)
    """
    sample_list = ingredients.split(')')
    clean_sample_list = ''
    pattern = r'[^A-Za-z0-9/ -]'
    for elem in sample_list : 
        cleaned_elem = re.sub(pattern, ' ', elem)
        cleaned_elem = re.sub(r'\s+', ' ', cleaned_elem).strip(' ')
        if len(cleaned_elem) > 1: 
            clean_sample_list=clean_sample_list+" "+cleaned_elem

    return clean_sample_list

def get_directions_list(directions:str): 
    """Clean the directions string to get a list of directions

    Args:
        directions (str): badly formated directions string

    Returns:
        list: contains the different directions to follow for a given recipe. 
    """
    return directions.strip('{'""'}').replace('","', ' ')

data['new_ingredients'] = ''
data['new_directions'] = ''
data.new_ingredients = data.ingredients.apply(lambda x : get_ingredients_list(x))
data.new_directions = data.directions.apply(lambda x : get_directions_list(x))
data.drop(['ingredients', 'directions'], axis = 1, inplace = True)
data.rename(columns = {'new_ingredients': 'ingredients', 'new_directions' : 'directions'}, inplace = True)

In [None]:
data["text"] =data["title"] +". Ingredients:" + data["ingredients"] + ". Instructions:" +data["directions"]
documents = data["text"].tolist()


In [2]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = embedding_model.encode(documents[:1000], show_progress_bar=True)


In [None]:
len(documents)

In [3]:
### 1️⃣ Load Data and Create FAISS Index ###
with open("data/documents.pkl", "rb") as f:
    documents = pickle.load(f)

with open("data/embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)


def normalize_embeddings(embeddings):
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

embeddings = normalize_embeddings(embeddings)

def index_documents(method="faiss", index_name="recipes", es_host="http://localhost:9200"):
    if method == "faiss":
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)
        faiss.write_index(index, "data/recipe_faiss.index")
        print("FAISS index saved.")
        return index
    elif method == "elasticsearch":
        es = Elasticsearch(es_host)
        mapping = {"mappings": {"properties": {"text": {"type": "text"}, "vector": {"type": "dense_vector", "dims": embeddings.shape[1]}}}}
        es.indices.create(index=index_name, body=mapping, ignore=400)
        for i, (text, vector) in enumerate(zip(documents, embeddings)):
            es.index(index=index_name, id=i, body={"text": text, "vector": vector.tolist()})
        print("Elasticsearch index created.")
        return es

# Choose indexing method

### 2️⃣ Retrieval Function ###
faiss_index = faiss.read_index("data/recipe_faiss.index")

# Load the same embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_documents(query, k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_embedding = normalize_embeddings(query_embedding.reshape(1, -1))
    scores, indices = faiss_index.search(query_embedding, k)
    return [documents[i] for i in indices[0]], scores

### 3️⃣ Query RAG Pipeline ###
def query_rag(query, retrieved_docs=None):
    if retrieved_docs:
        retrieved_text = "\n".join(retrieved_docs)

        prompt = f"Using only the following list of recipes, answer the question about a recipe. \nList of recipes:{retrieved_text} \n If you can not find a recipe from the documents provided, then just answer -I do not have this recipe. Do not skip the details in the instruction.\n Question: {query}. Answer:"
    else:
        prompt= f'You are a foodchat bot who gives recipes. Given the query provide a recipe. Query: {query}. Answer:'
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    data = {"model": "llama3.3", "prompt": prompt}
    
    response = requests.post(url, json=data, stream=True)
    
    full_response = ""
    
    for line in response.iter_lines():
        if line:
            try:
                json_data = json.loads(line.decode("utf-8"))
                full_response += json_data.get("response", "")
                
                # If done, exit early
                if json_data.get("done", False):
                    break
            except json.JSONDecodeError as e:
                print("JSON Decode Error:", e)
                continue

    return full_response

In [57]:
query="I have some chicken and vegetables, what can i cook which is not that hard?"
docs, scores=retrieve_documents(query, k=5)

In [58]:
for a, b in zip(scores[0], docs):
    print(f"{a}\t{b[:200]}")

0.5773535966873169	Veggie-Chicken Toss With Lemon Herb Sauce. Ingredients: 2 chicken breasts salt and pepper 1 teaspoon dried rosemary 1 teaspoon dried basil 1/3 cup chopped onion 1 chopped garlic clove 1 red potatoes 3
0.5913329720497131	Super Easy Chicken. Ingredients: 4 pieces chicken drumsticks actually any 6 piece of chicken works 2 onions sliced 4 potatoes cut into 4 2 red peppers sliced 2 green peppers sliced 2 tablespoons olive
0.6129103302955627	Chicken With Chickpeas and Olives. Ingredients: 1 roasting chicken divided into 8 pieces salt and pepper oil for frying 2 green peppers cut into small cubes 2 small onions chopped 1 small hot pepper c
0.6155734062194824	Golden Chicken & Autumn Vegetables. Ingredients: 4 boneless skinless chicken breasts 1 15 ounce can Swanson chicken broth 1 tablespoon chopped fresh parsley I used dried 1/2 teaspoon garlic powder 1/2
0.619018018245697	Chicken and Vegetable Stir -Fry. Ingredients: 2 tablespoons sesame oil 1 lb boneless skinless chicken 

In [59]:
resp=query_rag(query, retrieved_docs=docs)

In [60]:
print(resp)

You can try cooking "Super Easy Chicken". This recipe seems to be quite simple and requires minimal effort. All you need to do is place the chicken and your desired vegetables (except tomatoes) in a shallow saucepan, add some olive oil, salt, and pepper, and cook on low heat until everything is cooked through. The instructions are straightforward and don't require any complicated techniques or ingredient preparations. Give it a try!


In [None]:

def compute_shapley_values(retrieved_docs):
    """
    Compute exact Shapley values for retrieved documents based on cosine similarity.
    """
    n = len(retrieved_docs)
    shapley_values = np.zeros(n)
    
    # Generate full response
    full_response = query_rag(query, retrieved_docs=retrieved_docs)
    full_embedding = normalize_embeddings(model.encode(full_response).reshape(1, -1))
    

    # Iterate over all subsets
    for subset in itertools.chain.from_iterable(itertools.combinations(range(n), r) for r in range(n)):
        if not subset:
            continue
        
        subset_docs = [retrieved_docs[i] for i in subset]
        subset_response = query_rag(subset_docs)
        subset_embedding = model.encode(subset_response).reshape(1, -1)
        
        for i in subset:
            subset_minus_i = [retrieved_docs[j] for j in subset if j != i]
            if not subset_minus_i:
                continue
            subset_minus_i_response = query_rag(subset_minus_i)
            subset_minus_i_embedding = model.encode(subset_minus_i_response).reshape(1, -1)
            
            # Compute cosine similarities
            cos_full_subset = cosine_similarity(full_embedding, subset_embedding)[0, 0]
            cos_full_subset_minus_i = cosine_similarity(full_embedding, subset_minus_i_embedding)[0, 0]
            
            # Marginal contribution
            marginal_contrib = cos_full_subset - cos_full_subset_minus_i
            shapley_values[i] += marginal_contrib
    
    
    return shapley_values/pow(2, (n-1))

In [61]:
from concurrent.futures import ThreadPoolExecutor

def compute_shapley_values_parallel(retrieved_docs):
    """
    Compute exact Shapley values for retrieved documents using cosine similarity.
    Parallelized using ThreadPoolExecutor to optimize API requests to Ollama.
    """
    n = len(retrieved_docs)
    shapley_values = np.zeros(n)

    # Generate full response
    full_response = query_rag(query, retrieved_docs=retrieved_docs)
    full_embedding = normalize_embeddings(model.encode(full_response).reshape(1, -1))

    # Function to process a subset
    def process_subset(subset):
        if not subset:
            return []

        subset_docs = [retrieved_docs[i] for i in subset]
        subset_response = query_rag(query, retrieved_docs=subset_docs)
        subset_embedding = model.encode(subset_response).reshape(1, -1)

        results = []
        for i in subset:
            subset_minus_i = [retrieved_docs[j] for j in subset if j != i]
            if not subset_minus_i:
                subset_minus_i_response = query_rag(query)
            else:
                subset_minus_i_response = query_rag(query, retrieved_docs=subset_minus_i)
            subset_minus_i_embedding = model.encode(subset_minus_i_response).reshape(1, -1)

            # Compute cosine similarities
            cos_full_subset = cosine_similarity(full_embedding, subset_embedding)[0, 0]
            cos_full_subset_minus_i = cosine_similarity(full_embedding, subset_minus_i_embedding)[0, 0]

            # Marginal contribution
            marginal_contrib = cos_full_subset - cos_full_subset_minus_i
            results.append((i, marginal_contrib))

        return results

    # Use ThreadPoolExecutor to process subsets in parallel
    with ThreadPoolExecutor(max_workers=4) as executor:
        subsets = itertools.chain.from_iterable(itertools.combinations(range(n), r) for r in range(n+1))
        futures = [executor.submit(process_subset, subset) for subset in subsets]

        # Collect results
        for future in futures:
            for i, contrib in future.result():
                shapley_values[i] += contrib

    return shapley_values / pow(2, (n-1))


In [62]:
shv=compute_shapley_values_parallel(docs)

In [64]:
shv

array([-0.0018289 ,  0.23371889, -0.00851012,  0.0081025 , -0.04131911])

In [34]:
shv=compute_shapley_values(docs)

In [37]:
shv

array([0.07867983, 0.00879009, 0.00241612])