# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import functools
import traceback
import pdb
import pprint
import time

from dotenv import load_dotenv, find_dotenv
import pandas as pd

import langchain
from langchain.llms import OpenAI
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.schema import Document
from langchain.chains.query_constructor.base import AttributeInfo
from langchain import PromptTemplate

# Data

In [3]:
!echo $PWD

/home/divinekage/Codes/LLM/RAGcipe


In [4]:
# Read the recipe table from a path relative to the current working directory.
# Later cells read its "title", "cuisine", "Carbs", "Proteins" and "ingredients" columns.
recipes_csv = 'data/recipes.csv'
recipes_df = pd.read_csv(recipes_csv)

# Setup

In [5]:
# Load embedding model
# The "hkunlp/instructor-base" model is requested on the CPU. The resulting
# module-level `embedding` object is read as a global by document_to_vectordb
# and ingredients_to_recipes.
embedding = HuggingFaceInstructEmbeddings(model_name = "hkunlp/instructor-base",
                                          model_kwargs = {'device': 'cpu'}
                                         )

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer


Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


max_seq_length  512


In [6]:
def data_prep(recipes_df):
    """Derive the two document collections used for retrieval.

    Args:
        recipes_df: DataFrame providing the "title", "ingredients", "cuisine",
            "Carbs" and "Proteins" columns.

    Returns:
        A ``(titles_documents, ingredients_documents)`` tuple. The first list
        uses each recipe title as page content and carries the title, cuisine,
        Carbs and Proteins columns as metadata; the second uses the ingredients
        text as page content and carries only the title as metadata.
    """
    title_metadata = ["title", "cuisine", "Carbs", "Proteins"]
    ingredient_metadata = ["title"]

    by_title = df_to_documents(
        recipes_df,
        page_content_column="title",
        metadata_lst=title_metadata,
    )
    by_ingredients = df_to_documents(
        recipes_df,
        page_content_column="ingredients",
        metadata_lst=ingredient_metadata,
    )
    return by_title, by_ingredients

def df_to_documents(recipes_df, page_content_column, metadata_lst):
    """Turn every row of a DataFrame into a ``Document``.

    Args:
        recipes_df: Source DataFrame; one document is produced per row.
        page_content_column: Name of the column whose value becomes the
            document's ``page_content``.
        metadata_lst: Column names copied into the document's ``metadata``
            dict, keyed by column name.

    Returns:
        A list of ``Document`` objects in row order.
    """
    return [
        Document(
            page_content=record[page_content_column],
            metadata={column: record[column] for column in metadata_lst},
        )
        for _, record in recipes_df.iterrows()
    ]

In [7]:
# Build both document lists once from the loaded recipe table.
titles_documents, ingredients_documents = data_prep(recipes_df)

In [8]:
def create_vector_db(titles_documents, ingredients_documents):
    """Index both document collections and return the stores by name.

    Args:
        titles_documents: Documents whose page content is the recipe title.
        ingredients_documents: Documents whose page content is the ingredients text.

    Returns:
        A dict with the keys ``"titles_db"`` and ``"ingredients_db"``, each
        mapping to the vector store built from the corresponding documents.
    """
    return {
        "titles_db": document_to_vectordb(titles_documents),
        "ingredients_db": document_to_vectordb(ingredients_documents),
    }

def document_to_vectordb(split_documents):
    """Build a Chroma vector store from a list of documents.

    The documents are embedded with the notebook-level ``embedding`` object.

    Args:
        split_documents: Documents to index.

    Returns:
        The store returned by ``Chroma.from_documents``.
    """
    return Chroma.from_documents(documents=split_documents, embedding=embedding)

In [9]:
# NOTE(review): no later cell visible in this notebook reads `vector_dbs`; the
# retrieval demo builds its own BM25/FAISS retrievers. Confirm this (embedding-heavy)
# step is still needed.
vector_dbs = create_vector_db(titles_documents, ingredients_documents)

In [10]:
def get_title(document, recipes=None):
    """Return the title of the recipe whose ingredients text equals the document's content.

    Args:
        document: Object exposing a ``page_content`` attribute holding an
            ingredients string exactly as stored in the "ingredients" column.
        recipes: Optional DataFrame with "title" and "ingredients" columns.
            Defaults to the notebook-level ``recipes_df``.

    Returns:
        The title of the matching row. When several rows share the same
        ingredients text, the last one in row order is returned.

    Raises:
        LookupError: If no row has that ingredients text.
    """
    if recipes is None:
        recipes = recipes_df
    matching_titles = recipes.loc[recipes['ingredients'] == document.page_content, 'title']
    if matching_titles.empty:
        # Previously this surfaced as an UnboundLocalError on the return statement.
        raise LookupError("no recipe found with the given ingredients text")
    return matching_titles.iloc[-1]

In [11]:
def _format_ranked_titles(document_lst, display_titles):
    """Render the first `display_titles` documents as a numbered, newline-separated title list."""
    # Clamp at zero so a non-positive limit yields an empty listing.
    shown = document_lst[:max(display_titles, 0)]
    return "\n".join(
        f"{rank}. {get_title(document)}"
        for rank, document in enumerate(shown, 1)
    )


def ingredients_to_recipes(ingredients_string, display_titles=2):
    """Retrieve recipe titles for an ingredients query with three retrievers.

    The ingredients column of the notebook-level ``recipes_df`` is indexed with
    a BM25 retriever, a FAISS retriever (embedded with the notebook-level
    ``embedding``, ``k`` = 5) and an equally weighted ensemble of the two. Each
    retriever's top titles are printed under its own heading.

    NOTE(review): all indexes are rebuilt on every call, including embedding the
    whole ingredients column for FAISS.

    Args:
        ingredients_string: Free-text query listing ingredients.
        display_titles: Maximum number of titles listed per retriever. The
            retrievers themselves may return fewer documents than this.

    Returns:
        A 4-tuple ``(bm25_titles, semantic_titles, ensemble_titles, status)``
        where the first three are numbered title listings and ``status`` is the
        string ``"recipes successfully retrieved"``.
    """
    ingredient_texts = recipes_df['ingredients'].tolist()

    bm25_retriever = BM25Retriever.from_texts(ingredient_texts)
    faiss_vectorstore = FAISS.from_texts(ingredient_texts, embedding)
    faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 5})
    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
                                           weights=[0.5, 0.5])

    # Heading printed above each listing, paired with the retriever that produces it.
    searches = (
        ("--- titles (bm25) ---", bm25_retriever),
        ("\n--- titles (semantic) ---", faiss_retriever),
        ("\n--- titles (ensemble) ---", ensemble_retriever),
    )

    recipe_strings = []
    for heading, retriever in searches:
        document_lst = retriever.get_relevant_documents(ingredients_string)
        recipe_string = _format_ranked_titles(document_lst, display_titles)
        print(heading)
        print(recipe_string)
        recipe_strings.append(recipe_string)

    return (*recipe_strings, "recipes successfully retrieved")

# Ensemble retrieval (BM25 + Semantic search)

In [24]:
# Baseline query naming the specific pasta shape ("penne"); list the top 3 titles per retriever.
ingredients_list = """
penne
prawns
salt
"""
_ = ingredients_to_recipes(ingredients_list, 3)

--- titles (bm25) ---
1. Prawn Pesto Penne
2. Sour Cream & Chives Pasta
3. Delifrance Inspired Lobster Filling

--- titles (semantic) ---
1. Prawn Pesto Spaghetti
2. Prawn Pesto Penne
3. Delifrance Inspired Lobster Filling

--- titles (ensemble) ---
1. Prawn Pesto Penne
2. Prawn Pesto Spaghetti
3. Delifrance Inspired Lobster Filling


In [25]:
# Same query with "penne" replaced by the generic term "pasta", to compare how
# each retriever's ranking changes.
ingredients_list = """
pasta
prawns
salt
"""
_ = ingredients_to_recipes(ingredients_list, 3)

--- titles (bm25) ---
1. Delifrance Inspired Lobster Filling
2. Prawn Pesto Spaghetti
3. Prawn Pesto Penne

--- titles (semantic) ---
1. Prawn Pesto Spaghetti
2. Prawn Pesto Penne
3. Delifrance Inspired Lobster Filling

--- titles (ensemble) ---
1. Prawn Pesto Spaghetti
2. Delifrance Inspired Lobster Filling
3. Prawn Pesto Penne


## Recipe ingredients

In [14]:
# Select by title and take the first match positionally, so the cell does not
# depend on the row's index label.
print(recipes_df.loc[recipes_df['title'] == 'Prawn Pesto Penne', 'ingredients'].iloc[0])

12-14 prawns
pepper
herbs
chilli powder
1 clove garlic (minced)
olive oil
butter (1-2 spoonful)
penne
pesto paste


In [15]:
# Select by title and take the first match positionally, so the cell does not
# depend on the row's index label.
print(recipes_df.loc[recipes_df['title'] == 'Prawn Pesto Spaghetti', 'ingredients'].iloc[0])

12-14 prawns
pepper
herbs
chilli powder
1 clove garlic (minced)
olive oil
butter (1-2 spoonful)
spaghetti
pesto paste
salt


In [16]:
# Select by title and take the first match positionally, so the cell does not
# depend on the row's index label.
print(recipes_df.loc[recipes_df['title'] == 'Delifrance Inspired Lobster Filling', 'ingredients'].iloc[0])

prawns
crabmeat
onion
tuna
chili powder
mayo


# BM25

In [17]:
# Show one row to recall the column layout before tokenising the ingredients.
recipes_df.head(1)

Unnamed: 0,title,cuisine,Carbs,Proteins,recipe,ingredients
0,Chicken Udon Noodle Soup,japanese,noodles,chicken,1. cook udon noodles according to directions (...,udon noodles\n1 tsp dashi\n1 tsp soy sauce\n1 ...


In [18]:
# Characters that separate tokens inside an ingredients string, each mapped to a space.
_INGREDIENT_SEPARATORS = str.maketrans({char: ' ' for char in '\n/()-'})


def tokenize_ingredients(ingredients):
    """Split an ingredients string into tokens.

    Newlines, "/", "(", ")" and "-" are treated as separators, and the result
    is split on runs of whitespace, so no empty tokens are produced however
    many separators appear in a row.

    Args:
        ingredients: Raw ingredients text.

    Returns:
        A list of token strings (empty for blank input).
    """
    return ingredients.translate(_INGREDIENT_SEPARATORS).split()


# Map each recipe title to its token list. Rows that share a title overwrite
# one another, so titles are assumed to be unique here.
recipes = {
    row['title']: tokenize_ingredients(row['ingredients'])
    for _, row in recipes_df.iterrows()
}
for key, value in recipes.items():
    print(key, '|', " ".join(value))

Chicken Udon Noodle Soup | udon noodles 1 tsp dashi 1 tsp soy sauce 1 pinch salt 1 pinch sugar 1 cup boiling water cooked chicken
cacio e pepe | salt pecorino romano parmigiano reggiano pepper linguine spaghetti olive oil
Creamy Mentaiko Pasta | spaghetti cheese cooking oil onion garlic red pepper flakes heavy cream mentaiko salt
Beef Enoki Rolls | shabu beef enoki mushrooms dashin stock shiitake mushrooms onion sake mirin soy sauce dashi stock cracked black pepper egg oil chilli powder salt
Cabbage Rolls in Broth | cabbage minced chicken shrimp paste black fungus soy sauce sesame oil white pepper chives broth salt
Chicken Veggie Patties | minced chicken frozen veggies onion garlic minced and grated sesame oil sugar white pepper salt egg ginger grated flour
Delifrance Inspired Lobster Filling | prawns crabmeat onion tuna chili powder mayo
Mushroom Orzo Risotto (Creamy) | mushrooms salt pepper butter garlic onion orzo thyme parsley chicken broth lemon parmesan heavy cream salt
Sour Crea

In [19]:
# Index labels 6, 11 and 12 are the three recipes returned by the retrieval demo
# (Delifrance Inspired Lobster Filling, Prawn Pesto Spaghetti, Prawn Pesto Penne).
recipes_df.loc[[6,11,12]]

Unnamed: 0,title,cuisine,Carbs,Proteins,recipe,ingredients
6,Delifrance Inspired Lobster Filling,western,bread,seafood,Serves 2\n\nIngredients:\n- 7 boiled prawns (...,prawns\ncrabmeat\nonion\ntuna\nchili powder\nmayo
11,Prawn Pesto Spaghetti,italian,pasta,seafood,1. Take 12-14 prawns. Thaw and dry them\n2. Ma...,12-14 prawns\npepper\nherbs\nchilli powder\n1 ...
12,Prawn Pesto Penne,italian,pasta,seafood,1. Take 12-14 prawns. Thaw and dry them\n2. Ma...,12-14 prawns\npepper\nherbs\nchilli powder\n1 ...


In [20]:
from collections import Counter
import math

class BM25:
    """Small BM25 scorer over a pre-tokenised corpus, with an optional per-term trace.

    Args:
        corpus: Sequence of documents, each a list of string tokens.
        titles: Optional display names, indexable by document position, used
            only in the verbose trace printed by :meth:`score`. When omitted,
            the trace reads the notebook-level ``recipes_df["title"]``.

    Attributes set on the instance: ``corpus``, ``titles``, ``doc_lengths``,
    ``avg_doc_length``, ``inverted_index``, ``k1`` (1.5) and ``b`` (0.75).
    """

    def __init__(self, corpus, titles=None):
        self.corpus = corpus
        self.titles = titles
        self.doc_lengths = [len(doc) for doc in corpus]
        # An empty corpus has no average length; 0.0 avoids a ZeroDivisionError.
        self.avg_doc_length = sum(self.doc_lengths) / len(corpus) if corpus else 0.0
        self.inverted_index = self.build_inverted_index(corpus)
        self.k1 = 1.5
        self.b = 0.75

    def build_inverted_index(self, corpus):
        """Map each term to a list of ``(document position, term frequency)`` pairs."""
        index = {}
        for i, doc in enumerate(corpus):
            for term, freq in Counter(doc).items():
                index.setdefault(term, []).append((i, freq))
        return index

    def _display_title(self, i):
        """Return the display name for document ``i`` used in the verbose trace."""
        titles = recipes_df["title"] if self.titles is None else self.titles
        return str(titles[i])

    def score(self, query, doc_ids=(6, 11, 12), verbose=True):
        """Accumulate per-document BM25 scores for a tokenised query.

        Args:
            query: Iterable of query terms.
            doc_ids: Document positions to score; every other document keeps a
                score of 0. The default is the three recipes examined in this
                notebook. Pass ``None`` to score every document.
            verbose: When true, print each term followed by one line per scored
                document with its term frequency and score contribution.

        Returns:
            A list with one score per corpus document, in corpus order.
        """
        selected = None if doc_ids is None else set(doc_ids)
        n_docs = len(self.corpus)
        scores = [0 for _ in range(n_docs)]
        for term in query:
            if verbose:
                print(f'term = {term}')
            postings = self.inverted_index.get(term, [])
            if not postings:
                continue
            # The idf depends only on the term, so compute it once per term.
            idf = math.log((n_docs - len(postings) + 0.5) / (len(postings) + 0.5) + 1.0)
            for i, freq in postings:
                if selected is not None and i not in selected:
                    continue
                numerator = freq * (self.k1 + 1)
                denominator = freq + self.k1 * (1 - self.b + self.b * (self.doc_lengths[i] / self.avg_doc_length))
                current_score = idf * numerator / denominator
                scores[i] += current_score
                if verbose:
                    # Right-justify with spaces so titles containing "0" are printed intact.
                    title = self._display_title(i).rjust(35)
                    print(f'\t| {title} | freq = {freq}\t| current_score = {current_score}')
        return scores

In [21]:
# Example usage
# One token list per recipe, in the insertion order of the `recipes` dict.
corpus = list(recipes.values())
bm25_model = BM25(corpus)

In [22]:
# Score every term used across the two retrieval queries. The per-term trace is
# printed by BM25.score itself, so the returned score list is discarded.
query = ['prawns', 'penne', 'pasta', 'salt']
_ = bm25_model.score(query)

term = prawns
	| Delifrance Inspired Lobster Filling | freq = 1	| current_score = 1.85847031093045
	|               Prawn Pesto Spaghetti | freq = 1	| current_score = 1.2184000061987263
	|                   Prawn Pesto Penne | freq = 1	| current_score = 1.2491292152362885
term = penne
	|                   Prawn Pesto Penne | freq = 1	| current_score = 1.5523096310751912
term = pasta
term = salt
	|               Prawn Pesto Spaghetti | freq = 1	| current_score = 0.17288667576713773


In [23]:
# Per-term score contributions transcribed by hand (three decimals) from the
# trace printed by the scoring cell, followed by the total for each of the two queries.
x_df = pd.DataFrame({
    'title': ['Delifrance Inspired Lobster Filling', 'Prawn Pesto Spaghetti', 'Prawn Pesto Penne'],
    'prawns': [1.858, 1.218, 1.249],
    'penne':  [0,         0, 1.552],
    'pasta':  [0,         0,     0],
    'salt':   [0,     0.172,     0],
}).assign(**{
    'prawns+penne+salt': lambda scores: scores['prawns'] + scores['penne'] + scores['salt'],
    'prawns+pasta+salt': lambda scores: scores['prawns'] + scores['pasta'] + scores['salt'],
})
x_df

Unnamed: 0,title,prawns,penne,pasta,salt,prawns+penne+salt,prawns+pasta+salt
0,Delifrance Inspired Lobster Filling,1.858,0.0,0,0.0,1.858,1.858
1,Prawn Pesto Spaghetti,1.218,0.0,0,0.172,1.39,1.39
2,Prawn Pesto Penne,1.249,1.552,0,0.0,2.801,1.249
