In [10]:
import pandas as pd
import kaggle
from ast import literal_eval

In [None]:

# Download data
kaggle.api.authenticate()
kaggle.api.dataset_download_files(dataset='shuyangli94/food-com-recipes-and-user-interactions/',
                                 path='../../data/',
                                 unzip=True)


In [11]:
raw_df = pd.read_csv('../../data/RAW_recipes.csv', converters={'tags': literal_eval, 'nutrition': literal_eval, 'steps': literal_eval, 'ingredients': literal_eval})
raw_df.head()


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8


In [13]:
raw_df[raw_df['name'].isna()]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
721,,368257,10,779451,2009-04-27,"[15-minutes-or-less, time-to-make, course, pre...","[1596.2, 249.0, 155.0, 0.0, 2.0, 112.0, 14.0]",6,"[in a bowl , combine ingredients except for ol...",-------------,"[lemon, honey, horseradish mustard, garlic clo...",10


In [None]:
interactions_df = pd.read_csv('../../data/RAW_interactions.csv')
interactions_df.head()

In [None]:
interactions_df['review'][0]

In [None]:
import numpy as np
interactions_df.groupby(by='recipe_id', as_index=False)['rating'].agg(np.mean)

In [None]:
avg_rating = interactions_df.groupby(by='recipe_id', as_index=False)['rating'].agg(np.mean)
test_df = pd.merge(raw_df, avg_rating, how='left', left_on='id', right_on="recipe_id")

In [None]:
test_df.isna().sum()

In [None]:
def nutrition_labels(nutrition):
    nutrition_labels = [' calories', '% total fat', '% sugar', '% sodium', '% protein', '% saturated fat', '% total carbohydrates']
    result = ''
    for i in range(6):
        labelled = f'{nutrition[i]}{nutrition_labels[i]}, '
        result += labelled
    return result

In [None]:
def numbered_steps(steps):
    result = ''
    for i, step in enumerate(steps):
        result += f'{i+1} {step}\n'
    return result

In [None]:
def create_doc(row):
    """
    Collapses each row into a single document for the recipe,
    adding labels to column values.
    """
    # Extract and format relevant data from each field
    name = row['name']
    minutes = row['minutes']
    tags = ', '.join(row['tags'])
    description = row['description']
    n_ingredients = row['n_ingredients']
    ingredients = ', '.join(row['ingredients'])
    steps = numbered_steps(row['steps'])    
    nutrition_info = nutrition_labels(row['nutrition'])
    
    # Combine fields into full recipe    
    full_recipe = f"Name: {name}\n\nCook Time: {minutes} minutes\n\nTags: {tags}\n\nDescription: {description}\n\nNumber of ingredients: {n_ingredients}\n\nIngredients List: {ingredients}\n\nSteps:\n{steps}\n\nNutrition: {nutrition_info}"
    return full_recipe

In [None]:
raw_df['full_recipe'] = raw_df.apply(create_doc, axis=1)

In [None]:
raw_df.head()

In [None]:
print(raw_df['full_recipe'][10])

In [None]:
from haystack import Document

# Cast data into Haystack Document objects
titles = list(raw_df['name'].values)
texts = list(raw_df['full_recipe'].values)
documents = []
for title, text in zip(titles, texts):
    documents.append(Document(content=text, meta={'name': title or ''}))

In [None]:
from haystack.document_stores import FAISSDocumentStore

document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)

In [None]:
from haystack.nodes import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True,
)

# Document Store

In [1]:
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import AnswerParser, PromptNode, PromptTemplate, DensePassageRetriever
import pandas as pd

In [2]:
def initialize_documents(file_path):
    """
    Casts recipes from prepared recipe_docs.csv file into document structure for Haystack.

    Args:
        file_path (str): location of recipe_docs.csv file
    Returns:
        documents ()
    """
    # Load data
    df = pd.read_csv(file_path)

    # Cast data into Haystack Document objects
    titles = list(df['name'].values)
    texts = list(df['full_recipe'].values)
    documents = []
    for title, text in zip(titles, texts):
        documents.append(Document(content=text, meta={'name': title or ''}))
    return documents

In [3]:
documents = initialize_documents('../../data/recipe_docs.csv')

In [28]:
# Delete existing documents in document store
document_store.delete_documents()
document_store.write_documents(documents)


Writing Documents: 240000it [08:56, 447.57it/s]                            


In [29]:
index_path = 'rag_faiss_index.faiss'
config_path = 'rag_faiss_index.json'
document_store.load(index_path=index_path, config_path=config_path)

<haystack.document_stores.faiss.FAISSDocumentStore at 0x1e82a014490>

In [30]:
# Initialize Retriever
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True
    )

In [31]:
from haystack import Pipeline
import os

openai_key = os.environ['OPENAI_HACKTOBERFEST_KEY']

prompt_template = PromptTemplate(prompt=""""Offer the user the recipe that best matches their query.
                                 If they ask for a different option, provide them the next best match.
                                 Related text: {join(documents)} \n\n Question: {query} \n\n Answer: 
                                        """,
                                        output_parser=AnswerParser())
prompt_node = PromptNode(model_name_or_path="gpt-3.5-turbo",
                            api_key=openai_key,
                            default_prompt_template=prompt_template,
                            max_length=500,
                            model_kwargs={"stream": True})

query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
query_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])

In [32]:
message = 'What can I make in less than 20 minutes?'

output = query_pipeline.run(query=message, documents=documents)

# Create a Chainlit message with the response
response = output['answers'][0].answer

Expected prompt parameter 'documents' to be provided but it is missing. Continuing with an empty list of documents.


You can make a delicious and quick Caprese Salad in less than 20 minutes. Here's the recipe: 

Ingredients:
- 2 large tomatoes
- 8 ounces fresh mozzarella cheese
- 1/4 cup fresh basil leaves
- 2 tablespoons extra virgin olive oil
- 2 tablespoons balsamic glaze
- Salt and pepper to taste

Instructions:
1. Slice the tomatoes and mozzarella cheese into 1/4-inch thick slices.
2. Arrange the tomato and mozzarella slices on a platter, alternating between them.
3. Tuck fresh basil leaves in between the tomato and mozzarella slices.
4. Drizzle extra virgin olive oil and balsamic glaze over the salad.
5. Season with salt and pepper to taste.
6. Serve immediately and enjoy!

If you're looking for a different option, the next best match would be a quick and easy stir-fry. Let me know if you'd like the recipe for that!

In [16]:
new_doc_store = FAISSDocumentStore(faiss_index_path=index_path, faiss_config_path=config_path)

In [18]:
new_doc_store.describe_documents()

{'count': 231636,
 'chars_mean': 1346.320209293892,
 'chars_max': 12837,
 'chars_min': 421,
 'chars_median': 1245.0}

In [34]:
document_store.describe_documents()

{'count': 231636,
 'chars_mean': 1346.320209293892,
 'chars_max': 12837,
 'chars_min': 421,
 'chars_median': 1245.0}

In [33]:
print(document_store.get_embedding_count())
print(document_store.get_document_count())

0
231636


In [35]:
import torch
torch.cuda.is_available()

False

# In Memory Storage

In [1]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, PromptTemplate, AnswerParser, PromptNode
import os
from haystack.pipelines import Pipeline
import torch
import chainlit as cl
import pandas as pd
from haystack import Document


In [2]:
# Load API key and data
openai_key = os.environ['OPENAI_HACKTOBERFEST_KEY']

df = pd.read_csv('../../data/recipe_docs.csv')

# Cast data into Haystack Document objects
titles = list(df['name'].values)
texts = list(df['full_recipe'].values)
documents = []
for title, text in zip(titles, texts):
    documents.append(Document(content=text, meta={'name': title or ''}))

In [3]:
# Initialize document store
document_store = InMemoryDocumentStore(use_bm25=True)
document_store.delete_documents()
document_store.write_documents(documents)
print(document_store.get_all_documents())

2023-10-22 14:36:55 - Using devices: CPU - Number of GPUs: 0


Updating BM25 representation...: 100%|██████████| 231636/231636 [00:22<00:00, 10294.00 docs/s]


In [5]:
rag_prompt = PromptTemplate(
    prompt="""Offer the user the recipe that best matches their query.
              If they ask for a different option, provide them the next best match.
              \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)

# Set up nodes
retriever = BM25Retriever(document_store=document_store, top_k=2)
pn = PromptNode("gpt-3.5-turbo", 
                api_key=openai_key, 
                model_kwargs={"stream":False},
                default_prompt_template=rag_prompt)

NameError: name 'PromptTemplate' is not defined

In [None]:
# Set up pipeline
pipe = Pipeline()
pipe.add_node(component=retriever, name='retriever', inputs=['Query'])
pipe.add_node(component=pn, name='prompt_node', inputs=['retriever'])

In [1]:
message = "What can I make with spinach, chicken and lemons?"

In [2]:
output = pipe.run(query=message)

# Create a Chainlit message with the response
response = output['answers'][0].answer

NameError: name 'pipe' is not defined

In [None]:
response

In [None]:
output = pipe.run(query='Show me the Jalapeno and Pesto Chicken Pasta Salad recipe.')
output['answers'][0].answer