In [86]:
!pip install -q python-dotenv
!pip install -q openai
!pip install -q --upgrade langchain
!pip install -q pickle5

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone


In [87]:
import os
import openai
import pickle5 as pickle

from dotenv import load_dotenv, find_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [5]:
from google.colab import drive
drive.mount('/content/drive')

saved_model_path = "/content/drive/MyDrive/Projects/rag/saved_model"
data_path = "/content/drive/MyDrive/Projects/rag/data"

Mounted at /content/drive


In [6]:
# account for deprecation of LLM model
import datetime
# Get the current date
current_date = datetime.datetime.now().date()

# Define the date after which the model should be set to "gpt-3.5-turbo"
target_date = datetime.date(2024, 6, 12)

# Set the model variable based on the current date
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

In [13]:
def get_completion(prompt, model=llm_model):
    client = openai.OpenAI(api_key=openai.api_key)
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    response = completion.choices[0].message.content
    return response

**LangChain : Prompting LLM for Data Augmentation**

In [85]:
prompt_preface = """
    Example :
      Sample Input:
        ```Recipe :labneh fresh herbs and olive oil',
        Instructions : Line a strainer with a double layer of cheesecloth and suspend over a bowl.
        Spoon in yogurt. Refrigerate and let drain for at least 2 hours. Discard liquid.
        The longer the yogurt drains, the thicker the cheese will be. For a thicker spread, drain covered yogurt overnight in the refrigerator.
        Transfer to a bowl. Add oil, tarragon, basil, chives, thyme, zest, salt and pepper, and whisk until blended.
        Let sit for 15 minutes to allow the flavors to meld.
        Taste and adjust seasoning with salt and pepper.
        Labneh will keep in an airtight container in the refrigerator for up to 5 days.```

        Question : Can I let the ingredients sit for longer to make the flavors stronger?
        Answer : Only 15 minutes is needed for the flavors to meld.
"""

improved_question_schema = ResponseSchema(name="improved_question",
                             description="If the question is very simple, you need to reframe the question and improve it by converting it to a question that requires more cooking related reasoning and details.")
improved_answer_schema = ResponseSchema(name="improved_answer",
                                      description="Generate a detailed answer to the improved question in less than 150 words.")
follow_up_schema = ResponseSchema(name="follow_up",
                                    description="follow_up: Array of 5 to 10 follow up questions and corresponding answers related to the the context and previous question and answer. format : [{'question': string, 'answer': string}]",
                                    type="array(objects)")

response_schemas = [improved_question_schema,
                    improved_answer_schema,
                    follow_up_schema]


output_template = """
  You are given a recipe delimited by triple backticks.
  Following that is a question about the recipe and an answer is provided after the question.
  Return the following information :

  improved_question: If the question is very simple, you need to reframe the question and improve it by converting it to a question that requires more cooking related reasoning and details.
  improved_answer: Generate a detailed answer to the improved question in less than 100 words.
  follow_up: Array of 5 to 10 follow up questions and corresponding answers related to the the context and previous question and answer.

  text: {text}

  {format_instructions}
"""

In [74]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [75]:
format_instructions = output_parser.get_format_instructions()

In [76]:
def create_prompt(recipe, question, answer):
  prompt = prompt_preface + f"\nRecipe : {recipe}\nQuestion : {question}\nAnswer : {answer}"
  return prompt

**Sample Prompt and Response**

In [77]:
recipe = """
  labneh fresh herbs and olive oil',
  Instructions : Line a strainer with a double layer of cheesecloth and suspend over a bowl.
  Spoon in yogurt. Refrigerate and let drain for at least 2 hours. Discard liquid.
  The longer the yogurt drains, the thicker the cheese will be. For a thicker spread, drain covered yogurt overnight in the refrigerator.
  Transfer to a bowl. Add oil, tarragon, basil, chives, thyme, zest, salt and pepper, and whisk until blended.
  Let sit for 15 minutes to allow the flavors to meld.
  Taste and adjust seasoning with salt and pepper.
  Labneh will keep in an airtight container in the refrigerator for up to 5 days.
"""
question = "Can I let the ingredients sit for longer to make the flavors stronger?"
answer = "Only 15 minutes is needed for the flavors to meld."

prompt_example = create_prompt(recipe, question, answer)
prompt_example

"\n    Example :\n      Sample Input:\n        ```Recipe :labneh fresh herbs and olive oil',\n        Instructions : Line a strainer with a double layer of cheesecloth and suspend over a bowl.\n        Spoon in yogurt. Refrigerate and let drain for at least 2 hours. Discard liquid.\n        The longer the yogurt drains, the thicker the cheese will be. For a thicker spread, drain covered yogurt overnight in the refrigerator.\n        Transfer to a bowl. Add oil, tarragon, basil, chives, thyme, zest, salt and pepper, and whisk until blended.\n        Let sit for 15 minutes to allow the flavors to meld.\n        Taste and adjust seasoning with salt and pepper.\n        Labneh will keep in an airtight container in the refrigerator for up to 5 days.```\n\n        Question : Can I let the ingredients sit for longer to make the flavors stronger?\n        Answer : Only 15 minutes is needed for the flavors to meld.\n\nRecipe : \n  labneh fresh herbs and olive oil',\n  Instructions : Line a stra

In [78]:
# from langchain.prompts import ChatPromptTemplate

# prompt_template = ChatPromptTemplate.from_template(output_template)
# print(prompt_template)

# messages = prompt_template.format_messages(text=prompt_example)
# chat = ChatOpenAI(temperature=0.0, model=llm_model)
# response = chat(messages)
# print(response.content)

# print(type(response.content))

In [79]:
chat = ChatOpenAI(temperature=0.0, model=llm_model)
prompt = ChatPromptTemplate.from_template(template=output_template)
messages = prompt.format_messages(text=prompt_example,
                                format_instructions=format_instructions)

In [80]:
print(messages[0].content)


  You are given a recipe delimited by triple backticks.
  Following that is a question about the recipe and an answer is provided after the question.
  Return the following information :

  improved_question: If the question is very simple, you need to reframe the question and improve it by converting it to a question that requires more cooking related reasoning and details.
  improved_answer: Generate a detailed answer to the improved question in less than 100 words.
  follow_up: Array of 5 to 10 follow up questions and corresponding answers related to the the context and previous question and answer.

  text: 
    Example :
      Sample Input:
        ```Recipe :labneh fresh herbs and olive oil',
        Instructions : Line a strainer with a double layer of cheesecloth and suspend over a bowl.
        Spoon in yogurt. Refrigerate and let drain for at least 2 hours. Discard liquid.
        The longer the yogurt drains, the thicker the cheese will be. For a thicker spread, drain cover

In [81]:
response = chat(messages)

In [82]:
print(response.content)

```json
{
	"improved_question": "What are some other herbs that can be added to the labneh?",
	"improved_answer": "Other herbs that can be added to the labneh include rosemary, parsley, cilantro, and dill. Experiment with different combinations to find your favorite flavor profile.",
	"follow_up": [
		{
			"question": "Can I use Greek yogurt instead of regular yogurt?",
			"answer": "Yes, Greek yogurt can be used instead of regular yogurt. However, the resulting labneh may be slightly thicker and tangier."
		},
		{
			"question": "What can I do with leftover liquid from the yogurt?",
			"answer": "The leftover liquid, also known as whey, can be used in smoothies, soups, or as a substitute for buttermilk in baking recipes."
		},
		{
			"question": "Can I add other ingredients to the labneh, such as garlic or lemon juice?",
			"answer": "Yes, garlic, lemon juice, and other flavorings can be added to the labneh to customize the taste. Just be sure to adjust the seasoning accordingly."
		}

In [83]:
output_dict = output_parser.parse(response.content)
type(output_dict)

dict

In [84]:
output_dict

{'improved_question': 'What are some other herbs that can be added to the labneh?',
 'improved_answer': 'Other herbs that can be added to the labneh include rosemary, parsley, cilantro, and dill. Experiment with different combinations to find your favorite flavor profile.',
 'follow_up': [{'question': 'Can I use Greek yogurt instead of regular yogurt?',
   'answer': 'Yes, Greek yogurt can be used instead of regular yogurt. However, the resulting labneh may be slightly thicker and tangier.'},
  {'question': 'What can I do with leftover liquid from the yogurt?',
   'answer': 'The leftover liquid, also known as whey, can be used in smoothies, soups, or as a substitute for buttermilk in baking recipes.'},
  {'question': 'Can I add other ingredients to the labneh, such as garlic or lemon juice?',
   'answer': 'Yes, garlic, lemon juice, and other flavorings can be added to the labneh to customize the taste. Just be sure to adjust the seasoning accordingly.'},
  {'question': 'What are some se

In [None]:
output_dict.get()

**Generating QA dataset for Amazon Wizard of Tasks dataset**

**Importing Instructions and QA data**

In [90]:
with open(os.path.join(data_path, "recipes.pickle"), "rb") as f:
  recipes_dataset = pickle.load(f)

formated_recipes_dataset = {}
for k, v in recipes_dataset.items():
  formated_recipes_dataset[k.strip()] = v
print(len(formated_recipes_dataset))

237


In [95]:
import json

with open(os.path.join(data_path, "cleaned__wizard_of_tasks_cooking.json")) as f:
    d = json.load(f)
    print(d)

print(len(d))

{'Wizard-of-Task-food-1': {'document_url': 'https://www.wholefoodsmarket.com/recipes/labneh-fresh-herbs-and-olive-oil', 'data_split': 'test', 'turns': [{'text': "Hi! I love labneh but I've never mixed it with herbs into a spread before, looks amazing. What ingredients do I need to start? Thank you :)", 'turn_counter': 1, 'dangerous_tools': [], 'shared_data': [], 'intent': 'ask_question_ingredients_tools', 'real_life_action': 'N/A', 'relevant': 'yes', 'useful': 'yes', 'worker_id': 111, 'previous_worker_id': None, 'role': 'student'}, {'text': 'Here are the ingredients you will need!', 'turn_counter': 2, 'dangerous_tools': [], 'shared_data': ['2 cups plain Greek yogurt', '1 tablespoon extra-virgin olive oil', '1 teaspoon lemon zest', '1/2  teaspoon fine sea salt', '1/4  teaspoon ground black pepper', '1 teaspoon chopped fresh chives', '1 teaspoon chopped fresh thyme', '1 tablespoon chopped fresh tarragon', '1 tablespoon chopped fresh basil'], 'relevant': 'yes', 'useful': 'yes', 'intent': 

In [96]:
recipe_data = {}
for i, (k, v) in enumerate(d.items()):
  title = " ".join(v['document_url'].split("/")[-1].split("-")).strip()
  if title in formated_recipes_dataset:
    instructions = formated_recipes_dataset[title]
    qna = []

    turn = 0
    while turn+1 < len(v['turns']):
      question, answer = None, None
      student_turn = v['turns'][turn]
      teacher_turn = v['turns'][turn + 1]

      if student_turn["role"] == "student":
        question = student_turn["text"]
      if teacher_turn["role"] == "teacher":
        answer = teacher_turn["text"]
      qna.append({"question": question, "answer": answer})
      turn += 2

    recipe_data[title] = {"instructions": instructions, "qna": qna}
print(len(recipe_data))

237


In [102]:
def create_qna_dataset(data):
  res = {}
  for i, (title, v) in enumerate(data.items()):
    if i > 0 and i % 10 == 0:
      print(f"{i+1} recipes data generated...")

    instructions = v["instructions"]
    recipe = title + "\n" + instructions
    qna = v["qna"]
    res_qna = []
    for qa in qna:
      question, answer = qa["question"], qa["answer"]
      prompt_template = create_prompt(recipe, question, answer)
      prompt = ChatPromptTemplate.from_template(template=output_template)
      messages = prompt.format_messages(text=prompt_template,
                                format_instructions=format_instructions)
      response = chat(messages)
      output_dict = output_parser.parse(response.content)
      res_qna.append(output_dict)
    res[title] = {"instructions": instructions, "prompt_output": output_dict}
    with open(os.path.join(data_path, "prompt_output.pickle"), "wb") as f:
      pickle.dump(res, f)
  return res

In [103]:
res = create_qna_dataset(recipe_data)

11 recipes data generated...
21 recipes data generated...
31 recipes data generated...
41 recipes data generated...
51 recipes data generated...
61 recipes data generated...
71 recipes data generated...
81 recipes data generated...
91 recipes data generated...
101 recipes data generated...
111 recipes data generated...
121 recipes data generated...
131 recipes data generated...
141 recipes data generated...
151 recipes data generated...
161 recipes data generated...
171 recipes data generated...
181 recipes data generated...
191 recipes data generated...
201 recipes data generated...
211 recipes data generated...
221 recipes data generated...
231 recipes data generated...
