### Libraries

In [405]:
import asyncio
import random
from pathlib import Path
from typing import List

import nest_asyncio
from datasets import Dataset
from IPython.display import Markdown
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from llama_parse import LlamaParse

from src.settings import settings

DATA = Path("data")

In [2]:
# Enable async in the Jupyter notebook: patch asyncio so event loops can be nested.
# NOTE(review): presumably needed because some libraries used below start their own
# event loop while the kernel's loop is already running — confirm.
nest_asyncio.apply()

In [382]:
# Helper function
def gen_batches(iterable, n=100):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    Args:
        iterable: A sized, sliceable sequence (it must support ``len()`` and
            slice indexing).
        n: Maximum number of items per batch.

    Yields:
        Slices of ``iterable`` in order; the last one may be shorter than ``n``.
    """
    size = len(iterable)
    yield from (iterable[start:min(start + n, size)] for start in range(0, size, n))

### 1. Load and split data

In [6]:
# Parse documents using LlamaParse
parser = LlamaParse(
    api_key=settings.env.LLAMA_CLOUD_API_KEY,
    result_type="markdown",
    verbose=True,
    language="en"
)
documents = await parser.aload_data(file_path=[str(DATA / f"wellarchitected-framework-pt{i}.pdf") for i in range(2)])

Started parsing the file under job_id d19dea99-70ed-4ffc-84a0-afbb06694bc1
Started parsing the file under job_id 1c3cf003-573c-404a-987f-3f836b87ad42
..............................................

In [25]:
# Split text into semantic chunks using OpenAI embeddings
text_splitter = SemanticChunker(embeddings=OpenAIEmbeddings(model="text-embedding-3-large", api_key=settings.env.OPENAI_API_KEY))
splitted_documents = await text_splitter.atransform_documents(documents=[Document(page_content=doc.text) for doc in documents])

### 2. Dataset Creation

In [92]:
# Drop the first 2 chunks: they hold the table of contents, not content
# worth generating questions from.
new_splitted_documents = splitted_documents[2:]

#### 2.1 Generate Questions

In [171]:
# Initialize the chat model (gpt-4-turbo-preview) used by both the question
# and the answer generation chains.
llm = ChatOpenAI(
    model_name="gpt-4-turbo-preview",
    api_key=settings.env.OPENAI_API_KEY,
    temperature=0
)

In [172]:
# Output schema for question generation, plus the parser that turns the model
# response into a `Questions` instance.
class Questions(BaseModel):
    """Generated Synthetic Questions"""
    # Questions generated for a single chunk (the system prompt asks for 3).
    questions: List[str] = Field(..., description="List of generated questions")
# NOTE: the name `parser` is rebound to the answer parser in section 2.2.
parser = PydanticOutputParser(pydantic_object=Questions)

In [173]:
# System message for question generation. `{format_instructions}` is kept as a
# template placeholder and is supplied with each prompt input.
system = (
    "You are a synthetic question-answer pair generator. "
    "Given a chunk of context about some topic(s), generate 3 example questions "
    "a user could ask and would be answered using information from the chunk. "
    "For example, if the given context was a Wikipedia paragraph about the "
    "United States, an example question could be "
    "'How many states are in the United States?'. "
    "The questions should be able to be answered in a few words or less. "
    "Include only the questions in your response.\n"
    "{format_instructions}"
)

In [188]:
# Question-generation chain: prompt -> chat model -> structured-output parser.
question_messages = [("system", system), ("human", "{chunk}")]
generate_instructions_prompt = ChatPromptTemplate.from_messages(messages=question_messages)
generate_instructions_chain = generate_instructions_prompt | llm | parser

In [182]:
# Build one prompt input per chunk.
# The format instructions do not depend on the chunk, so they are rendered once
# here and not once per chunk.
format_instructions = parser.get_format_instructions()
prompts = [
    {"chunk": doc.page_content, "format_instructions": format_instructions}
    for doc in new_splitted_documents
]

In [187]:
# Generate all the questions
all_questions = []
batch_size = 100
total = len(prompts) // batch_size + 1

for idx, batch in enumerate(gen_batches(prompts, batch_size)):
        print(f"Batch {idx}/{total} processing...")
        all_questions.extend(await generate_instructions_chain.abatch(batch, return_exceptions=False))
        print(f"Batch {idx}/{total} processed! sleeping...")
        await asyncio.sleep(60)
        print(f"Batch {idx}/{total} done!")

Batch 0/6 processing...
Batch 0/6 processed! sleeping...
Batch 0/6 done!
Batch 1/6 processing...
Batch 1/6 processed! sleeping...
Batch 1/6 done!
Batch 2/6 processing...
Batch 2/6 processed! sleeping...
Batch 2/6 done!
Batch 3/6 processing...
Batch 3/6 processed! sleeping...
Batch 3/6 done!
Batch 4/6 processing...
Batch 4/6 processed! sleeping...
Batch 4/6 done!
Batch 5/6 processing...
Batch 5/6 processed! sleeping...
Batch 5/6 done!


In [352]:
# Inspect the questions generated for the first chunk, pretty-printed as JSON
print(all_questions[0].json(indent=4))

{
    "questions": [
        "What are the six pillars of the AWS Well-Architected Framework?",
        "What does the term 'workload' refer to in the context of the AWS Well-Architected Framework?",
        "What is the purpose of the AWS Well-Architected Tool?"
    ]
}


In [227]:
# Compare the number of questions requested (3 per chunk) with the number returned.
expected_count = len(all_questions) * 3
actual_count = sum(len(result.questions) for result in all_questions)
print(f"Number of expected questions: {expected_count}")
print(f"Number of actual questions: {actual_count}")

Number of expected questions: 1680
Number of actual questions: 1671


In [229]:
# Keep only the good cases (exactly 3 generated questions), paired with the
# chunk they were generated from so both tuples stay index-aligned.
# zip() silently truncates on a length mismatch, which would pair questions
# with the wrong chunk, so fail loudly instead.
assert len(all_questions) == len(new_splitted_documents), "questions and chunks are misaligned"
good_pairs = [
    (qs, context)
    for qs, context in zip(all_questions, new_splitted_documents)
    if len(qs.questions) == 3
]
_questions, _context = zip(*good_pairs)

#### 2.2 Generate Answers

In [283]:
# Output schema for answer generation, plus the parser that turns the model
# response into an `Answer` instance.
# NOTE(review): the docstring and the field description are part of the
# pydantic schema and presumably reach the model through
# `parser.get_format_instructions()` — hence the "Chain-of-though" typo fix.
class Answer(BaseModel):
    """Generated Chain-of-thought Style Answer"""
    # Reasoning followed by the final answer, as a single string.
    answer: str = Field(..., description="Generated Chain-of-thought answer")
# Rebinds `parser` (previously the question parser) to the answer parser.
parser = PydanticOutputParser(pydantic_object=Answer)

In [320]:
# Define system & user message for answer generation.
# Fixes to the prompt text: a missing space after "relevant context." (the
# sentences were fused as "context.Answer"), "Here is things" -> "Here are
# things", and "succint" -> "succinct".
# NOTE(review): this message asks for the final answer as `<ANSWER>: $answer`,
# while the one-shot example in this notebook ends with `##Answer: ...` and the
# generated answers follow the example. Confirm which marker downstream
# consumers expect before changing either side.
system = (
    "You are a helpful question answerer who can provide an answer given a question and relevant context. "
    "Answer the question using the information given in the context. "
    "Here are things to pay attention to: \n"
    "- First provide step-by-step reasoning on how to answer the question. \n"
    "- In the reasoning, if you need to copy paste some sentences from the context, include "
    "them in ##begin_quote## and ##end_quote##. This would mean that things outside of "
    "##begin_quote## and ##end_quote## are not directly copy paste from the context. \n"
    "- End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct."
)
# Template placeholders are filled once per (question, chunk) pair.
user = "#### Question: {question} \n\n#### Context: {context}\n\n{format_instructions}"

In [342]:
# One-shot example (question, context and expected answer) that is placed in
# the prompt ahead of the real question.
os_question = "The Oberoi family is part of a hotel company that has a head office in what city?"
os_context = (
    "[The Oberoi family is an Indian family that is famous for its involvement in hotels, namely through The Oberoi Group]"
    "...[It is located in city center of Jakarta, near Mega Kuningan, adjacent to the sister JW Marriott Hotel. It is operated by The Ritz-Carlton Hotel Company. The complex has two towers that comprises a hotel and the Airlangga Apartment respectively]"
    "...[The Oberoi Group is a hotel company with its head office in Delhi.]"
)
# The example question and context are inlined now; `{format_instructions}` stays
# a template placeholder.
os_user = "#### Question: " + os_question + " \n\n#### Context: " + os_context + "\n\n{format_instructions}"
# Doubled braces are template escapes for literal `{` and `}`.
os_json_answer = """```json
{{
    "answer": "##Reason: The document ##begin_quote## The Oberoi family is an Indian family that is famous for its involvement in hotels, namely through The Oberoi Group. ##end_quote## establishes that the Oberoi family is involved in the Oberoi group, and the document ##begin_quote## The Oberoi Group is a hotel company with its head office in Delhi. ##end_quote## establishes the head office of The Oberoi Group. Therefore, the Oberoi family is part of a hotel company whose head office is in Delhi. ##Answer: Delhi"
}}
```"""

In [343]:
# Answer-generation chain. The prompt is: system message, the one-shot example
# (human turn + ai turn), then the real question/context turn.
answer_messages = [
    ("system", system),
    ("human", os_user),
    ("ai", os_json_answer),
    ("human", user),
]
generate_answer_prompt = ChatPromptTemplate.from_messages(messages=answer_messages)
generate_answer_chain = generate_answer_prompt | llm | parser

In [344]:
# Build one prompt input per (question, source chunk) pair, in chunk order.
# The format instructions are the same for every prompt, so render them once.
format_instructions = parser.get_format_instructions()
prompts = [
    {
        "question": q,
        "context": doc.page_content,
        "format_instructions": format_instructions,
    }
    # `_questions` and `_context` are index-aligned, so pair them directly
    # without indexing one by the position in the other.
    for qs, doc in zip(_questions, _context)
    for q in qs.questions
]

In [345]:
# Generate all the answers
_answers = []
batch_size = 100
total = len(prompts) // batch_size + 1

for idx, batch in enumerate(gen_batches(prompts, batch_size), start=1):
        print(f"Batch {idx}/{total} processing...")
        _answers.extend(await generate_answer_chain.abatch(batch, return_exceptions=False))
        print(f"Batch {idx}/{total} processed! sleeping...")
        await asyncio.sleep(60)
        print(f"Batch {idx}/{total} done!")

Batch 1/17 processing...
Batch 1/17 processed! sleeping...
Batch 1/17 done!
Batch 2/17 processing...
Batch 2/17 processed! sleeping...
Batch 2/17 done!
Batch 3/17 processing...
Batch 3/17 processed! sleeping...
Batch 3/17 done!
Batch 4/17 processing...
Batch 4/17 processed! sleeping...
Batch 4/17 done!
Batch 5/17 processing...
Batch 5/17 processed! sleeping...
Batch 5/17 done!
Batch 6/17 processing...
Batch 6/17 processed! sleeping...
Batch 6/17 done!
Batch 7/17 processing...
Batch 7/17 processed! sleeping...
Batch 7/17 done!
Batch 8/17 processing...
Batch 8/17 processed! sleeping...
Batch 8/17 done!
Batch 9/17 processing...
Batch 9/17 processed! sleeping...
Batch 9/17 done!
Batch 10/17 processing...
Batch 10/17 processed! sleeping...
Batch 10/17 done!
Batch 11/17 processing...
Batch 11/17 processed! sleeping...
Batch 11/17 done!
Batch 12/17 processing...
Batch 12/17 processed! sleeping...
Batch 12/17 done!
Batch 13/17 processing...
Batch 13/17 processed! sleeping...
Batch 13/17 done!


In [353]:
# Inspect the first generated answer, pretty-printed as JSON
print(_answers[0].json(indent=4))

{
    "answer": "##Reason: The AWS Well-Architected Framework is based on six pillars, as outlined in the context. These pillars are: ##begin_quote## operational excellence, security, reliability, performance efficiency, cost optimization, and sustainability. ##end_quote## Each pillar focuses on a specific aspect of architecture best practices and strategies for designing and operating reliable, secure, efficient, cost-effective, and sustainable systems in the cloud. ##Answer: operational excellence, security, reliability, performance efficiency, cost optimization, sustainability"
}


#### 2.3 Flatten lists

In [376]:
# Flatten to one entry per question so questions, contexts and answers can be
# zipped together in the next steps.
_flattened_questions = [q for qs in _questions for q in qs.questions]
# Repeat each chunk once per question it produced, rather than a hard-coded 3
# times, so the two lists stay aligned by construction.
_flattened_context = [
    chunk.page_content
    for qs, chunk in zip(_questions, _context)
    for _ in qs.questions
]
# Make sure that the lists have the same length before relying on index alignment.
assert len(_flattened_context) == len(_flattened_questions) == len(_answers), "flattened lists are misaligned"
len(_flattened_context), len(_flattened_questions), len(_answers)

(1671, 1671, 1671)

In [391]:
# Verify index match: show the context, question and answer stored at one index.
sample_idx = 10
sections = (
    ("#### Context", _flattened_context[sample_idx]),
    ("#### Question", _flattened_questions[sample_idx]),
    ("#### Answer", _answers[sample_idx].answer),
)
for heading, body in sections:
    display(Markdown(heading))
    display(Markdown(body))

#### Context

You can script your operations procedures and automate their process by launching them in response to events. By performing operations as code, you limit human error and create consistent responses to events. - Make frequent, small, reversible changes: Design workloads that are scalable and loosely coupled to permit components to be updated regularly. Automated deployment techniques together with smaller, incremental changes reduces the blast radius and allows for faster reversal when failures occur. This increases confidence to deliver beneficial changes to your workload while maintaining quality and adapting quickly to changes in market conditions. - Refine operations procedures frequently: As you evolve your workloads, evolve your operations appropriately. As you use operations procedures, look for opportunities to improve them. Hold regular reviews and validate that all procedures are effective and that teams are familiar with them. Where gaps are identified, update procedures accordingly. Communicate procedural updates to all stakeholders and teams. Gamify your operations to share best practices and educate teams. - Anticipate failure: Perform “pre-mortem” exercises to identify potential sources of failure so that they can be removed or mitigated. Test your failure scenarios and validate your understanding of their impact. Test your response procedures to ensure they are effective and that teams are familiar with their process. Set up regular game days to test workload and team responses to simulated events. - Learn from all operational failures: Drive improvement through lessons learned from all operational events and failures.

#### Question

What is the benefit of making frequent, small, reversible changes in operations?

#### Answer

##Reason: The context ##begin_quote## Make frequent, small, reversible changes: Design workloads that are scalable and loosely coupled to permit components to be updated regularly. Automated deployment techniques together with smaller, incremental changes reduces the blast radius and allows for faster reversal when failures occur. This increases confidence to deliver beneficial changes to your workload while maintaining quality and adapting quickly to changes in market conditions. ##end_quote## highlights the benefits of making frequent, small, reversible changes in operations. These benefits include reducing the blast radius of changes, allowing for faster reversal when failures occur, increasing confidence in delivering beneficial changes, maintaining quality, and adapting quickly to changes in market conditions. ##Answer: Reducing the blast radius of changes, allowing for faster reversal when failures occur, increasing confidence in delivering beneficial changes, maintaining quality, and adapting quickly to changes in market conditions.

#### 2.4 Populate the dataset

In [400]:
# Initialise `dataset` as an empty Dataset (built from an empty dict)
dataset = Dataset.from_dict({})

In [401]:
num_distract = 3  # Number of distractor documents
p = 0.9  # Probability of including the oracle document
seed = 42  # Fixed seed so re-running the cell samples the same distractors

rng = random.Random(seed)

# `_flattened_context` repeats each chunk once per question. Sampling distractors
# from it directly (excluding only index i) can pick the oracle chunk itself or
# the same chunk twice, so sample from the distinct chunks instead.
unique_chunks = list(dict.fromkeys(_flattened_context))

records = []
for i, (question, chunk, answer) in enumerate(zip(_flattened_questions, _flattened_context, _answers)):
    # Draw one spare distractor. It takes the oracle's slot when the oracle is
    # left out, so the context never contains the same document twice.
    candidates = [c for c in unique_chunks if c != chunk]
    sampled = rng.sample(candidates, k=num_distract + 1)

    # With probability p the oracle document is kept; with probability 1 - p it
    # is replaced by the spare distractor.
    if rng.uniform(0, 1) < p:
        docs = [chunk] + sampled[:num_distract]
    else:
        docs = sampled
    # Shuffle so the oracle's position carries no signal.
    rng.shuffle(docs)

    records.append({
        "id": f"seed_task_{i}",
        "type": "general",
        "question": question,
        "context": {
            "title": [["placeholder_title"] * (num_distract + 1)],
            "sentences": [docs],
        },
        "oracle_context": chunk,
        "cot_answer": answer.answer,
        # Model instruction: every document wrapped in <DOCUMENT> tags, then the question
        "instruction": "\n".join(f"<DOCUMENT>{doc}</DOCUMENT>" for doc in docs) + "\n" + question,
    })

# Build the dataset in a single call: add_item() returns a new Dataset on every
# call, which makes row-by-row growth slow, and re-running the cell would
# append duplicate rows to the existing dataset.
dataset = Dataset.from_list(records)

In [413]:
# Inspect the first record of the dataset
dataset[0]

{'id': 'seed_task_0',
 'type': 'general',
 'question': 'What are the six pillars of the AWS Well-Architected Framework?',
 'context': {'sentences': [['All other trademarks not owned by Amazon are the property of their respective owners, who may or may not be affiliated with, connected to, or sponsored by Amazon. ---\n|Content|Page Number|\n|---|---|\n|Abstract and introduction|1|\n|Introduction|1|\n|Definitions|2|\n|On architecture|4|\n|General design principles|6|\n|The pillars of the framework|8|\n|Operational excellence|8|\n|Design principles|9|\n|Definition|10|\n|Best practices|10|\n|Resources|19|\n|Security|20|\n|Design principles|20|\n|Definition|21|\n|Best practices|22|\n|Resources|31|\n|Reliability|31|\n|Design principles|32|\n|Definition|32|\n|Best practices|33|\n|Resources|38|\n|Performance efficiency|38|\n|Design principles|39|\n|Definition|39|\n|Best practices|40|\n|Resources|45|\n|Cost optimization|46|\n|Design principles|46|\n|Definition|47|\n|Best practices|48|\n|Resourc

In [407]:
# Upload the dataset to the Hugging Face Hub under this repository name
dataset.push_to_hub("raft-dataset-aws-wellarchitected")

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 12.43ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.85s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/jjovalle99/raft-dataset-aws-wellarchitected/commit/73fa19d70efe7429de30d8870c36b7d545186b2a', commit_message='Upload dataset', commit_description='', oid='73fa19d70efe7429de30d8870c36b7d545186b2a', pr_url=None, pr_revision=None, pr_num=None)

![evid](assets/dataset-hf.png)