# Prepare Custom Dataset

This notebook generates a Bedrock Evaluations-compatible JSONL file for an evaluation job from the [GSM8K](https://github.com/openai/grade-school-math) dataset.

## Get GSM8K Dataset

In [None]:
from urllib.request import urlretrieve

# Local file name the downloaded GSM8K training split is stored under;
# later cells read the dataset back from this path.
raw_file_name = 'train.jsonl'

# Raw GitHub URL of the GSM8K training split.
gsm8k_train_url = 'https://raw.githubusercontent.com/openai/grade-school-math/refs/heads/master/grade_school_math/data/train.jsonl'

# Fetch the file into the working directory.
urlretrieve(gsm8k_train_url, raw_file_name)

## Read the first 1000 lines

In [None]:
import json
from itertools import islice

# Upper bound on the number of records loaded from the raw dataset.
max_rows = 1000

# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's default encoding.
with open(raw_file_name, 'r', encoding='utf-8') as f:
    # Iterating the file stops cleanly at EOF, so a file with fewer than
    # `max_rows` records no longer feeds an empty string to json.loads().
    # Blank lines are skipped for the same reason.
    non_blank_lines = (line for line in f if line.strip())
    rows = [json.loads(line) for line in islice(non_blank_lines, max_rows)]

## Generate Prompt Dataset

In [None]:
from pprint import pprint

# Prompt wrapper; the `{}` placeholder receives the question text.
prompt_template = """Solve the following math problem.

{}

Respond by only providing an answer to the question"""

# Marker that separates the worked solution from the final answer in an
# `answer` string.
ANSWER_MARKER = '####'


def to_prompt_json(obj: dict) -> dict:
    """Convert one dataset row into a prompt/reference record.

    Args:
        obj: Mapping with a ``'question'`` string and an ``'answer'`` string.
            The text after the first ``####`` marker in the answer is taken
            as the final answer.

    Returns:
        A dict with ``'prompt'`` (the question embedded in
        ``prompt_template``) and ``'referenceResponse'`` (the final answer
        with surrounding whitespace stripped).

    Raises:
        AttributeError: If ``obj`` has no ``'answer'`` entry (the value is
            ``None`` and cannot be split).
    """
    question = obj.get('question')
    answer = obj.get('answer')

    # partition() reports whether the marker was found. A plain find() + 4
    # would turn the "not found" result (-1) into index 3 and silently drop
    # the first three characters of the answer.
    _, marker, final_answer = answer.partition(ANSWER_MARKER)
    if not marker:
        # No marker present: treat the entire answer as the reference.
        final_answer = answer

    return {
        'prompt': prompt_template.format(question),
        'referenceResponse': final_answer.strip(),
    }

def to_prompt_json_line(obj):
    """Serialize one dataset row as a single newline-terminated JSON line."""
    record = to_prompt_json(obj)
    encoded = json.dumps(record)
    return f'{encoded}\n'

# Show the converted form of the first row as a quick sanity check.
print('### Prompt looks like this ###\n')
first_record = to_prompt_json(rows[0])
pprint(first_record)

## Save the Prompt Dataset

In [None]:
# Write the prompt dataset as JSON Lines: one serialized record per row.
with open('gsm8k.jsonl', 'w') as out_file:
    # Serialize every row before writing, so nothing is written if a row fails.
    serialized_lines = list(map(to_prompt_json_line, rows))
    out_file.write(''.join(serialized_lines))