# Prepare Fine-Tune Dataset

This notebook generates a training dataset for a fine-tuning job from the [GSM8k](https://huggingface.co/datasets/openai/gsm8k) dataset.

## Install dependencies

In [None]:
%pip install -U datasets

## Get GSM8k Dataset

If the following cell fails after the dependencies were installed, restart the kernel and run it again.

In [None]:
from datasets import load_dataset

# Fetch the "main" configuration of GSM8k and keep both splits at hand.
dataset = load_dataset("openai/gsm8k", "main")
train_set, test_set = dataset['train'], dataset['test']

## Format question prompt

In [None]:
from pprint import pprint
import random

# Prompt wrapper; the single `{}` placeholder receives the question text.
prompt_template = """Solve the following math problem.

{}

Respond by only providing an answer to the question"""

def to_prompt_json(obj):
    """Convert one dataset row into a prompt / reference-response record.

    Args:
        obj: Mapping with a 'question' entry and an 'answer' entry. The
            answer text must contain the '####' marker; everything after
            the first marker is taken as the final answer.

    Returns:
        dict with two keys:
            'prompt': ``prompt_template`` filled in with the question.
            'referenceResponse': the text following '####', stripped of
                surrounding whitespace.

    Raises:
        ValueError: if the answer is missing or has no '####' marker.
    """
    marker = '####'
    question = obj.get('question')
    answer = obj.get('answer')

    # str.find() returns -1 when the marker is absent, which previously
    # turned into a bogus slice (answer[3:]) instead of an error.
    if answer is None or marker not in answer:
        raise ValueError(
            f"answer has no '{marker}' final-answer marker: {answer!r}"
        )

    _, _, final_answer = answer.partition(marker)

    return {
        'prompt': prompt_template.format(question),
        'referenceResponse': final_answer.strip(),
    }

def to_prompt_json_line(obj):
    """Serialize one dataset row as a single JSON Lines record.

    Args:
        obj: Dataset row accepted by ``to_prompt_json``.

    Returns:
        str: the JSON encoding of ``to_prompt_json(obj)`` followed by a
        newline character.
    """
    # Imported locally so the function does not depend on a later cell
    # having already imported json into the notebook namespace.
    import json

    return json.dumps(to_prompt_json(obj)) + '\n'

# Preview one randomly chosen training example.
# randrange() excludes its upper bound, so the index is always valid;
# randint(0, len(train_set)) includes len(train_set) and could raise IndexError.
print('### Prompt looks like this ###\n')
pprint(to_prompt_json(train_set[random.randrange(len(train_set))]))

## Generate Dataset File

In [None]:
import json
import random

# Randomly select up to 10 distinct prompts.
# random.sample() draws without replacement from valid indices only, whereas
# random.randint(0, len(train_set)) could return len(train_set) itself
# (IndexError) and could pick the same row more than once.
num_prompts = min(10, len(train_set))
indices = random.sample(range(len(train_set)), num_prompts)

prompt_file_name = 'gsm8k.jsonl'

# One JSON record per line; the explicit encoding avoids depending on the
# platform's default text encoding.
with open(prompt_file_name, 'w', encoding='utf-8') as f:
    f.writelines(to_prompt_json_line(train_set[i]) for i in indices)

## Upload to S3

In [None]:
import boto3

# Clients for the two AWS services this cell talks to.
sts_client = boto3.client('sts')
s3_client = boto3.client('s3')

# The target bucket name embeds the account ID of the active credentials.
account_info = sts_client.get_caller_identity()
account_id = account_info['Account']
bucket_name = f"bedrock-evaluation-{account_id}"

# The dataset keeps its local file name under the datasets/ prefix.
object_key = f'datasets/{prompt_file_name}'

print(f'## Uploading {prompt_file_name} to s3://{bucket_name}/{object_key}')
s3_client.upload_file(prompt_file_name, bucket_name, object_key)
print('Upload Complete')