# Prepare Custom Dataset

This notebook generates a Bedrock Evaluations-compatible JSONL file for an evaluation job, using the [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail) dataset.

## Get CNN/DailyMail Dataset

In [None]:
from datasets import load_dataset

# Load configuration "3.0.0" of the CNN/DailyMail dataset from the Hugging Face Hub.
# No split is requested here; later cells index the result by split name.
dataset = load_dataset(path="abisee/cnn_dailymail", name="3.0.0")

## Generate Prompt Dataset

In [None]:
from pprint import pprint
import json

# Instruction template for every prompt; the single `{}` placeholder receives
# the article text via str.format. `instruction` is kept as an alias bound to
# the same string.
prompt_template = instruction = (
    'Summarize the news article provided below. Do not provide anything other than a clean summarization in couple sentences.\n'
    '\n'
    'Article: {}\n'
)

def to_prompt_json(obj):
    """Build one prompt record from a dataset row.

    Args:
        obj: Mapping with a required 'article' entry and an optional
            'highlights' entry.

    Returns:
        A dict with two keys: 'prompt' (the prompt template filled in with
        the article text) and 'referenceResponse' (the row's highlights, or
        None when the row has no 'highlights' entry).

    Raises:
        KeyError: If the row has no 'article' entry.
    """
    record = {'prompt': prompt_template.format(obj['article'])}
    record['referenceResponse'] = obj.get('highlights')
    return record

def to_prompt_json_line(obj):
    """Serialize a dataset row's prompt record as one newline-terminated JSON line."""
    serialized = json.dumps(to_prompt_json(obj))
    return f'{serialized}\n'


# Preview the record generated from the first training example.
first_example = dataset['train'][0]
print('### Prompt looks like this ###\n')
pprint(to_prompt_json(first_example))

## Save the Prompt Dataset

In [None]:
import random

# Randomly select up to 1000 distinct prompts.
# random.randint includes its upper bound, so randint(0, len(...)) could return
# an index one past the end. Sampling from range(len(...)) keeps every index
# valid and also avoids writing the same prompt twice.
train_set = dataset['train']
sample_size = min(1000, len(train_set))
indices = random.sample(range(len(train_set)), sample_size)

prompt_file_name = 'cnn_dailymail.jsonl'

# Write one JSON object per line (JSON Lines).
with open(prompt_file_name, 'w', encoding='utf-8') as f:
    f.writelines(to_prompt_json_line(train_set[i]) for i in indices)

## Upload Data to S3

In [None]:
import boto3

# Placeholder: replace with the name of your own S3 bucket before running.
BUCKET_NAME = "bedrock-evaluation-YOUR BUCKET NAME"

# Upload the generated file under the "datasets/" prefix of the bucket.
object_key = f'datasets/{prompt_file_name}'
s3_client = boto3.client('s3')
s3_client.upload_file(Filename=prompt_file_name, Bucket=BUCKET_NAME, Key=object_key)