# Prepare Custom Dataset

This notebook generates a Bedrock Evaluations-compatible JSONL file for an evaluation job, using the [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail) dataset.

## Install dependencies

In [None]:
%pip install -U datasets

## Get CNN/DailyMail Dataset

If the command below fails, restart the kernel and run it again.

In [None]:
from datasets import load_dataset

# Load the "abisee/cnn_dailymail" dataset; "3.0.0" is passed as the
# configuration name (second positional argument of load_dataset).
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Keep direct handles to the 'train' and 'test' splits.
train_set, test_set = dataset['train'], dataset['test']

## Generate Prompt Dataset

In [None]:
from pprint import pprint
import json
import random

# Instruction text for each prompt. The single `{}` placeholder is filled
# with the article body via str.format().
instruction = (
    'Summarize the news article provided below. '
    'Do not provide anything other than a clean summarization in couple sentences.\n'
    '\n'
    'Article: {}\n'
)

# `prompt_template` is the name the helper functions read; `instruction`
# is kept as a second name bound to the same string.
prompt_template = instruction

def to_prompt_json(obj):
    """Build one prompt record from a dataset row.

    Args:
        obj: Mapping-like row. It is indexed with 'article' and queried
            with .get('highlights').

    Returns:
        A dict with two keys, in this order:
        'prompt' -- `prompt_template` formatted with the article text;
        'referenceResponse' -- the value of 'highlights', or None when
        .get() finds no such key.
    """
    record = {'prompt': prompt_template.format(obj['article'])}
    # .get() is used here, so a row without 'highlights' yields None
    # instead of raising.
    record['referenceResponse'] = obj.get('highlights')
    return record

def to_prompt_json_line(obj):
    """Serialize one dataset row as a single JSON Lines entry.

    Args:
        obj: Row passed through unchanged to `to_prompt_json`.

    Returns:
        The JSON text of the prompt record followed by a newline.
    """
    record = to_prompt_json(obj)
    return f'{json.dumps(record)}\n'


# Show one randomly chosen training example rendered as a prompt record.
print('### Prompt looks like this ###\n')
# random.randrange(n) returns an index in [0, n). random.randint(0, n) also
# includes n itself, which is one past the last valid row and would raise
# IndexError.
pprint(to_prompt_json(train_set[random.randrange(len(train_set))]))

## Save the Prompt Dataset

In [None]:
import random

# Number of prompts to write to the evaluation dataset.
num_prompts = 100

# Randomly select distinct rows. random.sample draws without replacement from
# range(len(train_set)), so every index is valid and no article is written
# twice. (random.randint(0, len(train_set)) includes its upper bound and could
# return an out-of-range index, and independent draws could repeat a row.)
# min() keeps the call valid if the split has fewer rows than num_prompts.
indices = random.sample(range(len(train_set)), min(num_prompts, len(train_set)))

prompt_file_name = 'cnn_dailymail.jsonl'

# Write one JSON object per line. The encoding is given explicitly so the
# file does not depend on the platform's default text encoding.
with open(prompt_file_name, 'w', encoding='utf-8') as f:
    f.writelines(to_prompt_json_line(train_set[i]) for i in indices)

## Upload Data to S3

In [None]:
import boto3

# Create the two AWS clients used by this cell.
sts_client = boto3.client('sts')
s3_client = boto3.client('s3')

# The bucket name is derived from the AWS account ID of the current caller.
# NOTE(review): this cell does not create the bucket; it presumably must
# already exist for the upload to succeed -- confirm.
account_info = sts_client.get_caller_identity()
account_id = account_info['Account']
bucket_name = f"bedrock-evaluation-{account_id}"

# The file is stored under the datasets/ prefix, keeping its local name.
object_key = f'datasets/{prompt_file_name}'

print(f'## Uploading {prompt_file_name} to s3://{bucket_name}/{object_key}')
s3_client.upload_file(
    Filename=prompt_file_name,
    Bucket=bucket_name,
    Key=object_key,
)