# Prepare BYOI Dataset

This notebook generates a Bedrock Evaluations-compatible JSONL file for an evaluation job, using the [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail) dataset.

## Install dependencies

In [None]:
# Install or upgrade the Hugging Face `datasets` package; `%pip` (rather than `!pip`) targets the kernel's own environment.
%pip install -U datasets

## Get CNN/DailyMail Dataset

If the cell below fails right after installing the dependencies, restart the kernel and run it again.

In [None]:
from datasets import load_dataset

# Load the "3.0.0" configuration of the abisee/cnn_dailymail dataset.
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Pull out the train and test splits by name.
train_set, test_set = dataset['train'], dataset['test']

## Create Dummy Summarizer

In [None]:
import random

class DummySummarizer:
    """Toy summarizer that keeps a random subset of the input's words.

    The kept words stay in their original relative order, so the result is a
    random "sub-sequence" of the text rather than a meaningful summary.

    Args:
        num_tokens (int): Number of whitespace-separated words to keep.
        seed: Optional seed. When given, this instance draws from its own
            ``random.Random(seed)`` so its output is reproducible and does not
            depend on (or disturb) the module-level ``random`` state. When
            omitted, the module-level ``random`` functions are used.
    """

    def __init__(self, num_tokens: int, seed=None):
        self.num_tokens = num_tokens
        # None means "fall back to the shared module-level generator".
        self._rng = random.Random(seed) if seed is not None else None

    def __call__(self, text: str) -> str:
        """Summarize the input text by randomly selecting words

        Args:
            text (str): Input text to summarize

        Returns:
            str: Exactly ``num_tokens`` words joined by single spaces; the
                unchanged ``text`` if it has ``num_tokens`` words or fewer;
                ``""`` if ``text`` is empty or whitespace-only.

        Raises:
            ValueError: If ``num_tokens`` is negative (raised by the
                underlying ``sample`` call).
        """
        if not text or not text.strip():
            return ""

        # Split text into words
        words = text.split()

        # If num_tokens is greater than available words, return full text
        if self.num_tokens >= len(words):
            return text

        rng = random if self._rng is None else self._rng

        # Sample word positions without replacement, then sort them so the
        # selected words keep their original order.
        selected_indices = sorted(rng.sample(range(len(words)), self.num_tokens))

        return " ".join(words[i] for i in selected_indices)

## Generate Prompt Dataset

In [None]:
from pprint import pprint
import json
import random

# Prompt sent alongside each article; `{}` is replaced by the article text.
prompt_template = '''Summarize the news article provided below. Do not provide anything other than a clean summarization in couple sentences.

Article: {}
'''
# The original cell bound this same string to `instruction` as well; keep the alias.
instruction = prompt_template

def to_prompt_json(obj, summarizer):
    """Build one evaluation record from a dataset row.

    Args:
        obj: Mapping with an 'article' key (required) and a 'highlights' key
            (optional; read with ``.get`` so a missing key yields ``None``).
        summarizer: Callable taking the article text and returning the
            response string to record for it.

    Returns:
        dict: A record with 'prompt', 'referenceResponse' and 'modelResponses'.
            'modelResponses' is a one-element list, because the Bedrock
            bring-your-own-inference dataset format defines that field as a
            list of ``{response, modelIdentifier}`` objects rather than a
            single object.

    Raises:
        KeyError: If ``obj`` has no 'article' key.
    """
    article = obj['article']
    highlight = obj.get('highlights')

    return {
        'prompt': prompt_template.format(article),
        'referenceResponse': highlight,
        'modelResponses': [
            {
                'modelIdentifier': 'dummy',
                'response': summarizer(article),
            },
        ],
    }

def to_prompt_json_line(obj, summarizer):
    """Serialize one dataset row as a single newline-terminated JSON line.

    Args:
        obj: Dataset row, passed through to ``to_prompt_json``.
        summarizer: Callable passed through to ``to_prompt_json``.

    Returns:
        str: The JSON-encoded record followed by a newline character.
    """
    record = to_prompt_json(obj, summarizer)
    serialized = json.dumps(record)
    return f'{serialized}\n'

In [None]:
import random

prompt_file_name = 'cnn_dailymail_byoi.jsonl'

# Number of prompts to write, capped so we never ask for more rows than exist.
num_prompts = min(10, len(train_set))

# Randomly select distinct row positions. Sampling from range(len(train_set))
# can only yield valid positions 0..len-1, whereas random.randint(0, n) is
# inclusive of n and could index one past the last row.
indices = random.sample(range(len(train_set)), num_prompts)

summarizer = DummySummarizer(100)

# Write one JSON record per line; each line already ends with '\n'.
with open(prompt_file_name, 'w', encoding='utf-8') as f:
    f.writelines(to_prompt_json_line(train_set[i], summarizer) for i in indices)

In [None]:
from pprint import pprint

print('### Example Prompt Json looks like this ###')

# Only the first record is shown, so read a single line instead of the whole file.
with open(prompt_file_name, 'r') as prompt_file:
    first_record = json.loads(prompt_file.readline())

pprint(first_record)

## Upload Data to S3

In [None]:
import boto3

# Look up the caller's AWS account ID via STS; it forms the bucket name suffix.
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity()['Account']

# Destination: s3://bedrock-evaluation-<account_id>/datasets/<prompt file name>
bucket_name = f"bedrock-evaluation-{account_id}"
object_key = f'datasets/{prompt_file_name}'

s3_client = boto3.client('s3')

print(f'## Uploading {prompt_file_name} to s3://{bucket_name}/{object_key}')
# NOTE(review): nothing here creates the bucket, so it presumably must already exist — confirm.
s3_client.upload_file(prompt_file_name, bucket_name, object_key)