## Setting up Libraries

In [1]:
import boto3
import json
import jsonlines
import os
import pprint
import random
import warnings
from datasets import load_dataset

warnings.filterwarnings('ignore')

## Preparing the Session and Making Configurations

In [2]:
# Default boto3 session and the region it is configured for.
session = boto3.session.Session()
region = session.region_name

# Look up the caller's account id; the bucket name is suffixed with region and account id.
sts_client = boto3.client("sts")
account_id = sts_client.get_caller_identity()["Account"]
s3_suffix = f"{region}-{account_id}"
bucket_name = f"my-bedrock-fine-tuning-custom-model-{s3_suffix}"

# Service handles used throughout the notebook: S3 (resource and client),
# Bedrock control plane, Bedrock runtime, and IAM.
s3 = boto3.resource("s3")
s3_client = boto3.client("s3")
bedrock = boto3.client(service_name="bedrock", region_name=region)
bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name=region)
iam = boto3.client("iam", region_name=region)

In [3]:
# Names of the IAM role and managed policy used for fine-tuning, and the ARN
# that a role with this name has in the current account.
role_name = "myAmazonBedrockFineTuningCustomRole"
s3_bedrock_finetuning_access_policy = "myAmazonBedrockFineTuningCustomPolicy"
customization_role = "arn:aws:iam::{}:role/{}".format(account_id, role_name)

In [4]:
# Create the bucket that holds the datasets and the fine-tuning output.
# Outside us-east-1, S3 requires the target region to be passed as a
# LocationConstraint; for us-east-1 (or when no region is configured) the
# constraint must be omitted, which is what the bare call used to do.
create_bucket_kwargs = {"Bucket": bucket_name}
if region and region != "us-east-1":
    create_bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region}

s3bucket = s3_client.create_bucket(**create_bucket_kwargs)

In [10]:
# IAM trust policy (JSON text) for the fine-tuning role.
# It allows the bedrock.amazonaws.com service principal to call sts:AssumeRole,
# but only when the request's source account equals `account_id` and its source
# ARN matches a model-customization-job in `region` for that account.
# The doubled braces are f-string escapes for literal JSON braces; only
# {account_id} and {region} are interpolated.
s3_bedrock_role = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Principal": {{
                "Service": "bedrock.amazonaws.com"
            }},
            "Action": "sts:AssumeRole",
            "Condition": {{
                "StringEquals": {{
                    "aws:SourceAccount": "{account_id}"
                }},
                "ArnEquals": {{
                    "aws:SourceArn": "arn:aws:bedrock:{region}:{account_id}:model-customization-job/*"
                }}
            }}
        }}
    ]
}}
"""

In [11]:
# Create the fine-tuning role with the `s3_bedrock_role` trust policy and keep
# its ARN for the customization job.
create_role_kwargs = {
    "RoleName": role_name,
    "AssumeRolePolicyDocument": s3_bedrock_role,
    "Description": "Role for Bedrock to access S3 for finetuning",
}
response = iam.create_role(**create_role_kwargs)
role_arn = response["Role"]["Arn"]
pprint.pp(response)

{'Role': {'Path': '/',
          'RoleName': 'myAmazonBedrockFineTuningCustomRole',
          'RoleId': 'AROAXMQUPTNEBW26SG2CH',
          'Arn': 'arn:aws:iam::507922848584:role/myAmazonBedrockFineTuningCustomRole',
          'CreateDate': datetime.datetime(2024, 10, 8, 16, 6, 8, tzinfo=tzutc()),
          'AssumeRolePolicyDocument': {'Version': '2012-10-17',
                                       'Statement': [{'Effect': 'Allow',
                                                      'Principal': {'Service': 'bedrock.amazonaws.com'},
                                                      'Action': 'sts:AssumeRole',
                                                      'Condition': {'StringEquals': {'aws:SourceAccount': '507922848584'},
                                                                    'ArnEquals': {'aws:SourceArn': 'arn:aws:bedrock:us-east-1:507922848584:model-customization-job/*'}}}]}},
 'ResponseMetadata': {'RequestId': '836a9303-2f7b-448e-82ac-61eef0aaa92f',
       

In [12]:
# IAM permissions policy (JSON text) granting object read/write/delete, listing,
# ACL read and notification get/put on the notebook's bucket and on every
# object inside it. Only {bucket_name} is interpolated; the doubled braces are
# f-string escapes for literal JSON braces.
s3_access_policy = f"""{{
    "Version": "2012-10-17",
    "Statement": [
        {{
            "Effect": "Allow",
            "Action": [
                "s3:AbortMultipartUpload",
                "s3:DeleteObject",
                "s3:PutObject",
                "s3:GetObject",
                "s3:GetBucketAcl",
                "s3:GetBucketNotification",
                "s3:ListBucket",
                "s3:PutBucketNotification"
            ],
            "Resource": [
                "arn:aws:s3:::{bucket_name}",
                "arn:aws:s3:::{bucket_name}/*"
            ]
        }}
    ]
}}"""


In [13]:
# Register the S3 access policy as a managed IAM policy and keep its ARN so it
# can be attached to the fine-tuning role.
create_policy_kwargs = {
    "PolicyName": s3_bedrock_finetuning_access_policy,
    "PolicyDocument": s3_access_policy,
}
response = iam.create_policy(**create_policy_kwargs)
policy_arn = response["Policy"]["Arn"]
pprint.pp(response)

{'Policy': {'PolicyName': 'myAmazonBedrockFineTuningCustomPolicy',
            'PolicyId': 'ANPAXMQUPTNEOLLEBQ4TY',
            'Arn': 'arn:aws:iam::507922848584:policy/myAmazonBedrockFineTuningCustomPolicy',
            'Path': '/',
            'DefaultVersionId': 'v1',
            'AttachmentCount': 0,
            'PermissionsBoundaryUsageCount': 0,
            'IsAttachable': True,
            'CreateDate': datetime.datetime(2024, 10, 8, 16, 6, 29, tzinfo=tzutc()),
            'UpdateDate': datetime.datetime(2024, 10, 8, 16, 6, 29, tzinfo=tzutc())},
 'ResponseMetadata': {'RequestId': '231d8c80-8198-478a-b99d-b1535d887336',
                      'HTTPStatusCode': 200,
                      'HTTPHeaders': {'date': 'Tue, 08 Oct 2024 16:06:28 GMT',
                                      'x-amzn-requestid': '231d8c80-8198-478a-b99d-b1535d887336',
                                      'content-type': 'text/xml',
                                      'content-length': '809'},
              

In [14]:
# Attach the S3 access policy to the fine-tuning role.
iam.attach_role_policy(RoleName=role_name, PolicyArn=policy_arn)

{'ResponseMetadata': {'RequestId': '2fd35ab8-83fb-48a9-b399-3f45d391ef53',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Tue, 08 Oct 2024 16:06:31 GMT',
   'x-amzn-requestid': '2fd35ab8-83fb-48a9-b399-3f45d391ef53',
   'content-type': 'text/xml',
   'content-length': '212'},
  'RetryAttempts': 0}}

## Loading the GovReport Dataset

In [15]:
# Load the "ccdv/govreport-summarization" dataset via the `datasets` library.
dataset = load_dataset("ccdv/govreport-summarization")

Downloading readme:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/228M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/229M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/17517 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/973 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/973 [00:00<?, ? examples/s]

In [16]:
# Show the available splits with their features and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['report', 'summary'],
        num_rows: 17517
    })
    validation: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
    test: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
})


In [17]:
# Prompt prefix prepended to every report: a task description, an
# "instruction:" section and an "input:" header, each followed by a blank line.
# The report text is appended directly after the trailing blank line.
instruction='''Below is an instruction which describes a task, paired with an input which will provide further context. Write a response that appropriately completes the request.

instruction:

Summarize the report provided below.

input:

'''

In [18]:
# Build prompt/completion records for the training split.
# The completion prefix is 'response:\n\n' (two newlines) so training records
# use exactly the same format as the validation and test records, and match the
# blank-line-separated section headers of the instruction template.
datapoints_train = [
    {
        'prompt': instruction + data['report'],
        'completion': 'response:\n\n' + data['summary'],
    }
    for data in dataset['train']
]

In [19]:
# Spot-check one training prompt (index 4) to confirm the template was applied.
print(datapoints_train[4]['prompt'])

Below is an instruction which describes a task, paired with an input which will provide further context. Write a response that appropriately completes the request.

instruction:

Summarize the report provided below.

input:

In 1991, we reported that, historically, INS leadership had allowed INS’ organizational structure to become decentralized without adequate controls. Specifically, its regional structure had created geographical separation among INS programs and hampered resource allocation and consistent program implementation. The field structure designed to carry out INS’ enforcement functions was bifurcated between districts and Border Patrol sectors, resulting in uncoordinated, overlapping programs. In addition, only a single senior INS headquarters manager supervised INS’ 33 district directors and 21 Border Patrol chiefs. In 1994, with the appointment of a new Commissioner, INS implemented an organizational structure intended to remedy at least two problems. First, the headqua

In [20]:
# Validation records: instruction + report as the prompt, and the
# 'response:' header followed by the summary as the completion.
datapoints_valid = [
    {
        'prompt': instruction + record['report'],
        'completion': 'response:\n\n' + record['summary'],
    }
    for record in dataset['validation']
]

# Test records, built the same way from the test split.
datapoints_test = [
    {
        'prompt': instruction + record['report'],
        'completion': 'response:\n\n' + record['summary'],
    }
    for record in dataset['test']
]

In [21]:
def data_transform(data_points, num_data, max_data_length, seed=None):
    """Filter records by combined length, shuffle them, and keep at most ``num_data``.

    Args:
        data_points: Iterable of dicts with string ``'prompt'`` and
            ``'completion'`` values.
        num_data: Maximum number of records to return.
        max_data_length: Inclusive upper bound on
            ``len(prompt) + len(completion)``, measured in characters.
        seed: Optional seed for the shuffle. When given, a private
            ``random.Random(seed)`` is used so the selection is reproducible
            and the global random state is left untouched. When ``None``
            (the default) the module-level ``random.shuffle`` is used.

    Returns:
        A new list with the selected records; ``data_points`` is not modified.
    """
    # Summing the two lengths avoids building a throwaway concatenated string
    # for every (potentially very long) report.
    lines = [
        data
        for data in data_points
        if len(data['prompt']) + len(data['completion']) <= max_data_length
    ]
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(lines)
    return lines[:num_data]

In [33]:
def jsonl_converter(dataset, file_name):
    """Write ``dataset`` to ``file_name`` in JSON Lines format, one record per line.

    Args:
        dataset: Iterable of records accepted by the ``jsonlines`` writer.
        file_name: Destination path. Missing parent directories are created.

    Raises:
        Exception: Any error raised while opening or writing the file is
            reported and then re-raised, so later cells never continue with a
            missing or partially written file.
    """
    directory = os.path.dirname(file_name)
    if directory:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(directory, exist_ok=True)

    print(f"Writing to file: {file_name}")

    try:
        with jsonlines.open(file_name, 'w') as writer:
            for line in dataset:
                writer.write(line)
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

    print(f"Successfully wrote data to {file_name}")

In [34]:
# Keep only records whose prompt + completion is at most 10000 characters, then
# sample up to 5000 training, 999 validation and 10 test records.
train = data_transform(datapoints_train, num_data=5000, max_data_length=10000)
validation = data_transform(datapoints_valid, num_data=999, max_data_length=10000)
test = data_transform(datapoints_test, num_data=10, max_data_length=10000)

## Using Local Directories for Fine-tuning Datasets

In [36]:
# Local folder and file names for the fine-tuning datasets.
dataset_folder = "myfine-tuning-datasets"
train_file_name = "mytrain-govreport.jsonl"
validation_file_name = "myvalidation-govreport.jsonl"
test_file_name = "mytest-govreport.jsonl"

# Create the folder from Python instead of `!mkdir`: this reuses
# `dataset_folder` rather than repeating the literal, does not depend on the
# shell, and does not error when the folder already exists (e.g. on a re-run).
os.makedirs(dataset_folder, exist_ok=True)
abs_path = os.path.abspath(dataset_folder)
print(abs_path)

/Users/debnsuma/AWS-AI-Engineering/code/fcc-ai-engineering-aws/fine-tuning/myfine-tuning-datasets


In [37]:
# Write each split to its JSON Lines file inside the local dataset folder,
# in the order train, validation, test.
for split_records, split_file_name in (
    (train, train_file_name),
    (validation, validation_file_name),
    (test, test_file_name),
):
    jsonl_converter(split_records, f'{abs_path}/{split_file_name}')

Writing to file: /Users/debnsuma/AWS-AI-Engineering/code/fcc-ai-engineering-aws/fine-tuning/myfine-tuning-datasets/mytrain-govreport.jsonl
Successfully wrote data to /Users/debnsuma/AWS-AI-Engineering/code/fcc-ai-engineering-aws/fine-tuning/myfine-tuning-datasets/mytrain-govreport.jsonl
Writing to file: /Users/debnsuma/AWS-AI-Engineering/code/fcc-ai-engineering-aws/fine-tuning/myfine-tuning-datasets/myvalidation-govreport.jsonl
Successfully wrote data to /Users/debnsuma/AWS-AI-Engineering/code/fcc-ai-engineering-aws/fine-tuning/myfine-tuning-datasets/myvalidation-govreport.jsonl
Writing to file: /Users/debnsuma/AWS-AI-Engineering/code/fcc-ai-engineering-aws/fine-tuning/myfine-tuning-datasets/mytest-govreport.jsonl
Successfully wrote data to /Users/debnsuma/AWS-AI-Engineering/code/fcc-ai-engineering-aws/fine-tuning/myfine-tuning-datasets/mytest-govreport.jsonl


In [38]:
# Object keys for the three splits under the fine-tuning-datasets/ prefix.
train_key = f'fine-tuning-datasets/train/{train_file_name}'
validation_key = f'fine-tuning-datasets/validation/{validation_file_name}'
test_key = f'fine-tuning-datasets/test/{test_file_name}'

# Upload the local JSON Lines files to the bucket.
s3_client.upload_file(f'{abs_path}/{train_file_name}', bucket_name, train_key)
s3_client.upload_file(f'{abs_path}/{validation_file_name}', bucket_name, validation_key)
s3_client.upload_file(f'{abs_path}/{test_file_name}', bucket_name, test_key)

# S3 URIs of the uploaded files, used in the customization job configuration.
s3_train_uri = f's3://{bucket_name}/{train_key}'
s3_validation_uri = f's3://{bucket_name}/{validation_key}'
s3_test_uri = f's3://{bucket_name}/{test_key}'

## Initialize Bedrock Session for Fine-Tuning

In [39]:
# Re-create the session and clients for the fine-tuning part of the notebook.
# The region comes from the session configuration so that the Bedrock clients
# use the same region as the bucket name and the IAM trust policy's SourceArn
# built earlier in the notebook; us-east-1 (the previously hard-coded value) is
# used only as a fallback when the session has no region configured.
session = boto3.session.Session()
region = session.region_name or 'us-east-1'
sts_client = boto3.client('sts')
s3_client = boto3.client('s3')
aws_account_id = sts_client.get_caller_identity()["Account"]

# Bedrock control-plane and runtime clients
bedrock = boto3.client(service_name="bedrock", region_name=region)
bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name=region)

In [41]:
# File and folder names used to read the test set back for inference. These
# repeat the values assigned to `test_file_name` and `dataset_folder` earlier
# in the notebook.
test_file_name = "mytest-govreport.jsonl"
data_folder = "myfine-tuning-datasets"

## Creating the Fine-tuning Job

In [42]:
from datetime import datetime

# Second-resolution local timestamp used as the suffix of the job and model names.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Base model, customization type, and the IAM role passed to the job.
base_model_id = "meta.llama2-13b-v1:0:4k"
customization_type = "FINE_TUNING"
customization_role = role_arn

customization_job_name = "llama2-finetune-sm-test-model-" + timestamp
custom_model_name = "llama2-finetune-" + timestamp

In [43]:
# Hyperparameter values are given as strings.
hyper_parameters = dict(epochCount="2", batchSize="1", learningRate="0.00005")

# Where the job reads the training and validation data from.
training_data_config = dict(s3Uri=s3_train_uri)
validation_data_config = dict(validators=[dict(s3Uri=s3_validation_uri)])

# Where the job writes its output, under a per-model prefix in the same bucket.
output_data_config = dict(s3Uri=f's3://{bucket_name}/outputs/output-{custom_model_name}')

In [44]:
# Submit the fine-tuning job; the call's response is the cell output.
customization_job_kwargs = {
    "customizationType": customization_type,
    "jobName": customization_job_name,
    "customModelName": custom_model_name,
    "roleArn": customization_role,
    "baseModelIdentifier": base_model_id,
    "hyperParameters": hyper_parameters,
    "trainingDataConfig": training_data_config,
    "validationDataConfig": validation_data_config,
    "outputDataConfig": output_data_config,
}
bedrock.create_model_customization_job(**customization_job_kwargs)

{'ResponseMetadata': {'RequestId': 'ff8004e2-05d3-46f9-b564-4f5339919f30',
  'HTTPStatusCode': 201,
  'HTTPHeaders': {'date': 'Tue, 08 Oct 2024 16:24:01 GMT',
   'content-type': 'application/json',
   'content-length': '112',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'ff8004e2-05d3-46f9-b564-4f5339919f30'},
  'RetryAttempts': 0},
 'jobArn': 'arn:aws:bedrock:us-east-1:507922848584:model-customization-job/meta.llama2-13b-v1:0:4k/e9u3zrgtr5ho'}

### Waiting until the customization job is completed

In [45]:
import time

# Seconds to wait between status checks.
POLL_INTERVAL_SECONDS = 50

# Poll the job by its exact name. get_model_customization_job addresses one
# specific job, whereas list_model_customization_jobs(nameContains=...)[0] could
# return a different job whose name merely contains this one, or raise
# IndexError when the list comes back empty.
while True:
    customization_job = bedrock.get_model_customization_job(
        jobIdentifier=customization_job_name
    )
    status = customization_job["status"]
    print(status)
    # 'InProgress' and 'Stopping' are the only non-terminal states.
    if status not in ("InProgress", "Stopping"):
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# Fail loudly here rather than in a later cell when the job did not complete.
if status != "Completed":
    failure_message = customization_job.get("failureMessage", "no failure message reported")
    raise RuntimeError(
        f"Customization job {customization_job_name} ended with status "
        f"{status}: {failure_message}"
    )

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


## Select Customization Job

In [46]:
# Show the first summary returned for jobs whose name contains this notebook's job name.
job_summaries = bedrock.list_model_customization_jobs(
    nameContains=customization_job_name
)["modelCustomizationJobSummaries"]
job_summaries[0]

{'jobArn': 'arn:aws:bedrock:us-east-1:507922848584:model-customization-job/meta.llama2-13b-v1:0:4k/e9u3zrgtr5ho',
 'baseModelArn': 'arn:aws:bedrock:us-east-1::foundation-model/meta.llama2-13b-v1:0:4k',
 'jobName': 'llama2-finetune-sm-test-model-2024-10-08-12-23-49',
 'status': 'Completed',
 'lastModifiedTime': datetime.datetime(2024, 10, 8, 17, 2, 51, 443000, tzinfo=tzutc()),
 'creationTime': datetime.datetime(2024, 10, 8, 16, 24, 0, 914000, tzinfo=tzutc()),
 'endTime': datetime.datetime(2024, 10, 8, 17, 2, 51, 79000, tzinfo=tzutc()),
 'customModelArn': 'arn:aws:bedrock:us-east-1:507922848584:custom-model/meta.llama2-13b-v1:0:4k/9llpk3c7n6gb',
 'customModelName': 'llama2-finetune-2024-10-08-12-23-49',
 'customizationType': 'FINE_TUNING'}

In [48]:
from ipywidgets import Dropdown

# Ask Bedrock for completed jobs only, newest first, so the dropdown's default
# selection is the most recently created model.
completed_jobs = bedrock.list_model_customization_jobs(
    statusEquals="Completed",
    sortBy="CreationTime",
    sortOrder="Descending",
)["modelCustomizationJobSummaries"]

# Map custom model name -> job summary, and build one dropdown label per model.
customization_jobs = {}
dropdown_vals = []
for cj in completed_jobs:
    if cj["status"] == "Completed":
        customization_jobs[cj["customModelName"]] = cj
        dropdown_vals.append(cj["customModelName"] + " - creationTime: " + cj["creationTime"].strftime("%Y-%m-%d %H:%M:%S"))

# Without this guard, dropdown_vals[0] below raises an unhelpful IndexError.
if not dropdown_vals:
    raise RuntimeError(
        "No completed model customization jobs were found; "
        "wait for the fine-tuning job to finish before selecting a model."
    )

# display the model-ids in a dropdown to select a model for inference.
model_dropdown = Dropdown(
    options=dropdown_vals,
    value=dropdown_vals[0],
    description="Select a model",
    style={"description_width": "initial"},
    layout={"width": "max-content"},
)
display(model_dropdown)

Dropdown(description='Select a model', layout=Layout(width='max-content'), options=('llama2-finetune-2024-10-0…

In [49]:
# Dropdown labels are "<model name> - creationTime: <timestamp>"; keep the name part
# and look up the matching custom model ARN.
selected_model = model_dropdown.value.partition(" - creationTime: ")[0]
custom_model_name = selected_model
custom_model_arn = customization_jobs[selected_model]["customModelArn"]
(custom_model_name, custom_model_arn)

('llama2-finetune-2024-10-08-12-23-49',
 'arn:aws:bedrock:us-east-1:507922848584:custom-model/meta.llama2-13b-v1:0:4k/9llpk3c7n6gb')

## Creating Provisioned Throughput

In [51]:
import time

# Purchase one model unit of Provisioned Throughput for the selected custom model.
provisioned_model_id = bedrock.create_provisioned_model_throughput(
    modelUnits=1,
    provisionedModelName=custom_model_name,
    modelId=custom_model_arn,
)['provisionedModelArn']

# The create call returns while the throughput is still being created, and the
# model cannot be invoked until it is 'InService'. Wait here so the inference
# cell that follows also works under "Run All".
provisioning_status = bedrock.get_provisioned_model_throughput(
    provisionedModelId=provisioned_model_id
)['status']
while provisioning_status == 'Creating':
    time.sleep(60)
    provisioning_status = bedrock.get_provisioned_model_throughput(
        provisionedModelId=provisioned_model_id
    )['status']
    print(provisioning_status)

if provisioning_status != 'InService':
    raise RuntimeError(
        f"Provisioned throughput {provisioned_model_id} is {provisioning_status}, "
        "expected InService"
    )

In [None]:
file_path_for_testing = f'{data_folder}/{test_file_name}'

# Decode explicitly as UTF-8 instead of the platform default encoding: the
# reports contain non-ASCII characters (e.g. the typographic apostrophe).
# Iterate the file rather than calling str.splitlines(): splitlines() also
# breaks on characters such as U+2028 and U+0085, which may appear unescaped
# inside a JSON string and would cut a record in half.
with open(file_path_for_testing, encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]

# Parse the first record once and take both fields from it.
first_record = json.loads(lines[0])
test_prompt = first_record['prompt']
reference_summary = first_record['completion']


In [None]:
# Request payload: the test prompt plus generation length and sampling settings.
body = json.dumps(
    dict(prompt=test_prompt, max_gen_len=300, temperature=0.5, top_p=0.5)
)

# Invoke the provisioned custom model, sending and accepting JSON.
modelId = provisioned_model_id
accept = 'application/json'
contentType = 'application/json'

response = bedrock_runtime.invoke_model(
    body=body,
    modelId=modelId,
    accept=accept,
    contentType=contentType,
)

# The response body is a stream; read it fully and decode the JSON document.
raw_response_body = response.get('body').read()
response_body = json.loads(raw_response_body)
print(response_body)

## Clean Up

In [None]:
# Delete the Provisioned Throughput created for the custom model.
bedrock.delete_provisioned_model_throughput(provisionedModelId=provisioned_model_id)

In [None]:
# Empty the bucket page by page before deleting it. A single list call returns
# at most 1000 keys, so an un-paginated listing could leave objects behind and
# make delete_bucket fail on a non-empty bucket.
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name):
    # 'Contents' is absent from a page that has no objects.
    for obj in page.get('Contents', []):
        s3_client.delete_object(Bucket=bucket_name, Key=obj['Key'])
s3_client.delete_bucket(Bucket=bucket_name)

In [None]:
# Remove the IAM resources created by this notebook. The policy has to be
# detached before the role can be deleted, and it has to be deleted as well:
# otherwise it stays in the account and create_policy fails on the next run
# because a policy with the same name already exists.
iam.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn)
iam.delete_role(RoleName=role_name)
iam.delete_policy(PolicyArn=policy_arn)