# Fine-tuning Bloomz Seq2Seq Model on Botpress Dataset

## Development Environment and Permissions

### Installation

In [3]:
!pip install "sagemaker>=2.48.0" "transformers==4.12.3" "datasets[s3]==1.18.3" sacrebleu torch sentencepiece transformers[sentencepiece] --upgrade

Collecting transformers[sentencepiece]
  Using cached transformers-4.25.1-py3-none-any.whl (5.8 MB)
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
  Using cached transformers-4.23.1-py3-none-any.whl (5.3 MB)
  Using cached transformers-4.23.0-py3-none-any.whl (5.3 MB)
  Using cached transformers-4.22.2-py3-none-any.whl (4.9 MB)
  Using cached transformers-4.22.1-py3-none-any.whl (4.9 MB)
  Using cached transformers-4.22.0-py3-none-any.whl (4.9 MB)
  Using cached transformers-4.21.3-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.21.2-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.21.1-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.21.0-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.20.1-py3-none-any.whl (4.4 MB)
  Using cached transformers-4.20.0-py3-none-any.whl (4.4 MB)
  Using cached transformers-4.19.4-py3-none-any.whl (4.2 MB)
  Using cached transformers-4.19.3-py3-none-any.whl (4.2 MB)
  Using cached transformers-4.19.2-py3-none-an

### Development environment

In [4]:
# SageMaker SDK (Hugging Face integration) and the Hugging Face seq2seq building blocks
import sagemaker.huggingface
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Print the installed transformers version so it can be checked against the pin in the install cell
print(transformers.__version__)
from datasets import load_dataset, load_metric

# Standard library and data-handling helpers used for loading and splitting the dataset
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Sets the WANDB_DISABLED flag for this process — presumably to keep the
# Weights & Biases logging integration switched off; confirm if W&B is wanted
os.environ["WANDB_DISABLED"]="true"

4.12.3


### Permissions

In [5]:
import sagemaker

# Bucket used for uploading data, models and logs. SageMaker creates it
# automatically if it does not exist yet. Leave as None to use the default bucket.
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# IAM role the notebook runs under; it is handed to the estimator later on
role = sagemaker.get_execution_role()

# Re-create the session, this time bound explicitly to the chosen bucket
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Report the resolved role, bucket and region
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::352302020638:role/service-role/SageMaker-MLEngineer
sagemaker bucket: sagemaker-eu-west-2-352302020638
sagemaker session region: eu-west-2


## Loading the fine-tuning dataset

In [6]:
# Read the raw JSONL file as a list of lines (one JSON record per line).
# The encoding is explicit so decoding does not depend on the platform's locale default.
with open('ob-loose-jun28-sm.jsonl', 'r', encoding='utf-8') as json_file:
    jsonl = json_file.readlines()

In [7]:
# Wrap every JSONL record under a 'sequences' key, which is the column the
# preprocessing function reads. Blank lines are skipped because json.loads
# would raise on them.
data = [{'sequences': json.loads(t)} for t in jsonl if t.strip()]

In [8]:
# Split the records into train (90%), validation (~5%) and test (~5%).
# A fixed seed keeps the membership of each split identical on every run;
# without it, re-running the notebook trains and evaluates on different examples.
SPLIT_SEED = 42

# First hold out 10% of the records, then halve that hold-out into validation and test
train, holdout = train_test_split(data, test_size=0.1, random_state=SPLIT_SEED)
validation, test = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

# One DataFrame per split so each can be written back out as JSONL
train_df = pd.DataFrame(train)
validation_df = pd.DataFrame(validation)
test_df = pd.DataFrame(test)


In [9]:
# Persist each split as its own JSON-lines file (one record per line)
for split_df, split_path in (
    (train_df, 'ob-loose-jun28-sm_train.jsonl'),
    (validation_df, 'ob-loose-jun28-sm_validation.jsonl'),
    (test_df, 'ob-loose-jun28-sm_test.jsonl'),
):
    split_df.to_json(path_or_buf=split_path, orient='records', lines=True)

In [10]:
# Read the three JSONL split files back in as a single Hugging Face DatasetDict
base_path = 'ob-loose-jun28-sm_'
split_files = {
    "train": base_path + "train.jsonl",
    "validation": base_path + "validation.jsonl",
    "test": base_path + "test.jsonl",
}
raw_datasets = load_dataset("json", data_files=split_files)

Using custom data configuration default-f53d8eda4a457a87


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-f53d8eda4a457a87/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-f53d8eda4a457a87/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Show the splits with their features and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sequences'],
        num_rows: 907
    })
    validation: Dataset({
        features: ['sequences'],
        num_rows: 50
    })
    test: Dataset({
        features: ['sequences'],
        num_rows: 51
    })
})

## Preprocessing

In [12]:
# Checkpoint to fine-tune, starting with mt0-small. The same name is used to
# load the tokenizer and is passed to the training job as a hyperparameter.
model_checkpoint = "bigscience/mt0-small"

# S3 key prefix (inside the session bucket) under which the datasets are stored
s3_prefix = 'samples/datasets/fine_tuning'

In [13]:
# Load the tokenizer that belongs to the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
# Text prepended to every prompt (empty: prompts are used as-is)
prefix = ""
# Token limits; longer prompts / completions are truncated
max_input_length = 128
max_target_length = 128
# Keys of each 'sequences' record holding the model input and the expected output
source = "prompt"
target = "completion"

def preprocess_function(examples):
    """Tokenize a batch of prompt/completion records for seq2seq training.

    Args:
        examples: a batch mapping whose "sequences" entry is a list of records,
            each providing the `source` and `target` keys.

    Returns:
        The tokenizer output for the prompts, extended with a "labels" entry
        that holds the token ids of the completions.
    """
    records = examples["sequences"]
    prompts = [prefix + record[source] for record in records]
    completions = [record[target] for record in records]

    encoded = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # Completions are tokenized with the tokenizer switched to target mode
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(completions, max_length=max_target_length, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [15]:
# Tokenize every split; batched=True hands preprocess_function whole batches of records
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
# Show the splits again to confirm the tokenized columns were added
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sequences', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 907
    })
    validation: Dataset({
        features: ['sequences', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['sequences', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 51
    })
})

In [17]:
# Pick the train and test splits and expose only the model columns as PyTorch tensors
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']
for dataset_split in (train_dataset, test_dataset):
    dataset_split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

### Uploading data to sagemaker_session_bucket

In [18]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# S3 destinations for both splits, inside the session bucket under s3_prefix
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# Save each split to its destination through the S3 filesystem
for dataset_split, destination in (
    (train_dataset, training_input_path),
    (test_dataset, test_input_path),
):
    dataset_split.save_to_disk(destination, fs=s3)

# Fine-tuning the BLOOMZ model

In [19]:
# Display the training script that the estimator uses as its entry point
!pygmentize ./scripts/train.py

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Trainer, AutoTokenizer, DataCollatorForSeq2Seq
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_from_disk, load_metric
import random
import logging
import sys
import argparse
import os
import torch
import numpy as np

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    #

## Creating an Estimator and starting a training job

In [32]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters passed into the training job.
# NOTE(review): presumably read by the argparse parser in scripts/train.py —
# confirm that these key names match the arguments it defines.
hyperparameters={'epochs': 1,
                 'train_batch_size': 4,
                 'model_name': model_checkpoint
                 }

In [33]:
# Estimator settings: what to run, on which hardware, and with which framework versions
estimator_settings = {
    # training script and the directory that contains it
    'entry_point': 'train.py',
    'source_dir': './scripts',
    # compute used by the training job
    'instance_type': 'ml.p3.2xlarge',
    'instance_count': 1,
    # IAM role the job runs under
    'role': role,
    # framework versions requested for the training job
    'transformers_version': '4.12',
    'pytorch_version': '1.9',
    'py_version': 'py38',
    'hyperparameters': hyperparameters,
}
huggingface_estimator = HuggingFace(**estimator_settings)

In [None]:
# Start the training job; the 'train' and 'test' channels point at the S3
# locations the datasets were saved to.
# NOTE(review): the validation split created earlier is not passed to the job — confirm this is intended.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-01-04-17-37-43-190


2023-01-04 17:37:43 Starting - Starting the training job...
2023-01-04 17:38:09 Starting - Preparing the instances for training.........
2023-01-04 17:39:26 Downloading - Downloading input data...
2023-01-04 17:39:55 Training - Downloading the training image..............

## Deploying the endpoint for inference

In [None]:
# Deploy the trained model behind a single-instance GPU endpoint
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

In [None]:
# Sample request in the same prompt format as the fine-tuning records
test_input= {"prompt":"Rephrase and join the sentences to remove repetition and sound more human without changing the wording and semantics.\nThe only exception is that you are allowed to rephrase from the user query the <|NOT_SURE|> parts.\n\n###\n\nQuestion: Did you know that || You mentioned that patients can schedule follow-up appointments at the doctor\u2019s office, what about first-time patients that want to schedule appointments in person? || We offer that as well!\nRobotic Answer: <|NOT_SURE|> <|Patients can schedule an appointment at the lobby at every Hospital X facility|>\nHuman Answer:"}

# Send the request to the deployed endpoint.
# NOTE(review): the payload uses a "prompt" key; the default Hugging Face inference
# handler on SageMaker conventionally expects the text under "inputs" — confirm which
# key the deployed model's handler reads before relying on this call.
predictor.predict(test_input)

In [None]:
# Remove the inference endpoint once it is no longer needed
predictor.delete_endpoint()

# Extra information

In [None]:
# Container image the training job ran in
training_image_uri = huggingface_estimator.image_uri
print(f"container image used for training job: \n{training_image_uri}\n")

# S3 location of the trained model artifact
model_artifact_uri = huggingface_estimator.model_data
print(f"s3 uri where the trained model is located: \n{model_artifact_uri}\n")

# Name of the most recent training job started by this estimator
training_job_name = huggingface_estimator.latest_training_job.name
print(f"latest training job name for this estimator: \n{training_job_name}\n")

In [None]:
# Fetch the logs of the estimator's most recent training job through its SageMaker session
huggingface_estimator.sagemaker_session.logs_for_job(huggingface_estimator.latest_training_job.name)