# Fine-tuning Bloomz Seq2Seq Model on Botpress Dataset

## Development Environment and Permissions

### Installation

In [64]:
!pip install "sagemaker>=2.48.0" "transformers==4.12.3" "datasets[s3]==1.18.3" sacrebleu torch sentencepiece transformers[sentencepiece] --upgrade

Collecting sagemaker>=2.48.0
  Downloading sagemaker-2.127.0.tar.gz (655 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m655.0/655.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting transformers[sentencepiece]
  Using cached transformers-4.25.1-py3-none-any.whl (5.8 MB)
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
  Using cached transformers-4.23.1-py3-none-any.whl (5.3 MB)
  Using cached transformers-4.23.0-py3-none-any.whl (5.3 MB)
  Using cached transformers-4.22.2-py3-none-any.whl (4.9 MB)
  Using cached transformers-4.22.1-py3-none-any.whl (4.9 MB)
  Using cached transformers-4.22.0-py3-none-any.whl (4.9 MB)
  Using cached transformers-4.21.3-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.21.2-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.21.1-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.21.0-py3-none-any.whl (4.7 MB)
  Using cached transformers-4.

### Development environment

In [65]:
import sagemaker.huggingface
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
print(transformers.__version__)
from datasets import load_dataset, load_metric

import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Set WANDB_DISABLED before any trainer is created so the Hugging Face Trainer
# does not try to report to Weights & Biases.
os.environ["WANDB_DISABLED"]="true"

4.12.3


### Permissions

In [66]:
import sagemaker

# Bucket that receives uploaded data, models and logs. Leave as None to fall
# back to the session's default bucket (SageMaker creates it if it does not
# exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
use_default_bucket = sagemaker_session_bucket is None and sess is not None
if use_default_bucket:
    sagemaker_session_bucket = sess.default_bucket()

# IAM role the notebook runs under; it is handed to the estimator later on.
role = sagemaker.get_execution_role()

# Re-create the session so that it is bound to the bucket resolved above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::352302020638:role/service-role/SageMaker-MLEngineer
sagemaker bucket: sagemaker-eu-west-2-352302020638
sagemaker session region: eu-west-2


## Loading the fine-tuning dataset

In [67]:
# Read the raw JSON Lines file, one JSON document per line. The encoding is
# given explicitly so decoding does not depend on the machine's locale.
with open('ob-loose-jun28-sm.jsonl', 'r', encoding='utf-8') as json_file:
    jsonl = json_file.readlines()

In [68]:
# Parse each line and wrap it under a single 'sequences' key, which is the
# column the preprocessing step reads. Blank lines (e.g. trailing newlines in
# the file) are skipped instead of crashing json.loads.
data = [{'sequences': json.loads(line)} for line in jsonl if line.strip()]

In [69]:
# Split the records into train (90%), validation (5%) and test (5%).
# A fixed seed makes the split identical on every Restart & Run All; without
# it each run trains and evaluates on different examples.
SPLIT_SEED = 42

# First carve off 10% as a hold-out set, then halve it into validation/test.
train, holdout = train_test_split(data, test_size=0.1, random_state=SPLIT_SEED)
validation, test = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

train_df = pd.DataFrame(train)
validation_df = pd.DataFrame(validation)
test_df = pd.DataFrame(test)


In [70]:
# Persist each split as its own JSON Lines file (one record per line).
split_outputs = (
    (train_df, 'ob-loose-jun28-sm_train.jsonl'),
    (validation_df, 'ob-loose-jun28-sm_validation.jsonl'),
    (test_df, 'ob-loose-jun28-sm_test.jsonl'),
)
for split_frame, split_path in split_outputs:
    split_frame.to_json(path_or_buf=split_path, orient='records', lines=True)

In [71]:
# Read the three JSON Lines files back as a single Hugging Face DatasetDict.
base_path = 'ob-loose-jun28-sm_'
split_files = {
    "train": base_path + "train.jsonl",
    "validation": base_path + "validation.jsonl",
    "test": base_path + "test.jsonl",
}
raw_datasets = load_dataset("json", data_files=split_files)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-1ee73c2656f5c74c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-1ee73c2656f5c74c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [72]:
# Show the loaded splits with their feature names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sequences'],
        num_rows: 907
    })
    validation: Dataset({
        features: ['sequences'],
        num_rows: 50
    })
    test: Dataset({
        features: ['sequences'],
        num_rows: 51
    })
})

## Preprocessing

In [73]:
# Model checkpoint to fine-tune; start with mt0-small.
model_checkpoint = "bigscience/mt0-small"

# S3 key prefix (inside the session bucket) under which the tokenized splits
# are uploaded. It is named after this dataset: the previous value,
# 'samples/datasets/imdb', was a leftover from an IMDB example and would place
# (and potentially overwrite) data under an unrelated dataset's location.
s3_prefix = 'samples/datasets/botpress'

In [74]:
# Load the tokenizer published with the checkpoint selected in `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [75]:
# Preprocessing configuration.
prefix = ""                # text prepended to every prompt before tokenizing
max_input_length = 128     # prompts are truncated to this many tokens
max_target_length = 128    # completions are truncated to this many tokens
source = "prompt"          # key of the model input inside each record
target = "completion"      # key of the expected output inside each record


def preprocess_function(examples):
    """Tokenize one batch of prompt/completion records.

    Args:
        examples: a batch whose "sequences" entry is an iterable of mappings,
            each holding the `source` and `target` keys.

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry
        containing the token ids of the completions.
    """
    prompts = []
    completions = []
    for record in examples["sequences"]:
        prompts.append(prefix + record[source])
        completions.append(record[target])

    encoded = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # Completions must be encoded while the tokenizer is in target mode.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(completions, max_length=max_target_length, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [76]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows per call.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [77]:
# Show the splits again; input_ids, attention_mask and labels should now be present.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sequences', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 907
    })
    validation: Dataset({
        features: ['sequences', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['sequences', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 51
    })
})

In [78]:
# Create PyTorch-formatted views of the splits that are uploaded for the
# SageMaker training job. `with_format` returns a new Dataset object, whereas
# `set_format` would change `tokenized_datasets['train']` / `['test']` in place
# and silently alter what later cells (e.g. the local Trainer) receive.
torch_columns = ['input_ids', 'attention_mask', 'labels']
train_dataset = tokenized_datasets['train'].with_format('torch', columns=torch_columns)
test_dataset = tokenized_datasets['test'].with_format('torch', columns=torch_columns)

### Uploading data to sagemaker_session_bucket

In [79]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# Both splits live under the same bucket/prefix; only the last path segment differs.
dataset_root = f's3://{sess.default_bucket()}/{s3_prefix}'
training_input_path = f'{dataset_root}/train'
test_input_path = f'{dataset_root}/test'

# Save the train split first, then the test split, directly to S3.
uploads = (
    (train_dataset, training_input_path),
    (test_dataset, test_input_path),
)
for split_dataset, destination in uploads:
    split_dataset.save_to_disk(destination, fs=s3)

## Fine-tuning the BLOOMZ model

In [80]:
!pygmentize ./scripts/train.py

[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Trainer, AutoTokenizer, DataCollatorForSeq2Seq
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmetrics[39;49;00m [34mimport[39;49;00m accuracy_score, precision_recall_fscore_support
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk, load_metric
[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m

[34mif[39;49;00m [31m__name__[39;49;00m == [33m"[39;49;00m[33m__main__[39;49;00m[33m"[39;49;00m:

    parser = argparse.ArgumentParser()

    [37m#

## Creating an Estimator and starting a training job

In [81]:
from sagemaker.huggingface import HuggingFace

# Values forwarded to the training job as hyperparameters.
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    model_name=model_checkpoint,
)

In [82]:
# Estimator settings gathered in one place: the training script and its
# directory, the instance to run on, and the framework versions to use.
estimator_settings = {
    'entry_point': 'train.py',
    'source_dir': './scripts',
    'instance_type': 'ml.p3.2xlarge',
    'instance_count': 1,
    'role': role,
    'transformers_version': '4.12',
    'pytorch_version': '1.9',
    'py_version': 'py38',
    'hyperparameters': hyperparameters,
}
huggingface_estimator = HuggingFace(**estimator_settings)

In [83]:
# Start the training job, passing the uploaded datasets as the 'train' and 'test' inputs.
# NOTE(review): the recorded run of this cell failed with ResourceLimitExceeded
# because the account-level limit for 'ml.p3.2xlarge for training job usage' was
# 0 instances; request a limit increase (or pick another instance type) before re-running.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-01-04-10-52-31-339


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.p3.2xlarge for training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

In [21]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq weights for the checkpoint named in `model_checkpoint`.
# NOTE(review): this cell's recorded execution count (21) is lower than that of
# the cells before it (64-83), so it was run in a different order or session;
# verify the notebook with Restart Kernel -> Run All.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [22]:
batch_size = 16

# Use the part of the checkpoint id after the last "/" as a short model name.
model_name = model_checkpoint.split("/")[-1]
output_dir = f"{model_name}-finetuned-{source}-to-{target}"

training_options = dict(
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)
args = Seq2SeqTrainingArguments(output_dir, **training_options)

In [23]:
# Collator that assembles batches for the trainer from the tokenized examples,
# built from the tokenizer and the model loaded earlier.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [24]:
import numpy as np

# sacreBLEU scorer used by compute_metrics. It was previously referenced without
# ever being defined in the notebook, which raises NameError on a fresh kernel.
# It is loaded on first use so that defining these helpers stays cheap.
metric = None


def postprocess_text(preds, labels):
    """Strip whitespace and shape the texts the way sacreBLEU expects.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        A tuple `(preds, labels)` where every prediction is stripped and every
        reference is stripped and wrapped in a one-element list.
    """
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: a `(predictions, labels)` pair of token-id arrays;
            `predictions` may itself be a tuple, in which case its first
            element is used.

    Returns:
        A dict with "bleu" (the sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    global metric
    if metric is None:
        # Imported here so this cell does not rely on an earlier cell's import.
        from datasets import load_metric
        metric = load_metric("sacrebleu")

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Length of each prediction, not counting padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [25]:
# Wire the model, training arguments, data and metric function into one trainer.
# Training uses the train split; evaluation uses the validation split.
trainer_inputs = dict(
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(model=model, args=args, **trainer_inputs)

In [None]:
# Run fine-tuning with the trainer; the log below reports 907 examples, 3 epochs and 171 optimization steps.
trainer.train()

The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: sequences. If sequences are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 907
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 171
  Number of trainable parameters = 300176768
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
