In [3]:
# install Hugging Face Libraries
!pip install "peft==0.2.0"
!pip install "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install rouge-score tensorboard py7zr
!pip install huggingface_hub

Collecting peft==0.2.0
  Downloading peft-0.2.0-py3-none-any.whl (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers (from peft==0.2.0)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate (from peft==0.2.0)
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate->peft==0.2.0)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m

## Load datasets

We will use dataset **"samsum"**. Each sample in the dataset contains {'id':', 'dialogue',  'summary'}

In [77]:
from datasets import load_dataset

# Load dataset from the hub
dataset = load_dataset("samsum")

print(f"Train dataset size: {len(dataset['train'])}, and the train sample[0] is like {dataset['train'][0]}")
print(f"Train dataset size: {len(dataset['train'])}, and the train sample[10] is like {dataset['train'][10]}")

print(f"Test dataset size: {len(dataset['test'])}, and the test sample[0] is like {dataset['test'][0]}")
print(f"Test dataset size: {len(dataset['test'])}, and the test sample[10] is like {dataset['test'][10]}")





  0%|          | 0/3 [00:00<?, ?it/s]

Train dataset size: 14732, and the train sample[0] is like {'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}
Train dataset size: 14732, and the train sample[10] is like {'id': '13727633', 'dialogue': 'Lucas: Hey! How was your day?\r\nDemi: Hey there! \r\nDemi: It was pretty fine, actually, thank you!\r\nDemi: I just got promoted! :D\r\nLucas: Whoa! Great news!\r\nLucas: Congratulations!\r\nLucas: Such a success has to be celebrated.\r\nDemi: I agree! :D\r\nDemi: Tonight at Death & Co.?\r\nLucas: Sure!\r\nLucas: See you there at 10pm?\r\nDemi: Yeah! See you there! :D', 'summary': 'Demi got promoted. She will celebrate that with Lucas at Death & Co at 10 pm.'}
Test dataset size: 819, and the test sample[0] is like {'id': '13862856', 'dialogue': "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: S

Get a portion of the dataset to train, otherwise it takes too long to train one episode.

In [78]:
from datasets import DatasetDict

def subset(dataset, portion=0.1):
  train_subset = dataset["train"].shuffle().select(range(int(portion*len(dataset["train"]))))
  test_subset = dataset["test"].shuffle().select(range(int(portion*len(dataset["test"]))))
  validation_subset = dataset["validation"].shuffle().select(range(int(portion*len(dataset["validation"]))))
  subset = DatasetDict({"train": train_subset, "validation": validation_subset, "test": test_subset})
  return subset

dataset = subset(dataset)

print(f"Train dataset size: {len(dataset['train'])}, Test dataset size: {len(dataset['test'])}, Validation dataset size: {len(dataset['validation'])}, and the train sample[0] is like {dataset['train'][0]}")


Train dataset size: 1473, Test dataset size: 81, Validation dataset size: 81, and the train sample[0] is like {'id': '13681220', 'dialogue': 'Lucy: omg did you see JK this morning?\r\nSue: I try to avoid it lol\r\nLucy: you should have seen it it was disgusting\r\nSue: I cant do it anymore i try to listen to the radio in the mornings.. jk makes you think the whole world is full of idiots lol\r\nLucy: you may be right I dont know how some of them can go on there in public for the world to see\r\nSue: I would die if I got a call to go on there lol\r\nSue: could you imagine ha ha \r\nLucy: I would piss myself If I saw you and Andy up there\r\nSue: over my dead body !', 'summary': "Sue doesn't watch JK any more as it's disgusting."}


## Tokenizer

**{'input_ids'}**: i.e. "I" for 1045.
* BERT Base: 30,000 token ids (WordPiece vocab)
* RoBERTa Base: 50,000 token ids (BPE vocab)
* GPT-2 Small: 50,000 token ids (BPE vocab)

**{'token_types'}**: For multiple QA pairs in a dialogue, each new question-answer sequence would get a new token type id incrementing from 0.
* tokens: [Q1, A1, SEP, Q2, A2, SEP, Q3, A3]
* token_types: [0, 0, 0, 1, 1, 1, 2, 2]

**{'attention_mask'}**: The attention mask is set to 1 for valid tokens and 0 for padding/invalid tokens.

Eg. each sample in the **"samsum"** dataset contains {'id':', 'dialogue',  'summary'}. Using the tokenizer **"google/flan-t5-xxl"**:

* each sample's {'dialogue'} is tokenized to {'input_ids':', 'attention_mask'}

* each sample's {'summary'} is tokenized to {'input_ids':', 'attention_mask'}

but tokens are of inequal length among samples.

In [79]:
from transformers import AutoTokenizer

model_id="google/flan-t5-xxl"
tokenizer = AutoTokenizer.from_pretrained(model_id)

print("The tokenized results are not equal length among samples.")
print(f"Train sample[0] dialogue after tokenized {tokenizer(dataset['train'][0]['dialogue'])}")
print(f"Train sample[10] dialogue after tokenized {tokenizer(dataset['train'][10]['dialogue'])}")

print(f"Test sample[0] summary after tokenized {tokenizer(dataset['train'][0]['summary'])}")
print(f"Test sample[10] summary after tokenized {tokenizer(dataset['train'][10]['summary'])}")

Token indices sequence length is longer than the specified maximum sequence length for this model (605 > 512). Running this sequence through the model will result in indexing errors


The tokenized results are not equal length among samples.
Train sample[0] dialogue after tokenized {'input_ids': [21812, 10, 3, 32, 51, 122, 410, 25, 217, 446, 439, 48, 1379, 58, 17564, 10, 27, 653, 12, 1792, 34, 16497, 21812, 10, 25, 225, 43, 894, 34, 34, 47, 27635, 53, 17564, 10, 27, 54, 17, 103, 34, 7595, 3, 23, 653, 12, 3011, 12, 8, 2252, 16, 8, 1379, 7, 5, 5, 3, 354, 157, 656, 25, 317, 8, 829, 296, 19, 423, 13, 25851, 7, 16497, 21812, 10, 25, 164, 36, 269, 27, 2483, 214, 149, 128, 13, 135, 54, 281, 30, 132, 16, 452, 21, 8, 296, 12, 217, 17564, 10, 27, 133, 67, 3, 99, 27, 530, 3, 9, 580, 12, 281, 30, 132, 16497, 17564, 10, 228, 25, 3034, 4244, 4244, 21812, 10, 27, 133, 2816, 7, 7, 1512, 156, 27, 1509, 25, 11, 12838, 95, 132, 17564, 10, 147, 82, 3654, 643, 3, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

### To make tokens equal length, as the max length in the samples.



Get the max length of "dialogue" and "summary".
* truncation=True. It allows truncating the dialogues to fit the max sequence length, by default 512.

* batched=True. It tokenizes the dialogues by batches to achieve quicker tokenization.

In [80]:
from datasets import concatenate_datasets
import numpy as np

# max length of "dialogue" from both train and test datasets
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
max_dialogue_length = int(np.percentile(input_lenghts, 85)) # take 85 percentile of max length for better utilization
print(f"Max dialogue length: {max_dialogue_length}")

# max length of "summary" from both train and test datasets
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
max_summary_length = int(np.percentile(target_lenghts, 90)) # take 90 percentile of max length for better utilization
print(f"Max summary length: {max_summary_length}")


  0%|          | 0/2 [00:00<?, ?ba/s]

Max dialogue length: 271


  0%|          | 0/2 [00:00<?, ?ba/s]

Max summary length: 52


Truncate and padding following the max length.

In [81]:
def preprocess_function(sample,padding="max_length"):
    # change the prefix "dialogue" into the "summarize", tokenize the "summarize" into {'input_ids':', 'attention_mask'}
    inputs = ["summarize: " + item for item in sample["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_dialogue_length, padding=padding, truncation=True,return_tensors="pt")

    # tokenize targets with the `summary` argument
    labels = tokenizer(text_target=sample["summary"], max_length=max_summary_length, padding=padding, truncation=True,return_tensors="pt")
    # change the padding token from 0 by default to -100 in the labels, because we want to ignore padding when computing the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    # we got return values with {'input_ids':', 'attention_mask', "labels"}
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])

print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## Config the model.

In [23]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id = "philschmid/flan-t5-xxl-sharded-fp16"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")

Downloading (…)lve/main/config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]



Downloading (…)model.bin.index.json:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/12 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00012.bin:   0%|          | 0.00/1.72G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00004-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00006-of-00012.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

Downloading (…)l-00008-of-00012.bin:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

Downloading (…)l-00009-of-00012.bin:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

Downloading (…)l-00010-of-00012.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)l-00011-of-00012.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)l-00012-of-00012.bin:   0%|          | 0.00/1.24G [00:00<?, ?B/s]


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

In [24]:

from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=16,
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817


In [25]:

from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [28]:

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="lora-flan-t5-xxl"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
	auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

## Train and push to hugging face


In [29]:
trainer.train()



Step,Training Loss


Step,Training Loss


TrainOutput(global_step=47, training_loss=1.0607836297217836, metrics={'train_runtime': 566.2918, 'train_samples_per_second': 2.601, 'train_steps_per_second': 0.083, 'total_flos': 2.683572556647629e+16, 'train_loss': 1.0607836297217836, 'epoch': 1.0})

In [30]:
trainer.model.save_pretrained("flan-t5-xxl-sharded-fp16-small_samsum")

In [33]:
from huggingface_hub import login

login(token="hf_KiIasdNyoVoWwwA***")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [34]:
model.push_to_hub("jaswu/flan-t5-xxl-sharded-fp16-small_samsum",create_pr=1)

adapter_model.bin:   0%|          | 0.00/75.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jaswu/flan-t5-xxl-sharded-fp16-small_samsum/commit/a48623fab76b7cfb1b73b2edf192bf6eb87fc907', commit_message='Upload model', commit_description='', oid='a48623fab76b7cfb1b73b2edf192bf6eb87fc907', pr_url='https://huggingface.co/jaswu/flan-t5-xxl-sharded-fp16-small_samsum/discussions/1', pr_revision='refs/pr/1', pr_num=1)

## Inference example

In [66]:
sample_input=dataset['test'][0]
print(sample_input)

{'id': '13611929', 'dialogue': 'Louis: Hey, hows your day? :D\r\nCheryl: Okaaay… I guess\r\nLouis: Aha, someone’s feeling a bit down, am I right?\r\nCheryl: yea, sort of…\r\nLouis: Go on, tell me what happened\r\nCheryl: I…just had an argument with my mom\r\nLouis: Jesus what again\r\nCheryl: I forgot to close the window when I was leaving home!\r\nLouis: And that’s it?\r\nCheryl: No, not only… Ya know, wouldn’t be that bad, but I got angry, started screaming and everything ;/\r\nLouis: not a good idea, babe\r\nCheryl: I knoooow \uf04c\r\nLouis: Was it really bad? \r\nCheryl: I suppose yea, she kicked me out xd\r\nLouis: WHAT\r\nCheryl: I mean I don’t have to move right now, but she gave me time till the end of the year\r\nLouis: I’m sorry…\r\nCheryl: Naah, don’t be, I believe it’s for good. I couldn’t stand her anyway xD', 'summary': 'Cheryl had an argument with her mom. She forgot to close the window, got angry and started a fight. Her mom gave her time till the end of the year to mo

In [73]:
print(f"The dialogue is: {sample_input['dialogue']}")
sample_input_dialogue_tokenized = tokenizer(sample_input["dialogue"], max_length=max_dialogue_length, return_tensors="pt", padding="max_length", truncation=True).input_ids.cuda()
print(f"The tokenized dialogue is: {sample_input_dialogue_tokenized}")

The dialogue is: Louis: Hey, hows your day? :D
Cheryl: Okaaay… I guess
Louis: Aha, someone’s feeling a bit down, am I right?
Cheryl: yea, sort of…
Louis: Go on, tell me what happened
Cheryl: I…just had an argument with my mom
Louis: Jesus what again
Cheryl: I forgot to close the window when I was leaving home!
Louis: And that’s it?
Cheryl: No, not only… Ya know, wouldn’t be that bad, but I got angry, started screaming and everything ;/
Louis: not a good idea, babe
Cheryl: I knoooow 
Louis: Was it really bad? 
Cheryl: I suppose yea, she kicked me out xd
Louis: WHAT
Cheryl: I mean I don’t have to move right now, but she gave me time till the end of the year
Louis: I’m sorry…
Cheryl: Naah, don’t be, I believe it’s for good. I couldn’t stand her anyway xD
The tokenized dialogue is: tensor([[ 5181,    10,  9459,     6,   149,     7,    39,   239,    58,     3,
            10,   308,  2556, 12973,    10,  8872,     9,     9,     9,    63,
           233,    27,  3382,  5181

In [74]:
print(f"The summary is: {sample_input['summary']}")

outputs = model.generate(input_ids=sample_input_dialogue_tokenized, max_new_tokens=100)
print(f"The inference summary is: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")


The summary is: Cheryl had an argument with her mom. She forgot to close the window, got angry and started a fight. Her mom gave her time till the end of the year to move out.
The inference summary is: Cheryl's mom kicked her out of the house because she forgot to close the window when she was leaving home.
