## References

* https://www.coursera.org/learn/generative-ai-with-llms/home/
* https://huggingface.co/docs/peft/main/en/conceptual_guides/lora
* https://docs.adapterhub.ml/classes/adapter_config.html#ia3config
* https://github.com/konstmish/prodigy


In [1]:
# Installing required packages
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): none of these installs pin a version, so a re-run may pull
# different releases than the ones printed by the "package versions" cell
# below - consider pinning for reproducibility.
%pip install -U datasets
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch \
    torchdata --quiet

%pip install transformers evaluate rouge_score loralib peft --quiet
# Provides the Prodigy optimizer that the PEFT training cells import from `prodigyopt`.
%pip install prodigyopt



In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch, torchdata, transformers, datasets
import time
import evaluate
import pandas as pd
import numpy as np

In [3]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget inside the notebook.
# NOTE(review): presumably only needed for the (currently commented-out)
# `push_to_hub` calls further down; the models and dataset used here are
# loaded from public repositories - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Package versions: print one "<label> <version>" line per core library so the
# environment of this run is recorded next to the results.
for label, module in (
    ("torch versio", torch),
    ("torchdata version", torchdata),
    ("transformers version", transformers),
    ("datasets version", datasets),
):
    print(label, module.__version__)

torch versio 2.3.1+cu121
torchdata version 0.8.0+cpu
transformers version 4.42.4
datasets version 2.21.0


Loading data

In [5]:
# Load the DialogSum dialogue-summarization dataset from the Hugging Face Hub.
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
# Bare last expression: the notebook displays the available splits and their features.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [6]:
# Load the FLAN-T5 base checkpoint (the un-tuned baseline) and its tokenizer.

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
model_name = 'google/flan-t5-base'
# Placement is driven only by `device`: the previous hard-coded
# `device_map='cuda'` defeated the CPU fallback computed above, and a
# device-mapped model appears to be why `Trainer` later reports the model as
# "loaded on multiple GPUs" and force-sets `is_model_parallel`.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_model = original_model.to(device)
# The tokenizer is not placed on a device, so it takes no `device_map`;
# encoded tensors are moved to `device` where they are used.
tokenizer = AutoTokenizer.from_pretrained(model_name)

cuda


In [7]:
# checking number of trainable parameters
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals. Despite the function's name nothing is printed; the caller
        decides whether to print or display the string.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # A model without any parameters used to raise ZeroDivisionError here;
    # report 0% instead.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    # The leading spaces on the second and third line are part of the
    # established output format and are kept unchanged.
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"        all model parameters: {all_model_params}\n"
        f"            percentage of trainable model parameters: {percentage:.2f}%"
    )

# Print the parameter summary of the freshly loaded base model.
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
        all model parameters: 247577856
            percentage of trainable model parameters: 100.00%


# Zero-shot inference test

In [8]:
# Pick one test example and compare the human-written summary with what the
# un-tuned model produces for the same dialogue.
index = 200

test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Encode the prompt, generate up to 200 new tokens, and decode the first
# (only) returned sequence back to text.
inputs = tokenizer(prompt, return_tensors='pt').to(device)
generated_ids = original_model.generate(inputs["input_ids"], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Separator line of 99 dashes between the report sections.
dash_line = '-' * 99
report_sections = (
    dash_line,
    f'INPUT PROMPT:\n{prompt}',
    dash_line,
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    dash_line,
    f'MODEL GENERATION - ZERO SHOT:\n{output}',
)
for section in report_sections:
    print(section)

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

### Preparing training data

In [9]:
# processing data
def tokenize_function(example):
    """Turn a batch of DialogSum rows into model-ready training features.

    Intended for ``dataset.map(tokenize_function, batched=True)``.

    Args:
        example: A batch mapping column names to lists; the ``"dialogue"``
            and ``"summary"`` columns are read.

    Returns:
        The same batch with three added columns:
        ``input_ids`` and ``attention_mask`` for the instruction prompt built
        around each dialogue, and ``labels`` for the reference summary.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # Plain Python lists are enough here: the mapped dataset is stored on disk,
    # so creating tensors and moving them to the GPU inside `map` only adds
    # transfers. Batches are moved to the training device later.
    encoded_prompts = tokenizer(prompt, padding="max_length", truncation=True)
    encoded_summaries = tokenizer(example["summary"], padding="max_length", truncation=True)

    example['input_ids'] = encoded_prompts["input_ids"]
    # Keep the mask so the encoder can ignore the padding positions.
    example['attention_mask'] = encoded_prompts["attention_mask"]

    # Replace padding in the labels with -100, the index the loss ignores;
    # otherwise most of the training loss comes from predicting pad tokens.
    pad_token_id = tokenizer.pad_token_id
    example['labels'] = [
        [token_id if token_id != pad_token_id else -100 for token_id in summary_ids]
        for summary_ids in encoded_summaries["input_ids"]
    ]

    return example

# Tokenize every split in batches, then drop the raw text and metadata
# columns so that only the tokenized features remain.
raw_columns = ['id', 'topic', 'dialogue', 'summary']
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

#### Instruction fine-tuning

In [10]:
# Bare expression: the notebook displays the returned summary string (as its
# repr) for the model that is about to be fully fine-tuned.
print_number_of_trainable_model_parameters(original_model)

'trainable model parameters: 247577856\n        all model parameters: 247577856\n            percentage of trainable model parameters: 100.00%'

In [22]:
# Where checkpoints of the full instruction fine-tuning run are written.
output_dir = './instruct_tunining_prodigy_old'

# Full fine-tuning configuration: three epochs at a small learning rate,
# a log entry every 50 steps and one checkpoint per epoch.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    learning_rate=1e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    logging_dir="logs",
    log_level="info",
)

# The trainer updates `original_model` itself (all weights are trainable).
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.


In [29]:
# The training run and the upload are left disabled; the checkpoint they
# produced is downloaded from the Hugging Face Hub instead.
# trainer.train()
# instruct_model.push_to_hub("Hari7696/instruct_tuning")
# loading a trained model from hugging face
# Place the model on `device` (GPU if available, else CPU) instead of a
# hard-coded `device_map='cuda'`, which fails on machines without a GPU.
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("Hari7696/instruct_tuning")
instruct_model = instruct_model.to(device)

config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Hari7696--instruct_tuning/snapshots/26ad4340ce1b60ab29bd60d95f9cd94d9e92a167/config.json
Model config T5Config {
  "_name_or_path": "Hari7696/instruct_tuning",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--Hari7696--instruct_tuning/snapshots/26ad4340ce1b60ab29bd60d95f9cd94d9e92a167/model.safetensors
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at Hari7696/instruct_tuning.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--Hari7696--instruct_tuning/snapshots/26ad4340ce1b60ab29bd60d95f9cd94d9e92a167/generation_config.json
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}



### Baseline model vs. fine-tuned model evaluation

In [22]:
# instruct_model_summaries[1]

'#Person1#: I need to take a dictation for you.'

In [37]:
from tqdm import tqdm

# ROUGE metric used by the scoring cells below.
rouge = evaluate.load('rouge')

dialogues = dataset['test']['dialogue']
human_baseline_summaries = dataset['test']['summary']

original_model_summaries = []
instruct_model_summaries = []

# Same generation settings for both models; built once instead of twice per
# dialogue inside the loop.
eval_generation_config = GenerationConfig(max_new_tokens=200)

# Iterating the list directly lets tqdm show the total and an ETA, which
# `tqdm(enumerate(...))` could not.
for dialogue in tqdm(dialogues):

    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    # Use the notebook-wide `device` rather than a hard-coded "cuda" so the
    # cell also runs on CPU-only machines.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=eval_generation_config)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=eval_generation_config)
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

# Side-by-side table: one row per test dialogue, one column per summary source.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

261it [03:24,  2.06it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1028 > 512). Running this sequence through the model will result in indexing errors
1500it [22:26,  1.11it/s]


Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,Employees are required to take dictation break...,This memo is to be distributed to all employee...
1,In order to prevent employees from wasting tim...,Employees who use instant messaging will face ...,This memo is to be distributed to all employee...
2,Ms. Dawson takes a dictation for #Person1# abo...,IMPORTANT!,This memo is to be distributed to all employee...
3,#Person2# arrives late because of traffic jam....,I'm finally here!,Take public transport to work.
4,#Person2# decides to follow #Person1#'s sugges...,You're finally here!,Take public transport to work.
...,...,...,...
1495,Matthew and Steve meet after a long time. Stev...,I'm looking for a place to live in the next fe...,Matthew and Steve are looking for a place to l...
1496,Steve has been looking for a place to live. Ma...,#Person1#: Hello!,Matthew and Steve are looking for a place to l...
1497,Frank invites Besty to the party to celebrate ...,A new promotion is on the way.,The promotion is good for you.
1498,Frank invites Betsy to the big promotion party...,The party is on Saturday.,The promotion is good for you.


In [35]:
# Score both sets of generated summaries against the human references with
# identical ROUGE settings. The references are cut to the number of available
# predictions so that a partially completed generation run can still be scored.
original_model_results, instruct_model_results = (
    rouge.compute(
        predictions=model_summaries,
        references=human_baseline_summaries[:len(model_summaries)],
        use_aggregator=True,
        use_stemmer=True,
    )
    for model_summaries in (original_model_summaries, instruct_model_summaries)
)

for heading, results in (
    ('ORIGINAL MODEL:', original_model_results),
    ('INSTRUCT MODEL:', instruct_model_results),
):
    print(heading)
    print(results)

ORIGINAL MODEL:
{'rouge1': 0.2352484435112126, 'rouge2': 0.09576365663322187, 'rougeL': 0.2037688749722178, 'rougeLsum': 0.20522878127331995}
INSTRUCT MODEL:
{'rouge1': 0.29970979020979016, 'rouge2': 0.14344664031620552, 'rougeL': 0.24626456876456876, 'rougeLsum': 0.24932465682465677}


In [36]:
# Difference between the two models' ROUGE scores, shown in percentage points.
print("Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL")

# Both result dicts list the metrics in the same order, so the values can be
# subtracted position by position.
instruct_scores = list(instruct_model_results.values())
original_scores = list(original_model_results.values())
improvement = np.subtract(instruct_scores, original_scores)

for key, value in zip(instruct_model_results, improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL
rouge1: 6.45%
rouge2: 4.77%
rougeL: 4.25%
rougeLsum: 4.41%


#### PEFT IA3 model

In [None]:
# PEFT IA3 model https://arxiv.org/pdf/2205.05638
# reference https://daniel-mekuriaw16.medium.com/amharic-ia3-peft-4f1067edbd79

from peft import LoraConfig, TaskType, get_peft_model, PeftModel, IA3Config

# IA3 learns scaling vectors for the listed attention projections.
# NOTE(review): `feedforward_modules=["v"]` marks the attention value
# projection as a feed-forward module; in T5 the feed-forward layers are
# presumably `wi_0`/`wi_1`/`wo` - confirm this choice is intended. It is left
# unchanged so the configuration stays compatible with the published adapter.
ia3_config = IA3Config(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "k", "v", "o"],
    feedforward_modules=["v"],
)

# `get_peft_model` injects the adapter layers into the model it is given.
# Wrapping a freshly loaded copy keeps `original_model` an untouched baseline
# and prevents a later PEFT method from being stacked on top of this one.
ia3_base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)

peft_model_ia3 = get_peft_model(ia3_base_model, ia3_config)


In [None]:
# Where checkpoints of the IA3 run are written.
output_dir = './peft-ia3'

from prodigyopt import Prodigy

n_epoch = 2
optimizer = Prodigy(peft_model_ia3.parameters(), weight_decay=0.01, safeguard_warmup=True, use_bias_correction=True, betas=(0.9, 0.99))

# Take the dataset size from the data instead of repeating a literal, so the
# schedule length stays correct if the training split changes.
num_examples = len(tokenized_datasets["train"])
batch_size = 8  # per-device batch size, limited by the available GPU memory

# Total optimizer steps = batches per epoch (last partial batch included) x epochs.
# The cosine schedule is annealed over exactly this many steps.
steps = int(np.ceil(num_examples / batch_size) * n_epoch)
print(steps)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps )

# taking the configuration from IA3 paper
# NOTE(review): the optimizer and scheduler are passed to the Trainer
# explicitly, so `learning_rate` here presumably does not drive the updates -
# confirm.
peft_ia3_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size = batch_size,
    learning_rate=1.0,
    num_train_epochs=n_epoch,
    logging_steps=50,
    log_level = "info",
    logging_dir = "logs" ,
    logging_strategy = "steps"
)

peft_ia3_trainer = Trainer(
    model=peft_model_ia3,
    args=peft_ia3_training_args,
    optimizers = (optimizer,scheduler ),
    train_dataset=tokenized_datasets["train"],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.


Using decoupled weight decay
3116


In [None]:
# Bare expression: the notebook displays the parameter summary of the
# IA3-wrapped model, showing how few of its parameters are trainable.
print_number_of_trainable_model_parameters(peft_model_ia3)

'trainable model parameters: 110592\n        all model parameters: 247688448\n            percentage of trainable model parameters: 0.04%'

In [None]:
# print(original_model)

In [None]:
# peft_ia3_trainer.train()
# peft_ia3_trainer.push_to_hub("Hari7696/peft-ia3")

# peft_iam_model_path="./peft-ia3_model"

# peft_ia3_trainer.model.save_pretrained(peft_iam_model_path)
#loading a  saved model

***** Running training *****
  Num examples = 12,460
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3,116
  Number of trainable parameters = 110,592


Step,Training Loss
50,42.8188
100,1.5673
150,0.2161
200,0.1809
250,0.1537
300,0.147
350,0.1468
400,0.1516
450,0.1409
500,0.1419


Saving model checkpoint to ./peft-ia3/checkpoint-500
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max

adapter_model.safetensors:   0%|          | 0.00/461k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repe

In [38]:
# Combining the trained IA3 adapter with a fresh copy of the base model.

from peft import PeftModel, PeftConfig

# Placement follows the notebook-wide `device` (GPU if available, else CPU);
# a hard-coded `device_map="cuda"` would fail on machines without a GPU.
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
peft_model_base = peft_model_base.to(device)

# The tokenizer is not placed on a device, so it takes no `device_map`.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Attach the adapter weights published on the Hub; inference only, so the
# adapter is loaded as non-trainable.
peft_ia3_model = PeftModel.from_pretrained(peft_model_base,
                                       "Hari7696/peft-ia3",
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 2

adapter_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/461k [00:00<?, ?B/s]

In [39]:
dialogues = dataset['test']['dialogue']
human_baseline_summaries = dataset['test']['summary']

# The baseline and instruct summaries produced by the earlier evaluation cell
# are reused, so that cell must have been run first.
# original_model_summaries = []
# instruct_model_summaries = []
peft_ia3_model_summaries = []

from tqdm import tqdm

# Generation settings are the same for every dialogue; build them once.
peft_ia3_generation_config = GenerationConfig(max_new_tokens=200)

# Iterating the list directly lets tqdm show the total and an ETA.
for dialogue in tqdm(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    peft_model_outputs = peft_ia3_model.generate(input_ids=input_ids, generation_config=peft_ia3_generation_config)
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    peft_ia3_model_summaries.append(peft_model_text_output)

# Side-by-side table: one row per test dialogue, one column per summary source.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_ia3_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])
df

261it [07:30,  1.70s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1028 > 512). Running this sequence through the model will result in indexing errors
1500it [42:33,  1.70s/it]


Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,Employees are required to take dictation break...,This memo is to be distributed to all employee...,#Person1# asks Ms. Dawson to take dictation fo...
1,In order to prevent employees from wasting tim...,Employees who use instant messaging will face ...,This memo is to be distributed to all employee...,#Person1# asks Ms. Dawson to take dictation fo...
2,Ms. Dawson takes a dictation for #Person1# abo...,IMPORTANT!,This memo is to be distributed to all employee...,#Person1# asks Ms. Dawson to take dictation fo...
3,#Person2# arrives late because of traffic jam....,I'm finally here!,Take public transport to work.,#Person2# got stuck in traffic again. #Person2...
4,#Person2# decides to follow #Person1#'s sugges...,You're finally here!,Take public transport to work.,#Person2# got stuck in traffic again. #Person2...
...,...,...,...,...
1495,Matthew and Steve meet after a long time. Stev...,I'm looking for a place to live in the next fe...,Matthew and Steve are looking for a place to l...,Steve is looking for a place to live recently ...
1496,Steve has been looking for a place to live. Ma...,#Person1#: Hello!,Matthew and Steve are looking for a place to l...,Steve is looking for a place to live recently ...
1497,Frank invites Besty to the party to celebrate ...,A new promotion is on the way.,The promotion is good for you.,Bettsy wants to throw a huge party for all of ...
1498,Frank invites Betsy to the big promotion party...,The party is on Saturday.,The promotion is good for you.,Bettsy wants to throw a huge party for all of ...


### PEFT LoRA model

In [None]:

from peft import LoraConfig, get_peft_model, TaskType

# Rank-32 LoRA adapters on the attention projections.
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "k", "v", "o"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

# `get_peft_model` injects the adapter layers into the model it is given.
# Wrapping a freshly loaded copy keeps `original_model` an untouched baseline
# and guarantees LoRA is applied to a clean model, whatever adapters earlier
# cells attached to other model objects.
lora_base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)

peft_lora_model = get_peft_model(lora_base_model,
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_lora_model))

trainable model parameters: 7077888
        all model parameters: 254655744
            percentage of trainable model parameters: 2.78%


In [None]:
# Where checkpoints of the LoRA run are written.
output_dir = './peft-lora'

from prodigyopt import Prodigy

n_epoch = 2
optimizer = Prodigy(peft_lora_model.parameters(), weight_decay=0.01, safeguard_warmup=True, use_bias_correction=True, betas=(0.9, 0.99))

# Take the dataset size from the data instead of repeating a literal, so the
# schedule length stays correct if the training split changes.
num_examples = len(tokenized_datasets["train"])
batch_size = 8  # per-device batch size, limited by the available GPU memory

# Total optimizer steps = batches per epoch (last partial batch included) x epochs.
# The cosine schedule is annealed over exactly this many steps.
steps = int(np.ceil(num_examples / batch_size) * n_epoch)
print(steps)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps )

# Same training setup as the IA3 run, so the two PEFT methods are compared
# under identical settings.
# NOTE(review): the optimizer and scheduler are passed to the Trainer
# explicitly, so `learning_rate` here presumably does not drive the updates -
# confirm.
peft_lora_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size = batch_size,
    learning_rate=1.0,
    num_train_epochs=n_epoch,
    logging_steps=50,
    log_level = "info",
    logging_dir = "logs" ,
    logging_strategy = "steps"
)

peft_lora_trainer = Trainer(
    model=peft_lora_model,
    args=peft_lora_training_args,
    optimizers = (optimizer,scheduler ),
    train_dataset=tokenized_datasets["train"],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.


Using decoupled weight decay
3116


In [None]:
# peft_lora_trainer.train()
# peft_lora_trainer.push_to_hub("Hari7696/PEFT_LoRa")

# peft_lora_model_path="./peft_lora_model"

# peft_lora_trainer.model.save_pretrained(peft_lora_model_path)

***** Running training *****
  Num examples = 12,460
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3,116
  Number of trainable parameters = 7,077,888


Step,Training Loss
50,36.8809
100,0.6481
150,0.1422
200,0.1318
250,0.1224
300,0.1173
350,0.1181
400,0.1171
450,0.1103
500,0.1116


Saving model checkpoint to ./peft-lora/checkpoint-500
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "ma

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/28.4M [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repe

In [40]:
# Combining the trained LoRA adapter with a fresh copy of the original model.

from peft import PeftModel, PeftConfig

# Placement follows the notebook-wide `device` (GPU if available, else CPU);
# a hard-coded `device_map="cuda"` would fail on machines without a GPU.
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
peft_model_base = peft_model_base.to(device)

# The tokenizer is not placed on a device, so it takes no `device_map`.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Attach the adapter weights published on the Hub; inference only, so the
# adapter is loaded as non-trainable.
peft_lora_model = PeftModel.from_pretrained(peft_model_base,
                                       "Hari7696/peft-lora",
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 2

adapter_config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/28.4M [00:00<?, ?B/s]

In [41]:
# Generate a LoRA summary for every test dialogue, then tabulate it next to the
# human reference and the summaries produced by the other models in earlier cells.
dialogues = dataset['test']['dialogue']
human_baseline_summaries = dataset['test']['summary']

# The generation settings are identical for every dialogue, so build them once.
lora_generation_config = GenerationConfig(max_new_tokens=200)

peft_lora_model_summaries = []

for dialogue in dialogues:
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    # No truncation is requested here, so prompts longer than the model's
    # 512-token limit are passed through whole (the tokenizer logs a warning).
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    peft_model_outputs = peft_lora_model.generate(input_ids=input_ids, generation_config=lora_generation_config)
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    peft_lora_model_summaries.append(peft_model_text_output)

# zip stops at the shortest list, so the table has as many rows as the shortest
# of the five summary lists.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_ia3_model_summaries, peft_lora_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_ia3_model_summaries',
                                               'peft_lora_model_summaries'])
df

Token indices sequence length is longer than the specified maximum sequence length for this model (1028 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_ia3_model_summaries,peft_lora_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,Employees are required to take dictation break...,This memo is to be distributed to all employee...,#Person1# asks Ms. Dawson to take dictation fo...,#Person1# asks Ms. Dawson to take a dictation ...
1,In order to prevent employees from wasting tim...,Employees who use instant messaging will face ...,This memo is to be distributed to all employee...,#Person1# asks Ms. Dawson to take dictation fo...,#Person1# asks Ms. Dawson to take a dictation ...
2,Ms. Dawson takes a dictation for #Person1# abo...,IMPORTANT!,This memo is to be distributed to all employee...,#Person1# asks Ms. Dawson to take dictation fo...,#Person1# asks Ms. Dawson to take a dictation ...
3,#Person2# arrives late because of traffic jam....,I'm finally here!,Take public transport to work.,#Person2# got stuck in traffic again. #Person2...,#Person2# got stuck in traffic again. #Person1...
4,#Person2# decides to follow #Person1#'s sugges...,You're finally here!,Take public transport to work.,#Person2# got stuck in traffic again. #Person2...,#Person2# got stuck in traffic again. #Person1...
...,...,...,...,...,...
1495,Matthew and Steve meet after a long time. Stev...,I'm looking for a place to live in the next fe...,Matthew and Steve are looking for a place to l...,Steve is looking for a place to live recently ...,Steve has been looking for a place to live rec...
1496,Steve has been looking for a place to live. Ma...,#Person1#: Hello!,Matthew and Steve are looking for a place to l...,Steve is looking for a place to live recently ...,Steve has been looking for a place to live rec...
1497,Frank invites Besty to the party to celebrate ...,A new promotion is on the way.,The promotion is good for you.,Bettsy wants to throw a huge party for all of ...,Frank invites Betsy to a party on Saturday. Be...
1498,Frank invites Betsy to the big promotion party...,The party is on Saturday.,The promotion is good for you.,Bettsy wants to throw a huge party for all of ...,Frank invites Betsy to a party on Saturday. Be...


In [42]:
rouge = evaluate.load('rouge')


def _rouge_against_baseline(predictions):
    """Score generated summaries against the human baseline summaries with ROUGE.

    Args:
        predictions: list of generated summary strings, in test-set order.

    Returns:
        The dict returned by ``rouge.compute`` (aggregated scores, stemming on).
    """
    return rouge.compute(
        predictions=predictions,
        # Use only as many references as there are predictions so both lists
        # have the same length.
        references=human_baseline_summaries[0:len(predictions)],
        use_aggregator=True,
        use_stemmer=True,
    )


# One ROUGE result per model, all scored with identical settings.
original_model_results = _rouge_against_baseline(original_model_summaries)
instruct_model_results = _rouge_against_baseline(instruct_model_summaries)
peft_ia3_model_results = _rouge_against_baseline(peft_ia3_model_summaries)
peft_lora_model_results = _rouge_against_baseline(peft_lora_model_summaries)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT IA3 MODEL:')
print(peft_ia3_model_results)
print('PEFT LoRa MODEL:')
print(peft_lora_model_results)

ORIGINAL MODEL:
{'rouge1': 0.1936365276341292, 'rouge2': 0.055830613818743824, 'rougeL': 0.1653205339411757, 'rougeLsum': 0.1655107637521635}
INSTRUCT MODEL:
{'rouge1': 0.2093799131160447, 'rouge2': 0.07290325654122923, 'rougeL': 0.18361440570543924, 'rougeLsum': 0.1840259644288299}
PEFT IA3 MODEL:
{'rouge1': 0.3751067792528885, 'rouge2': 0.14136289471705604, 'rougeL': 0.29975210059678203, 'rougeLsum': 0.2996095646384628}
PEFT LoRa MODEL:
{'rouge1': 0.43411376364961807, 'rouge2': 0.177861803998824, 'rougeL': 0.34936593187081066, 'rougeLsum': 0.34930439564897153}


In [49]:
# Persist the side-by-side summaries table so it can be reloaded without regenerating
df.to_pickle(path="results.pkl")

In [44]:
# Original model vs Instruct model: per-metric ROUGE difference (instruct minus original),
# printed as the difference multiplied by 100
instruct_scores = np.array(list(instruct_model_results.values()))
original_scores = np.array(list(original_model_results.values()))
improvement = np.subtract(instruct_scores, original_scores)
for key, value in zip(instruct_model_results, improvement):
    print(f'{key}: {value*100:.2f}%')

rouge1: 1.57%
rouge2: 1.71%
rougeL: 1.83%
rougeLsum: 1.85%


In [45]:
# Original model vs PEFT LoRA: per-metric ROUGE difference (LoRA minus original).
# The baseline subtracted here is original_model_results, not the instruct model.
# Scores are paired by metric name rather than by dict position.
improvement = np.array([
    peft_lora_model_results[key] - original_model_results[key]
    for key in peft_lora_model_results
])
for key, value in zip(peft_lora_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

rouge1: 24.05%
rouge2: 12.20%
rougeL: 18.40%
rougeLsum: 18.38%


In [46]:
# Original model vs PEFT IA3: per-metric ROUGE difference (IA3 minus original).
# The baseline subtracted here is original_model_results, not the instruct model.
# Scores are paired by metric name rather than by dict position.
improvement = np.array([
    peft_ia3_model_results[key] - original_model_results[key]
    for key in peft_ia3_model_results
])
for key, value in zip(peft_ia3_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

rouge1: 18.15%
rouge2: 8.55%
rougeL: 13.44%
rougeLsum: 13.41%


In [47]:
# PEFT IA3 vs PEFT LoRA: per-metric ROUGE difference (IA3 minus LoRA);
# a negative value means IA3 scored lower than LoRA on that metric
ia3_scores = np.array(list(peft_ia3_model_results.values()))
lora_scores = np.array(list(peft_lora_model_results.values()))
improvement = np.subtract(ia3_scores, lora_scores)
for key, value in zip(peft_ia3_model_results, improvement):
    print(f'{key}: {value*100:.2f}%')

rouge1: -5.90%
rouge2: -3.65%
rougeL: -4.96%
rougeLsum: -4.97%
