In [1]:
import pandas as pd
import json

Loading the dataset

In [2]:
train_df = pd.read_csv('datasets/train.csv')
val_df = pd.read_csv('datasets/val.csv')

In [3]:
train_df.columns

Index(['question', 'answer'], dtype='object')

# Training in Lora

In [4]:
import os

import torch
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup, DataCollatorForSeq2Seq

from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model, get_peft_model_state_dict
from peft.utils.other import fsdp_auto_wrap_policy
from tqdm import tqdm


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [5]:
model_id = 'google/flan-t5-xxl'

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)  # replace 'model_name' with your model

def encode(example, max_length=512):
    source = tokenizer.encode_plus(
        example['question'],
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    
    target = tokenizer.encode_plus(
        example['answer'],
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    return {
        'input_ids': source['input_ids'].squeeze(), 
        'attention_mask': source['attention_mask'].squeeze(), 
        'labels': target['input_ids'].squeeze(), 
        'decoder_attention_mask': target['attention_mask'].squeeze()
    }

class Seq2SeqDataset(Dataset):
    def __init__(self, df, max_length=512):
        self.df = df
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return encode(self.df.iloc[idx].to_dict(), self.max_length)

# Apply the function to your dataframes
train_dataset = Seq2SeqDataset(train_df)
val_dataset = Seq2SeqDataset(val_df)



In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [8]:
#accelerator = Accelerator()
# google/flan-t5-xxl
# google/flan-t5-small
model_name_or_path = model_id
#batch_size = 2
#max_length = 512
#lr = 1e-4
#num_epochs = 1
#train_data = "./datasets/train.csv"
#test_data = "./datasets/val.csv"

# implementing qlora research paper
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

peft_config = LoraConfig(
task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
#checkpoint_name = "chaT5_lora.pt"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, quantization_config=bnb_config, device_map={"":0})
model = get_peft_model(model, peft_config)
#accelerator.print(model.print_trainable_parameters())

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [11]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [12]:
import transformers

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        #max_steps=7,
        num_train_epochs=1,
        learning_rate=2e-4,
        #fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator = DataCollatorForSeq2Seq(
        tokenizer, model=model),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,5.1635
2,4.1056
3,5.3268
4,4.5437
5,4.9517
6,4.1507
7,4.3806
8,5.4316
9,4.5357
10,4.6334


KeyboardInterrupt: 

In [13]:
model.save_pretrained("google-flan-t5-xxl-lora-huggingface-meets-seq2seq") 

In [20]:
accelerator = Accelerator()

model_name_or_path = "google/flan-t5-small"
batch_size = 2
max_length = 512
lr = 1e-4
num_epochs = 1
train_data = "./datasets/train.csv"
test_data = "./datasets/val.csv"

peft_config = LoraConfig(
task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
checkpoint_name = "chaT5_lora.pt"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
accelerator.print(model.print_trainable_parameters())

dataset = load_dataset(
        'csv', data_files={
            "train": train_data,
            "validation": test_data,
        },
        cache_dir="./cache")


tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

trainable params: 344064 || all params: 77305216 || trainable%: 0.445072166928555
None
Downloading and preparing dataset csv/default to /workspace/notebooks/cache/csv/default-83396a97107c9efb/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /workspace/notebooks/cache/csv/default-83396a97107c9efb/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [21]:
def preprocess_function(examples):
    inputs = [doc for doc in examples["question"]]
    model_inputs = tokenizer(
        inputs, max_length=max_length, padding=True, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["answer"], max_length=max_length, padding=True, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
with accelerator.main_process_first():
    processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=16,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

Running tokenizer on dataset (num_proc=16):   0%|          | 0/68647 [00:00<?, ? examples/s]



Running tokenizer on dataset (num_proc=16):   0%|          | 0/17162 [00:00<?, ? examples/s]



In [23]:
data_collator = DataCollatorForSeq2Seq(
        tokenizer, model=model)

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=data_collator, batch_size=batch_size, pin_memory=True
)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [25]:
if getattr(accelerator.state, "fsdp_plugin", None) is not None:
    accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(model)

model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
    model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
)
accelerator.print(model)
#accelerator.state.deepspeed_plugin.zero_stage == 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        if step%1000 == 0:
            print("loss: ",loss.detach().float())
            accelerator.wait_for_everyone()
            if accelerator.is_main_process:
                accelerator.save(
                    get_peft_model_state_dict(model, state_dict=accelerator.get_state_dict(model)), checkpoint_name
                )


    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        preds = accelerator.gather_for_metrics(torch.argmax(outputs.logits, -1)).detach().cpu().numpy()
        eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))
    eval_epoch_loss = eval_loss / len(train_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(eval_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

    accelerator.wait_for_everyone()
    accelerator.save(
        get_peft_model_state_dict(model, state_dict=accelerator.get_state_dict(model)), checkpoint_name
    )
    accelerator.wait_for_everyone()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(
                    in_features=512, out_features=384, bias=False
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=384, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): Pa

  0%|          | 0/34324 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 4/34324 [00:00<1:52:20,  5.09it/s]

loss:  tensor(30.5362, device='cuda:0')


  3%|▎         | 1006/34324 [00:39<21:04, 26.34it/s]

loss:  tensor(4.3920, device='cuda:0')


  6%|▌         | 2005/34324 [01:18<20:24, 26.39it/s]

loss:  tensor(3.6094, device='cuda:0')


  9%|▉         | 3004/34324 [01:56<21:01, 24.83it/s]

loss:  tensor(2.8941, device='cuda:0')


 12%|█▏        | 4006/34324 [02:35<19:30, 25.91it/s]

loss:  tensor(3.6060, device='cuda:0')


 15%|█▍        | 5005/34324 [03:14<19:23, 25.19it/s]

loss:  tensor(2.6886, device='cuda:0')


 17%|█▋        | 6004/34324 [03:53<18:56, 24.92it/s]

loss:  tensor(2.8939, device='cuda:0')


 20%|██        | 7003/34324 [04:32<17:56, 25.37it/s]

loss:  tensor(2.1386, device='cuda:0')


 23%|██▎       | 8005/34324 [05:11<16:54, 25.95it/s]

loss:  tensor(2.3313, device='cuda:0')


 26%|██▌       | 9004/34324 [05:50<16:40, 25.31it/s]

loss:  tensor(2.6074, device='cuda:0')


 29%|██▉       | 10006/34324 [06:29<16:09, 25.10it/s]

loss:  tensor(1.9824, device='cuda:0')


 32%|███▏      | 11005/34324 [07:08<14:51, 26.16it/s]

loss:  tensor(2.5322, device='cuda:0')


 35%|███▍      | 12004/34324 [07:48<14:49, 25.10it/s]

loss:  tensor(2.2488, device='cuda:0')


 38%|███▊      | 13003/34324 [08:27<14:08, 25.12it/s]

loss:  tensor(2.3695, device='cuda:0')


 41%|████      | 14005/34324 [09:06<13:05, 25.86it/s]

loss:  tensor(2.3492, device='cuda:0')


 44%|████▎     | 15004/34324 [09:45<13:00, 24.74it/s]

loss:  tensor(1.9191, device='cuda:0')


 47%|████▋     | 16003/34324 [10:24<12:09, 25.11it/s]

loss:  tensor(2.1475, device='cuda:0')


 50%|████▉     | 17005/34324 [11:04<11:39, 24.77it/s]

loss:  tensor(2.2432, device='cuda:0')


 52%|█████▏    | 18004/34324 [11:43<10:41, 25.45it/s]

loss:  tensor(2.1043, device='cuda:0')


 55%|█████▌    | 19003/34324 [12:22<09:55, 25.75it/s]

loss:  tensor(2.4391, device='cuda:0')


 58%|█████▊    | 20005/34324 [13:02<09:14, 25.84it/s]

loss:  tensor(2.1490, device='cuda:0')


 61%|██████    | 21004/34324 [13:41<08:41, 25.53it/s]

loss:  tensor(2.7009, device='cuda:0')


 64%|██████▍   | 22006/34324 [14:20<08:05, 25.36it/s]

loss:  tensor(2.4904, device='cuda:0')


 67%|██████▋   | 23005/34324 [15:00<07:39, 24.66it/s]

loss:  tensor(2.0254, device='cuda:0')


 70%|██████▉   | 24004/34324 [15:39<06:45, 25.46it/s]

loss:  tensor(2.0851, device='cuda:0')


 73%|███████▎  | 25003/34324 [16:18<06:11, 25.11it/s]

loss:  tensor(2.6947, device='cuda:0')


 76%|███████▌  | 26005/34324 [16:57<05:28, 25.31it/s]

loss:  tensor(2.4341, device='cuda:0')


 79%|███████▊  | 27004/34324 [17:36<04:43, 25.81it/s]

loss:  tensor(2.7008, device='cuda:0')


 82%|████████▏ | 28006/34324 [18:15<04:06, 25.62it/s]

loss:  tensor(1.8839, device='cuda:0')


 85%|████████▍ | 29005/34324 [18:55<03:32, 24.98it/s]

loss:  tensor(1.9916, device='cuda:0')


 87%|████████▋ | 30004/34324 [19:34<02:48, 25.66it/s]

loss:  tensor(1.9473, device='cuda:0')


 90%|█████████ | 31006/34324 [20:13<02:11, 25.14it/s]

loss:  tensor(2.2065, device='cuda:0')


 93%|█████████▎| 32005/34324 [20:53<01:35, 24.28it/s]

loss:  tensor(2.6626, device='cuda:0')


 96%|█████████▌| 33004/34324 [21:32<00:51, 25.59it/s]

loss:  tensor(2.3539, device='cuda:0')


 99%|█████████▉| 34003/34324 [22:11<00:13, 24.49it/s]

loss:  tensor(1.9595, device='cuda:0')


100%|██████████| 34324/34324 [22:24<00:00, 25.54it/s]
100%|██████████| 8581/8581 [02:16<00:00, 62.68it/s]


epoch=0: train_ppl=tensor(30612.6133, device='cuda:0') train_epoch_loss=tensor(10.3292, device='cuda:0') eval_ppl=tensor(1.6856, device='cuda:0') eval_epoch_loss=tensor(0.5221, device='cuda:0')


In [29]:
# how I figured this out:
# https://huggingface.co/blog/peft
model.save_pretrained("luke_test") 