In [1]:
import transformers
transformers.__version__

'4.38.1'

##### Initial setup

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import (
    load_dataset,
    Dataset,
)
from peft import (
    LoraConfig,
    PeftConfig,
    PromptEncoderConfig,
    PrefixTuningConfig,
    IA3Config,
    get_peft_model,
    TaskType,
    PromptTuningInit,
    PromptTuningConfig,
)
import torch
from torch import optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import os

In [2]:
device = "cuda"
model_path = "bigscience/bloomz-560m" 
# model_path = "facebook/opt-350m"
lr = 1e-3
num_epochs = 1
batch_size = 1
steps = 150

save_location = "/home/aicoder/training/multi_bloomz"
# save_location = "/home/aicoder/training/multi_opt350"  # using qlora_minimal with AutoSeq2SeqLM
# loading tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [3]:
# creating BnB confing for 8-bit loading
bnb_4bit_nf4 = BitsAndBytesConfig(load_in_4bit=True,
                                  bnb_4bit_quant_type='nf4',
                                  bnb_4bit_compute_dtype=torch.bfloat16)

In [4]:
# loading model 4-bit quantisation with nesting & nf4, device_map=auto takes 2.70 GB VRAM
# using QloraMinimal dataset so use Seq2Seq
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             # load_in_4bit=True, depricated,
                                             quantization_config=bnb_4bit_nf4,
                                             device_map="auto",
                                            torch_dtype=torch.bfloat16)

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [5]:
def test_prompt(prompt, your_model):
    input_ids = tokenizer(prompt, return_tensors='pt')
    logits = your_model.generate(**input_ids.to(device),
                           max_new_tokens=50)
    return tokenizer.decode(logits[0])

In [7]:
# Add tokens <|im_start|> and <|im_end|>, latter is special eos token 
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

In [5]:
# Testing the loaded model
# DONT RUN THIS WHEN PLANNING TO TRAIN
input_text = "What is the zodiac signs"
print(test_prompt(input_text,model))

</s>What is the zodiac signs?

The zodiac signs are the signs of the zodiac. The zodiac signs are the signs of the zodiac. The zodiac signs are the signs of the zodiac. The zodiac signs are the signs of the zodiac


##### Moving the models

In [5]:
# try moving the model back to cpu 
# will still keep the gpu memory
# trying this with quantised model will throw error
cpu_model = model.to('cpu')

In [6]:
# try deleting quantised model & releasing memory will also not work
del model

In [7]:
# Now the memory will be released from VRAM.
torch.cuda.empty_cache()

##### Preparing Datasets (fails in the returning the data batches)

In [6]:
# preping the dataset for training.
import pandas as pd
import json

from datasets import load_dataset

ds = load_dataset("ought/raft", "twitter_complaints")

classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names]

ds = ds.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])  # 4

max_length = 64

def preprocess_function(examples, text_column="Tweet text", label_column="text_label"):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names]
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

process_ds = ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_ds = process_ds["train"]
eval_ds = process_ds["test"]

batch_size = 16

train_dataloader = DataLoader(train_ds, shuffle=True,
                              collate_fn=default_data_collator,
                              batch_size=batch_size,
                              pin_memory=True)
eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator,
                             batch_size=batch_size,
                             pin_memory=True)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

##### Preparing datasets 

In [9]:
dataset = load_dataset("OpenAssistant/oasst_top1_2023-08-25")

def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        # max_length=512,
        max_length=1024,
        add_special_tokens=False,
    )

dataset_tokenized = dataset.map(
    tokenize, 
    batched=True, 
    num_proc=os.cpu_count(),    # multithreaded
    remove_columns=["text"]     # don't need this anymore, we have tokens from here on
)

Map (num_proc=20):   0%|          | 0/12947 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/690 [00:00<?, ? examples/s]

In [10]:
# define collate function - transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to single batch dictionary { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):
    tokenlist=[e["input_ids"] for e in elements]
    tokens_maxlen=max([len(t) for t in tokenlist])

    input_ids,labels,attention_masks = [],[],[]
    for tokens in tokenlist:
        pad_len=tokens_maxlen-len(tokens)

        # pad input_ids with pad_token, labels with ignore_index (-100) and set attention_mask 1 where content otherwise 0
        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )   
        labels.append( tokens + [-100]*pad_len )    
        attention_masks.append( [1]*len(tokens) + [0]*pad_len ) 

    batch={
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch

##### Model size details

In [11]:
# reviewing model footprint, use this for all models
model.get_memory_footprint()

207830016

In [12]:
# looking at the model architecture
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50267, 512)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear4bit(in_features=1024, out_features=512, bias=False)
      (project_in): Linear4bit(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear4bit(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear4bi

### Run below cells when doing PEFT

In [11]:
# p-tuning

prompt_enc_config = PromptEncoderConfig(task_type="CAUSAL_LM",
                                        num_virtual_tokens=20,
                                        encoder_hidden_size=128)

prefix_tun_config = PrefixTuningConfig(task_type="CAUSAL_LM",
                                       num_virtual_tokens=20)

prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n"

prompt_tuning_config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path=model_path,
)

In [12]:
prompt_enc_model = get_peft_model(model, prompt_enc_config)
prompt_enc_model.print_trainable_parameters()
# "trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338"

trainable params: 300,288 || all params: 331,494,144 || trainable%: 0.09058621560446027


In [15]:
prefix_tun_model = get_peft_model(model, prefix_tun_config)
prefix_tun_model.print_trainable_parameters()
# "trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014"

trainable params: 983,040 || all params: 332,176,896 || trainable%: 0.29593870369599695


In [16]:
prompt_tun_model = get_peft_model(model, prompt_tuning_config)
prompt_tun_model.print_trainable_parameters()
# "trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358"

trainable params: 7,168 || all params: 331,201,024 || trainable%: 0.002164244516345457


### Training part

In [12]:
prompt_enc_optimizer = optim.AdamW(prompt_enc_model.parameters(), lr=lr)
prompt_tun_optimizer = optim.AdamW(prompt_tun_model.parameters(), lr=lr)
prefix_tun_optimizer = optim.AdamW(prefix_tun_model.parameters(), lr=lr)

###### Below train loop fails

In [13]:
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=prompt_enc_optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs)
)

In [14]:
from tqdm import tqdm
model = prompt_enc_model

device = "cuda"
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/4 [00:00<?, ?it/s]


ValueError: expected sequence of length 38 at dim 1 (got 23)

##### qloraminimal trainloop

In [13]:
bs=1        # batch size

ga_steps=1  # gradient acc. steps

epochs=1

steps_per_epoch=len(dataset_tokenized["train"])//(bs*ga_steps)

In [14]:
args = TrainingArguments(
    output_dir=save_location,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=200,		# eval and save once per epoch  	
    save_steps=200,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    # optim="paged_adamw_32bit",
    learning_rate=0.0002,
    group_by_length=True,
    # fp16=True,
    ddp_find_unused_parameters=False,

)

In [15]:
trainer = Trainer(
    model=prompt_enc_model,
    tokenizer=tokenizer,
    data_collator=collate,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    args=args,
)


In [16]:
trainer.train()

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 1024 but got size 512 for tensor number 1 in the list.