In [None]:
!pip install -U accelerate bitsandbytes datasets peft transformers

In [1]:
from datasets import load_dataset
path = "/home/kamal/.cache/huggingface/datasets/OpenAssistant___oasst_top1_2023-08-25"
# dataset = load_dataset("OpenAssistant/oasst_top1_2023-08-25")
dataset = load_dataset(path)

Found cached dataset arrow (/home/kamal/.cache/huggingface/datasets/arrow/OpenAssistant___oasst_top1_2023-08-25-c177dd96dc3425f4/0.0.0/74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 12947
    })
    test: Dataset({
        features: ['text'],
        num_rows: 690
    })
})

In [3]:
print(dataset["train"][0]["text"])

<|im_start|>user
Consigliami 5 nomi per il mio cucciolo di dobberman<|im_end|>
<|im_start|>assistant
Ecco 5 nomi per il tuo cucciolo di dobermann:

- Zeus
- Apollo
- Thor
- Athena
- Odin<|im_end|>



In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# modelpath="mistral/Mistral-7B-v0.1"
# modelpath="gpt2" 
# the lora config below is geared towards mistral model so need to use atleast t5-large or llama2
# model_path = "t5-large"  
# is not supported. Currently, only `torch.nn.Linear` and `Conv1D` are supported error
modelpath="facebook/opt-350m"

In [19]:
# Load (slow) Tokenizer, fast tokenizer sometimes ignores added tokens
# tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

In [11]:
# Load 4-bit quantized model
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_path,    
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
)

In [20]:
# Load 4-bit quantized model

model = AutoModelForCausalLM.from_pretrained(
    modelpath,    
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
)

In [21]:
# Add tokens <|im_start|> and <|im_end|>, latter is special eos token 
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

In [22]:
# Add LoRA adapters to model
model = prepare_model_for_kbit_training(model)

In [23]:
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50267, 512)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear4bit(in_features=1024, out_features=512, bias=False)
      (project_in): Linear4bit(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear4bit(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear4bi

In [24]:
# Add LoRA adapters to Causal model only. T5 series wont work
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1, 
    bias="none", 
    modules_to_save = ["lm_head", "embed_tokens"],		# needed because we added new tokens to tokenizer/model
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.config.use_cache = False

In [15]:
# don't execute, only for test
from peft import TaskType
config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q', 'k', 'v', 'o'],
    lora_dropout=0.1, 
    bias="none", 
    modules_to_save = ["lm_head", "embed_tokens"],		# needed because we added new tokens to tokenizer/model
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [25]:
model = get_peft_model(model, config)
model.config.use_cache = False

In [26]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): OPTForCausalLM(
          (model): OPTModel(
            (decoder): OPTDecoder(
              (embed_tokens): ModulesToSaveWrapper(
                (original_module): Embedding(50267, 512)
                (modules_to_save): ModuleDict(
                  (default): Embedding(50267, 512)
                )
              )
              (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
              (project_out): Linear4bit(in_features=1024, out_features=512, bias=False)
              (project_in): Linear4bit(in_features=512, out_features=1024, bias=False)
              (layers): ModuleList(
                (0-23): 24 x OPTDecoderLayer(
                  (self_attn): OPTAttention(
                    (k_proj): Linear4bit(
                      in_features=1024, out_features=1024, bias=True
                      (lora_dropout): ModuleDict(
         

In [27]:
import os 

def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    )

dataset_tokenized = dataset.map(
    tokenize, 
    batched=True, 
    num_proc=os.cpu_count(),    # multithreaded
    remove_columns=["text"]     # don't need this anymore, we have tokens from here on
)

Map (num_proc=20):   0%|          | 0/12947 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/690 [00:00<?, ? examples/s]

In [28]:
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 12947
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 690
    })
})

In [29]:
# define collate function - transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to single batch dictionary { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):
    tokenlist=[e["input_ids"] for e in elements]
    tokens_maxlen=max([len(t) for t in tokenlist])

    input_ids,labels,attention_masks = [],[],[]
    for tokens in tokenlist:
        pad_len=tokens_maxlen-len(tokens)

        # pad input_ids with pad_token, labels with ignore_index (-100) and set attention_mask 1 where content otherwise 0
        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )   
        labels.append( tokens + [-100]*pad_len )    
        attention_masks.append( [1]*len(tokens) + [0]*pad_len ) 

    batch={
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch

In [30]:
bs=1        # batch size

ga_steps=1  # gradient acc. steps

epochs=5

steps_per_epoch=len(dataset_tokenized["train"])//(bs*ga_steps)

steps_per_epoch

12947

In [31]:
args = TrainingArguments(
    output_dir="/home/kamal/training_files/facebook_opt_qloraminimal",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch,		# eval and save once per epoch  	
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=0.0002,
    group_by_length=True,
    fp16=True,
    ddp_find_unused_parameters=False,
)

In [None]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=collate,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    args=args,
)

trainer.train()

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


Step,Training Loss,Validation Loss
