In [1]:
import torch
import transformers
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
import accelerate
import bitsandbytes
from tqdm import tqdm
from trl import SFTTrainer
import time
import pandas as pd
import numpy as np
import datasets

from data.preprocess_dataset import preprocess_dataset

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


# Loading and processing dataset

In [15]:
tokenizer = LlamaTokenizer.from_pretrained("maritaca-ai/sabia-7b")
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
#tokenizer.save_pretrained("./data/custom_tokenizer")

canarim_dataset_name = "dominguesm/Canarim-Instruct-PTBR-Dataset"
train_dataset = datasets.load_dataset(canarim_dataset_name, split="train")
eval_dataset = datasets.load_dataset(canarim_dataset_name, split="test")

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
process_time = time.time()
print("Starting dataset processing...")
train_dataset = preprocess_dataset(train_dataset, tokenizer)
eval_dataset = preprocess_dataset(eval_dataset, tokenizer)
print(f"Finished processing. It took {time.time() - process_time} seconds\n")

train_dataset

Starting dataset processing...


Map: 100%|██████████| 316413/316413 [00:13<00:00, 23680.19 examples/s]
Map: 100%|██████████| 316413/316413 [00:13<00:00, 22629.83 examples/s]
Map: 100%|██████████| 316413/316413 [04:48<00:00, 1097.35 examples/s]
Map: 100%|██████████| 1519/1519 [00:00<00:00, 18987.63 examples/s]
Map: 100%|██████████| 1519/1519 [00:00<00:00, 16877.53 examples/s]
Map: 100%|██████████| 1519/1519 [00:01<00:00, 1155.13 examples/s]

Finished processing. It took 317.2381465435028 seconds






Dataset({
    features: ['instruction', 'input', 'output', 'prompt', 'input_ids'],
    num_rows: 316413
})

# Loading Sabia7B Model

In [25]:
model_name = "maritaca-ai/sabia-7b"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16   # If your GPU does not support bfloat16, change to torch.float16
)

# Tá dando erro pq eu acho que não consigo ativar o CUDA localmente (minha GPU é AMD) (Jonas)
model

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.45it/s]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
 

In [26]:
def trainable_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"""trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters:
            {100 * trainable_model_params / all_model_params:.2f}%"""

print(trainable_parameters(model))

trainable model parameters: 6738415616
all model parameters: 6738415616
percentage of trainable model parameters:
            100.00%


In [27]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

config = LoraConfig(
    r=64, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model.gradient_checkpointing_enable()

model = prepare_model_for_kbit_training(model)

peft_model = get_peft_model(model, config)

peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Line

In [28]:
print(trainable_parameters(peft_model))

trainable model parameters: 50331648
all model parameters: 6788747264
percentage of trainable model parameters:
            0.74%


In [29]:
output_dir = "./sabia-7b-instruct-training/checkpoint"
batch_size = 16
gradient_accumulation_steps = 4
epochs = 5
lr = 2e-4

training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps = 1,
    warmup_ratio = 0.1,
    bf16=True,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size//2,
    gradient_accumulation_steps = gradient_accumulation_steps,
    learning_rate = lr,
    num_train_epochs = epochs,
    lr_scheduler_type = "cosine",
    logging_steps = 50,
    logging_dir = "./logs",
    save_strategy = "steps",
    save_steps = 25,
    evaluation_strategy = "epoch",
    do_eval = True,
    gradient_checkpointing = True,
    report_to = "none",
    overwrite_output_dir = 'True',
    group_by_length = True,
)

In [30]:
peft_trainer = transformers.Trainer(
    model=peft_model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

peft_trainer

<transformers.trainer.Trainer at 0x28aedcfb770>