In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import transformers
from datetime import datetime
from trl import SFTTrainer

kwantyzacja z użyciem 4-bit NormalFloat

In [2]:
base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
base_model_id = "NousResearch/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

wczytanie modelu

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



wczytanie tokenizera

In [4]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)

Wczytanie zbioru danych. 

Zawiera on 5000 przykładów po 500 z każdą metryką

Każdy przykład jest nie dłużczy niż 256 tokenów

In [5]:
dataset = load_dataset('json', data_files='datasets/prepared_dataset_256max_small/reddit_posts_train.json', split='train')
dataset.shuffle()

Dataset({
    features: ['title', 'post_text', 'grade', 'selftext'],
    num_rows: 5000
})

funckja tworząca prompt

In [6]:
def formatting_func(example):
    text = f"""[INST]Given a context, score a comment from 0 to 9. Respond with just one number and nothing else.
    
    ### context: {example['title']} {example['post_text']}
    ### comment: {example['selftext']}[/INST] {example['grade']}"""
    example['text'] = text
    return example

In [7]:
formated_dataset = dataset.map(formatting_func)

przygotowanie modelu do treningu

In [8]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

funckja pokazująca ilość trenowalnych parametrów

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

config LoRA

wybieramy warstwy liniowe

In [11]:
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)


trainable params: 81108992 || all params: 3581521920 || trainable%: 2.264651559077991


In [12]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_dropout): Modu

uczenie modelu

In [13]:
project = "inz"
base_model_name = "llama2"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

training_params = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=5,
    gradient_checkpointing=True,
    #max_steps=500,
    num_train_epochs=1,
    learning_rate=2.5e-5,
    bf16=True,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
)

In [14]:
trainer = SFTTrainer(
    model=model,
    train_dataset=formated_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_params,
)

trainer.train()



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
25,2.8691
50,2.1726
75,2.0304
100,1.9551
125,1.9844
150,1.9429
175,1.9442
200,1.9318




TrainOutput(global_step=200, training_loss=2.1037955474853516, metrics={'train_runtime': 2141.0987, 'train_samples_per_second': 2.335, 'train_steps_per_second': 0.093, 'total_flos': 4.702577455706112e+16, 'train_loss': 2.1037955474853516, 'epoch': 1.0})