In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType

# Načtení a rozdělení trénovacích dat
dataset = load_dataset("json", data_files="everything_finetunning.json", field='train') # , field='train'
train_test = dataset["train"].train_test_split(test_size=0.1)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]

# Qwen model a tokenizer
model_name = "Qwen/Qwen1.5-1.8B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# LoRA konfigurace
lora_config = LoraConfig(
    r=8, # Nebo 16
    lora_alpha=32, # Nebo 32
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM
)

# Načtení modelu
model = get_peft_model(model, lora_config)

# Tokenizační funkce 
def tokenize_function(examples):
    # Kombinace prompt + response podle Qwen chat stylu
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": examples["prompt"]},
         {"role": "assistant", "content": examples["response"]}],
        tokenize=False,
        add_generation_prompt=False
    )

    inputs = tokenizer(prompt, truncation=True, padding="max_length", max_length=2048)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs
# Tokenizace vstupních dat
tokenized_datasets = train_dataset.map(tokenize_function, batched=False)
tokenized_eval = eval_dataset.map(tokenize_function, batched=False)

# Parametry trénování. Parametry s vlivem na výstup jsou pospsány přímo v bakalařské práci.
training_args = TrainingArguments(
    output_dir="./results_qwen",
    num_train_epochs=17,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    save_strategy="epoch",
    logging_dir="./logs_qwen",
    logging_steps=10,
    optim="adamw_bnb_8bit",
    save_total_limit=2,
    report_to="none",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Definice modulu Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer
)

# Inicializace trénování
trainer.train()

# Uložení výsledků
trainer.save_model("./r8_mine_gwen")


2025-04-17 13:30:36.431901: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-17 13:30:36.443634: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744889436.456235 2042103 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744889436.460121 2042103 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744889436.471995 2042103 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Map:   0%|          | 0/261 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,0.4427,0.433778
2,0.3729,0.370465
3,0.2132,0.335032
4,0.19,0.30104
5,0.3496,0.288185
6,0.1487,0.267591
7,0.1604,0.253701
8,0.1953,0.23859
9,0.2096,0.236368
10,0.1103,0.228388


In [10]:
from datasets import load_dataset

# Načtení datasetu
dataset = load_dataset('json', data_files='randomized_finetuning.json')

# Přístup ke splitu "train"
train_data = dataset['train']

# Výpis počtu záznamů
print("Počet záznamů v trénovacích datech:", len(train_data))

Generating train split: 0 examples [00:00, ? examples/s]

Počet záznamů v trénovacích datech: 1000
