In [None]:
# Upgrade pip itself before installing anything else.
%pip install --upgrade pip
# Install torch and torchdata. --index-url replaces the default package index for the
# whole command, so both packages are resolved from the PyTorch CUDA 12.1 wheel index.
%pip install --disable-pip-version-check \
    torch --index-url https://download.pytorch.org/whl/cu121 \
    torchdata --quiet

# Libraries for models, datasets, evaluation (incl. ROUGE) and LoRA/PEFT.
# NOTE(review): no versions are pinned, so results may differ between runs.
%pip install \
    transformers \
    datasets \
    evaluate \
    rouge_score \
    loralib \
    peft --quiet

# NOTE(review): presumably required by the mT5 tokenizer — confirm.
%pip install sentencepiece

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
# Folder that holds the CSV files used as dataset splits.
res_folder = 'res/in_use/'

# One CSV file per split; load_dataset returns all three splits together.
dataset = load_dataset(
    'csv',
    data_files={
        "train": f"{res_folder}generated_qna_GPT4_reviewed.csv",
        "validation": f"{res_folder}generated_qna_GPT3.5_reviewed.csv",
        "test": f"{res_folder}gyik.csv",
    },
)
dataset

In [None]:
# Checkpoint to fine-tune. The tokenizer handling would have to be adapted before
# either of the two commented-out models could be used instead.
#model_name = 'NYTK/PULI-GPTrio'
#model_name = 'meta-llama/Llama-2-7b'
model_name = "google/mt5-base"

In [None]:
# Load the tokenizer and the seq2seq model (in bfloat16) from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    ignore_mismatched_sizes=True,
)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name, this function prints nothing: it returns the report as a
    string and leaves displaying it to the caller.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, parameter)``
            pairs, where every parameter offers ``numel()`` and a
            ``requires_grad`` flag.

    Returns:
        A newline-terminated, three-line string with the trainable parameter
        count, the total parameter count, and the trainable share in percent.
        For a model without any parameters the share is reported as ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the division: a model with no parameters has nothing to train.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all modell parameters: {all_model_params}\n"
        f"percentage of trainable model params: {percentage}\n"
    )

# Show the parameter report for the freshly loaded model.
print(print_number_of_trainable_model_parameters(original_model))

# 2 - Perform Full Fine-Tuning

2.1 Preprocess the Dataset

You need to convert the question-answer (prompt-response) pairs into explicit instructions for the LLM. Prepend the instruction "Answer the following question!" to each question and append "Answer:" after it; the answer text itself is used as the training response:

Training prompt (question):
Answer the following question!
How many hours should I work?
Answer:

Training response (answer):
240 for electrical engineers, 320 for IT engineers

Then preprocess the prompt-response dataset into tokens and pull out their input_ids (1 per token).


Ezeket mind magyarul fogom végezni, mert magyar modellel dolgozom.

In [None]:
# NOTE (original author): this cell raised an error when run locally, but ran on Colab.

def tokenize_function(example, max_source_length=None, max_target_length=None):
    """Turn a batch of question/answer rows into model inputs and training labels.

    Each question is wrapped in a fixed Hungarian instruction prompt and
    tokenized; each answer is tokenized as the training target.

    Args:
        example: A batch (as passed by ``dataset.map(..., batched=True)``) with
            the columns ``"Question"`` and ``"Answer"``, each a list of strings.
        max_source_length: Optional maximum token length for the prompts,
            forwarded to the tokenizer as ``max_length``. ``None`` keeps the
            tokenizer's own default.
        max_target_length: Optional maximum token length for the answers,
            forwarded to the tokenizer as ``max_length``. ``None`` keeps the
            tokenizer's own default.

    Returns:
        The same batch with three added columns: ``input_ids`` and
        ``attention_mask`` for the prompts, and ``labels`` for the answers.
        Padded label positions are set to -100.
    """
    start_prompt = 'Válaszolja az alábbi kérdésre!\n\n'
    end_prompt = '\n\nVálasz: '
    prompt = [start_prompt + question + end_prompt for question in example["Question"]]

    # Fall back to EOS for padding only when the tokenizer defines no pad token.
    # Overwriting an existing pad token would make padding indistinguishable from
    # the real end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model_inputs = tokenizer(
        prompt,
        padding=True,
        truncation=True,
        max_length=max_source_length,
        return_attention_mask=True,
    )
    example['input_ids'] = model_inputs['input_ids']
    # Keep the mask so that the model does not attend to padding positions.
    example['attention_mask'] = model_inputs['attention_mask']

    targets = tokenizer(
        example["Answer"],
        padding=True,
        truncation=True,
        max_length=max_target_length,
        return_attention_mask=True,
    )
    # Replace padding in the labels with -100, the value ignored by the loss.
    # The attention mask (rather than the pad id) decides what is padding, so a
    # genuine EOS token is kept even when pad and EOS share the same id.
    example['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
    ]

    return example

# The dataset holds three splits (train, validation, test). map() runs
# tokenize_function over all of them, handing it batches of rows; the raw text
# columns are dropped afterwards.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(["Question", "Answer"])

In [None]:
#subsample the dataset to save time

#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 ==0, with_indices=True)

In [None]:
from pprint import pprint

# Report the shape of every tokenized split, one per line.
print(
    "Shapes of the datasets:",
    f"Training: {tokenized_datasets['train'].shape}",
    f"Validation: {tokenized_datasets['validation'].shape}",
    f"Test: {tokenized_datasets['test'].shape}",
    sep="\n",
)

The output dataset is ready for fine-tuning.

2.2 - Fine-tune the model with the preprocessed dataset.
Now utilize the built-in Hugging Face Trainer class. Pass the preprocessed dataset with reference to the original model. Other training parameters are found experimentally and there is no need to go into details about those at the moment.




In [None]:
# Timestamped output directory, so each run writes to its own folder.
output_dir = f'./qna-training-{int(time.time())}'

training_args = TrainingArguments(
    output_dir=output_dir,
    # Optimisation settings.
    learning_rate=1e-3,
    weight_decay=0.01,
    # Batch sizes per device for training and for evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Run length. NOTE(review): per the TrainingArguments documentation a positive
    # max_steps overrides num_train_epochs, so this run stops after 5 steps.
    num_train_epochs=4,
    max_steps=5,
    logging_steps=1,
)

trainer = Trainer(
    args=training_args,
    model=original_model,
    eval_dataset=tokenized_datasets['validation'],
    train_dataset=tokenized_datasets['train'],
)

In [None]:
# Run the fine-tuning configured in `trainer`.
trainer.train()


In [None]:
# Save the model held by `trainer` to ./full_trained.
trainer.save_model('./full_trained')

3.1 Setup PEFT/LoRA model for fine-tuning
LoRA adds a new trainable layer/adapter to the model.
The rank `r` is the hyperparameter that defines the rank/dimension of the adapter to be trained.

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # mT5
    r=32,  # rank
    lora_alpha=32,
    lora_dropout=0.05,
    # Names of the modules that receive an adapter.
    # NOTE(review): presumably the attention query/value projections — confirm.
    target_modules=["q", "v"],
    bias="none",
)

Add the LoRA adapter layers/parameters to the original LLM to be trained.

In [None]:
# One value per line: whether CUDA is available, whether cuDNN is enabled, and
# the value of torch.version.cuda.
print(
    torch.cuda.is_available(),
    torch.backends.cudnn.enabled,
    torch.version.cuda,
    sep="\n",
)

In [None]:
# NOTE(review): `original_model` is the same object that was handed to `trainer`
# earlier, so if the full fine-tuning cell has run, the adapters are attached to
# already-updated weights — confirm that this is intended.
peft_model = get_peft_model(original_model, lora_config)

# Alternative kept for reference:
#peft_model = AutoPeftModelForCausalLM.from_pretrained(model_name)
print(torch.cuda.is_available())
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 0 if torch.cuda.is_available() else torch.device("cpu")  # same as the line above
peft_model.to(device)
print(print_number_of_trainable_model_parameters(peft_model))

3.2 Train PEFT adapter
Define training arguments and create Trainer instance

In [None]:
# Timestamped output directory for the PEFT run (reuses the name `output_dir`).
output_dir = f'./peft-dialogue-summary-training-{int(time.time())}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    auto_find_batch_size=True,
    # NOTE(review): per the TrainingArguments documentation a positive max_steps
    # overrides num_train_epochs, so this run performs a single step.
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

peft_trainer = Trainer(
    args=peft_training_args,
    model=peft_model,
    train_dataset=tokenized_datasets["train"],
)

In [None]:

# Train with the PEFT trainer configured above.
peft_trainer.train()

# Save the trainer's model and the tokenizer into the same directory.
peft_model_path="./peft-dialogue-summary-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
# Additionally save the model through the Trainer, to ./trained.
peft_trainer.save_model("./trained")