In [1]:
import pandas as pd
import torch
from datasets import Dataset
from dotenv import load_dotenv
from peft import LoraConfig
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer

load_dotenv()

True

In [2]:
from transformers import AutoTokenizer


original_model_name = 'google/gemma-3-270m-it'

original_model = AutoModelForCausalLM.from_pretrained(
    original_model_name,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=getattr(torch, "float16"),
        bnb_4bit_use_double_quant=True,
    ),
    trust_remote_code=True,
    attn_implementation='eager'
).to("cuda:0")

In [3]:
def format_joke_prompt(sample):
    return f"<start_of_turn>user Розкажи жарт:<end_of_turn>\n<start_of_turn>model {sample['Joke'].replace('\\n', '\n')}<end_of_turn>"

In [4]:
def datasets():
    dataset = pd.read_csv('/home/ivan/git/dataset/jokes.csv', usecols=['theme', 'text', 'rating'],
                          dtype={'theme': 'string', 'text': 'string', 'rating': 'int8'})

    dataset = dataset.rename(columns={'text': 'Joke'})
    dataset = dataset[:3000]

    train, test = train_test_split(dataset, test_size=0.2, random_state=42)

    train = Dataset.from_list(train.to_dict(orient='records'))
    test = Dataset.from_list(test.to_dict(orient='records'))

    return train, test

In [5]:
def train():
    train, test = datasets()

    trainer = SFTTrainer(
        model=original_model,
        processing_class=AutoTokenizer.from_pretrained(
            original_model_name,
            use_default_system_prompt=False,
            device_map=original_model.device,
        ),
        args=TrainingArguments(
            output_dir='./out',
            optim="paged_adamw_8bit",
            auto_find_batch_size=True,
            gradient_accumulation_steps=4,
            num_train_epochs=3,
            learning_rate=5e-5,
            logging_steps=10,
            fp16=True,
        ),
        peft_config=LoraConfig(
            r=64,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
            peft_type="QLORA",
            target_modules=["q_proj", "v_proj"]
        ),
        train_dataset=train,
        eval_dataset=test,
        formatting_func=format_joke_prompt,
    )

    trainer.train()
    trainer.model.save_pretrained("joke-generator")

In [6]:
train()

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Applying formatting function to train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Step,Training Loss


Step,Training Loss


Step,Training Loss
10,5.9968
20,5.316
30,5.0593
40,4.7572
50,4.4669
60,4.3472
70,4.2123
80,3.9954
90,4.077
100,3.8344


Step,Training Loss
10,3.5169
20,3.4835
30,3.4019
40,3.4811
50,3.6534
60,3.5406
70,3.4552
80,3.4406
90,3.48
100,3.4601
