In [16]:
from datasets import load_dataset

# Load the `IlyaGusev/ru_turbo_saiga` dataset; no `split` is requested, so the
# result holds every split the dataset provides.
dataset = load_dataset(path="IlyaGusev/ru_turbo_saiga")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# One checkpoint identifier feeds both loaders so tokenizer and model always match.
model_name = "t-tech/T-pro-it-1.0"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)
# NOTE(review): no dtype or device placement is requested, so the library
# defaults apply -- confirm the checkpoint fits in memory at that precision.

In [None]:
import torch

In [None]:
from tqdm import tqdm
# TrainingArguments is defined in transformers (trl's SFTConfig subclasses it),
# so import it from there rather than from trl.
from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer

def preprocess_function(examples, max_length=512):
    r"""Render each conversation as ``"role: content\n"`` lines and tokenize the batch.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(..., batched=True)``;
            only the ``"messages"`` column is read. Each entry may be either a
            list of ``{"role": ..., "content": ...}`` dicts, or a single dict
            holding parallel ``"role"`` and ``"content"`` lists (the layout
            ``datasets`` uses for a ``Sequence`` of dict features).
        max_length: Length every example is truncated and padded to.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the rendered texts.
    """
    inputs = []
    for messages in examples['messages']:
        if isinstance(messages, dict):
            # Column-oriented layout: pair the parallel role/content lists.
            # Iterating the dict directly would yield its keys, not the turns.
            turns = zip(messages['role'], messages['content'])
        else:
            # Row-oriented layout: one dict per turn.
            turns = ((message['role'], message['content']) for message in messages)
        # Build the text in a single join instead of repeated string concatenation.
        inputs.append("".join(f"{role}: {content}\n" for role, content in turns))
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=max_length)

# Tokenize every split in batches via `preprocess_function`.
ready = dataset.map(preprocess_function, batched=True)

# Fraction of "train" held out for evaluation, and the seed that makes the
# hold-out reproducible, when the dataset has no "validation" split.
EVAL_FRACTION = 0.05
SPLIT_SEED = 42

# Indexing ready["validation"] unconditionally raises KeyError for datasets
# that only provide "train". Use the validation split when it exists and
# otherwise carve a seeded hold-out from "train", so the per-epoch evaluation
# configured below always has data to run on.
if "validation" in ready:
    train_split, eval_split = ready["train"], ready["validation"]
else:
    holdout = ready["train"].train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)
    train_split, eval_split = holdout["train"], holdout["test"]

# Evaluate and checkpoint once per epoch, keeping at most two checkpoints on
# disk; metrics are logged every 500 steps and reported to TensorBoard.
# NOTE(review): newer transformers releases name the `evaluation_strategy`
# argument `eval_strategy` -- switch if the installed version rejects it.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    num_train_epochs=3,
    report_to=["tensorboard"]
)
# NOTE(review): newer trl releases take the tokenizer as `processing_class`
# instead of `tokenizer` -- confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_split,
    eval_dataset=eval_split,
    args=training_args,
    tokenizer=tokenizer
)
trainer.train()
