In [None]:
# Uncomment to install the dependencies in a fresh environment (e.g. Colab):
# %pip install -q transformers datasets accelerate torch pandas

In [None]:

import os
import shutil
from itertools import chain

import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

from src.data_tokenizer import RapDataTokenizer, load_and_process_csv

In [None]:
# --- Data source and base checkpoint ---
# NOTE(review): DATA_URL is not referenced by any other cell visible in this
# notebook; only LOCAL_CSV is passed to the loader.
DATA_URL = "https://raw.githubusercontent.com/ivanchetvergov/neiroRap/main/data/lyrics_df.csv"
LOCAL_CSV = "data/lyrics_df.csv"
MODEL_NAME = "sberbank-ai/rugpt3small_based_on_gpt2"

# --- Training hyperparameters ---
BLOCK_SIZE = 1024  # tokenizer max_length and the length of each grouped block
BATCH_SIZE = 4     # per-device batch size for both training and evaluation
EPOCHS = 11
LEARNING_RATE = 2e-6

# --- Output directories ---
OUTPUT_DIR = "./neiroRap_results_v2"            # passed to TrainingArguments.output_dir
FINAL_MODEL_DIR = "./neiroRap_final_model_v2"   # final model + tokenizer are saved here
LOGS_DIR = "./neiroRap_logs_v2"                 # passed to TrainingArguments.logging_dir

In [None]:
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Используемое устройство: {device}")

In [None]:
# Build the project's tokenizer wrapper for the base checkpoint.
rap_tokenizer = RapDataTokenizer(MODEL_NAME)

# Report which checkpoint is used and how many entries the tokenizer holds.
print(
    "\nИнформация о токенизаторе:",
    f"   • Базовая модель: {MODEL_NAME}",
    f"   • Размер словаря: {len(rap_tokenizer.tokenizer)}",
    sep="\n",
)

In [None]:
# Load the lyrics CSV through the project helper, then preview the first
# 500 elements of the first processed entry as a sanity check.
df_processed = load_and_process_csv(LOCAL_CSV, rap_tokenizer)
sample_preview = df_processed['tokenized_text'].iloc[0][:500]
print(f"=== Пример текста: === \n{sample_preview}")

In [None]:
# Wrap the processed texts in a single-column ("text") Hugging Face Dataset.
dataset_dict = dict(text=df_processed['tokenized_text'].tolist())
dataset = Dataset.from_dict(dataset_dict)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch.

    The tokenizer is called with ``truncation=True`` and
    ``max_length=BLOCK_SIZE``; padding is disabled.

    Args:
        examples: Batch mapping that provides a ``"text"`` entry.

    Returns:
        Whatever the wrapped tokenizer returns for the batch.
    """
    tokenizer = rap_tokenizer.tokenizer
    options = {
        "truncation": True,
        "max_length": BLOCK_SIZE,
        "padding": False,
    }
    return tokenizer(examples["text"], **options)

In [None]:
# Tokenize the corpus in batches using 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
    num_proc=4,
    desc="Токенизация",
)

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate tokenized texts and cut them into fixed-length blocks.

    Every column of the batch is flattened into one long token list and then
    split into consecutive chunks of ``block_size`` tokens. The trailing
    remainder that does not fill a whole block is dropped.

    Args:
        examples: Batch mapping of column name -> list of token lists. Must
            contain an ``"input_ids"`` column.
        block_size: Length of each output block. Defaults to the notebook's
            ``BLOCK_SIZE`` constant when omitted.

    Returns:
        dict with the same columns as ``examples`` (each a list of blocks)
        plus ``"labels"``, a copy of the ``"input_ids"`` block list.

    Raises:
        ValueError: If ``block_size`` is not positive.
        KeyError: If the batch has no ``"input_ids"`` column.
    """
    if block_size is None:
        block_size = BLOCK_SIZE
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")

    # Flatten each column in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    flattened = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }

    # The usable length is taken from the first column and rounded down to a
    # multiple of block_size; a batch with no columns yields length 0.
    first_column = next(iter(flattened.values()), [])
    usable_length = (len(first_column) // block_size) * block_size

    # Slice every column into consecutive blocks.
    result = {
        key: [
            tokens[start : start + block_size]
            for start in range(0, usable_length, block_size)
        ]
        for key, tokens in flattened.items()
    }

    # For language modeling the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized texts into fixed-size blocks, 1000 rows per call,
# using 4 worker processes.
lm_dataset = tokenized_dataset.map(
    function=group_texts,
    batched=True,
    num_proc=4,
    batch_size=1000,
    desc="Группировка",
)

In [None]:
# Hold out 10% of the grouped blocks for evaluation; the seed is fixed so the
# split is repeatable.
lm_dataset_split = lm_dataset.train_test_split(seed=42, test_size=0.1)

# Load the base checkpoint and move it to the selected device.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# Resize the embedding matrix to the tokenizer's length, printing the number
# of embedding rows before and after.
print(f"Размер embeddings до: {model.get_input_embeddings().weight.shape[0]}")
model.resize_token_embeddings(len(rap_tokenizer.tokenizer))
print(f"Размер embeddings после: {model.get_input_embeddings().weight.shape[0]}")

# Reuse the end-of-sequence id as the padding id in the model config.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Collator for causal language modeling: mlm=False disables the masked-LM
# objective.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=rap_tokenizer.tokenizer,
)

In [None]:
training_args = TrainingArguments(
    # Run layout, reproducibility and reporting
    output_dir=OUTPUT_DIR,
    seed=42,
    report_to="none",  # no external experiment trackers (W&B / TensorBoard)

    # Schedule and batch sizing
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,   # accumulate gradients over 2 batches per optimizer step
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is available

    # Optimizer
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    weight_decay=0.01,

    # Logging
    logging_dir=LOGS_DIR,
    logging_steps=50,

    # Evaluate and checkpoint once per epoch, keep at most 3 checkpoints, and
    # reload the best checkpoint (ranked by eval_loss) when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
# Wire the model, arguments, collator and the train/test halves of the split
# into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_dataset_split["train"],
    eval_dataset=lm_dataset_split["test"],
)

In [None]:
# Run fine-tuning. Left as a bare expression so the notebook displays the
# value returned by train() as the cell output.
trainer.train()

In [None]:
# Ensure the export directory exists (no error if it is already there).
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

# Save the model through the trainer, then the tokenizer files, into the same
# directory. The tokenizer call stays last so its return value is shown as the
# cell output.
trainer.save_model(FINAL_MODEL_DIR)
rap_tokenizer.tokenizer.save_pretrained(FINAL_MODEL_DIR)

In [None]:
# google.colab only exists inside Colab, so it is imported here (guarded)
# rather than in the top import cell; elsewhere the archive is still created.
try:
    from google.colab import files
except ImportError:
    files = None

# Derive the folder from FINAL_MODEL_DIR instead of repeating the literal, so
# the archive always matches the directory the model was saved to.
folder_to_download = os.path.normpath(FINAL_MODEL_DIR)

# make_archive appends ".zip" to the base name and returns the path of the
# archive it actually wrote.
zip_filename = shutil.make_archive(folder_to_download, 'zip', folder_to_download)

if files is not None:
    # Trigger a browser download of the archive in Colab.
    files.download(zip_filename)
else:
    print(f"Архив сохранён: {zip_filename}")