In [None]:
!pip install datasets
from datasets import load_dataset

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorWithPadding, AdamW, get_linear_schedule_with_warmup as WarmupLinearSchedule

import os

# Load the "allenai/sciq" dataset from the Hugging Face Hub and keep its training split
dataset = load_dataset("allenai/sciq")
train_dataset = dataset['train']

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'



In [None]:
# Load the pretrained 'gpt2-medium' tokenizer and language model and prepare them for fine-tuning.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
# Register a dedicated padding token so that text sequences can be aligned to a
# common length; per the original author, padding fails with an error otherwise.
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
# Resize the model's token embeddings to the tokenizer's current vocabulary size
# (which now includes the pad token registered above).
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id  # set pad_token_id on the model config
model.config.eos_token_id = tokenizer.eos_token_id  # set eos_token_id (end of text) on the model config
model = model.to(device)

In [None]:
# Helper that turns one dataset row into the text format the model is trained on
def preprocess_function(dataset_row):
    """Render a dataset row as a single training string.

    Args:
        dataset_row: mapping with the keys "question", "correct_answer",
            "distractor1", "distractor2" and "distractor3".

    Returns:
        A string with a QUESTION line, an ANSWER line, a DISTRACTORS line
        (the three distractors joined by " | ") and a final "<|endoftext|>"
        marker line.
    """
    question_line = f"QUESTION: {dataset_row['question']}"
    answer_line = f"ANSWER: {dataset_row['correct_answer']}"
    wrong_options = " | ".join(
        f"{dataset_row[key]}" for key in ("distractor1", "distractor2", "distractor3")
    )
    distractor_line = "DISTRACTORS: " + wrong_options
    return "\n".join([question_line, answer_line, distractor_line, "<|endoftext|>"])

# Add a "text" column holding the formatted training string for every row
train_dataset = train_dataset.map(lambda row: dict(text=preprocess_function(row)))

In [None]:
# Maximum number of tokens per example. It is defined here, before its first use,
# so that the notebook survives "Restart Kernel -> Run All": the hyper-parameter
# cell that also sets MAX_SEQ_LEN only runs after the dataset has been tokenized.
# Keep both values in sync.
MAX_SEQ_LEN = 256


# Tokenize the prepared "text" column
def tokenize_function(dataset_row):
    """Tokenize the "text" field of a dataset row (or batch of rows).

    Args:
        dataset_row: mapping whose "text" entry is passed to the tokenizer.

    Returns:
        Whatever the global `tokenizer` returns when called with truncation
        enabled and padding to exactly MAX_SEQ_LEN tokens.
    """
    return tokenizer(
        dataset_row["text"],
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length"
    )

# Tokenize the whole dataset, passing rows to tokenize_function in batches
encoded_dataset = train_dataset.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
BATCH_SIZE = 1 # Colab cannot handle a batch size larger than 1
GRADIENT_ACCUMULATION_STEPS = 8  # added because with batch size 1 the model otherwise trains very poorly
EPOCHS = 2  # number of passes over the training dataloader
LEARNING_RATE = 5e-5  # learning rate passed to the optimizer
WARMUP_STEPS = 10  # total_iters of the LinearLR warm-up scheduler
MAX_SEQ_LEN = 256  # max_length that tokenize_function uses for truncation and padding

In [None]:
# Convert the data into a format PyTorch understands: drop the raw "text" column
# and expose only the token ids and the attention mask as torch tensors
encoded_dataset = encoded_dataset.remove_columns(["text"])
encoded_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

# Shuffled loader that collates BATCH_SIZE examples at a time with the padding collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
dataloader = DataLoader(
    dataset=encoded_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of torch.optim.AdamW. `eps` and `weight_decay` are set
# explicitly to the defaults of the deprecated class so the optimizer
# hyper-parameters stay the same as before.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)
# Linear warm-up: the learning rate grows from 10% to 100% of LEARNING_RATE
# over the first WARMUP_STEPS scheduler steps
scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.1, total_iters=WARMUP_STEPS
)

# Folder that receives one checkpoint per epoch
models_folder = "trained_models"
os.makedirs(models_folder, exist_ok=True)

In [None]:
# Fine-tune the model
model.train()
optimizer.zero_grad()  # start from clean gradients, even when this cell is re-run
num_batches = len(dataloader)

for epoch in range(EPOCHS):
    print(f"EPOCH {epoch + 1}/{EPOCHS} started" + "=" * 30)
    epoch_loss = 0.0

    for step, batch in enumerate(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Exclude padding positions from the loss: label -100 is ignored by the
        # model's cross-entropy. Without this, sequences padded to a fixed length
        # make the loss mostly measure how well the model predicts pad tokens.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Track the unscaled loss so the epoch average is a real per-batch loss
        epoch_loss += outputs.loss.item()

        # Scale the loss so the accumulated gradient is an average over the window
        loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
        loss.backward()

        # Update the weights once per accumulation window. Also flush on the last
        # batch of the epoch so a trailing partial window is not silently carried
        # over into the next epoch (or dropped after the final one).
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0 or is_last_batch:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty
    print(f"Epoch {epoch + 1} Loss: {epoch_loss / max(num_batches, 1):.4f}")

    # Save a checkpoint after every epoch
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_sciq_epoch_{epoch + 1}.pt"))

Epoch 1 Loss: 0.0789
Epoch 2 Loss: 0.0332


In [None]:
# Test the fine-tuned model
model_path = "trained_models/gpt2_medium_sciq_epoch_2.pt"
# map_location lets a checkpoint saved on the GPU be loaded on a CPU-only runtime
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

test_prompt = "QUESTION: What is the main theory of the origin of life?\nANSWER: Evolution theory\nDISTRACTORS:"
inputs = tokenizer(test_prompt, return_tensors="pt", padding=True, truncation=True).to(device)

outputs = model.generate(
    inputs["input_ids"],
    # Pass the mask and the pad id explicitly; otherwise generate() warns that
    # results may be unreliable and falls back to a guessed pad token.
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    max_length=100,
    # temperature / top_k / top_p only take effect when sampling is enabled
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    eos_token_id=tokenizer.eos_token_id,
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Text: QUESTION: What is the main theory of the origin of life?
ANSWER: Evolution theory
DISTRACTORS: theory of evolution | theory of matter | theory of energy



Теперь GPT способен выдавать дистракторы, но он воспринимает «evolution theory» и «theory of evolution» как разные понятия. Возможно, это говорит о проблемах при токенизации?