## Импорты

In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import nltk
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
import torch.nn as nn

## Подготовка данных

In [2]:
data_path = '../data/processed/context_answer.csv'
df = pd.read_csv(data_path)

# Фильтрация данных
min_length = 3
max_length = 50
df = df[(df['context'].str.split().str.len().between(min_length, max_length)) & 
        (df['response'].str.split().str.len().between(min_length, max_length))]

## Подготовка модели

In [3]:
# Инициализация токенизатора с дополнительным токеном
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
sep_token = '<|sep|>'
tokenizer.add_special_tokens({'additional_special_tokens': [sep_token]})
tokenizer.pad_token = tokenizer.eos_token

# Создание объединенных последовательностей
def create_sequences(row):
    return f"{row['context']} {sep_token} {row['response']}{tokenizer.eos_token}"

df['sequence'] = df.apply(create_sequences, axis=1)

# Разделение данных
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Класс Dataset
class ConversationDataset(Dataset):
    def __init__(self, sequences, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encodings = tokenizer(
            sequences.tolist(),
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.encodings['input_ids'][idx].clone()
        }

# Создание Dataset и DataLoader
train_dataset = ConversationDataset(train_df['sequence'], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Инициализация модели
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Заморозка слоев
for param in model.parameters():
    param.requires_grad = False
for param in model.transformer.h[-4:].parameters():
    param.requires_grad = True

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


## Подготовка модели

In [4]:
# Настройка обучения
optimizer = AdamW(model.parameters(), lr=3e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Обучение с прогресс-баром
for epoch in range(3):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        inputs = batch['input_ids'].to(device)
        masks = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Маскировка loss для части контекста
        labels[inputs == tokenizer.pad_token_id] = -100
        sep_positions = (inputs == tokenizer.convert_tokens_to_ids(sep_token)).nonzero()[:,1]
        for i, pos in enumerate(sep_positions):
            labels[i, :pos+1] = -100
        
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})
    
    print(f"Epoch {epoch+1} Average Loss: {total_loss/len(train_loader):.4f}")

# Функция генерации ответа
def generate_response(model, tokenizer, context, max_length=50):
    prompt = f"{context} {sep_token}"
    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    
    output = model.generate(
        inputs.input_ids,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1,
        no_repeat_ngram_size=2  # Блокировка повторяющихся n-грамм
    )
    
    full_text = tokenizer.decode(output[0], skip_special_tokens=False)
    response = full_text.split(sep_token)[-1].split(tokenizer.eos_token)[0].strip()
    return response
    

Epoch 1:   0%|                                                                                 | 0/1188 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1: 100%|███████████████████████████████████████████████████████████| 1188/1188 [00:52<00:00, 22.65it/s, loss=4.74]


Epoch 1 Average Loss: 3.9978


Epoch 2: 100%|███████████████████████████████████████████████████████████| 1188/1188 [00:52<00:00, 22.70it/s, loss=3.43]


Epoch 2 Average Loss: 3.8383


Epoch 3: 100%|███████████████████████████████████████████████████████████| 1188/1188 [00:51<00:00, 23.15it/s, loss=2.92]

Epoch 3 Average Loss: 3.7448





## Сохранение модели и токенизатора

In [9]:
model.save_pretrained('../trained_model/')
tokenizer.save_pretrained('../trained_tokenizer/')

('../trained_tokenizer/tokenizer_config.json',
 '../trained_tokenizer/special_tokens_map.json',
 '../trained_tokenizer/vocab.json',
 '../trained_tokenizer/merges.txt',
 '../trained_tokenizer/added_tokens.json')