## HW 4 Трансформеры

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [2]:
!pip install transformers



In [5]:
!git clone https://github.com/RussianNLP/RuCoLA.git

Cloning into 'RuCoLA'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 73 (delta 30), reused 52 (delta 22), pack-reused 0[K
Receiving objects: 100% (73/73), 944.21 KiB | 6.09 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [4]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler

from tqdm import tqdm

from sklearn.metrics import f1_score

## Подготовка данных

In [6]:
# в качестве train возьмите in_domain_train.csv, а в качестве теста in_domain_dev.csv
train = pd.read_csv("/content/RuCoLA/data/in_domain_train.csv", usecols=[1,2])
test = pd.read_csv("/content/RuCoLA/data/in_domain_dev.csv", usecols=[1,2])

idx = train.sample(frac=0.8, random_state=123).index
val = train[~train.index.isin(idx)]
train = train[train.index.isin(idx)]

del idx

print('Train size:', len(train))
print('Val size:', len(val))
print('Test size:', len(test))
print('\n')
print('Train labels counts\n', train['acceptable'].value_counts().to_dict(), '\n')
print('Eval labels counts\n', val['acceptable'].value_counts().to_dict(), '\n')
print('Test labels counts\n', test['acceptable'].value_counts().to_dict(), '\n')

Train size: 6295
Val size: 1574
Test size: 983


Train labels counts
 {1: 4704, 0: 1591} 

Eval labels counts
 {1: 1160, 0: 414} 

Test labels counts
 {1: 733, 0: 250} 



## Модель ruBert

In [6]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruBert-base")

# Copy the model to the GPU.
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [7]:
# Считаем максимальое количество токенов в предложении
max_len = 0

# For every sentence...
for sent in train['sentence']:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  45


In [35]:
class EvalDataset(Dataset):

    def __init__(self, X):
        self.text = X.reset_index(drop=True)

    def tokenize(self, text):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_len)

    def __len__(self):
        return self.text.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output)
        return {k: v.reshape(-1).to(device) for k, v in output.items()}

class TrainDataset(Dataset):

    def __init__(self, X, label):
        self.text = X.reset_index(drop=True)
        self.label = label.reset_index(drop=True)

    def tokenize(self, text):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_len)

    def __len__(self):
        return self.label.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output)
        output.update({'labels': torch.tensor(self.label[index])})
        return {k: v.reshape(-1).to(device) for k, v in output.items()}

In [9]:
batch_size = 32

train_ds = TrainDataset(train['sentence'], train['acceptable'])
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

eval_ds = TrainDataset(val['sentence'], val['acceptable'])
eval_dataloader = DataLoader(eval_ds, batch_size=batch_size)

test_ds = EvalDataset(test['sentence'])
test_dataloader = DataLoader(test_ds, batch_size=batch_size)

In [10]:
# Задаем optimizer и sheduler, которые помогут нам с файнтьюном.

optimizer = Adam(model.parameters(), lr=5e-6)

num_epochs = 5
total_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps
)

### Обучение Train Loop

In [12]:
def train_model(train_dataloader, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} / {num_epochs} \n -------------------')
        for n_batch, batch in enumerate(train_dataloader):
            # Forward pass (скормить данные в нейросеть и пробросить вперед)
            outputs = model(**batch)
            loss = outputs.loss
            if n_batch % 50 == 0:
                loss_value, current = loss.item(), n_batch * batch['input_ids'].shape[0]
                print(f"Loss train: {loss_value:>7f}  [{current:>5d}/{len(train_ds):>5d}]")
                print('Evaluating...')
                preds, true = test_model(eval_dataloader, eval=True)
                print(f'F1-score = {f1_score(preds, true):>3f}\n')
            # Backward pass (backpropagation - посчитать градиенты по всем параметрам с помощью обратного распространения ошибки)
            loss.backward()
            # Обновить параметры с помощью optimizer.step()
            optimizer.step()
            lr_scheduler.step()
            # занулить градиенты предыдущего шага
            optimizer.zero_grad()

def test_model(test_dataloader, eval=False):
    model.eval()
    y_pred = np.array([])
    y_true = np.array([])
    for n_batch, batch in enumerate(test_dataloader):
        if eval:
            y_true = np.hstack([y_true, batch['labels'].cpu().numpy().reshape(-1)])
        outputs = model(**batch)
        y_pred = np.hstack([y_pred, outputs['logits'].argmax(axis=1).detach().cpu().numpy()])
    return y_pred, y_true

In [12]:
train_model(train_dataloader, num_epochs)

Epoch 1 / 5 
 -------------------
Loss train: 0.632347  [    0/ 6295]
Evaluating...
F1-score = 0.848440

Loss train: 0.549183  [ 1600/ 6295]
Evaluating...
F1-score = 0.849395

Loss train: 0.515076  [ 3200/ 6295]
Evaluating...
F1-score = 0.850954

Loss train: 0.666201  [ 4800/ 6295]
Evaluating...
F1-score = 0.854688

Epoch 2 / 5 
 -------------------
Loss train: 0.525757  [    0/ 6295]
Evaluating...
F1-score = 0.856401

Loss train: 0.296055  [ 1600/ 6295]
Evaluating...
F1-score = 0.855403

Loss train: 0.461658  [ 3200/ 6295]
Evaluating...
F1-score = 0.854183

Loss train: 0.578591  [ 4800/ 6295]
Evaluating...
F1-score = 0.853415

Epoch 3 / 5 
 -------------------
Loss train: 0.293576  [    0/ 6295]
Evaluating...
F1-score = 0.859198

Loss train: 0.388904  [ 1600/ 6295]
Evaluating...
F1-score = 0.851705

Loss train: 0.400165  [ 3200/ 6295]
Evaluating...
F1-score = 0.851896

Loss train: 0.321536  [ 4800/ 6295]
Evaluating...
F1-score = 0.825650

Epoch 4 / 5 
 -------------------
Loss train: 

In [13]:
y_pred, _ = test_model(test_dataloader, eval=False)
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.840237



In [14]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score

# Calculate the MCC
mcc = matthews_corrcoef(y_pred, test["acceptable"])
accuracy = accuracy_score(y_pred, test["acceptable"])

#print('Total MCC: %.3f' % mcc)
print('Accuracy: %.3f, MCC: %.3f' % (accuracy, mcc))

Accuracy: 0.753, MCC: 0.301


In [15]:
# Сохраняем модель
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_ruBert/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./model_ruBert/


('./model_ruBert/tokenizer_config.json',
 './model_ruBert/special_tokens_map.json',
 './model_ruBert/vocab.txt',
 './model_ruBert/added_tokens.json',
 './model_ruBert/tokenizer.json')

In [39]:
# Удаляем модель
import gc         # garbage collect library

#del model

torch.cuda.empty_cache()
gc.collect()

0

In [40]:
# ! pip install datasets
#from datasets import load_metric
#ACCURACY = load_metric("accuracy", keep_in_memory=True)
#MCC = load_metric("matthews_correlation", keep_in_memory=True)

#acc_result = ACCURACY.compute(predictions=y_pred, references=test["acceptable"])
#mcc_result = MCC.compute(predictions=y_pred, references=test["acceptable"])

#print('Accuracy: %.3f, MCC: %.3f' % (acc_result["accuracy"], mcc_result["matthews_correlation"]))


## Модель RuRoBerta

In [6]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large")

# Copy the model to the GPU.
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.81M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [7]:
# Считаем максимальную длину токенов в предложении
max_len = 0

# For every sentence...
for sent in train['sentence']:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  47


In [9]:
train_ds = TrainDataset(train['sentence'], train['acceptable'])
train_dataloader = DataLoader(train_ds, batch_size=32, shuffle=True)

eval_ds = TrainDataset(val['sentence'], val['acceptable'])
eval_dataloader = DataLoader(eval_ds, batch_size=32)

test_ds = EvalDataset(test['sentence'])
test_dataloader = DataLoader(test_ds, batch_size=32)


In [10]:
# Задаем optimizer и sheduler, которые помогут нам с файнтьюном.

optimizer = Adam(model.parameters(), lr=5e-6)

num_epochs = 5
total_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps
)

In [13]:
train_model(train_dataloader, num_epochs)

Epoch 1 / 5 
 -------------------
Loss train: 0.790595  [    0/ 6295]
Evaluating...
F1-score = 0.299385

Loss train: 0.591709  [ 1600/ 6295]
Evaluating...
F1-score = 0.852471

Loss train: 0.556449  [ 3200/ 6295]
Evaluating...
F1-score = 0.863946

Loss train: 0.426609  [ 4800/ 6295]
Evaluating...
F1-score = 0.873352

Epoch 2 / 5 
 -------------------
Loss train: 0.401073  [    0/ 6295]
Evaluating...
F1-score = 0.872881

Loss train: 0.278027  [ 1600/ 6295]
Evaluating...
F1-score = 0.886218

Loss train: 0.176119  [ 3200/ 6295]
Evaluating...
F1-score = 0.887984

Loss train: 0.229842  [ 4800/ 6295]
Evaluating...
F1-score = 0.881818

Epoch 3 / 5 
 -------------------
Loss train: 0.200025  [    0/ 6295]
Evaluating...
F1-score = 0.878276

Loss train: 0.033719  [ 1600/ 6295]
Evaluating...
F1-score = 0.866287

Loss train: 0.192439  [ 3200/ 6295]
Evaluating...
F1-score = 0.893737

Loss train: 0.070032  [ 4800/ 6295]
Evaluating...
F1-score = 0.879158

Epoch 4 / 5 
 -------------------
Loss train: 

In [14]:
y_pred, _ = test_model(test_dataloader, eval=False)
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.890551



In [16]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score

# Calculate the MCC
mcc = matthews_corrcoef(y_pred, test["acceptable"])
accuracy = accuracy_score(y_pred, test["acceptable"])

#print('Total MCC: %.3f' % mcc)
print('Accuracy: %.3f, MCC: %.3f' % (accuracy, mcc))

Accuracy: 0.836, MCC: 0.565


In [20]:
# Сохраняем модель
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './drive/MyDrive/model_ruRoBerta/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./drive/MyDrive/model_ruRoBerta/


('./drive/MyDrive/model_ruRoBerta/tokenizer_config.json',
 './drive/MyDrive/model_ruRoBerta/special_tokens_map.json',
 './drive/MyDrive/model_ruRoBerta/vocab.json',
 './drive/MyDrive/model_ruRoBerta/merges.txt',
 './drive/MyDrive/model_ruRoBerta/added_tokens.json',
 './drive/MyDrive/model_ruRoBerta/tokenizer.json')

## Модель RuGPT3 large

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3large_based_on_gpt2")
model = AutoModelForCausalLM.from_pretrained("ai-forever/rugpt3large_based_on_gpt2")

# Copy the model to the GPU.
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1536)
    (wpe): Embedding(2048, 1536)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1536, out_features=50257, bias=False)
)

In [7]:
def calc_loss(text):
    with torch.no_grad():
        phrase = tokenizer.encode(text, return_tensors='pt').reshape(-1).to(device)
        phrase = phrase.unsqueeze(0)
        loss = model(input_ids=phrase, labels=phrase).loss.item()
        return loss

In [8]:
calc_loss('Проверка?')

5.107440948486328

### Zero shot

In [9]:
#zero shot
from tqdm import tqdm
tqdm.pandas()

def shot(start: str, text: str, end: list):
    first = ' '.join([start, text, end[0]])
    second = ' '.join([start, text, end[1]])

    loss_1, loss_2 =  calc_loss(first), calc_loss(second)
    return 1 if loss_1 > loss_2 else 0

y_pred = test['sentence'].progress_apply(
  lambda x: shot('Предложение далее корректное?', x,
  ['Ответ: да', 'Ответ: нет']))


100%|██████████| 983/983 [01:13<00:00, 13.39it/s]


In [10]:
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.737518



In [11]:
y_pred = test['sentence'].progress_apply(
    lambda x: shot('Если ли здесь ошибка?',
    x, ['Правильное.', 'Ошибка.']))

100%|██████████| 983/983 [01:02<00:00, 15.77it/s]


In [12]:
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.851462



### Few shot

In [13]:

# 1 shots
promt = """Проверь корректность предложения:
На поверку вся теория оказалась полной чепухой. => Верно
"""
y_pred = test['sentence'].apply(lambda x: shot(promt, x, ['=> Верно', '=> Неверно']))
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.854301



In [14]:
# 2 shots
promt = """Проверь корректность предложения:
На поверку вся теория оказалась полной чепухой. => Верно
Последние пять человек пришло. => Неверно
"""
y_pred = test['sentence'].apply(lambda x: shot(promt, x, ['=> Верно', '=> Неверно']))
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.854312



In [15]:
# 4 shots
promt = """Проверь корректность предложения:
На поверку вся теория оказалась полной чепухой. Предложение корректно
Последние пять человек пришло. Предложение некорректно
Мы увидели славный город-герой. Предложение корректно
Он немедленно не ушел. Предложение некорректно
"""
y_pred = test['sentence'].apply(lambda x: shot(promt, x, ['Предложение корректно', 'Предложение некорректно']))
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')

F1-score = 0.851567



## Модель T5

In [2]:
#!pip uninstall transformers
!pip install --no-cache-dir transformers sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruT5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("ai-forever/ruT5-base")

model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [17]:
# Считаем максимальое количество токенов в предложении
max_len = 0

# For every sentence...
for sent in train['sentence']:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  48


In [18]:
class EvalDataset(Dataset):

    def __init__(self, X):
        self.text = X.reset_index(drop=True)

    def tokenize(self, text):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_len)

    def __len__(self):
        return self.text.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output)
        return {k: v.reshape(-1).to(device) for k, v in output.items()}

class TrainDataset(Dataset):

    def __init__(self, X, label):
        self.text = X.reset_index(drop=True)
        self.label = label.reset_index(drop=True)

    def tokenize(self, text, length):
        return tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=length)

    def __len__(self):
        return self.label.shape[0]

    def __getitem__(self, index):
        output = self.text[index]
        output = self.tokenize(output, max_len)
        output = {k: v.reshape(-1).to(device) for k, v in output.items()}
        label = 'верно' if self.label[index] == 1 else 'неверно'
        label = self.tokenize(label, length=2).input_ids.reshape(-1).to(device)
        output.update({'labels': label})
        return output

In [19]:
train_ds = TrainDataset(train['sentence'], train['acceptable'])
train_dataloader = DataLoader(train_ds, batch_size=32, shuffle=True)

eval_ds = TrainDataset(val['sentence'], val['acceptable'])
eval_dataloader = DataLoader(eval_ds, batch_size=32)

test_ds = EvalDataset(test['sentence'])
test_dataloader = DataLoader(test_ds, batch_size=32)

In [12]:
# Задаем optimizer и sheduler, которые помогут нам с файнтьюном.

optimizer = Adam(model.parameters(), lr=5e-6)

num_epochs = 5
total_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps
)

In [25]:
tokenizer.encode('верно')
#tokenizer.decode(2937)

[2937, 2]

In [22]:
def train_model(train_dataloader, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1} / {num_epochs} \n -------------------')
        for n_batch, batch in enumerate(train_dataloader):
            # Forward pass (скормить данные в нейросеть и пробросить вперед)
            outputs = model(**batch)
            loss = outputs.loss
            if n_batch % 50 == 0:
                loss_value, current = loss.item(), n_batch * batch['input_ids'].shape[0]
                print(f"Loss train: {loss_value:>7f}  [{current:>5d}/{len(train_ds):>5d}]")
                print('Evaluating...')
                loss_val, _ = test_model(eval_dataloader, eval=True)
                print(f"Loss test: {loss_val:>7f}\n")
            # Backward pass (backpropagation - посчитать градиенты по всем параметрам с помощью обратного распространения ошибки)
            loss.backward()
            # Обновить параметры с помощью optimizer.step()
            optimizer.step()
            lr_scheduler.step()
            # занулить градиенты предыдущего шага
            optimizer.zero_grad()

def test_model(test_dataloader, eval=False):
    model.eval()
    y_pred = np.array([])
    y_true = np.array([])
    loss = []
    for n_batch, batch in enumerate(test_dataloader):
        if not eval:
            gen_tok = model.generate(**batch)
            gen_tok = [1 if 2937 in i else 0 for i in gen_tok]  # tokenizer.decode(2937) == 'верно'
            y_true = np.hstack([y_true, gen_tok])
        else:
            outputs = model(**batch)
            loss.append(outputs.loss.item())
    if not eval:
        return y_true
    else:
        return np.sum(loss)/len(loss), y_true

In [23]:
train_model(train_dataloader, num_epochs)

Epoch 1 / 5 
 -------------------
Loss train: 14.171762  [    0/ 6295]
Evaluating...
Loss test: 16.114380

Loss train: 1.320245  [ 1600/ 6295]
Evaluating...
Loss test: 1.163514

Loss train: 0.366365  [ 3200/ 6295]
Evaluating...
Loss test: 0.353103

Loss train: 0.301297  [ 4800/ 6295]
Evaluating...
Loss test: 0.321653

Epoch 2 / 5 
 -------------------
Loss train: 0.177704  [    0/ 6295]
Evaluating...
Loss test: 0.308979

Loss train: 0.276062  [ 1600/ 6295]
Evaluating...
Loss test: 0.302700

Loss train: 0.228982  [ 3200/ 6295]
Evaluating...
Loss test: 0.300095

Loss train: 0.225308  [ 4800/ 6295]
Evaluating...
Loss test: 0.298625

Epoch 3 / 5 
 -------------------
Loss train: 0.423829  [    0/ 6295]
Evaluating...
Loss test: 0.297679

Loss train: 0.265565  [ 1600/ 6295]
Evaluating...
Loss test: 0.293199

Loss train: 0.255631  [ 3200/ 6295]
Evaluating...
Loss test: 0.295083

Loss train: 0.193955  [ 4800/ 6295]
Evaluating...
Loss test: 0.289621

Epoch 4 / 5 
 -------------------
Loss train

In [26]:
y_pred = test_model(test_dataloader, eval=False)
print(f'F1-score = {f1_score(y_pred, test["acceptable"]):>3f}\n')



F1-score = 0.857484



In [27]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score

# Calculate the MCC
mcc = matthews_corrcoef(y_pred, test["acceptable"])
accuracy = accuracy_score(y_pred, test["acceptable"])

#print('Total MCC: %.3f' % mcc)
print('Accuracy: %.3f, MCC: %.3f' % (accuracy, mcc))

Accuracy: 0.757, MCC: 0.180


# Выводы

Были рассмотрены следующие трансформенные модели:
ruBert-base,
ruRoBert-large,
RuGPT3 large,
и ruT5-base
Модель RUGPT3 была протестирована с помощью few-/zero-shot (0, 1,2,4).

Самый лучший результат был получен на модели ruRoBert-large (Accuracy: 0.836, MCC: 0.565), что совпадает со значениями с сайта https://rucola-benchmark.com/leaderboard

По удобству - самый простой вариант тюнинга с помощью few-/zero-shot для модел ruGPT3


