In [1]:
from pathlib import Path
from time import time

import numpy as np
import pandas as pd
import sklearn.metrics as skm
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
# Adam с исправлениями и планировщик learning rate
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

2021-12-07 01:45:12.470151: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
import numpy as np
import os

from sklearn.model_selection import train_test_split

In [3]:
RANDOM_STATE = 42
DATA_PATH = 'data'

In [4]:
# Load train and validation data
train = pd.read_csv(os.path.join(DATA_PATH, "project02_not_full_train.csv"))
train, valid = train_test_split(train, stratify=train['Label'], random_state=RANDOM_STATE, train_size=0.85, shuffle=True)

# test = pd.read_csv(os.path.join(DATA_PATH, "project02_toloka_unlabeled.csv"))
test = pd.read_csv(os.path.join(DATA_PATH, "project02_submission_file.csv"))
train.head()

Unnamed: 0,Text,Label
930,Кто написал Отверженных?,ODQA
88,Какие рецепты ты знаешь?,COOKING
97,С тобой весело,LEGEND
348,Как добраться до ашана? . Сколько это займет п...,NAVIGATE
617,Зачем Ван Гог отрезал себе ухо?,ODQA


In [6]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

def convert_comments_to_tensors(comments):
    features = []
    for comment in comments:
        # full preparation for input to BERT model, including BPE-encoding,
        # converting tokens to ids, padding, adding special tokens in the beginning and end of a sequence 
        items = tokenizer.encode_plus(
            comment, 
            max_length=100, 
            truncation=True, 
            add_special_tokens=True, 
            pad_to_max_length=True
        )
        features.append(items)

    input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
    # a mask, it has 1 - where a token exists and 0 where it's a padding index
    attention_mask = torch.tensor([f['attention_mask'] for f in features], dtype=torch.long)
    return input_ids, attention_mask


x_train = convert_comments_to_tensors(train['Text'].values)
x_val = convert_comments_to_tensors(valid['Text'].values)

mlb = LabelEncoder()
y_train = mlb.fit_transform(train['Label'].values)
y_val = mlb.transform(valid['Label'].values)

print("{}/{} - train/validation split".format(y_train.shape[0], y_val.shape[0]))

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

974/172 - train/validation split




In [7]:
x_train[0][0]

tensor([   101,  26154,  12715, 115950,   4346,    166,    102,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0])

In [8]:
x_train[1][0]

tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])

In [9]:
device = torch.device('cuda')

config = BertConfig.from_pretrained(
    'DeepPavlov/rubert-base-cased', # bert-base-uncased
    num_labels=len(mlb.classes_)
)

model = BertForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    from_tf=False,
    config=config
)
model.to(device)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [10]:
def evaluate(data):
    total_loss = 0.
    y_true = []
    y_pred = []

    model.eval()  # Set mode to evaluation to disable dropout & freeze BN
    data_loader = DataLoader(data, batch_size=batch_size)
    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs)
            total_loss += outputs[0]
            y_pred.extend(outputs[1].cpu().numpy())
            y_true.extend(batch[2].cpu().numpy())

    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    f1 = skm.f1_score(y_true, y_pred.argmax(1), average='macro')

    return {'val_f1': f1, 'val_loss': total_loss / len(data)}


In [11]:
class EarlyStopping:
    """
    Identify whether metric has not been improved for certain number of epochs
    """

    def __init__(self,
                 mode: str = 'min',
                 min_delta: float = 0,
                 patience: int = 10):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience

        self.is_better = None
        if patience == 0:
            self.is_better = lambda *_: True
        else:
            self._init_is_better(mode, min_delta)

        self.best = None
        self.num_bad_epochs = 0

    def step(self, current) -> bool:
        """
        Make decision whether to stop training

        :param current: new metric value
        :return: whether to stop
        """
        if isinstance(current, torch.Tensor):
            current = current.cpu()
        if np.isnan(current):
            return True

        if self.best is None:
            self.best = current
        else:
            if self.is_better(current, self.best):
                self.num_bad_epochs = 0
                self.best = current
            else:
                self.num_bad_epochs += 1

        if self.num_bad_epochs >= self.patience:
            return True
        else:
            return False

    def _init_is_better(self, mode, min_delta):
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')
        if mode == 'min':
            self.is_better = lambda value, best: value < best - min_delta
        if mode == 'max':
            self.is_better = lambda value, best: value > best + min_delta

In [12]:
class ModelCheckpoint:
    """Save the model after every epoch.
    `filepath` can contain named formatting options,
    which will be filled the value of `epoch` and `val_loss`.
    For example: if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`,
    then the model checkpoints will be saved with the epoch number and
    the validation loss in the filename.
    # Arguments
        model: PyTorch model object
        filepath: string, path to save the model file.
        save_best_only: if `save_best_only=True`,
            the latest best model according to
            the quantity monitored will not be overwritten.
        mode: one of {min, max}.
            If `save_best_only=True`, the decision
            to overwrite the current save file is made
            based on either the maximization or the
            minimization of the monitored quantity. For `val_acc`,
            this should be `max`, for `val_loss` this should
            be `min`, etc.
        save_weights_only: if True, then only the model's weights will be
            saved, else the full model is saved.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        filepath: str,
        mode: str = "min",
        save_best_only: bool = True,
        save_weights_only: bool = False,
    ):
        self.model = model
        self.filepath = filepath
        self.mode = mode
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.num_saves = 0

        if mode == "min":
            self.monitor_op = np.less
            self.best = np.Inf
        elif mode == "max":
            self.monitor_op = np.greater
            self.best = -np.Inf
        else:
            raise ValueError("mode " + mode + " is unknown!")

        Path(self.filepath).parent.mkdir(exist_ok=True, parents=True)

    def _save_model(self):
        if self.save_weights_only:
            torch.save(self.model.state_dict(), self.filepath)
        else:
            torch.save(self.model, self.filepath)
        self.num_saves += 1

    def step(self, current, epoch=None):
        if isinstance(current, torch.Tensor):
            current = current.cpu()
        if self.save_best_only:
            if self.monitor_op(current, self.best):
                self.best = current
                self._save_model()
        else:
            self._save_model()

In [13]:
lr = 0.0000125  # usually from 1e-5 until 8e-5
warmup_steps = 50
num_steps = 12000

optimizer = AdamW([p for p in model.parameters() if p.requires_grad],
                   lr=lr, weight_decay=0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_steps)

early_stopping = EarlyStopping(patience=8, mode='max')
model_checkpoint = ModelCheckpoint(model, 'models/rubert_base_cased_model.pt', mode="max")

In [14]:
batch_size = 16
gradient_accumulation_steps = 1
logging_steps = 100  # периодичность проверки качества модели, чтобы во время остановить обучение
max_grad_norm = 1

# стандартный pytorch код для обертки входных данных и выходных классов в загрузчик данных
train_dataset = TensorDataset(x_train[0], x_train[1], torch.LongTensor(y_train))
val_dataset = TensorDataset(x_val[0], x_val[1], torch.LongTensor(y_val))
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=True)

num_train_epochs = num_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
print('Count of epochs: %s' % num_train_epochs)

for _ in range(num_train_epochs):
    for step, batch in enumerate(train_dataloader):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        outputs = model(**inputs) # model outputs are tuple: (loss, logits)
        loss = outputs[0]

        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        tr_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            # Log metrics
            if global_step % logging_steps == 0:
                results = evaluate(val_dataset)
                results.update({'train_loss': (tr_loss - logging_loss) / logging_steps})
                print('Step {:3}, {}'.format(global_step, ' '.join(['{}: {:<6.4f}'.format(k, v) for k, v in
                                                                  results.items()])))
                logging_loss = tr_loss

                # Saving model checkpoint here if we have improvement
                model_checkpoint.step(results["val_f1"])

                if early_stopping.step(results['val_f1']):
                    global_step = num_steps + 1
                    print('Early training stopping!')
                    break

    if global_step > num_steps:
        break

Count of epochs: 201
Step 100, val_f1: 0.1858 val_loss: 0.1448 train_loss: 2.9354
Step 200, val_f1: 0.3852 val_loss: 0.0916 train_loss: 1.8113
Step 300, val_f1: 0.5372 val_loss: 0.0609 train_loss: 1.0425
Step 400, val_f1: 0.6606 val_loss: 0.0473 train_loss: 0.5741
Step 500, val_f1: 0.7028 val_loss: 0.0447 train_loss: 0.3265
Step 600, val_f1: 0.8386 val_loss: 0.0358 train_loss: 0.1666
Step 700, val_f1: 0.7399 val_loss: 0.0398 train_loss: 0.0886
Step 800, val_f1: 0.8382 val_loss: 0.0410 train_loss: 0.0514
Step 900, val_f1: 0.8462 val_loss: 0.0388 train_loss: 0.0321
Step 1000, val_f1: 0.8550 val_loss: 0.0388 train_loss: 0.0259
Step 1100, val_f1: 0.8460 val_loss: 0.0411 train_loss: 0.0186
Step 1200, val_f1: 0.8089 val_loss: 0.0399 train_loss: 0.0164
Step 1300, val_f1: 0.8460 val_loss: 0.0419 train_loss: 0.0120
Step 1400, val_f1: 0.8573 val_loss: 0.0417 train_loss: 0.0104
Step 1500, val_f1: 0.9008 val_loss: 0.0428 train_loss: 0.0085
Step 1600, val_f1: 0.8957 val_loss: 0.0436 train_loss: 0.0

In [16]:
x_test = convert_comments_to_tensors(test['Text'].values)
test_dataset = TensorDataset(x_test[0], x_test[1])
test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

In [17]:
y_pred = []

model.eval()  # Set mode to evaluation to disable dropout & freeze BN

with torch.no_grad():
    for step, batch in enumerate(test_data_loader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
        }
        outputs = model(**inputs)
        y_pred.extend(mlb.inverse_transform(outputs[0].argmax(axis=1).to('cpu').numpy()))
test['Label'] = np.asarray(y_pred)

In [18]:
test.to_csv('dmitry.ivashnikov_project02.csv', index=False)

In [19]:
!curl --user upload:newprolabupload -T dmitry.ivashnikov_project02.csv 'http://de.newprolab.com/upload/' -vvv

*   Trying 85.192.32.238:80...
* Connected to de.newprolab.com (85.192.32.238) port 80 (#0)
* Server auth using Basic with user 'upload'
> PUT /upload/dmitry.ivashnikov_project02.csv HTTP/1.1
> Host: de.newprolab.com
> Authorization: Basic dXBsb2FkOm5ld3Byb2xhYnVwbG9hZA==
> User-Agent: curl/7.71.1
> Accept: */*
> Content-Length: 20361
> Expect: 100-continue
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 100 Continue
* We are completely uploaded and fine
* Mark bundle as not supporting multiuse
< HTTP/1.1 204 No Content
< Server: nginx/1.10.3 (Ubuntu)
< Date: Mon, 06 Dec 2021 22:58:51 GMT
< Connection: keep-alive
< 
* Connection #0 to host de.newprolab.com left intact
