In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline, RobertaForSequenceClassification, RobertaTokenizer

In [7]:
model = RobertaForSequenceClassification.from_pretrained("youscan/ukr-roberta-base", num_labels=7)
tokenizer = RobertaTokenizer.from_pretrained("youscan/ukr-roberta-base")

Some weights of the model checkpoint at youscan/ukr-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at youscan/ukr-roberta-base and are newly ini

In [8]:
train = pd.read_csv('data/train.csv').set_index('Id')
test = pd.read_csv('data/test_without_target.csv').set_index('Id')
train, val = train_test_split(train, test_size=0.25, stratify=train['source'], random_state=0)

In [9]:
train_text_tokenized = tokenizer(train['text'].tolist(), return_tensors='pt', padding=True, truncation=True)
val_text_tokenized = tokenizer(val['text'].tolist(), return_tensors='pt', padding=True, truncation=True)

In [6]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)

In [7]:
import torch

train_labels = torch.tensor(train['source'].values).long()
val_labels = torch.tensor(val['source'].values).long()

In [24]:
from copy import deepcopy
from tqdm import tqdm

def train_model(model,
                optimizer: torch.optim.Optimizer,
                num_epochs: int = 3,
                device: str = 'cuda:0',
                batch_size: int = 8):
    device = torch.device(device)
    model.to(device)
    
    best_model_wts = deepcopy(model.state_dict())
    best_acc = 0.0

    losses = [[], []]
    accuracies = [[], []]

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'test']:
            training = phase == 'train'
            if training:
                model.train()
                inputs = train_text_tokenized
                labels = train_labels
                num_samples = len(labels)
                order = torch.randperm(num_samples)
            else:
                model.eval()
                inputs = val_text_tokenized
                labels = val_labels
                num_samples = len(labels)
                order = torch.arange(num_samples)

            running_loss = 0.0
            running_corrects = 0

            for curr_ind in tqdm(range(0, num_samples, batch_size)):
                
                curr_order = order[curr_ind:curr_ind+batch_size]
                input_ids = inputs['input_ids'][curr_order]
                attention_mask = inputs['attention_mask'][curr_order]
                curr_labels = labels[curr_order]
                
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                curr_labels = curr_labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(training):
                    outputs = model(input_ids, attention_mask=attention_mask, labels=curr_labels)
                    loss = outputs.loss
                    
                    if training:
                        loss.backward()
                        optimizer.step()

                preds = torch.argmax(outputs.logits, dim=1)
                
                corrects = torch.sum(preds == curr_labels)

                running_loss += loss.item() * batch_size
                running_corrects += corrects.item()

                losses[1-int(training)].append(loss.item())
                accuracies[1-int(training)].append(corrects / batch_size)
            
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if not training and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = deepcopy(model.state_dict())

        print()

    print('Best test Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    model.cpu()

    return model, losses, accuracies

In [25]:
model, losses, accuracies = train_model(model, optimizer, num_epochs=2)

  0%|                                                                                         | 0/1517 [00:00<?, ?it/s]

Epoch 0/2
----------


100%|██████████████████████████████████████████████████████████████████████████████| 1517/1517 [17:33<00:00,  1.44it/s]
  0%|                                                                                          | 0/506 [00:00<?, ?it/s]

train Loss: 0.4141 Acc: 0.8650


100%|████████████████████████████████████████████████████████████████████████████████| 506/506 [01:57<00:00,  4.32it/s]
  0%|                                                                                         | 0/1517 [00:00<?, ?it/s]

test Loss: 0.1779 Acc: 0.9464

Epoch 1/2
----------


100%|██████████████████████████████████████████████████████████████████████████████| 1517/1517 [17:36<00:00,  1.44it/s]
  0%|                                                                                          | 0/506 [00:00<?, ?it/s]

train Loss: 0.1260 Acc: 0.9622


100%|████████████████████████████████████████████████████████████████████████████████| 506/506 [01:57<00:00,  4.32it/s]
  0%|                                                                                         | 0/1517 [00:00<?, ?it/s]

test Loss: 0.1730 Acc: 0.9520

Epoch 2/2
----------


100%|██████████████████████████████████████████████████████████████████████████████| 1517/1517 [17:36<00:00,  1.44it/s]
  0%|                                                                                          | 0/506 [00:00<?, ?it/s]

train Loss: 0.0825 Acc: 0.9749


100%|████████████████████████████████████████████████████████████████████████████████| 506/506 [01:57<00:00,  4.32it/s]


test Loss: 0.1398 Acc: 0.9567

Best test Acc: 0.956737


In [11]:
model.cuda().eval()
pred_probs = []

for i in range(0, len(val_labels), 64):
    inputs = val_text_tokenized['input_ids'][i:i+64].cuda()
    attention = val_text_tokenized['attention_mask'][i:i+64].cuda()
    labels = val_labels[i:i+64].cuda()
    with torch.set_grad_enabled(False):
        outputs = model(inputs, attention_mask=attention, labels=labels)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1).tolist()
    pred_probs.extend(probs)

model.cpu()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [19]:
test_text_tokenized = tokenizer(test['text'].tolist(), return_tensors='pt', padding=True, truncation=True)

In [21]:
model.cuda().eval()
pred_test_probs = []

for i in range(0, test_text_tokenized['input_ids'].shape[0], 64):
    inputs = test_text_tokenized['input_ids'][i:i+64].cuda()
    attention = test_text_tokenized['attention_mask'][i:i+64].cuda()
    with torch.set_grad_enabled(False):
        outputs = model(inputs, attention_mask=attention)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1).tolist()
    pred_test_probs.extend(probs)

model.cpu()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [22]:
df = pd.DataFrame({"Id": test.index, "Predicted": np.argmax(pred_test_probs, axis=1)})
df = df.set_index('Id')
df.to_csv('submission.csv')

In [None]:
# второе решение (лучшее) - этот же код, но обучение на всех данных