In [47]:
import random
import sys
import os

from joblib import load
from copy import copy
import seaborn as sns

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, Subset
from transformers import AutoModelForTokenClassification, BertTokenizerFast, BertConfig, AutoModel, AutoTokenizer, RobertaTokenizerFast
from transformers import get_cosine_schedule_with_warmup

In [30]:
from dataclasses import dataclass


@dataclass
class TrainingConfig:
    """Hyper-parameters and runtime settings shared by the notebook's cells.

    Every attribute is annotated so that ``@dataclass`` registers it as a
    field: the generated ``__init__`` accepts keyword overrides (for example
    ``TrainingConfig(epochs=3)``) and ``repr``/``==`` include every value.
    Without annotations the decorator generates no fields at all.
    """

    # Model
    model_name: str = "ai-forever/ruRoberta-large"

    # Training
    batch_size: int = 256
    epochs: int = 20
    learning_rate: float = 5e-5
    lr_warmup_steps: int = 500

    # Accelerator
    gradient_accumulation_steps: int = 1
    mixed_precision: str = 'fp16'  # `no` for float32, `fp16` for automatic mixed precision

    device: str = "cuda"
    random_state: int = 42


config = TrainingConfig()

In [31]:
def seed_everything(seed: int,
                    use_deterministic_algos: bool = False) -> None:
    """Seed every random number generator the notebook uses.

    Args:
        seed: Value handed to Python's ``random``, NumPy and PyTorch (CPU
            generator and every CUDA device).
        use_deterministic_algos: Forwarded to
            ``torch.use_deterministic_algorithms``. When true, the cuBLAS
            workspace variable is also set (unless already defined) and cuDNN
            autotuning is switched off.
    """
    # PYTHONHASHSEED is read at interpreter start-up, so this only reaches
    # processes launched after this call, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one; safe to call without a GPU.
    torch.cuda.manual_seed_all(seed)

    if use_deterministic_algos:
        # Deterministic cuBLAS kernels require a fixed workspace configuration.
        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
        # Autotuning may select a different convolution algorithm on each run.
        torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(use_deterministic_algos)
    

# Seed the RNGs with the value stored in the shared config.
seed_everything(config.random_state)

In [42]:
# Load the checkpoint named in `config.model_name` with a token-classification
# head sized for 12 labels.
# NOTE(review): `train_epoch`/`val_epoch` pass `batch["relevance"]` straight to
# the criterion together with `.logits`; presumably `relevance` is one label per
# example, which would not line up with a per-token head — confirm which task
# this model is meant for.
model = AutoModelForTokenClassification.from_pretrained(
    config.model_name,
    num_labels=12
)
# Last expression of the cell: switches to eval mode and displays the module.
# `train_epoch` calls `model.train()` before optimising.
model.eval()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
    

In [43]:
# NOTE(review): `AutoModelForSeq2SeqLM` is imported but not used in the visible cells.
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig, TaskType

# LoRA adapter settings: rank 8, alpha 32, dropout 0.1, declared for a
# token-classification task and built in training (non-inference) mode.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

# Rebinds `model` to the PEFT-wrapped model. Re-running this cell passes the
# already-wrapped model to `get_peft_model` again, so re-run the model-loading
# cell first.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 798,732 || all params: 355,121,176 || trainable%: 0.22491815582408412


In [48]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = RobertaTokenizerFast.from_pretrained(config.model_name)

In [8]:
from datasets import load_dataset, Dataset
import pandas as pd

# NOTE(review): `dataset` is read here before any visible cell assigns it, and
# `load_dataset` is imported but never called in the visible cells — presumably
# the loading step was removed. Restore it so the notebook runs on a fresh kernel.
df = pd.DataFrame(dataset)
# Drop rows containing any missing value, then rebuild a `Dataset` from the frame.
# `dataset` is rebound, so re-running this cell starts from the filtered data.
df = df.dropna()
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['question', 'answer', 'relevance', '__index_level_0__'],
    num_rows: 2476083
})

In [9]:
def _encode_pairs(sample):
    """Tokenize a batch of (question, answer) pairs, truncated and padded to 256 tokens."""
    return tokenizer(
        sample['question'],
        sample['answer'],
        truncation=True,
        padding='max_length',
        max_length=256,
    )


# Batched map: `_encode_pairs` receives whole column slices rather than single rows.
encoded_dataset = dataset.map(_encode_pairs, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 2476083/2476083 [02:54<00:00, 14155.57 examples/s]


In [10]:
# Inspect which columns the first encoded example carries.
list(encoded_dataset[0].keys())

['question',
 'answer',
 'relevance',
 '__index_level_0__',
 'input_ids',
 'token_type_ids',
 'attention_mask']

In [11]:
# Drop the raw-text columns and the index column, keeping the label and the
# tokenizer outputs. Only columns that are still present are removed, so the
# cell can be re-run after they are gone without raising.
_columns_to_drop = [
    column
    for column in ('question', 'answer', '__index_level_0__')
    if column in encoded_dataset.column_names
]
if _columns_to_drop:
    encoded_dataset = encoded_dataset.remove_columns(_columns_to_drop)

list(encoded_dataset[0].keys())

['relevance', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
# Request torch-formatted output restricted to the four listed columns.
# NOTE(review): this fails if the tokenizer in use did not emit `token_type_ids` — confirm.
encoded_dataset.set_format(type='torch', columns=['relevance', 'input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
# Hold out 20% of the example indices for validation. The split is tied to
# `config.random_state` so it no longer depends on the state of the global
# NumPy RNG at the moment the cell is run.
train_inds, test_inds = train_test_split(
    list(range(len(encoded_dataset))),
    test_size=0.2,
    random_state=config.random_state,
)

# NOTE(review): the batch size is hard-coded to 76 while `config.batch_size`
# is 256 — confirm which one is intended before unifying them.
train_dataloader = DataLoader(
    Subset(encoded_dataset, train_inds),
    batch_size=76,
    shuffle=True
)

val_dataloader = DataLoader(
    Subset(encoded_dataset, test_inds),
    batch_size=76,
    shuffle=False
)

In [14]:
from torchmetrics.classification import BinaryAUROC

# The original import statement had no imported name (a syntax error) and
# `ROCAUC` was an empty tuple.
# NOTE(review): assumes `relevance` is a 0/1 label — TODO confirm; a
# multi-class target would need `MulticlassAUROC(num_classes=...)` instead.
ROCAUC = BinaryAUROC()

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim


def train_epoch(model, dataloader, optimizer, scheduler, criterion):
    """Run one optimisation pass over ``dataloader`` and report the mean loss.

    Args:
        model: Module called with keyword tensors; its output must expose
            ``.logits``.
        dataloader: Sized iterable of dict batches holding ``input_ids``,
            ``attention_mask``, ``relevance`` and, optionally,
            ``token_type_ids``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.

    Returns:
        ``{"Loss": mean per-batch loss}``; the mean is NaN for an empty
        dataloader.
    """
    model.train()
    # Follow the model's own device rather than a notebook-level global.
    device = next(model.parameters()).device

    total_loss = 0.0
    for batch in tqdm(dataloader):
        # `token_type_ids` is forwarded only when the batch provides it.
        inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "token_type_ids")
            if key in batch
        }
        labels = batch["relevance"].to(device)

        optimizer.zero_grad()
        logits = model(**inputs).logits

        loss = criterion(logits, labels)
        total_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    num_batches = len(dataloader)
    mean_loss = total_loss / num_batches if num_batches else float("nan")
    return {"Loss": mean_loss}



def val_epoch(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called with keyword tensors; its output must expose
            ``.logits``.
        dataloader: Sized iterable of dict batches holding ``input_ids``,
            ``attention_mask``, ``relevance`` and, optionally,
            ``token_type_ids``.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.

    Returns:
        ``{"Loss": mean per-batch loss}``; the mean is NaN for an empty
        dataloader.
    """
    model.eval()
    # Follow the model's own device rather than a notebook-level global.
    device = next(model.parameters()).device

    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # `token_type_ids` is forwarded only when the batch provides it.
            inputs = {
                key: batch[key].to(device)
                for key in ("input_ids", "attention_mask", "token_type_ids")
                if key in batch
            }
            labels = batch["relevance"].to(device)

            logits = model(**inputs).logits
            total_loss += criterion(logits, labels).item()

    num_batches = len(dataloader)
    mean_loss = total_loss / num_batches if num_batches else float("nan")
    return {"Loss": mean_loss}


def train_loop(
    project_name,
    model,
    epochs,
    train_dataloader,
    test_dataloader,
    optimizer,
    scheduler,
    criterion
):
    """Alternate training and validation epochs, checkpointing after each one.

    Args:
        project_name: Directory that receives ``<epoch>.pt`` checkpoints. It
            is created when missing; an empty string means the current
            working directory.
        model: Model to train; the whole module object is saved each epoch.
        epochs: Number of train/validate rounds.
        train_dataloader: Batches passed to ``train_epoch``.
        test_dataloader: Batches passed to ``val_epoch``.
        optimizer: Optimizer forwarded to ``train_epoch``.
        scheduler: Learning-rate scheduler forwarded to ``train_epoch``.
        criterion: Loss callable forwarded to both epoch functions.
    """
    from pathlib import Path

    # Formatting f"{project_name}/{i}.pt" with an empty name yields "/0.pt",
    # i.e. the filesystem root; Path("") resolves to "." instead.
    checkpoint_dir = Path(project_name)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    for epoch in range(epochs):
        train_metrics = train_epoch(model, train_dataloader, optimizer, scheduler, criterion)
        val_metrics = val_epoch(model, test_dataloader, criterion)

        print(train_metrics, val_metrics)

        # Saves the full module object (not only a state dict), as before.
        torch.save(model, checkpoint_dir / f"{epoch}.pt")

In [16]:
from madgrad import MADGRAD

# Single source of truth for the epoch count (previously a duplicated literal 20).
epochs = config.epochs

# Hand the optimizer only the parameters that require gradients.
optimizer = MADGRAD([
    {
        "params": [p for p in model.parameters() if p.requires_grad],
        "lr": config.learning_rate,
    },
])

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = int(len(train_dataloader) * epochs)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.lr_warmup_steps,  # was misspelled `cofig`, a NameError
    num_training_steps=total_steps,
)

train_loop(
    # "." keeps checkpoints in the working directory; an empty name made the
    # checkpoint path start at the filesystem root.
    project_name=".",
    model=model,
    epochs=epochs,
    train_dataloader=train_dataloader,
    test_dataloader=val_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=nn.CrossEntropyLoss(),
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [1:56:39<00:00,  3.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [10:42<00:00, 10.14it/s]


{'ROCAUC': tensor(0.7484), 'Loss': 0.5820043456033029} {'ROCAUC': tensor(0.7690), 'Loss': 0.5696628986165735}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [1:57:44<00:00,  3.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [10:45<00:00, 10.09it/s]


{'ROCAUC': tensor(0.7757), 'Loss': 0.5572885225995955} {'ROCAUC': tensor(0.7905), 'Loss': 0.54619039610904}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [1:55:01<00:00,  3.78it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [09:34<00:00, 11.34it/s]


{'ROCAUC': tensor(0.8260), 'Loss': 0.5038450323406849} {'ROCAUC': tensor(0.8565), 'Loss': 0.4671186056466525}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26065/26065 [2:00:14<00:00,  3.61it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6517/6517 [10:43<00:00, 10.13it/s]


{'ROCAUC': tensor(0.8759), 'Loss': 0.43368750158550873} {'ROCAUC': tensor(0.8766), 'Loss': 0.4372068587116107}


  7%|███████▉                                                                                                              | 1741/26065 [07:46<1:48:43,  3.73it/s]


KeyboardInterrupt: 

In [51]:
# Print each name on its own line (duplicates such as 'city' and 'language' are kept).
print(
    'resume',
    'resume_id',
    'first_name',
    'last_name',
    'middle_name',
    'birth_date',
    'birth_date_year_only',
    'country',
    'city',
    'about',
    'key_skills',
    'salary_expectations_amount',
    'salary_expectations_currency',
    'photo_path',
    'gender',
    'language',
    'resume_name',
    'source_link',
    'contactItems',
    'resume_contact_item_id',
    'value',
    'comment',
    'contact_type',
    'educationItems',
    'resume_education_item_id',
    'year',
    'organization',
    'faculty',
    'specialty',
    'result',
    'education_type',
    'education_level',
    'experienceItems',
    'resume_experience_item_id',
    'starts',
    'ends',
    'employer',
    'city',
    'url',
    'position',
    'description',
    'order',
    'languageItems',
    'resume_language_item_id',
    'language',
    'language_level',
    'O',
    sep="\n",
)

resume
resume_id
first_name
last_name
middle_name
birth_date
birth_date_year_only
country
city
about
key_skills
salary_expectations_amount
salary_expectations_currency
photo_path
gender
language
resume_name
source_link
contactItems
resume_contact_item_id
value
comment
contact_type
educationItems
resume_education_item_id
year
organization
faculty
specialty
result
education_type
education_level
experienceItems
resume_experience_item_id
starts
ends
employer
city
url
position
description
order
languageItems
resume_language_item_id
language
language_level
O
