In [1]:
from copy import deepcopy

import torch
import torch.nn as nn

from transformers import AutoTokenizer, AutoModel

from transformers import get_cosine_schedule_with_warmup

from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from torch.optim import AdamW

import random

import numpy as np
import pandas as pd

import os

from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
from train import train_epoch, eval_epoch
from model import MegaSiameseModel, MegaSiameseModelv2
from dataset import TopG_Dataset, topg_collate

In [3]:
random_state = 777


def seed_everything(seed: int,
                    use_deterministic_algos: bool = False) -> None:
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus the current CUDA device), exports ``PYTHONHASHSEED``
    and sets PyTorch's deterministic-algorithms switch.

    Args:
        seed: Value handed to every generator.
        use_deterministic_algos: Forwarded unchanged to
            ``torch.use_deterministic_algorithms``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The individual seeding calls are independent of each other.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.use_deterministic_algorithms(use_deterministic_algos)


seed_everything(random_state)

In [4]:
def single_model(model,
                 dataset,
                 loss_function,
                 collate_fn,
                 device=torch.device("cuda"),
                 random_state: int = 69,
                 shuffle=True,
                 epochs: int = 15,
                 lr: float = 1e-6,
                 batch_size: int = 32,
                 start_epoch=0,
                 ):
    """Fine-tune ``model`` in place on the whole of ``dataset``.

    No hold-out split is made and no evaluation is run; the metrics returned
    by ``train_epoch`` are printed after every epoch.

    Args:
        model: Module exposing ``out``, ``bert1`` and ``bert2`` sub-modules;
            each gets its own optimizer parameter group.
        dataset: Dataset wrapped in a ``torch.utils.data.DataLoader``.
        loss_function: Loss module; moved to ``device`` and passed to
            ``train_epoch``.
        collate_fn: Batch collation callable for the loader.
        device: Device the model and the loss are moved to.
        random_state: Seed applied to ``random``, NumPy and PyTorch.
        shuffle: Whether the loader reshuffles the data every epoch.
        epochs: Number of epochs; also sizes the learning-rate schedule.
        lr: Accepted for backward compatibility but NOT used - the learning
            rates are fixed per parameter group below.
        batch_size: Loader batch size.
        start_epoch: Index of the first epoch that is actually trained.
            Skipped epochs do not advance the scheduler, and the schedule is
            still sized for all ``epochs``.

    Returns:
        None. ``model`` is updated in place.
    """
    random.seed(random_state)
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    loss_function.to(device)
    model.to(device)

    # The classification head gets a larger step than the two encoders.
    optimizer = AdamW(
        [
            {"params": model.out.parameters(), "lr": 1e-4},
            {"params": model.bert1.parameters(), "lr": 1e-5},
            {"params": model.bert2.parameters(), "lr": 1e-5},
        ]
    )

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )

    total_steps = len(data_loader) * epochs

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps,
    )

    # max(..., 0) keeps the original behaviour for a negative start_epoch:
    # every epoch index from 0 upwards is trained.
    for epoch_i in range(max(start_epoch, 0), epochs):
        train_metrics = train_epoch(model, data_loader, loss_function, optimizer, scheduler, device)
        print("EPOCH", epoch_i)
        print(train_metrics)

In [5]:
# Folder holding the CSV files. The trailing slash is required because file
# paths are built by plain string concatenation in the f-strings.
DATA = "data/"
# Training table; it is passed to TopG_Dataset further down the notebook.
train = pd.read_csv(f"{DATA}train.csv")

In [6]:
# Checkpoint shared by the tokenizer and both encoders.
# Alternative checkpoint noted while experimenting: "RussianNLP/ruRoBERTa-large-rucola"
MODEL_NAME = "ai-forever/ruRoberta-large"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Two separately loaded encoder instances, so their weights are not shared.
bert1 = AutoModel.from_pretrained(MODEL_NAME)
bert2 = AutoModel.from_pretrained(MODEL_NAME)
# NOTE(review): the third argument (2) is presumably the number of output
# classes - confirm against MegaSiameseModel's signature in model.py.
model = MegaSiameseModel(bert1, bert2, 2)

Some weights of the model checkpoint at ai-forever/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ai-forever/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', '

In [7]:
# Wrap the training table for the model.
# NOTE(review): 200 and 60 are presumably the context and answer token limits
# (the same numbers are passed as context_len / answer_len to predict_ensemble
# at inference time) - confirm against TopG_Dataset's signature in dataset.py.
dataset = TopG_Dataset(train, tokenizer, 200, 60)

In [8]:
def cross_validation(project_name,
                     model,
                     dataset,
                     loss_function,
                     collate_fn,
                     strat_array=None,
                     device=torch.device("cuda"),
                     random_state: int = 69,
                     shuffle: bool = True,
                     n_folds: int = 4,
                     epochs: int = 5,
                     start_fold: int = 0,
                     batch_size: int = 32,
                     iters_to_accumulate=None,
                     n_accumulated_grads: int = 0):
    """Train one fresh copy of ``model`` per fold and save each to disk.

    For every fold a deep copy of ``model`` is trained for ``epochs`` epochs
    on the training part and evaluated on the held-out part after each epoch.
    The metrics returned by ``train_epoch`` / ``eval_epoch`` are printed and
    the trained copy is written to ``models/{project_name}/fold_{fold}.pt``.
    ``model`` itself is never trained.

    Args:
        project_name: Sub-folder of ``models/`` that receives the fold files.
        model: Template module exposing ``out``, ``bert1`` and ``bert2``.
        dataset: Indexable dataset with a length.
        loss_function: Loss module; moved to ``device``.
        collate_fn: Batch collation callable for both loaders.
        strat_array: Optional labels (list, NumPy array, Series, ...) used for
            stratified splitting. ``None`` or an empty sequence selects a
            plain ``KFold``.
        device: Device the loss and each fold model are moved to.
        random_state: Seed for ``random``, NumPy, PyTorch and the splitter.
        shuffle: Whether the splitter and the training loader shuffle.
        n_folds: Number of folds.
        epochs: Epochs per fold.
        start_fold: Folds with a smaller index are skipped (resume support).
        batch_size: Batch size of both loaders.
        iters_to_accumulate: Accepted but not used by this function.
        n_accumulated_grads: Accepted but not used by this function.

    Returns:
        None.
    """
    random.seed(random_state)
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    loss_function.to(device)

    # scikit-learn raises if a random_state is given while shuffle is False.
    split_seed = random_state if shuffle else None
    # The splitters only need the number of samples, so hand them a plain
    # array instead of relying on the dataset object being accepted.
    sample_placeholder = np.zeros(len(dataset))

    # `is not None` + len() instead of truthiness: `if array:` raises for
    # NumPy arrays / pandas Series with more than one element.
    if strat_array is not None and len(strat_array) > 0:
        kfold = StratifiedKFold(n_folds, shuffle=shuffle, random_state=split_seed)
        split = kfold.split(sample_placeholder, strat_array)
    else:
        kfold = KFold(n_folds, shuffle=shuffle, random_state=split_seed)
        split = kfold.split(sample_placeholder)

    # makedirs(exist_ok=True) also creates a missing `models/` parent and does
    # not fail when resuming into an existing folder via `start_fold`.
    output_dir = f"models/{project_name}/"
    os.makedirs(output_dir, exist_ok=True)

    for fold, (train_ids, eval_ids) in enumerate(split):
        if fold < start_fold:
            continue

        print(f'FOLD {fold}')
        print('--------------------------------')

        # Fresh copy per fold, moved to the target device before its
        # parameters are registered with the optimizer.
        fold_model = deepcopy(model)
        fold_model.to(device)
        fold_optimizer = AdamW(
            [
                {"params": fold_model.out.parameters(), "lr": 3e-4},
                {"params": fold_model.bert1.parameters(), "lr": 1e-5},
                {"params": fold_model.bert2.parameters(), "lr": 1e-5},
            ]
        )

        train_subset = torch.utils.data.Subset(dataset, train_ids)
        train_loader = torch.utils.data.DataLoader(
            train_subset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn,
            drop_last=True,
        )

        eval_subset = torch.utils.data.Subset(dataset, eval_ids)
        # Evaluation is never shuffled: with drop_last=True a shuffled loader
        # would drop a different set of samples in every epoch, making the
        # per-epoch metrics incomparable.
        # NOTE(review): drop_last=True still excludes the final partial batch
        # from evaluation; kept as in the original - confirm whether
        # eval_epoch / collate_fn can handle a smaller last batch.
        eval_loader = torch.utils.data.DataLoader(
            eval_subset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=collate_fn,
            drop_last=True,
        )

        total_steps = len(train_loader) * epochs

        fold_scheduler = get_cosine_schedule_with_warmup(
            fold_optimizer,
            num_warmup_steps=10,
            num_training_steps=total_steps,
        )

        for epoch_i in range(epochs):
            train_metrics = train_epoch(fold_model, train_loader, loss_function, fold_optimizer, fold_scheduler, device)
            eval_metrics = eval_epoch(fold_model, eval_loader, loss_function, device)

            print(f"EPOCH: {epoch_i}")
            print(train_metrics)
            print(eval_metrics)

        # Saves the whole pickled module (not just a state_dict).
        torch.save(fold_model, f"{output_dir}fold_{fold}.pt")

        # Drop the references to this fold's model and optimizer state so the
        # memory can be reclaimed before the next fold's copy is created.
        del fold_model, fold_optimizer, fold_scheduler
                            

In [9]:
# cross_validation(
#     "A", 
#     model = model,
#     dataset=dataset, 
#     loss_function=nn.CrossEntropyLoss(), 
#     collate_fn=topg_collate,
#     batch_size=16,

# #             lr=3e-4,
#     epochs=5,
#     device=torch.device("cuda"),
#     random_state=69,
#     shuffle=True
# )

In [10]:
# Fine-tune the model on the full training set, seeded with the notebook-wide
# `random_state`.
single_model(
    model=model,
    dataset=dataset,
    loss_function=nn.CrossEntropyLoss(),
    collate_fn=topg_collate,
    device=torch.device("cuda"),
    random_state=random_state,
    shuffle=True,
    epochs=7,
    batch_size=16,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/385 [00:00<?, ?it/s]

  context = torch.tensor(context).squeeze()


EPOCH 0
{'Train Loss': 0.3852034061373054, 'Train Accuracy': 0.8497804999351501, 'Train F1*100': 84.44444444444444}


  0%|          | 0/385 [00:00<?, ?it/s]

EPOCH 1
{'Train Loss': 0.14922626252685275, 'Train Accuracy': 0.9681352376937866, 'Train F1*100': 96.74418604651163}


  0%|          | 0/385 [00:00<?, ?it/s]

EPOCH 2
{'Train Loss': 0.09036046719638172, 'Train Accuracy': 0.9842302203178406, 'Train F1*100': 98.39695918030078}


  0%|          | 0/385 [00:00<?, ?it/s]

EPOCH 3
{'Train Loss': 0.05897172284252071, 'Train Accuracy': 0.9904080629348755, 'Train F1*100': 99.02463217060672}


  0%|          | 0/385 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
# torch.save does not create missing parent folders, so make sure it exists.
os.makedirs("models", exist_ok=True)
# Stores the whole pickled module (not just a state_dict); the file name
# carries the seed so checkpoints from different seeds can coexist.
torch.save(model, f"models/rrl_{random_state}.pt")

In [12]:
# Public test table and the sample submission; both are passed to
# predict_ensemble further down the notebook.
test_df = pd.read_csv(f"{DATA}public_test.csv")
sub_df = pd.read_csv(f"{DATA}sample_submission.csv")
# Display the sample submission to check its layout.
sub_df

Unnamed: 0,label
0,people
1,people
2,people
3,people
4,people
...,...
1631,people
1632,people
1633,people
1634,people


In [13]:
# Seeds whose saved checkpoints make up the ensemble.
random_states = [777]

# The files contain whole pickled modules (written with torch.save(model, ...)),
# which weights-only loading cannot restore. Recent PyTorch releases default to
# weights_only=True, so the flag is set explicitly.
# SECURITY: full unpickling can execute arbitrary code - only load checkpoint
# files you created yourself or otherwise trust.
models = [
    torch.load(f"models/rrl_{rs}.pt", weights_only=False)
    for rs in random_states
]

models

[MegaSiameseModel(
   (bert1): RobertaModel(
     (embeddings): RobertaEmbeddings(
       (word_embeddings): Embedding(50265, 1024, padding_idx=1)
       (position_embeddings): Embedding(514, 1024, padding_idx=1)
       (token_type_embeddings): Embedding(1, 1024)
       (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): RobertaEncoder(
       (layer): ModuleList(
         (0-23): 24 x RobertaLayer(
           (attention): RobertaAttention(
             (self): RobertaSelfAttention(
               (query): Linear(in_features=1024, out_features=1024, bias=True)
               (key): Linear(in_features=1024, out_features=1024, bias=True)
               (value): Linear(in_features=1024, out_features=1024, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): RobertaSelfOutput(
               (dense): Linear(in_features=1024, out_features=1024, bias=True

In [14]:
# NOTE(review): `predict` is imported but not used in this cell.
from utils import predict, predict_ensemble

# Run the loaded models over the public test table.
# context_len / answer_len repeat the 200 / 60 passed to TopG_Dataset when the
# training dataset was built.
# NOTE(review): presumably fills the sample-submission frame and writes it to
# `path_to_save`, and device=None presumably lets the helper choose the device
# - confirm both in utils.py.
predict_ensemble(
    models, 
    tokenizer, 
    test_df, 
    sub_df, 
    path_to_save="submission_rrl_777_check.csv",
    context_len=200,
    answer_len=60,
    batch_size=64,
    device=None
)

  0%|          | 0/1 [00:00<?, ?it/s]