In [None]:
# Mount Google Drive at /content/drive; the project paths used by this notebook
# (input data, fine-tuned checkpoints, outputs) all live underneath it.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install datasets
!pip install sentencepiece
!pip install transformers==4.21.2
!pip install tokenizers==0.12.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/469.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m235.5/469.0 KB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!nvidia-smi

Tue Mar  7 13:43:01 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os

# Root of the competition workspace on the mounted Google Drive.
DIR = "/content/drive/MyDrive/Competitions/Kaggle/LECR"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
# Fine-tuned encoder checkpoints live under FINETUNE; this notebook reads one of them.
OUTPUT_FINETUNE = os.path.join(OUTPUT_DIR,"FINETUNE")
# Everything this experiment writes (logs, config, weights, OOF predictions) goes under EXP006.
OUTPUT_MODELS = os.path.join(OUTPUT_DIR,"OUTPUT_MODELS")
OUTPUT_MODELS_EXP = os.path.join(OUTPUT_MODELS,"EXP006")
USE_MODEL_DIR = os.path.join(OUTPUT_FINETUNE,"003_paraphrase-multilingual-mpnet-base-v2")
CUSTOM_MODEL_DIR = os.path.join(USE_MODEL_DIR,"paraphrase-multilingual-mpnet-base-v2_fold0_epochs20")

# exist_ok=True replaces the check-then-create pattern: it is race-free and
# keeps the cell safe to re-run.
os.makedirs(OUTPUT_FINETUNE, exist_ok=True)
os.makedirs(OUTPUT_MODELS, exist_ok=True)
os.makedirs(OUTPUT_MODELS_EXP, exist_ok=True)

In [None]:
# =========================================================================================
# Libraries
# =========================================================================================
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold, KFold
%env TOKENIZERS_PARALLELISM=true
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [None]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Experiment settings, read as plain class attributes by the rest of the notebook."""

    # ---- run control -------------------------------------------------------
    debug = False                     # True: train 1 epoch on a 1000-row sample
    train = True                      # gate for the main training cell
    seed = 42
    n_folds = 5
    trn_fold = [0]                    # folds that are actually trained
    print_freq = 500                  # progress line every N batches
    num_workers = 4                   # DataLoader worker processes

    # ---- model / data ------------------------------------------------------
    model = CUSTOM_MODEL_DIR          # checkpoint directory used for tokenizer and encoder
    model_name = "multilingual-mpnet-base-v2"   # used in the saved weight file name
    gradient_checkpointing = False
    target_size = 1                   # output width of the linear head
    target_cols = 'target'
    max_len = 512                     # tokenizer max_length

    # ---- optimisation ------------------------------------------------------
    apex = True                       # enables torch.cuda.amp autocast + GradScaler in training
    epochs = 3
    batch_size = 32
    gradient_accumulation_steps = 1
    encoder_lr = 1e-5
    decoder_lr = 1e-4                 # learning rate of the head (non-encoder parameters)
    weight_decay = 0.01
    eps = 1e-6
    betas = (0.9, 0.999)
    # NOTE(review): the logged gradient norms are roughly 0.5-7, so a clip value of
    # 0.012 rescales nearly every update - confirm this is intended.
    max_grad_norm = 0.012

    # ---- learning-rate schedule --------------------------------------------
    scheduler = 'cosine'              # ['linear', 'cosine','polynomial']
    batch_scheduler = True            # step the scheduler after every optimizer step
    num_cycles = 0.5
    warmup_ratio = 0.1


# Debug runs only need a single pass to smoke-test the pipeline.
if CFG.debug:
    CFG.epochs = 1

In [None]:
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed every random number generator the notebook uses from ``cfg.seed``.

    Covers Python's ``random``, NumPy, PyTorch (CPU and current CUDA device),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic kernels.
    """
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(CFG)
    
    
def get_logger(filename=OUTPUT_MODELS_EXP+'/train'):
    """Return the notebook logger, writing bare messages to the console and to a file.

    Args:
        filename: Path prefix of the log file; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` named after this module, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would stack another pair of handlers and duplicate every line. Drop the
    # old ones (closing their file handles) before attaching fresh handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # The notebook's root logger has its own handler, which re-emits each record
    # as "INFO:__main__:..." next to our own output; keep records local instead.
    logger.propagate = False
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

In [None]:
# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two Series of space-separated id strings.

    Each row is scored as ``tp / (tp + 0.2 * fp + 0.8 * fn)`` on the sets of
    ids, and the row scores are averaged.

    Args:
        y_true: pandas Series of ground-truth id strings.
        y_pred: pandas Series of predicted id strings, aligned by position.

    Returns:
        The mean F2 rounded to 4 decimals. A row where both sides are empty
        has a zero denominator and is scored 0.0 instead of NaN, so a single
        such row no longer turns the whole mean into NaN.
    """
    def _to_id_set(ids):
        # Non-string entries (e.g. NaN produced by a left merge) count as "no ids".
        return set(ids.split()) if isinstance(ids, str) else set()

    true_sets = y_true.apply(_to_id_set)
    pred_sets = y_pred.apply(_to_id_set)
    tp = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float)
    fp = np.array([len(p - t) for t, p in zip(true_sets, pred_sets)], dtype=float)
    fn = np.array([len(t - p) for t, p in zip(true_sets, pred_sets)], dtype=float)
    denominator = tp + 0.2 * fp + 0.8 * fn
    # Only divide where the denominator is non-zero; other rows keep the 0.0 default.
    f2 = np.divide(tp, denominator, out=np.zeros_like(denominator), where=denominator > 0)
    return round(f2.mean(), 4)



# =========================================================================================
# Get best threshold
# =========================================================================================
def get_best_threshold(x_val, val_predictions, correlations, thresholds=None):
    """Sweep probability thresholds and return the one with the best F2 score.

    For every threshold, rows of ``x_val`` whose prediction exceeds it are
    grouped into one space-separated content-id string per topic; topics with
    no selected row get an empty prediction. The result is merged with
    ``correlations`` on ``topic_id`` and scored with ``f2_score``.

    Args:
        x_val: DataFrame with ``topics_ids`` and ``content_ids`` columns, one
            row per candidate pair. It is not modified.
        val_predictions: Array-like of scores aligned by position with ``x_val``.
        correlations: DataFrame with ``topic_id`` and ``content_ids`` columns.
        thresholds: Optional iterable of thresholds to try. Defaults to
            ``np.arange(0.001, 0.1, 0.001)``.

    Returns:
        ``(best_score, best_threshold)``. The first threshold tried is the
        fallback, so the threshold is ``None`` only when ``thresholds`` is empty.
    """
    if thresholds is None:
        thresholds = np.arange(0.001, 0.1, 0.001)
    # Positional alignment, independent of whatever index the caller's data carries.
    val_predictions = np.asarray(val_predictions)
    # Loop-invariant: the complete set of topics that must appear in the result.
    all_topics = pd.Series(x_val['topics_ids'].unique())
    best_score = 0
    best_threshold = None
    for thres in thresholds:
        selected = x_val.loc[val_predictions > thres, ['topics_ids', 'content_ids']]
        predicted = selected.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        predicted['content_ids'] = predicted['content_ids'].apply(lambda x: ' '.join(x))
        predicted.columns = ['topic_id', 'predictions']
        # Topics without any row above the threshold are scored with an empty prediction.
        unpredicted = all_topics[~all_topics.isin(predicted['topic_id'])]
        unpredicted = pd.DataFrame({'topic_id': unpredicted.values, 'predictions': ""})
        result = pd.concat([predicted, unpredicted], axis = 0, ignore_index = True)
        result = result.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(result['content_ids'], result['predictions'])
        if best_threshold is None or score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold

In [None]:
# =========================================================================================
# Data Loading
# =========================================================================================
# Both inputs are read relative to INPUT_DIR so the location is configured in one place.
train = pd.read_csv(os.path.join(INPUT_DIR, 'xlm-roberta-base_train.csv'))
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated in recent pandas and
# does not update the frame when copy-on-write is enabled.
train['title1'] = train['title1'].fillna("Title does not exist")
train['title2'] = train['title2'].fillna("Title does not exist")
correlations = pd.read_csv(os.path.join(INPUT_DIR,'correlations.csv'))
# Create feature column
# NOTE(review): the logged model config is XLM-RoBERTa, so the literal '[SEP]' is
# presumably tokenized as ordinary text rather than as a separator token -
# confirm it matches the format used when the encoder was fine-tuned.
train['text'] = train['title1'] + '[SEP]' + train['title2']
print(' ')
print('-' * 50)
print(f"train.shape: {train.shape}")
print(f"correlations.shape: {correlations.shape}")
display(train.head(3))
display(correlations.head(3))

 
--------------------------------------------------
train.shape: (3088094, 9)
correlations.shape: (61517, 2)


Unnamed: 0,topics_ids,content_ids,title1,title2,topic_language,content_language,target,fold,text
0,t_30dd476279c8,c_3088e6cf9846,Medicine<|=t_sep=|>Description does not existM...,Addictive and Dangerous Drugs Part 2<|=t_sep=|...,en,fil,0,0,Medicine<|=t_sep=|>Description does not existM...
1,t_30dd476279c8,c_3f7e1fb5de50,Medicine<|=t_sep=|>Description does not existM...,HIV and drug therapy<|=t_sep=|>Description doe...,en,en,0,0,Medicine<|=t_sep=|>Description does not existM...
2,t_30dd476279c8,c_459fb30269cf,Medicine<|=t_sep=|>Description does not existM...,Emergency Medicine Questions 1-5<|=t_sep=|>Des...,en,en,0,0,Medicine<|=t_sep=|>Description does not existM...


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99


In [None]:
# =========================================================================================
# CV split
# =========================================================================================
# The fold assignment is read from the CSV; make sure it is an integer column.
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())


if CFG.debug:
    # Debug runs use a small random sample (capped at the frame size so sampling
    # cannot fail on tiny inputs). The full fold sizes were already shown above,
    # so only the sampled sizes are displayed here.
    train = train.sample(n=min(1000, len(train)), random_state=CFG.seed).reset_index(drop=True)
    display(train.groupby('fold').size())

fold
0    615200
1    618306
2    618371
3    618601
4    617616
dtype: int64

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer stored with the checkpoint and publish it on CFG so the
# dataset code can reach it through the config object.
CFG.tokenizer = tokenizer = AutoTokenizer.from_pretrained(CFG.model)
#tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')

In [None]:
# =========================================================================================
# Get max length
# =========================================================================================
#lengths = []
#for text in tqdm(train['text'].fillna("").values, total = len(train)):
#    length = len(CFG.tokenizer(text, add_special_tokens = False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 3 # cls & sep
#print(f"max_len: {CFG.max_len}")

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(text, cfg):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        text: The string to encode.
        cfg: Config object providing ``tokenizer`` and ``max_len``.

    Returns:
        The tokenizer's output mapping (e.g. ``input_ids``, ``attention_mask``)
        with every value converted to a 1-D ``torch.long`` tensor, padded or
        truncated to ``cfg.max_len``.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        # `pad_to_max_length=True` is deprecated; this is the documented equivalent.
        padding = "max_length",
        truncation = True
    )
    # Values are replaced in place so the tokenizer's mapping type is preserved.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

class custom_dataset(Dataset):
    """Map-style dataset yielding ``(tokenized_text, float_label)`` pairs.

    Reads the ``text`` and ``target`` columns of ``df`` once at construction;
    tokenization happens lazily, per item, via ``prepare_input``.
    """

    def __init__(self, df, cfg):
        self.cfg = cfg
        self.texts, self.labels = df['text'].values, df['target'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        encoded = prepare_input(self.texts[item], self.cfg)
        target = torch.tensor(self.labels[item], dtype = torch.float)
        return encoded, target
    
def collate(inputs):
    """Trim a padded batch to its longest real sequence.

    The longest sequence is the largest row sum of ``attention_mask``; every
    tensor in ``inputs`` is cut to that many columns. The mapping is updated
    in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [None]:
# ====================================================
# Model
# ====================================================

class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, counting only unmasked tokens."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask to the embedding shape so masked tokens contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # The clamp keeps a fully masked row from dividing by zero.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return token_sum / token_count
    
class MaxPooling(nn.Module):
    """Element-wise maximum over the sequence axis, ignoring masked tokens."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Masked positions are pushed to -1e4 on a copy so they never win the max;
        # the caller's tensor is left untouched.
        masked_states = last_hidden_state.masked_fill(mask == 0, -1e4)
        return masked_states.max(dim=1).values
    

    
class custom_model(nn.Module):
    """Transformer encoder followed by masked mean pooling and one linear head.

    Args:
        cfg: Config object; ``model``, ``gradient_checkpointing`` and
            ``target_size`` are read here.
        config_path: Optional path to a config saved with ``torch.save``. When
            ``None`` the config is loaded from ``cfg.model`` with all dropout
            probabilities set to 0.
        pretrained: Load encoder weights from ``cfg.model`` when True, otherwise
            build the encoder from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            # torch.load unpickles the file - only point this at trusted paths.
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
            for dropout_attr in ("hidden_dropout",
                                 "hidden_dropout_prob",
                                 "attention_dropout",
                                 "attention_probs_dropout_prob"):
                setattr(self.config, dropout_attr, 0.0)
            LOGGER.info(self.config)

        self.model = (
            AutoModel.from_pretrained(cfg.model, config = self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` the way the encoder config prescribes, by module type."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def feature(self, inputs):
        """Return the pooled sentence embedding for a batch of tokenized inputs."""
        hidden_states = self.model(**inputs).last_hidden_state
        return self.pool(hidden_states, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the head's raw outputs (logits) for a batch of tokenized inputs."""
        return self.fc(self.feature(inputs))

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Track the latest value and the running weighted mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, sum, count and average back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Report elapsed time since ``since`` and the projected remaining time.

    ``percent`` is the completed fraction of the work (0 < percent <= 1); the
    total is extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
# =========================================================================================
# Train function loop
# =========================================================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Run one training epoch with optional mixed precision and gradient accumulation.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` batches.
        model: Model called as ``model(inputs)``; its output is flattened for the loss.
        criterion: Loss applied to ``(predictions, labels)``.
        optimizer: Optimizer stepped once per accumulation window.
        epoch: Zero-based epoch index, used only in progress output.
        scheduler: LR scheduler, stepped after each optimizer step when
            ``cfg.batch_scheduler`` is true.
        device: Device the batch is moved to.
        cfg: Config providing ``apex``, ``gradient_accumulation_steps``,
            ``max_grad_norm``, ``batch_scheduler`` and ``print_freq``.

    Returns:
        The sample-weighted mean of the per-batch loss over the epoch. The
        unscaled batch loss is recorded, so the value does not depend on
        ``gradient_accumulation_steps``.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = cfg.apex)
    losses = AverageMeter()
    start = time.time()
    accumulation_steps = cfg.gradient_accumulation_steps
    # Reported until the first optimizer step has produced a real gradient norm.
    grad_norm = 0.0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled = cfg.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), labels)
        # Log the true batch loss; only the value used for backward is divided.
        losses.update(loss.item(), batch_size)
        if accumulation_steps > 1:
            loss = loss / accumulation_steps
        scaler.scale(loss).backward()
        if (step + 1) % accumulation_steps == 0:
            # Unscale and clip exactly once per optimizer step: GradScaler raises
            # if unscale_ is called twice between updates, and clipping partially
            # accumulated gradients would distort them.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if cfg.batch_scheduler:
                scheduler.step()
        if step % cfg.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1,
                          step,
                          len(train_loader),
                          remain = timeSince(start, float(step + 1) / len(train_loader)),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported way to read the current LR.
                          lr = scheduler.get_last_lr()[0]))
    return losses.avg



def valid_fn(valid_loader, model, criterion, device, cfg):
    """Evaluate the model on a validation loader.

    Args:
        valid_loader: Iterable of ``(inputs, labels)`` batches.
        model: Model called as ``model(inputs)``.
        criterion: Loss applied to ``(predictions, labels)``.
        device: Device the batch is moved to.
        cfg: Config providing ``print_freq``.

    Returns:
        ``(avg_loss, predictions)``: the sample-weighted mean loss and a 1-D
        NumPy array of sigmoid outputs in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), labels)
        # The validation loss is reported as-is: gradient accumulation is a
        # training-only concern and must not rescale it.
        losses.update(loss.item(), batch_size)
        # reshape(-1) alone flattens every batch size, including a batch of one.
        preds.append(y_preds.sigmoid().to('cpu').numpy().reshape(-1))
        if step % cfg.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, correlations, fold, cfg):
    """Train on every fold except ``fold`` and validate on ``fold``.

    After each epoch the validation predictions are scored with
    ``get_best_threshold``; whenever the score improves, the model weights and
    predictions are saved under ``OUTPUT_MODELS_EXP``.

    Args:
        folds: DataFrame with ``fold``, ``text``, ``target``, ``topics_ids`` and
            ``content_ids`` columns.
        correlations: Ground-truth DataFrame passed to ``get_best_threshold``.
        fold: Index of the validation fold.
        cfg: Experiment configuration.

    Returns:
        The validation rows with a ``pred`` column holding the predictions of
        the best-scoring epoch.

    Raises:
        ValueError: If ``cfg.scheduler`` is not a supported name.
        RuntimeError: If no epoch was run, so no predictions exist.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)

    train_dataset = custom_dataset(train_folds, cfg)
    valid_dataset = custom_dataset(valid_folds, cfg)

    train_loader = DataLoader(
        train_dataset,
        batch_size = cfg.batch_size,
        shuffle = True,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size = cfg.batch_size,
        shuffle = False,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )

    # ====================================================
    # model & optimizer
    # ====================================================
    model = custom_model(cfg, config_path=None, pretrained=True)
    torch.save(model.config, os.path.join(OUTPUT_MODELS_EXP,'config.pth'))
    model.to(device)

    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        """Build the AdamW parameter groups.

        Encoder weights get ``weight_decay``; encoder biases and LayerNorm
        parameters get none; everything outside the encoder (the head) uses
        ``decoder_lr`` without weight decay.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        return [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr,
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay
                                               )
    optimizer = AdamW(
        optimizer_parameters,
        lr = cfg.encoder_lr,
        eps = cfg.eps,
        betas = cfg.betas
    )

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps, num_warmup_steps):
        """Create the warmup LR schedule selected by ``cfg.scheduler``."""
        # The linear/polynomial factories are reached through the `transformers`
        # module because only the cosine factory is imported by name in this file.
        if cfg.scheduler == 'linear':
            return transformers.get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        if cfg.scheduler == 'polynomial':
            return transformers.get_polynomial_decay_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps,
                lr_end=7e-7, power=3.0
            )
        raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r}")

    # Count real optimizer steps: the loader drops the last partial batch and the
    # optimizer only steps once per accumulation window.
    # NOTE(review): the scheduler is only ever stepped when cfg.batch_scheduler is
    # true; with it disabled the learning rate never leaves its initial value.
    steps_per_epoch = len(train_loader) // cfg.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * cfg.epochs
    num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
    scheduler = get_scheduler(cfg, optimizer, num_train_steps, num_warmup_steps)

    # ====================================================
    # Training & Validation loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")
    checkpoint_path = os.path.join(
        OUTPUT_MODELS_EXP, f"{cfg.model_name.replace('/', '-')}_fold{fold}_{cfg.seed}.pth"
    )

    best_score = -np.inf
    best_threshold = None
    val_predictions = None
    for epoch in range(cfg.epochs):
        start_time = time.time()

        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)

        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)

        # Compute f2_score
        score, threshold = get_best_threshold(valid_folds, predictions, correlations)

        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        # The threshold can be None when no threshold scored above zero.
        threshold_text = 'n/a' if threshold is None else f'{threshold:.5f}'
        print(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')

        # The first epoch is always kept, so a checkpoint and predictions exist
        # even if the score never rises above its starting value.
        if val_predictions is None or score > best_score:
            best_score = score
            best_threshold = threshold
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save(
                {'model': model.state_dict(),
                 'predictions': predictions},
                checkpoint_path
                )
            val_predictions = predictions

    if val_predictions is None:
        raise RuntimeError("No epoch was run (cfg.epochs < 1); there are no predictions to return.")

    # The best predictions are already in memory; reloading the whole checkpoint
    # from Drive just to read them back is unnecessary and fragile.
    valid_folds["pred"] = val_predictions

    # Drop the references first, otherwise the cache cannot release the model's memory.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    # Score and threshold of the best epoch; re-running the threshold sweep on
    # the same predictions would reproduce exactly these values.
    print(f'Our CV score is {best_score} using a threshold of {best_threshold}')
    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the best F2 score and threshold for the predictions in ``oof_df['pred']``."""
        best_score, best_threshold = get_best_threshold(oof_df, oof_df.pred.values, correlations)
        LOGGER.info(f'Our CV score is {best_score} using a threshold of {best_threshold}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end.
        fold_frames = []
        for fold in range(CFG.n_folds):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, correlations, fold, CFG)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(os.path.join(OUTPUT_MODELS_EXP,'oof_df.pkl'))
        else:
            # Nothing was trained, so there is nothing to score or save.
            oof_df = pd.DataFrame()
            LOGGER.info("No fold from CFG.trn_fold is within range(CFG.n_folds); nothing was trained.")

XLMRobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/Kaggle/LECR/output/FINETUNE/003_paraphrase-multilingual-mpnet-base-v2/paraphrase-multilingual-mpnet-base-v2_fold0_epochs20",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

INFO:__main__:XLMRo

Epoch: [1][0/77277] Elapsed 0m 5s (remain 6555m 47s) Loss: 0.6935(0.6935) Grad: 0.8778  LR: 0.00000000  
Epoch: [1][500/77277] Elapsed 1m 42s (remain 262m 32s) Loss: 0.6314(0.6731) Grad: 1.0341  LR: 0.00000022  
Epoch: [1][1000/77277] Elapsed 3m 20s (remain 254m 38s) Loss: 0.5010(0.6053) Grad: 0.6277  LR: 0.00000043  
Epoch: [1][1500/77277] Elapsed 4m 58s (remain 250m 57s) Loss: 0.2392(0.5106) Grad: 0.4787  LR: 0.00000065  
Epoch: [1][2000/77277] Elapsed 6m 35s (remain 248m 16s) Loss: 0.1692(0.4572) Grad: 0.5389  LR: 0.00000086  
Epoch: [1][2500/77277] Elapsed 8m 13s (remain 246m 4s) Loss: 0.2101(0.4275) Grad: 0.7639  LR: 0.00000108  
Epoch: [1][3000/77277] Elapsed 9m 51s (remain 243m 59s) Loss: 0.2379(0.4067) Grad: 0.5356  LR: 0.00000129  
Epoch: [1][3500/77277] Elapsed 11m 29s (remain 242m 3s) Loss: 0.2460(0.3917) Grad: 0.6882  LR: 0.00000151  
Epoch: [1][4000/77277] Elapsed 13m 6s (remain 240m 13s) Loss: 0.3883(0.3797) Grad: 1.1515  LR: 0.00000173  
Epoch: [1][4500/77277] Elapsed 14

Epoch 1 - Save Best Score: 0.5361 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5361 Model


Epoch 1 - avg_train_loss: 0.2280  avg_val_loss: 0.1717  time: 18862s
Epoch 1 - Score: 0.5361 - Threshold: 0.09300
Epoch: [2][0/77277] Elapsed 0m 1s (remain 2320m 9s) Loss: 0.2475(0.2475) Grad: 2.4690  LR: 0.00000843  
Epoch: [2][500/77277] Elapsed 1m 39s (remain 254m 34s) Loss: 0.1704(0.1749) Grad: 2.6570  LR: 0.00000840  
Epoch: [2][1000/77277] Elapsed 3m 17s (remain 250m 48s) Loss: 0.1883(0.1668) Grad: 5.2302  LR: 0.00000838  
Epoch: [2][1500/77277] Elapsed 4m 55s (remain 248m 22s) Loss: 0.2186(0.1713) Grad: 3.5586  LR: 0.00000835  
Epoch: [2][2000/77277] Elapsed 6m 33s (remain 246m 28s) Loss: 0.1087(0.1695) Grad: 5.0236  LR: 0.00000832  
Epoch: [2][2500/77277] Elapsed 8m 10s (remain 244m 34s) Loss: 0.1341(0.1696) Grad: 2.2775  LR: 0.00000829  
Epoch: [2][3000/77277] Elapsed 9m 48s (remain 242m 46s) Loss: 0.3447(0.1694) Grad: 4.9029  LR: 0.00000826  
Epoch: [2][3500/77277] Elapsed 11m 26s (remain 241m 5s) Loss: 0.1131(0.1692) Grad: 3.2673  LR: 0.00000823  
Epoch: [2][4000/77277] Elap

Epoch 2 - Save Best Score: 0.6039 Model
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.8/logging/__init__.py", line 1089, in emit
    self.flush()
  File "/usr/lib/python3.8/logging/__init__.py", line 1069, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.8/dist-packages/tornado/platform/asyncio.py", line 215,

Epoch 2 - avg_train_loss: 0.1549  avg_val_loss: 0.1574  time: 18863s
Epoch 2 - Score: 0.6039 - Threshold: 0.05600
Epoch: [3][0/77277] Elapsed 0m 1s (remain 2540m 0s) Loss: 0.0120(0.0120) Grad: 0.5102  LR: 0.00000302  
Epoch: [3][500/77277] Elapsed 1m 40s (remain 255m 33s) Loss: 0.0325(0.1266) Grad: 1.8825  LR: 0.00000299  
Epoch: [3][1000/77277] Elapsed 3m 17s (remain 251m 16s) Loss: 0.1933(0.1221) Grad: 4.8757  LR: 0.00000295  
Epoch: [3][1500/77277] Elapsed 4m 55s (remain 248m 42s) Loss: 0.0290(0.1215) Grad: 2.4949  LR: 0.00000292  
Epoch: [3][2000/77277] Elapsed 6m 33s (remain 246m 37s) Loss: 0.0783(0.1227) Grad: 7.3949  LR: 0.00000288  
Epoch: [3][2500/77277] Elapsed 8m 11s (remain 244m 42s) Loss: 0.0606(0.1220) Grad: 4.8824  LR: 0.00000285  
Epoch: [3][3000/77277] Elapsed 9m 48s (remain 242m 52s) Loss: 0.1692(0.1223) Grad: 2.9837  LR: 0.00000281  
Epoch: [3][3500/77277] Elapsed 11m 26s (remain 241m 7s) Loss: 0.1177(0.1220) Grad: 4.9488  LR: 0.00000278  
Epoch: [3][4000/77277] Elap

Epoch 3 - Save Best Score: 0.6107 Model
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.8/logging/__init__.py", line 1089, in emit
    self.flush()
  File "/usr/lib/python3.8/logging/__init__.py", line 1069, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.8/dist-packages/tornado/platform/asyncio.py", line 215,

Epoch 3 - avg_train_loss: 0.1174  avg_val_loss: 0.1786  time: 18858s
Epoch 3 - Score: 0.6107 - Threshold: 0.03000


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.8/logging/__init__.py", line 1089, in emit
    self.flush()
  File "/usr/lib/python3.8/logging/__init__.py", line 1069, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.8/dist-packages/tornado/platform/asyncio.py", line 215, in start
    self.asyncio_loop.run_fore

Our CV score is 0.6107 using a threshold of 0.030000000000000002


Our CV score is 0.6107 using a threshold of 0.030000000000000002
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.8/logging/__init__.py", line 1089, in emit
    self.flush()
  File "/usr/lib/python3.8/logging/__init__.py", line 1069, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.8/dist-packages/tornado/platfo

In [None]:
# Release the Colab runtime once all cells above have finished.
# NOTE(review): presumably this is here so an unattended run does not keep the
# GPU allocated - confirm; anything not saved to Drive is lost at this point.
from google.colab import runtime
runtime.unassign()