# About this notebook
- Deberta-v3-base starter code
- pip wheels is [here](https://www.kaggle.com/code/yasufuminakama/fb3-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/fb3-deberta-v3-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-base"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=2
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdoughnut[0m ([33mteam-donut[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7




Found existing installation: transformers 4.20.1
Uninstalling transformers-4.20.1:
  Successfully uninstalled transformers-4.20.1




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/fb3-pip-wheels
Processing /kaggle/input/fb3-pip-wheels/transformers-4.21.2-py3-none-any.whl
Processing /kaggle/input/fb3-pip-wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.0 requires protobuf==3.20.0, but you have protobuf 3.19.4 which is incompatible.
allennlp 2.10.0 requires transformers<4.21,>=4.1, but you have transformers 4.21.2 which is incompatible.


Successfully installed tokenizers-0.12.1 transformers-4.21.2
Looking in links: ../input/fb3-pip-wheels




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [6]:
# ====================================================
# Data Loading
# ====================================================
from sklearn.model_selection import train_test_split
data = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
train, test = train_test_split(data, test_size=0.2, random_state=42)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
# test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
# submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
# print(f"submission.shape: {submission.shape}")
# display(submission.head())

train.shape: (3128, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,1247CB034EF7,Wouldnt you want to have time and do your home...,4.0,3.5,3.0,3.5,4.0,3.5
1,68685615FE0C,There is a debate about the opportunity offere...,3.5,4.0,3.5,3.5,2.5,3.0
2,E597A35FA323,"Negative, We have to take Information about th...",2.0,2.5,2.5,2.5,2.5,2.5
3,AD9CEE5A6FFF,I think it better to talk to more people than ...,3.0,3.0,3.0,3.0,3.0,2.5
4,F4C52358CE03,In this reasons from Churchill's statement. I ...,2.5,2.5,2.5,2.0,2.5,2.5


test.shape: (783, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,772D27D400BB,It god to have a possitive attitude when you d...,3.0,2.5,2.5,2.0,2.0,2.0
1,9E8F3C6405CA,Why do people ask more then one person for adv...,3.0,2.0,3.0,3.5,3.0,3.0
2,948771F795EB,"We accomplish more when we are active, and are...",4.0,4.0,3.0,4.0,4.0,4.0
3,FE14D7378CFB,Do you agree or disagree about imagination bei...,3.0,3.0,3.5,3.0,3.5,3.5
4,7AAE019F70D6,I disagree with the principal saying that all ...,3.5,3.5,3.5,3.5,3.0,3.5


# CV split

In [7]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    782
1    782
2    782
3    782
dtype: int64

In [8]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [9]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [10]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3128 [00:00<?, ?it/s]

max_len: 1429


In [11]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [12]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Loss

In [13]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [14]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# Train Loop

In [15]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [16]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}



Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/293] Elapsed 0m 3s (remain 15m 19s) Loss: 2.6669(2.6669) Grad: inf  LR: 0.00002000  
Epoch: [1][20/293] Elapsed 0m 44s (remain 9m 37s) Loss: 0.3123(1.3006) Grad: 147894.9531  LR: 0.00001994  
Epoch: [1][40/293] Elapsed 1m 22s (remain 8m 26s) Loss: 0.1800(0.7700) Grad: 57320.8867  LR: 0.00001976  
Epoch: [1][60/293] Elapsed 2m 8s (remain 8m 8s) Loss: 0.0719(0.5690) Grad: 63681.4805  LR: 0.00001947  
Epoch: [1][80/293] Elapsed 2m 41s (remain 7m 3s) Loss: 0.1561(0.4677) Grad: 135419.4062  LR: 0.00001907  
Epoch: [1][100/293] Elapsed 3m 14s (remain 6m 9s) Loss: 0.1379(0.4027) Grad: 79917.3047  LR: 0.00001857  
Epoch: [1][120/293] Elapsed 3m 55s (remain 5m 34s) Loss: 0.1158(0.3559) Grad: 57050.0039  LR: 0.00001797  
Epoch: [1][140/293] Elapsed 4m 32s (remain 4m 53s) Loss: 0.1637(0.3232) Grad: 50002.2891  LR: 0.00001728  
Epoch: [1][160/293] Elapsed 5m 8s (remain 4m 12s) Loss: 0.1392(0.2975) Grad: 68783.7422  LR: 0.00001650  
Epoch: [1][180/293] Elapsed 5m 52s (remain 3m 38s) Lo

Epoch 1 - avg_train_loss: 0.2168  avg_val_loss: 0.1117  time: 628s
Epoch 1 - Score: 0.4732  Scores: [0.5457380666130269, 0.46453561670323384, 0.4283166024668232, 0.44975246960511306, 0.4843910205795539, 0.46632148075066204]
Epoch 1 - Save Best Score: 0.4732 Model


EVAL: [48/49] Elapsed 0m 57s (remain 0m 0s) Loss: 0.1379(0.1117) 
Epoch: [2][0/293] Elapsed 0m 1s (remain 6m 52s) Loss: 0.0839(0.0839) Grad: 260142.6406  LR: 0.00000995  
Epoch: [2][20/293] Elapsed 0m 42s (remain 9m 12s) Loss: 0.1130(0.1008) Grad: 306543.4062  LR: 0.00000888  
Epoch: [2][40/293] Elapsed 1m 23s (remain 8m 34s) Loss: 0.0869(0.1032) Grad: 164239.0156  LR: 0.00000782  
Epoch: [2][60/293] Elapsed 2m 0s (remain 7m 39s) Loss: 0.1249(0.1028) Grad: inf  LR: 0.00000679  
Epoch: [2][80/293] Elapsed 2m 38s (remain 6m 55s) Loss: 0.0816(0.1048) Grad: 109840.6875  LR: 0.00000579  
Epoch: [2][100/293] Elapsed 3m 25s (remain 6m 30s) Loss: 0.0883(0.1059) Grad: 85122.2578  LR: 0.00000485  
Epoch: [2][120/293] Elapsed 4m 4s (remain 5m 47s) Loss: 0.1212(0.1040) Grad: 106126.9844  LR: 0.00000396  
Epoch: [2][140/293] Elapsed 4m 46s (remain 5m 8s) Loss: 0.0643(0.1035) Grad: 60738.5078  LR: 0.00000314  
Epoch: [2][160/293] Elapsed 5m 28s (remain 4m 29s) Loss: 0.0855(0.1025) Grad: 136179.0156 

Epoch 2 - avg_train_loss: 0.1009  avg_val_loss: 0.1065  time: 628s
Epoch 2 - Score: 0.4616  Scores: [0.5255476736451616, 0.4491735255649419, 0.4150018889464276, 0.4469070519205784, 0.4786604548899618, 0.4544604112626731]
Epoch 2 - Save Best Score: 0.4616 Model


EVAL: [48/49] Elapsed 0m 56s (remain 0m 0s) Loss: 0.1330(0.1065) 


Score: 0.4616  Scores: [0.5255476736451616, 0.4491735255649419, 0.4150018889464276, 0.4469070519205784, 0.4786604548899618, 0.4544604112626731]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size"

Epoch: [1][0/293] Elapsed 0m 1s (remain 7m 23s) Loss: 2.1498(2.1498) Grad: inf  LR: 0.00002000  
Epoch: [1][20/293] Elapsed 0m 46s (remain 9m 56s) Loss: 0.1495(1.3177) Grad: 113444.3203  LR: 0.00001994  
Epoch: [1][40/293] Elapsed 1m 24s (remain 8m 41s) Loss: 0.2479(0.7883) Grad: 318464.2500  LR: 0.00001976  
Epoch: [1][60/293] Elapsed 2m 4s (remain 7m 53s) Loss: 0.1774(0.5930) Grad: 140407.0000  LR: 0.00001947  
Epoch: [1][80/293] Elapsed 2m 41s (remain 7m 2s) Loss: 0.1741(0.4853) Grad: 125013.0234  LR: 0.00001907  
Epoch: [1][100/293] Elapsed 3m 16s (remain 6m 12s) Loss: 0.1199(0.4150) Grad: 50290.5039  LR: 0.00001857  
Epoch: [1][120/293] Elapsed 3m 56s (remain 5m 36s) Loss: 0.1141(0.3684) Grad: 65215.6836  LR: 0.00001797  
Epoch: [1][140/293] Elapsed 4m 38s (remain 5m 0s) Loss: 0.1019(0.3356) Grad: 54059.3008  LR: 0.00001728  
Epoch: [1][160/293] Elapsed 5m 14s (remain 4m 17s) Loss: 0.1053(0.3089) Grad: 77060.9297  LR: 0.00001650  
Epoch: [1][180/293] Elapsed 5m 50s (remain 3m 36s)

Epoch 1 - avg_train_loss: 0.2307  avg_val_loss: 0.1064  time: 620s
Epoch 1 - Score: 0.4620  Scores: [0.4847014765962732, 0.45334360637110416, 0.4238548356345285, 0.4645327169068475, 0.4883645446298583, 0.4574671999385159]
Epoch 1 - Save Best Score: 0.4620 Model


EVAL: [48/49] Elapsed 1m 3s (remain 0m 0s) Loss: 0.1274(0.1064) 
Epoch: [2][0/293] Elapsed 0m 1s (remain 6m 27s) Loss: 0.1118(0.1118) Grad: 158028.6562  LR: 0.00000995  
Epoch: [2][20/293] Elapsed 0m 44s (remain 9m 41s) Loss: 0.1310(0.1074) Grad: 207041.2031  LR: 0.00000888  
Epoch: [2][40/293] Elapsed 1m 24s (remain 8m 40s) Loss: 0.1017(0.1131) Grad: 56006.4531  LR: 0.00000782  
Epoch: [2][60/293] Elapsed 2m 4s (remain 7m 55s) Loss: 0.0923(0.1097) Grad: 125440.3594  LR: 0.00000679  
Epoch: [2][80/293] Elapsed 2m 41s (remain 7m 2s) Loss: 0.1614(0.1076) Grad: 68173.5156  LR: 0.00000579  
Epoch: [2][100/293] Elapsed 3m 20s (remain 6m 20s) Loss: 0.1289(0.1072) Grad: 137376.9844  LR: 0.00000485  
Epoch: [2][120/293] Elapsed 3m 57s (remain 5m 37s) Loss: 0.0940(0.1055) Grad: 81435.4062  LR: 0.00000396  
Epoch: [2][140/293] Elapsed 4m 37s (remain 4m 58s) Loss: 0.1236(0.1062) Grad: 58054.3906  LR: 0.00000314  
Epoch: [2][160/293] Elapsed 5m 18s (remain 4m 20s) Loss: 0.0623(0.1059) Grad: 47971.

Epoch 2 - avg_train_loss: 0.1043  avg_val_loss: 0.1028  time: 621s
Epoch 2 - Score: 0.4539  Scores: [0.4833298554039856, 0.44028204024537926, 0.4151665911867766, 0.46080089808915436, 0.4786449723776812, 0.44499036382991514]
Epoch 2 - Save Best Score: 0.4539 Model


EVAL: [48/49] Elapsed 1m 2s (remain 0m 0s) Loss: 0.1194(0.1028) 


Score: 0.4539  Scores: [0.4833298554039856, 0.44028204024537926, 0.4151665911867766, 0.46080089808915436, 0.4786449723776812, 0.44499036382991514]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_si

Epoch: [1][0/293] Elapsed 0m 1s (remain 8m 34s) Loss: 2.2734(2.2734) Grad: inf  LR: 0.00002000  
Epoch: [1][20/293] Elapsed 0m 42s (remain 9m 4s) Loss: 0.3839(1.3221) Grad: 98528.9531  LR: 0.00001994  
Epoch: [1][40/293] Elapsed 1m 20s (remain 8m 13s) Loss: 0.3043(0.7736) Grad: 52472.8164  LR: 0.00001976  
Epoch: [1][60/293] Elapsed 1m 59s (remain 7m 35s) Loss: 0.1534(0.5796) Grad: 89325.5781  LR: 0.00001947  
Epoch: [1][80/293] Elapsed 2m 44s (remain 7m 10s) Loss: 0.1695(0.4724) Grad: 90869.4375  LR: 0.00001907  
Epoch: [1][100/293] Elapsed 3m 22s (remain 6m 25s) Loss: 0.1172(0.4058) Grad: 38556.6328  LR: 0.00001857  
Epoch: [1][120/293] Elapsed 3m 57s (remain 5m 37s) Loss: 0.1318(0.3613) Grad: 100389.9688  LR: 0.00001797  
Epoch: [1][140/293] Elapsed 4m 34s (remain 4m 55s) Loss: 0.1630(0.3282) Grad: 71079.1562  LR: 0.00001728  
Epoch: [1][160/293] Elapsed 5m 10s (remain 4m 14s) Loss: 0.1062(0.3018) Grad: 68774.7734  LR: 0.00001650  
Epoch: [1][180/293] Elapsed 5m 43s (remain 3m 32s) 

Epoch 1 - avg_train_loss: 0.2212  avg_val_loss: 0.1214  time: 610s
Epoch 1 - Score: 0.4945  Scores: [0.5233007958767268, 0.4758029383673772, 0.4587230112741174, 0.4905968757578576, 0.526010448338285, 0.49281407850409964]
Epoch 1 - Save Best Score: 0.4945 Model


EVAL: [48/49] Elapsed 0m 59s (remain 0m 0s) Loss: 0.1412(0.1214) 
Epoch: [2][0/293] Elapsed 0m 1s (remain 9m 5s) Loss: 0.0837(0.0837) Grad: 197190.4219  LR: 0.00000995  
Epoch: [2][20/293] Elapsed 0m 37s (remain 8m 3s) Loss: 0.0784(0.1119) Grad: 213115.6406  LR: 0.00000888  
Epoch: [2][40/293] Elapsed 1m 20s (remain 8m 14s) Loss: 0.0662(0.1066) Grad: 96384.1094  LR: 0.00000782  
Epoch: [2][60/293] Elapsed 1m 51s (remain 7m 4s) Loss: 0.0875(0.1065) Grad: 111473.7891  LR: 0.00000679  
Epoch: [2][80/293] Elapsed 2m 32s (remain 6m 38s) Loss: 0.0887(0.1061) Grad: 63118.1641  LR: 0.00000579  
Epoch: [2][100/293] Elapsed 3m 8s (remain 5m 59s) Loss: 0.0720(0.1045) Grad: 90545.0078  LR: 0.00000485  
Epoch: [2][120/293] Elapsed 3m 41s (remain 5m 14s) Loss: 0.0755(0.1023) Grad: 76750.3594  LR: 0.00000396  
Epoch: [2][140/293] Elapsed 4m 15s (remain 4m 35s) Loss: 0.0866(0.1014) Grad: 77131.2578  LR: 0.00000314  
Epoch: [2][160/293] Elapsed 4m 54s (remain 4m 1s) Loss: 0.1773(0.1023) Grad: 157454.09

Epoch 2 - avg_train_loss: 0.1027  avg_val_loss: 0.1123  time: 616s
Epoch 2 - Score: 0.4749  Scores: [0.504634842131219, 0.45887408899257354, 0.43462899248749637, 0.47446451174168935, 0.5074539374314938, 0.4695495817337974]
Epoch 2 - Save Best Score: 0.4749 Model


EVAL: [48/49] Elapsed 0m 59s (remain 0m 0s) Loss: 0.1301(0.1123) 


Score: 0.4749  Scores: [0.504634842131219, 0.45887408899257354, 0.43462899248749637, 0.47446451174168935, 0.5074539374314938, 0.4695495817337974]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_siz

Epoch: [1][0/293] Elapsed 0m 1s (remain 7m 10s) Loss: 2.4108(2.4108) Grad: inf  LR: 0.00002000  
Epoch: [1][20/293] Elapsed 0m 40s (remain 8m 46s) Loss: 0.1757(1.2539) Grad: 53420.8203  LR: 0.00001994  
Epoch: [1][40/293] Elapsed 1m 18s (remain 8m 1s) Loss: 0.1933(0.7332) Grad: 57073.1094  LR: 0.00001976  
Epoch: [1][60/293] Elapsed 1m 57s (remain 7m 25s) Loss: 0.1788(0.5425) Grad: 72823.7734  LR: 0.00001947  
Epoch: [1][80/293] Elapsed 2m 31s (remain 6m 37s) Loss: 0.0942(0.4394) Grad: 89760.6250  LR: 0.00001907  
Epoch: [1][100/293] Elapsed 3m 3s (remain 5m 48s) Loss: 0.1781(0.3818) Grad: 79207.4766  LR: 0.00001857  
Epoch: [1][120/293] Elapsed 3m 35s (remain 5m 6s) Loss: 0.0889(0.3384) Grad: 73604.5156  LR: 0.00001797  
Epoch: [1][140/293] Elapsed 4m 20s (remain 4m 40s) Loss: 0.0917(0.3094) Grad: 59695.8633  LR: 0.00001728  
Epoch: [1][160/293] Elapsed 4m 58s (remain 4m 4s) Loss: 0.1600(0.2892) Grad: 51632.6484  LR: 0.00001650  
Epoch: [1][180/293] Elapsed 5m 36s (remain 3m 28s) Loss

Epoch 1 - avg_train_loss: 0.2144  avg_val_loss: 0.1104  time: 620s
Epoch 1 - Score: 0.4712  Scores: [0.49749815146117143, 0.4651145263630815, 0.43772817841526324, 0.46792496602471934, 0.48619428952892696, 0.47248825897299024]
Epoch 1 - Save Best Score: 0.4712 Model


EVAL: [48/49] Elapsed 0m 59s (remain 0m 0s) Loss: 0.0861(0.1104) 
Epoch: [2][0/293] Elapsed 0m 3s (remain 15m 31s) Loss: 0.0861(0.0861) Grad: 186238.7656  LR: 0.00000995  
Epoch: [2][20/293] Elapsed 0m 41s (remain 9m 2s) Loss: 0.0657(0.1074) Grad: 173687.5000  LR: 0.00000888  
Epoch: [2][40/293] Elapsed 1m 21s (remain 8m 19s) Loss: 0.1072(0.1042) Grad: 163229.1562  LR: 0.00000782  
Epoch: [2][60/293] Elapsed 2m 6s (remain 7m 59s) Loss: 0.1053(0.1011) Grad: 197277.8281  LR: 0.00000679  
Epoch: [2][80/293] Elapsed 2m 41s (remain 7m 1s) Loss: 0.1398(0.1039) Grad: 139283.6094  LR: 0.00000579  
Epoch: [2][100/293] Elapsed 3m 21s (remain 6m 23s) Loss: 0.0789(0.1037) Grad: 229738.6562  LR: 0.00000485  
Epoch: [2][120/293] Elapsed 3m 51s (remain 5m 29s) Loss: 0.0915(0.1031) Grad: 169514.1250  LR: 0.00000396  
Epoch: [2][140/293] Elapsed 4m 34s (remain 4m 56s) Loss: 0.0819(0.1023) Grad: 147763.7969  LR: 0.00000314  
Epoch: [2][160/293] Elapsed 5m 15s (remain 4m 18s) Loss: 0.1133(0.1023) Grad: 2

Epoch 2 - avg_train_loss: 0.1010  avg_val_loss: 0.1056  time: 624s
Epoch 2 - Score: 0.4604  Scores: [0.4851928227944414, 0.45755644404444595, 0.41221971485714637, 0.4603608472763681, 0.4736212588692853, 0.4735535815192223]
Epoch 2 - Save Best Score: 0.4604 Model


EVAL: [48/49] Elapsed 0m 58s (remain 0m 0s) Loss: 0.0813(0.1056) 


Score: 0.4604  Scores: [0.4851928227944414, 0.45755644404444595, 0.41221971485714637, 0.4603608472763681, 0.4736212588692853, 0.4735535815192223]
Score: 0.4629  Scores: [0.49996912544244104, 0.4515330718968534, 0.4193498899172248, 0.46073638431955016, 0.48477918584793567, 0.46078207798876714]


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▁
[fold0] avg_val_loss,█▁
[fold0] epoch,▁█
[fold0] loss,█▅▂▁▂▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,█▁
[fold1] avg_train_loss,█▁
[fold1] avg_val_loss,█▁
[fold1] epoch,▁█
[fold1] loss,█▄▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁

0,1
[fold0] avg_train_loss,0.10094
[fold0] avg_val_loss,0.10646
[fold0] epoch,2.0
[fold0] loss,0.09306
[fold0] lr,0.0
[fold0] score,0.46163
[fold1] avg_train_loss,0.10434
[fold1] avg_val_loss,0.10276
[fold1] epoch,2.0
[fold1] loss,0.15935


In [17]:
!ls /kaggle/working

__notebook__.ipynb
config.pth
microsoft-deberta-v3-base_fold0_best.pth
microsoft-deberta-v3-base_fold1_best.pth
microsoft-deberta-v3-base_fold2_best.pth
microsoft-deberta-v3-base_fold3_best.pth
oof_df.pkl
tokenizer
train.log
wandb
