In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [3]:
!nvidia-smi

Thu May  4 18:54:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    43W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:

DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base_epoch10')
OUTPUT_EXP_DIR = DIR + '/output/EXP063/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1337.83it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)

        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dt

Enable AWP
Epoch: [1][0/279] Elapsed 0m 4s (remain 18m 55s) Loss: 0.6411(0.6411) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.8209(0.6225) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.6975(0.6167) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 35s (remain 0m 0s) Loss: 0.5455(0.6141) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6022(0.6022) 


Epoch 1 - avg_train_loss: 0.6141  avg_val_loss: 0.5944  time: 39s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6141  avg_val_loss: 0.5944  time: 39s
Epoch 1 - Score: 0.7169
INFO:__main__:Epoch 1 - Score: 0.7169
Epoch 1 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6459(0.5944) 
f1 score : 0.11042944785276074
recall score : 0.058823529411764705
precision score : 0.9
thresh : 0.35
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.9142(0.9142) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.7304(0.5388) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.3838(0.5242) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.7500(0.5300) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5803(0.5803) 


Epoch 2 - avg_train_loss: 0.5300  avg_val_loss: 0.5881  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5300  avg_val_loss: 0.5881  time: 36s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7111(0.5881) 
f1 score : 0.14285714285714285
recall score : 0.0784313725490196
precision score : 0.8
thresh : 0.41
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 24s) Loss: 0.4055(0.4055) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2781(0.3499) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.3046(0.3399) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3601(0.3235) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5640(0.5640) 


Epoch 3 - avg_train_loss: 0.3235  avg_val_loss: 0.6273  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3235  avg_val_loss: 0.6273  time: 35s
Epoch 3 - Score: 0.7189
INFO:__main__:Epoch 3 - Score: 0.7189
Epoch 3 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6589(0.6273) 
f1 score : 0.2512562814070352
recall score : 0.16339869281045752
precision score : 0.5434782608695652
thresh : 0.64
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.1139(0.1139) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.0530(0.0950) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0894(0.0894) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0620(0.0853) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5788(0.5788) 


Epoch 4 - avg_train_loss: 0.0853  avg_val_loss: 0.6454  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0853  avg_val_loss: 0.6454  time: 36s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6648(0.6454) 
f1 score : 0.2938388625592417
recall score : 0.20261437908496732
precision score : 0.5344827586206896
thresh : 0.42
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0418(0.0418) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0451(0.0465) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0433(0.0458) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0519(0.0453) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5981(0.5981) 


Epoch 5 - avg_train_loss: 0.0453  avg_val_loss: 0.6580  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0453  avg_val_loss: 0.6580  time: 36s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6723(0.6580) 
f1 score : 0.2788461538461538
recall score : 0.1895424836601307
precision score : 0.5272727272727272
thresh : 0.4


Score: 0.7008
INFO:__main__:Score: 0.7008
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.2512562814070352
recall score : 0.16339869281045752
precision score : 0.5434782608695652
thresh : 0.64


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.6861(0.6861) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.7377(0.6204) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.3722(0.6193) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.7071(0.6162) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4778(0.4778) 


Epoch 1 - avg_train_loss: 0.6162  avg_val_loss: 0.5893  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6162  avg_val_loss: 0.5893  time: 35s
Epoch 1 - Score: 0.7189
INFO:__main__:Epoch 1 - Score: 0.7189
Epoch 1 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5123(0.5893) 
f1 score : 0.13333333333333333
recall score : 0.0718954248366013
precision score : 0.9166666666666666
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.6478(0.6478) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6989(0.5518) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.4828(0.5341) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3944(0.5316) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4080(0.4080) 


Epoch 2 - avg_train_loss: 0.5316  avg_val_loss: 0.5893  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5316  avg_val_loss: 0.5893  time: 36s
Epoch 2 - Score: 0.7189
INFO:__main__:Epoch 2 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4819(0.5893) 
f1 score : 0.12195121951219513
recall score : 0.06535947712418301
precision score : 0.9090909090909091
thresh : 0.41
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.4401(0.4401) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3043(0.3565) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.3675(0.3344) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.2077(0.3112) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3850(0.3850) 


Epoch 3 - avg_train_loss: 0.3112  avg_val_loss: 0.6679  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3112  avg_val_loss: 0.6679  time: 36s
Epoch 3 - Score: 0.7068
INFO:__main__:Epoch 3 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5571(0.6679) 
f1 score : 0.1935483870967742
recall score : 0.11764705882352941
precision score : 0.5454545454545454
thresh : 0.67
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.0358(0.0358) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.0598(0.0859) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0552(0.0790) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0489(0.0765) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3957(0.3957) 


Epoch 4 - avg_train_loss: 0.0765  avg_val_loss: 0.6479  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0765  avg_val_loss: 0.6479  time: 36s
Epoch 4 - Score: 0.7068
INFO:__main__:Epoch 4 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5657(0.6479) 
f1 score : 0.26540284360189575
recall score : 0.1830065359477124
precision score : 0.4827586206896552
thresh : 0.64
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.0614(0.0614) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0506(0.0466) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.0480(0.0460) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0336(0.0456) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3954(0.3954) 


Epoch 5 - avg_train_loss: 0.0456  avg_val_loss: 0.6575  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0456  avg_val_loss: 0.6575  time: 36s
Epoch 5 - Score: 0.7028
INFO:__main__:Epoch 5 - Score: 0.7028


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5535(0.6575) 
f1 score : 0.2608695652173913
recall score : 0.17647058823529413
precision score : 0.5
thresh : 0.63


Score: 0.7129
INFO:__main__:Score: 0.7129
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.13333333333333333
recall score : 0.0718954248366013
precision score : 0.9166666666666666
thresh : 0.39


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.9049(0.9049) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7167(0.6171) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6193(0.6142) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5931(0.6081) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6049(0.6049) 


Epoch 1 - avg_train_loss: 0.6081  avg_val_loss: 0.5919  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6081  avg_val_loss: 0.5919  time: 35s
Epoch 1 - Score: 0.7088
INFO:__main__:Epoch 1 - Score: 0.7088
Epoch 1 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4150(0.5919) 
f1 score : 0.1581920903954802
recall score : 0.0915032679738562
precision score : 0.5833333333333334
thresh : 0.58
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.5300(0.5300) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5357(0.5445) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6088(0.5270) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4630(0.5191) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6401(0.6401) 


Epoch 2 - avg_train_loss: 0.5191  avg_val_loss: 0.6070  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5191  avg_val_loss: 0.6070  time: 36s
Epoch 2 - Score: 0.7108
INFO:__main__:Epoch 2 - Score: 0.7108
Epoch 2 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4004(0.6070) 
f1 score : 0.3769230769230769
recall score : 0.3202614379084967
precision score : 0.45794392523364486
thresh : 0.58
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.3237(0.3237) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3188(0.3253) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3005(0.3095) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1669(0.2959) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6466(0.6466) 


Epoch 3 - avg_train_loss: 0.2959  avg_val_loss: 0.6435  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2959  avg_val_loss: 0.6435  time: 36s
Epoch 3 - Score: 0.7068
INFO:__main__:Epoch 3 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3022(0.6435) 
f1 score : 0.3018867924528302
recall score : 0.20915032679738563
precision score : 0.5423728813559322
thresh : 0.73
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.0831(0.0831) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0476(0.0794) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.0608(0.0730) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0645(0.0699) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7015(0.7015) 


Epoch 4 - avg_train_loss: 0.0699  avg_val_loss: 0.6746  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0699  avg_val_loss: 0.6746  time: 36s
Epoch 4 - Score: 0.7088
INFO:__main__:Epoch 4 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3243(0.6746) 
f1 score : 0.3177570093457944
recall score : 0.2222222222222222
precision score : 0.5573770491803278
thresh : 0.52
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0467(0.0467) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0377(0.0418) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.0429(0.0417) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.0395(0.0418) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7093(0.7093) 


Epoch 5 - avg_train_loss: 0.0418  avg_val_loss: 0.6909  time: 35s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0418  avg_val_loss: 0.6909  time: 35s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3283(0.6909) 
f1 score : 0.29411764705882354
recall score : 0.19607843137254902
precision score : 0.5882352941176471
thresh : 0.5


Score: 0.6747
INFO:__main__:Score: 0.6747
ACC BEST Score: 0.7108
INFO:__main__:ACC BEST Score: 0.7108
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.3769230769230769
recall score : 0.3202614379084967
precision score : 0.45794392523364486
thresh : 0.58


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.7654(0.7654) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6659(0.6245) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6275(0.6156) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.7163(0.6100) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6759(0.6759) 


Epoch 1 - avg_train_loss: 0.6100  avg_val_loss: 0.5862  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6100  avg_val_loss: 0.5862  time: 35s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5928(0.5862) 
f1 score : 0.13714285714285715
recall score : 0.07894736842105263
precision score : 0.5217391304347826
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5716(0.5716) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4826(0.5243) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4130(0.5179) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6041(0.5206) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6624(0.6624) 


Epoch 2 - avg_train_loss: 0.5206  avg_val_loss: 0.5832  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5206  avg_val_loss: 0.5832  time: 36s
Epoch 2 - Score: 0.7088
INFO:__main__:Epoch 2 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6251(0.5832) 
f1 score : 0.22340425531914895
recall score : 0.13815789473684212
precision score : 0.5833333333333334
thresh : 0.51
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.3396(0.3396) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3129(0.3446) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.1942(0.3181) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3217(0.3012) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7378(0.7378) 


Epoch 3 - avg_train_loss: 0.3012  avg_val_loss: 0.6545  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3012  avg_val_loss: 0.6545  time: 36s
Epoch 3 - Score: 0.7008
INFO:__main__:Epoch 3 - Score: 0.7008


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7107(0.6545) 
f1 score : 0.5222222222222223
recall score : 0.618421052631579
precision score : 0.4519230769230769
thresh : 0.78
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.2215(0.2215) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.0875(0.0969) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0522(0.0861) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0719(0.0821) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8720(0.8720) 


Epoch 4 - avg_train_loss: 0.0821  avg_val_loss: 0.6541  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0821  avg_val_loss: 0.6541  time: 36s
Epoch 4 - Score: 0.7088
INFO:__main__:Epoch 4 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5819(0.6541) 
f1 score : 0.2692307692307692
recall score : 0.18421052631578946
precision score : 0.5
thresh : 0.65
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.0518(0.0518) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.0340(0.0481) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0354(0.0488) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0503(0.0489) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.9019(0.9019) 


Epoch 5 - avg_train_loss: 0.0489  avg_val_loss: 0.6694  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0489  avg_val_loss: 0.6694  time: 36s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5838(0.6694) 
f1 score : 0.25742574257425743
recall score : 0.17105263157894737
precision score : 0.52
thresh : 0.64


Score: 0.6968
INFO:__main__:Score: 0.6968
ACC BEST Score: 0.7108
INFO:__main__:ACC BEST Score: 0.7108
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.13714285714285715
recall score : 0.07894736842105263
precision score : 0.5217391304347826
thresh : 0.46


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.9318(0.9318) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5529(0.6326) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5599(0.6122) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5801(0.6121) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5112(0.5112) 


Epoch 1 - avg_train_loss: 0.6121  avg_val_loss: 0.5897  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6121  avg_val_loss: 0.5897  time: 35s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5532(0.5897) 
f1 score : 0.08588957055214723
recall score : 0.046052631578947366
precision score : 0.6363636363636364
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.6799(0.6799) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6506(0.5296) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4455(0.5374) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4707(0.5290) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4880(0.4880) 


Epoch 2 - avg_train_loss: 0.5290  avg_val_loss: 0.5867  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5290  avg_val_loss: 0.5867  time: 36s
Epoch 2 - Score: 0.7062
INFO:__main__:Epoch 2 - Score: 0.7062
Epoch 2 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5223(0.5867) 
f1 score : 0.1694915254237288
recall score : 0.09868421052631579
precision score : 0.6
thresh : 0.45
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.5320(0.5320) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3403(0.3517) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2264(0.3304) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2630(0.3113) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6113(0.6113) 


Epoch 3 - avg_train_loss: 0.3113  avg_val_loss: 0.6354  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3113  avg_val_loss: 0.6354  time: 36s
Epoch 3 - Score: 0.7022
INFO:__main__:Epoch 3 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6425(0.6354) 
f1 score : 0.4234527687296417
recall score : 0.4276315789473684
precision score : 0.41935483870967744
thresh : 0.73
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.2138(0.2138) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0641(0.1005) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.0684(0.0863) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.0551(0.0797) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5474(0.5474) 


Epoch 4 - avg_train_loss: 0.0797  avg_val_loss: 0.6744  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0797  avg_val_loss: 0.6744  time: 36s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103
Epoch 4 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5299(0.6744) 
f1 score : 0.28436018957345965
recall score : 0.19736842105263158
precision score : 0.5084745762711864
thresh : 0.79
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.0287(0.0287) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.0274(0.0406) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0483(0.0407) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0582(0.0401) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5508(0.5508) 


Epoch 5 - avg_train_loss: 0.0401  avg_val_loss: 0.6717  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0401  avg_val_loss: 0.6717  time: 36s
Epoch 5 - Score: 0.7062
INFO:__main__:Epoch 5 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5194(0.6717) 
f1 score : 0.29493087557603687
recall score : 0.21052631578947367
precision score : 0.49230769230769234
thresh : 0.55


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7103
INFO:__main__:ACC BEST Score: 0.7103
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.28436018957345965
recall score : 0.19736842105263158
precision score : 0.5084745762711864
thresh : 0.79


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.6972(0.6972) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6158(0.6271) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.4331(0.6122) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5497(0.6079) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6043(0.6043) 


Epoch 1 - avg_train_loss: 0.6079  avg_val_loss: 0.5924  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6079  avg_val_loss: 0.5924  time: 35s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7081(0.5924) 
f1 score : 0.08695652173913043
recall score : 0.046052631578947366
precision score : 0.7777777777777778
thresh : 0.55
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.6411(0.6411) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4366(0.5296) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5496(0.5251) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6753(0.5155) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6638(0.6638) 


Epoch 2 - avg_train_loss: 0.5155  avg_val_loss: 0.6201  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5155  avg_val_loss: 0.6201  time: 36s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082
Epoch 2 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7755(0.6201) 
f1 score : 0.08641975308641975
recall score : 0.046052631578947366
precision score : 0.7
thresh : 0.3
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.3490(0.3490) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2890(0.3241) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.1981(0.2967) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1476(0.2792) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7322(0.7322) 


Epoch 3 - avg_train_loss: 0.2792  avg_val_loss: 0.6022  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2792  avg_val_loss: 0.6022  time: 36s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183
Epoch 3 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6824(0.6022) 
f1 score : 0.3225806451612903
recall score : 0.23026315789473684
precision score : 0.5384615384615384
thresh : 0.57
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.0988(0.0988) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0763(0.0777) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0691(0.0735) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0673(0.0704) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8499(0.8499) 


Epoch 4 - avg_train_loss: 0.0704  avg_val_loss: 0.6659  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0704  avg_val_loss: 0.6659  time: 36s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8564(0.6659) 
f1 score : 0.24598930481283424
recall score : 0.1513157894736842
precision score : 0.6571428571428571
thresh : 0.51
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.0338(0.0338) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0393(0.0425) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.0428(0.0426) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.0326(0.0428) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8292(0.8292) 


Epoch 5 - avg_train_loss: 0.0428  avg_val_loss: 0.6470  time: 35s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0428  avg_val_loss: 0.6470  time: 35s
Epoch 5 - Score: 0.7183
INFO:__main__:Epoch 5 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7986(0.6470) 
f1 score : 0.27860696517412936
recall score : 0.18421052631578946
precision score : 0.5714285714285714
thresh : 0.55


Score: 0.7042
INFO:__main__:Score: 0.7042
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.3225806451612903
recall score : 0.23026315789473684
precision score : 0.5384615384615384
thresh : 0.57


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 1.0733(1.0733) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4964(0.6401) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6636(0.6272) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5415(0.6154) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6248(0.6248) 


Epoch 1 - avg_train_loss: 0.6154  avg_val_loss: 0.5834  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6154  avg_val_loss: 0.5834  time: 35s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5161(0.5834) 
f1 score : 0.12048192771084337
recall score : 0.06578947368421052
precision score : 0.7142857142857143
thresh : 0.57
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.6027(0.6027) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5797(0.5375) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5825(0.5236) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4160(0.5216) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6245(0.6245) 


Epoch 2 - avg_train_loss: 0.5216  avg_val_loss: 0.5944  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5216  avg_val_loss: 0.5944  time: 36s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123
Epoch 2 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5122(0.5944) 
f1 score : 0.1798941798941799
recall score : 0.1118421052631579
precision score : 0.4594594594594595
thresh : 0.61
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.3916(0.3916) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2468(0.3402) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.1634(0.3114) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2876(0.2887) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5492(0.5492) 


Epoch 3 - avg_train_loss: 0.2887  avg_val_loss: 0.6242  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2887  avg_val_loss: 0.6242  time: 36s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243
Epoch 3 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6765(0.6242) 
f1 score : 0.43150684931506855
recall score : 0.4144736842105263
precision score : 0.45
thresh : 0.66
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.1209(0.1209) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.0657(0.0736) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0491(0.0696) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1217(0.0672) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6097(0.6097) 


Epoch 4 - avg_train_loss: 0.0672  avg_val_loss: 0.6712  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0672  avg_val_loss: 0.6712  time: 36s
Epoch 4 - Score: 0.7243
INFO:__main__:Epoch 4 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6044(0.6712) 
f1 score : 0.32075471698113206
recall score : 0.2236842105263158
precision score : 0.5666666666666667
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0398(0.0398) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.0342(0.0414) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0377(0.0415) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0404(0.0412) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6102(0.6102) 


Epoch 5 - avg_train_loss: 0.0412  avg_val_loss: 0.6748  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0412  avg_val_loss: 0.6748  time: 36s
Epoch 5 - Score: 0.7243
INFO:__main__:Epoch 5 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6082(0.6748) 
f1 score : 0.3222748815165877
recall score : 0.2236842105263158
precision score : 0.576271186440678
thresh : 0.55


Score: 0.6660
INFO:__main__:Score: 0.6660
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.43150684931506855
recall score : 0.4144736842105263
precision score : 0.45
thresh : 0.66


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.6410(0.6410) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5983(0.6265) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5417(0.6138) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5719(0.6084) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6781(0.6781) 


Epoch 1 - avg_train_loss: 0.6084  avg_val_loss: 0.6013  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6084  avg_val_loss: 0.6013  time: 36s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6888(0.6013) 
f1 score : 0.23469387755102042
recall score : 0.1513157894736842
precision score : 0.5227272727272727
thresh : 0.55
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.5314(0.5314) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.4979(0.5099) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3874(0.5074) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4626(0.5083) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6700(0.6700) 


Epoch 2 - avg_train_loss: 0.5083  avg_val_loss: 0.5724  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5083  avg_val_loss: 0.5724  time: 36s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7180(0.5724) 
f1 score : 0.28712871287128716
recall score : 0.19078947368421054
precision score : 0.58
thresh : 0.52
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.2908(0.2908) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3343(0.2947) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.1772(0.2780) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2242(0.2631) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6754(0.6754) 


Epoch 3 - avg_train_loss: 0.2631  avg_val_loss: 0.5905  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2631  avg_val_loss: 0.5905  time: 36s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183
Epoch 3 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6949(0.5905) 
f1 score : 0.4427480916030534
recall score : 0.3815789473684211
precision score : 0.5272727272727272
thresh : 0.74
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.0836(0.0836) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.0912(0.0736) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0805(0.0672) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0419(0.0647) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7419(0.7419) 


Epoch 4 - avg_train_loss: 0.0647  avg_val_loss: 0.6122  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0647  avg_val_loss: 0.6122  time: 36s
Epoch 4 - Score: 0.7223
INFO:__main__:Epoch 4 - Score: 0.7223
Epoch 4 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8298(0.6122) 
f1 score : 0.3272727272727273
recall score : 0.23684210526315788
precision score : 0.5294117647058824
thresh : 0.79
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.0402(0.0402) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0478(0.0408) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0508(0.0410) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0418(0.0404) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7763(0.7763) 


Epoch 5 - avg_train_loss: 0.0404  avg_val_loss: 0.6312  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0404  avg_val_loss: 0.6312  time: 36s
Epoch 5 - Score: 0.7223
INFO:__main__:Epoch 5 - Score: 0.7223


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8897(0.6312) 
f1 score : 0.3106796116504854
recall score : 0.21052631578947367
precision score : 0.5925925925925926
thresh : 0.73


Score: 0.7022
INFO:__main__:Score: 0.7022
ACC BEST Score: 0.7223
INFO:__main__:ACC BEST Score: 0.7223
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.3272727272727273
recall score : 0.23684210526315788
precision score : 0.5294117647058824
thresh : 0.79


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.6759(0.6759) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6022(0.6251) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.3891(0.6075) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4067(0.6041) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5924(0.5924) 


Epoch 1 - avg_train_loss: 0.6041  avg_val_loss: 0.5876  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6041  avg_val_loss: 0.5876  time: 35s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7298(0.5876) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.35
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.6441(0.6441) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.5079(0.5205) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6616(0.5174) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4820(0.5214) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6418(0.6418) 


Epoch 2 - avg_train_loss: 0.5214  avg_val_loss: 0.5810  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5214  avg_val_loss: 0.5810  time: 36s
Epoch 2 - Score: 0.7324
INFO:__main__:Epoch 2 - Score: 0.7324
Epoch 2 - Save Best Score: 0.7324 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7324 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6426(0.5810) 
f1 score : 0.358974358974359
recall score : 0.27631578947368424
precision score : 0.5121951219512195
thresh : 0.59
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.3461(0.3461) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3005(0.3263) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2392(0.3052) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1710(0.2879) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6667(0.6667) 


Epoch 3 - avg_train_loss: 0.2879  avg_val_loss: 0.5986  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2879  avg_val_loss: 0.5986  time: 36s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6056(0.5986) 
f1 score : 0.3317972350230415
recall score : 0.23684210526315788
precision score : 0.5538461538461539
thresh : 0.59
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.0832(0.0832) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0454(0.0784) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.1492(0.0727) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1163(0.0707) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7184(0.7184) 


Epoch 4 - avg_train_loss: 0.0707  avg_val_loss: 0.6124  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0707  avg_val_loss: 0.6124  time: 36s
Epoch 4 - Score: 0.7284
INFO:__main__:Epoch 4 - Score: 0.7284


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6262(0.6124) 
f1 score : 0.34259259259259256
recall score : 0.24342105263157895
precision score : 0.578125
thresh : 0.56
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.0448(0.0448) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.0372(0.0457) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.0462(0.0443) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0424(0.0445) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7338(0.7338) 


Epoch 5 - avg_train_loss: 0.0445  avg_val_loss: 0.6285  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0445  avg_val_loss: 0.6285  time: 36s
Epoch 5 - Score: 0.7284
INFO:__main__:Epoch 5 - Score: 0.7284


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6801(0.6285) 
f1 score : 0.3054187192118227
recall score : 0.20394736842105263
precision score : 0.6078431372549019
thresh : 0.56


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7324
INFO:__main__:ACC BEST Score: 0.7324
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.358974358974359
recall score : 0.27631578947368424
precision score : 0.5121951219512195
thresh : 0.59


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.6582(0.6582) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.7878(0.6290) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5966(0.6109) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3299(0.6080) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6053(0.6053) 


Epoch 1 - avg_train_loss: 0.6080  avg_val_loss: 0.5941  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6080  avg_val_loss: 0.5941  time: 36s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5136(0.5941) 
f1 score : 0.03870967741935484
recall score : 0.019736842105263157
precision score : 1.0
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.4758(0.4758) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4387(0.5284) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5480(0.5214) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5271(0.5200) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6126(0.6126) 


Epoch 2 - avg_train_loss: 0.5200  avg_val_loss: 0.5889  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5200  avg_val_loss: 0.5889  time: 36s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4478(0.5889) 
f1 score : 0.13095238095238093
recall score : 0.07236842105263158
precision score : 0.6875
thresh : 0.47
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.3323(0.3323) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3099(0.3522) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3122(0.3119) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1370(0.2948) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7233(0.7233) 


Epoch 3 - avg_train_loss: 0.2948  avg_val_loss: 0.6298  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2948  avg_val_loss: 0.6298  time: 36s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3753(0.6298) 
f1 score : 0.2613065326633166
recall score : 0.17105263157894737
precision score : 0.5531914893617021
thresh : 0.62
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.1682(0.1682) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0498(0.0868) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0605(0.0795) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0653(0.0758) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7018(0.7018) 


Epoch 4 - avg_train_loss: 0.0758  avg_val_loss: 0.6206  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0758  avg_val_loss: 0.6206  time: 36s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183
Epoch 4 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3533(0.6206) 
f1 score : 0.351931330472103
recall score : 0.26973684210526316
precision score : 0.5061728395061729
thresh : 0.71
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.0504(0.0504) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0382(0.0443) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.0351(0.0443) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.0506(0.0447) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7145(0.7145) 


Epoch 5 - avg_train_loss: 0.0447  avg_val_loss: 0.6403  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0447  avg_val_loss: 0.6403  time: 36s
Epoch 5 - Score: 0.7163
INFO:__main__:Epoch 5 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3380(0.6403) 
f1 score : 0.29767441860465116
recall score : 0.21052631578947367
precision score : 0.5079365079365079
thresh : 0.61


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183
Score: 0.6948
INFO:__main__:Score: 0.6948
ACC BEST Score: 0.7091
INFO:__main__:ACC BEST Score: 0.7091


f1 score : 0.351931330472103
recall score : 0.26973684210526316
precision score : 0.5061728395061729
thresh : 0.71
f1 score : 0.31187669990933814
recall score : 0.22586999343401182
precision score : 0.5036603221083455
thresh : 0.64


In [22]:
from google.colab import runtime
runtime.unassign()