In [1]:
# Mount Google Drive at /content/drive; every input/output path used below lives under it.
# This cell only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https:/

In [3]:
# Show which GPU (if any) this runtime was given, plus its memory and driver/CUDA versions.
!nvidia-smi

Sat May  6 11:25:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Project layout on Google Drive: competition files are read from input/,
# every artefact produced by the notebook is written below output/.
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
# Local checkpoint directory that CFG.model points to (loaded via from_pretrained below).
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_small_epoch10')
# The trailing slash matters: later cells build file paths by plain string concatenation.
OUTPUT_EXP_DIR = DIR + '/output/EXP072/'
# exist_ok=True replaces the check-then-create pattern and keeps the cell safe to re-run.
os.makedirs(OUTPUT_EXP_DIR, exist_ok=True)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    """Single namespace holding every tunable of the experiment.

    Attributes are read as plain class attributes (``CFG.xxx``) throughout the
    notebook; later cells also attach new attributes (e.g. ``CFG.tokenizer``)
    and overwrite ``CFG.max_len`` with a value measured on the training texts.
    """
    # When True: fewer epochs/folds (see the override below) and a 500-row training sample.
    debug=False
    # Enables mixed precision (GradScaler + autocast) in train_fn.
    apex=True
    # Progress is printed every `print_freq` steps in train_fn / valid_fn.
    print_freq=100
    # Worker processes used by the DataLoaders.
    num_workers=4
    # NOTE(review): model_name is not read in the visible cells; `model` below is what gets loaded.
    model_name="microsoft/deberta-v3-small"
    # Alternative backbones tried previously (kept for reference):
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    # Path/identifier handed to AutoTokenizer / AutoConfig / AutoModel.from_pretrained.
    model = CUSTOM_MODEL_DIR
    # Scheduler settings; presumably consumed by a scheduler factory outside the visible cells.
    scheduler='cosine' # ['linear', 'cosine']
    # When True, train_fn steps the LR scheduler after every optimizer step.
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    # Optimizer hyper-parameters; presumably used where the optimizer is built (not fully visible here).
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    # Training batch size; the validation loader uses twice this value.
    batch_size=16
    # NOTE(review): fc_dropout is not read in the visible cells.
    fc_dropout=0.2
    # Number of output units of the classification head (CustomModel.fc).
    target_size=1
    # Tokenizer max_length; overwritten later with the longest training text + 3.
    max_len=512
    weight_decay=0.01
    # Micro-batches accumulated per optimizer step in train_fn.
    gradient_accumulation_steps=1
    # Gradient-norm clipping threshold used in train_fn.
    max_grad_norm=1000
    # Seed for RNGs, the row shuffle and the fold split.
    seed=42
    # Number of stratified folds, and which of them to train.
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    # NOTE(review): nth_awp_start_epoch is not read in the visible cells; train_fn uses awp_start.
    nth_awp_start_epoch=1
    # CustomModel switches: gradient checkpointing, freezing embeddings + first two
    # encoder layers, and re-initialising the last `num_reinit_layers` encoder layers.
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    # Enables the FGM adversarial branch in train_fn.
    fgm = False
    # First epoch index (compared with `epoch >= awp_start`) at which train_fn calls awp.perturb.
    awp_start=1

# Debug mode: shrink the run to 2 epochs on folds 0 and 1 for a quick smoke test.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs, thresh=0.5):
    """Print F1 / recall / precision and return accuracy at a fixed threshold.

    Args:
        labels: ground-truth binary labels.
        outputs: predicted scores; anything ``np.asarray`` accepts.
        thresh: scores strictly greater than this value are predicted as class 1.
            Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        Accuracy of the thresholded predictions.
    """
    y_true = labels
    # Binarise once instead of once per metric; asarray also makes plain lists work.
    y_pred = (np.asarray(outputs) > thresh).astype(int)
    f_score = f1_score(y_true, y_pred)
    r_score = recall_score(y_true, y_pred)
    p_score = precision_score(y_true, y_pred)
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, y_pred)

def get_acc_score(labels, outputs):
    """Return the best accuracy over a grid of decision thresholds.

    Thresholds 0.10, 0.11, ... (step 0.01, below 0.80) are tried in increasing
    order; the first one reaching the highest accuracy wins and is printed.
    If no threshold scores above 0, the threshold stays at 0.5.
    """
    top_score, top_thresh = 0, 0.5
    for raw in np.arange(0.1, 0.80, 0.01):
        candidate = np.round(raw, 2)
        candidate_score = accuracy_score(labels, (outputs>candidate).astype(int))
        # Strict comparison keeps the earliest threshold on ties.
        if candidate_score > top_score:
            top_score, top_thresh = candidate_score, candidate
    print(f"thresh : {top_thresh}")
    return accuracy_score(labels, (outputs>top_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Return the notebook logger, writing bare messages to stderr and to ``{filename}.log``.

    The function is idempotent: handlers attached by a previous call (for
    example when the cell is re-run) are removed first, so each message is
    emitted exactly once per destination.

    Args:
        filename: log file path without the ``.log`` suffix.

    Returns:
        The configured ``logging.Logger`` named after this module.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Re-running the cell used to stack another pair of handlers on the same
    # logger, duplicating every line; start from a clean slate instead.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    # Do not forward records to the root logger as well, otherwise an environment
    # that configures the root logger prints a second "INFO:__main__:..." copy.
    logger.propagate = False
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports PYTHONHASHSEED and asks cuDNN for deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Turn off gradient tracking for every parameter of ``module``.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)
        
def get_freezed_parameters(module):
    """
    List the names of ``module``'s parameters that do not require gradients,
    in ``named_parameters()`` order.
    """
    return [
        param_name
        for param_name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an optimizer-precision override for the embedding weights.

    For each of ``word_embeddings``, ``position_embeddings`` and
    ``token_type_embeddings`` that exists on ``embeddings_path``, the module's
    ``weight`` is registered with ``{'optim_bits': optim_bits}`` through
    ``bnb.optim.GlobalOptimManager``. Attributes that are absent are skipped.

    Background: https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): ``bnb`` is not imported in the visible cells; it must be
    available as a global before an embeddings module is passed in.
    """
    for prefix in ("word", "position", "token_type"):
        target = f"{prefix}_embeddings"
        if not hasattr(embeddings_path, target):
            continue
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, target), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
# pandas / numpy are already imported in the imports cell; repeated here so the cell runs on its own.
import pandas as pd
import numpy as np

# Competition files: labelled training set, unlabelled test set and the submission template.
train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

# Shuffle the rows reproducibly. reset_index() without drop=True keeps the
# original row position as an extra "index" column (visible in the output below).
train = train.sample(frac=1, random_state=CFG.seed).reset_index()

# Quick look at shape and first rows of each frame.
print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
# Model input is "title[SEP]abstract". A missing title or abstract is treated as an
# empty string: with plain `+`, one NaN field would turn the whole text into NaN,
# and TrainDataset passes this column to the tokenizer without any NaN handling.
train["texts"] = train["title"].fillna("") + "[SEP]" + train["abstract"].fillna("")

In [11]:
# Stratified K-fold on the target `y`: each row gets the id of the fold in which it is
# used for validation. Positional indices from split() are valid .loc labels here
# because the frame's index was reset after shuffling.
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)

# The column is created by assignment into a missing column (float); cast back to int.
train["kfold"] = train["kfold"].astype(int)

# Debug mode: show fold sizes, keep a fixed 500-row sample, and show fold sizes again.
if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that ships with the checkpoint in CFG.model, store a copy next to
# the experiment outputs, and expose it on CFG so prepare_input can reach it via cfg.tokenizer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
# Measure the token length of every training text (without special tokens) and set
# CFG.max_len to the longest one plus a margin of 3, overriding the default in CFG.
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1285.55it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
class AWP:
    """Adversarial Weight Perturbation driven by the optimizer's running gradient average.

    Usage per training step: call ``perturb`` before the forward pass (weights
    are backed up, then nudged along the optimizer's ``exp_avg``), run
    forward/backward on the perturbed weights, then call ``restore`` before
    ``optimizer.step()`` so the update is applied to the original weights.

    Args:
        model: module whose parameters are perturbed.
        optimizer: optimizer whose per-parameter state provides ``exp_avg``
            (Adam-style optimizers). Parameters without that entry are skipped.
        adv_param: substring a parameter name must contain to be perturbed.
        adv_lr: step size, relative to ``|w| / |exp_avg|``.
        adv_eps: maximum relative change per element (``adv_eps * |w|``).
    """
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def _is_target(self, name, param):
        """True for trainable parameters that currently hold a gradient and match ``adv_param``."""
        return param.requires_grad and param.grad is not None and self.adv_param in name

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()

        ``inputs``, ``y`` and ``criterion`` are accepted for interface
        compatibility but are not used by this implementation.
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        """Move each target weight along its ``exp_avg``, clamped to the ``adv_eps`` band."""
        e = 1e-6
        for name, param in self.model.named_parameters():
            if not self._is_target(name, param):
                continue
            # .get() instead of [] for two reasons: optimizer.state is a defaultdict, so
            # indexing would insert an empty entry, and 'exp_avg' does not exist before
            # the first optimizer step or for non-Adam optimizers (previously a KeyError).
            grad = self.optimizer.state.get(param, {}).get('exp_avg')
            if grad is None:
                continue
            norm_grad = torch.norm(grad)
            norm_data = torch.norm(param.detach())

            if norm_grad != 0 and not torch.isnan(norm_grad):
                # Set lower and upper limit in change
                limit_eps = self.adv_eps * param.detach().abs()
                param_min = param.data - limit_eps
                param_max = param.data + limit_eps

                # Perturb along gradient
                # w += (adv_lr * |w| / |grad|) * grad
                param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                # Apply the limit to the change
                param.data.clamp_(param_min, param_max)

    def _save(self):
        """Back up the current value of every target parameter (buffers are reused across calls)."""
        for name, param in self.model.named_parameters():
            if self._is_target(name, param):
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Put the backed-up weights back. AWP does not keep the perturbed weights;
        it only lets them shape the gradients of the current step.
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` with ``cfg.tokenizer`` and convert every field to a long tensor.

    The text is truncated / padded to exactly ``cfg.max_len`` tokens, special
    tokens included. The tokenizer's own return object is updated in place and
    returned.
    """
    encoded = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Map-style dataset over the ``texts`` / ``y`` columns of a DataFrame.

    Each item is ``(tokenized_inputs, label)`` where the inputs come from
    ``prepare_input`` and the label is a float tensor.
    """
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs, self.labels = df['texts'].values, df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        features = prepare_input(self.cfg, self.inputs[item])
        target = torch.tensor(self.labels[item], dtype=torch.float)
        return features, target

def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    ``inputs`` is a mapping of 2-D tensors that includes ``attention_mask``.
    Every tensor is cut to the first ``mask_len`` columns, where ``mask_len``
    is the largest per-row sum of the attention mask. The mapping is modified
    in place and returned.

    Assumes padding sits on the right of each row - TODO confirm against the
    tokenizer's padding_side.
    """
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    # Snapshot the keys: values are reassigned while looping.
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model, num_layers=None):
    """Re-initialise the last ``num_layers`` encoder layers of a backbone in place.

    Linear and Embedding weights are redrawn from
    ``N(0, model.config.initializer_range)``, biases are zeroed, and LayerNorm
    is reset to weight 1 / bias 0.

    Args:
        model: backbone exposing ``encoder.layer`` and ``config.initializer_range``
            (the bare transformer, not the CustomModel wrapper).
        num_layers: how many trailing layers to reset. ``None`` (default) uses
            ``CFG.num_reinit_layers``. Values of 0 or below reset nothing.

    Returns:
        The same ``model`` object.
    """
    if num_layers is None:
        num_layers = CFG.num_reinit_layers
    # Guard: layer[-0:] is the WHOLE list, so 0 would silently reset every layer.
    if num_layers <= 0:
        return model

    std = model.config.initializer_range
    for layer in model.encoder.layer[-num_layers:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=std)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=std)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)

    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp the token count so an all-masked row divides by 1e-9 instead of 0.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return torch.sum(last_hidden_state * mask, 1) / token_count

class MaxPooling(nn.Module):
    """Element-wise maximum over the tokens of each sequence, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        is_padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # Padding positions get a large negative value so they never win the max;
        # masked_fill returns a new tensor, leaving the input untouched.
        masked = last_hidden_state.masked_fill(is_padding, -1e4)
        return torch.max(masked, dim=1).values
    

class CustomModel(nn.Module):
    """Transformer backbone + max pooling + LayerNorm + linear head.

    Args:
        cfg: configuration namespace; reads ``model``, ``gradient_checkpointing``,
            ``freezing`` and ``target_size``.
        config_path: optional path to a config object saved with ``torch.save``.
            When ``None`` the config is loaded from ``cfg.model`` and all
            dropout probabilities are set to 0.
        pretrained: load backbone weights from ``cfg.model`` when True;
            otherwise build a randomly initialised backbone from the config.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Switch off every dropout in the backbone (set under both attribute spellings).
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(security): torch.load unpickles arbitrary objects; only point this
            # at config files produced by this notebook.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds the
            # architecture with freshly initialised weights.
            self.model = AutoModel.from_config(self.config)
        # NOTE(review): the re-init switches are read from the global CFG, not from `cfg`.
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # Lazy, single-use iterator over the parameters that remain trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pool its first output over the sequence using the attention mask."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return raw (pre-sigmoid) outputs of the linear head for a batch of tokenized inputs."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):
    """Run ``model`` on ``inputs`` and score its output against ``labels``.

    Predictions and labels are both reshaped to a single column before being
    passed to ``criterion``. Returns ``(loss, predictions)`` when ``is_valid``
    is true, otherwise just ``loss``. ``device`` is accepted but unused.
    """
    logits = model(inputs)
    loss = criterion(logits.view(-1, 1), labels.view(-1, 1))
    if is_valid:
        return loss, logits
    return loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Running weighted mean: remembers the latest value, the weighted sum, the count and the average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (fractions are truncated)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <remaining>)'`` for a task started at ``since``.

    ``percent`` is the completed fraction (must be non-zero); the remaining
    time is a linear extrapolation of the elapsed time.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    """Train ``model`` for one epoch and return the sample-weighted mean loss.

    Per batch: trim padding, move tensors to ``device``, optionally perturb the
    weights with AWP (from epoch ``CFG.awp_start`` on), run a mixed-precision
    forward/backward, restore the weights, and - once every
    ``CFG.gradient_accumulation_steps`` batches - unscale, clip, step the
    optimizer and (if ``CFG.batch_scheduler``) the scheduler.

    Args:
        fold: fold id (not used inside the function).
        train_loader: yields ``(inputs, labels)`` batches.
        model, criterion, optimizer, scheduler: the usual training objects.
        epoch: zero-based epoch index.
        device: target device for the batch tensors.
        awp: AWP helper providing ``perturb`` / ``restore``.

    Returns:
        Average training loss over the epoch (already divided by the
        accumulation factor when accumulation is enabled).
    """
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Only parameter values are restored; the gradients just computed are kept.
        awp.restore()
        if CFG.fgm:
            # NOTE(review): `fgm` is not defined in the visible cells; it must exist
            # as a global before CFG.fgm is switched on.
            fgm.attack()
            adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
            scaler.scale(adversarial_loss).backward()
            fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Unscale and clip once per optimizer step, after ALL scaled gradients
            # (accumulated micro-batches, adversarial pass) are in place.
            # GradScaler.unscale_ may be called at most once between two update()
            # calls, so doing it on every micro-batch breaks gradient accumulation.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          # get_last_lr() is the supported way to read the current LR.
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader``.

    Args:
        valid_loader: yields ``(inputs, labels)`` batches.
        model: model to evaluate (switched to eval mode here).
        criterion: loss applied to the raw model outputs.
        device: target device for the batch tensors.

    Returns:
        ``(avg_loss, predictions)``: the sample-weighted mean loss and a flat
        1-D NumPy array of sigmoid probabilities in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        # The loss is reported as-is: dividing by the gradient-accumulation factor is a
        # training-only adjustment and would understate the validation loss.
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    # Stack the per-batch outputs, then flatten them to one value per entry.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Predict sigmoid probabilities for every batch of ``test_loader``.

    The model is put in eval mode and moved to ``device``. Batches contain
    inputs only (no labels). Per-batch outputs are concatenated along the
    first axis and returned as a NumPy array.
    """
    batch_probs = []
    model.eval()
    model.to(device)
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train one cross-validation fold and return its out-of-fold predictions.

    Rows of ``folds`` whose ``kfold`` column equals ``fold`` form the validation
    split; all other rows are used for training. After each epoch the model is
    evaluated, and whenever the ``get_acc_score`` value improves, the model
    weights and the validation predictions are saved to
    ``OUTPUT_EXP_DIR + "<model_name>_fold<fold>_best.pth"``.

    The function relies on notebook-level names defined outside of it
    (``CFG``, ``LOGGER``, ``OUTPUT_EXP_DIR``, ``device``, ``TrainDataset``,
    ``CustomModel``, ``AWP``, ``train_fn``, ``valid_fn``, ...).

    Args:
        folds: pandas-style frame with at least the ``kfold`` and ``y`` columns.
        fold: Fold id held out for validation.

    Returns:
        The validation rows of ``folds`` (index reset) with an added ``pred``
        column holding the predictions of the best-scoring epoch.

    Raises:
        ValueError: If ``CFG.scheduler`` is neither ``'linear'`` nor ``'cosine'``.
        RuntimeError: If no epoch improved on the initial best score (for
            example ``CFG.epochs`` is 0), so there are no predictions to return.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    # Validation uses twice the training batch size and keeps the last partial batch.
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        """Build the optimizer parameter groups with layer-wise learning rates.

        Backbone parameters (``model.model``) are split by two criteria:
        whether their name contains a no-decay tag, and which block of layer
        tags their name contains. Layers 0-3 use ``encoder_lr / 2.6``, layers
        4-7 use ``encoder_lr`` and layers 8-11 use ``encoder_lr * 2.6``;
        backbone parameters matching no layer tag get no explicit ``lr`` and
        therefore use the optimizer default. Every remaining parameter whose
        name does not contain ``"model"`` goes into a final group with
        ``decoder_lr``.

        The group order is: decay groups (non-layer, block 1, block 2,
        block 3), then the same four for no-decay, then the head group.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        layer_blocks = [
            (['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.'], encoder_lr/2.6),
            (['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.'], encoder_lr),
            (['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.'], encoder_lr*2.6),
        ]
        all_layer_tags = [tag for tags, _ in layer_blocks for tag in tags]

        def has_any(name, tags):
            """Return True when ``name`` contains at least one of ``tags``."""
            return any(tag in name for tag in tags)

        backbone_params = list(model.model.named_parameters())
        optimizer_parameters = []
        for wants_no_decay, group_weight_decay in ((False, weight_decay), (True, 0.0)):
            selected = [(n, p) for n, p in backbone_params
                        if has_any(n, no_decay) == wants_no_decay]
            # Backbone parameters outside the tagged layers: no explicit lr.
            optimizer_parameters.append(
                {'params': [p for n, p in selected if not has_any(n, all_layer_tags)],
                 'weight_decay': group_weight_decay})
            for tags, group_lr in layer_blocks:
                optimizer_parameters.append(
                    {'params': [p for n, p in selected if has_any(n, tags)],
                     'weight_decay': group_weight_decay,
                     'lr': group_lr})

        # NOTE(review): "momentum" is not one of the arguments passed to AdamW
        # below; presumably the optimizer ignores this key -- confirm or remove.
        optimizer_parameters.append(
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, "momentum": 0.99})
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the warmup scheduler selected by ``cfg.scheduler``.

        Raises:
            ValueError: If ``cfg.scheduler`` is not ``'linear'`` or ``'cosine'``.
        """
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        # Fail loudly instead of hitting an unbound local on return.
        raise ValueError(
            f"Unsupported CFG.scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
        )

    # NOTE(review): derived from the row count, not len(train_loader); with
    # drop_last=True the loader may yield slightly fewer steps -- confirm intended.
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)

    best_score = -1.
    # Predictions of the best epoch are kept in memory so the checkpoint does
    # not have to be read back (and a stale file from an earlier run can never
    # be picked up by accident).
    best_predictions = None
    best_path = OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        # The get_score value is not used below; the call is kept because it
        # may emit diagnostics -- TODO confirm.
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        # Model selection is driven by the get_acc_score value only.
        if best_score < score:
            best_score = score
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    if best_predictions is None:
        raise RuntimeError(
            f"fold {fold}: no epoch produced a best score, so no predictions are available"
        )
    valid_folds['pred'] = best_predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [21]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log both evaluation metrics for a frame of out-of-fold predictions.

        Args:
            oof_df: Frame with the true labels in column ``y`` and the
                predictions in column ``pred``.
        """
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead
        # of repeatedly concatenating onto an initially empty DataFrame.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        # With no trained fold the result stays an empty frame, as before.
        oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dt

Enable AWP
Epoch: [1][0/279] Elapsed 0m 5s (remain 24m 40s) Loss: 0.7165(0.7165) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5578(0.6313) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.4916(0.6160) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.5899(0.6052) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6014(0.6014) 


Epoch 1 - avg_train_loss: 0.6052  avg_val_loss: 0.6128  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6052  avg_val_loss: 0.6128  time: 25s
Epoch 1 - Score: 0.7088
INFO:__main__:Epoch 1 - Score: 0.7088
Epoch 1 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7354(0.6128) 
f1 score : 0.012987012987012988
recall score : 0.006535947712418301
precision score : 1.0
thresh : 0.4
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.4776(0.4776) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.5312(0.5142) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.4715(0.5111) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 18s (remain 0m 0s) Loss: 0.5132(0.5128) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5424(0.5424) 


Epoch 2 - avg_train_loss: 0.5128  avg_val_loss: 0.5804  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5128  avg_val_loss: 0.5804  time: 20s
Epoch 2 - Score: 0.7149
INFO:__main__:Epoch 2 - Score: 0.7149
Epoch 2 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6906(0.5804) 
f1 score : 0.15476190476190477
recall score : 0.08496732026143791
precision score : 0.8666666666666667
thresh : 0.5
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.3155(0.3155) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3487(0.4064) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4446(0.4019) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2651(0.4029) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4998(0.4998) 


Epoch 3 - avg_train_loss: 0.4029  avg_val_loss: 0.5706  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4029  avg_val_loss: 0.5706  time: 20s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7096(0.5706) 
f1 score : 0.2346938775510204
recall score : 0.1503267973856209
precision score : 0.5348837209302325
thresh : 0.46
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.3037(0.3037) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3106(0.3241) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4322(0.3267) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4213(0.3222) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4957(0.4957) 


Epoch 4 - avg_train_loss: 0.3222  avg_val_loss: 0.5746  time: 19s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3222  avg_val_loss: 0.5746  time: 19s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7344(0.5746) 
f1 score : 0.2966507177033493
recall score : 0.20261437908496732
precision score : 0.5535714285714286
thresh : 0.43
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.3661(0.3661) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2078(0.2862) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.2635(0.2850) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2946(0.2860) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4860(0.4860) 


Epoch 5 - avg_train_loss: 0.2860  avg_val_loss: 0.5697  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2860  avg_val_loss: 0.5697  time: 20s
Epoch 5 - Score: 0.7088
INFO:__main__:Epoch 5 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7293(0.5697) 
f1 score : 0.3317972350230415
recall score : 0.23529411764705882
precision score : 0.5625
thresh : 0.5


Score: 0.7149
INFO:__main__:Score: 0.7149
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.15476190476190477
recall score : 0.08496732026143791
precision score : 0.8666666666666667
thresh : 0.5


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.7010(0.7010) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.5764(0.6075) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4705(0.6053) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.6750(0.6050) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.4636(0.4636) 


Epoch 1 - avg_train_loss: 0.6050  avg_val_loss: 0.5860  time: 20s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6050  avg_val_loss: 0.5860  time: 20s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4916(0.5860) 
f1 score : 0.05095541401273886
recall score : 0.026143790849673203
precision score : 1.0
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.5982(0.5982) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4620(0.4999) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.5513(0.5051) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4174(0.5074) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.4491(0.4491) 


Epoch 2 - avg_train_loss: 0.5074  avg_val_loss: 0.5720  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5074  avg_val_loss: 0.5720  time: 20s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4867(0.5720) 
f1 score : 0.2549019607843137
recall score : 0.16993464052287582
precision score : 0.5098039215686274
thresh : 0.57
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.4734(0.4734) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4897(0.4135) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4575(0.4198) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4287(0.4095) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.3894(0.3894) 


Epoch 3 - avg_train_loss: 0.4095  avg_val_loss: 0.5816  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4095  avg_val_loss: 0.5816  time: 20s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4265(0.5816) 
f1 score : 0.18784530386740333
recall score : 0.1111111111111111
precision score : 0.6071428571428571
thresh : 0.36
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.3494(0.3494) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2666(0.3270) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3327(0.3221) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3142(0.3256) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.3948(0.3948) 


Epoch 4 - avg_train_loss: 0.3256  avg_val_loss: 0.5714  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3256  avg_val_loss: 0.5714  time: 20s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4381(0.5714) 
f1 score : 0.2512562814070352
recall score : 0.16339869281045752
precision score : 0.5434782608695652
thresh : 0.4
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.4058(0.4058) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3772(0.2919) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.2803(0.2900) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2683(0.2930) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.4034(0.4034) 


Epoch 5 - avg_train_loss: 0.2930  avg_val_loss: 0.5714  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2930  avg_val_loss: 0.5714  time: 20s
Epoch 5 - Score: 0.7129
INFO:__main__:Epoch 5 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4449(0.5714) 
f1 score : 0.26794258373205737
recall score : 0.1830065359477124
precision score : 0.5
thresh : 0.43


Score: 0.7008
INFO:__main__:Score: 0.7008
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.05095541401273886
recall score : 0.026143790849673203
precision score : 1.0
thresh : 0.41


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.6672(0.6672) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.7447(0.6150) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.5992(0.6131) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.5894(0.6077) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5565(0.5565) 


Epoch 1 - avg_train_loss: 0.6077  avg_val_loss: 0.5905  time: 19s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6077  avg_val_loss: 0.5905  time: 19s
Epoch 1 - Score: 0.7209
INFO:__main__:Epoch 1 - Score: 0.7209
Epoch 1 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4490(0.5905) 
f1 score : 0.13173652694610777
recall score : 0.0718954248366013
precision score : 0.7857142857142857
thresh : 0.44
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.4950(0.4950) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3672(0.5256) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 12s (remain 0m 5s) Loss: 0.6061(0.5170) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4371(0.5092) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5225(0.5225) 


Epoch 2 - avg_train_loss: 0.5092  avg_val_loss: 0.5780  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5092  avg_val_loss: 0.5780  time: 20s
Epoch 2 - Score: 0.7249
INFO:__main__:Epoch 2 - Score: 0.7249
Epoch 2 - Save Best Score: 0.7249 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7249 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3467(0.5780) 
f1 score : 0.15294117647058825
recall score : 0.08496732026143791
precision score : 0.7647058823529411
thresh : 0.39
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 11s) Loss: 0.5284(0.5284) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.5483(0.4184) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.2872(0.4079) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4043(0.4040) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5285(0.5285) 


Epoch 3 - avg_train_loss: 0.4040  avg_val_loss: 0.5757  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4040  avg_val_loss: 0.5757  time: 20s
Epoch 3 - Score: 0.7289
INFO:__main__:Epoch 3 - Score: 0.7289
Epoch 3 - Save Best Score: 0.7289 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7289 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3711(0.5757) 
f1 score : 0.3982300884955752
recall score : 0.29411764705882354
precision score : 0.6164383561643836
thresh : 0.53
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.4120(0.4120) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3265(0.3287) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 5s) Loss: 0.3954(0.3213) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3003(0.3203) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5178(0.5178) 


Epoch 4 - avg_train_loss: 0.3203  avg_val_loss: 0.5809  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3203  avg_val_loss: 0.5809  time: 20s
Epoch 4 - Score: 0.7209
INFO:__main__:Epoch 4 - Score: 0.7209


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3219(0.5809) 
f1 score : 0.2421052631578947
recall score : 0.1503267973856209
precision score : 0.6216216216216216
thresh : 0.45
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.2660(0.2660) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.1973(0.2832) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.2407(0.2877) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2193(0.2844) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5153(0.5153) 


Epoch 5 - avg_train_loss: 0.2844  avg_val_loss: 0.5765  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2844  avg_val_loss: 0.5765  time: 20s
Epoch 5 - Score: 0.7229
INFO:__main__:Epoch 5 - Score: 0.7229


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3292(0.5765) 
f1 score : 0.297029702970297
recall score : 0.19607843137254902
precision score : 0.6122448979591837
thresh : 0.48


Score: 0.7269
INFO:__main__:Score: 0.7269
ACC BEST Score: 0.7289
INFO:__main__:ACC BEST Score: 0.7289
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.3982300884955752
recall score : 0.29411764705882354
precision score : 0.6164383561643836
thresh : 0.53


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 21s) Loss: 0.5362(0.5362) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.7148(0.5977) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.8759(0.6063) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4669(0.6056) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6602(0.6602) 


Epoch 1 - avg_train_loss: 0.6056  avg_val_loss: 0.5847  time: 19s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6056  avg_val_loss: 0.5847  time: 19s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5199(0.5847) 
f1 score : 0.03870967741935484
recall score : 0.019736842105263157
precision score : 1.0
thresh : 0.45
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5784(0.5784) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.5603(0.5158) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4765(0.5101) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3737(0.5094) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6707(0.6707) 


Epoch 2 - avg_train_loss: 0.5094  avg_val_loss: 0.5716  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5094  avg_val_loss: 0.5716  time: 20s
Epoch 2 - Score: 0.7189
INFO:__main__:Epoch 2 - Score: 0.7189
Epoch 2 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4965(0.5716) 
f1 score : 0.08750000000000001
recall score : 0.046052631578947366
precision score : 0.875
thresh : 0.36
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.4007(0.4007) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3427(0.4061) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 12s (remain 0m 5s) Loss: 0.4282(0.4093) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3124(0.4072) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6558(0.6558) 


Epoch 3 - avg_train_loss: 0.4072  avg_val_loss: 0.5603  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4072  avg_val_loss: 0.5603  time: 20s
Epoch 3 - Score: 0.7149
INFO:__main__:Epoch 3 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5077(0.5603) 
f1 score : 0.21621621621621623
recall score : 0.13157894736842105
precision score : 0.6060606060606061
thresh : 0.41
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.2687(0.2687) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2720(0.3257) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3014(0.3214) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.1955(0.3196) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6561(0.6561) 


Epoch 4 - avg_train_loss: 0.3196  avg_val_loss: 0.5583  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3196  avg_val_loss: 0.5583  time: 20s
Epoch 4 - Score: 0.7209
INFO:__main__:Epoch 4 - Score: 0.7209
Epoch 4 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4996(0.5583) 
f1 score : 0.3076923076923077
recall score : 0.21052631578947367
precision score : 0.5714285714285714
thresh : 0.46
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.3802(0.3802) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3399(0.2844) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.4381(0.2875) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3195(0.2834) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6485(0.6485) 


Epoch 5 - avg_train_loss: 0.2834  avg_val_loss: 0.5570  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2834  avg_val_loss: 0.5570  time: 20s
Epoch 5 - Score: 0.7189
INFO:__main__:Epoch 5 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5093(0.5570) 
f1 score : 0.33027522935779813
recall score : 0.23684210526315788
precision score : 0.5454545454545454
thresh : 0.46


Score: 0.7108
INFO:__main__:Score: 0.7108
ACC BEST Score: 0.7209
INFO:__main__:ACC BEST Score: 0.7209
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.3076923076923077
recall score : 0.21052631578947367
precision score : 0.5714285714285714
thresh : 0.46


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 21s) Loss: 0.7112(0.7112) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.5325(0.6280) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.6439(0.6171) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.5875(0.6085) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5343(0.5343) 


Epoch 1 - avg_train_loss: 0.6085  avg_val_loss: 0.5889  time: 20s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6085  avg_val_loss: 0.5889  time: 20s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5305(0.5889) 
f1 score : 0.07407407407407407
recall score : 0.039473684210526314
precision score : 0.6
thresh : 0.42
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5745(0.5745) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4287(0.5333) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.4715(0.5157) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 18s (remain 0m 0s) Loss: 0.6431(0.5129) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5526(0.5526) 


Epoch 2 - avg_train_loss: 0.5129  avg_val_loss: 0.5826  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5129  avg_val_loss: 0.5826  time: 20s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5397(0.5826) 
f1 score : 0.3486238532110092
recall score : 0.25
precision score : 0.5757575757575758
thresh : 0.56
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.3892(0.3892) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4270(0.4118) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3313(0.4079) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3791(0.4084) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5325(0.5325) 


Epoch 3 - avg_train_loss: 0.4084  avg_val_loss: 0.5741  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4084  avg_val_loss: 0.5741  time: 20s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4819(0.5741) 
f1 score : 0.227027027027027
recall score : 0.13815789473684212
precision score : 0.6363636363636364
thresh : 0.45
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.2732(0.2732) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3869(0.3365) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4201(0.3252) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2545(0.3246) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5334(0.5334) 


Epoch 4 - avg_train_loss: 0.3246  avg_val_loss: 0.5710  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3246  avg_val_loss: 0.5710  time: 20s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4796(0.5710) 
f1 score : 0.2383419689119171
recall score : 0.1513157894736842
precision score : 0.5609756097560976
thresh : 0.57
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.2994(0.2994) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2846(0.2869) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3305(0.2837) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3724(0.2888) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5345(0.5345) 


Epoch 5 - avg_train_loss: 0.2888  avg_val_loss: 0.5717  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2888  avg_val_loss: 0.5717  time: 20s
Epoch 5 - Score: 0.7143
INFO:__main__:Epoch 5 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4761(0.5717) 
f1 score : 0.24742268041237112
recall score : 0.15789473684210525
precision score : 0.5714285714285714
thresh : 0.56


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.07407407407407407
recall score : 0.039473684210526314
precision score : 0.6
thresh : 0.42


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 21s) Loss: 0.6638(0.6638) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.7131(0.6216) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.7779(0.6125) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.5256(0.6067) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6536(0.6536) 


Epoch 1 - avg_train_loss: 0.6067  avg_val_loss: 0.6112  time: 19s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6067  avg_val_loss: 0.6112  time: 19s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6506(0.6112) 
f1 score : 0.29357798165137616
recall score : 0.21052631578947367
precision score : 0.48484848484848486
thresh : 0.55
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.5496(0.5496) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3351(0.5179) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4715(0.5126) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4330(0.5110) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6917(0.6917) 


Epoch 2 - avg_train_loss: 0.5110  avg_val_loss: 0.5807  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5110  avg_val_loss: 0.5807  time: 20s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082
Epoch 2 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7026(0.5807) 
f1 score : 0.26
recall score : 0.17105263157894737
precision score : 0.5416666666666666
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.5536(0.5536) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4747(0.4139) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.3440(0.4116) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.5379(0.4062) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6819(0.6819) 


Epoch 3 - avg_train_loss: 0.4062  avg_val_loss: 0.5893  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4062  avg_val_loss: 0.5893  time: 20s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7990(0.5893) 
f1 score : 0.12790697674418605
recall score : 0.07236842105263158
precision score : 0.55
thresh : 0.61
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 24s) Loss: 0.3632(0.3632) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2362(0.3322) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3689(0.3228) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2374(0.3206) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7029(0.7029) 


Epoch 4 - avg_train_loss: 0.3206  avg_val_loss: 0.5844  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3206  avg_val_loss: 0.5844  time: 20s
Epoch 4 - Score: 0.7062
INFO:__main__:Epoch 4 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7461(0.5844) 
f1 score : 0.3013698630136986
recall score : 0.21710526315789475
precision score : 0.4925373134328358
thresh : 0.65
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.2869(0.2869) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2954(0.2838) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3790(0.2792) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2697(0.2845) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7115(0.7115) 


Epoch 5 - avg_train_loss: 0.2845  avg_val_loss: 0.5881  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2845  avg_val_loss: 0.5881  time: 20s
Epoch 5 - Score: 0.7042
INFO:__main__:Epoch 5 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7693(0.5881) 
f1 score : 0.2710280373831776
recall score : 0.19078947368421054
precision score : 0.46774193548387094
thresh : 0.65


Score: 0.7022
INFO:__main__:Score: 0.7022
ACC BEST Score: 0.7082
INFO:__main__:ACC BEST Score: 0.7082
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.26
recall score : 0.17105263157894737
precision score : 0.5416666666666666
thresh : 0.53


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.5984(0.5984) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.6525(0.6156) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.5790(0.6073) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.6589(0.6070) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6165(0.6165) 


Epoch 1 - avg_train_loss: 0.6070  avg_val_loss: 0.5914  time: 19s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6070  avg_val_loss: 0.5914  time: 19s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5326(0.5914) 
f1 score : 0.05128205128205127
recall score : 0.02631578947368421
precision score : 1.0
thresh : 0.4
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.4480(0.4480) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4337(0.4942) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.4603(0.5027) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.5904(0.5054) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5459(0.5459) 


Epoch 2 - avg_train_loss: 0.5054  avg_val_loss: 0.5767  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5054  avg_val_loss: 0.5767  time: 20s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5556(0.5767) 
f1 score : 0.2087912087912088
recall score : 0.125
precision score : 0.6333333333333333
thresh : 0.49
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.3636(0.3636) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3817(0.4084) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3057(0.3987) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3540(0.3958) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5161(0.5161) 


Epoch 3 - avg_train_loss: 0.3958  avg_val_loss: 0.5747  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3958  avg_val_loss: 0.5747  time: 20s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183
Epoch 3 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5900(0.5747) 
f1 score : 0.324561403508772
recall score : 0.24342105263157895
precision score : 0.4868421052631579
thresh : 0.56
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.2577(0.2577) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4250(0.3110) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 5s) Loss: 0.3058(0.3099) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2923(0.3117) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5222(0.5222) 


Epoch 4 - avg_train_loss: 0.3117  avg_val_loss: 0.5761  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3117  avg_val_loss: 0.5761  time: 20s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5674(0.5761) 
f1 score : 0.30331753554502366
recall score : 0.21052631578947367
precision score : 0.5423728813559322
thresh : 0.56
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.2477(0.2477) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2639(0.2744) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.2577(0.2735) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3663(0.2759) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5163(0.5163) 


Epoch 5 - avg_train_loss: 0.2759  avg_val_loss: 0.5773  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2759  avg_val_loss: 0.5773  time: 20s
Epoch 5 - Score: 0.7203
INFO:__main__:Epoch 5 - Score: 0.7203
Epoch 5 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5696(0.5773) 
f1 score : 0.29523809523809524
recall score : 0.20394736842105263
precision score : 0.5344827586206896
thresh : 0.59


Score: 0.7022
INFO:__main__:Score: 0.7022
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.29523809523809524
recall score : 0.20394736842105263
precision score : 0.5344827586206896
thresh : 0.59


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.8093(0.8093) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.7741(0.6258) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.6567(0.6117) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.5670(0.6086) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6778(0.6778) 


Epoch 1 - avg_train_loss: 0.6086  avg_val_loss: 0.5894  time: 20s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6086  avg_val_loss: 0.5894  time: 20s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7537(0.5894) 
f1 score : 0.08750000000000001
recall score : 0.046052631578947366
precision score : 0.875
thresh : 0.45
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.4842(0.4842) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.5816(0.5110) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.7145(0.5168) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 18s (remain 0m 0s) Loss: 0.4832(0.5154) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7033(0.7033) 


Epoch 2 - avg_train_loss: 0.5154  avg_val_loss: 0.5790  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5154  avg_val_loss: 0.5790  time: 20s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7533(0.5790) 
f1 score : 0.192090395480226
recall score : 0.1118421052631579
precision score : 0.68
thresh : 0.54
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.4241(0.4241) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3088(0.4153) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.5252(0.4110) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 18s (remain 0m 0s) Loss: 0.5087(0.4113) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7306(0.7306) 


Epoch 3 - avg_train_loss: 0.4113  avg_val_loss: 0.5706  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4113  avg_val_loss: 0.5706  time: 20s
Epoch 3 - Score: 0.7284
INFO:__main__:Epoch 3 - Score: 0.7284
Epoch 3 - Save Best Score: 0.7284 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7284 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7365(0.5706) 
f1 score : 0.30769230769230776
recall score : 0.19736842105263158
precision score : 0.6976744186046512
thresh : 0.49
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 21s) Loss: 0.3183(0.3183) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2743(0.3334) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.4499(0.3316) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2798(0.3307) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7433(0.7433) 


Epoch 4 - avg_train_loss: 0.3307  avg_val_loss: 0.5685  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3307  avg_val_loss: 0.5685  time: 20s
Epoch 4 - Score: 0.7304
INFO:__main__:Epoch 4 - Score: 0.7304
Epoch 4 - Save Best Score: 0.7304 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7304 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7378(0.5685) 
f1 score : 0.32160804020100503
recall score : 0.21052631578947367
precision score : 0.6808510638297872
thresh : 0.51
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.3550(0.3550) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2997(0.2949) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 5s) Loss: 0.2893(0.2974) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2822(0.2970) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7509(0.7509) 


Epoch 5 - avg_train_loss: 0.2970  avg_val_loss: 0.5691  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2970  avg_val_loss: 0.5691  time: 20s
Epoch 5 - Score: 0.7304
INFO:__main__:Epoch 5 - Score: 0.7304


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7453(0.5691) 
f1 score : 0.32323232323232326
recall score : 0.21052631578947367
precision score : 0.6956521739130435
thresh : 0.5


Score: 0.7284
INFO:__main__:Score: 0.7284
ACC BEST Score: 0.7304
INFO:__main__:ACC BEST Score: 0.7304
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.32160804020100503
recall score : 0.21052631578947367
precision score : 0.6808510638297872
thresh : 0.51


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.6850(0.6850) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4757(0.6149) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.5526(0.6168) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.6301(0.6128) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6136(0.6136) 


Epoch 1 - avg_train_loss: 0.6128  avg_val_loss: 0.5928  time: 20s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6128  avg_val_loss: 0.5928  time: 20s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6935(0.5928) 
f1 score : 0.09937888198757765
recall score : 0.05263157894736842
precision score : 0.8888888888888888
thresh : 0.51
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.4802(0.4802) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.5866(0.5257) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.3493(0.5203) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 18s (remain 0m 0s) Loss: 0.4664(0.5201) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6172(0.6172) 


Epoch 2 - avg_train_loss: 0.5201  avg_val_loss: 0.5712  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5201  avg_val_loss: 0.5712  time: 20s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163
Epoch 2 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6893(0.5712) 
f1 score : 0.2842639593908629
recall score : 0.18421052631578946
precision score : 0.6222222222222222
thresh : 0.5
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.3996(0.3996) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3036(0.4200) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.2677(0.4125) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4548(0.4108) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5854(0.5854) 


Epoch 3 - avg_train_loss: 0.4108  avg_val_loss: 0.5578  time: 20s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4108  avg_val_loss: 0.5578  time: 20s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183
Epoch 3 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7089(0.5578) 
f1 score : 0.2577319587628866
recall score : 0.16447368421052633
precision score : 0.5952380952380952
thresh : 0.58
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.2585(0.2585) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3161(0.3290) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3347(0.3317) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3304(0.3275) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5860(0.5860) 


Epoch 4 - avg_train_loss: 0.3275  avg_val_loss: 0.5555  time: 20s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3275  avg_val_loss: 0.5555  time: 20s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7090(0.5555) 
f1 score : 0.30392156862745096
recall score : 0.20394736842105263
precision score : 0.5961538461538461
thresh : 0.5
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.2768(0.2768) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2657(0.2975) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.3379(0.2926) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3439(0.2927) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5887(0.5887) 


Epoch 5 - avg_train_loss: 0.2927  avg_val_loss: 0.5584  time: 19s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2927  avg_val_loss: 0.5584  time: 19s
Epoch 5 - Score: 0.7163
INFO:__main__:Epoch 5 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7131(0.5584) 
f1 score : 0.2755102040816326
recall score : 0.17763157894736842
precision score : 0.6136363636363636
thresh : 0.44


Score: 0.7103
INFO:__main__:Score: 0.7103
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_small_epoch10",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.2577319587628866
recall score : 0.16447368421052633
precision score : 0.5952380952380952
thresh : 0.58


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_small_epoch10 were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.7026(0.7026) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.6680(0.6359) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.6126(0.6154) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.4842(0.6054) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6271(0.6271) 


Epoch 1 - avg_train_loss: 0.6054  avg_val_loss: 0.5973  time: 19s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6054  avg_val_loss: 0.5973  time: 19s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5241(0.5973) 
f1 score : 0.013071895424836602
recall score : 0.006578947368421052
precision score : 1.0
thresh : 0.36
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.6204(0.6204) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.4664(0.5160) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 13s (remain 0m 5s) Loss: 0.5155(0.5200) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 18s (remain 0m 0s) Loss: 0.5135(0.5160) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6476(0.6476) 


Epoch 2 - avg_train_loss: 0.5160  avg_val_loss: 0.5856  time: 20s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5160  avg_val_loss: 0.5856  time: 20s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5273(0.5856) 
f1 score : 0.19889502762430936
recall score : 0.11842105263157894
precision score : 0.6206896551724138
thresh : 0.46
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 21s) Loss: 0.4356(0.4356) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3832(0.4154) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.4802(0.4128) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3588(0.4145) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6617(0.6617) 


Epoch 3 - avg_train_loss: 0.4145  avg_val_loss: 0.5844  time: 19s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4145  avg_val_loss: 0.5844  time: 19s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5295(0.5844) 
f1 score : 0.27586206896551724
recall score : 0.18421052631578946
precision score : 0.5490196078431373
thresh : 0.54
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.4371(0.4371) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.3088(0.3367) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 12s (remain 0m 4s) Loss: 0.2676(0.3341) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3529(0.3316) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6753(0.6753) 


Epoch 4 - avg_train_loss: 0.3316  avg_val_loss: 0.5896  time: 19s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3316  avg_val_loss: 0.5896  time: 19s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203
Epoch 4 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5013(0.5896) 
f1 score : 0.24742268041237112
recall score : 0.15789473684210525
precision score : 0.5714285714285714
thresh : 0.53
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 24s) Loss: 0.2757(0.2757) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 6s (remain 0m 11s) Loss: 0.2667(0.2957) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 12s (remain 0m 5s) Loss: 0.2964(0.2947) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3162(0.2960) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6801(0.6801) 


Epoch 5 - avg_train_loss: 0.2960  avg_val_loss: 0.5865  time: 20s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2960  avg_val_loss: 0.5865  time: 20s
Epoch 5 - Score: 0.7183
INFO:__main__:Epoch 5 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5107(0.5865) 
f1 score : 0.28571428571428575
recall score : 0.19736842105263158
precision score : 0.5172413793103449
thresh : 0.59


Score: 0.7062
INFO:__main__:Score: 0.7062
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
Score: 0.7101
INFO:__main__:Score: 0.7101
ACC BEST Score: 0.7125
INFO:__main__:ACC BEST Score: 0.7125


f1 score : 0.24742268041237112
recall score : 0.15789473684210525
precision score : 0.5714285714285714
thresh : 0.53
f1 score : 0.24817518248175183
recall score : 0.15627051871306633
precision score : 0.6025316455696202
thresh : 0.52


In [None]:
# Final cell: release the Colab runtime backing this notebook once every
# preceding cell has finished, so the VM is not left allocated after the run.
# NOTE(review): the notebook mounts Google Drive in its first cell and the
# training logs above report saving models; presumably those writes are fully
# synced before the VM goes away — confirm, or consider calling
# drive.flush_and_unmount() before unassigning.
from google.colab import runtime
runtime.unassign()