In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https:/

In [3]:
!nvidia-smi

Thu May  4 19:47:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    43W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_base_epoch10')
OUTPUT_EXP_DIR = DIR + '/output/EXP065/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

 11%|█         | 557/4974 [00:00<00:03, 1396.86it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 4974/4974 [00:03<00:00, 1465.19it/s]
max_len: 653
INFO:__main__:max_len: 653


In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 0,
  "vocab_size": 5

Enable AWP
Epoch: [1][0/279] Elapsed 0m 3s (remain 16m 50s) Loss: 0.9129(0.9129) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.5915(0.6014) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 33s (remain 0m 13s) Loss: 0.5195(0.5964) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 44s (remain 0m 0s) Loss: 0.5301(0.5974) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.5939(0.5939) 


Epoch 1 - avg_train_loss: 0.5974  avg_val_loss: 0.5820  time: 49s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5974  avg_val_loss: 0.5820  time: 49s
Epoch 1 - Score: 0.7189
INFO:__main__:Epoch 1 - Score: 0.7189
Epoch 1 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6540(0.5820) 
f1 score : 0.11976047904191617
recall score : 0.06535947712418301
precision score : 0.7142857142857143
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.5055(0.5055) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.3406(0.4182) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.4513(0.3969) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.4235(0.3926) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5670(0.5670) 


Epoch 2 - avg_train_loss: 0.3926  avg_val_loss: 0.5897  time: 47s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3926  avg_val_loss: 0.5897  time: 47s
Epoch 2 - Score: 0.7189
INFO:__main__:Epoch 2 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6095(0.5897) 
f1 score : 0.3363636363636363
recall score : 0.24183006535947713
precision score : 0.5522388059701493
thresh : 0.56
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.0948(0.0948) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.1103(0.1142) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0642(0.1029) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0703(0.0994) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5402(0.5402) 


Epoch 3 - avg_train_loss: 0.0994  avg_val_loss: 0.6079  time: 46s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0994  avg_val_loss: 0.6079  time: 46s
Epoch 3 - Score: 0.7169
INFO:__main__:Epoch 3 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7004(0.6079) 
f1 score : 0.37795275590551175
recall score : 0.3137254901960784
precision score : 0.4752475247524752
thresh : 0.66
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.0346(0.0346) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0244(0.0331) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0262(0.0316) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0271(0.0314) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5827(0.5827) 


Epoch 4 - avg_train_loss: 0.0314  avg_val_loss: 0.6651  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0314  avg_val_loss: 0.6651  time: 46s
Epoch 4 - Score: 0.7189
INFO:__main__:Epoch 4 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7782(0.6651) 
f1 score : 0.3425925925925926
recall score : 0.24183006535947713
precision score : 0.5873015873015873
thresh : 0.56
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.0184(0.0184) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0192(0.0230) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0217(0.0233) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0178(0.0234) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.5866(0.5866) 


Epoch 5 - avg_train_loss: 0.0234  avg_val_loss: 0.6843  time: 47s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0234  avg_val_loss: 0.6843  time: 47s
Epoch 5 - Score: 0.7229
INFO:__main__:Epoch 5 - Score: 0.7229
Epoch 5 - Save Best Score: 0.7229 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7229 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.8123(0.6843) 
f1 score : 0.3425925925925926
recall score : 0.24183006535947713
precision score : 0.5873015873015873
thresh : 0.56


Score: 0.7149
INFO:__main__:Score: 0.7149
ACC BEST Score: 0.7229
INFO:__main__:ACC BEST Score: 0.7229
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.3425925925925926
recall score : 0.24183006535947713
precision score : 0.5873015873015873
thresh : 0.56


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 1s) Loss: 0.7534(0.7534) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.6105(0.6457) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.6525(0.6189) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4849(0.6078) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4226(0.4226) 


Epoch 1 - avg_train_loss: 0.6078  avg_val_loss: 0.5817  time: 46s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6078  avg_val_loss: 0.5817  time: 46s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4988(0.5817) 
f1 score : 0.09937888198757763
recall score : 0.05228758169934641
precision score : 1.0
thresh : 0.43
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.4222(0.4222) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 26s) Loss: 0.3851(0.4205) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 29s (remain 0m 11s) Loss: 0.2222(0.3977) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.6541(0.3867) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3501(0.3501) 


Epoch 2 - avg_train_loss: 0.3867  avg_val_loss: 0.6055  time: 46s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3867  avg_val_loss: 0.6055  time: 46s
Epoch 2 - Score: 0.7108
INFO:__main__:Epoch 2 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5305(0.6055) 
f1 score : 0.30841121495327106
recall score : 0.21568627450980393
precision score : 0.5409836065573771
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.1326(0.1326) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0870(0.0905) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 31s (remain 0m 12s) Loss: 0.0391(0.0752) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0278(0.0685) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3466(0.3466) 


Epoch 3 - avg_train_loss: 0.0685  avg_val_loss: 0.6508  time: 47s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0685  avg_val_loss: 0.6508  time: 47s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5044(0.6508) 
f1 score : 0.28708133971291866
recall score : 0.19607843137254902
precision score : 0.5357142857142857
thresh : 0.33
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0203(0.0203) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0193(0.0169) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0154(0.0161) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0269(0.0162) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3269(0.3269) 


Epoch 4 - avg_train_loss: 0.0162  avg_val_loss: 0.6849  time: 47s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0162  avg_val_loss: 0.6849  time: 47s
Epoch 4 - Score: 0.7129
INFO:__main__:Epoch 4 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5226(0.6849) 
f1 score : 0.30841121495327106
recall score : 0.21568627450980393
precision score : 0.5409836065573771
thresh : 0.45
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.0144(0.0144) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0073(0.0120) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0133(0.0129) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0098(0.0128) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3279(0.3279) 


Epoch 5 - avg_train_loss: 0.0128  avg_val_loss: 0.7050  time: 46s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0128  avg_val_loss: 0.7050  time: 46s
Epoch 5 - Score: 0.7129
INFO:__main__:Epoch 5 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5254(0.7050) 
f1 score : 0.2801932367149758
recall score : 0.1895424836601307
precision score : 0.5370370370370371
thresh : 0.43


Score: 0.7088
INFO:__main__:Score: 0.7088
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.09937888198757763
recall score : 0.05228758169934641
precision score : 1.0
thresh : 0.43


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 2s) Loss: 0.6160(0.6160) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.6031(0.6153) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 29s (remain 0m 11s) Loss: 0.5552(0.6030) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5787(0.5990) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5675(0.5675) 


Epoch 1 - avg_train_loss: 0.5990  avg_val_loss: 0.5810  time: 46s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5990  avg_val_loss: 0.5810  time: 46s
Epoch 1 - Score: 0.7088
INFO:__main__:Epoch 1 - Score: 0.7088
Epoch 1 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4041(0.5810) 
f1 score : 0.13793103448275862
recall score : 0.0784313725490196
precision score : 0.5714285714285714
thresh : 0.44
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.5478(0.5478) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 26s) Loss: 0.3922(0.4729) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.3437(0.4542) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.3340(0.4460) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5721(0.5721) 


Epoch 2 - avg_train_loss: 0.4460  avg_val_loss: 0.5869  time: 46s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4460  avg_val_loss: 0.5869  time: 46s
Epoch 2 - Score: 0.7088
INFO:__main__:Epoch 2 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3481(0.5869) 
f1 score : 0.22340425531914893
recall score : 0.13725490196078433
precision score : 0.6
thresh : 0.47
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.1485(0.1485) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.1247(0.2062) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.1564(0.1876) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.1666(0.1802) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6101(0.6101) 


Epoch 3 - avg_train_loss: 0.1802  avg_val_loss: 0.6009  time: 46s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1802  avg_val_loss: 0.6009  time: 46s
Epoch 3 - Score: 0.7149
INFO:__main__:Epoch 3 - Score: 0.7149
Epoch 3 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3769(0.6009) 
f1 score : 0.32432432432432434
recall score : 0.23529411764705882
precision score : 0.5217391304347826
thresh : 0.42
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0565(0.0565) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 26s) Loss: 0.0677(0.0554) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 29s (remain 0m 11s) Loss: 0.0569(0.0534) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0523(0.0527) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6435(0.6435) 


Epoch 4 - avg_train_loss: 0.0527  avg_val_loss: 0.6229  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0527  avg_val_loss: 0.6229  time: 46s
Epoch 4 - Score: 0.7189
INFO:__main__:Epoch 4 - Score: 0.7189
Epoch 4 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3652(0.6229) 
f1 score : 0.34361233480176206
recall score : 0.2549019607843137
precision score : 0.527027027027027
thresh : 0.54
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.0289(0.0289) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0401(0.0379) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0308(0.0373) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0365(0.0371) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6550(0.6550) 


Epoch 5 - avg_train_loss: 0.0371  avg_val_loss: 0.6332  time: 47s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0371  avg_val_loss: 0.6332  time: 47s
Epoch 5 - Score: 0.7169
INFO:__main__:Epoch 5 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.3562(0.6332) 
f1 score : 0.34703196347031967
recall score : 0.24836601307189543
precision score : 0.5757575757575758
thresh : 0.42


Score: 0.7008
INFO:__main__:Score: 0.7008
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.34361233480176206
recall score : 0.2549019607843137
precision score : 0.527027027027027
thresh : 0.54


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.6069(0.6069) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.7778(0.6241) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.5238(0.6095) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.7550(0.5993) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6843(0.6843) 


Epoch 1 - avg_train_loss: 0.5993  avg_val_loss: 0.5853  time: 46s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5993  avg_val_loss: 0.5853  time: 46s
Epoch 1 - Score: 0.7068
INFO:__main__:Epoch 1 - Score: 0.7068
Epoch 1 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5592(0.5853) 
f1 score : 0.10714285714285712
recall score : 0.05921052631578947
precision score : 0.5625
thresh : 0.32
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.4630(0.4630) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.5307(0.4250) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.3703(0.3972) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.3950(0.3797) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6403(0.6403) 


Epoch 2 - avg_train_loss: 0.3797  avg_val_loss: 0.5968  time: 46s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3797  avg_val_loss: 0.5968  time: 46s
Epoch 2 - Score: 0.7088
INFO:__main__:Epoch 2 - Score: 0.7088
Epoch 2 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6138(0.5968) 
f1 score : 0.3489361702127659
recall score : 0.26973684210526316
precision score : 0.4939759036144578
thresh : 0.62
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.1640(0.1640) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0828(0.1096) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0492(0.0937) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0719(0.0884) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7322(0.7322) 


Epoch 3 - avg_train_loss: 0.0884  avg_val_loss: 0.6234  time: 46s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0884  avg_val_loss: 0.6234  time: 46s
Epoch 3 - Score: 0.7209
INFO:__main__:Epoch 3 - Score: 0.7209
Epoch 3 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5223(0.6234) 
f1 score : 0.29292929292929293
recall score : 0.19078947368421054
precision score : 0.6304347826086957
thresh : 0.52
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 1s) Loss: 0.0259(0.0259) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 28s) Loss: 0.0253(0.0301) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0306(0.0292) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0247(0.0286) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7521(0.7521) 


Epoch 4 - avg_train_loss: 0.0286  avg_val_loss: 0.6474  time: 47s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0286  avg_val_loss: 0.6474  time: 47s
Epoch 4 - Score: 0.7249
INFO:__main__:Epoch 4 - Score: 0.7249
Epoch 4 - Save Best Score: 0.7249 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7249 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4991(0.6474) 
f1 score : 0.2944162436548224
recall score : 0.19078947368421054
precision score : 0.6444444444444445
thresh : 0.51
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.0214(0.0214) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 26s) Loss: 0.0206(0.0218) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 29s (remain 0m 11s) Loss: 0.0178(0.0217) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0238(0.0219) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7602(0.7602) 


Epoch 5 - avg_train_loss: 0.0219  avg_val_loss: 0.6534  time: 46s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0219  avg_val_loss: 0.6534  time: 46s
Epoch 5 - Score: 0.7249
INFO:__main__:Epoch 5 - Score: 0.7249


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4983(0.6534) 
f1 score : 0.2944162436548224
recall score : 0.19078947368421054
precision score : 0.6444444444444445
thresh : 0.51


Score: 0.7209
INFO:__main__:Score: 0.7209
ACC BEST Score: 0.7249
INFO:__main__:ACC BEST Score: 0.7249
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.2944162436548224
recall score : 0.19078947368421054
precision score : 0.6444444444444445
thresh : 0.51


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.6953(0.6953) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.8176(0.6155) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.5705(0.6025) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.7194(0.6042) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5491(0.5491) 


Epoch 1 - avg_train_loss: 0.6042  avg_val_loss: 0.5815  time: 47s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6042  avg_val_loss: 0.5815  time: 47s
Epoch 1 - Score: 0.7203
INFO:__main__:Epoch 1 - Score: 0.7203
Epoch 1 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5261(0.5815) 
f1 score : 0.3790322580645161
recall score : 0.3092105263157895
precision score : 0.4895833333333333
thresh : 0.57
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.4283(0.4283) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.4226(0.3882) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 29s (remain 0m 11s) Loss: 0.2210(0.3686) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.4161(0.3657) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6038(0.6038) 


Epoch 2 - avg_train_loss: 0.3657  avg_val_loss: 0.5938  time: 46s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3657  avg_val_loss: 0.5938  time: 46s
Epoch 2 - Score: 0.7042
INFO:__main__:Epoch 2 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4896(0.5938) 
f1 score : 0.4738675958188153
recall score : 0.4473684210526316
precision score : 0.5037037037037037
thresh : 0.78
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.1070(0.1070) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0591(0.0834) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0259(0.0686) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0574(0.0644) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5888(0.5888) 


Epoch 3 - avg_train_loss: 0.0644  avg_val_loss: 0.6247  time: 47s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0644  avg_val_loss: 0.6247  time: 47s
Epoch 3 - Score: 0.6982
INFO:__main__:Epoch 3 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4218(0.6247) 
f1 score : 0.38888888888888884
recall score : 0.3223684210526316
precision score : 0.49
thresh : 0.72
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 6s) Loss: 0.0298(0.0298) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0179(0.0186) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 29s (remain 0m 11s) Loss: 0.0167(0.0180) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0173(0.0175) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5873(0.5873) 


Epoch 4 - avg_train_loss: 0.0175  avg_val_loss: 0.6556  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0175  avg_val_loss: 0.6556  time: 46s
Epoch 4 - Score: 0.6982
INFO:__main__:Epoch 4 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4137(0.6556) 
f1 score : 0.2844036697247706
recall score : 0.20394736842105263
precision score : 0.4696969696969697
thresh : 0.77
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0128(0.0128) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0129(0.0134) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0108(0.0134) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0105(0.0134) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5937(0.5937) 


Epoch 5 - avg_train_loss: 0.0134  avg_val_loss: 0.6683  time: 46s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0134  avg_val_loss: 0.6683  time: 46s
Epoch 5 - Score: 0.6982
INFO:__main__:Epoch 5 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.4139(0.6683) 
f1 score : 0.2710280373831776
recall score : 0.19078947368421054
precision score : 0.46774193548387094
thresh : 0.64


Score: 0.6901
INFO:__main__:Score: 0.6901
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.3790322580645161
recall score : 0.3092105263157895
precision score : 0.4895833333333333
thresh : 0.57


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 58s) Loss: 0.7546(0.7546) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.7843(0.6287) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.4337(0.6086) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.5815(0.5997) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6193(0.6193) 


Epoch 1 - avg_train_loss: 0.5997  avg_val_loss: 0.5766  time: 47s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5997  avg_val_loss: 0.5766  time: 47s
Epoch 1 - Score: 0.7123
INFO:__main__:Epoch 1 - Score: 0.7123
Epoch 1 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6713(0.5766) 
f1 score : 0.17045454545454547
recall score : 0.09868421052631579
precision score : 0.625
thresh : 0.52
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.4239(0.4239) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.4417(0.4094) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.4686(0.3921) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.3491(0.3837) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7239(0.7239) 


Epoch 2 - avg_train_loss: 0.3837  avg_val_loss: 0.6164  time: 46s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3837  avg_val_loss: 0.6164  time: 46s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7718(0.6164) 
f1 score : 0.24120603015075376
recall score : 0.15789473684210525
precision score : 0.5106382978723404
thresh : 0.57
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.2008(0.2008) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0430(0.0815) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0329(0.0741) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0309(0.0683) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8812(0.8812) 


Epoch 3 - avg_train_loss: 0.0683  avg_val_loss: 0.7227  time: 46s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0683  avg_val_loss: 0.7227  time: 46s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 1.0297(0.7227) 
f1 score : 0.2653061224489796
recall score : 0.17105263157894737
precision score : 0.5909090909090909
thresh : 0.49
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.0199(0.0199) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0111(0.0188) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0100(0.0176) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0181(0.0177) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8767(0.8767) 


Epoch 4 - avg_train_loss: 0.0177  avg_val_loss: 0.6977  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0177  avg_val_loss: 0.6977  time: 46s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183
Epoch 4 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.9719(0.6977) 
f1 score : 0.3142857142857143
recall score : 0.21710526315789475
precision score : 0.5689655172413793
thresh : 0.45
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 20s) Loss: 0.0109(0.0109) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0140(0.0135) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0160(0.0134) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0172(0.0137) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8966(0.8966) 


Epoch 5 - avg_train_loss: 0.0137  avg_val_loss: 0.7123  time: 46s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0137  avg_val_loss: 0.7123  time: 46s
Epoch 5 - Score: 0.7183
INFO:__main__:Epoch 5 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 1.0024(0.7123) 
f1 score : 0.2980769230769231
recall score : 0.20394736842105263
precision score : 0.5535714285714286
thresh : 0.43


Score: 0.7103
INFO:__main__:Score: 0.7103
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.3142857142857143
recall score : 0.21710526315789475
precision score : 0.5689655172413793
thresh : 0.45


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.6564(0.6564) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.4588(0.6136) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.4758(0.6073) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5716(0.6021) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5640(0.5640) 


Epoch 1 - avg_train_loss: 0.6021  avg_val_loss: 0.5755  time: 47s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6021  avg_val_loss: 0.5755  time: 47s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6032(0.5755) 
f1 score : 0.3364485981308411
recall score : 0.23684210526315788
precision score : 0.5806451612903226
thresh : 0.48
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.4503(0.4503) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.4066(0.3891) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.2993(0.3693) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.3388(0.3638) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6347(0.6347) 


Epoch 2 - avg_train_loss: 0.3638  avg_val_loss: 0.6040  time: 47s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3638  avg_val_loss: 0.6040  time: 47s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6707(0.6040) 
f1 score : 0.37837837837837834
recall score : 0.3223684210526316
precision score : 0.45794392523364486
thresh : 0.71
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 55s) Loss: 0.1266(0.1266) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.1281(0.0812) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0325(0.0739) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.1137(0.0694) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5813(0.5813) 


Epoch 3 - avg_train_loss: 0.0694  avg_val_loss: 0.6598  time: 46s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0694  avg_val_loss: 0.6598  time: 46s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.8173(0.6598) 
f1 score : 0.36820083682008375
recall score : 0.2894736842105263
precision score : 0.5057471264367817
thresh : 0.77
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.0166(0.0166) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0144(0.0138) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0134(0.0130) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0086(0.0125) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6689(0.6689) 


Epoch 4 - avg_train_loss: 0.0125  avg_val_loss: 0.7460  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0125  avg_val_loss: 0.7460  time: 46s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7972(0.7460) 
f1 score : 0.32407407407407407
recall score : 0.23026315789473684
precision score : 0.546875
thresh : 0.45
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.0106(0.0106) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0100(0.0094) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0081(0.0094) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0128(0.0095) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6857(0.6857) 


Epoch 5 - avg_train_loss: 0.0095  avg_val_loss: 0.7622  time: 46s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0095  avg_val_loss: 0.7622  time: 46s
Epoch 5 - Score: 0.7103
INFO:__main__:Epoch 5 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.8006(0.7622) 
f1 score : 0.30331753554502366
recall score : 0.21052631578947367
precision score : 0.5423728813559322
thresh : 0.63


Score: 0.7143
INFO:__main__:Score: 0.7143
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.3364485981308411
recall score : 0.23684210526315788
precision score : 0.5806451612903226
thresh : 0.48


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.5362(0.5362) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.7107(0.6346) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.6450(0.6084) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5250(0.6058) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6780(0.6780) 


Epoch 1 - avg_train_loss: 0.6058  avg_val_loss: 0.5759  time: 46s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6058  avg_val_loss: 0.5759  time: 46s
Epoch 1 - Score: 0.7223
INFO:__main__:Epoch 1 - Score: 0.7223
Epoch 1 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6887(0.5759) 
f1 score : 0.18823529411764706
recall score : 0.10526315789473684
precision score : 0.8888888888888888
thresh : 0.5
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.4109(0.4109) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.3616(0.4248) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.5766(0.4188) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4699(0.4163) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6833(0.6833) 


Epoch 2 - avg_train_loss: 0.4163  avg_val_loss: 0.5698  time: 47s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4163  avg_val_loss: 0.5698  time: 47s
Epoch 2 - Score: 0.7264
INFO:__main__:Epoch 2 - Score: 0.7264
Epoch 2 - Save Best Score: 0.7264 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7264 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6319(0.5698) 
f1 score : 0.42911877394636017
recall score : 0.3684210526315789
precision score : 0.5137614678899083
thresh : 0.63
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.2240(0.2240) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0758(0.1422) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0689(0.1231) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0867(0.1176) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6902(0.6902) 


Epoch 3 - avg_train_loss: 0.1176  avg_val_loss: 0.5786  time: 47s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1176  avg_val_loss: 0.5786  time: 47s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6560(0.5786) 
f1 score : 0.3211009174311926
recall score : 0.23026315789473684
precision score : 0.5303030303030303
thresh : 0.63
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0461(0.0461) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0388(0.0357) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0297(0.0347) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0218(0.0347) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8595(0.8595) 


Epoch 4 - avg_train_loss: 0.0347  avg_val_loss: 0.6327  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0347  avg_val_loss: 0.6327  time: 46s
Epoch 4 - Score: 0.7243
INFO:__main__:Epoch 4 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.8080(0.6327) 
f1 score : 0.25
recall score : 0.1513157894736842
precision score : 0.71875
thresh : 0.38
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.0158(0.0158) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0201(0.0262) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0239(0.0263) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0260(0.0261) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8469(0.8469) 


Epoch 5 - avg_train_loss: 0.0261  avg_val_loss: 0.6152  time: 47s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0261  avg_val_loss: 0.6152  time: 47s
Epoch 5 - Score: 0.7243
INFO:__main__:Epoch 5 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7680(0.6152) 
f1 score : 0.2694300518134715
recall score : 0.17105263157894737
precision score : 0.6341463414634146
thresh : 0.42


Score: 0.7002
INFO:__main__:Score: 0.7002
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.42911877394636017
recall score : 0.3684210526315789
precision score : 0.5137614678899083
thresh : 0.63


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 9s) Loss: 0.7304(0.7304) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.5388(0.6309) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.5048(0.6149) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.6573(0.6042) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.5920(0.5920) 


Epoch 1 - avg_train_loss: 0.6042  avg_val_loss: 0.6042  time: 46s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6042  avg_val_loss: 0.6042  time: 46s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6836(0.6042) 
f1 score : 0.42424242424242425
recall score : 0.4144736842105263
precision score : 0.43448275862068964
thresh : 0.6
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.4417(0.4417) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.2987(0.4038) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.2749(0.3902) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.3310(0.3793) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.5034(0.5034) 


Epoch 2 - avg_train_loss: 0.3793  avg_val_loss: 0.5693  time: 46s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3793  avg_val_loss: 0.5693  time: 46s
Epoch 2 - Score: 0.7264
INFO:__main__:Epoch 2 - Score: 0.7264
Epoch 2 - Save Best Score: 0.7264 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7264 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6759(0.5693) 
f1 score : 0.3300970873786408
recall score : 0.2236842105263158
precision score : 0.6296296296296297
thresh : 0.6
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.1023(0.1023) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0393(0.0743) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0457(0.0635) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0486(0.0604) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.4700(0.4700) 


Epoch 3 - avg_train_loss: 0.0604  avg_val_loss: 0.5808  time: 46s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0604  avg_val_loss: 0.5808  time: 46s
Epoch 3 - Score: 0.7264
INFO:__main__:Epoch 3 - Score: 0.7264


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.7886(0.5808) 
f1 score : 0.36283185840707965
recall score : 0.26973684210526316
precision score : 0.5540540540540541
thresh : 0.73
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.0205(0.0205) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.0151(0.0165) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0124(0.0159) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0120(0.0155) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.5722(0.5722) 


Epoch 4 - avg_train_loss: 0.0155  avg_val_loss: 0.6706  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0155  avg_val_loss: 0.6706  time: 46s
Epoch 4 - Score: 0.7264
INFO:__main__:Epoch 4 - Score: 0.7264


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 1.0323(0.6706) 
f1 score : 0.23529411764705882
recall score : 0.14473684210526316
precision score : 0.6285714285714286
thresh : 0.6
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.0133(0.0133) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0147(0.0124) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0119(0.0122) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0138(0.0120) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5668(0.5668) 


Epoch 5 - avg_train_loss: 0.0120  avg_val_loss: 0.6668  time: 47s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0120  avg_val_loss: 0.6668  time: 47s
Epoch 5 - Score: 0.7243
INFO:__main__:Epoch 5 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 1.0325(0.6668) 
f1 score : 0.23529411764705882
recall score : 0.14473684210526316
precision score : 0.6285714285714286
thresh : 0.6


Score: 0.7223
INFO:__main__:Score: 0.7223
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
DebertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_base_epoch10",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

f1 score : 0.3300970873786408
recall score : 0.2236842105263158
precision score : 0.6296296296296297
thresh : 0.6


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_base_epoch10 were not used when initializing DebertaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.6347(0.6347) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.4568(0.6125) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.4986(0.6029) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5667(0.5977) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5924(0.5924) 


Epoch 1 - avg_train_loss: 0.5977  avg_val_loss: 0.5721  time: 46s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5977  avg_val_loss: 0.5721  time: 46s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 4s (remain 0m 0s) Loss: 0.5346(0.5721) 
f1 score : 0.1437125748502994
recall score : 0.07894736842105263
precision score : 0.8
thresh : 0.48
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 1s) Loss: 0.5217(0.5217) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.4499(0.3886) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.3235(0.3854) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.4217(0.3865) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6402(0.6402) 


Epoch 2 - avg_train_loss: 0.3865  avg_val_loss: 0.5836  time: 47s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3865  avg_val_loss: 0.5836  time: 47s
Epoch 2 - Score: 0.7223
INFO:__main__:Epoch 2 - Score: 0.7223
Epoch 2 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4808(0.5836) 
f1 score : 0.396551724137931
recall score : 0.3026315789473684
precision score : 0.575
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.1192(0.1192) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0620(0.0970) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0427(0.0799) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0416(0.0733) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7592(0.7592) 


Epoch 3 - avg_train_loss: 0.0733  avg_val_loss: 0.6564  time: 46s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0733  avg_val_loss: 0.6564  time: 46s
Epoch 3 - Score: 0.7404
INFO:__main__:Epoch 3 - Score: 0.7404
Epoch 3 - Save Best Score: 0.7404 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7404 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3955(0.6564) 
f1 score : 0.3943661971830986
recall score : 0.27631578947368424
precision score : 0.6885245901639344
thresh : 0.5
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.0176(0.0176) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.0261(0.0186) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0135(0.0190) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0212(0.0192) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7922(0.7922) 


Epoch 4 - avg_train_loss: 0.0192  avg_val_loss: 0.6602  time: 46s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0192  avg_val_loss: 0.6602  time: 46s
Epoch 4 - Score: 0.7404
INFO:__main__:Epoch 4 - Score: 0.7404


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3953(0.6602) 
f1 score : 0.4107142857142857
recall score : 0.3026315789473684
precision score : 0.6388888888888888
thresh : 0.54
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.0109(0.0109) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 26s) Loss: 0.0143(0.0145) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 30s (remain 0m 11s) Loss: 0.0132(0.0145) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.0142(0.0143) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8082(0.8082) 


Epoch 5 - avg_train_loss: 0.0143  avg_val_loss: 0.6744  time: 46s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0143  avg_val_loss: 0.6744  time: 46s
Epoch 5 - Score: 0.7404
INFO:__main__:Epoch 5 - Score: 0.7404


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4042(0.6744) 
f1 score : 0.4125560538116592
recall score : 0.3026315789473684
precision score : 0.647887323943662
thresh : 0.52


Score: 0.7404
INFO:__main__:Score: 0.7404
ACC BEST Score: 0.7404
INFO:__main__:ACC BEST Score: 0.7404
Score: 0.7123
INFO:__main__:Score: 0.7123
ACC BEST Score: 0.7141
INFO:__main__:ACC BEST Score: 0.7141


f1 score : 0.3943661971830986
recall score : 0.27631578947368424
precision score : 0.6885245901639344
thresh : 0.5
f1 score : 0.3353460287970274
recall score : 0.23703217334208798
precision score : 0.573015873015873
thresh : 0.53


In [None]:
from google.colab import runtime
runtime.unassign()