In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m113.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https:/

In [3]:
!nvidia-smi

Thu May  4 19:09:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    49W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_bert_base_uncased_epoch10')
OUTPUT_EXP_DIR = DIR + '/output/EXP064/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="bert-base-uncased"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 3 # cls + sep + sep
#LOGGER.info(f"max_len: {CFG.max_len}")

In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

INFO:__main__:BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\

Enable AWP
Epoch: [1][0/279] Elapsed 0m 4s (remain 18m 58s) Loss: 0.7408(0.7408) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6999(0.6204) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 19s (remain 0m 7s) Loss: 0.5503(0.6061) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 26s (remain 0m 0s) Loss: 0.4547(0.5965) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5508(0.5508) 


Epoch 1 - avg_train_loss: 0.5965  avg_val_loss: 0.5967  time: 29s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5965  avg_val_loss: 0.5967  time: 29s
Epoch 1 - Score: 0.7289
INFO:__main__:Epoch 1 - Score: 0.7289
Epoch 1 - Save Best Score: 0.7289 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7289 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7397(0.5967) 
f1 score : 0.07453416149068323
recall score : 0.0392156862745098
precision score : 0.75
thresh : 0.35
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.4853(0.4853) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3664(0.3930) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.3475(0.3703) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2696(0.3592) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6138(0.6138) 


Epoch 2 - avg_train_loss: 0.3592  avg_val_loss: 0.6703  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3592  avg_val_loss: 0.6703  time: 25s
Epoch 2 - Score: 0.7088
INFO:__main__:Epoch 2 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8077(0.6703) 
f1 score : 0.15999999999999998
recall score : 0.0915032679738562
precision score : 0.6363636363636364
thresh : 0.54
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0860(0.0860) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0859(0.0908) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0618(0.0840) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0591(0.0811) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5688(0.5688) 


Epoch 3 - avg_train_loss: 0.0811  avg_val_loss: 0.6653  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0811  avg_val_loss: 0.6653  time: 25s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8412(0.6653) 
f1 score : 0.2692307692307692
recall score : 0.1830065359477124
precision score : 0.509090909090909
thresh : 0.65
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.0288(0.0288) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0264(0.0270) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0324(0.0253) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0185(0.0248) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5985(0.5985) 


Epoch 4 - avg_train_loss: 0.0248  avg_val_loss: 0.7413  time: 25s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0248  avg_val_loss: 0.7413  time: 25s
Epoch 4 - Score: 0.7129
INFO:__main__:Epoch 4 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9540(0.7413) 
f1 score : 0.26732673267326734
recall score : 0.17647058823529413
precision score : 0.5510204081632653
thresh : 0.57
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0237(0.0237) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0108(0.0190) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0203(0.0186) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0203(0.0185) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5973(0.5973) 


Epoch 5 - avg_train_loss: 0.0185  avg_val_loss: 0.7474  time: 25s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0185  avg_val_loss: 0.7474  time: 25s
Epoch 5 - Score: 0.7129
INFO:__main__:Epoch 5 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9672(0.7474) 
f1 score : 0.2634146341463415
recall score : 0.17647058823529413
precision score : 0.5192307692307693
thresh : 0.63


Score: 0.7008
INFO:__main__:Score: 0.7008
ACC BEST Score: 0.7289
INFO:__main__:ACC BEST Score: 0.7289
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.07453416149068323
recall score : 0.0392156862745098
precision score : 0.75
thresh : 0.35


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.7275(0.7275) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.5172(0.6254) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.5969(0.6151) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.6654(0.6081) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4374(0.4374) 


Epoch 1 - avg_train_loss: 0.6081  avg_val_loss: 0.5745  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6081  avg_val_loss: 0.5745  time: 25s
Epoch 1 - Score: 0.7229
INFO:__main__:Epoch 1 - Score: 0.7229
Epoch 1 - Save Best Score: 0.7229 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7229 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4844(0.5745) 
f1 score : 0.2068965517241379
recall score : 0.11764705882352941
precision score : 0.8571428571428571
thresh : 0.44
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.4775(0.4775) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3001(0.3702) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6082(0.3520) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3433(0.3498) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4101(0.4101) 


Epoch 2 - avg_train_loss: 0.3498  avg_val_loss: 0.6001  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3498  avg_val_loss: 0.6001  time: 26s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3965(0.6001) 
f1 score : 0.2
recall score : 0.11764705882352941
precision score : 0.6666666666666666
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.0787(0.0787) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0458(0.0810) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0563(0.0728) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0423(0.0693) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3610(0.3610) 


Epoch 3 - avg_train_loss: 0.0693  avg_val_loss: 0.6194  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0693  avg_val_loss: 0.6194  time: 25s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3846(0.6194) 
f1 score : 0.2962962962962963
recall score : 0.20915032679738563
precision score : 0.5079365079365079
thresh : 0.78
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.0298(0.0298) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0173(0.0242) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0301(0.0237) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0263(0.0233) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3514(0.3514) 


Epoch 4 - avg_train_loss: 0.0233  avg_val_loss: 0.6718  time: 25s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0233  avg_val_loss: 0.6718  time: 25s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3626(0.6718) 
f1 score : 0.21538461538461542
recall score : 0.13725490196078433
precision score : 0.5
thresh : 0.76
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.0138(0.0138) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0167(0.0184) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0219(0.0186) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0130(0.0183) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3511(0.3511) 


Epoch 5 - avg_train_loss: 0.0183  avg_val_loss: 0.6720  time: 25s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0183  avg_val_loss: 0.6720  time: 25s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3626(0.6720) 
f1 score : 0.23
recall score : 0.1503267973856209
precision score : 0.48936170212765956
thresh : 0.62


Score: 0.7229
INFO:__main__:Score: 0.7229
ACC BEST Score: 0.7229
INFO:__main__:ACC BEST Score: 0.7229
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.2068965517241379
recall score : 0.11764705882352941
precision score : 0.8571428571428571
thresh : 0.44


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.6642(0.6642) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.5765(0.6276) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.5851(0.6124) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.5956(0.6041) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5823(0.5823) 


Epoch 1 - avg_train_loss: 0.6041  avg_val_loss: 0.5713  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6041  avg_val_loss: 0.5713  time: 25s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3751(0.5713) 
f1 score : 0.19101123595505617
recall score : 0.1111111111111111
precision score : 0.68
thresh : 0.47
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.5431(0.5431) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3978(0.3877) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.1943(0.3675) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2941(0.3602) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6504(0.6504) 


Epoch 2 - avg_train_loss: 0.3602  avg_val_loss: 0.5930  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3602  avg_val_loss: 0.5930  time: 26s
Epoch 2 - Score: 0.7108
INFO:__main__:Epoch 2 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3938(0.5930) 
f1 score : 0.40293040293040294
recall score : 0.35947712418300654
precision score : 0.4583333333333333
thresh : 0.78
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.1783(0.1783) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0614(0.0913) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0514(0.0844) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0377(0.0789) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6974(0.6974) 


Epoch 3 - avg_train_loss: 0.0789  avg_val_loss: 0.6295  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0789  avg_val_loss: 0.6295  time: 25s
Epoch 3 - Score: 0.7149
INFO:__main__:Epoch 3 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3181(0.6295) 
f1 score : 0.31336405529953915
recall score : 0.2222222222222222
precision score : 0.53125
thresh : 0.48
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0304(0.0304) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0253(0.0246) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0220(0.0244) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0164(0.0243) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8449(0.8449) 


Epoch 4 - avg_train_loss: 0.0243  avg_val_loss: 0.6756  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0243  avg_val_loss: 0.6756  time: 26s
Epoch 4 - Score: 0.7189
INFO:__main__:Epoch 4 - Score: 0.7189
Epoch 4 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3174(0.6756) 
f1 score : 0.3716814159292035
recall score : 0.27450980392156865
precision score : 0.5753424657534246
thresh : 0.48
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0161(0.0161) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0195(0.0189) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0123(0.0186) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0177(0.0184) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.9025(0.9025) 


Epoch 5 - avg_train_loss: 0.0184  avg_val_loss: 0.7008  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0184  avg_val_loss: 0.7008  time: 26s
Epoch 5 - Score: 0.7209
INFO:__main__:Epoch 5 - Score: 0.7209
Epoch 5 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3103(0.7008) 
f1 score : 0.35616438356164387
recall score : 0.2549019607843137
precision score : 0.5909090909090909
thresh : 0.43


Score: 0.7169
INFO:__main__:Score: 0.7169
ACC BEST Score: 0.7209
INFO:__main__:ACC BEST Score: 0.7209
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.35616438356164387
recall score : 0.2549019607843137
precision score : 0.5909090909090909
thresh : 0.43


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.6798(0.6798) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6594(0.6150) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6397(0.6055) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.6921(0.6068) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6814(0.6814) 


Epoch 1 - avg_train_loss: 0.6068  avg_val_loss: 0.5900  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6068  avg_val_loss: 0.5900  time: 26s
Epoch 1 - Score: 0.7048
INFO:__main__:Epoch 1 - Score: 0.7048
Epoch 1 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4750(0.5900) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.45
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.5157(0.5157) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3462(0.4034) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.2710(0.3821) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.4664(0.3698) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6538(0.6538) 


Epoch 2 - avg_train_loss: 0.3698  avg_val_loss: 0.5945  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3698  avg_val_loss: 0.5945  time: 26s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129
Epoch 2 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5762(0.5945) 
f1 score : 0.40727272727272723
recall score : 0.3684210526315789
precision score : 0.45528455284552843
thresh : 0.68
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.1609(0.1609) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0820(0.1064) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0519(0.0963) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0580(0.0917) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6640(0.6640) 


Epoch 3 - avg_train_loss: 0.0917  avg_val_loss: 0.6075  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0917  avg_val_loss: 0.6075  time: 26s
Epoch 3 - Score: 0.7068
INFO:__main__:Epoch 3 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5260(0.6075) 
f1 score : 0.3217391304347826
recall score : 0.24342105263157895
precision score : 0.47435897435897434
thresh : 0.78
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0283(0.0283) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0329(0.0337) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0216(0.0324) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0289(0.0318) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7177(0.7177) 


Epoch 4 - avg_train_loss: 0.0318  avg_val_loss: 0.6400  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0318  avg_val_loss: 0.6400  time: 26s
Epoch 4 - Score: 0.7129
INFO:__main__:Epoch 4 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5278(0.6400) 
f1 score : 0.32863849765258213
recall score : 0.23026315789473684
precision score : 0.5737704918032787
thresh : 0.5
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0271(0.0271) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0288(0.0243) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0163(0.0244) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0258(0.0245) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7241(0.7241) 


Epoch 5 - avg_train_loss: 0.0245  avg_val_loss: 0.6445  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0245  avg_val_loss: 0.6445  time: 26s
Epoch 5 - Score: 0.7129
INFO:__main__:Epoch 5 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5266(0.6445) 
f1 score : 0.32863849765258213
recall score : 0.23026315789473684
precision score : 0.5737704918032787
thresh : 0.49


Score: 0.6727
INFO:__main__:Score: 0.6727
ACC BEST Score: 0.7129
INFO:__main__:ACC BEST Score: 0.7129
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.40727272727272723
recall score : 0.3684210526315789
precision score : 0.45528455284552843
thresh : 0.68


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.6787(0.6787) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.5373(0.6293) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.5881(0.6176) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3586(0.6098) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5321(0.5321) 


Epoch 1 - avg_train_loss: 0.6098  avg_val_loss: 0.5758  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6098  avg_val_loss: 0.5758  time: 25s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4952(0.5758) 
f1 score : 0.10778443113772455
recall score : 0.05921052631578947
precision score : 0.6
thresh : 0.65
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.4432(0.4432) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4371(0.4004) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.3297(0.3757) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4866(0.3658) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4878(0.4878) 


Epoch 2 - avg_train_loss: 0.3658  avg_val_loss: 0.6169  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3658  avg_val_loss: 0.6169  time: 26s
Epoch 2 - Score: 0.7042
INFO:__main__:Epoch 2 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4031(0.6169) 
f1 score : 0.10778443113772455
recall score : 0.05921052631578947
precision score : 0.6
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.1163(0.1163) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.1646(0.1019) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0685(0.0912) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0384(0.0884) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5434(0.5434) 


Epoch 3 - avg_train_loss: 0.0884  avg_val_loss: 0.6159  time: 27s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0884  avg_val_loss: 0.6159  time: 27s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3963(0.6159) 
f1 score : 0.29245283018867924
recall score : 0.20394736842105263
precision score : 0.5166666666666667
thresh : 0.71
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.0278(0.0278) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0274(0.0284) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0293(0.0281) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0239(0.0278) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5821(0.5821) 


Epoch 4 - avg_train_loss: 0.0278  avg_val_loss: 0.6610  time: 27s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0278  avg_val_loss: 0.6610  time: 27s
Epoch 4 - Score: 0.7042
INFO:__main__:Epoch 4 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3802(0.6610) 
f1 score : 0.2962962962962963
recall score : 0.21052631578947367
precision score : 0.5
thresh : 0.76
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0186(0.0186) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0208(0.0207) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0185(0.0209) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0130(0.0209) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5892(0.5892) 


Epoch 5 - avg_train_loss: 0.0209  avg_val_loss: 0.6809  time: 27s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0209  avg_val_loss: 0.6809  time: 27s
Epoch 5 - Score: 0.7062
INFO:__main__:Epoch 5 - Score: 0.7062
Epoch 5 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3841(0.6809) 
f1 score : 0.28169014084507044
recall score : 0.19736842105263158
precision score : 0.4918032786885246
thresh : 0.55


Score: 0.6922
INFO:__main__:Score: 0.6922
ACC BEST Score: 0.7062
INFO:__main__:ACC BEST Score: 0.7062
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.28169014084507044
recall score : 0.19736842105263158
precision score : 0.4918032786885246
thresh : 0.55


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.7673(0.7673) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.5850(0.6202) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5237(0.6072) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5893(0.6011) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5991(0.5991) 


Epoch 1 - avg_train_loss: 0.6011  avg_val_loss: 0.5813  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6011  avg_val_loss: 0.5813  time: 26s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7006(0.5813) 
f1 score : 0.15300546448087432
recall score : 0.09210526315789473
precision score : 0.45161290322580644
thresh : 0.61
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.3974(0.3974) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.3084(0.3691) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.3161(0.3502) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3921(0.3449) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5594(0.5594) 


Epoch 2 - avg_train_loss: 0.3449  avg_val_loss: 0.5835  time: 27s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3449  avg_val_loss: 0.5835  time: 27s
Epoch 2 - Score: 0.7022
INFO:__main__:Epoch 2 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7394(0.5835) 
f1 score : 0.35000000000000003
recall score : 0.27631578947368424
precision score : 0.4772727272727273
thresh : 0.57
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.1608(0.1608) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0716(0.0885) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0769(0.0782) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1044(0.0749) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7051(0.7051) 


Epoch 3 - avg_train_loss: 0.0749  avg_val_loss: 0.6326  time: 27s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0749  avg_val_loss: 0.6326  time: 27s
Epoch 3 - Score: 0.7022
INFO:__main__:Epoch 3 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9886(0.6326) 
f1 score : 0.2571428571428571
recall score : 0.17763157894736842
precision score : 0.46551724137931033
thresh : 0.77
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0263(0.0263) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0179(0.0257) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0287(0.0249) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0286(0.0246) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7702(0.7702) 


Epoch 4 - avg_train_loss: 0.0246  avg_val_loss: 0.6717  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0246  avg_val_loss: 0.6717  time: 26s
Epoch 4 - Score: 0.7062
INFO:__main__:Epoch 4 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 1.1709(0.6717) 
f1 score : 0.2883720930232558
recall score : 0.20394736842105263
precision score : 0.49206349206349204
thresh : 0.79
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0161(0.0161) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0146(0.0184) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0194(0.0182) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0142(0.0182) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.8219(0.8219) 


Epoch 5 - avg_train_loss: 0.0182  avg_val_loss: 0.7047  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0182  avg_val_loss: 0.7047  time: 26s
Epoch 5 - Score: 0.7042
INFO:__main__:Epoch 5 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 1.2874(0.7047) 
f1 score : 0.25870646766169153
recall score : 0.17105263157894737
precision score : 0.5306122448979592
thresh : 0.75


Score: 0.6881
INFO:__main__:Score: 0.6881
ACC BEST Score: 0.7062
INFO:__main__:ACC BEST Score: 0.7062
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.15300546448087432
recall score : 0.09210526315789473
precision score : 0.45161290322580644
thresh : 0.61


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.7294(0.7294) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6834(0.6330) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4763(0.6079) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.7280(0.6069) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5973(0.5973) 


Epoch 1 - avg_train_loss: 0.6069  avg_val_loss: 0.5849  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6069  avg_val_loss: 0.5849  time: 26s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5498(0.5849) 
f1 score : 0.20105820105820105
recall score : 0.125
precision score : 0.5135135135135135
thresh : 0.57
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.4595(0.4595) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4341(0.3772) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.3057(0.3595) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3010(0.3463) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5315(0.5315) 


Epoch 2 - avg_train_loss: 0.3463  avg_val_loss: 0.5745  time: 27s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3463  avg_val_loss: 0.5745  time: 27s
Epoch 2 - Score: 0.7264
INFO:__main__:Epoch 2 - Score: 0.7264
Epoch 2 - Save Best Score: 0.7264 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7264 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5664(0.5745) 
f1 score : 0.39840637450199207
recall score : 0.32894736842105265
precision score : 0.5050505050505051
thresh : 0.63
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.1144(0.1144) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0999(0.0817) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0549(0.0768) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0679(0.0743) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5324(0.5324) 


Epoch 3 - avg_train_loss: 0.0743  avg_val_loss: 0.5917  time: 27s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0743  avg_val_loss: 0.5917  time: 27s
Epoch 3 - Score: 0.7163
INFO:__main__:Epoch 3 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6252(0.5917) 
f1 score : 0.37229437229437223
recall score : 0.28289473684210525
precision score : 0.5443037974683544
thresh : 0.56
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0410(0.0410) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0313(0.0278) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0293(0.0263) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0276(0.0260) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5595(0.5595) 


Epoch 4 - avg_train_loss: 0.0260  avg_val_loss: 0.6406  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0260  avg_val_loss: 0.6406  time: 26s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6403(0.6406) 
f1 score : 0.2912621359223301
recall score : 0.19736842105263158
precision score : 0.5555555555555556
thresh : 0.34
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0138(0.0138) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0194(0.0202) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0179(0.0201) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0129(0.0198) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5785(0.5785) 


Epoch 5 - avg_train_loss: 0.0198  avg_val_loss: 0.6607  time: 27s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0198  avg_val_loss: 0.6607  time: 27s
Epoch 5 - Score: 0.7223
INFO:__main__:Epoch 5 - Score: 0.7223


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6461(0.6607) 
f1 score : 0.28712871287128716
recall score : 0.19078947368421054
precision score : 0.58
thresh : 0.3


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.39840637450199207
recall score : 0.32894736842105265
precision score : 0.5050505050505051
thresh : 0.63


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.8004(0.8004) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6820(0.6125) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.5614(0.6090) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4649(0.6014) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6749(0.6749) 


Epoch 1 - avg_train_loss: 0.6014  avg_val_loss: 0.5660  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6014  avg_val_loss: 0.5660  time: 26s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7117(0.5660) 
f1 score : 0.17341040462427745
recall score : 0.09868421052631579
precision score : 0.7142857142857143
thresh : 0.47
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.3562(0.3562) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.2778(0.3866) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.2035(0.3679) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3135(0.3561) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8236(0.8236) 


Epoch 2 - avg_train_loss: 0.3561  avg_val_loss: 0.5792  time: 27s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3561  avg_val_loss: 0.5792  time: 27s
Epoch 2 - Score: 0.7243
INFO:__main__:Epoch 2 - Score: 0.7243
Epoch 2 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8280(0.5792) 
f1 score : 0.23204419889502764
recall score : 0.13815789473684212
precision score : 0.7241379310344828
thresh : 0.58
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.1559(0.1559) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0514(0.0790) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0541(0.0751) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0815(0.0715) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.9358(0.9358) 


Epoch 3 - avg_train_loss: 0.0715  avg_val_loss: 0.5843  time: 27s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0715  avg_val_loss: 0.5843  time: 27s
Epoch 3 - Score: 0.7304
INFO:__main__:Epoch 3 - Score: 0.7304
Epoch 3 - Save Best Score: 0.7304 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7304 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8572(0.5843) 
f1 score : 0.4070796460176991
recall score : 0.3026315789473684
precision score : 0.6216216216216216
thresh : 0.49
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.0267(0.0267) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0211(0.0230) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0190(0.0223) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0217(0.0221) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 1.1189(1.1189) 


Epoch 4 - avg_train_loss: 0.0221  avg_val_loss: 0.6460  time: 27s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0221  avg_val_loss: 0.6460  time: 27s
Epoch 4 - Score: 0.7324
INFO:__main__:Epoch 4 - Score: 0.7324
Epoch 4 - Save Best Score: 0.7324 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7324 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9802(0.6460) 
f1 score : 0.33497536945812806
recall score : 0.2236842105263158
precision score : 0.6666666666666666
thresh : 0.45
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0195(0.0195) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0202(0.0162) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0112(0.0166) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0195(0.0166) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 1.1366(1.1366) 


Epoch 5 - avg_train_loss: 0.0166  avg_val_loss: 0.6495  time: 27s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0166  avg_val_loss: 0.6495  time: 27s
Epoch 5 - Score: 0.7324
INFO:__main__:Epoch 5 - Score: 0.7324


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9829(0.6495) 
f1 score : 0.36018957345971564
recall score : 0.25
precision score : 0.6440677966101694
thresh : 0.48


Score: 0.7284
INFO:__main__:Score: 0.7284
ACC BEST Score: 0.7324
INFO:__main__:ACC BEST Score: 0.7324
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.33497536945812806
recall score : 0.2236842105263158
precision score : 0.6666666666666666
thresh : 0.45


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.6926(0.6926) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.5510(0.6404) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4804(0.6190) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4192(0.6089) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5899(0.5899) 


Epoch 1 - avg_train_loss: 0.6089  avg_val_loss: 0.5689  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6089  avg_val_loss: 0.5689  time: 26s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7411(0.5689) 
f1 score : 0.12269938650306747
recall score : 0.06578947368421052
precision score : 0.9090909090909091
thresh : 0.31
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.4188(0.4188) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.3492(0.3764) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.2838(0.3736) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4093(0.3658) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6389(0.6389) 


Epoch 2 - avg_train_loss: 0.3658  avg_val_loss: 0.5764  time: 27s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3658  avg_val_loss: 0.5764  time: 27s
Epoch 2 - Score: 0.7284
INFO:__main__:Epoch 2 - Score: 0.7284
Epoch 2 - Save Best Score: 0.7284 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7284 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8563(0.5764) 
f1 score : 0.32692307692307687
recall score : 0.2236842105263158
precision score : 0.6071428571428571
thresh : 0.59
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.1206(0.1206) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0585(0.0928) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0656(0.0807) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0685(0.0785) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6761(0.6761) 


Epoch 3 - avg_train_loss: 0.0785  avg_val_loss: 0.5905  time: 27s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0785  avg_val_loss: 0.5905  time: 27s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8516(0.5905) 
f1 score : 0.36123348017621143
recall score : 0.26973684210526316
precision score : 0.5466666666666666
thresh : 0.59
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0260(0.0260) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0232(0.0281) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0360(0.0273) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0287(0.0265) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7411(0.7411) 


Epoch 4 - avg_train_loss: 0.0265  avg_val_loss: 0.6521  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0265  avg_val_loss: 0.6521  time: 26s
Epoch 4 - Score: 0.7264
INFO:__main__:Epoch 4 - Score: 0.7264


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9902(0.6521) 
f1 score : 0.3121951219512195
recall score : 0.21052631578947367
precision score : 0.6037735849056604
thresh : 0.47
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0249(0.0249) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0233(0.0195) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0206(0.0200) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0232(0.0205) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7650(0.7650) 


Epoch 5 - avg_train_loss: 0.0205  avg_val_loss: 0.6718  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0205  avg_val_loss: 0.6718  time: 26s
Epoch 5 - Score: 0.7264
INFO:__main__:Epoch 5 - Score: 0.7264


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 1.0237(0.6718) 
f1 score : 0.3121951219512195
recall score : 0.21052631578947367
precision score : 0.6037735849056604
thresh : 0.43


Score: 0.7183
INFO:__main__:Score: 0.7183
ACC BEST Score: 0.7284
INFO:__main__:ACC BEST Score: 0.7284
BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_bert_base_uncased_epoch10",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

f1 score : 0.32692307692307687
recall score : 0.2236842105263158
precision score : 0.6071428571428571
thresh : 0.59


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_bert_base_uncased_epoch10 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.8238(0.8238) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.7043(0.6176) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.3344(0.6073) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4375(0.6036) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6027(0.6027) 


Epoch 1 - avg_train_loss: 0.6036  avg_val_loss: 0.5762  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6036  avg_val_loss: 0.5762  time: 26s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4976(0.5762) 
f1 score : 0.1395348837209302
recall score : 0.07894736842105263
precision score : 0.6
thresh : 0.45
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.4185(0.4185) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3453(0.3436) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.2838(0.3215) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2747(0.3087) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6206(0.6206) 


Epoch 2 - avg_train_loss: 0.3087  avg_val_loss: 0.5876  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3087  avg_val_loss: 0.5876  time: 26s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5178(0.5876) 
f1 score : 0.4166666666666667
recall score : 0.3618421052631579
precision score : 0.49107142857142855
thresh : 0.64
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0669(0.0669) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0499(0.0641) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0401(0.0584) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0414(0.0566) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7163(0.7163) 


Epoch 3 - avg_train_loss: 0.0566  avg_val_loss: 0.6886  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0566  avg_val_loss: 0.6886  time: 25s
Epoch 3 - Score: 0.7163
INFO:__main__:Epoch 3 - Score: 0.7163
Epoch 3 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5174(0.6886) 
f1 score : 0.2282608695652174
recall score : 0.13815789473684212
precision score : 0.65625
thresh : 0.48
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0266(0.0266) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0211(0.0201) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0154(0.0201) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0139(0.0197) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7797(0.7797) 


Epoch 4 - avg_train_loss: 0.0197  avg_val_loss: 0.7209  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0197  avg_val_loss: 0.7209  time: 26s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203
Epoch 4 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5247(0.7209) 
f1 score : 0.2941176470588235
recall score : 0.19736842105263158
precision score : 0.5769230769230769
thresh : 0.61
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0158(0.0158) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0144(0.0155) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0125(0.0154) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0164(0.0154) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7788(0.7788) 


Epoch 5 - avg_train_loss: 0.0154  avg_val_loss: 0.7160  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0154  avg_val_loss: 0.7160  time: 26s
Epoch 5 - Score: 0.7203
INFO:__main__:Epoch 5 - Score: 0.7203


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5211(0.7160) 
f1 score : 0.29268292682926833
recall score : 0.19736842105263158
precision score : 0.5660377358490566
thresh : 0.64


Score: 0.7103
INFO:__main__:Score: 0.7103
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
Score: 0.7047
INFO:__main__:Score: 0.7047
ACC BEST Score: 0.7115
INFO:__main__:ACC BEST Score: 0.7115


f1 score : 0.2941176470588235
recall score : 0.19736842105263158
precision score : 0.5769230769230769
thresh : 0.61
f1 score : 0.2974653275944524
recall score : 0.20420223243598162
precision score : 0.5475352112676056
thresh : 0.6


In [22]:
from google.colab import runtime
runtime.unassign()