In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sat May  6 11:52:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    45W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_distilbert_base_uncased_epoch10')
OUTPUT_EXP_DIR = DIR + '/output/EXP074/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="distilbert-base-uncased"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 3 # cls + sep + sep
#LOGGER.info(f"max_len: {CFG.max_len}")

In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1

Enable AWP
Epoch: [1][0/279] Elapsed 0m 1s (remain 6m 15s) Loss: 0.7087(0.7087) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 5s (remain 0m 9s) Loss: 0.8098(0.6127) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 9s (remain 0m 3s) Loss: 0.5664(0.6108) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 13s (remain 0m 0s) Loss: 0.8613(0.6046) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6099(0.6099) 


Epoch 1 - avg_train_loss: 0.6046  avg_val_loss: 0.5899  time: 15s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6046  avg_val_loss: 0.5899  time: 15s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6423(0.5899) 
f1 score : 0.2980769230769231
recall score : 0.20261437908496732
precision score : 0.5636363636363636
thresh : 0.58
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5423(0.5423) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.5890(0.5235) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5195(0.5175) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.5413(0.5181) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5839(0.5839) 


Epoch 2 - avg_train_loss: 0.5181  avg_val_loss: 0.5829  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5181  avg_val_loss: 0.5829  time: 14s
Epoch 2 - Score: 0.7169
INFO:__main__:Epoch 2 - Score: 0.7169
Epoch 2 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5884(0.5829) 
f1 score : 0.27317073170731704
recall score : 0.1830065359477124
precision score : 0.5384615384615384
thresh : 0.56
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 11s) Loss: 0.4139(0.4139) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.7019(0.4442) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5832(0.4362) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3519(0.4342) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5812(0.5812) 


Epoch 3 - avg_train_loss: 0.4342  avg_val_loss: 0.5987  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4342  avg_val_loss: 0.5987  time: 14s
Epoch 3 - Score: 0.7189
INFO:__main__:Epoch 3 - Score: 0.7189
Epoch 3 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6700(0.5987) 
f1 score : 0.2512562814070352
recall score : 0.16339869281045752
precision score : 0.5434782608695652
thresh : 0.55
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 11s) Loss: 0.2830(0.2830) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3048(0.3662) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5546(0.3652) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.1979(0.3638) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5641(0.5641) 


Epoch 4 - avg_train_loss: 0.3638  avg_val_loss: 0.5970  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3638  avg_val_loss: 0.5970  time: 14s
Epoch 4 - Score: 0.7169
INFO:__main__:Epoch 4 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6343(0.5970) 
f1 score : 0.3346938775510204
recall score : 0.2679738562091503
precision score : 0.44565217391304346
thresh : 0.66
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 9s) Loss: 0.3529(0.3529) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3910(0.3407) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3937(0.3334) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.2046(0.3301) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5608(0.5608) 


Epoch 5 - avg_train_loss: 0.3301  avg_val_loss: 0.5999  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3301  avg_val_loss: 0.5999  time: 14s
Epoch 5 - Score: 0.7169
INFO:__main__:Epoch 5 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6428(0.5999) 
f1 score : 0.3306451612903226
recall score : 0.2679738562091503
precision score : 0.43157894736842106
thresh : 0.66


Score: 0.7008
INFO:__main__:Score: 0.7008
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.2512562814070352
recall score : 0.16339869281045752
precision score : 0.5434782608695652
thresh : 0.55


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.6109(0.6109) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.5030(0.6236) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5611(0.6189) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.8195(0.6166) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.4980(0.4980) 


Epoch 1 - avg_train_loss: 0.6166  avg_val_loss: 0.5928  time: 14s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6166  avg_val_loss: 0.5928  time: 14s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5203(0.5928) 
f1 score : 0.20942408376963353
recall score : 0.13071895424836602
precision score : 0.5263157894736842
thresh : 0.45
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5788(0.5788) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.4610(0.5219) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.6461(0.5265) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4717(0.5298) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.4601(0.4601) 


Epoch 2 - avg_train_loss: 0.5298  avg_val_loss: 0.5836  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5298  avg_val_loss: 0.5836  time: 14s
Epoch 2 - Score: 0.7209
INFO:__main__:Epoch 2 - Score: 0.7209
Epoch 2 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5133(0.5836) 
f1 score : 0.3318777292576419
recall score : 0.24836601307189543
precision score : 0.5
thresh : 0.58
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 10s) Loss: 0.3727(0.3727) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4204(0.4492) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4787(0.4498) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3233(0.4448) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.3754(0.3754) 


Epoch 3 - avg_train_loss: 0.4448  avg_val_loss: 0.5862  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4448  avg_val_loss: 0.5862  time: 14s
Epoch 3 - Score: 0.7189
INFO:__main__:Epoch 3 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4727(0.5862) 
f1 score : 0.3024390243902439
recall score : 0.20261437908496732
precision score : 0.5961538461538461
thresh : 0.52
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 12s) Loss: 0.4327(0.4327) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4139(0.3830) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2722(0.3752) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.2795(0.3690) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.3881(0.3881) 


Epoch 4 - avg_train_loss: 0.3690  avg_val_loss: 0.5883  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3690  avg_val_loss: 0.5883  time: 14s
Epoch 4 - Score: 0.7189
INFO:__main__:Epoch 4 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4791(0.5883) 
f1 score : 0.3879310344827587
recall score : 0.29411764705882354
precision score : 0.569620253164557
thresh : 0.52
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 11s) Loss: 0.4043(0.4043) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3534(0.3412) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3045(0.3374) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.2649(0.3342) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.4159(0.4159) 


Epoch 5 - avg_train_loss: 0.3342  avg_val_loss: 0.5908  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3342  avg_val_loss: 0.5908  time: 14s
Epoch 5 - Score: 0.7229
INFO:__main__:Epoch 5 - Score: 0.7229
Epoch 5 - Save Best Score: 0.7229 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7229 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5090(0.5908) 
f1 score : 0.43283582089552236
recall score : 0.3790849673202614
precision score : 0.5043478260869565
thresh : 0.57


Score: 0.6948
INFO:__main__:Score: 0.6948
ACC BEST Score: 0.7229
INFO:__main__:ACC BEST Score: 0.7229
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.43283582089552236
recall score : 0.3790849673202614
precision score : 0.5043478260869565
thresh : 0.57


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.7030(0.7030) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4712(0.6145) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.6282(0.6112) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.6412(0.6010) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5741(0.5741) 


Epoch 1 - avg_train_loss: 0.6010  avg_val_loss: 0.5883  time: 14s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6010  avg_val_loss: 0.5883  time: 14s
Epoch 1 - Score: 0.7048
INFO:__main__:Epoch 1 - Score: 0.7048
Epoch 1 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4410(0.5883) 
f1 score : 0.13793103448275862
recall score : 0.0784313725490196
precision score : 0.5714285714285714
thresh : 0.56
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 11s) Loss: 0.6140(0.6140) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4195(0.5141) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2923(0.5090) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4508(0.5035) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5623(0.5623) 


Epoch 2 - avg_train_loss: 0.5035  avg_val_loss: 0.5931  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5035  avg_val_loss: 0.5931  time: 14s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068
Epoch 2 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4393(0.5931) 
f1 score : 0.33898305084745767
recall score : 0.26143790849673204
precision score : 0.4819277108433735
thresh : 0.76
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.3864(0.3864) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.5190(0.4072) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4401(0.4114) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4281(0.4092) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5585(0.5585) 


Epoch 3 - avg_train_loss: 0.4092  avg_val_loss: 0.5976  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4092  avg_val_loss: 0.5976  time: 14s
Epoch 3 - Score: 0.7048
INFO:__main__:Epoch 3 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4341(0.5976) 
f1 score : 0.4148148148148148
recall score : 0.3660130718954248
precision score : 0.47863247863247865
thresh : 0.76
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.4246(0.4246) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3343(0.3448) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3179(0.3376) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3180(0.3383) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5474(0.5474) 


Epoch 4 - avg_train_loss: 0.3383  avg_val_loss: 0.5985  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3383  avg_val_loss: 0.5985  time: 14s
Epoch 4 - Score: 0.7068
INFO:__main__:Epoch 4 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3852(0.5985) 
f1 score : 0.39199999999999996
recall score : 0.3202614379084967
precision score : 0.5051546391752577
thresh : 0.76
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.3131(0.3131) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.2849(0.3075) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2755(0.3077) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3375(0.3048) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5386(0.5386) 


Epoch 5 - avg_train_loss: 0.3048  avg_val_loss: 0.5973  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3048  avg_val_loss: 0.5973  time: 14s
Epoch 5 - Score: 0.7068
INFO:__main__:Epoch 5 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3755(0.5973) 
f1 score : 0.37037037037037035
recall score : 0.29411764705882354
precision score : 0.5
thresh : 0.76


Score: 0.6867
INFO:__main__:Score: 0.6867
ACC BEST Score: 0.7068
INFO:__main__:ACC BEST Score: 0.7068
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.33898305084745767
recall score : 0.26143790849673204
precision score : 0.4819277108433735
thresh : 0.76


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.6337(0.6337) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.7181(0.6379) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4869(0.6111) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.5592(0.6079) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6394(0.6394) 


Epoch 1 - avg_train_loss: 0.6079  avg_val_loss: 0.5952  time: 14s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6079  avg_val_loss: 0.5952  time: 14s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6184(0.5952) 
f1 score : 0.2938388625592417
recall score : 0.20394736842105263
precision score : 0.5254237288135594
thresh : 0.56
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.5901(0.5901) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3824(0.5233) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4751(0.5173) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.6425(0.5144) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6368(0.6368) 


Epoch 2 - avg_train_loss: 0.5144  avg_val_loss: 0.5903  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5144  avg_val_loss: 0.5903  time: 14s
Epoch 2 - Score: 0.7149
INFO:__main__:Epoch 2 - Score: 0.7149
Epoch 2 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6169(0.5903) 
f1 score : 0.3937007874015748
recall score : 0.32894736842105265
precision score : 0.49019607843137253
thresh : 0.58
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.5693(0.5693) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.5748(0.4296) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5099(0.4297) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3935(0.4280) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6129(0.6129) 


Epoch 3 - avg_train_loss: 0.4280  avg_val_loss: 0.5805  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4280  avg_val_loss: 0.5805  time: 14s
Epoch 3 - Score: 0.7169
INFO:__main__:Epoch 3 - Score: 0.7169
Epoch 3 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5749(0.5805) 
f1 score : 0.3952569169960475
recall score : 0.32894736842105265
precision score : 0.49504950495049505
thresh : 0.61
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.3880(0.3880) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3565(0.3547) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4965(0.3531) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.2333(0.3504) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6119(0.6119) 


Epoch 4 - avg_train_loss: 0.3504  avg_val_loss: 0.5807  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3504  avg_val_loss: 0.5807  time: 14s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5686(0.5807) 
f1 score : 0.35000000000000003
recall score : 0.27631578947368424
precision score : 0.4772727272727273
thresh : 0.6
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.2439(0.2439) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3306(0.3220) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4215(0.3203) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.2739(0.3173) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6100(0.6100) 


Epoch 5 - avg_train_loss: 0.3173  avg_val_loss: 0.5842  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3173  avg_val_loss: 0.5842  time: 14s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5770(0.5842) 
f1 score : 0.4045801526717557
recall score : 0.34868421052631576
precision score : 0.4818181818181818
thresh : 0.67


Score: 0.6928
INFO:__main__:Score: 0.6928
ACC BEST Score: 0.7169
INFO:__main__:ACC BEST Score: 0.7169
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.3952569169960475
recall score : 0.32894736842105265
precision score : 0.49504950495049505
thresh : 0.61


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.5879(0.5879) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.5391(0.6311) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5607(0.6182) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.6220(0.6092) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5110(0.5110) 


Epoch 1 - avg_train_loss: 0.6092  avg_val_loss: 0.5829  time: 13s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6092  avg_val_loss: 0.5829  time: 13s
Epoch 1 - Score: 0.7123
INFO:__main__:Epoch 1 - Score: 0.7123
Epoch 1 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5088(0.5829) 
f1 score : 0.21390374331550804
recall score : 0.13157894736842105
precision score : 0.5714285714285714
thresh : 0.48
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 13s) Loss: 0.6572(0.6572) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4786(0.5152) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.6869(0.5217) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.5417(0.5184) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.4884(0.4884) 


Epoch 2 - avg_train_loss: 0.5184  avg_val_loss: 0.5709  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5184  avg_val_loss: 0.5709  time: 14s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4272(0.5709) 
f1 score : 0.18784530386740333
recall score : 0.1118421052631579
precision score : 0.5862068965517241
thresh : 0.45
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.3806(0.3806) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4721(0.4412) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4521(0.4363) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.5359(0.4342) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5103(0.5103) 


Epoch 3 - avg_train_loss: 0.4342  avg_val_loss: 0.5786  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4342  avg_val_loss: 0.5786  time: 14s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4212(0.5786) 
f1 score : 0.30841121495327106
recall score : 0.21710526315789475
precision score : 0.532258064516129
thresh : 0.52
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 13s) Loss: 0.3097(0.3097) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.5325(0.3640) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5030(0.3624) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3756(0.3577) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5150(0.5150) 


Epoch 4 - avg_train_loss: 0.3577  avg_val_loss: 0.5852  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3577  avg_val_loss: 0.5852  time: 14s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3830(0.5852) 
f1 score : 0.384
recall score : 0.3157894736842105
precision score : 0.4897959183673469
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.4183(0.4183) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.3572(0.3277) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2901(0.3235) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.2659(0.3230) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5136(0.5136) 


Epoch 5 - avg_train_loss: 0.3230  avg_val_loss: 0.5857  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3230  avg_val_loss: 0.5857  time: 14s
Epoch 5 - Score: 0.7143
INFO:__main__:Epoch 5 - Score: 0.7143
Epoch 5 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.3804(0.5857) 
f1 score : 0.37398373983739835
recall score : 0.3026315789473684
precision score : 0.48936170212765956
thresh : 0.58


Score: 0.6901
INFO:__main__:Score: 0.6901
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.37398373983739835
recall score : 0.3026315789473684
precision score : 0.48936170212765956
thresh : 0.58


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.6295(0.6295) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.6714(0.6178) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5674(0.6058) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.7362(0.6045) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7276(0.7276) 


Epoch 1 - avg_train_loss: 0.6045  avg_val_loss: 0.6559  time: 14s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6045  avg_val_loss: 0.6559  time: 14s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6861(0.6559) 
f1 score : 0.4922279792746114
recall score : 0.625
precision score : 0.405982905982906
thresh : 0.67
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.5364(0.5364) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4275(0.5247) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.7733(0.5171) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4412(0.5178) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6744(0.6744) 


Epoch 2 - avg_train_loss: 0.5178  avg_val_loss: 0.5868  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5178  avg_val_loss: 0.5868  time: 14s
Epoch 2 - Score: 0.7042
INFO:__main__:Epoch 2 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.8183(0.5868) 
f1 score : 0.2882882882882883
recall score : 0.21052631578947367
precision score : 0.45714285714285713
thresh : 0.65
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 13s) Loss: 0.4796(0.4796) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4182(0.4302) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2935(0.4317) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3416(0.4281) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7107(0.7107) 


Epoch 3 - avg_train_loss: 0.4281  avg_val_loss: 0.5930  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4281  avg_val_loss: 0.5930  time: 14s
Epoch 3 - Score: 0.7062
INFO:__main__:Epoch 3 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.8384(0.5930) 
f1 score : 0.27947598253275113
recall score : 0.21052631578947367
precision score : 0.4155844155844156
thresh : 0.69
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.4904(0.4904) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.3173(0.3674) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3910(0.3590) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.4555(0.3529) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7357(0.7357) 


Epoch 4 - avg_train_loss: 0.3529  avg_val_loss: 0.6011  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3529  avg_val_loss: 0.6011  time: 14s
Epoch 4 - Score: 0.7062
INFO:__main__:Epoch 4 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.8501(0.6011) 
f1 score : 0.3726235741444867
recall score : 0.3223684210526316
precision score : 0.44144144144144143
thresh : 0.75
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.3460(0.3460) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.3597(0.3167) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2449(0.3147) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4085(0.3200) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7340(0.7340) 


Epoch 5 - avg_train_loss: 0.3200  avg_val_loss: 0.5997  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3200  avg_val_loss: 0.5997  time: 14s
Epoch 5 - Score: 0.7062
INFO:__main__:Epoch 5 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.8967(0.5997) 
f1 score : 0.3217391304347826
recall score : 0.24342105263157895
precision score : 0.47435897435897434
thresh : 0.71


Score: 0.6056
INFO:__main__:Score: 0.6056
ACC BEST Score: 0.7082
INFO:__main__:ACC BEST Score: 0.7082
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.4922279792746114
recall score : 0.625
precision score : 0.405982905982906
thresh : 0.67


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.8981(0.8981) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.7760(0.6342) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5743(0.6217) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.6063(0.6156) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6299(0.6299) 


Epoch 1 - avg_train_loss: 0.6156  avg_val_loss: 0.5832  time: 13s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6156  avg_val_loss: 0.5832  time: 13s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5158(0.5832) 
f1 score : 0.15116279069767444
recall score : 0.08552631578947369
precision score : 0.65
thresh : 0.42
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.5886(0.5886) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.4547(0.5324) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4272(0.5323) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4135(0.5283) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5970(0.5970) 


Epoch 2 - avg_train_loss: 0.5283  avg_val_loss: 0.5753  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5283  avg_val_loss: 0.5753  time: 14s
Epoch 2 - Score: 0.7203
INFO:__main__:Epoch 2 - Score: 0.7203
Epoch 2 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5397(0.5753) 
f1 score : 0.2829268292682927
recall score : 0.19078947368421054
precision score : 0.5471698113207547
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.3554(0.3554) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.3382(0.4452) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3221(0.4432) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4340(0.4434) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5801(0.5801) 


Epoch 3 - avg_train_loss: 0.4434  avg_val_loss: 0.5706  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4434  avg_val_loss: 0.5706  time: 14s
Epoch 3 - Score: 0.7284
INFO:__main__:Epoch 3 - Score: 0.7284
Epoch 3 - Save Best Score: 0.7284 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7284 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5446(0.5706) 
f1 score : 0.3584905660377358
recall score : 0.25
precision score : 0.6333333333333333
thresh : 0.48
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.4853(0.4853) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.2881(0.3798) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3321(0.3683) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.2192(0.3677) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5692(0.5692) 


Epoch 4 - avg_train_loss: 0.3677  avg_val_loss: 0.5719  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3677  avg_val_loss: 0.5719  time: 14s
Epoch 4 - Score: 0.7324
INFO:__main__:Epoch 4 - Score: 0.7324
Epoch 4 - Save Best Score: 0.7324 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7324 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5660(0.5719) 
f1 score : 0.41525423728813565
recall score : 0.3223684210526316
precision score : 0.5833333333333334
thresh : 0.52
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 15s) Loss: 0.3186(0.3186) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.4063(0.3373) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3792(0.3365) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4754(0.3363) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.5708(0.5708) 


Epoch 5 - avg_train_loss: 0.3363  avg_val_loss: 0.5721  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3363  avg_val_loss: 0.5721  time: 14s
Epoch 5 - Score: 0.7284
INFO:__main__:Epoch 5 - Score: 0.7284


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5608(0.5721) 
f1 score : 0.39826839826839827
recall score : 0.3026315789473684
precision score : 0.5822784810126582
thresh : 0.51


Score: 0.7223
INFO:__main__:Score: 0.7223
ACC BEST Score: 0.7324
INFO:__main__:ACC BEST Score: 0.7324
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.41525423728813565
recall score : 0.3223684210526316
precision score : 0.5833333333333334
thresh : 0.52


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 24s) Loss: 0.6899(0.6899) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.7286(0.6440) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.5276(0.6187) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.3074(0.6075) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7128(0.7128) 


Epoch 1 - avg_train_loss: 0.6075  avg_val_loss: 0.5924  time: 14s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6075  avg_val_loss: 0.5924  time: 14s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7952(0.5924) 
f1 score : 0.07594936708860758
recall score : 0.039473684210526314
precision score : 1.0
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.6356(0.6356) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 1.1199(0.5140) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.7428(0.5174) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.6122(0.5227) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7141(0.7141) 


Epoch 2 - avg_train_loss: 0.5227  avg_val_loss: 0.5739  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5227  avg_val_loss: 0.5739  time: 14s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7353(0.5739) 
f1 score : 0.17045454545454547
recall score : 0.09868421052631579
precision score : 0.625
thresh : 0.46
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.5137(0.5137) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3172(0.4489) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3539(0.4461) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3746(0.4421) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7042(0.7042) 


Epoch 3 - avg_train_loss: 0.4421  avg_val_loss: 0.5717  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4421  avg_val_loss: 0.5717  time: 14s
Epoch 3 - Score: 0.7163
INFO:__main__:Epoch 3 - Score: 0.7163
Epoch 3 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6373(0.5717) 
f1 score : 0.32286995515695066
recall score : 0.23684210526315788
precision score : 0.5070422535211268
thresh : 0.55
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.4867(0.4867) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.3683(0.3832) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4820(0.3711) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3310(0.3678) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7375(0.7375) 


Epoch 4 - avg_train_loss: 0.3678  avg_val_loss: 0.5737  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3678  avg_val_loss: 0.5737  time: 14s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6445(0.5737) 
f1 score : 0.3733333333333334
recall score : 0.27631578947368424
precision score : 0.5753424657534246
thresh : 0.5
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.2579(0.2579) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3579(0.3407) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2722(0.3367) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3810(0.3351) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7424(0.7424) 


Epoch 5 - avg_train_loss: 0.3351  avg_val_loss: 0.5741  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3351  avg_val_loss: 0.5741  time: 14s
Epoch 5 - Score: 0.7163
INFO:__main__:Epoch 5 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.6436(0.5741) 
f1 score : 0.37554585152838427
recall score : 0.28289473684210525
precision score : 0.5584415584415584
thresh : 0.56


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.32286995515695066
recall score : 0.23684210526315788
precision score : 0.5070422535211268
thresh : 0.55


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.6135(0.6135) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.7401(0.6293) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.6245(0.6194) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 11s (remain 0m 0s) Loss: 0.5035(0.6078) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6277(0.6277) 


Epoch 1 - avg_train_loss: 0.6078  avg_val_loss: 0.5771  time: 13s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6078  avg_val_loss: 0.5771  time: 13s
Epoch 1 - Score: 0.7123
INFO:__main__:Epoch 1 - Score: 0.7123
Epoch 1 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7726(0.5771) 
f1 score : 0.1411764705882353
recall score : 0.07894736842105263
precision score : 0.6666666666666666
thresh : 0.55
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.5752(0.5752) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.6074(0.5202) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4253(0.5214) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.5331(0.5202) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6208(0.6208) 


Epoch 2 - avg_train_loss: 0.5202  avg_val_loss: 0.5724  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5202  avg_val_loss: 0.5724  time: 14s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163
Epoch 2 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.7741(0.5724) 
f1 score : 0.2524271844660194
recall score : 0.17105263157894737
precision score : 0.48148148148148145
thresh : 0.61
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.3639(0.3639) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.5152(0.4365) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.3359(0.4337) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3456(0.4316) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6271(0.6271) 


Epoch 3 - avg_train_loss: 0.4316  avg_val_loss: 0.5768  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4316  avg_val_loss: 0.5768  time: 14s
Epoch 3 - Score: 0.7203
INFO:__main__:Epoch 3 - Score: 0.7203
Epoch 3 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.8664(0.5768) 
f1 score : 0.28169014084507044
recall score : 0.19736842105263158
precision score : 0.4918032786885246
thresh : 0.59
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 16s) Loss: 0.2963(0.2963) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.3632(0.3586) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2674(0.3626) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4292(0.3613) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6248(0.6248) 


Epoch 4 - avg_train_loss: 0.3613  avg_val_loss: 0.5803  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3613  avg_val_loss: 0.5803  time: 14s
Epoch 4 - Score: 0.7243
INFO:__main__:Epoch 4 - Score: 0.7243
Epoch 4 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.8773(0.5803) 
f1 score : 0.33476394849785407
recall score : 0.2565789473684211
precision score : 0.48148148148148145
thresh : 0.66
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 17s) Loss: 0.3137(0.3137) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.3515(0.3214) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2088(0.3254) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3323(0.3273) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6243(0.6243) 


Epoch 5 - avg_train_loss: 0.3273  avg_val_loss: 0.5810  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3273  avg_val_loss: 0.5810  time: 14s
Epoch 5 - Score: 0.7243
INFO:__main__:Epoch 5 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.8727(0.5810) 
f1 score : 0.34309623430962344
recall score : 0.26973684210526316
precision score : 0.47126436781609193
thresh : 0.67


Score: 0.6881
INFO:__main__:Score: 0.6881
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_distilbert_base_uncased_epoch10",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

INFO:__main__:DistilBertConfig {
  "_name_or_path": "/con

f1 score : 0.33476394849785407
recall score : 0.2565789473684211
precision score : 0.48148148148148145
thresh : 0.66


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_distilbert_base_uncased_epoch10 were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.6547(0.6547) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.5031(0.6233) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.6199(0.6066) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4978(0.5973) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6182(0.6182) 


Epoch 1 - avg_train_loss: 0.5973  avg_val_loss: 0.5805  time: 14s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5973  avg_val_loss: 0.5805  time: 14s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5281(0.5805) 
f1 score : 0.1564245810055866
recall score : 0.09210526315789473
precision score : 0.5185185185185185
thresh : 0.55
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.4733(0.4733) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 4s (remain 0m 7s) Loss: 0.5811(0.5104) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4667(0.5114) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.6285(0.5095) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6514(0.6514) 


Epoch 2 - avg_train_loss: 0.5095  avg_val_loss: 0.5743  time: 14s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5095  avg_val_loss: 0.5743  time: 14s
Epoch 2 - Score: 0.7183
INFO:__main__:Epoch 2 - Score: 0.7183
Epoch 2 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5266(0.5743) 
f1 score : 0.2897196261682243
recall score : 0.20394736842105263
precision score : 0.5
thresh : 0.63
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5037(0.5037) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3191(0.4258) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4496(0.4252) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4332(0.4232) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7080(0.7080) 


Epoch 3 - avg_train_loss: 0.4232  avg_val_loss: 0.5952  time: 14s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4232  avg_val_loss: 0.5952  time: 14s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.5513(0.5952) 
f1 score : 0.467948717948718
recall score : 0.48026315789473684
precision score : 0.45625
thresh : 0.78
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 13s) Loss: 0.2942(0.2942) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.2594(0.3521) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.4460(0.3533) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.2898(0.3477) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.6876(0.6876) 


Epoch 4 - avg_train_loss: 0.3477  avg_val_loss: 0.5758  time: 14s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3477  avg_val_loss: 0.5758  time: 14s
Epoch 4 - Score: 0.7264
INFO:__main__:Epoch 4 - Score: 0.7264
Epoch 4 - Save Best Score: 0.7264 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7264 Model


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4658(0.5758) 
f1 score : 0.41702127659574467
recall score : 0.3223684210526316
precision score : 0.5903614457831325
thresh : 0.51
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.2816(0.2816) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 4s (remain 0m 8s) Loss: 0.3319(0.3110) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 8s (remain 0m 3s) Loss: 0.2472(0.3137) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3544(0.3150) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7021(0.7021) 


Epoch 5 - avg_train_loss: 0.3150  avg_val_loss: 0.5800  time: 14s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3150  avg_val_loss: 0.5800  time: 14s
Epoch 5 - Score: 0.7264
INFO:__main__:Epoch 5 - Score: 0.7264


EVAL: [15/16] Elapsed 0m 1s (remain 0m 0s) Loss: 0.4770(0.5800) 
f1 score : 0.43678160919540227
recall score : 0.375
precision score : 0.5229357798165137
thresh : 0.55


Score: 0.7243
INFO:__main__:Score: 0.7243
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
Score: 0.6902
INFO:__main__:Score: 0.6902
ACC BEST Score: 0.7079
INFO:__main__:ACC BEST Score: 0.7079


f1 score : 0.41702127659574467
recall score : 0.3223684210526316
precision score : 0.5903614457831325
thresh : 0.51
f1 score : 0.3872763419483101
recall score : 0.319763624425476
precision score : 0.4909274193548387
thresh : 0.67


In [22]:
from google.colab import runtime
runtime.unassign()