In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sat May  6 10:12:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    45W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_google_electra_base_discriminator_epoch10')
OUTPUT_EXP_DIR = DIR + '/output/EXP069/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="google/electra-base-discriminator"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 3 # cls + sep + sep
#LOGGER.info(f"max_len: {CFG.max_len}")

In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 1s (remain 6m 37s) Loss: 0.8219(0.8219) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 9s (remain 0m 16s) Loss: 0.5884(0.6171) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.7116(0.6016) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4435(0.6004) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5622(0.5622) 


Epoch 1 - avg_train_loss: 0.6004  avg_val_loss: 0.5885  time: 27s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6004  avg_val_loss: 0.5885  time: 27s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6488(0.5885) 
f1 score : 0.08433734939759036
recall score : 0.0457516339869281
precision score : 0.5384615384615384
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.4038(0.4038) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6015(0.4383) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4941(0.4394) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.7094(0.4406) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6510(0.6510) 


Epoch 2 - avg_train_loss: 0.4406  avg_val_loss: 0.6546  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4406  avg_val_loss: 0.6546  time: 26s
Epoch 2 - Score: 0.7149
INFO:__main__:Epoch 2 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7577(0.6546) 
f1 score : 0.0975609756097561
recall score : 0.05228758169934641
precision score : 0.7272727272727273
thresh : 0.38
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.2741(0.2741) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1579(0.2218) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.1452(0.1975) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1507(0.1936) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5131(0.5131) 


Epoch 3 - avg_train_loss: 0.1936  avg_val_loss: 0.6118  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1936  avg_val_loss: 0.6118  time: 26s
Epoch 3 - Score: 0.7068
INFO:__main__:Epoch 3 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7502(0.6118) 
f1 score : 0.43986254295532645
recall score : 0.41830065359477125
precision score : 0.463768115942029
thresh : 0.74
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.0939(0.0939) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0962(0.0785) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0757(0.0740) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0685(0.0723) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5131(0.5131) 


Epoch 4 - avg_train_loss: 0.0723  avg_val_loss: 0.6443  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0723  avg_val_loss: 0.6443  time: 26s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7975(0.6443) 
f1 score : 0.3291139240506329
recall score : 0.2549019607843137
precision score : 0.4642857142857143
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.0457(0.0457) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0475(0.0479) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0349(0.0471) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0450(0.0474) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5205(0.5205) 


Epoch 5 - avg_train_loss: 0.0474  avg_val_loss: 0.6600  time: 25s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0474  avg_val_loss: 0.6600  time: 25s
Epoch 5 - Score: 0.7169
INFO:__main__:Epoch 5 - Score: 0.7169
Epoch 5 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8215(0.6600) 
f1 score : 0.34309623430962344
recall score : 0.2679738562091503
precision score : 0.47674418604651164
thresh : 0.63


Score: 0.6847
INFO:__main__:Score: 0.6847
ACC BEST Score: 0.7169
INFO:__main__:ACC BEST Score: 0.7169
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.34309623430962344
recall score : 0.2679738562091503
precision score : 0.47674418604651164
thresh : 0.63


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.7174(0.7174) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.5154(0.6142) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.5511(0.6135) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.7794(0.6074) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4983(0.4983) 


Epoch 1 - avg_train_loss: 0.6074  avg_val_loss: 0.5788  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6074  avg_val_loss: 0.5788  time: 26s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4996(0.5788) 
f1 score : 0.25120772946859904
recall score : 0.16993464052287582
precision score : 0.48148148148148145
thresh : 0.52
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.5726(0.5726) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6105(0.4285) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.3958(0.4299) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.4304(0.4173) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4548(0.4548) 


Epoch 2 - avg_train_loss: 0.4173  avg_val_loss: 0.5843  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4173  avg_val_loss: 0.5843  time: 26s
Epoch 2 - Score: 0.7189
INFO:__main__:Epoch 2 - Score: 0.7189
Epoch 2 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4912(0.5843) 
f1 score : 0.2955665024630542
recall score : 0.19607843137254902
precision score : 0.6
thresh : 0.49
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.2267(0.2267) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1514(0.1622) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.1228(0.1586) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0830(0.1517) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4684(0.4684) 


Epoch 3 - avg_train_loss: 0.1517  avg_val_loss: 0.6104  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1517  avg_val_loss: 0.6104  time: 26s
Epoch 3 - Score: 0.7028
INFO:__main__:Epoch 3 - Score: 0.7028


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5423(0.6104) 
f1 score : 0.3514644351464435
recall score : 0.27450980392156865
precision score : 0.4883720930232558
thresh : 0.59
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0551(0.0551) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0547(0.0585) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0469(0.0562) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0353(0.0550) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4256(0.4256) 


Epoch 4 - avg_train_loss: 0.0550  avg_val_loss: 0.6685  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0550  avg_val_loss: 0.6685  time: 26s
Epoch 4 - Score: 0.7048
INFO:__main__:Epoch 4 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5454(0.6685) 
f1 score : 0.25365853658536586
recall score : 0.16993464052287582
precision score : 0.5
thresh : 0.41
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0312(0.0312) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0420(0.0395) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0484(0.0395) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0326(0.0393) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4235(0.4235) 


Epoch 5 - avg_train_loss: 0.0393  avg_val_loss: 0.6665  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0393  avg_val_loss: 0.6665  time: 26s
Epoch 5 - Score: 0.7048
INFO:__main__:Epoch 5 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5538(0.6665) 
f1 score : 0.2830188679245283
recall score : 0.19607843137254902
precision score : 0.5084745762711864
thresh : 0.37


Score: 0.7129
INFO:__main__:Score: 0.7129
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.2955665024630542
recall score : 0.19607843137254902
precision score : 0.6
thresh : 0.49


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.7849(0.7849) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6665(0.6247) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6008(0.6107) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.4757(0.6025) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5507(0.5507) 


Epoch 1 - avg_train_loss: 0.6025  avg_val_loss: 0.5849  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6025  avg_val_loss: 0.5849  time: 25s
Epoch 1 - Score: 0.7088
INFO:__main__:Epoch 1 - Score: 0.7088
Epoch 1 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4536(0.5849) 
f1 score : 0.16853932584269662
recall score : 0.09803921568627451
precision score : 0.6
thresh : 0.79
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.4411(0.4411) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3298(0.4420) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4249(0.4404) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3571(0.4385) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5252(0.5252) 


Epoch 2 - avg_train_loss: 0.4385  avg_val_loss: 0.5952  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4385  avg_val_loss: 0.5952  time: 26s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3821(0.5952) 
f1 score : 0.19672131147540983
recall score : 0.11764705882352941
precision score : 0.6
thresh : 0.41
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.1782(0.1782) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1573(0.1783) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0896(0.1678) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0829(0.1607) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5918(0.5918) 


Epoch 3 - avg_train_loss: 0.1607  avg_val_loss: 0.6380  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1607  avg_val_loss: 0.6380  time: 25s
Epoch 3 - Score: 0.7149
INFO:__main__:Epoch 3 - Score: 0.7149
Epoch 3 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3665(0.6380) 
f1 score : 0.27
recall score : 0.17647058823529413
precision score : 0.574468085106383
thresh : 0.52
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0656(0.0656) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0580(0.0553) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0509(0.0545) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0538(0.0530) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6271(0.6271) 


Epoch 4 - avg_train_loss: 0.0530  avg_val_loss: 0.6522  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0530  avg_val_loss: 0.6522  time: 26s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3620(0.6522) 
f1 score : 0.32727272727272727
recall score : 0.23529411764705882
precision score : 0.5373134328358209
thresh : 0.6
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0256(0.0256) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0578(0.0379) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0304(0.0366) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0238(0.0363) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6438(0.6438) 


Epoch 5 - avg_train_loss: 0.0363  avg_val_loss: 0.6677  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0363  avg_val_loss: 0.6677  time: 26s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3589(0.6677) 
f1 score : 0.3317972350230415
recall score : 0.23529411764705882
precision score : 0.5625
thresh : 0.51


Score: 0.7068
INFO:__main__:Score: 0.7068
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.27
recall score : 0.17647058823529413
precision score : 0.574468085106383
thresh : 0.52


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.8034(0.8034) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.7962(0.6267) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6173(0.6102) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3706(0.6086) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6634(0.6634) 


Epoch 1 - avg_train_loss: 0.6086  avg_val_loss: 0.5733  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6086  avg_val_loss: 0.5733  time: 25s
Epoch 1 - Score: 0.7189
INFO:__main__:Epoch 1 - Score: 0.7189
Epoch 1 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5330(0.5733) 
f1 score : 0.08641975308641975
recall score : 0.046052631578947366
precision score : 0.7
thresh : 0.38
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.5395(0.5395) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.5835(0.4199) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.3778(0.4150) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3188(0.4113) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7191(0.7191) 


Epoch 2 - avg_train_loss: 0.4113  avg_val_loss: 0.5773  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4113  avg_val_loss: 0.5773  time: 26s
Epoch 2 - Score: 0.7229
INFO:__main__:Epoch 2 - Score: 0.7229
Epoch 2 - Save Best Score: 0.7229 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7229 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6112(0.5773) 
f1 score : 0.36936936936936937
recall score : 0.26973684210526316
precision score : 0.5857142857142857
thresh : 0.47
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.1496(0.1496) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0904(0.1547) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0735(0.1427) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1311(0.1386) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6754(0.6754) 


Epoch 3 - avg_train_loss: 0.1386  avg_val_loss: 0.5689  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1386  avg_val_loss: 0.5689  time: 26s
Epoch 3 - Score: 0.7249
INFO:__main__:Epoch 3 - Score: 0.7249
Epoch 3 - Save Best Score: 0.7249 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7249 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5854(0.5689) 
f1 score : 0.4067796610169492
recall score : 0.3157894736842105
precision score : 0.5714285714285714
thresh : 0.47
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0709(0.0709) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0772(0.0512) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0475(0.0503) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0496(0.0496) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7337(0.7337) 


Epoch 4 - avg_train_loss: 0.0496  avg_val_loss: 0.5919  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0496  avg_val_loss: 0.5919  time: 26s
Epoch 4 - Score: 0.7289
INFO:__main__:Epoch 4 - Score: 0.7289
Epoch 4 - Save Best Score: 0.7289 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7289 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5477(0.5919) 
f1 score : 0.3364485981308411
recall score : 0.23684210526315788
precision score : 0.5806451612903226
thresh : 0.4
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.0442(0.0442) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0449(0.0359) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0401(0.0361) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0384(0.0360) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7754(0.7754) 


Epoch 5 - avg_train_loss: 0.0360  avg_val_loss: 0.6128  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0360  avg_val_loss: 0.6128  time: 26s
Epoch 5 - Score: 0.7249
INFO:__main__:Epoch 5 - Score: 0.7249


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5267(0.6128) 
f1 score : 0.30097087378640774
recall score : 0.20394736842105263
precision score : 0.5740740740740741
thresh : 0.35


Score: 0.7149
INFO:__main__:Score: 0.7149
ACC BEST Score: 0.7289
INFO:__main__:ACC BEST Score: 0.7289
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.3364485981308411
recall score : 0.23684210526315788
precision score : 0.5806451612903226
thresh : 0.4


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.8485(0.8485) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.4924(0.6433) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6976(0.6163) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.5504(0.6116) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4862(0.4862) 


Epoch 1 - avg_train_loss: 0.6116  avg_val_loss: 0.5762  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6116  avg_val_loss: 0.5762  time: 25s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4367(0.5762) 
f1 score : 0.075
recall score : 0.039473684210526314
precision score : 0.75
thresh : 0.31
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.5502(0.5502) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3143(0.4530) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4375(0.4580) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.5260(0.4655) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5452(0.5452) 


Epoch 2 - avg_train_loss: 0.4655  avg_val_loss: 0.5786  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4655  avg_val_loss: 0.5786  time: 26s
Epoch 2 - Score: 0.7042
INFO:__main__:Epoch 2 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3969(0.5786) 
f1 score : 0.3063063063063063
recall score : 0.2236842105263158
precision score : 0.4857142857142857
thresh : 0.57
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.2210(0.2210) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1712(0.2067) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0983(0.1950) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2074(0.1890) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5920(0.5920) 


Epoch 3 - avg_train_loss: 0.1890  avg_val_loss: 0.6201  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1890  avg_val_loss: 0.6201  time: 26s
Epoch 3 - Score: 0.7062
INFO:__main__:Epoch 3 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.3378(0.6201) 
f1 score : 0.29767441860465116
recall score : 0.21052631578947367
precision score : 0.5079365079365079
thresh : 0.65
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0588(0.0588) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0503(0.0698) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0506(0.0666) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0463(0.0639) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6391(0.6391) 


Epoch 4 - avg_train_loss: 0.0639  avg_val_loss: 0.6505  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0639  avg_val_loss: 0.6505  time: 26s
Epoch 4 - Score: 0.7042
INFO:__main__:Epoch 4 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.2866(0.6505) 
f1 score : 0.35087719298245607
recall score : 0.2631578947368421
precision score : 0.5263157894736842
thresh : 0.61
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0535(0.0535) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0485(0.0445) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0334(0.0435) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0476(0.0429) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6675(0.6675) 


Epoch 5 - avg_train_loss: 0.0429  avg_val_loss: 0.6602  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0429  avg_val_loss: 0.6602  time: 26s
Epoch 5 - Score: 0.7042
INFO:__main__:Epoch 5 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.2854(0.6602) 
f1 score : 0.36206896551724144
recall score : 0.27631578947368424
precision score : 0.525
thresh : 0.65


Score: 0.7022
INFO:__main__:Score: 0.7022
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.075
recall score : 0.039473684210526314
precision score : 0.75
thresh : 0.31


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.8177(0.8177) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6411(0.6265) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4134(0.6113) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.6121(0.6125) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6170(0.6170) 


Epoch 1 - avg_train_loss: 0.6125  avg_val_loss: 0.6160  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6125  avg_val_loss: 0.6160  time: 25s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6832(0.6160) 
f1 score : 0.13559322033898302
recall score : 0.07894736842105263
precision score : 0.48
thresh : 0.76
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.5345(0.5345) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3457(0.4820) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4824(0.4739) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3367(0.4733) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6716(0.6716) 


Epoch 2 - avg_train_loss: 0.4733  avg_val_loss: 0.6299  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4733  avg_val_loss: 0.6299  time: 26s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8707(0.6299) 
f1 score : 0.09876543209876543
recall score : 0.05263157894736842
precision score : 0.8
thresh : 0.56
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.2123(0.2123) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1395(0.2262) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.1631(0.2057) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1969(0.1996) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7287(0.7287) 


Epoch 3 - avg_train_loss: 0.1996  avg_val_loss: 0.6103  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1996  avg_val_loss: 0.6103  time: 26s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7955(0.6103) 
f1 score : 0.2621359223300971
recall score : 0.17763157894736842
precision score : 0.5
thresh : 0.75
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.0787(0.0787) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0795(0.0734) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0523(0.0701) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0699(0.0690) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7916(0.7916) 


Epoch 4 - avg_train_loss: 0.0690  avg_val_loss: 0.6197  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0690  avg_val_loss: 0.6197  time: 26s
Epoch 4 - Score: 0.7042
INFO:__main__:Epoch 4 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8154(0.6197) 
f1 score : 0.3259911894273128
recall score : 0.24342105263157895
precision score : 0.49333333333333335
thresh : 0.47
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0576(0.0576) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0486(0.0486) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0516(0.0476) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0482(0.0476) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8456(0.8456) 


Epoch 5 - avg_train_loss: 0.0476  avg_val_loss: 0.6473  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0476  avg_val_loss: 0.6473  time: 26s
Epoch 5 - Score: 0.7022
INFO:__main__:Epoch 5 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8869(0.6473) 
f1 score : 0.2559241706161137
recall score : 0.17763157894736842
precision score : 0.4576271186440678
thresh : 0.41


Score: 0.6922
INFO:__main__:Score: 0.6922
ACC BEST Score: 0.7082
INFO:__main__:ACC BEST Score: 0.7082
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.13559322033898302
recall score : 0.07894736842105263
precision score : 0.48
thresh : 0.76


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.6493(0.6493) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.5918(0.6236) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6505(0.6103) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.6484(0.6121) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6030(0.6030) 


Epoch 1 - avg_train_loss: 0.6121  avg_val_loss: 0.6234  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6121  avg_val_loss: 0.6234  time: 25s
Epoch 1 - Score: 0.7123
INFO:__main__:Epoch 1 - Score: 0.7123
Epoch 1 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6277(0.6234) 
f1 score : 0.33884297520661155
recall score : 0.26973684210526316
precision score : 0.45555555555555555
thresh : 0.6
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.5836(0.5836) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.4047(0.4641) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.2460(0.4489) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.6025(0.4443) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5761(0.5761) 


Epoch 2 - avg_train_loss: 0.4443  avg_val_loss: 0.6030  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4443  avg_val_loss: 0.6030  time: 26s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6053(0.6030) 
f1 score : 0.23999999999999996
recall score : 0.15789473684210525
precision score : 0.5
thresh : 0.59
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.1833(0.1833) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.2050(0.1926) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.1370(0.1791) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1729(0.1732) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6018(0.6018) 


Epoch 3 - avg_train_loss: 0.1732  avg_val_loss: 0.6603  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1732  avg_val_loss: 0.6603  time: 26s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183
Epoch 3 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5582(0.6603) 
f1 score : 0.1910112359550562
recall score : 0.1118421052631579
precision score : 0.6538461538461539
thresh : 0.52
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0625(0.0625) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0460(0.0648) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0426(0.0611) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0369(0.0597) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5397(0.5397) 


Epoch 4 - avg_train_loss: 0.0597  avg_val_loss: 0.6569  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0597  avg_val_loss: 0.6569  time: 26s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6243(0.6569) 
f1 score : 0.2692307692307692
recall score : 0.18421052631578946
precision score : 0.5
thresh : 0.61
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0405(0.0405) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0402(0.0430) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0545(0.0419) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0406(0.0419) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5528(0.5528) 


Epoch 5 - avg_train_loss: 0.0419  avg_val_loss: 0.6727  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0419  avg_val_loss: 0.6727  time: 26s
Epoch 5 - Score: 0.7183
INFO:__main__:Epoch 5 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.6215(0.6727) 
f1 score : 0.27
recall score : 0.17763157894736842
precision score : 0.5625
thresh : 0.59


Score: 0.7103
INFO:__main__:Score: 0.7103
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.1910112359550562
recall score : 0.1118421052631579
precision score : 0.6538461538461539
thresh : 0.52


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.7087(0.7087) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6353(0.6301) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6121(0.6125) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.5246(0.6044) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6643(0.6643) 


Epoch 1 - avg_train_loss: 0.6044  avg_val_loss: 0.5792  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6044  avg_val_loss: 0.5792  time: 25s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8030(0.5792) 
f1 score : 0.1797752808988764
recall score : 0.10526315789473684
precision score : 0.6153846153846154
thresh : 0.6
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.5932(0.5932) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.3526(0.4509) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.3521(0.4354) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.5702(0.4239) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7171(0.7171) 


Epoch 2 - avg_train_loss: 0.4239  avg_val_loss: 0.5786  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4239  avg_val_loss: 0.5786  time: 26s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8243(0.5786) 
f1 score : 0.23350253807106602
recall score : 0.1513157894736842
precision score : 0.5111111111111111
thresh : 0.41
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.1934(0.1934) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1370(0.1687) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0817(0.1536) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1082(0.1474) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8015(0.8015) 


Epoch 3 - avg_train_loss: 0.1474  avg_val_loss: 0.5961  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1474  avg_val_loss: 0.5961  time: 26s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9103(0.5961) 
f1 score : 0.35242290748898675
recall score : 0.2631578947368421
precision score : 0.5333333333333333
thresh : 0.71
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0649(0.0649) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0471(0.0557) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0364(0.0537) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0501(0.0514) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8996(0.8996) 


Epoch 4 - avg_train_loss: 0.0514  avg_val_loss: 0.6354  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0514  avg_val_loss: 0.6354  time: 26s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 1.0745(0.6354) 
f1 score : 0.31627906976744186
recall score : 0.2236842105263158
precision score : 0.5396825396825397
thresh : 0.71
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0312(0.0312) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0393(0.0352) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0351(0.0353) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0323(0.0351) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.9560(0.9560) 


Epoch 5 - avg_train_loss: 0.0351  avg_val_loss: 0.6587  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0351  avg_val_loss: 0.6587  time: 26s
Epoch 5 - Score: 0.7143
INFO:__main__:Epoch 5 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 1.1357(0.6587) 
f1 score : 0.31627906976744186
recall score : 0.2236842105263158
precision score : 0.5396825396825397
thresh : 0.73


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.23350253807106602
recall score : 0.1513157894736842
precision score : 0.5111111111111111
thresh : 0.41


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.8373(0.8373) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6616(0.6260) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6433(0.6176) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.7340(0.6114) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6088(0.6088) 


Epoch 1 - avg_train_loss: 0.6114  avg_val_loss: 0.5771  time: 26s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6114  avg_val_loss: 0.5771  time: 26s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7044(0.5771) 
f1 score : 0.10975609756097561
recall score : 0.05921052631578947
precision score : 0.75
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.4763(0.4763) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.4789(0.4531) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6299(0.4458) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5570(0.4373) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5876(0.5876) 


Epoch 2 - avg_train_loss: 0.4373  avg_val_loss: 0.5761  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4373  avg_val_loss: 0.5761  time: 26s
Epoch 2 - Score: 0.7324
INFO:__main__:Epoch 2 - Score: 0.7324
Epoch 2 - Save Best Score: 0.7324 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7324 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.7732(0.5761) 
f1 score : 0.2608695652173913
recall score : 0.15789473684210525
precision score : 0.75
thresh : 0.52
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.2098(0.2098) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1115(0.1874) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.1603(0.1740) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1423(0.1699) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6129(0.6129) 


Epoch 3 - avg_train_loss: 0.1699  avg_val_loss: 0.6144  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1699  avg_val_loss: 0.6144  time: 26s
Epoch 3 - Score: 0.7344
INFO:__main__:Epoch 3 - Score: 0.7344
Epoch 3 - Save Best Score: 0.7344 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7344 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8458(0.6144) 
f1 score : 0.28877005347593576
recall score : 0.17763157894736842
precision score : 0.7714285714285715
thresh : 0.55
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0600(0.0600) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0537(0.0621) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0464(0.0594) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0628(0.0585) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6345(0.6345) 


Epoch 4 - avg_train_loss: 0.0585  avg_val_loss: 0.6117  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0585  avg_val_loss: 0.6117  time: 26s
Epoch 4 - Score: 0.7364
INFO:__main__:Epoch 4 - Score: 0.7364
Epoch 4 - Save Best Score: 0.7364 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7364 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.8732(0.6117) 
f1 score : 0.37837837837837845
recall score : 0.27631578947368424
precision score : 0.6
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0414(0.0414) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.0337(0.0415) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0379(0.0411) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0365(0.0408) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6479(0.6479) 


Epoch 5 - avg_train_loss: 0.0408  avg_val_loss: 0.6270  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0408  avg_val_loss: 0.6270  time: 26s
Epoch 5 - Score: 0.7344
INFO:__main__:Epoch 5 - Score: 0.7344


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.9172(0.6270) 
f1 score : 0.3766816143497759
recall score : 0.27631578947368424
precision score : 0.5915492957746479
thresh : 0.62


Score: 0.7223
INFO:__main__:Score: 0.7223
ACC BEST Score: 0.7364
INFO:__main__:ACC BEST Score: 0.7364
ElectraConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_google_electra_base_discriminator_epoch10",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj":

f1 score : 0.37837837837837845
recall score : 0.27631578947368424
precision score : 0.6
thresh : 0.59


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_google_electra_base_discriminator_epoch10 were not used when initializing ElectraModel: ['generator_predictions.dense.weight', 'generator_predictions.LayerNorm.bias', 'generator_predictions.dense.bias', 'generator_lm_head.weight', 'generator_predictions.LayerNorm.weight', 'generator_lm_head.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.8181(0.8181) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.6155(0.6124) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4760(0.6029) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.7130(0.6016) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6346(0.6346) 


Epoch 1 - avg_train_loss: 0.6016  avg_val_loss: 0.5877  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6016  avg_val_loss: 0.5877  time: 25s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4458(0.5877) 
f1 score : 0.12048192771084337
recall score : 0.06578947368421052
precision score : 0.7142857142857143
thresh : 0.48
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.4501(0.4501) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.3924(0.4273) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.3773(0.4150) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.3936(0.4101) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6622(0.6622) 


Epoch 2 - avg_train_loss: 0.4101  avg_val_loss: 0.5825  time: 26s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4101  avg_val_loss: 0.5825  time: 26s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123
Epoch 2 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5210(0.5825) 
f1 score : 0.3470319634703196
recall score : 0.25
precision score : 0.5671641791044776
thresh : 0.5
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.1968(0.1968) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 14s) Loss: 0.1984(0.1644) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.1405(0.1466) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1160(0.1446) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6639(0.6639) 


Epoch 3 - avg_train_loss: 0.1446  avg_val_loss: 0.6210  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1446  avg_val_loss: 0.6210  time: 26s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143
Epoch 3 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4898(0.6210) 
f1 score : 0.3333333333333333
recall score : 0.25
precision score : 0.5
thresh : 0.61
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.0515(0.0515) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0476(0.0480) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0479(0.0463) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0315(0.0450) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7696(0.7696) 


Epoch 4 - avg_train_loss: 0.0450  avg_val_loss: 0.6953  time: 26s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0450  avg_val_loss: 0.6953  time: 26s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5360(0.6953) 
f1 score : 0.3983739837398374
recall score : 0.3223684210526316
precision score : 0.5212765957446809
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0278(0.0278) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0216(0.0294) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.0428(0.0294) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0350(0.0290) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7943(0.7943) 


Epoch 5 - avg_train_loss: 0.0290  avg_val_loss: 0.7334  time: 26s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0290  avg_val_loss: 0.7334  time: 26s
Epoch 5 - Score: 0.7143
INFO:__main__:Epoch 5 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5465(0.7334) 
f1 score : 0.39999999999999997
recall score : 0.3157894736842105
precision score : 0.5454545454545454
thresh : 0.65


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
Score: 0.7037
INFO:__main__:Score: 0.7037
ACC BEST Score: 0.7093
INFO:__main__:ACC BEST Score: 0.7093


f1 score : 0.3333333333333333
recall score : 0.25
precision score : 0.5
thresh : 0.61
f1 score : 0.26957383548067393
recall score : 0.17859487852921865
precision score : 0.5494949494949495
thresh : 0.65


In [None]:
from google.colab import runtime
runtime.unassign()