In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, h

In [3]:
!nvidia-smi

Sat May  6 10:39:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_funnel_transformer_medium_epoch10')
OUTPUT_EXP_DIR = DIR + '/output/EXP070/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="funnel-transformer/medium"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "<sep>" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

 27%|██▋       | 1325/4974 [00:01<00:02, 1312.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 4974/4974 [00:03<00:00, 1295.10it/s]
max_len: 570
INFO:__main__:max_len: 570


In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_type": "factorized",
  "separate_cls": true,
  "torch_dtype": "float32",
  "transformers_version": "4.28

Enable AWP
Epoch: [1][0/279] Elapsed 0m 5s (remain 23m 15s) Loss: 1.1711(1.1711) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 22s (remain 0m 40s) Loss: 1.1207(0.7501) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 40s (remain 0m 15s) Loss: 0.5589(0.7018) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.6744(0.6865) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6374(0.6374) 


Epoch 1 - avg_train_loss: 0.6865  avg_val_loss: 0.6275  time: 58s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6865  avg_val_loss: 0.6275  time: 58s
Epoch 1 - Score: 0.7008
INFO:__main__:Epoch 1 - Score: 0.7008
Epoch 1 - Save Best Score: 0.7008 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7008 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6874(0.6275) 
f1 score : 0.16042780748663102
recall score : 0.09803921568627451
precision score : 0.4411764705882353
thresh : 0.61
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.4028(0.4028) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.2144(0.3147) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.4959(0.3200) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.4393(0.3206) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6896(0.6896) 


Epoch 2 - avg_train_loss: 0.3206  avg_val_loss: 0.6474  time: 54s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3206  avg_val_loss: 0.6474  time: 54s
Epoch 2 - Score: 0.7028
INFO:__main__:Epoch 2 - Score: 0.7028
Epoch 2 - Save Best Score: 0.7028 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7028 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7847(0.6474) 
f1 score : 0.2995951417004049
recall score : 0.24183006535947713
precision score : 0.39361702127659576
thresh : 0.79
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 2s) Loss: 0.2221(0.2221) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.1302(0.1152) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.1146(0.1098) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0934(0.1057) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7225(0.7225) 


Epoch 3 - avg_train_loss: 0.1057  avg_val_loss: 0.6683  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1057  avg_val_loss: 0.6683  time: 55s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108
Epoch 3 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8014(0.6683) 
f1 score : 0.3305785123966942
recall score : 0.26143790849673204
precision score : 0.449438202247191
thresh : 0.75
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.0491(0.0491) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.0425(0.0444) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0344(0.0434) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0318(0.0426) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7849(0.7849) 


Epoch 4 - avg_train_loss: 0.0426  avg_val_loss: 0.7110  time: 55s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0426  avg_val_loss: 0.7110  time: 55s
Epoch 4 - Score: 0.7088
INFO:__main__:Epoch 4 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8538(0.7110) 
f1 score : 0.2672811059907834
recall score : 0.1895424836601307
precision score : 0.453125
thresh : 0.58
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.0353(0.0353) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0316(0.0316) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 35s (remain 0m 13s) Loss: 0.0290(0.0309) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0241(0.0312) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7962(0.7962) 


Epoch 5 - avg_train_loss: 0.0312  avg_val_loss: 0.7201  time: 54s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0312  avg_val_loss: 0.7201  time: 54s
Epoch 5 - Score: 0.7088
INFO:__main__:Epoch 5 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8669(0.7201) 
f1 score : 0.27230046948356806
recall score : 0.1895424836601307
precision score : 0.48333333333333334
thresh : 0.59


Score: 0.6747
INFO:__main__:Score: 0.6747
ACC BEST Score: 0.7108
INFO:__main__:ACC BEST Score: 0.7108
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.3305785123966942
recall score : 0.26143790849673204
precision score : 0.449438202247191
thresh : 0.75


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 17s) Loss: 0.8059(0.8059) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.8958(0.7394) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.6383(0.7084) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 49s (remain 0m 0s) Loss: 0.5787(0.6877) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4478(0.4478) 


Epoch 1 - avg_train_loss: 0.6877  avg_val_loss: 0.6219  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6877  avg_val_loss: 0.6219  time: 54s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5601(0.6219) 
f1 score : 0.23958333333333331
recall score : 0.1503267973856209
precision score : 0.5897435897435898
thresh : 0.48
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 4s) Loss: 0.3588(0.3588) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.1706(0.3555) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.3281(0.3422) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.4198(0.3435) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5997(0.5997) 


Epoch 2 - avg_train_loss: 0.3435  avg_val_loss: 0.6432  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3435  avg_val_loss: 0.6432  time: 55s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7327(0.6432) 
f1 score : 0.33050847457627114
recall score : 0.2549019607843137
precision score : 0.46987951807228917
thresh : 0.65
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.1063(0.1063) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0949(0.1340) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0854(0.1295) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.1800(0.1224) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5798(0.5798) 


Epoch 3 - avg_train_loss: 0.1224  avg_val_loss: 0.6871  time: 54s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1224  avg_val_loss: 0.6871  time: 54s
Epoch 3 - Score: 0.7249
INFO:__main__:Epoch 3 - Score: 0.7249
Epoch 3 - Save Best Score: 0.7249 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7249 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7877(0.6871) 
f1 score : 0.3347280334728034
recall score : 0.26143790849673204
precision score : 0.46511627906976744
thresh : 0.67
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.0559(0.0559) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0694(0.0491) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0349(0.0469) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0439(0.0460) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5102(0.5102) 


Epoch 4 - avg_train_loss: 0.0460  avg_val_loss: 0.7484  time: 54s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0460  avg_val_loss: 0.7484  time: 54s
Epoch 4 - Score: 0.7209
INFO:__main__:Epoch 4 - Score: 0.7209


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7960(0.7484) 
f1 score : 0.30097087378640774
recall score : 0.20261437908496732
precision score : 0.5849056603773585
thresh : 0.6
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 2s) Loss: 0.0371(0.0371) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0276(0.0341) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0450(0.0336) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0235(0.0334) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5206(0.5206) 


Epoch 5 - avg_train_loss: 0.0334  avg_val_loss: 0.7448  time: 55s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0334  avg_val_loss: 0.7448  time: 55s
Epoch 5 - Score: 0.7209
INFO:__main__:Epoch 5 - Score: 0.7209


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8106(0.7448) 
f1 score : 0.3113207547169811
recall score : 0.21568627450980393
precision score : 0.559322033898305
thresh : 0.64


Score: 0.6807
INFO:__main__:Score: 0.6807
ACC BEST Score: 0.7249
INFO:__main__:ACC BEST Score: 0.7249
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.3347280334728034
recall score : 0.26143790849673204
precision score : 0.46511627906976744
thresh : 0.67


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.9931(0.9931) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.7007(0.7205) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 13s) Loss: 0.5880(0.6983) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.6793(0.6814) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6332(0.6332) 


Epoch 1 - avg_train_loss: 0.6814  avg_val_loss: 0.5933  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6814  avg_val_loss: 0.5933  time: 54s
Epoch 1 - Score: 0.7088
INFO:__main__:Epoch 1 - Score: 0.7088
Epoch 1 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3894(0.5933) 
f1 score : 0.2897196261682243
recall score : 0.20261437908496732
precision score : 0.5081967213114754
thresh : 0.59
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 1s) Loss: 0.3534(0.3534) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.3158(0.3450) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.2584(0.3330) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3969(0.3309) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6796(0.6796) 


Epoch 2 - avg_train_loss: 0.3309  avg_val_loss: 0.6238  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3309  avg_val_loss: 0.6238  time: 55s
Epoch 2 - Score: 0.7209
INFO:__main__:Epoch 2 - Score: 0.7209
Epoch 2 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4089(0.6238) 
f1 score : 0.43165467625899284
recall score : 0.39215686274509803
precision score : 0.48
thresh : 0.72
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 5s) Loss: 0.0929(0.0929) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.1212(0.1318) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0680(0.1217) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0797(0.1174) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6924(0.6924) 


Epoch 3 - avg_train_loss: 0.1174  avg_val_loss: 0.6565  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1174  avg_val_loss: 0.6565  time: 55s
Epoch 3 - Score: 0.7169
INFO:__main__:Epoch 3 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3567(0.6565) 
f1 score : 0.3373493975903615
recall score : 0.27450980392156865
precision score : 0.4375
thresh : 0.72
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.0471(0.0471) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0521(0.0480) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0357(0.0452) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0419(0.0438) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7135(0.7135) 


Epoch 4 - avg_train_loss: 0.0438  avg_val_loss: 0.6964  time: 54s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0438  avg_val_loss: 0.6964  time: 54s
Epoch 4 - Score: 0.7169
INFO:__main__:Epoch 4 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3465(0.6964) 
f1 score : 0.32599118942731276
recall score : 0.24183006535947713
precision score : 0.5
thresh : 0.72
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.0295(0.0295) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0268(0.0314) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0348(0.0314) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0341(0.0314) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7203(0.7203) 


Epoch 5 - avg_train_loss: 0.0314  avg_val_loss: 0.7015  time: 54s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0314  avg_val_loss: 0.7015  time: 54s
Epoch 5 - Score: 0.7169
INFO:__main__:Epoch 5 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3479(0.7015) 
f1 score : 0.32599118942731276
recall score : 0.24183006535947713
precision score : 0.5
thresh : 0.72


Score: 0.6827
INFO:__main__:Score: 0.6827
ACC BEST Score: 0.7209
INFO:__main__:ACC BEST Score: 0.7209
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.43165467625899284
recall score : 0.39215686274509803
precision score : 0.48
thresh : 0.72


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 11s) Loss: 1.9361(1.9361) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.9115(0.7361) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.8328(0.7008) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.4294(0.6834) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.7957(0.7957) 


Epoch 1 - avg_train_loss: 0.6834  avg_val_loss: 0.6420  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6834  avg_val_loss: 0.6420  time: 54s
Epoch 1 - Score: 0.7169
INFO:__main__:Epoch 1 - Score: 0.7169
Epoch 1 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5532(0.6420) 
f1 score : 0.18497109826589594
recall score : 0.10526315789473684
precision score : 0.7619047619047619
thresh : 0.5
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 2s) Loss: 0.4441(0.4441) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.4442(0.3802) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.4965(0.3624) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3427(0.3602) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.8212(0.8212) 


Epoch 2 - avg_train_loss: 0.3602  avg_val_loss: 0.6317  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3602  avg_val_loss: 0.6317  time: 55s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5600(0.6317) 
f1 score : 0.29951690821256033
recall score : 0.20394736842105263
precision score : 0.5636363636363636
thresh : 0.71
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.1608(0.1608) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0946(0.1421) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 35s (remain 0m 13s) Loss: 0.1167(0.1360) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.1294(0.1292) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.8155(0.8155) 


Epoch 3 - avg_train_loss: 0.1292  avg_val_loss: 0.6667  time: 54s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1292  avg_val_loss: 0.6667  time: 54s
Epoch 3 - Score: 0.7289
INFO:__main__:Epoch 3 - Score: 0.7289
Epoch 3 - Save Best Score: 0.7289 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7289 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7029(0.6667) 
f1 score : 0.3866666666666667
recall score : 0.3815789473684211
precision score : 0.3918918918918919
thresh : 0.75
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 12s) Loss: 0.0552(0.0552) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0402(0.0524) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0696(0.0491) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0427(0.0477) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.8833(0.8833) 


Epoch 4 - avg_train_loss: 0.0477  avg_val_loss: 0.6967  time: 55s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0477  avg_val_loss: 0.6967  time: 55s
Epoch 4 - Score: 0.7249
INFO:__main__:Epoch 4 - Score: 0.7249


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6310(0.6967) 
f1 score : 0.27884615384615385
recall score : 0.19078947368421054
precision score : 0.5178571428571429
thresh : 0.58
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 1s) Loss: 0.0259(0.0259) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0305(0.0326) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0440(0.0329) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0441(0.0329) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.8899(0.8899) 


Epoch 5 - avg_train_loss: 0.0329  avg_val_loss: 0.7040  time: 55s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0329  avg_val_loss: 0.7040  time: 55s
Epoch 5 - Score: 0.7249
INFO:__main__:Epoch 5 - Score: 0.7249


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6282(0.7040) 
f1 score : 0.27586206896551724
recall score : 0.18421052631578946
precision score : 0.5490196078431373
thresh : 0.61


Score: 0.6305
INFO:__main__:Score: 0.6305
ACC BEST Score: 0.7289
INFO:__main__:ACC BEST Score: 0.7289
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.3866666666666667
recall score : 0.3815789473684211
precision score : 0.3918918918918919
thresh : 0.75


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 4s) Loss: 1.2598(1.2598) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 1.1086(0.7636) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.6005(0.7132) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.5176(0.6985) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6466(0.6466) 


Epoch 1 - avg_train_loss: 0.6985  avg_val_loss: 0.6433  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6985  avg_val_loss: 0.6433  time: 54s
Epoch 1 - Score: 0.7022
INFO:__main__:Epoch 1 - Score: 0.7022
Epoch 1 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6860(0.6433) 
f1 score : 0.5277777777777778
recall score : 0.625
precision score : 0.4567307692307692
thresh : 0.74
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.5823(0.5823) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.2638(0.3799) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.4586(0.3742) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3693(0.3766) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5264(0.5264) 


Epoch 2 - avg_train_loss: 0.3766  avg_val_loss: 0.5909  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3766  avg_val_loss: 0.5909  time: 55s
Epoch 2 - Score: 0.7243
INFO:__main__:Epoch 2 - Score: 0.7243
Epoch 2 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4595(0.5909) 
f1 score : 0.47457627118644063
recall score : 0.4605263157894737
precision score : 0.48951048951048953
thresh : 0.73
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 2s) Loss: 0.1640(0.1640) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.1239(0.1509) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.1410(0.1444) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.1380(0.1400) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5194(0.5194) 


Epoch 3 - avg_train_loss: 0.1400  avg_val_loss: 0.5982  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1400  avg_val_loss: 0.5982  time: 55s
Epoch 3 - Score: 0.7223
INFO:__main__:Epoch 3 - Score: 0.7223


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4727(0.5982) 
f1 score : 0.4486692015209125
recall score : 0.3881578947368421
precision score : 0.5315315315315315
thresh : 0.76
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.0721(0.0721) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.1051(0.0546) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0414(0.0530) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0535(0.0514) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4681(0.4681) 


Epoch 4 - avg_train_loss: 0.0514  avg_val_loss: 0.6244  time: 55s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0514  avg_val_loss: 0.6244  time: 55s
Epoch 4 - Score: 0.7243
INFO:__main__:Epoch 4 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4580(0.6244) 
f1 score : 0.3675213675213675
recall score : 0.28289473684210525
precision score : 0.524390243902439
thresh : 0.69
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.0405(0.0405) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0316(0.0358) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0355(0.0354) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0350(0.0354) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4674(0.4674) 


Epoch 5 - avg_train_loss: 0.0354  avg_val_loss: 0.6300  time: 55s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0354  avg_val_loss: 0.6300  time: 55s
Epoch 5 - Score: 0.7243
INFO:__main__:Epoch 5 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4623(0.6300) 
f1 score : 0.369098712446352
recall score : 0.28289473684210525
precision score : 0.5308641975308642
thresh : 0.67


Score: 0.6881
INFO:__main__:Score: 0.6881
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.47457627118644063
recall score : 0.4605263157894737
precision score : 0.48951048951048953
thresh : 0.73


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 7s) Loss: 1.6853(1.6853) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.2745(0.7566) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.7791(0.6989) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 1.2681(0.6936) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8985(0.8985) 


Epoch 1 - avg_train_loss: 0.6936  avg_val_loss: 0.7921  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6936  avg_val_loss: 0.7921  time: 54s
Epoch 1 - Score: 0.7022
INFO:__main__:Epoch 1 - Score: 0.7022
Epoch 1 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2536(0.7921) 
f1 score : 0.05095541401273885
recall score : 0.02631578947368421
precision score : 0.8
thresh : 0.49
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 8s) Loss: 0.2567(0.2567) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.5110(0.3386) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.3267(0.3344) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3476(0.3394) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.9328(0.9328) 


Epoch 2 - avg_train_loss: 0.3394  avg_val_loss: 0.6485  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3394  avg_val_loss: 0.6485  time: 55s
Epoch 2 - Score: 0.7022
INFO:__main__:Epoch 2 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7266(0.6485) 
f1 score : 0.40138408304498274
recall score : 0.3815789473684211
precision score : 0.4233576642335766
thresh : 0.79
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 55s) Loss: 0.2208(0.2208) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0817(0.1272) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0497(0.1197) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0633(0.1136) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.9828(0.9828) 


Epoch 3 - avg_train_loss: 0.1136  avg_val_loss: 0.6661  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1136  avg_val_loss: 0.6661  time: 55s
Epoch 3 - Score: 0.7002
INFO:__main__:Epoch 3 - Score: 0.7002


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9857(0.6661) 
f1 score : 0.3119266055045872
recall score : 0.2236842105263158
precision score : 0.5151515151515151
thresh : 0.75
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 58s) Loss: 0.0433(0.0433) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0275(0.0434) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0454(0.0413) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0428(0.0403) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.0533(1.0533) 


Epoch 4 - avg_train_loss: 0.0403  avg_val_loss: 0.7063  time: 54s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0403  avg_val_loss: 0.7063  time: 54s
Epoch 4 - Score: 0.7022
INFO:__main__:Epoch 4 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1151(0.7063) 
f1 score : 0.2830188679245283
recall score : 0.19736842105263158
precision score : 0.5
thresh : 0.78
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 58s) Loss: 0.0287(0.0287) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 31s) Loss: 0.0245(0.0294) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0269(0.0294) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0291(0.0288) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 1.0643(1.0643) 


Epoch 5 - avg_train_loss: 0.0288  avg_val_loss: 0.7157  time: 54s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0288  avg_val_loss: 0.7157  time: 54s
Epoch 5 - Score: 0.7042
INFO:__main__:Epoch 5 - Score: 0.7042
Epoch 5 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1381(0.7157) 
f1 score : 0.25961538461538464
recall score : 0.17763157894736842
precision score : 0.48214285714285715
thresh : 0.77


Score: 0.6901
INFO:__main__:Score: 0.6901
ACC BEST Score: 0.7042
INFO:__main__:ACC BEST Score: 0.7042
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.25961538461538464
recall score : 0.17763157894736842
precision score : 0.48214285714285715
thresh : 0.77


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 7s) Loss: 2.3392(2.3392) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.6753(0.7690) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.5824(0.7188) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.7837(0.6972) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8682(0.8682) 


Epoch 1 - avg_train_loss: 0.6972  avg_val_loss: 0.6691  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6972  avg_val_loss: 0.6691  time: 54s
Epoch 1 - Score: 0.7183
INFO:__main__:Epoch 1 - Score: 0.7183
Epoch 1 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6009(0.6691) 
f1 score : 0.1317365269461078
recall score : 0.07236842105263158
precision score : 0.7333333333333333
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.4348(0.4348) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.4019(0.3507) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.3983(0.3301) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3129(0.3343) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7601(0.7601) 


Epoch 2 - avg_train_loss: 0.3343  avg_val_loss: 0.6054  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3343  avg_val_loss: 0.6054  time: 55s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6041(0.6054) 
f1 score : 0.39999999999999997
recall score : 0.3157894736842105
precision score : 0.5454545454545454
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 5s) Loss: 0.1131(0.1131) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.1430(0.1300) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.1058(0.1256) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.2245(0.1223) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7524(0.7524) 


Epoch 3 - avg_train_loss: 0.1223  avg_val_loss: 0.6481  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1223  avg_val_loss: 0.6481  time: 55s
Epoch 3 - Score: 0.7264
INFO:__main__:Epoch 3 - Score: 0.7264
Epoch 3 - Save Best Score: 0.7264 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7264 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6178(0.6481) 
f1 score : 0.3451327433628319
recall score : 0.2565789473684211
precision score : 0.527027027027027
thresh : 0.63
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 58s) Loss: 0.0406(0.0406) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0654(0.0509) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0366(0.0497) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0497(0.0489) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8113(0.8113) 


Epoch 4 - avg_train_loss: 0.0489  avg_val_loss: 0.6748  time: 55s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0489  avg_val_loss: 0.6748  time: 55s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5874(0.6748) 
f1 score : 0.30697674418604654
recall score : 0.21710526315789475
precision score : 0.5238095238095238
thresh : 0.61
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.0339(0.0339) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0396(0.0355) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0229(0.0355) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0270(0.0356) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8077(0.8077) 


Epoch 5 - avg_train_loss: 0.0356  avg_val_loss: 0.6763  time: 54s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0356  avg_val_loss: 0.6763  time: 54s
Epoch 5 - Score: 0.7183
INFO:__main__:Epoch 5 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5962(0.6763) 
f1 score : 0.31336405529953915
recall score : 0.2236842105263158
precision score : 0.5230769230769231
thresh : 0.62


Score: 0.7022
INFO:__main__:Score: 0.7022
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.3451327433628319
recall score : 0.2565789473684211
precision score : 0.527027027027027
thresh : 0.63


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 13s) Loss: 1.4798(1.4798) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.6155(0.7456) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.7434(0.7101) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.6772(0.6986) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8300(0.8300) 


Epoch 1 - avg_train_loss: 0.6986  avg_val_loss: 0.6017  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6986  avg_val_loss: 0.6017  time: 54s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7590(0.6017) 
f1 score : 0.3181818181818182
recall score : 0.23026315789473684
precision score : 0.5147058823529411
thresh : 0.53
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.4101(0.4101) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.3767(0.3487) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.3955(0.3326) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3389(0.3267) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8282(0.8282) 


Epoch 2 - avg_train_loss: 0.3267  avg_val_loss: 0.6179  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3267  avg_val_loss: 0.6179  time: 55s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7096(0.6179) 
f1 score : 0.33333333333333337
recall score : 0.26973684210526316
precision score : 0.43617021276595747
thresh : 0.71
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.1245(0.1245) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.1362(0.1265) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0781(0.1220) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.1592(0.1179) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.9087(0.9087) 


Epoch 3 - avg_train_loss: 0.1179  avg_val_loss: 0.6434  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1179  avg_val_loss: 0.6434  time: 55s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7673(0.6434) 
f1 score : 0.35684647302904565
recall score : 0.28289473684210525
precision score : 0.48314606741573035
thresh : 0.77
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.0580(0.0580) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0609(0.0481) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0398(0.0469) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0504(0.0465) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.0104(1.0104) 


Epoch 4 - avg_train_loss: 0.0465  avg_val_loss: 0.6891  time: 55s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0465  avg_val_loss: 0.6891  time: 55s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8941(0.6891) 
f1 score : 0.30414746543778803
recall score : 0.21710526315789475
precision score : 0.5076923076923077
thresh : 0.67
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.0350(0.0350) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.0384(0.0343) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0353(0.0338) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0454(0.0338) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.0390(1.0390) 


Epoch 5 - avg_train_loss: 0.0338  avg_val_loss: 0.7060  time: 55s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0338  avg_val_loss: 0.7060  time: 55s
Epoch 5 - Score: 0.7123
INFO:__main__:Epoch 5 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9368(0.7060) 
f1 score : 0.2938388625592417
recall score : 0.20394736842105263
precision score : 0.5254237288135594
thresh : 0.65


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.3181818181818182
recall score : 0.23026315789473684
precision score : 0.5147058823529411
thresh : 0.53


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 8s) Loss: 1.4262(1.4262) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.6383(0.7574) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.8708(0.7132) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.7304(0.6945) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6971(0.6971) 


Epoch 1 - avg_train_loss: 0.6945  avg_val_loss: 0.7179  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6945  avg_val_loss: 0.7179  time: 54s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9236(0.7179) 
f1 score : 0.07594936708860758
recall score : 0.039473684210526314
precision score : 1.0
thresh : 0.43
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 57s) Loss: 0.4479(0.4479) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.3213(0.4074) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 37s (remain 0m 14s) Loss: 0.2772(0.3795) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.5255(0.3751) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6474(0.6474) 


Epoch 2 - avg_train_loss: 0.3751  avg_val_loss: 0.6204  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3751  avg_val_loss: 0.6204  time: 55s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123
Epoch 2 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7617(0.6204) 
f1 score : 0.4383561643835617
recall score : 0.42105263157894735
precision score : 0.45714285714285713
thresh : 0.79
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 57s) Loss: 0.2623(0.2623) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.1483(0.1589) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.1171(0.1489) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0991(0.1436) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6538(0.6538) 


Epoch 3 - avg_train_loss: 0.1436  avg_val_loss: 0.6241  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1436  avg_val_loss: 0.6241  time: 55s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143
Epoch 3 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7561(0.6241) 
f1 score : 0.3745019920318725
recall score : 0.3092105263157895
precision score : 0.47474747474747475
thresh : 0.78
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.0552(0.0552) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.0763(0.0582) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 37s (remain 0m 14s) Loss: 0.0508(0.0564) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0266(0.0550) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6652(0.6652) 


Epoch 4 - avg_train_loss: 0.0550  avg_val_loss: 0.6571  time: 55s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0550  avg_val_loss: 0.6571  time: 55s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8241(0.6571) 
f1 score : 0.29906542056074764
recall score : 0.21052631578947367
precision score : 0.5161290322580645
thresh : 0.62
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.0468(0.0468) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.0352(0.0390) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0434(0.0388) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0344(0.0386) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.6676(0.6676) 


Epoch 5 - avg_train_loss: 0.0386  avg_val_loss: 0.6617  time: 55s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0386  avg_val_loss: 0.6617  time: 55s
Epoch 5 - Score: 0.7123
INFO:__main__:Epoch 5 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8299(0.6617) 
f1 score : 0.27488151658767773
recall score : 0.19078947368421054
precision score : 0.4915254237288136
thresh : 0.63


Score: 0.6841
INFO:__main__:Score: 0.6841
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
FunnelConfig {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_funnel_transformer_medium_epoch10",
  "activation_dropout": 0.0,
  "architectures": [
    "FunnelForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "relative_shift",
  "block_repeats": [
    1,
    2,
    2
  ],
  "block_sizes": [
    6,
    3,
    3
  ],
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "hidden_act": "gelu_new",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.1,
  "initializer_std": null,
  "layer_norm_eps": 1e-09,
  "max_position_embeddings": 512,
  "model_type": "funnel",
  "n_head": 12,
  "num_decoder_layers": 2,
  "output_hidden_states": true,
  "pool_q_only": true,
  "pooling_type": "mean",
  "rel_attn_ty

f1 score : 0.3745019920318725
recall score : 0.3092105263157895
precision score : 0.47474747474747475
thresh : 0.78


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_funnel_transformer_medium_epoch10 were not used when initializing FunnelModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing FunnelModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FunnelModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 29s) Loss: 0.7748(0.7748) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 32s) Loss: 0.6504(0.6853) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.6950(0.6790) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 50s (remain 0m 0s) Loss: 1.0110(0.6669) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4826(0.4826) 


Epoch 1 - avg_train_loss: 0.6669  avg_val_loss: 0.6244  time: 54s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6669  avg_val_loss: 0.6244  time: 54s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6933(0.6244) 
f1 score : 0.18784530386740333
recall score : 0.1118421052631579
precision score : 0.5862068965517241
thresh : 0.51
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.4691(0.4691) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.2637(0.3601) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 37s (remain 0m 14s) Loss: 0.3630(0.3575) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.3345(0.3613) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5142(0.5142) 


Epoch 2 - avg_train_loss: 0.3613  avg_val_loss: 0.6242  time: 55s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3613  avg_val_loss: 0.6242  time: 55s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6802(0.6242) 
f1 score : 0.34959349593495936
recall score : 0.28289473684210525
precision score : 0.4574468085106383
thresh : 0.67
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.1338(0.1338) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.1381(0.1452) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 37s (remain 0m 14s) Loss: 0.1152(0.1408) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.1120(0.1370) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5517(0.5517) 


Epoch 3 - avg_train_loss: 0.1370  avg_val_loss: 0.6645  time: 55s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1370  avg_val_loss: 0.6645  time: 55s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183
Epoch 3 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7980(0.6645) 
f1 score : 0.32340425531914896
recall score : 0.25
precision score : 0.4578313253012048
thresh : 0.69
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.0519(0.0519) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.0340(0.0535) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 37s (remain 0m 14s) Loss: 0.0646(0.0529) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0435(0.0518) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5599(0.5599) 


Epoch 4 - avg_train_loss: 0.0518  avg_val_loss: 0.6893  time: 55s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0518  avg_val_loss: 0.6893  time: 55s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203
Epoch 4 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8111(0.6893) 
f1 score : 0.30379746835443033
recall score : 0.23684210526315788
precision score : 0.4235294117647059
thresh : 0.69
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 5s) Loss: 0.0498(0.0498) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.0236(0.0371) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 36s (remain 0m 14s) Loss: 0.0399(0.0368) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0422(0.0368) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5645(0.5645) 


Epoch 5 - avg_train_loss: 0.0368  avg_val_loss: 0.6982  time: 55s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0368  avg_val_loss: 0.6982  time: 55s
Epoch 5 - Score: 0.7183
INFO:__main__:Epoch 5 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8104(0.6982) 
f1 score : 0.28820960698689957
recall score : 0.21710526315789475
precision score : 0.42857142857142855
thresh : 0.67


Score: 0.6680
INFO:__main__:Score: 0.6680
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
Score: 0.6799
INFO:__main__:Score: 0.6799
ACC BEST Score: 0.7127
INFO:__main__:ACC BEST Score: 0.7127


f1 score : 0.30379746835443033
recall score : 0.23684210526315788
precision score : 0.4235294117647059
thresh : 0.69
f1 score : 0.36217948717948717
recall score : 0.29678266579120155
precision score : 0.4645426515930113
thresh : 0.71


In [None]:
from google.colab import runtime
runtime.unassign()