In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, htt

In [3]:
!nvidia-smi

Sun Apr  2 08:51:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=6
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [6]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP012/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["abstract"]  

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:08<00:00, 558.48it/s]
max_len: 511
INFO:__main__:max_len: 511


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

class ValidDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [16]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCELoss()
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [18]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 3s (remain 14m 38s) Loss: 1.0479(1.0479) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 58s (remain 1m 24s) Loss: 0.6626(0.6394) Grad: 1.3772  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 1m 55s (remain 0m 27s) Loss: 0.5225(0.6296) Grad: 4.3044  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 2m 22s (remain 0m 0s) Loss: 0.6104(0.6229) Grad: 2.3402  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 32s) Loss: 0.4554(0.4554) 


Epoch 1 - avg_train_loss: 0.6229  avg_val_loss: 0.6097  time: 171s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6229  avg_val_loss: 0.6097  time: 171s
Epoch 1 - Score: 0.6945
INFO:__main__:Epoch 1 - Score: 0.6945
Epoch 1 - Save Best Score: 0.6945 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6945 Model


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.8477(0.6097) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 1s (remain 4m 8s) Loss: 0.5503(0.5503) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 58s (remain 1m 24s) Loss: 0.6284(0.6007) Grad: 3.2109  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 1m 54s (remain 0m 26s) Loss: 0.6113(0.6018) Grad: 0.8881  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 2m 22s (remain 0m 0s) Loss: 0.6460(0.5977) Grad: 4.0797  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 33s) Loss: 0.5163(0.5163) 


Epoch 2 - avg_train_loss: 0.5977  avg_val_loss: 0.6139  time: 171s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5977  avg_val_loss: 0.6139  time: 171s
Epoch 2 - Score: 0.7065
INFO:__main__:Epoch 2 - Score: 0.7065
Epoch 2 - Save Best Score: 0.7065 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7065 Model


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.7178(0.6139) 
f1 score : 0.24742268041237112
recall score : 0.15789473684210525
precision score : 0.5714285714285714
Epoch: [3][0/248] Elapsed 0m 0s (remain 3m 45s) Loss: 0.5527(0.5527) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 57s (remain 1m 23s) Loss: 0.5552(0.5445) Grad: 1.9472  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 1m 55s (remain 0m 27s) Loss: 0.7251(0.5356) Grad: 3.6225  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 2m 23s (remain 0m 0s) Loss: 0.3914(0.5335) Grad: 3.9813  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 40s) Loss: 0.3571(0.3571) 


Epoch 3 - avg_train_loss: 0.5335  avg_val_loss: 0.5847  time: 172s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5335  avg_val_loss: 0.5847  time: 172s
Epoch 3 - Score: 0.7156
INFO:__main__:Epoch 3 - Score: 0.7156
Epoch 3 - Save Best Score: 0.7156 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7156 Model


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.7484(0.5847) 
f1 score : 0.3965884861407249
recall score : 0.3059210526315789
precision score : 0.5636363636363636
Epoch: [4][0/248] Elapsed 0m 0s (remain 3m 25s) Loss: 0.4246(0.4246) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 58s (remain 1m 25s) Loss: 0.2336(0.3919) Grad: 2.9879  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 1m 58s (remain 0m 27s) Loss: 0.2900(0.3687) Grad: 7.7691  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.2314(0.3740) Grad: 4.0488  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 33s) Loss: 0.3250(0.3250) 


Epoch 4 - avg_train_loss: 0.3740  avg_val_loss: 0.6713  time: 173s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3740  avg_val_loss: 0.6713  time: 173s
Epoch 4 - Score: 0.6945
INFO:__main__:Epoch 4 - Score: 0.6945


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 1.2407(0.6713) 
f1 score : 0.39199999999999996
recall score : 0.3223684210526316
precision score : 0.5
Epoch: [5][0/248] Elapsed 0m 0s (remain 3m 46s) Loss: 0.1957(0.1957) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 57s (remain 1m 23s) Loss: 0.2603(0.1568) Grad: 7.5162  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 1m 55s (remain 0m 27s) Loss: 0.0356(0.1474) Grad: 3.5299  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 2m 22s (remain 0m 0s) Loss: 0.2739(0.1467) Grad: 14.2599  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 38s) Loss: 0.4035(0.4035) 


Epoch 5 - avg_train_loss: 0.1467  avg_val_loss: 0.9841  time: 171s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1467  avg_val_loss: 0.9841  time: 171s
Epoch 5 - Score: 0.6844
INFO:__main__:Epoch 5 - Score: 0.6844


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 2.3406(0.9841) 
f1 score : 0.39846743295019166
recall score : 0.34210526315789475
precision score : 0.47706422018348627
Epoch: [6][0/248] Elapsed 0m 0s (remain 4m 6s) Loss: 0.0340(0.0340) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 59s (remain 1m 26s) Loss: 0.0143(0.0738) Grad: 0.5504  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 1m 57s (remain 0m 27s) Loss: 0.1699(0.0754) Grad: 13.1597  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.1108(0.0714) Grad: 8.1435  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 34s) Loss: 0.5544(0.5544) 


Epoch 6 - avg_train_loss: 0.0714  avg_val_loss: 1.0223  time: 173s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0714  avg_val_loss: 1.0223  time: 173s
Epoch 6 - Score: 0.6744
INFO:__main__:Epoch 6 - Score: 0.6744


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 2.1356(1.0223) 
f1 score : 0.4255319148936171
recall score : 0.39473684210526316
precision score : 0.46153846153846156


Score: 0.7156
INFO:__main__:Score: 0.7156
ACC BEST Score: 0.7206
INFO:__main__:ACC BEST Score: 0.7206


f1 score : 0.3965884861407249
recall score : 0.3059210526315789
precision score : 0.5636363636363636


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 0s (remain 2m 48s) Loss: 0.6421(0.6421) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 1m 0s (remain 1m 28s) Loss: 0.6890(0.6255) Grad: 2.7165  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 1m 58s (remain 0m 27s) Loss: 0.4243(0.6142) Grad: 3.1238  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 2m 26s (remain 0m 0s) Loss: 0.6094(0.6149) Grad: 4.8233  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 33s) Loss: 0.4897(0.4897) 


Epoch 1 - avg_train_loss: 0.6149  avg_val_loss: 0.6154  time: 175s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6149  avg_val_loss: 0.6154  time: 175s
Epoch 1 - Score: 0.6935
INFO:__main__:Epoch 1 - Score: 0.6935
Epoch 1 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6935 Model


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.8305(0.6154) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 3m 17s) Loss: 0.4814(0.4814) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 57s (remain 1m 24s) Loss: 0.5493(0.5707) Grad: 3.1270  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 1m 57s (remain 0m 27s) Loss: 0.5444(0.5434) Grad: 4.5993  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.6304(0.5486) Grad: 6.5855  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 35s) Loss: 0.5773(0.5773) 


Epoch 2 - avg_train_loss: 0.5486  avg_val_loss: 0.6336  time: 173s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5486  avg_val_loss: 0.6336  time: 173s
Epoch 2 - Score: 0.6482
INFO:__main__:Epoch 2 - Score: 0.6482


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.6368(0.6336) 
f1 score : 0.5320855614973261
recall score : 0.6524590163934426
precision score : 0.4492099322799097
Epoch: [3][0/248] Elapsed 0m 0s (remain 3m 24s) Loss: 0.5068(0.5068) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 57s (remain 1m 23s) Loss: 0.0862(0.3152) Grad: 2.5958  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.6616(0.2708) Grad: 18.8306  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 2m 23s (remain 0m 0s) Loss: 0.1466(0.2610) Grad: 7.7165  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 35s) Loss: 0.4262(0.4262) 


Epoch 3 - avg_train_loss: 0.2610  avg_val_loss: 0.7313  time: 172s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2610  avg_val_loss: 0.7313  time: 172s
Epoch 3 - Score: 0.6834
INFO:__main__:Epoch 3 - Score: 0.6834


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 1.1257(0.7313) 
f1 score : 0.47761194029850745
recall score : 0.4721311475409836
precision score : 0.48322147651006714
Epoch: [4][0/248] Elapsed 0m 0s (remain 3m 25s) Loss: 0.0340(0.0340) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 59s (remain 1m 26s) Loss: 0.0151(0.0478) Grad: 0.7168  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 1m 54s (remain 0m 26s) Loss: 0.0134(0.0459) Grad: 1.3345  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 2m 21s (remain 0m 0s) Loss: 0.3975(0.0425) Grad: 14.1910  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 38s) Loss: 0.3140(0.3140) 


Epoch 4 - avg_train_loss: 0.0425  avg_val_loss: 1.0425  time: 170s


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 3.8570(1.0425) 
f1 score : 0.36585365853658536
recall score : 0.29508196721311475
precision score : 0.48128342245989303


INFO:__main__:Epoch 4 - avg_train_loss: 0.0425  avg_val_loss: 1.0425  time: 170s
Epoch 4 - Score: 0.6864
INFO:__main__:Epoch 4 - Score: 0.6864


Epoch: [5][0/248] Elapsed 0m 0s (remain 3m 18s) Loss: 0.0072(0.0072) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 59s (remain 1m 26s) Loss: 0.0061(0.0180) Grad: 0.4004  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.0202(0.0161) Grad: 3.5561  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 2m 22s (remain 0m 0s) Loss: 0.0042(0.0141) Grad: 0.2056  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 35s) Loss: 0.4925(0.4925) 


Epoch 5 - avg_train_loss: 0.0141  avg_val_loss: 1.0830  time: 171s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0141  avg_val_loss: 1.0830  time: 171s
Epoch 5 - Score: 0.6834
INFO:__main__:Epoch 5 - Score: 0.6834


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 3.4723(1.0830) 
f1 score : 0.4198895027624309
recall score : 0.3737704918032787
precision score : 0.4789915966386555
Epoch: [6][0/248] Elapsed 0m 0s (remain 2m 53s) Loss: 0.0053(0.0053) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 58s (remain 1m 24s) Loss: 0.0036(0.0108) Grad: 0.1147  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.0037(0.0096) Grad: 0.1222  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.0052(0.0094) Grad: 0.2404  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 38s) Loss: 0.4177(0.4177) 
EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 3.8970(1.1093) 


Epoch 6 - avg_train_loss: 0.0094  avg_val_loss: 1.1093  time: 173s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0094  avg_val_loss: 1.1093  time: 173s
Epoch 6 - Score: 0.6814
INFO:__main__:Epoch 6 - Score: 0.6814


f1 score : 0.38206627680311894
recall score : 0.32131147540983607
precision score : 0.47115384615384615


Score: 0.6935
INFO:__main__:Score: 0.6935
ACC BEST Score: 0.7015
INFO:__main__:ACC BEST Score: 0.7015


f1 score : 0.0
recall score : 0.0
precision score : 0.0


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 0s (remain 3m 48s) Loss: 0.5708(0.5708) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 58s (remain 1m 24s) Loss: 0.5151(0.6170) Grad: 3.2515  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.6821(0.6087) Grad: 5.3933  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 2m 25s (remain 0m 0s) Loss: 0.5640(0.6108) Grad: 1.5255  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 42s) Loss: 0.3946(0.3946) 


Epoch 1 - avg_train_loss: 0.6108  avg_val_loss: 0.5907  time: 174s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6108  avg_val_loss: 0.5907  time: 174s
Epoch 1 - Score: 0.6945
INFO:__main__:Epoch 1 - Score: 0.6945
Epoch 1 - Save Best Score: 0.6945 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6945 Model


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.9482(0.5907) 
f1 score : 0.012987012987012988
recall score : 0.006557377049180328
precision score : 0.6666666666666666
Epoch: [2][0/248] Elapsed 0m 0s (remain 3m 43s) Loss: 0.4985(0.4985) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 59s (remain 1m 27s) Loss: 0.7192(0.5196) Grad: 9.0410  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 1m 58s (remain 0m 27s) Loss: 0.7002(0.5344) Grad: 3.8778  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 2m 26s (remain 0m 0s) Loss: 0.4802(0.5330) Grad: 3.4664  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 39s) Loss: 0.3714(0.3714) 


Epoch 2 - avg_train_loss: 0.5330  avg_val_loss: 0.5800  time: 175s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5330  avg_val_loss: 0.5800  time: 175s
Epoch 2 - Score: 0.7045
INFO:__main__:Epoch 2 - Score: 0.7045
Epoch 2 - Save Best Score: 0.7045 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7045 Model


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.9384(0.5800) 
f1 score : 0.24615384615384617
recall score : 0.15737704918032788
precision score : 0.5647058823529412
Epoch: [3][0/248] Elapsed 0m 1s (remain 4m 32s) Loss: 0.3796(0.3796) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 59s (remain 1m 26s) Loss: 0.2495(0.3376) Grad: 5.2736  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 1m 57s (remain 0m 27s) Loss: 0.2487(0.3135) Grad: 7.5283  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 2m 25s (remain 0m 0s) Loss: 0.1250(0.2991) Grad: 7.1812  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 39s) Loss: 0.5421(0.5421) 


Epoch 3 - avg_train_loss: 0.2991  avg_val_loss: 0.7708  time: 174s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2991  avg_val_loss: 0.7708  time: 174s
Epoch 3 - Score: 0.6764
INFO:__main__:Epoch 3 - Score: 0.6764


EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 0.8990(0.7708) 
f1 score : 0.4124087591240876
recall score : 0.3704918032786885
precision score : 0.46502057613168724
Epoch: [4][0/248] Elapsed 0m 0s (remain 3m 57s) Loss: 0.4048(0.4048) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 59s (remain 1m 25s) Loss: 0.0110(0.0794) Grad: 0.5501  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 1m 57s (remain 0m 27s) Loss: 0.2198(0.0749) Grad: 20.0822  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.0330(0.0689) Grad: 4.6385  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 33s) Loss: 0.6618(0.6618) 


Epoch 4 - avg_train_loss: 0.0689  avg_val_loss: 1.0781  time: 173s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0689  avg_val_loss: 1.0781  time: 173s
Epoch 4 - Score: 0.6714
INFO:__main__:Epoch 4 - Score: 0.6714


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 1.9988(1.0781) 
f1 score : 0.35502958579881655
recall score : 0.29508196721311475
precision score : 0.44554455445544555
Epoch: [5][0/248] Elapsed 0m 0s (remain 3m 26s) Loss: 0.0068(0.0068) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 58s (remain 1m 25s) Loss: 0.0080(0.0220) Grad: 0.5006  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.0051(0.0192) Grad: 0.1895  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.0051(0.0198) Grad: 0.2964  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 34s) Loss: 0.5446(0.5446) 


Epoch 5 - avg_train_loss: 0.0198  avg_val_loss: 1.2223  time: 173s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0198  avg_val_loss: 1.2223  time: 173s
Epoch 5 - Score: 0.6794
INFO:__main__:Epoch 5 - Score: 0.6794


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 2.9317(1.2223) 
f1 score : 0.316916488222698
recall score : 0.24262295081967214
precision score : 0.4567901234567901
Epoch: [6][0/248] Elapsed 0m 0s (remain 2m 41s) Loss: 0.0073(0.0073) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 58s (remain 1m 25s) Loss: 0.0042(0.0175) Grad: 0.2274  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 1m 55s (remain 0m 27s) Loss: 0.0051(0.0149) Grad: 0.1982  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 2m 21s (remain 0m 0s) Loss: 0.0044(0.0154) Grad: 0.2218  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 40s) Loss: 0.6859(0.6859) 
EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 2.4709(1.2110) 


Epoch 6 - avg_train_loss: 0.0154  avg_val_loss: 1.2110  time: 170s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0154  avg_val_loss: 1.2110  time: 170s
Epoch 6 - Score: 0.6704
INFO:__main__:Epoch 6 - Score: 0.6704


f1 score : 0.34920634920634924
recall score : 0.28852459016393445
precision score : 0.44221105527638194


Score: 0.7045
INFO:__main__:Score: 0.7045
ACC BEST Score: 0.7075
INFO:__main__:ACC BEST Score: 0.7075


f1 score : 0.24615384615384617
recall score : 0.15737704918032788
precision score : 0.5647058823529412


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 1s (remain 4m 16s) Loss: 0.7827(0.7827) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 57s (remain 1m 23s) Loss: 0.8203(0.6365) Grad: 7.0604  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.6182(0.6205) Grad: 0.7800  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.4216(0.6151) Grad: 5.1069  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 41s) Loss: 0.3467(0.3467) 
EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 1.0633(0.5949) 


Epoch 1 - avg_train_loss: 0.6151  avg_val_loss: 0.5949  time: 173s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6151  avg_val_loss: 0.5949  time: 173s
Epoch 1 - Score: 0.6935
INFO:__main__:Epoch 1 - Score: 0.6935
Epoch 1 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6935 Model


f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 1s (remain 4m 54s) Loss: 0.5020(0.5020) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 58s (remain 1m 25s) Loss: 0.5151(0.5450) Grad: 3.2886  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.4612(0.5430) Grad: 2.4956  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.5156(0.5398) Grad: 3.5803  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 37s) Loss: 0.2561(0.2561) 


Epoch 2 - avg_train_loss: 0.5398  avg_val_loss: 0.5902  time: 173s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5398  avg_val_loss: 0.5902  time: 173s
Epoch 2 - Score: 0.7106
INFO:__main__:Epoch 2 - Score: 0.7106
Epoch 2 - Save Best Score: 0.7106 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7106 Model


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 0.8410(0.5902) 
f1 score : 0.2835820895522388
recall score : 0.18688524590163935
precision score : 0.5876288659793815
Epoch: [3][0/248] Elapsed 0m 0s (remain 3m 56s) Loss: 0.4746(0.4746) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 57s (remain 1m 23s) Loss: 0.3640(0.2919) Grad: 15.7685  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.2175(0.2860) Grad: 7.1642  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.1238(0.2775) Grad: 4.5440  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 36s) Loss: 0.3582(0.3582) 


Epoch 3 - avg_train_loss: 0.2775  avg_val_loss: 0.7430  time: 173s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2775  avg_val_loss: 0.7430  time: 173s
Epoch 3 - Score: 0.6884
INFO:__main__:Epoch 3 - Score: 0.6884


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 1.1432(0.7430) 
f1 score : 0.42592592592592593
recall score : 0.3770491803278688
precision score : 0.48936170212765956
Epoch: [4][0/248] Elapsed 0m 0s (remain 2m 58s) Loss: 0.0533(0.0533) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 59s (remain 1m 27s) Loss: 0.0077(0.0485) Grad: 0.4182  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 1m 57s (remain 0m 27s) Loss: 0.0079(0.0448) Grad: 0.3762  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 2m 23s (remain 0m 0s) Loss: 0.0139(0.0408) Grad: 1.4995  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 41s) Loss: 0.3252(0.3252) 
EVAL: [31/32] Elapsed 0m 28s (remain 0m 0s) Loss: 1.9476(1.0629) 


Epoch 4 - avg_train_loss: 0.0408  avg_val_loss: 1.0629  time: 172s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0408  avg_val_loss: 1.0629  time: 172s
Epoch 4 - Score: 0.6894
INFO:__main__:Epoch 4 - Score: 0.6894


f1 score : 0.3832335329341318
recall score : 0.31475409836065577
precision score : 0.4897959183673469
Epoch: [5][0/248] Elapsed 0m 0s (remain 3m 25s) Loss: 0.0034(0.0034) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 56s (remain 1m 22s) Loss: 0.0044(0.0123) Grad: 0.2433  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 1m 54s (remain 0m 26s) Loss: 0.0055(0.0132) Grad: 0.1879  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 2m 21s (remain 0m 0s) Loss: 0.0035(0.0134) Grad: 0.1731  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 38s) Loss: 0.6430(0.6430) 


Epoch 5 - avg_train_loss: 0.0134  avg_val_loss: 1.1193  time: 170s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0134  avg_val_loss: 1.1193  time: 170s
Epoch 5 - Score: 0.6633
INFO:__main__:Epoch 5 - Score: 0.6633


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 1.6856(1.1193) 
f1 score : 0.4273504273504274
recall score : 0.4098360655737705
precision score : 0.44642857142857145
Epoch: [6][0/248] Elapsed 0m 0s (remain 2m 56s) Loss: 0.0057(0.0057) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 59s (remain 1m 26s) Loss: 0.0039(0.0041) Grad: 0.1398  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 1m 58s (remain 0m 27s) Loss: 0.0043(0.0114) Grad: 0.1535  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 2m 24s (remain 0m 0s) Loss: 0.0050(0.0107) Grad: 0.4647  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 37s) Loss: 0.5306(0.5306) 


Epoch 6 - avg_train_loss: 0.0107  avg_val_loss: 1.1336  time: 173s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0107  avg_val_loss: 1.1336  time: 173s
Epoch 6 - Score: 0.6693
INFO:__main__:Epoch 6 - Score: 0.6693


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 1.8336(1.1336) 
f1 score : 0.4072072072072072
recall score : 0.3704918032786885
precision score : 0.452


Score: 0.7106
INFO:__main__:Score: 0.7106
ACC BEST Score: 0.7156
INFO:__main__:ACC BEST Score: 0.7156


f1 score : 0.2835820895522388
recall score : 0.18688524590163935
precision score : 0.5876288659793815


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 1s (remain 4m 31s) Loss: 0.9409(0.9409) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 59s (remain 1m 25s) Loss: 0.5371(0.6451) Grad: 4.8296  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 1m 56s (remain 0m 27s) Loss: 0.6514(0.6288) Grad: 2.9997  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 2m 23s (remain 0m 0s) Loss: 0.4929(0.6266) Grad: 1.4768  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 29s) Loss: 0.2209(0.2209) 


Epoch 1 - avg_train_loss: 0.6266  avg_val_loss: 0.6213  time: 172s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6266  avg_val_loss: 0.6213  time: 172s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 1.3335(0.6213) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 3m 9s) Loss: 0.6943(0.6943) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 59s (remain 1m 26s) Loss: 0.4102(0.6012) Grad: 7.6349  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 1m 58s (remain 0m 27s) Loss: 0.5806(0.5961) Grad: 1.3144  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 2m 25s (remain 0m 0s) Loss: 0.6523(0.5942) Grad: 1.6973  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 31s) Loss: 0.3791(0.3791) 


Epoch 2 - avg_train_loss: 0.5942  avg_val_loss: 0.5846  time: 173s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5942  avg_val_loss: 0.5846  time: 173s
Epoch 2 - Score: 0.6871
INFO:__main__:Epoch 2 - Score: 0.6871


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 0.7341(0.5846) 
f1 score : 0.35876288659793815
recall score : 0.28618421052631576
precision score : 0.48066298342541436
Epoch: [3][0/248] Elapsed 0m 1s (remain 4m 9s) Loss: 0.5127(0.5127) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 1m 1s (remain 1m 29s) Loss: 0.3062(0.5521) Grad: 3.1307  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 1m 58s (remain 0m 27s) Loss: 0.7178(0.5489) Grad: 7.8303  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 2m 25s (remain 0m 0s) Loss: 0.5034(0.5457) Grad: 1.8661  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 30s) Loss: 0.2942(0.2942) 


Epoch 3 - avg_train_loss: 0.5457  avg_val_loss: 0.5709  time: 173s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5457  avg_val_loss: 0.5709  time: 173s
Epoch 3 - Score: 0.7062
INFO:__main__:Epoch 3 - Score: 0.7062
Epoch 3 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7062 Model


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 1.0404(0.5709) 
f1 score : 0.15606936416184972
recall score : 0.08881578947368421
precision score : 0.6428571428571429
Epoch: [4][0/248] Elapsed 0m 0s (remain 3m 22s) Loss: 0.5210(0.5210) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 57s (remain 1m 23s) Loss: 0.4829(0.4234) Grad: 6.8915  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 1m 55s (remain 0m 26s) Loss: 0.3313(0.4139) Grad: 4.1910  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 2m 23s (remain 0m 0s) Loss: 0.2498(0.4066) Grad: 4.4285  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 31s) Loss: 0.4279(0.4279) 


Epoch 4 - avg_train_loss: 0.4066  avg_val_loss: 0.6250  time: 171s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4066  avg_val_loss: 0.6250  time: 171s
Epoch 4 - Score: 0.6871
INFO:__main__:Epoch 4 - Score: 0.6871


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 0.5573(0.6250) 
f1 score : 0.47554806070826305
recall score : 0.46381578947368424
precision score : 0.48788927335640137
Epoch: [5][0/248] Elapsed 0m 0s (remain 3m 46s) Loss: 0.3142(0.3142) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 58s (remain 1m 25s) Loss: 0.1567(0.1885) Grad: 10.4242  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 1m 57s (remain 0m 27s) Loss: 0.1830(0.1911) Grad: 17.4133  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 2m 25s (remain 0m 0s) Loss: 0.1451(0.1847) Grad: 5.4485  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 31s) Loss: 0.3766(0.3766) 


Epoch 5 - avg_train_loss: 0.1847  avg_val_loss: 0.7731  time: 173s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1847  avg_val_loss: 0.7731  time: 173s
Epoch 5 - Score: 0.6881
INFO:__main__:Epoch 5 - Score: 0.6881


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 0.6799(0.7731) 
f1 score : 0.42592592592592593
recall score : 0.3782894736842105
precision score : 0.4872881355932203
Epoch: [6][0/248] Elapsed 0m 0s (remain 3m 30s) Loss: 0.0575(0.0575) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 57s (remain 1m 23s) Loss: 0.0474(0.0868) Grad: 6.4687  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 1m 57s (remain 0m 27s) Loss: 0.0179(0.0885) Grad: 0.4645  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 2m 25s (remain 0m 0s) Loss: 0.1498(0.0881) Grad: 11.6760  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 1s (remain 0m 31s) Loss: 0.4662(0.4662) 


Epoch 6 - avg_train_loss: 0.0881  avg_val_loss: 0.8272  time: 173s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0881  avg_val_loss: 0.8272  time: 173s


EVAL: [31/32] Elapsed 0m 27s (remain 0m 0s) Loss: 0.5138(0.8272) 
f1 score : 0.4562607204116638
recall score : 0.4375
precision score : 0.4767025089605735


Epoch 6 - Score: 0.6811
INFO:__main__:Epoch 6 - Score: 0.6811
Score: 0.7062
INFO:__main__:Score: 0.7062
ACC BEST Score: 0.7133
INFO:__main__:ACC BEST Score: 0.7133
Score: 0.7061
INFO:__main__:Score: 0.7061
ACC BEST Score: 0.7079
INFO:__main__:ACC BEST Score: 0.7079


f1 score : 0.15606936416184972
recall score : 0.08881578947368421
precision score : 0.6428571428571429
f1 score : 0.23535564853556487
recall score : 0.14773473407747867
precision score : 0.5784061696658098


In [19]:
from google.colab import runtime
runtime.unassign()