In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sat May  6 12:28:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    39W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP075/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="sshleifer/distilbart-cnn-12-6"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = "sshleifer/distilbart-cnn-12-6"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "</s>" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1524.75it/s]
max_len: 653
INFO:__main__:max_len: 653


In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        #self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [None]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 4s (remain 18m 59s) Loss: 0.5489(0.5489) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 23s (remain 0m 41s) Loss: 0.7158(0.6330) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 42s (remain 0m 16s) Loss: 0.5049(0.6189) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 57s (remain 0m 0s) Loss: 0.5115(0.6104) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5955(0.5955) 


Epoch 1 - avg_train_loss: 0.6104  avg_val_loss: 0.5961  time: 65s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6104  avg_val_loss: 0.5961  time: 65s
Epoch 1 - Score: 0.7068
INFO:__main__:Epoch 1 - Score: 0.7068
Epoch 1 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6683(0.5961) 
f1 score : 0.06289308176100629
recall score : 0.032679738562091505
precision score : 0.8333333333333334
thresh : 0.34
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.5854(0.5854) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.8329(0.5704) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.6840(0.5658) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5245(0.5666) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5465(0.5465) 


Epoch 2 - avg_train_loss: 0.5666  avg_val_loss: 0.5876  time: 61s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5666  avg_val_loss: 0.5876  time: 61s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6566(0.5876) 
f1 score : 0.0617283950617284
recall score : 0.032679738562091505
precision score : 0.5555555555555556
thresh : 0.42
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.4884(0.4884) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.3303(0.4727) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 37s (remain 0m 14s) Loss: 0.4075(0.4588) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4054(0.4604) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4604(0.4604) 


Epoch 3 - avg_train_loss: 0.4604  avg_val_loss: 0.5941  time: 61s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4604  avg_val_loss: 0.5941  time: 61s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129
Epoch 3 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6289(0.5941) 
f1 score : 0.351931330472103
recall score : 0.2679738562091503
precision score : 0.5125
thresh : 0.65
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 55s) Loss: 0.3129(0.3129) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4144(0.2905) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.1827(0.2833) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.1840(0.2770) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4783(0.4783) 


Epoch 4 - avg_train_loss: 0.2770  avg_val_loss: 0.6481  time: 61s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2770  avg_val_loss: 0.6481  time: 61s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149
Epoch 4 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6792(0.6481) 
f1 score : 0.35294117647058826
recall score : 0.27450980392156865
precision score : 0.49411764705882355
thresh : 0.71
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 6s) Loss: 0.1365(0.1365) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.1317(0.1906) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.1134(0.1921) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.1091(0.1895) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4597(0.4597) 


Epoch 5 - avg_train_loss: 0.1895  avg_val_loss: 0.6714  time: 61s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1895  avg_val_loss: 0.6714  time: 61s
Epoch 5 - Score: 0.7068
INFO:__main__:Epoch 5 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6818(0.6714) 
f1 score : 0.32921810699588483
recall score : 0.26143790849673204
precision score : 0.4444444444444444
thresh : 0.72


Score: 0.6908
INFO:__main__:Score: 0.6908
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149


f1 score : 0.35294117647058826
recall score : 0.27450980392156865
precision score : 0.49411764705882355
thresh : 0.71


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 26s) Loss: 0.7464(0.7464) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.6585(0.6173) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.7344(0.6183) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.6503(0.6128) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.6279(0.6279) 


Epoch 1 - avg_train_loss: 0.6128  avg_val_loss: 0.6596  time: 62s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6128  avg_val_loss: 0.6596  time: 62s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6606(0.6596) 
f1 score : 0.47088607594936704
recall score : 0.6078431372549019
precision score : 0.384297520661157
thresh : 0.65
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.6823(0.6823) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.5572(0.5764) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.6801(0.5638) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.6362(0.5673) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4910(0.4910) 


Epoch 2 - avg_train_loss: 0.5673  avg_val_loss: 0.5881  time: 61s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5673  avg_val_loss: 0.5881  time: 61s
Epoch 2 - Score: 0.7088
INFO:__main__:Epoch 2 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.5174(0.5881) 
f1 score : 0.13953488372093023
recall score : 0.0784313725490196
precision score : 0.631578947368421
thresh : 0.54
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.6235(0.6235) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4581(0.5405) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.6614(0.5543) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4220(0.5430) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4071(0.4071) 


Epoch 3 - avg_train_loss: 0.5430  avg_val_loss: 0.5798  time: 62s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5430  avg_val_loss: 0.5798  time: 62s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4449(0.5798) 
f1 score : 0.19672131147540983
recall score : 0.11764705882352941
precision score : 0.6
thresh : 0.55
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 10s) Loss: 0.4598(0.4598) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.5924(0.4593) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 37s (remain 0m 14s) Loss: 0.2470(0.4439) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4045(0.4357) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3922(0.3922) 


Epoch 4 - avg_train_loss: 0.4357  avg_val_loss: 0.5888  time: 61s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4357  avg_val_loss: 0.5888  time: 61s
Epoch 4 - Score: 0.7088
INFO:__main__:Epoch 4 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4383(0.5888) 
f1 score : 0.30973451327433627
recall score : 0.22875816993464052
precision score : 0.4794520547945205
thresh : 0.65
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 19s) Loss: 0.4308(0.4308) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.6549(0.3737) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.2375(0.3744) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.3140(0.3723) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4168(0.4168) 


Epoch 5 - avg_train_loss: 0.3723  avg_val_loss: 0.5903  time: 61s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3723  avg_val_loss: 0.5903  time: 61s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4725(0.5903) 
f1 score : 0.35797665369649806
recall score : 0.3006535947712418
precision score : 0.4423076923076923
thresh : 0.69


Score: 0.5803
INFO:__main__:Score: 0.5803
ACC BEST Score: 0.7129
INFO:__main__:ACC BEST Score: 0.7129


f1 score : 0.47088607594936704
recall score : 0.6078431372549019
precision score : 0.384297520661157
thresh : 0.65


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 17s) Loss: 0.6971(0.6971) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.5412(0.6343) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.4991(0.6286) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5383(0.6205) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5592(0.5592) 


Epoch 1 - avg_train_loss: 0.6205  avg_val_loss: 0.6060  time: 61s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6205  avg_val_loss: 0.6060  time: 61s
Epoch 1 - Score: 0.7068
INFO:__main__:Epoch 1 - Score: 0.7068
Epoch 1 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.3610(0.6060) 
f1 score : 0.10975609756097561
recall score : 0.058823529411764705
precision score : 0.8181818181818182
thresh : 0.45
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 19s) Loss: 0.4072(0.4072) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.5224(0.5784) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.6993(0.5695) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4808(0.5641) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5114(0.5114) 


Epoch 2 - avg_train_loss: 0.5641  avg_val_loss: 0.5776  time: 62s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5641  avg_val_loss: 0.5776  time: 62s
Epoch 2 - Score: 0.7169
INFO:__main__:Epoch 2 - Score: 0.7169
Epoch 2 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.3804(0.5776) 
f1 score : 0.18390804597701152
recall score : 0.10457516339869281
precision score : 0.7619047619047619
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 31s) Loss: 0.5766(0.5766) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 35s) Loss: 0.2760(0.4620) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.6308(0.4594) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4783(0.4567) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5826(0.5826) 


Epoch 3 - avg_train_loss: 0.4567  avg_val_loss: 0.5916  time: 62s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4567  avg_val_loss: 0.5916  time: 62s
Epoch 3 - Score: 0.7249
INFO:__main__:Epoch 3 - Score: 0.7249
Epoch 3 - Save Best Score: 0.7249 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7249 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4180(0.5916) 
f1 score : 0.41843971631205673
recall score : 0.38562091503267976
precision score : 0.4573643410852713
thresh : 0.69
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 23s) Loss: 0.3346(0.3346) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 35s) Loss: 0.1860(0.3103) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.2346(0.3073) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.2938(0.3056) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5911(0.5911) 


Epoch 4 - avg_train_loss: 0.3056  avg_val_loss: 0.6257  time: 62s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3056  avg_val_loss: 0.6257  time: 62s
Epoch 4 - Score: 0.7209
INFO:__main__:Epoch 4 - Score: 0.7209


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.3715(0.6257) 
f1 score : 0.39183673469387753
recall score : 0.3137254901960784
precision score : 0.5217391304347826
thresh : 0.75
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 15s) Loss: 0.2101(0.2101) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.1658(0.2179) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.2901(0.2130) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.2356(0.2137) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.6206(0.6206) 


Epoch 5 - avg_train_loss: 0.2137  avg_val_loss: 0.6462  time: 62s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2137  avg_val_loss: 0.6462  time: 62s
Epoch 5 - Score: 0.7149
INFO:__main__:Epoch 5 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.3796(0.6462) 
f1 score : 0.4153846153846154
recall score : 0.35294117647058826
precision score : 0.5046728971962616
thresh : 0.62


Score: 0.6707
INFO:__main__:Score: 0.6707
ACC BEST Score: 0.7249
INFO:__main__:ACC BEST Score: 0.7249


f1 score : 0.41843971631205673
recall score : 0.38562091503267976
precision score : 0.4573643410852713
thresh : 0.69


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 35s) Loss: 0.7633(0.7633) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 20s (remain 0m 35s) Loss: 0.4871(0.6568) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.6114(0.6296) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4387(0.6179) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6742(0.6742) 


Epoch 1 - avg_train_loss: 0.6179  avg_val_loss: 0.5951  time: 61s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6179  avg_val_loss: 0.5951  time: 61s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6555(0.5951) 
f1 score : 0.06329113924050632
recall score : 0.03289473684210526
precision score : 0.8333333333333334
thresh : 0.32
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 17s) Loss: 0.6180(0.6180) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.7758(0.5635) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.6854(0.5596) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.5643(0.5554) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6470(0.6470) 


Epoch 2 - avg_train_loss: 0.5554  avg_val_loss: 0.5761  time: 62s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5554  avg_val_loss: 0.5761  time: 62s
Epoch 2 - Score: 0.7108
INFO:__main__:Epoch 2 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6401(0.5761) 
f1 score : 0.09756097560975609
recall score : 0.05263157894736842
precision score : 0.6666666666666666
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 15s) Loss: 0.3196(0.3196) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4282(0.4670) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.4682(0.4443) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.3147(0.4347) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6275(0.6275) 


Epoch 3 - avg_train_loss: 0.4347  avg_val_loss: 0.6483  time: 61s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4347  avg_val_loss: 0.6483  time: 61s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6409(0.6483) 
f1 score : 0.3387096774193548
recall score : 0.27631578947368424
precision score : 0.4375
thresh : 0.78
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 16s) Loss: 0.2585(0.2585) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.2617(0.2748) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.1686(0.2510) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.1787(0.2494) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6250(0.6250) 


Epoch 4 - avg_train_loss: 0.2494  avg_val_loss: 0.6621  time: 62s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2494  avg_val_loss: 0.6621  time: 62s
Epoch 4 - Score: 0.7209
INFO:__main__:Epoch 4 - Score: 0.7209
Epoch 4 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6835(0.6621) 
f1 score : 0.4090909090909091
recall score : 0.35526315789473684
precision score : 0.48214285714285715
thresh : 0.78
Epoch: [5][0/279] Elapsed 0m 0s (remain 3m 8s) Loss: 0.2231(0.2231) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 35s) Loss: 0.1225(0.1633) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.1356(0.1536) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.2939(0.1567) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6584(0.6584) 


Epoch 5 - avg_train_loss: 0.1567  avg_val_loss: 0.7189  time: 62s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1567  avg_val_loss: 0.7189  time: 62s
Epoch 5 - Score: 0.7169
INFO:__main__:Epoch 5 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7153(0.7189) 
f1 score : 0.3878326996197719
recall score : 0.3355263157894737
precision score : 0.4594594594594595
thresh : 0.79


Score: 0.6867
INFO:__main__:Score: 0.6867
ACC BEST Score: 0.7209
INFO:__main__:ACC BEST Score: 0.7209


f1 score : 0.4090909090909091
recall score : 0.35526315789473684
precision score : 0.48214285714285715
thresh : 0.78


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 29s) Loss: 0.5756(0.5756) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.6621(0.6260) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.5170(0.6151) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5089(0.6111) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5409(0.5409) 


Epoch 1 - avg_train_loss: 0.6111  avg_val_loss: 0.5894  time: 62s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6111  avg_val_loss: 0.5894  time: 62s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.5367(0.5894) 
f1 score : 0.08588957055214723
recall score : 0.046052631578947366
precision score : 0.6363636363636364
thresh : 0.79
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 6s) Loss: 0.6807(0.6807) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.6998(0.5706) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.4528(0.5642) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5670(0.5600) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4941(0.4941) 


Epoch 2 - avg_train_loss: 0.5600  avg_val_loss: 0.5884  time: 62s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5600  avg_val_loss: 0.5884  time: 62s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4471(0.5884) 
f1 score : 0.1420118343195266
recall score : 0.07894736842105263
precision score : 0.7058823529411765
thresh : 0.45
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 13s) Loss: 0.5303(0.5303) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.5898(0.4475) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.4006(0.4382) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.6603(0.4448) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5171(0.5171) 


Epoch 3 - avg_train_loss: 0.4448  avg_val_loss: 0.6485  time: 62s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4448  avg_val_loss: 0.6485  time: 62s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4364(0.6485) 
f1 score : 0.1411764705882353
recall score : 0.07894736842105263
precision score : 0.6666666666666666
thresh : 0.47
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 2s) Loss: 0.4312(0.4312) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4486(0.2861) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.2288(0.2757) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.2002(0.2709) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.7106(0.7106) 


Epoch 4 - avg_train_loss: 0.2709  avg_val_loss: 0.6764  time: 61s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2709  avg_val_loss: 0.6764  time: 61s
Epoch 4 - Score: 0.7243
INFO:__main__:Epoch 4 - Score: 0.7243
Epoch 4 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4925(0.6764) 
f1 score : 0.4098939929328622
recall score : 0.3815789473684211
precision score : 0.44274809160305345
thresh : 0.69
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 35s) Loss: 0.2124(0.2124) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.2706(0.1889) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.2209(0.1829) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.1850(0.1821) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.8706(0.8706) 


Epoch 5 - avg_train_loss: 0.1821  avg_val_loss: 0.7440  time: 61s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1821  avg_val_loss: 0.7440  time: 61s
Epoch 5 - Score: 0.7203
INFO:__main__:Epoch 5 - Score: 0.7203


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4993(0.7440) 
f1 score : 0.4357142857142857
recall score : 0.40131578947368424
precision score : 0.4765625
thresh : 0.73


Score: 0.6640
INFO:__main__:Score: 0.6640
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243


f1 score : 0.4098939929328622
recall score : 0.3815789473684211
precision score : 0.44274809160305345
thresh : 0.69


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 9s) Loss: 1.0564(1.0564) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.5092(0.6357) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.5656(0.6205) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.6053(0.6204) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.6309(0.6309) 


Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6027  time: 61s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6027  time: 61s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7308(0.6027) 
f1 score : 0.08641975308641975
recall score : 0.046052631578947366
precision score : 0.7
thresh : 0.72
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 7s) Loss: 0.6583(0.6583) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 20s (remain 0m 35s) Loss: 0.6258(0.5768) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.5796(0.5635) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.3943(0.5609) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.6332(0.6332) 


Epoch 2 - avg_train_loss: 0.5609  avg_val_loss: 0.5816  time: 62s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5609  avg_val_loss: 0.5816  time: 62s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6424(0.5816) 
f1 score : 0.27999999999999997
recall score : 0.18421052631578946
precision score : 0.5833333333333334
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 16s) Loss: 0.5682(0.5682) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.5198(0.4593) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.4200(0.4622) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4461(0.4595) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.6311(0.6311) 


Epoch 3 - avg_train_loss: 0.4595  avg_val_loss: 0.5855  time: 61s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4595  avg_val_loss: 0.5855  time: 61s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6528(0.5855) 
f1 score : 0.37944664031620556
recall score : 0.3157894736842105
precision score : 0.4752475247524752
thresh : 0.71
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 39s) Loss: 0.4558(0.4558) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 35s) Loss: 0.3827(0.3105) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3145(0.2998) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.2330(0.2944) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.7350(0.7350) 


Epoch 4 - avg_train_loss: 0.2944  avg_val_loss: 0.6595  time: 61s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2944  avg_val_loss: 0.6595  time: 61s
Epoch 4 - Score: 0.7082
INFO:__main__:Epoch 4 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.8293(0.6595) 
f1 score : 0.3261802575107296
recall score : 0.25
precision score : 0.4691358024691358
thresh : 0.74
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.0968(0.0968) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.2485(0.2025) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3360(0.2005) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.1940(0.1938) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.8027(0.8027) 


Epoch 5 - avg_train_loss: 0.1938  avg_val_loss: 0.6873  time: 61s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1938  avg_val_loss: 0.6873  time: 61s
Epoch 5 - Score: 0.7042
INFO:__main__:Epoch 5 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.8685(0.6873) 
f1 score : 0.4045801526717557
recall score : 0.34868421052631576
precision score : 0.4818181818181818
thresh : 0.79


Score: 0.7103
INFO:__main__:Score: 0.7103
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143


f1 score : 0.27999999999999997
recall score : 0.18421052631578946
precision score : 0.5833333333333334
thresh : 0.53


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 10s) Loss: 0.9143(0.9143) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.9693(0.6174) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.6464(0.6160) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5171(0.6139) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.5901(0.5901) 


Epoch 1 - avg_train_loss: 0.6139  avg_val_loss: 0.5970  time: 61s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6139  avg_val_loss: 0.5970  time: 61s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.5626(0.5970) 
f1 score : 0.09937888198757765
recall score : 0.05263157894736842
precision score : 0.8888888888888888
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 13s) Loss: 0.6006(0.6006) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.5845(0.5619) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.6559(0.5679) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5457(0.5745) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.5520(0.5520) 


Epoch 2 - avg_train_loss: 0.5745  avg_val_loss: 0.5834  time: 62s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5745  avg_val_loss: 0.5834  time: 62s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.5688(0.5834) 
f1 score : 0.13253012048192772
recall score : 0.07236842105263158
precision score : 0.7857142857142857
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 13s) Loss: 0.5069(0.5069) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.3967(0.4918) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3204(0.4883) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.6585(0.4862) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4941(0.4941) 


Epoch 3 - avg_train_loss: 0.4862  avg_val_loss: 0.5933  time: 62s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4862  avg_val_loss: 0.5933  time: 62s
Epoch 3 - Score: 0.7163
INFO:__main__:Epoch 3 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.5941(0.5933) 
f1 score : 0.37719298245614036
recall score : 0.28289473684210525
precision score : 0.5657894736842105
thresh : 0.51
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 14s) Loss: 0.3422(0.3422) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.3590(0.3209) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3741(0.3238) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.1413(0.3212) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4581(0.4581) 


Epoch 4 - avg_train_loss: 0.3212  avg_val_loss: 0.6178  time: 62s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3212  avg_val_loss: 0.6178  time: 62s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6914(0.6178) 
f1 score : 0.40485829959514175
recall score : 0.32894736842105265
precision score : 0.5263157894736842
thresh : 0.55
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 11s) Loss: 0.2367(0.2367) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.1858(0.2471) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3536(0.2419) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.2930(0.2369) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 10s) Loss: 0.4640(0.4640) 


Epoch 5 - avg_train_loss: 0.2369  avg_val_loss: 0.6251  time: 62s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2369  avg_val_loss: 0.6251  time: 62s
Epoch 5 - Score: 0.7143
INFO:__main__:Epoch 5 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6973(0.6251) 
f1 score : 0.4033613445378151
recall score : 0.3157894736842105
precision score : 0.5581395348837209
thresh : 0.5


Score: 0.7082
INFO:__main__:Score: 0.7082
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163


f1 score : 0.09937888198757765
recall score : 0.05263157894736842
precision score : 0.8888888888888888
thresh : 0.46


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 23s) Loss: 0.8001(0.8001) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 20s (remain 0m 35s) Loss: 0.4957(0.6406) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.5521(0.6258) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.6000(0.6166) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.6538(0.6538) 


Epoch 1 - avg_train_loss: 0.6166  avg_val_loss: 0.5853  time: 61s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6166  avg_val_loss: 0.5853  time: 61s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7205(0.5853) 
f1 score : 0.07594936708860758
recall score : 0.039473684210526314
precision score : 1.0
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 20s) Loss: 0.4179(0.4179) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 35s) Loss: 0.4852(0.5637) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.5609(0.5728) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5726(0.5697) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.6918(0.6918) 


Epoch 2 - avg_train_loss: 0.5697  avg_val_loss: 0.5775  time: 62s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5697  avg_val_loss: 0.5775  time: 62s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7316(0.5775) 
f1 score : 0.08750000000000001
recall score : 0.046052631578947366
precision score : 0.875
thresh : 0.38
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 45s) Loss: 0.4125(0.4125) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.6192(0.5184) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.4303(0.5055) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.4372(0.5018) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.6937(0.6937) 


Epoch 3 - avg_train_loss: 0.5018  avg_val_loss: 0.5738  time: 62s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5018  avg_val_loss: 0.5738  time: 62s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6577(0.5738) 
f1 score : 0.35833333333333334
recall score : 0.28289473684210525
precision score : 0.48863636363636365
thresh : 0.6
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 18s) Loss: 0.4038(0.4038) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.3048(0.3497) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3317(0.3489) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.3541(0.3511) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.7346(0.7346) 


Epoch 4 - avg_train_loss: 0.3511  avg_val_loss: 0.5915  time: 62s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3511  avg_val_loss: 0.5915  time: 62s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7348(0.5915) 
f1 score : 0.2922374429223744
recall score : 0.21052631578947367
precision score : 0.47761194029850745
thresh : 0.58
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 52s) Loss: 0.3004(0.3004) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.3221(0.2646) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3002(0.2630) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.2868(0.2626) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.7571(0.7571) 


Epoch 5 - avg_train_loss: 0.2626  avg_val_loss: 0.5961  time: 62s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2626  avg_val_loss: 0.5961  time: 62s
Epoch 5 - Score: 0.7183
INFO:__main__:Epoch 5 - Score: 0.7183
Epoch 5 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7303(0.5961) 
f1 score : 0.39024390243902435
recall score : 0.3157894736842105
precision score : 0.5106382978723404
thresh : 0.43


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183


f1 score : 0.39024390243902435
recall score : 0.3157894736842105
precision score : 0.5106382978723404
thresh : 0.43


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 27s) Loss: 0.6154(0.6154) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 20s (remain 0m 35s) Loss: 0.6232(0.6269) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.3284(0.6191) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.6794(0.6160) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.5886(0.5886) 


Epoch 1 - avg_train_loss: 0.6160  avg_val_loss: 0.6308  time: 61s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6160  avg_val_loss: 0.6308  time: 61s
Epoch 1 - Score: 0.7123
INFO:__main__:Epoch 1 - Score: 0.7123
Epoch 1 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.8340(0.6308) 
f1 score : 0.08750000000000001
recall score : 0.046052631578947366
precision score : 0.875
thresh : 0.27
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 18s) Loss: 0.7336(0.7336) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4627(0.5747) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.4572(0.5729) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5376(0.5698) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6158(0.6158) 


Epoch 2 - avg_train_loss: 0.5698  avg_val_loss: 0.5965  time: 62s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5698  avg_val_loss: 0.5965  time: 62s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163
Epoch 2 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7031(0.5965) 
f1 score : 0.09876543209876543
recall score : 0.05263157894736842
precision score : 0.8
thresh : 0.41
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 6s) Loss: 0.5827(0.5827) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4567(0.4701) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.4746(0.4669) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.6313(0.4662) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6134(0.6134) 


Epoch 3 - avg_train_loss: 0.4662  avg_val_loss: 0.5906  time: 61s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4662  avg_val_loss: 0.5906  time: 61s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.6894(0.5906) 
f1 score : 0.3559322033898305
recall score : 0.27631578947368424
precision score : 0.5
thresh : 0.68
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 22s) Loss: 0.2636(0.2636) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4367(0.3183) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.2126(0.3123) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.2359(0.3125) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.5934(0.5934) 


Epoch 4 - avg_train_loss: 0.3125  avg_val_loss: 0.6341  time: 62s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3125  avg_val_loss: 0.6341  time: 62s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7982(0.6341) 
f1 score : 0.3318385650224215
recall score : 0.24342105263157895
precision score : 0.5211267605633803
thresh : 0.66
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 18s) Loss: 0.2363(0.2363) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.3374(0.2395) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.1502(0.2297) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.1866(0.2262) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6129(0.6129) 


Epoch 5 - avg_train_loss: 0.2262  avg_val_loss: 0.6386  time: 61s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2262  avg_val_loss: 0.6386  time: 61s
Epoch 5 - Score: 0.7163
INFO:__main__:Epoch 5 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.7867(0.6386) 
f1 score : 0.34567901234567905
recall score : 0.27631578947368424
precision score : 0.46153846153846156
thresh : 0.65


Score: 0.7062
INFO:__main__:Score: 0.7062
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163


f1 score : 0.09876543209876543
recall score : 0.05263157894736842
precision score : 0.8
thresh : 0.41


BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "LABEL_

Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 30s) Loss: 0.7092(0.7092) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 18s (remain 0m 33s) Loss: 0.6360(0.6303) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 38s (remain 0m 14s) Loss: 0.6117(0.6199) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 53s (remain 0m 0s) Loss: 0.5456(0.6098) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5709(0.5709) 


Epoch 1 - avg_train_loss: 0.6098  avg_val_loss: 0.5856  time: 61s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6098  avg_val_loss: 0.5856  time: 61s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.5413(0.5856) 
f1 score : 0.07547169811320754
recall score : 0.039473684210526314
precision score : 0.8571428571428571
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 11s) Loss: 0.4254(0.4254) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 19s (remain 0m 33s) Loss: 0.5039(0.5675) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.6322(0.5654) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.5023(0.5587) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5607(0.5607) 


Epoch 2 - avg_train_loss: 0.5587  avg_val_loss: 0.5800  time: 61s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5587  avg_val_loss: 0.5800  time: 61s
Epoch 2 - Score: 0.7062
INFO:__main__:Epoch 2 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.5171(0.5800) 
f1 score : 0.2210526315789474
recall score : 0.13815789473684212
precision score : 0.5526315789473685
thresh : 0.6
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 8s) Loss: 0.5257(0.5257) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.4600(0.4377) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.6036(0.4367) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.4514(0.4296) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.6302(0.6302) 


Epoch 3 - avg_train_loss: 0.4296  avg_val_loss: 0.6109  time: 62s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4296  avg_val_loss: 0.6109  time: 62s
Epoch 3 - Score: 0.7062
INFO:__main__:Epoch 3 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4655(0.6109) 
f1 score : 0.354978354978355
recall score : 0.26973684210526316
precision score : 0.5189873417721519
thresh : 0.71
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 23s) Loss: 0.2168(0.2168) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.1570(0.2414) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 38s (remain 0m 15s) Loss: 0.1690(0.2399) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.1546(0.2372) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.8851(0.8851) 


Epoch 4 - avg_train_loss: 0.2372  avg_val_loss: 0.7156  time: 62s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2372  avg_val_loss: 0.7156  time: 62s
Epoch 4 - Score: 0.7002
INFO:__main__:Epoch 4 - Score: 0.7002


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.4161(0.7156) 
f1 score : 0.45051194539249145
recall score : 0.4342105263157895
precision score : 0.46808510638297873
thresh : 0.77
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 21s) Loss: 0.1010(0.1010) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 19s (remain 0m 34s) Loss: 0.2486(0.1482) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 39s (remain 0m 15s) Loss: 0.1112(0.1491) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 54s (remain 0m 0s) Loss: 0.1189(0.1501) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 1.0002(1.0002) 


Epoch 5 - avg_train_loss: 0.1501  avg_val_loss: 0.7786  time: 62s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1501  avg_val_loss: 0.7786  time: 62s
Epoch 5 - Score: 0.7062
INFO:__main__:Epoch 5 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.3932(0.7786) 
f1 score : 0.4172661870503597
recall score : 0.3815789473684211
precision score : 0.4603174603174603
thresh : 0.64




In [None]:
from google.colab import runtime
runtime.unassign()