In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, h

In [3]:
!nvidia-smi

Sat May  6 13:36:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP077/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="EleutherAI/gpt-neo-125m"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = "EleutherAI/gpt-neo-125m"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train = train.sample(frac=1, random_state=CFG.seed).reset_index()

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,index,id,title,year,abstract,keywords,y
0,721,722,Global Optimality Conditions for Deep Neural N...,2018,We study the error landscape of deep linear an...,"deep linear neural networks, global optimality...",1
1,144,145,Multi-Task Learning by Deep Collaboration and ...,2018,Convolutional neural networks (CNN) have becom...,"multi-task learning, soft parameter sharing, f...",0
2,4542,4543,On the Need for Topology-Aware Generative Mode...,2020,"ML algorithms or models, especially deep neura...","Manifold-based Defense, Robust Learning, Adver...",1


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "<|endoftext|>" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1532.23it/s]
max_len: 653
INFO:__main__:max_len: 653


In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MaxPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_states": true,
  "resid_dropout": 0,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "sum

Downloading pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

Enable AWP
Epoch: [1][0/279] Elapsed 0m 3s (remain 17m 45s) Loss: 0.7209(0.7209) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.5807(0.6236) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.5121(0.6162) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 35s (remain 0m 0s) Loss: 0.6800(0.6138) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5867(0.5867) 


Epoch 1 - avg_train_loss: 0.6138  avg_val_loss: 0.6047  time: 39s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6138  avg_val_loss: 0.6047  time: 39s
Epoch 1 - Score: 0.7048
INFO:__main__:Epoch 1 - Score: 0.7048
Epoch 1 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6375(0.6047) 
f1 score : 0.06329113924050632
recall score : 0.032679738562091505
precision score : 1.0
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.6328(0.6328) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4389(0.5554) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6403(0.5511) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4835(0.5512) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5604(0.5604) 


Epoch 2 - avg_train_loss: 0.5512  avg_val_loss: 0.5948  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5512  avg_val_loss: 0.5948  time: 36s
Epoch 2 - Score: 0.7108
INFO:__main__:Epoch 2 - Score: 0.7108
Epoch 2 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6123(0.5948) 
f1 score : 0.12048192771084339
recall score : 0.06535947712418301
precision score : 0.7692307692307693
thresh : 0.52
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.5798(0.5798) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.6409(0.4659) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5926(0.4664) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4890(0.4652) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5227(0.5227) 


Epoch 3 - avg_train_loss: 0.4652  avg_val_loss: 0.5936  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4652  avg_val_loss: 0.5936  time: 35s
Epoch 3 - Score: 0.7068
INFO:__main__:Epoch 3 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5875(0.5936) 
f1 score : 0.20895522388059706
recall score : 0.13725490196078433
precision score : 0.4375
thresh : 0.59
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.4604(0.4604) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4477(0.3710) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5309(0.3673) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3590(0.3661) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5092(0.5092) 


Epoch 4 - avg_train_loss: 0.3661  avg_val_loss: 0.6183  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3661  avg_val_loss: 0.6183  time: 35s
Epoch 4 - Score: 0.7068
INFO:__main__:Epoch 4 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6191(0.6183) 
f1 score : 0.30588235294117644
recall score : 0.2549019607843137
precision score : 0.38235294117647056
thresh : 0.66
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.3088(0.3088) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.2844(0.3165) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.2505(0.3133) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3666(0.3100) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5136(0.5136) 


Epoch 5 - avg_train_loss: 0.3100  avg_val_loss: 0.6240  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3100  avg_val_loss: 0.6240  time: 36s
Epoch 5 - Score: 0.7048
INFO:__main__:Epoch 5 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6335(0.6240) 
f1 score : 0.2577777777777778
recall score : 0.1895424836601307
precision score : 0.4027777777777778
thresh : 0.77


Score: 0.7068
INFO:__main__:Score: 0.7068
ACC BEST Score: 0.7108
INFO:__main__:ACC BEST Score: 0.7108
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.12048192771084339
recall score : 0.06535947712418301
precision score : 0.7692307692307693
thresh : 0.52
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.7928(0.7928) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6215(0.6309) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6974(0.6207) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5767(0.6174) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4961(0.4961) 


Epoch 1 - avg_train_loss: 0.6174  avg_val_loss: 0.6014  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6174  avg_val_loss: 0.6014  time: 35s
Epoch 1 - Score: 0.7068
INFO:__main__:Epoch 1 - Score: 0.7068
Epoch 1 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5390(0.6014) 
f1 score : 0.038461538461538464
recall score : 0.0196078431372549
precision score : 1.0
thresh : 0.36
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.5100(0.5100) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3932(0.5692) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.4430(0.5636) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5244(0.5604) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4590(0.4590) 


Epoch 2 - avg_train_loss: 0.5604  avg_val_loss: 0.5876  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5604  avg_val_loss: 0.5876  time: 35s
Epoch 2 - Score: 0.7108
INFO:__main__:Epoch 2 - Score: 0.7108
Epoch 2 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5190(0.5876) 
f1 score : 0.06329113924050632
recall score : 0.032679738562091505
precision score : 1.0
thresh : 0.39
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.4300(0.4300) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.5329(0.4912) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5395(0.4859) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6235(0.4806) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4208(0.4208) 


Epoch 3 - avg_train_loss: 0.4806  avg_val_loss: 0.5895  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4806  avg_val_loss: 0.5895  time: 36s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129
Epoch 3 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5678(0.5895) 
f1 score : 0.1301775147928994
recall score : 0.0718954248366013
precision score : 0.6875
thresh : 0.45
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.4371(0.4371) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4512(0.3998) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4039(0.3930) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5620(0.3889) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4486(0.4486) 


Epoch 4 - avg_train_loss: 0.3889  avg_val_loss: 0.5899  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3889  avg_val_loss: 0.5899  time: 36s
Epoch 4 - Score: 0.7129
INFO:__main__:Epoch 4 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5852(0.5899) 
f1 score : 0.34782608695652173
recall score : 0.26143790849673204
precision score : 0.5194805194805194
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.3824(0.3824) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3383(0.3357) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2532(0.3362) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3263(0.3366) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4504(0.4504) 


Epoch 5 - avg_train_loss: 0.3366  avg_val_loss: 0.5966  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3366  avg_val_loss: 0.5966  time: 36s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6054(0.5966) 
f1 score : 0.35042735042735046
recall score : 0.2679738562091503
precision score : 0.5061728395061729
thresh : 0.57


Score: 0.7048
INFO:__main__:Score: 0.7048
ACC BEST Score: 0.7129
INFO:__main__:ACC BEST Score: 0.7129
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.1301775147928994
recall score : 0.0718954248366013
precision score : 0.6875
thresh : 0.45
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.6673(0.6673) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6183(0.6304) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6494(0.6260) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6735(0.6139) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5673(0.5673) 


Epoch 1 - avg_train_loss: 0.6139  avg_val_loss: 0.5983  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6139  avg_val_loss: 0.5983  time: 35s
Epoch 1 - Score: 0.7169
INFO:__main__:Epoch 1 - Score: 0.7169
Epoch 1 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5019(0.5983) 
f1 score : 0.02564102564102564
recall score : 0.013071895424836602
precision score : 0.6666666666666666
thresh : 0.38
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.5039(0.5039) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6590(0.5588) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6348(0.5519) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4234(0.5515) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5575(0.5575) 


Epoch 2 - avg_train_loss: 0.5515  avg_val_loss: 0.5813  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5515  avg_val_loss: 0.5813  time: 36s
Epoch 2 - Score: 0.7169
INFO:__main__:Epoch 2 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4536(0.5813) 
f1 score : 0.08641975308641976
recall score : 0.0457516339869281
precision score : 0.7777777777777778
thresh : 0.43
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.5808(0.5808) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4408(0.4835) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.4910(0.4795) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4967(0.4700) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5623(0.5623) 


Epoch 3 - avg_train_loss: 0.4700  avg_val_loss: 0.5741  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4700  avg_val_loss: 0.5741  time: 36s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4489(0.5741) 
f1 score : 0.2626262626262626
recall score : 0.16993464052287582
precision score : 0.5777777777777777
thresh : 0.49
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.4441(0.4441) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.1898(0.3776) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4082(0.3732) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4499(0.3667) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5788(0.5788) 


Epoch 4 - avg_train_loss: 0.3667  avg_val_loss: 0.5894  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3667  avg_val_loss: 0.5894  time: 35s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4400(0.5894) 
f1 score : 0.3247863247863248
recall score : 0.24836601307189543
precision score : 0.4691358024691358
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.2473(0.2473) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2767(0.3157) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.1778(0.3095) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2549(0.3063) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5831(0.5831) 


Epoch 5 - avg_train_loss: 0.3063  avg_val_loss: 0.5986  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3063  avg_val_loss: 0.5986  time: 36s
Epoch 5 - Score: 0.7108
INFO:__main__:Epoch 5 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4359(0.5986) 
f1 score : 0.30493273542600896
recall score : 0.2222222222222222
precision score : 0.4857142857142857
thresh : 0.61


Score: 0.6948
INFO:__main__:Score: 0.6948
ACC BEST Score: 0.7169
INFO:__main__:ACC BEST Score: 0.7169
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.02564102564102564
recall score : 0.013071895424836602
precision score : 0.6666666666666666
thresh : 0.38
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.8296(0.8296) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5801(0.6294) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6607(0.6272) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6117(0.6203) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6611(0.6611) 


Epoch 1 - avg_train_loss: 0.6203  avg_val_loss: 0.5989  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6203  avg_val_loss: 0.5989  time: 35s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5659(0.5989) 
f1 score : 0.025806451612903226
recall score : 0.013157894736842105
precision score : 0.6666666666666666
thresh : 0.36
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.6310(0.6310) LR: 0.00001808  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.7246(0.5617) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6278(0.5600) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4778(0.5537) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6611(0.6611) 


Epoch 2 - avg_train_loss: 0.5537  avg_val_loss: 0.5811  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5537  avg_val_loss: 0.5811  time: 36s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5707(0.5811) 
f1 score : 0.10843373493975902
recall score : 0.05921052631578947
precision score : 0.6428571428571429
thresh : 0.39
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.3763(0.3763) LR: 0.00001309  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4508(0.4756) LR: 0.00001090  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3732(0.4682) LR: 0.00000866  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2852(0.4677) LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6718(0.6718) 


Epoch 3 - avg_train_loss: 0.4677  avg_val_loss: 0.5684  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4677  avg_val_loss: 0.5684  time: 36s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5563(0.5684) 
f1 score : 0.20652173913043478
recall score : 0.125
precision score : 0.59375
thresh : 0.47
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.3656(0.3656) LR: 0.00000693  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3428(0.3767) LR: 0.00000488  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.3557(0.3660) LR: 0.00000310  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2549(0.3564) LR: 0.00000194  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7075(0.7075) 


Epoch 4 - avg_train_loss: 0.3564  avg_val_loss: 0.5825  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3564  avg_val_loss: 0.5825  time: 35s
Epoch 4 - Score: 0.7129
INFO:__main__:Epoch 4 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5536(0.5825) 
f1 score : 0.3466666666666667
recall score : 0.2565789473684211
precision score : 0.5342465753424658
thresh : 0.57
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.2876(0.2876) LR: 0.00000193  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4353(0.2860) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2247(0.2923) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2880(0.2936) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.7266(0.7266) 


Epoch 5 - avg_train_loss: 0.2936  avg_val_loss: 0.5906  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2936  avg_val_loss: 0.5906  time: 36s
Epoch 5 - Score: 0.7149
INFO:__main__:Epoch 5 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5447(0.5906) 
f1 score : 0.35344827586206895
recall score : 0.26973684210526316
precision score : 0.5125
thresh : 0.6


Score: 0.6968
INFO:__main__:Score: 0.6968
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.025806451612903226
recall score : 0.013157894736842105
precision score : 0.6666666666666666
thresh : 0.36
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.7866(0.7866) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5714(0.6411) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.7546(0.6198) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4862(0.6165) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4915(0.4915) 


Epoch 1 - avg_train_loss: 0.6165  avg_val_loss: 0.6055  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6165  avg_val_loss: 0.6055  time: 35s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5038(0.6055) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.33
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.6908(0.6908) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6911(0.5689) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.4123(0.5605) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6469(0.5596) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5333(0.5333) 


Epoch 2 - avg_train_loss: 0.5596  avg_val_loss: 0.5895  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5596  avg_val_loss: 0.5895  time: 36s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103
Epoch 2 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5054(0.5895) 
f1 score : 0.1420118343195266
recall score : 0.07894736842105263
precision score : 0.7058823529411765
thresh : 0.51
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.5526(0.5526) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3953(0.4909) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4476(0.4868) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5074(0.4824) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5025(0.5025) 


Epoch 3 - avg_train_loss: 0.4824  avg_val_loss: 0.5723  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4824  avg_val_loss: 0.5723  time: 36s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4581(0.5723) 
f1 score : 0.17045454545454547
recall score : 0.09868421052631579
precision score : 0.625
thresh : 0.47
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.4524(0.4524) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3472(0.4010) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4913(0.4042) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4290(0.4047) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4990(0.4990) 


Epoch 4 - avg_train_loss: 0.4047  avg_val_loss: 0.5679  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4047  avg_val_loss: 0.5679  time: 36s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4339(0.5679) 
f1 score : 0.24615384615384617
recall score : 0.15789473684210525
precision score : 0.5581395348837209
thresh : 0.59
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.4046(0.4046) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3515(0.3606) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3483(0.3633) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3413(0.3624) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4984(0.4984) 


Epoch 5 - avg_train_loss: 0.3624  avg_val_loss: 0.5679  time: 35s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3624  avg_val_loss: 0.5679  time: 35s
Epoch 5 - Score: 0.7103
INFO:__main__:Epoch 5 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4289(0.5679) 
f1 score : 0.2538071065989848
recall score : 0.16447368421052633
precision score : 0.5555555555555556
thresh : 0.51


Score: 0.7082
INFO:__main__:Score: 0.7082
ACC BEST Score: 0.7103
INFO:__main__:ACC BEST Score: 0.7103
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.1420118343195266
recall score : 0.07894736842105263
precision score : 0.7058823529411765
thresh : 0.51
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.6234(0.6234) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6589(0.6281) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6706(0.6280) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5465(0.6200) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6293(0.6293) 


Epoch 1 - avg_train_loss: 0.6200  avg_val_loss: 0.5997  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6200  avg_val_loss: 0.5997  time: 35s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7435(0.5997) 
f1 score : 0.07594936708860758
recall score : 0.039473684210526314
precision score : 1.0
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.4776(0.4776) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5676(0.5632) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6766(0.5633) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4590(0.5576) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6605(0.6605) 


Epoch 2 - avg_train_loss: 0.5576  avg_val_loss: 0.5953  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5576  avg_val_loss: 0.5953  time: 36s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7149(0.5953) 
f1 score : 0.10843373493975902
recall score : 0.05921052631578947
precision score : 0.6428571428571429
thresh : 0.6
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.5352(0.5352) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4636(0.4904) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5238(0.4923) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3745(0.4864) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6787(0.6787) 


Epoch 3 - avg_train_loss: 0.4864  avg_val_loss: 0.5863  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4864  avg_val_loss: 0.5863  time: 36s
Epoch 3 - Score: 0.7082
INFO:__main__:Epoch 3 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7165(0.5863) 
f1 score : 0.19565217391304343
recall score : 0.11842105263157894
precision score : 0.5625
thresh : 0.6
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.4418(0.4418) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4346(0.4041) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3122(0.4116) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3402(0.4068) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6863(0.6863) 


Epoch 4 - avg_train_loss: 0.4068  avg_val_loss: 0.5881  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4068  avg_val_loss: 0.5881  time: 36s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103
Epoch 4 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7428(0.5881) 
f1 score : 0.16
recall score : 0.09210526315789473
precision score : 0.6086956521739131
thresh : 0.49
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.4065(0.4065) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3670(0.3675) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3740(0.3649) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2733(0.3632) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6911(0.6911) 


Epoch 5 - avg_train_loss: 0.3632  avg_val_loss: 0.5868  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3632  avg_val_loss: 0.5868  time: 36s
Epoch 5 - Score: 0.7123
INFO:__main__:Epoch 5 - Score: 0.7123
Epoch 5 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7309(0.5868) 
f1 score : 0.227027027027027
recall score : 0.13815789473684212
precision score : 0.6363636363636364
thresh : 0.5


Score: 0.7123
INFO:__main__:Score: 0.7123
ACC BEST Score: 0.7123
INFO:__main__:ACC BEST Score: 0.7123
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.227027027027027
recall score : 0.13815789473684212
precision score : 0.6363636363636364
thresh : 0.5
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.7280(0.7280) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5453(0.6333) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5541(0.6273) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4079(0.6187) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6685(0.6685) 


Epoch 1 - avg_train_loss: 0.6187  avg_val_loss: 0.6042  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6187  avg_val_loss: 0.6042  time: 35s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5201(0.6042) 
f1 score : 0.025974025974025976
recall score : 0.013157894736842105
precision score : 1.0
thresh : 0.38
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.4499(0.4499) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4631(0.5662) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5113(0.5540) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5915(0.5542) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6026(0.6026) 


Epoch 2 - avg_train_loss: 0.5542  avg_val_loss: 0.5973  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5542  avg_val_loss: 0.5973  time: 36s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5667(0.5973) 
f1 score : 0.12048192771084337
recall score : 0.06578947368421052
precision score : 0.7142857142857143
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.5365(0.5365) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4936(0.4662) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5714(0.4700) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4757(0.4662) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6093(0.6093) 


Epoch 3 - avg_train_loss: 0.4662  avg_val_loss: 0.5882  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4662  avg_val_loss: 0.5882  time: 36s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103
Epoch 3 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5157(0.5882) 
f1 score : 0.13253012048192772
recall score : 0.07236842105263158
precision score : 0.7857142857142857
thresh : 0.49
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.3867(0.3867) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3512(0.3787) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3508(0.3775) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3431(0.3726) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5850(0.5850) 


Epoch 4 - avg_train_loss: 0.3726  avg_val_loss: 0.5906  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3726  avg_val_loss: 0.5906  time: 36s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143
Epoch 4 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5252(0.5906) 
f1 score : 0.1899441340782123
recall score : 0.1118421052631579
precision score : 0.6296296296296297
thresh : 0.54
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.4136(0.4136) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.2763(0.3358) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4219(0.3303) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2182(0.3279) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5767(0.5767) 


Epoch 5 - avg_train_loss: 0.3279  avg_val_loss: 0.5897  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3279  avg_val_loss: 0.5897  time: 36s
Epoch 5 - Score: 0.7103
INFO:__main__:Epoch 5 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5305(0.5897) 
f1 score : 0.23157894736842105
recall score : 0.14473684210526316
precision score : 0.5789473684210527
thresh : 0.57


Score: 0.7082
INFO:__main__:Score: 0.7082
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.1899441340782123
recall score : 0.1118421052631579
precision score : 0.6296296296296297
thresh : 0.54
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.8055(0.8055) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6378(0.6381) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6224(0.6247) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6830(0.6193) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6805(0.6805) 


Epoch 1 - avg_train_loss: 0.6193  avg_val_loss: 0.6046  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6193  avg_val_loss: 0.6046  time: 36s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7848(0.6046) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.36
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.5713(0.5713) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4797(0.5444) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5450(0.5510) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6584(0.5514) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6776(0.6776) 


Epoch 2 - avg_train_loss: 0.5514  avg_val_loss: 0.5813  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5514  avg_val_loss: 0.5813  time: 36s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123
Epoch 2 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7329(0.5813) 
f1 score : 0.07594936708860758
recall score : 0.039473684210526314
precision score : 1.0
thresh : 0.37
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.4155(0.4155) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4144(0.4788) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5016(0.4750) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3865(0.4688) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6956(0.6956) 


Epoch 3 - avg_train_loss: 0.4688  avg_val_loss: 0.5739  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4688  avg_val_loss: 0.5739  time: 36s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7552(0.5739) 
f1 score : 0.17777777777777778
recall score : 0.10526315789473684
precision score : 0.5714285714285714
thresh : 0.55
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.3964(0.3964) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3139(0.3814) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3863(0.3833) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4921(0.3782) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7308(0.7308) 


Epoch 4 - avg_train_loss: 0.3782  avg_val_loss: 0.5804  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3782  avg_val_loss: 0.5804  time: 36s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7926(0.5804) 
f1 score : 0.23232323232323232
recall score : 0.1513157894736842
precision score : 0.5
thresh : 0.61
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.3168(0.3168) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3576(0.3281) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3435(0.3298) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3826(0.3301) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7359(0.7359) 


Epoch 5 - avg_train_loss: 0.3301  avg_val_loss: 0.5825  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3301  avg_val_loss: 0.5825  time: 36s
Epoch 5 - Score: 0.7123
INFO:__main__:Epoch 5 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7994(0.5825) 
f1 score : 0.2561576354679803
recall score : 0.17105263157894737
precision score : 0.5098039215686274
thresh : 0.64


Score: 0.7062
INFO:__main__:Score: 0.7062
ACC BEST Score: 0.7123
INFO:__main__:ACC BEST Score: 0.7123
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.07594936708860758
recall score : 0.039473684210526314
precision score : 1.0
thresh : 0.37
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.6357(0.6357) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5095(0.6130) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6220(0.6122) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5759(0.6120) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5803(0.5803) 


Epoch 1 - avg_train_loss: 0.6120  avg_val_loss: 0.5999  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6120  avg_val_loss: 0.5999  time: 35s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6714(0.5999) 
f1 score : 0.05128205128205127
recall score : 0.02631578947368421
precision score : 1.0
thresh : 0.42
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.5719(0.5719) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5221(0.5641) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4903(0.5608) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5126(0.5567) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5813(0.5813) 


Epoch 2 - avg_train_loss: 0.5567  avg_val_loss: 0.5917  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5567  avg_val_loss: 0.5917  time: 36s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103
Epoch 2 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6599(0.5917) 
f1 score : 0.10975609756097561
recall score : 0.05921052631578947
precision score : 0.75
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.4305(0.4305) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4841(0.4866) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5357(0.4917) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4079(0.4853) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5751(0.5751) 


Epoch 3 - avg_train_loss: 0.4853  avg_val_loss: 0.5850  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4853  avg_val_loss: 0.5850  time: 36s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6905(0.5850) 
f1 score : 0.10975609756097561
recall score : 0.05921052631578947
precision score : 0.75
thresh : 0.54
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.4622(0.4622) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4094(0.4173) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.3168(0.4124) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3957(0.4060) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5757(0.5757) 


Epoch 4 - avg_train_loss: 0.4060  avg_val_loss: 0.5837  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4060  avg_val_loss: 0.5837  time: 35s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6992(0.5837) 
f1 score : 0.152046783625731
recall score : 0.08552631578947369
precision score : 0.6842105263157895
thresh : 0.48
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.2856(0.2856) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3085(0.3631) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5254(0.3666) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2918(0.3641) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5741(0.5741) 


Epoch 5 - avg_train_loss: 0.3641  avg_val_loss: 0.5819  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.3641  avg_val_loss: 0.5819  time: 36s
Epoch 5 - Score: 0.7123
INFO:__main__:Epoch 5 - Score: 0.7123
Epoch 5 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6877(0.5819) 
f1 score : 0.19889502762430936
recall score : 0.11842105263157894
precision score : 0.6206896551724138
thresh : 0.52


Score: 0.7082
INFO:__main__:Score: 0.7082
ACC BEST Score: 0.7123
INFO:__main__:ACC BEST Score: 0.7123
GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "output_hidden_sta

f1 score : 0.19889502762430936
recall score : 0.11842105263157894
precision score : 0.6206896551724138
thresh : 0.52
Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.6135(0.6135) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6145(0.6165) LR: 0.00001974  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6916(0.6140) LR: 0.00001900  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6038(0.6125) LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5998(0.5998) 


Epoch 1 - avg_train_loss: 0.6125  avg_val_loss: 0.6029  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6125  avg_val_loss: 0.6029  time: 35s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5809(0.6029) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.37
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.5685(0.5685) LR: 0.00001809  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5168(0.5540) LR: 0.00001657  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5751(0.5572) LR: 0.00001473  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4103(0.5558) LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5869(0.5869) 


Epoch 2 - avg_train_loss: 0.5558  avg_val_loss: 0.5927  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5558  avg_val_loss: 0.5927  time: 36s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5830(0.5927) 
f1 score : 0.1
recall score : 0.05263157894736842
precision score : 1.0
thresh : 0.46
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.3669(0.3669) LR: 0.00001310  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4911(0.4696) LR: 0.00001091  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3731(0.4687) LR: 0.00000867  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3906(0.4675) LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6407(0.6407) 


Epoch 3 - avg_train_loss: 0.4675  avg_val_loss: 0.5934  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4675  avg_val_loss: 0.5934  time: 35s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.5661(0.5934) 
f1 score : 0.3063063063063063
recall score : 0.2236842105263158
precision score : 0.4857142857142857
thresh : 0.65
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.3877(0.3877) LR: 0.00000694  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3551(0.3804) LR: 0.00000490  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2936(0.3700) LR: 0.00000311  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3726(0.3614) LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7233(0.7233) 


Epoch 4 - avg_train_loss: 0.3614  avg_val_loss: 0.6135  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3614  avg_val_loss: 0.6135  time: 36s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4861(0.6135) 
f1 score : 0.33035714285714285
recall score : 0.24342105263157895
precision score : 0.5138888888888888
thresh : 0.65
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.4287(0.4287) LR: 0.00000194  
Epoch: [5][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3561(0.2985) LR: 0.00000082  
Epoch: [5][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2675(0.3007) LR: 0.00000017  
Epoch: [5][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2978(0.2995) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7466(0.7466) 


Epoch 5 - avg_train_loss: 0.2995  avg_val_loss: 0.6202  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.2995  avg_val_loss: 0.6202  time: 36s
Epoch 5 - Score: 0.7082
INFO:__main__:Epoch 5 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 2s (remain 0m 0s) Loss: 0.4870(0.6202) 
f1 score : 0.3801652892561984
recall score : 0.3026315789473684
precision score : 0.5111111111111111
thresh : 0.67


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
Score: 0.7041
INFO:__main__:Score: 0.7041
ACC BEST Score: 0.7043
INFO:__main__:ACC BEST Score: 0.7043


f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.37
f1 score : 0.118562874251497
recall score : 0.06500328299409061
precision score : 0.673469387755102
thresh : 0.51


In [None]:
from google.colab import runtime
runtime.unassign()