In [1]:
# Colab only: mount Google Drive so the paths under /content/drive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.9 MB/s[0m eta [36m0:00:0

In [3]:
# Print the GPU attached to this runtime (model, memory, driver) for the experiment record.
!nvidia-smi

Thu Sep  7 06:48:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    24W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import os

# Project root on the mounted Google Drive; every other path is derived from it.
DIR = "/content/drive/MyDrive/Competitions/Signate/MUFG2023"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Per-experiment artefact directory. The trailing slash is required because later cells
# build file names by plain string concatenation (e.g. OUTPUT_EXP_DIR+'train').
OUTPUT_EXP_DIR = DIR + '/output/EXP054/'
os.makedirs(OUTPUT_EXP_DIR, exist_ok=True)

In [6]:


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration, read as class attributes throughout the notebook."""

    # --- run control -------------------------------------------------------
    debug = False                  # when True, epochs and trained folds are reduced below
    train = True                   # gate for the training cell
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]     # fold ids that are actually trained
    print_freq = 100               # progress line every N batches
    num_workers = 4                # DataLoader worker processes

    # --- backbone ----------------------------------------------------------
    # Checkpoints tried earlier by the author (kept for reference):
    #   microsoft/deberta-v3-base, microsoft/deberta-base, roberta-large, roberta-large-mnli,
    #   xlnet-large-cased, albert-xxlarge-v2, microsoft/deberta-large, microsoft/deberta-v3-large,
    #   microsoft/deberta-v2-xlarge, funnel-transformer/large, funnel-transformer/medium,
    #   albert-base-v2, albert-large-v2, google/electra-large-discriminator,
    #   google/electra-base-discriminator, facebook/bart-large-mnli, facebook/bart-large,
    #   facebook/bart-base
    model = 'roberta-base'
    gradient_checkpointing = False
    freezing = False               # freeze embeddings + first two encoder layers in CustomModel
    fc_dropout = 0.2               # not referenced in the visible cells

    # --- data --------------------------------------------------------------
    target = "is_fraud?"
    target_size = 1
    max_len = 97                   # tokenizer max_length / padding length
    batch_size = 64

    # --- optimisation ------------------------------------------------------
    apex = True                    # enables torch.cuda.amp autocast / GradScaler
    epochs = 4
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6                  # not referenced in the visible cells
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    nth_awp_start_epoch = 1        # not referenced in the visible cells

    # --- learning-rate schedule ---------------------------------------------
    scheduler = 'cosine'           # ['linear', 'cosine']
    batch_scheduler = True         # step the scheduler after every optimizer step
    num_cycles = 0.5
    num_warmup_steps = 0


# Quick smoke-test settings.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0, 1]

In [7]:
def get_score(labels, outputs):
    """Print F1 / recall / precision at a fixed 0.5 threshold and return the F1.

    Args:
        labels: Ground-truth binary labels.
        outputs: Predicted probabilities (array-like; converted with ``np.asarray``).

    Returns:
        The F1 score of ``outputs > 0.5`` against ``labels``.
    """
    thresh = 0.5
    y_true = labels
    # Binarise once and reuse the result for all three metrics.
    y_hat = (np.asarray(outputs) > thresh).astype(int)
    f_score = f1_score(y_true, y_hat)
    r_score = recall_score(y_true, y_hat)
    p_score = precision_score(y_true, y_hat)
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return f_score

def get_f1_score(labels, outputs):
    """Return the best F1 obtainable by sweeping the decision threshold.

    Thresholds 0.10, 0.11, ..., 0.79 are tried; a prediction is positive when its
    probability is strictly greater than the threshold.

    Args:
        labels: Ground-truth binary labels.
        outputs: Predicted probabilities (array-like; converted with ``np.asarray``).

    Returns:
        The highest F1 found over the threshold grid (0.0 if no threshold scores above zero).
    """
    y_true = labels
    y_prob = np.asarray(outputs)
    best_score = 0.0
    # Rounding removes the floating-point drift of np.arange (e.g. 0.30000000000000004).
    for thresh in np.round(np.arange(0.1, 0.80, 0.01), 2):
        score = f1_score(y_true, (y_prob > thresh).astype(int))
        if score > best_score:
            best_score = score
    # The best score is already known; no need to recompute it at the best threshold.
    return best_score


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Return the notebook logger, writing plain messages to the console and to ``<filename>.log``.

    The function is safe to call repeatedly (e.g. when the cell is re-run): handlers left
    over from a previous call are removed and closed first, so each message is emitted once
    per destination instead of once per call.

    Args:
        filename: Log file path without the ``.log`` extension.

    Returns:
        The configured ``logging.Logger`` named after this module.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Drop handlers from earlier calls; otherwise every re-run adds another pair.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # Do not forward records to the root logger, which would print each message a second
    # time with an "INFO:__main__:" prefix when the host environment configures root logging.
    logger.propagate = False
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch (CPU and current CUDA device) and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Disable gradient computation for every parameter of ``module``.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)

def get_freezed_parameters(module):
    """
    List the names of all parameters of ``module`` that do not require gradients.
    """
    return [
        param_name
        for param_name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an optimizer-precision override for the embedding weights found on ``embeddings_path``.

    For each of ``word_embeddings``, ``position_embeddings`` and ``token_type_embeddings``
    that exists on the object, its ``weight`` is registered with ``{'optim_bits': optim_bits}``.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    for prefix in ("word", "position", "token_type"):
        attr_name = f"{prefix}_embeddings"
        if not hasattr(embeddings_path, attr_name):
            continue
        # NOTE(review): `bnb` is not imported anywhere in the visible notebook, so this line
        # raises NameError if reached — presumably `import bitsandbytes as bnb` is intended; confirm.
        bnb.optim.GlobalOptimManager.get_instance().register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
import pandas as pd
import numpy as np


# Load every competition table from INPUT_DIR (the sample submission has no header row).
train = pd.read_csv(os.path.join(INPUT_DIR,"train.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test.csv"))
card = pd.read_csv(os.path.join(INPUT_DIR, "card.csv"))
user = pd.read_csv(os.path.join(INPUT_DIR, "user.csv"))
sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submit.csv"), header=None)

# For each table, in load order: print its shape, then preview the first three rows.
for _frame in (train, test, card, user, sub):
    print(_frame.shape)
    display(_frame.head(3))

(471283, 12)


Unnamed: 0,index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,0,1721,0,$2.623,OK,0,209237,Joliet,IL,60436.0,5541,Swipe Transaction
1,1,1629,3,$6.4,OK,0,2568,Edgerton,WI,53534.0,5814,Swipe Transaction
2,2,655,3,$123.5,OK,0,345310,Ridgefield,WA,98642.0,7538,Swipe Transaction


(457958, 11)


Unnamed: 0,index,user_id,card_id,amount,errors?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,471283,541,3,$113.278,OK,324189,Orlando,FL,32821.0,4814,Swipe Transaction
1,471284,655,1,$293.944,OK,81219,Ridgefield,WA,98642.0,7538,Chip Transaction
2,471285,492,0,$47.4,OK,274755,Arlington Heights,IL,60004.0,5719,Swipe Transaction


(416, 10)


Unnamed: 0,user_id,card_id,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed
0,39,0,Visa,Debit,09/2021,YES,1,$17117,05/2007,2010
1,39,1,Amex,Credit,11/2024,YES,2,$5400,10/2015,2015
2,41,0,Discover,Credit,03/2022,YES,2,$14800,12/2010,2011


(97, 17)


Unnamed: 0,user_id,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,39,57,64,1962,12,Female,442 Burns Boulevard,Mansfield,MA,2048,42.02,-71.21,$37407,$76274,$102611,698,2
1,41,39,66,1980,10,Female,3863 River Avenue,Lincoln,CA,95648,38.93,-121.25,$21829,$44506,$57994,849,3
2,47,40,67,1979,5,Female,8799 Elm Avenue,Mckinney,TX,75069,33.2,-96.65,$24684,$50329,$76759,625,4


(457958, 2)


Unnamed: 0,0,1
0,471283,0
1,471284,1
2,471285,0


In [10]:
# Lookup from zero-padded month number ("01".."12") to the English month name.
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)
month_dict = {
    f"{number:02d}": name for number, name in enumerate(_MONTH_NAMES, start=1)
}

def _split_month_year(df, column, prefix):
    """Split the "MM/YYYY" strings in ``df[column]`` into ``<prefix>_month`` / ``<prefix>_years``.

    The two new columns hold strings and are written onto ``df`` itself, which is also returned.
    """
    # expand=True builds the two-column frame directly; the previous
    # `.apply(pd.Series)` constructed one Series object per row.
    parts = df[column].str.split('/', expand=True)
    parts.columns = ["month", "years"]
    df[f"{prefix}_month"] = parts["month"].astype(str)
    df[f"{prefix}_years"] = parts["years"].astype(str)
    return df


def get_expires_values(df):
    """Add ``expires_month`` and ``expires_years`` (strings) derived from ``df["expires"]``.

    Args:
        df: DataFrame with an ``expires`` column of "MM/YYYY" strings; modified in place.

    Returns:
        The same DataFrame, for call chaining.
    """
    return _split_month_year(df, "expires", "expires")


def get_acct_open_date_values(df):
    """Add ``acct_open_date_month`` and ``acct_open_date_years`` (strings) from ``df["acct_open_date"]``.

    Args:
        df: DataFrame with an ``acct_open_date`` column of "MM/YYYY" strings; modified in place.

    Returns:
        The same DataFrame, for call chaining.
    """
    return _split_month_year(df, "acct_open_date", "acct_open_date")


# Derive month/year text columns for both card date fields, then replace the
# two-digit month codes with English month names.
card = get_acct_open_date_values(get_expires_values(card))
for _month_col in ("expires_month", "acct_open_date_month"):
    card[_month_col] = card[_month_col].map(month_dict)

In [11]:
# Attach card attributes (joined on user_id + card_id) and user attributes (joined on user_id)
# to every transaction. validate="many_to_one" makes pandas raise MergeError when the
# right-hand keys are not unique, instead of silently duplicating transaction rows.
train = (
    train
    .merge(card, how="left", on=["user_id", "card_id"], validate="many_to_one")
    .merge(user, how="left", on="user_id", validate="many_to_one")
)

In [12]:
# Missing values become the literal token 'unknown' so every field can be concatenated as text.
train.fillna('unknown', inplace = True)

# Each transaction is serialised as one string: three sections ("merchant", "card", "user"),
# with every field separated by the "</s>" token.
_SEP = "</s>"
_text_fields = [
    # --- merchant / transaction ---
    "merchant",
    train["amount"],
    train["errors?"],
    train["merchant_city"],
    train["merchant_state"],
    train["use_chip"],
    # --- card ---
    "card",
    train["card_brand"],
    train["card_type"],
    train["expires_month"] + " " + train["expires_years"],
    train["has_chip"],
    train["acct_open_date_month"] + " " + train["acct_open_date_years"],
    train["year_pin_last_changed"].astype(str),
    # --- user ---
    "user",
    train["current_age"].astype(str) + " year old " + train["gender"],
    "retired at age " + train["retirement_age"].astype(str),
    train["address"],
    train["city"],
    train["state"],
    train["per_capita_income_zipcode"],
    train["yearly_income_person"],
    train["total_debt"],
]

_texts = _text_fields[0]
for _field in _text_fields[1:]:
    _texts = _texts + _SEP + _field
train["texts"] = _texts

In [13]:
# Assign every row a validation-fold id, stratified on the target.
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
# split() yields positional indices, so fill a positional integer array rather than using
# label-based `.loc`; this is correct for any index and never creates a NaN/float column.
fold_ids = np.full(len(train), -1, dtype=int)
for fold, (_, val_idx) in enumerate(skf.split(train, train[CFG.target])):
    fold_ids[val_idx] = fold
train["kfold"] = fold_ids

if CFG.debug:
    # Debug mode: show fold sizes before and after shrinking to a 500-row sample.
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [14]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that matches CFG.model, expose it on CFG for the datasets,
# and store a copy next to the experiment outputs.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer = CFG.tokenizer
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 23 # cls
#LOGGER.info(f"max_len: {CFG.max_len}")

In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` with ``cfg.tokenizer`` and convert every field to a ``torch.long`` tensor.

    The text is truncated / padded to exactly ``cfg.max_len`` tokens, special tokens included.

    Returns:
        The tokenizer's output mapping, with each value replaced by a long tensor.
    """
    tokenizer_kwargs = dict(
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    encoded = cfg.tokenizer(text, **tokenizer_kwargs)
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Training dataset: yields ``(tokenized_text, label)`` for each row of ``df``.

    Args:
        cfg: Configuration object providing ``target``, ``tokenizer`` and ``max_len``.
        df: DataFrame with a ``texts`` column and the ``cfg.target`` column.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Read the target column from the cfg that was passed in, not from the global CFG.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        # float32, matching ValidDataset: float16 targets would make the loss itself
        # be evaluated in half precision.
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    """Trim every tensor in the batch to the longest unpadded sequence.

    The longest sequence is the largest row sum of ``inputs["attention_mask"]``. The mapping
    is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for field in list(inputs.keys()):
        inputs[field] = inputs[field][:, :longest]
    return inputs

class ValidDataset(Dataset):
    """Validation dataset: yields ``(tokenized_text, label)`` for each row of ``df``.

    Args:
        cfg: Configuration object providing ``target``, ``tokenizer`` and ``max_len``.
        df: DataFrame with a ``texts`` column and the ``cfg.target`` column.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Read the target column from the cfg that was passed in, not from the global CFG.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

# The byte-identical second definition of `collate` that used to follow this class was
# removed; the definition placed before this class is the one in effect.

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp the per-feature token count so a fully masked sequence cannot divide by zero.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        return torch.sum(last_hidden_state * mask, 1) / token_counts

class MaxPooling(nn.Module):
    """Take the per-feature maximum over the tokens of each sequence, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        padded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # Work on a copy in which masked positions hold a large negative value, so they
        # lose the max; the caller's tensor is left untouched.
        candidates = last_hidden_state.masked_fill(padded, -1e4)
        return torch.max(candidates, dim=1)[0]


class CustomModel(nn.Module):
    """Transformer backbone + mean pooling + LayerNorm + linear head producing raw logits.

    Args:
        cfg: Configuration object (``model``, ``target_size``, ``gradient_checkpointing``,
            ``freezing`` are read here).
        config_path: Optional path to a backbone config saved with ``torch.save``. When
            ``None`` the config is fetched for ``cfg.model`` and all dropout is disabled.
        pretrained: When True the backbone weights are loaded from ``cfg.model``; otherwise
            the backbone is built from the config with freshly initialised weights.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Different architectures name their dropout fields differently; zero all of them.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects; only load config files
            # written by this notebook.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be constructed directly; from_config builds the architecture
            # described by the config without loading weights.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # One-shot iterator over the backbone parameters that are still trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # Not applied in forward(); callers apply the sigmoid to the returned logits.
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        """Initialise ``module`` in place using ``config.initializer_range`` as the normal std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone on ``inputs`` and mean-pool its first output over unmasked tokens."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return raw logits (no sigmoid) for a batch of tokenizer outputs."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (both truncated to integers)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Report the time elapsed since ``since`` and the estimated time remaining.

    ``percent`` is the fraction of the work already done (must be non-zero); the total
    duration is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch with mixed precision and return the average loss.

    Args:
        fold: Fold id (not used here; kept for call compatibility).
        train_loader: DataLoader yielding ``(inputs, labels)`` batches.
        model: Model returning raw logits.
        criterion: Loss taking ``(logits, targets)``.
        optimizer: Optimizer stepped every ``CFG.gradient_accumulation_steps`` batches.
        epoch: Zero-based epoch index, used for progress output only.
        scheduler: LR scheduler, stepped after each optimizer step when ``CFG.batch_scheduler``.
        device: Device the batch is moved to.

    Returns:
        The sample-weighted mean of the per-batch losses.
    """
    model.train()
    # A new scaler is created per call, so its dynamic loss scale restarts every epoch.
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    num_steps = len(train_loader)
    grad_norm = float('nan')  # reported until the first optimizer step has clipped gradients
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        # Evaluate the loss in float32: under autocast the logits can be float16, and a
        # half-precision loss loses accuracy and overflows easily.
        loss = criterion(y_preds.view(-1, 1).float(), labels.view(-1, 1).float())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # unscale_ may be called only once per optimizer step, so it (and the clipping
            # that needs unscaled gradients) belongs on the accumulation boundary.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (num_steps-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, num_steps,
                          remain=timeSince(start, float(step+1)/num_steps),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the supported way to read the current LR.
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on the validation loader.

    Args:
        valid_loader: DataLoader yielding ``(inputs, labels)`` batches.
        model: Model returning raw logits.
        criterion: Loss taking ``(logits, targets)``.
        device: Device the batch is moved to.

    Returns:
        ``(avg_loss, predictions)`` where ``avg_loss`` is the sample-weighted mean loss and
        ``predictions`` is a flat NumPy array of sigmoid probabilities in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    num_steps = len(valid_loader)
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            # No division by gradient_accumulation_steps here: nothing is accumulated
            # during evaluation, and scaling would under-report the validation loss.
            loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (num_steps-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, num_steps,
                          loss=losses,
                          remain=timeSince(start, float(step+1)/num_steps)))
    # Stack the per-batch outputs and flatten them to one dimension.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Predict sigmoid probabilities for every batch of ``test_loader``.

    The model is switched to eval mode and moved to ``device``. Returns the per-batch
    outputs concatenated along the first axis, as a NumPy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for field in list(inputs.keys()):
            inputs[field] = inputs[field].to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_outputs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    Rows of ``folds`` whose ``kfold`` equals ``fold`` form the validation set; all other rows
    form the training set. After every epoch the validation predictions are scored and,
    whenever the threshold-searched F1 improves, the model weights and those predictions
    are written to ``OUTPUT_EXP_DIR``.

    Args:
        folds: DataFrame with at least ``texts``, ``kfold`` and the ``CFG.target`` column.
        fold: Fold id held out for validation.

    Returns:
        The validation rows of this fold with an added ``pred`` column holding the
        predictions of the best epoch.

    Raises:
        ValueError: If ``CFG.scheduler`` is neither 'linear' nor 'cosine'.
        RuntimeError: If no epoch was run, so there are no predictions to return.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: decayed backbone, non-decayed backbone, head."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            # Everything outside the backbone attribute `model` (pooling, LayerNorm, fc head).
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    # NOTE(review): `AdamW` is imported from both torch.optim and transformers; the later
    # transformers import wins. That implementation is deprecated upstream — consider
    # switching to torch.optim.AdamW once results have been re-validated.
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the LR schedule named by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")
        return scheduler

    # Number of optimizer steps that will really happen: the loader drops the last partial
    # batch, and the optimizer steps once per `gradient_accumulation_steps` batches.
    num_train_steps = len(train_loader) // CFG.gradient_accumulation_steps * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = -1.
    best_predictions = None
    checkpoint_path = OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring: F1 at the fixed 0.5 threshold (logged) and at the best threshold (selection)
        score = get_score(valid_labels, predictions)
        best_thresh_f1 = get_f1_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < best_thresh_f1:
            best_score = best_thresh_f1
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        checkpoint_path)

    if best_predictions is None:
        raise RuntimeError("No epoch was run (CFG.epochs is 0?); there are no predictions for this fold.")
    # Use the in-memory copy instead of re-reading the whole checkpoint from disk, which
    # could also pick up a stale file left by an earlier run.
    valid_folds['pred'] = best_predictions

    # Drop the references first so the GPU memory can actually be released.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the F1 at threshold 0.5 and the best threshold-searched F1 of ``oof_df['pred']``."""
        labels = oof_df[CFG.target].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        best_f1 = get_f1_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'F1 BEST Score: {best_f1:<.4f}')

    if CFG.train:
        # Collect the out-of-fold frames and concatenate once, rather than growing a
        # DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(train, fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')
        else:
            # Nothing was trained (CFG.trn_fold selects no fold), so there is nothing to score.
            oof_df = pd.DataFrame()
            LOGGER.info("No fold selected by CFG.trn_fold; skipping CV summary.")

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

INFO:__main__:RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
 

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/5891] Elapsed 0m 3s (remain 362m 29s) Loss: 0.3457(0.3457) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 15s (remain 14m 20s) Loss: 0.1998(0.2397) Grad: 2.9685  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 25s (remain 11m 54s) Loss: 0.2798(0.2312) Grad: 1.0500  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 35s (remain 10m 58s) Loss: 0.1693(0.2275) Grad: 2.1446  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 46s (remain 10m 35s) Loss: 0.0798(0.2219) Grad: 1.0917  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 56s (remain 10m 7s) Loss: 0.1970(0.2198) Grad: 0.8545  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 7s (remain 9m 52s) Loss: 0.0911(0.2187) Grad: 0.8277  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 18s (remain 9m 42s) Loss: 0.2302(0.2183) Grad: 0.5727  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 29s (remain 9m 25s) Loss: 0.2476(0.2176) Grad: 1.9058  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 41s (remain 9m 19s) Loss: 0.2438(

Epoch 1 - avg_train_loss: 0.1884  avg_val_loss: 0.1772  time: 757s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1884  avg_val_loss: 0.1772  time: 757s
Epoch 1 - Score: 0.4560
INFO:__main__:Epoch 1 - Score: 0.4560
Epoch 1 - Save Best Score: 0.4921 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4921 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 46m 37s) Loss: 0.2307(0.2307) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 11s (remain 10m 33s) Loss: 0.1675(0.1740) Grad: 1.0997  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 22s (remain 10m 37s) Loss: 0.2725(0.1771) Grad: 1.9472  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 33s (remain 10m 24s) Loss: 0.1869(0.1772) Grad: 1.0071  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 43s (remain 9m 59s) Loss: 0.0732(0.1779) Grad: 0.8600  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 54s (remain 9m 46s) Loss: 0.2788(0.1743) Grad: 2.2034  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 6s (remain 9m 47s) Loss: 0.2617(0.1726) Grad: 2.4502  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 18s (remain 9m 37s) Loss: 0.0725(0.1715) Grad: 1.8893  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 28s (remain 9m 22s) Loss: 0.2805(0.1702) Grad: 2.1331  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 39s (remain 9m 13s) Loss: 0.2500(0.

Epoch 2 - avg_train_loss: 0.1654  avg_val_loss: 0.1624  time: 784s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1654  avg_val_loss: 0.1624  time: 784s
Epoch 2 - Score: 0.4765
INFO:__main__:Epoch 2 - Score: 0.4765
Epoch 2 - Save Best Score: 0.5303 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5303 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 49m 43s) Loss: 0.0494(0.0494) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 14s (remain 13m 27s) Loss: 0.0585(0.1520) Grad: 0.8742  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 26s (remain 12m 20s) Loss: 0.2086(0.1540) Grad: 1.2240  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 36s (remain 11m 14s) Loss: 0.0750(0.1552) Grad: 0.9868  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 47s (remain 10m 48s) Loss: 0.0885(0.1560) Grad: 1.1110  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 57s (remain 10m 23s) Loss: 0.2805(0.1556) Grad: 1.5324  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 8s (remain 10m 2s) Loss: 0.1019(0.1551) Grad: 1.0111  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 19s (remain 9m 49s) Loss: 0.1353(0.1556) Grad: 1.1258  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 29s (remain 9m 30s) Loss: 0.1192(0.1549) Grad: 0.7437  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 40s (remain 9m 17s) Loss: 0.3247(

Epoch 3 - avg_train_loss: 0.1542  avg_val_loss: 0.1568  time: 762s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1542  avg_val_loss: 0.1568  time: 762s
Epoch 3 - Score: 0.4813
INFO:__main__:Epoch 3 - Score: 0.4813
Epoch 3 - Save Best Score: 0.5419 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5419 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 46m 37s) Loss: 0.1449(0.1449) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 11s (remain 11m 23s) Loss: 0.1744(0.1490) Grad: 1.2483  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 23s (remain 11m 4s) Loss: 0.0848(0.1482) Grad: 1.3044  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 34s (remain 10m 48s) Loss: 0.1504(0.1468) Grad: 1.5806  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 45s (remain 10m 27s) Loss: 0.2529(0.1465) Grad: 2.0895  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 56s (remain 10m 5s) Loss: 0.2323(0.1479) Grad: 2.3067  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 7s (remain 9m 55s) Loss: 0.0549(0.1459) Grad: 1.3893  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 18s (remain 9m 41s) Loss: 0.1915(0.1471) Grad: 1.5285  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 28s (remain 9m 25s) Loss: 0.0931(0.1466) Grad: 1.5491  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 40s (remain 9m 15s) Loss: 0.1102(0.

Epoch 4 - avg_train_loss: 0.1426  avg_val_loss: 0.1567  time: 763s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1426  avg_val_loss: 0.1567  time: 763s
Epoch 4 - Score: 0.5009
INFO:__main__:Epoch 4 - Score: 0.5009
Epoch 4 - Save Best Score: 0.5505 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5505 Model


f1 score : 0.5009045226130653
recall score : 0.3818571866380631
precision score : 0.727803738317757


Score: 0.5009
INFO:__main__:Score: 0.5009
F1 BEST Score: 0.5505
INFO:__main__:F1 BEST Score: 0.5505
RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

INFO:__main__:RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention

Epoch: [1][0/5891] Elapsed 0m 0s (remain 53m 28s) Loss: 0.3130(0.3130) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 12s (remain 11m 33s) Loss: 0.1495(0.2338) Grad: 0.8506  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 22s (remain 10m 43s) Loss: 0.2112(0.2290) Grad: 0.3553  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 33s (remain 10m 29s) Loss: 0.2085(0.2296) Grad: 0.9225  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 43s (remain 10m 2s) Loss: 0.3115(0.2252) Grad: 2.2376  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 54s (remain 9m 49s) Loss: 0.3264(0.2234) Grad: 2.5271  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 5s (remain 9m 40s) Loss: 0.2020(0.2206) Grad: 1.0957  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 15s (remain 9m 21s) Loss: 0.1674(0.2187) Grad: 0.6794  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 26s (remain 9m 10s) Loss: 0.2842(0.2173) Grad: 1.0609  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 38s (remain 9m 2s) Loss: 0.1997(0.2

Epoch 1 - avg_train_loss: 0.1894  avg_val_loss: 0.1755  time: 763s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1894  avg_val_loss: 0.1755  time: 763s
Epoch 1 - Score: 0.3630
INFO:__main__:Epoch 1 - Score: 0.3630
Epoch 1 - Save Best Score: 0.4902 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4902 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 55m 47s) Loss: 0.0903(0.0903) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 11s (remain 11m 24s) Loss: 0.1404(0.1784) Grad: 2.4714  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 22s (remain 10m 46s) Loss: 0.1761(0.1773) Grad: 1.2931  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 34s (remain 10m 33s) Loss: 0.2395(0.1786) Grad: 1.3194  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 44s (remain 10m 14s) Loss: 0.1437(0.1758) Grad: 1.6356  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 55s (remain 9m 52s) Loss: 0.1466(0.1743) Grad: 1.9133  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 6s (remain 9m 43s) Loss: 0.2225(0.1733) Grad: 1.0593  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 16s (remain 9m 29s) Loss: 0.2524(0.1729) Grad: 1.5468  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 27s (remain 9m 15s) Loss: 0.2808(0.1737) Grad: 1.5316  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 38s (remain 9m 6s) Loss: 0.2334(0.

Epoch 2 - avg_train_loss: 0.1679  avg_val_loss: 0.1637  time: 771s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1679  avg_val_loss: 0.1637  time: 771s
Epoch 2 - Score: 0.4694
INFO:__main__:Epoch 2 - Score: 0.4694
Epoch 2 - Save Best Score: 0.5308 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5308 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 49m 54s) Loss: 0.0650(0.0650) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 12s (remain 12m 24s) Loss: 0.0864(0.1473) Grad: 1.0581  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 23s (remain 11m 9s) Loss: 0.1417(0.1593) Grad: 2.0271  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 35s (remain 10m 52s) Loss: 0.2460(0.1581) Grad: 2.2057  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 46s (remain 10m 37s) Loss: 0.1278(0.1583) Grad: 1.3273  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 56s (remain 10m 10s) Loss: 0.1561(0.1570) Grad: 1.1846  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 7s (remain 9m 58s) Loss: 0.0833(0.1558) Grad: 1.8169  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 19s (remain 9m 48s) Loss: 0.1016(0.1562) Grad: 0.9356  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 29s (remain 9m 28s) Loss: 0.3340(0.1550) Grad: 1.8821  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 43s (remain 9m 30s) Loss: 0.1103(0

Epoch 3 - avg_train_loss: 0.1537  avg_val_loss: 0.1571  time: 786s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1537  avg_val_loss: 0.1571  time: 786s
Epoch 3 - Score: 0.4838
INFO:__main__:Epoch 3 - Score: 0.4838
Epoch 3 - Save Best Score: 0.5488 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5488 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 72m 52s) Loss: 0.1664(0.1664) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 11s (remain 11m 8s) Loss: 0.0640(0.1477) Grad: 0.6952  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 23s (remain 11m 18s) Loss: 0.0562(0.1515) Grad: 1.2318  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 34s (remain 10m 49s) Loss: 0.1954(0.1499) Grad: 2.6761  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 45s (remain 10m 26s) Loss: 0.1218(0.1474) Grad: 1.3196  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 57s (remain 10m 19s) Loss: 0.1833(0.1453) Grad: 1.9206  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 11s (remain 10m 27s) Loss: 0.1164(0.1446) Grad: 1.4139  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 21s (remain 10m 4s) Loss: 0.0800(0.1441) Grad: 1.0801  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 33s (remain 9m 53s) Loss: 0.2771(0.1444) Grad: 2.9924  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 45s (remain 9m 42s) Loss: 0.1219

Epoch 4 - avg_train_loss: 0.1410  avg_val_loss: 0.1579  time: 800s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1410  avg_val_loss: 0.1579  time: 800s
Epoch 4 - Score: 0.5103
INFO:__main__:Epoch 4 - Score: 0.5103
Epoch 4 - Save Best Score: 0.5493 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5493 Model


f1 score : 0.5103259273759421
recall score : 0.3994790070487282
precision score : 0.7063126523977242


Score: 0.5103
INFO:__main__:Score: 0.5103
F1 BEST Score: 0.5493
INFO:__main__:F1 BEST Score: 0.5493
RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

INFO:__main__:RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention

Epoch: [1][0/5891] Elapsed 0m 0s (remain 67m 38s) Loss: 0.7578(0.7578) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 14s (remain 13m 42s) Loss: 0.1840(0.2762) Grad: 1.1563  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 26s (remain 12m 28s) Loss: 0.2288(0.2546) Grad: 0.7119  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 37s (remain 11m 44s) Loss: 0.1289(0.2406) Grad: 0.5052  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 49s (remain 11m 11s) Loss: 0.1498(0.2349) Grad: 1.9773  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 1s (remain 10m 57s) Loss: 0.1180(0.2301) Grad: 0.3737  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 13s (remain 10m 42s) Loss: 0.1797(0.2276) Grad: 0.7669  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 23s (remain 10m 19s) Loss: 0.1875(0.2263) Grad: 1.0170  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 35s (remain 10m 8s) Loss: 0.1072(0.2252) Grad: 1.2008  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 47s (remain 9m 56s) Loss: 0.304

Epoch 1 - avg_train_loss: 0.1916  avg_val_loss: 0.1714  time: 802s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1916  avg_val_loss: 0.1714  time: 802s
Epoch 1 - Score: 0.3865
INFO:__main__:Epoch 1 - Score: 0.3865
Epoch 1 - Save Best Score: 0.4933 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4933 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 56m 24s) Loss: 0.1133(0.1133) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 14s (remain 13m 26s) Loss: 0.2446(0.1694) Grad: 1.7456  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 24s (remain 11m 39s) Loss: 0.1241(0.1747) Grad: 1.2245  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 36s (remain 11m 18s) Loss: 0.1420(0.1727) Grad: 2.6165  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 48s (remain 11m 2s) Loss: 0.1335(0.1738) Grad: 0.8928  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 58s (remain 10m 31s) Loss: 0.3052(0.1733) Grad: 2.1934  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 10s (remain 10m 20s) Loss: 0.0910(0.1715) Grad: 1.1068  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 22s (remain 10m 10s) Loss: 0.1097(0.1711) Grad: 0.9150  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 32s (remain 9m 49s) Loss: 0.1696(0.1707) Grad: 1.4939  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 44s (remain 9m 38s) Loss: 0.203

Epoch 2 - avg_train_loss: 0.1661  avg_val_loss: 0.1610  time: 805s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1661  avg_val_loss: 0.1610  time: 805s
Epoch 2 - Score: 0.4894
INFO:__main__:Epoch 2 - Score: 0.4894
Epoch 2 - Save Best Score: 0.5262 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5262 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 86m 37s) Loss: 0.2100(0.2100) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 13s (remain 12m 36s) Loss: 0.1128(0.1552) Grad: 1.2373  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 25s (remain 12m 4s) Loss: 0.1422(0.1555) Grad: 2.9378  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 37s (remain 11m 42s) Loss: 0.0940(0.1608) Grad: 0.7868  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 48s (remain 11m 7s) Loss: 0.1899(0.1559) Grad: 1.3974  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 0s (remain 10m 49s) Loss: 0.1622(0.1547) Grad: 1.5384  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 12s (remain 10m 38s) Loss: 0.2050(0.1557) Grad: 1.4428  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 23s (remain 10m 20s) Loss: 0.1594(0.1567) Grad: 1.1573  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 37s (remain 10m 17s) Loss: 0.2708(0.1563) Grad: 2.2583  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 49s (remain 10m 5s) Loss: 0.0972

Epoch 3 - avg_train_loss: 0.1540  avg_val_loss: 0.1563  time: 821s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1540  avg_val_loss: 0.1563  time: 821s
Epoch 3 - Score: 0.4901
INFO:__main__:Epoch 3 - Score: 0.4901
Epoch 3 - Save Best Score: 0.5409 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5409 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 53m 21s) Loss: 0.2686(0.2686) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 13s (remain 13m 18s) Loss: 0.0748(0.1503) Grad: 1.3972  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 25s (remain 12m 14s) Loss: 0.1649(0.1459) Grad: 1.6898  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 36s (remain 11m 25s) Loss: 0.1824(0.1439) Grad: 1.8738  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 49s (remain 11m 12s) Loss: 0.1188(0.1413) Grad: 1.4309  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 0s (remain 10m 53s) Loss: 0.0961(0.1431) Grad: 1.4778  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 11s (remain 10m 29s) Loss: 0.0900(0.1428) Grad: 1.7836  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 23s (remain 10m 19s) Loss: 0.0924(0.1450) Grad: 1.3560  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 35s (remain 10m 6s) Loss: 0.1901(0.1442) Grad: 1.5284  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 45s (remain 9m 46s) Loss: 0.187

Epoch 4 - avg_train_loss: 0.1422  avg_val_loss: 0.1568  time: 812s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1422  avg_val_loss: 0.1568  time: 812s
Epoch 4 - Score: 0.5019
INFO:__main__:Epoch 4 - Score: 0.5019
Epoch 4 - Save Best Score: 0.5465 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5465 Model


f1 score : 0.501913828638728
recall score : 0.39175731576528267
precision score : 0.6982523211359912


Score: 0.5019
INFO:__main__:Score: 0.5019
F1 BEST Score: 0.5465
INFO:__main__:F1 BEST Score: 0.5465
RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

INFO:__main__:RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention

Epoch: [1][0/5891] Elapsed 0m 0s (remain 57m 41s) Loss: 1.2432(1.2432) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 14s (remain 13m 50s) Loss: 0.1777(0.3025) Grad: 1.0416  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 25s (remain 12m 4s) Loss: 0.2238(0.2613) Grad: 2.2748  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 37s (remain 11m 29s) Loss: 0.3047(0.2465) Grad: 1.3903  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 49s (remain 11m 16s) Loss: 0.2817(0.2403) Grad: 1.7591  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 1s (remain 10m 57s) Loss: 0.1505(0.2358) Grad: 0.3836  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 12s (remain 10m 34s) Loss: 0.2593(0.2332) Grad: 1.3594  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 24s (remain 10m 24s) Loss: 0.1603(0.2284) Grad: 0.6919  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 36s (remain 10m 11s) Loss: 0.1566(0.2256) Grad: 1.8856  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 48s (remain 9m 59s) Loss: 0.184

Epoch 1 - avg_train_loss: 0.1947  avg_val_loss: 0.1787  time: 817s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1947  avg_val_loss: 0.1787  time: 817s
Epoch 1 - Score: 0.3021
INFO:__main__:Epoch 1 - Score: 0.3021
Epoch 1 - Save Best Score: 0.4819 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4819 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 92m 57s) Loss: 0.1837(0.1837) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 12s (remain 11m 33s) Loss: 0.1511(0.1665) Grad: 1.4579  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 24s (remain 11m 47s) Loss: 0.0498(0.1616) Grad: 1.3341  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 36s (remain 11m 23s) Loss: 0.0768(0.1674) Grad: 1.8660  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 46s (remain 10m 43s) Loss: 0.1479(0.1705) Grad: 0.9438  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 58s (remain 10m 32s) Loss: 0.0867(0.1691) Grad: 1.1767  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 10s (remain 10m 22s) Loss: 0.2013(0.1690) Grad: 1.3438  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 21s (remain 10m 0s) Loss: 0.3167(0.1715) Grad: 1.7356  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 32s (remain 9m 49s) Loss: 0.1365(0.1721) Grad: 1.1052  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 44s (remain 9m 39s) Loss: 0.113

Epoch 2 - avg_train_loss: 0.1687  avg_val_loss: 0.1646  time: 790s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1687  avg_val_loss: 0.1646  time: 790s
Epoch 2 - Score: 0.4366
INFO:__main__:Epoch 2 - Score: 0.4366
Epoch 2 - Save Best Score: 0.5192 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5192 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 54m 0s) Loss: 0.1671(0.1671) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 12s (remain 11m 53s) Loss: 0.2417(0.1573) Grad: 1.7105  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 23s (remain 10m 58s) Loss: 0.1558(0.1516) Grad: 1.4949  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 34s (remain 10m 42s) Loss: 0.1909(0.1534) Grad: 1.7372  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 45s (remain 10m 19s) Loss: 0.2174(0.1556) Grad: 2.3640  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 55s (remain 9m 56s) Loss: 0.3093(0.1571) Grad: 1.5630  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 6s (remain 9m 46s) Loss: 0.0870(0.1579) Grad: 1.0001  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 17s (remain 9m 30s) Loss: 0.1461(0.1578) Grad: 1.0528  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 27s (remain 9m 15s) Loss: 0.0826(0.1577) Grad: 1.0131  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 38s (remain 9m 5s) Loss: 0.1429(0.1

Epoch 3 - avg_train_loss: 0.1553  avg_val_loss: 0.1591  time: 755s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1553  avg_val_loss: 0.1591  time: 755s
Epoch 3 - Score: 0.4832
INFO:__main__:Epoch 3 - Score: 0.4832
Epoch 3 - Save Best Score: 0.5378 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5378 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 76m 34s) Loss: 0.2222(0.2222) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 11s (remain 10m 35s) Loss: 0.0996(0.1348) Grad: 1.2135  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 23s (remain 10m 55s) Loss: 0.1997(0.1374) Grad: 2.7222  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 34s (remain 10m 41s) Loss: 0.2358(0.1356) Grad: 2.4533  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 44s (remain 10m 5s) Loss: 0.0760(0.1395) Grad: 1.3044  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 55s (remain 9m 55s) Loss: 0.1105(0.1424) Grad: 1.1354  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 6s (remain 9m 44s) Loss: 0.0853(0.1430) Grad: 1.6267  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 16s (remain 9m 23s) Loss: 0.2578(0.1442) Grad: 2.6701  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 27s (remain 9m 14s) Loss: 0.1259(0.1449) Grad: 1.3887  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 38s (remain 9m 3s) Loss: 0.1730(0.1

Epoch 4 - avg_train_loss: 0.1446  avg_val_loss: 0.1585  time: 751s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1446  avg_val_loss: 0.1585  time: 751s
Epoch 4 - Score: 0.4986
INFO:__main__:Epoch 4 - Score: 0.4986
Epoch 4 - Save Best Score: 0.5410 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5410 Model


f1 score : 0.4985633607450708
recall score : 0.3855347839411584
precision score : 0.705354639753294


Score: 0.4986
INFO:__main__:Score: 0.4986
F1 BEST Score: 0.5410
INFO:__main__:F1 BEST Score: 0.5410
RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

INFO:__main__:RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention

Epoch: [1][0/5891] Elapsed 0m 0s (remain 51m 41s) Loss: 1.1016(1.1016) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 12s (remain 11m 30s) Loss: 0.2150(0.2872) Grad: 0.7258  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 22s (remain 10m 30s) Loss: 0.1539(0.2581) Grad: 1.1424  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 33s (remain 10m 18s) Loss: 0.1713(0.2470) Grad: 1.1042  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 43s (remain 9m 55s) Loss: 0.2917(0.2390) Grad: 0.8384  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 53s (remain 9m 38s) Loss: 0.3206(0.2381) Grad: 1.4316  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 4s (remain 9m 28s) Loss: 0.1411(0.2358) Grad: 3.0200  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 14s (remain 9m 11s) Loss: 0.2201(0.2333) Grad: 0.8926  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 24s (remain 8m 59s) Loss: 0.1346(0.2290) Grad: 1.0543  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 35s (remain 8m 51s) Loss: 0.1940(0.

Epoch 1 - avg_train_loss: 0.1892  avg_val_loss: 0.1713  time: 748s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1892  avg_val_loss: 0.1713  time: 748s
Epoch 1 - Score: 0.3790
INFO:__main__:Epoch 1 - Score: 0.3790
Epoch 1 - Save Best Score: 0.4998 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4998 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 62m 30s) Loss: 0.1874(0.1874) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 11s (remain 10m 43s) Loss: 0.0919(0.1737) Grad: 1.3367  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 22s (remain 10m 45s) Loss: 0.0978(0.1679) Grad: 1.9450  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 33s (remain 10m 30s) Loss: 0.1504(0.1702) Grad: 1.6447  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 43s (remain 9m 59s) Loss: 0.1740(0.1709) Grad: 1.2411  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 54s (remain 9m 49s) Loss: 0.1586(0.1690) Grad: 1.8110  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 8s (remain 10m 0s) Loss: 0.1348(0.1666) Grad: 1.2082  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 17s (remain 9m 36s) Loss: 0.1625(0.1651) Grad: 1.6539  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 29s (remain 9m 26s) Loss: 0.1344(0.1653) Grad: 1.2050  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 40s (remain 9m 16s) Loss: 0.2017(0.

Epoch 2 - avg_train_loss: 0.1633  avg_val_loss: 0.1635  time: 768s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1633  avg_val_loss: 0.1635  time: 768s
Epoch 2 - Score: 0.4756
INFO:__main__:Epoch 2 - Score: 0.4756
Epoch 2 - Save Best Score: 0.5270 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5270 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 54m 13s) Loss: 0.1298(0.1298) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 12s (remain 11m 43s) Loss: 0.0870(0.1567) Grad: 0.7437  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 23s (remain 11m 5s) Loss: 0.0783(0.1529) Grad: 1.7794  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 34s (remain 10m 49s) Loss: 0.2793(0.1522) Grad: 2.4797  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 46s (remain 10m 30s) Loss: 0.0793(0.1510) Grad: 1.1039  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 56s (remain 10m 4s) Loss: 0.1277(0.1503) Grad: 0.9909  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 7s (remain 9m 55s) Loss: 0.1099(0.1475) Grad: 0.9715  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 18s (remain 9m 40s) Loss: 0.1537(0.1490) Grad: 1.1356  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 28s (remain 9m 24s) Loss: 0.1355(0.1507) Grad: 1.9064  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 40s (remain 9m 15s) Loss: 0.3147(0.

Epoch 3 - avg_train_loss: 0.1510  avg_val_loss: 0.1584  time: 768s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

Epoch: [4][0/5891] Elapsed 0m 0s (remain 57m 36s) Loss: 0.1104(0.1104) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 12s (remain 12m 6s) Loss: 0.1159(0.1318) Grad: 0.9879  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 24s (remain 11m 40s) Loss: 0.0917(0.1387) Grad: 1.5911  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 34s (remain 10m 40s) Loss: 0.1317(0.1335) Grad: 1.6045  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 45s (remain 10m 26s) Loss: 0.1171(0.1341) Grad: 1.6264  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 57s (remain 10m 16s) Loss: 0.1173(0.1376) Grad: 0.8803  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 6s (remain 9m 49s) Loss: 0.0864(0.1380) Grad: 0.8543  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 18s (remain 9m 39s) Loss: 0.0398(0.1379) Grad: 0.9978  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 30s (remain 9m 36s) Loss: 0.1006(0.1376) Grad: 2.0446  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 41s (remain 9m 23s) Loss: 0.0520(0

Epoch 4 - avg_train_loss: 0.1387  avg_val_loss: 0.1597  time: 757s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

f1 score : 0.4936670988331505
recall score : 0.37925222188170393
precision score : 0.7069408740359897


Score: 0.4937
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self

f1 score : 0.5011094381488232
recall score : 0.38757623119119855
precision score : 0.7087139254693191


Score: 0.5011
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self

In [None]:
# Release the Colab VM once training has finished so it stops consuming
# compute units. Before doing so, push any writes still buffered for the
# mounted Google Drive (logs, checkpoints): tearing the VM down without a
# flush can drop data that has not been synced yet.
from google.colab import drive, runtime

# Upper bound on how long to wait for Drive to sync before giving up, so a
# broken or stalled mount cannot keep the (billed) runtime alive indefinitely.
DRIVE_FLUSH_TIMEOUT_MS = 30 * 60 * 1000

try:
    drive.flush_and_unmount(timeout_ms=DRIVE_FLUSH_TIMEOUT_MS)
finally:
    # Always release the runtime, even if the flush raised (e.g. the Drive
    # mount was already disconnected, as the logging errors above suggest).
    runtime.unassign()