In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m123.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.2 MB/s[0m eta [36m0:00:

In [3]:
!nvidia-smi

Wed Aug 30 08:54:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import os

DIR = "/content/drive/MyDrive/Competitions/Signate/MUFG2023"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


OUTPUT_EXP_DIR = DIR + '/output/EXP040/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:


# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=64
    fc_dropout=0.2
    target="is_fraud?"
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return f1_score(y_true, (y_pred>thresh).astype(int))

def get_f1_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = f1_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return f1_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """

    for parameter in module.parameters():
        parameter.requires_grad = False

def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """

    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)

    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """

    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"

        if hasattr(embeddings_path, attr_name):
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np


train = pd.read_csv(os.path.join(INPUT_DIR,"train.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test.csv"))
card = pd.read_csv(os.path.join(INPUT_DIR, "card.csv"))
user = pd.read_csv(os.path.join(INPUT_DIR, "user.csv"))
sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submit.csv"), header=None)

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(card.shape)
display(card.head(3))

print(user.shape)
display(user.head(3))

print(sub.shape)
display(sub.head(3))

(471283, 12)


Unnamed: 0,index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,0,1721,0,$2.623,OK,0,209237,Joliet,IL,60436.0,5541,Swipe Transaction
1,1,1629,3,$6.4,OK,0,2568,Edgerton,WI,53534.0,5814,Swipe Transaction
2,2,655,3,$123.5,OK,0,345310,Ridgefield,WA,98642.0,7538,Swipe Transaction


(457958, 11)


Unnamed: 0,index,user_id,card_id,amount,errors?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,471283,541,3,$113.278,OK,324189,Orlando,FL,32821.0,4814,Swipe Transaction
1,471284,655,1,$293.944,OK,81219,Ridgefield,WA,98642.0,7538,Chip Transaction
2,471285,492,0,$47.4,OK,274755,Arlington Heights,IL,60004.0,5719,Swipe Transaction


(416, 10)


Unnamed: 0,user_id,card_id,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed
0,39,0,Visa,Debit,09/2021,YES,1,$17117,05/2007,2010
1,39,1,Amex,Credit,11/2024,YES,2,$5400,10/2015,2015
2,41,0,Discover,Credit,03/2022,YES,2,$14800,12/2010,2011


(97, 17)


Unnamed: 0,user_id,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,39,57,64,1962,12,Female,442 Burns Boulevard,Mansfield,MA,2048,42.02,-71.21,$37407,$76274,$102611,698,2
1,41,39,66,1980,10,Female,3863 River Avenue,Lincoln,CA,95648,38.93,-121.25,$21829,$44506,$57994,849,3
2,47,40,67,1979,5,Female,8799 Elm Avenue,Mckinney,TX,75069,33.2,-96.65,$24684,$50329,$76759,625,4


(457958, 2)


Unnamed: 0,0,1
0,471283,0
1,471284,1
2,471285,0


In [10]:
train = train.merge(card, how="left", on=["user_id", "card_id"]).merge(user, how="left", on="user_id")

In [11]:
month_dict = {
   "01": "January",
   "02": "February",
   "03": "March",
   "04": "April",
   "05": "May",
   "06": "June",
   "07": "July",
   "08": "August",
   "09": "September",
   "10": "October",
   "11": "November",
   "12": "December"
}

def get_expires_values(df):
  _df = df["expires"].str.split('/').apply(pd.Series)
  _df.columns = ["month","years"]
  df["expires_month"] = _df["month"].astype(str)
  df["expires_years"] = _df["years"].astype(str)
  return df

def get_acct_open_date_values(df):
  _df = df["acct_open_date"].str.split('/').apply(pd.Series)
  _df.columns = ["month","years"]
  df["acct_open_date_month"] = _df["month"].astype(str)
  df["acct_open_date_years"] = _df["years"].astype(str)
  return df

train = get_expires_values(train)
train = get_acct_open_date_values(train)
train["expires_month"] = train["expires_month"].map(month_dict)
train["acct_open_date_month"] = train["acct_open_date_month"].map(month_dict)

In [12]:
train.fillna('unknown', inplace = True)

train["texts"] = "merchant" + "[SEP]" + train["amount"] + "[SEP]" + train["errors?"] + "[SEP]" + train["merchant_city"] + "[SEP]" + train["merchant_state"] + "[SEP]" + train["use_chip"] + "[SEP]" \
+ "card" + "[SEP]" + train["card_brand"] + "[SEP]" + train["card_type"] + "[SEP]" + train["expires_month"] + " " + train["expires_years"] + "[SEP]" + train["has_chip"] + "[SEP]" + train["acct_open_date_month"] + " " + train["acct_open_date_years"] + "[SEP]" + train["year_pin_last_changed"].astype(str) + "[SEP]" \
"user" + "[SEP]" + train["current_age"].astype(str) + " year old " + train["gender"] + "[SEP]" + "retired at age " + train["retirement_age"].astype(str) + "[SEP]" + train["address"] + "[SEP]" + train["city"] + "[SEP]" + train["state"] + "[SEP]" + train["per_capita_income_zipcode"] + "[SEP]" + train["yearly_income_person"] + "[SEP]" + train["total_debt"]

In [13]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train[CFG.target])):
    train.loc[val_ , "kfold"] = int(fold)

train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [14]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 23 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 471283/471283 [01:30<00:00, 5193.19it/s]
max_len: 97
INFO:__main__:max_len: 97


In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df[CFG.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

class ValidDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df[CFG.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings


class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
class Focal_MultiLabel_Loss(nn.Module):
    def __init__(self, gamma):
      super(Focal_MultiLabel_Loss, self).__init__()
      self.gamma = gamma
      self.bceloss = nn.BCEWithLogitsLoss()

    def forward(self, outputs, targets):
      bce = self.bceloss(outputs.view(-1, 1), targets.view(-1, 1))
      bce_exp = torch.exp(-bce)
      focal_loss = (1-bce_exp)**self.gamma * bce
      return focal_loss.mean()

In [19]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = Focal_MultiLabel_Loss(gamma=2.0)

    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score = get_score(valid_labels, predictions)
        f1_score = get_f1_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')


        if best_score < f1_score:
            best_score = f1_score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [21]:
if __name__ == '__main__':

    def get_result(oof_df):
        labels = oof_df[CFG.target].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        f1_score = get_f1_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'F1 BEST Score: {f1_score:<.4f}')

    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch: [1][0/5891] Elapsed 0m 3s (remain 381m 56s) Loss: 0.3000(0.3000) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 12s (remain 12m 8s) Loss: 0.0195(0.0289) Grad: 0.0379  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 21s (remain 10m 7s) Loss: 0.0105(0.0211) Grad: 0.0638  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 30s (remain 9m 21s) Loss: 0.0079(0.0181) Grad: 0.1107  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 39s (remain 8m 55s) Loss: 0.0054(0.0165) Grad: 0.1158  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 47s (remain 8m 35s) Loss: 0.0126(0.0153) Grad: 0.1324  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 56s (remain 8m 19s) Loss: 0.0025(0.0147) Grad: 0.0615  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 5s (remain 8m 6s) Loss: 0.0167(0.0141) Grad: 0.2039  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 14s (remain 7m 52s) Loss: 0.0222(0.0139) Grad: 0.0454  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 23s (remain 7m 40s) Loss: 0.0128(0.013

Epoch 1 - avg_train_loss: 0.0084  avg_val_loss: 0.0054  time: 622s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0084  avg_val_loss: 0.0054  time: 622s
Epoch 1 - Score: 0.3901
INFO:__main__:Epoch 1 - Score: 0.3901
Epoch 1 - Save Best Score: 0.4879 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4879 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 37m 46s) Loss: 0.0121(0.0121) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 55s) Loss: 0.0005(0.0050) Grad: 0.0229  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 36s) Loss: 0.0039(0.0058) Grad: 0.0689  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 21s) Loss: 0.0097(0.0058) Grad: 0.2700  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.0003(0.0058) Grad: 0.0184  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 8m 0s) Loss: 0.0024(0.0060) Grad: 0.0284  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 49s) Loss: 0.0169(0.0058) Grad: 0.2101  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 40s) Loss: 0.0028(0.0059) Grad: 0.0387  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 10s (remain 7m 30s) Loss: 0.0090(0.0058) Grad: 0.0961  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 19s (remain 7m 21s) Loss: 0.0088(0.0058)

Epoch 2 - avg_train_loss: 0.0055  avg_val_loss: 0.0045  time: 619s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0055  avg_val_loss: 0.0045  time: 619s
Epoch 2 - Score: 0.4397
INFO:__main__:Epoch 2 - Score: 0.4397
Epoch 2 - Save Best Score: 0.5300 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5300 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 37m 44s) Loss: 0.0082(0.0082) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 49s) Loss: 0.0023(0.0043) Grad: 0.0268  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 46s) Loss: 0.0051(0.0043) Grad: 0.0752  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 29s) Loss: 0.0153(0.0045) Grad: 0.1691  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 36s (remain 8m 15s) Loss: 0.0023(0.0048) Grad: 0.0414  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 45s (remain 8m 4s) Loss: 0.0005(0.0047) Grad: 0.0170  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 54s) Loss: 0.0043(0.0047) Grad: 0.0402  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 44s) Loss: 0.0089(0.0047) Grad: 0.1220  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 11s (remain 7m 33s) Loss: 0.0043(0.0047) Grad: 0.0431  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 20s (remain 7m 24s) Loss: 0.0088(0.0046)

Epoch 3 - avg_train_loss: 0.0045  avg_val_loss: 0.0042  time: 620s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0045  avg_val_loss: 0.0042  time: 620s
Epoch 3 - Score: 0.4946
INFO:__main__:Epoch 3 - Score: 0.4946
Epoch 3 - Save Best Score: 0.5501 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5501 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 37m 21s) Loss: 0.0003(0.0003) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 52s) Loss: 0.0065(0.0043) Grad: 0.0741  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 40s) Loss: 0.0012(0.0040) Grad: 0.0154  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 27s (remain 8m 24s) Loss: 0.0012(0.0042) Grad: 0.0202  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.0118(0.0039) Grad: 0.1265  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 44s (remain 8m 0s) Loss: 0.0083(0.0039) Grad: 0.0714  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 51s) Loss: 0.0008(0.0040) Grad: 0.0267  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.0027(0.0040) Grad: 0.0481  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 11s (remain 7m 31s) Loss: 0.0044(0.0040) Grad: 0.0634  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0001(0.0040)

Epoch 4 - avg_train_loss: 0.0039  avg_val_loss: 0.0041  time: 620s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0039  avg_val_loss: 0.0041  time: 620s
Epoch 4 - Score: 0.4950
INFO:__main__:Epoch 4 - Score: 0.4950
Epoch 4 - Save Best Score: 0.5505 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5505 Model


f1 score : 0.49496981891348085
recall score : 0.37695372356726936
precision score : 0.7205623901581723


Score: 0.4950
INFO:__main__:Score: 0.4950
F1 BEST Score: 0.5505
INFO:__main__:F1 BEST Score: 0.5505
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 39m 56s) Loss: 0.1646(0.1646) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 9m 25s) Loss: 0.0129(0.0210) Grad: 0.1377  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 18s (remain 8m 46s) Loss: 0.0040(0.0170) Grad: 0.0358  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 27s (remain 8m 28s) Loss: 0.0029(0.0148) Grad: 0.0601  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 36s (remain 8m 14s) Loss: 0.0132(0.0135) Grad: 0.1076  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 44s (remain 8m 3s) Loss: 0.0168(0.0129) Grad: 0.1722  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 53s (remain 7m 52s) Loss: 0.0033(0.0120) Grad: 0.0391  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.0313(0.0117) Grad: 0.2880  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 11s (remain 7m 33s) Loss: 0.0048(0.0113) Grad: 0.1166  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.0054(0.0110)

Epoch 1 - avg_train_loss: 0.0080  avg_val_loss: 0.0057  time: 621s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0080  avg_val_loss: 0.0057  time: 621s
Epoch 1 - Score: 0.4340
INFO:__main__:Epoch 1 - Score: 0.4340
Epoch 1 - Save Best Score: 0.5025 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5025 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 38m 2s) Loss: 0.0026(0.0026) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 47s) Loss: 0.0023(0.0052) Grad: 0.0717  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 44s) Loss: 0.0026(0.0055) Grad: 0.0427  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 29s) Loss: 0.0016(0.0055) Grad: 0.0294  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 36s (remain 8m 15s) Loss: 0.0184(0.0055) Grad: 0.3364  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 8m 3s) Loss: 0.0014(0.0055) Grad: 0.0161  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 52s) Loss: 0.0006(0.0057) Grad: 0.0241  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.0020(0.0057) Grad: 0.0244  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.0135(0.0058) Grad: 0.1063  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.0010(0.0058) 

Epoch 2 - avg_train_loss: 0.0055  avg_val_loss: 0.0046  time: 621s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0055  avg_val_loss: 0.0046  time: 621s
Epoch 2 - Score: 0.4583
INFO:__main__:Epoch 2 - Score: 0.4583
Epoch 2 - Save Best Score: 0.5285 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5285 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 38m 30s) Loss: 0.0042(0.0042) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 54s) Loss: 0.0005(0.0053) Grad: 0.0202  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 43s) Loss: 0.0061(0.0046) Grad: 0.1019  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 25s) Loss: 0.0062(0.0048) Grad: 0.0615  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 36s (remain 8m 12s) Loss: 0.0028(0.0047) Grad: 0.0614  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.0016(0.0046) Grad: 0.0275  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 51s) Loss: 0.0228(0.0047) Grad: 0.2589  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.0126(0.0046) Grad: 0.2358  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 11s (remain 7m 31s) Loss: 0.0020(0.0045) Grad: 0.0267  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0005(0.0046)

Epoch 3 - avg_train_loss: 0.0045  avg_val_loss: 0.0042  time: 619s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0045  avg_val_loss: 0.0042  time: 619s
Epoch 3 - Score: 0.4840
INFO:__main__:Epoch 3 - Score: 0.4840
Epoch 3 - Save Best Score: 0.5453 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5453 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 38m 13s) Loss: 0.0008(0.0008) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 49s) Loss: 0.0022(0.0040) Grad: 0.0605  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 40s) Loss: 0.0004(0.0037) Grad: 0.0149  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 27s (remain 8m 24s) Loss: 0.0014(0.0038) Grad: 0.0184  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.0020(0.0040) Grad: 0.0361  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.0084(0.0041) Grad: 0.1022  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 51s) Loss: 0.0050(0.0042) Grad: 0.1136  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.0013(0.0041) Grad: 0.0239  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.0009(0.0041) Grad: 0.0212  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0065(0.0041)

Epoch 4 - avg_train_loss: 0.0039  avg_val_loss: 0.0041  time: 620s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0039  avg_val_loss: 0.0041  time: 620s
Epoch 4 - Score: 0.4960
INFO:__main__:Epoch 4 - Score: 0.4960
Epoch 4 - Save Best Score: 0.5515 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5515 Model


f1 score : 0.495990376904571
recall score : 0.37909898866074165
precision score : 0.7171014492753623


Score: 0.4960
INFO:__main__:Score: 0.4960
F1 BEST Score: 0.5515
INFO:__main__:F1 BEST Score: 0.5515
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 42m 36s) Loss: 0.0515(0.0515) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 9m 9s) Loss: 0.0091(0.0161) Grad: 0.0737  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 18s (remain 8m 40s) Loss: 0.0022(0.0140) Grad: 0.0425  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 27s (remain 8m 24s) Loss: 0.0335(0.0126) Grad: 0.4652  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 35s (remain 8m 12s) Loss: 0.0075(0.0121) Grad: 0.0211  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.0014(0.0114) Grad: 0.0742  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 53s (remain 7m 50s) Loss: 0.0040(0.0115) Grad: 0.1045  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.0009(0.0113) Grad: 0.0355  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.0122(0.0109) Grad: 0.1440  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0159(0.0107) 

Epoch 1 - avg_train_loss: 0.0121  avg_val_loss: 0.0144  time: 622s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0121  avg_val_loss: 0.0144  time: 622s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 37m 50s) Loss: 0.0019(0.0019) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 53s) Loss: 0.0159(0.0152) Grad: 0.0362  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 34s) Loss: 0.0104(0.0151) Grad: 0.0509  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 24s) Loss: 0.0017(0.0147) Grad: 0.0763  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 35s (remain 8m 12s) Loss: 0.0066(0.0148) Grad: 0.0957  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 8m 0s) Loss: 0.0224(0.0148) Grad: 0.0560  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 50s) Loss: 0.0106(0.0149) Grad: 0.0746  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.0159(0.0150) Grad: 0.0389  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 11s (remain 7m 31s) Loss: 0.0018(0.0152) Grad: 0.0816  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 19s (remain 7m 21s) Loss: 0.0223(0.0153)

Epoch 2 - avg_train_loss: 0.0153  avg_val_loss: 0.0143  time: 621s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0153  avg_val_loss: 0.0143  time: 621s
Epoch 2 - Score: 0.0000
INFO:__main__:Epoch 2 - Score: 0.0000


Epoch: [3][0/5891] Elapsed 0m 0s (remain 37m 45s) Loss: 0.0227(0.0227) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 45s) Loss: 0.0034(0.0139) Grad: 0.0826  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 30s) Loss: 0.0065(0.0140) Grad: 0.0927  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 26s (remain 8m 19s) Loss: 0.0223(0.0139) Grad: 0.0549  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 35s (remain 8m 8s) Loss: 0.0222(0.0151) Grad: 0.0241  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 44s (remain 7m 59s) Loss: 0.0158(0.0154) Grad: 0.0294  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 48s) Loss: 0.0298(0.0156) Grad: 0.1182  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 40s) Loss: 0.0038(0.0157) Grad: 0.1018  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 10s (remain 7m 31s) Loss: 0.0066(0.0157) Grad: 0.0986  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 19s (remain 7m 21s) Loss: 0.0038(0.0156)

Epoch 3 - avg_train_loss: 0.0153  avg_val_loss: 0.0143  time: 622s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0153  avg_val_loss: 0.0143  time: 622s
Epoch 3 - Score: 0.0000
INFO:__main__:Epoch 3 - Score: 0.0000


Epoch: [4][0/5891] Elapsed 0m 0s (remain 38m 24s) Loss: 0.0065(0.0065) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 49s) Loss: 0.0106(0.0147) Grad: 0.0706  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 30s) Loss: 0.0224(0.0149) Grad: 0.0562  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 26s (remain 8m 18s) Loss: 0.0158(0.0151) Grad: 0.0278  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 35s (remain 8m 7s) Loss: 0.0158(0.0149) Grad: 0.0234  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 44s (remain 7m 57s) Loss: 0.0158(0.0150) Grad: 0.0263  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 48s) Loss: 0.0036(0.0149) Grad: 0.0906  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 39s) Loss: 0.0106(0.0150) Grad: 0.0769  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 10s (remain 7m 30s) Loss: 0.0304(0.0148) Grad: 0.1649  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 19s (remain 7m 21s) Loss: 0.0224(0.0147)

Epoch 4 - avg_train_loss: 0.0151  avg_val_loss: 0.0143  time: 621s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0151  avg_val_loss: 0.0143  time: 621s
Epoch 4 - Score: 0.0000
INFO:__main__:Epoch 4 - Score: 0.0000


f1 score : 0.0
recall score : 0.0
precision score : 0.0


Score: 0.0000
INFO:__main__:Score: 0.0000
F1 BEST Score: 0.0000
INFO:__main__:F1 BEST Score: 0.0000
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 41m 45s) Loss: 0.0997(0.0997) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 8m 49s) Loss: 0.0290(0.0189) Grad: 0.1749  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 18s (remain 8m 33s) Loss: 0.0019(0.0159) Grad: 0.0724  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 27s (remain 8m 22s) Loss: 0.0082(0.0142) Grad: 0.2382  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.0102(0.0134) Grad: 0.0414  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.0227(0.0128) Grad: 0.1028  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 53s (remain 7m 52s) Loss: 0.0057(0.0124) Grad: 0.0462  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.0052(0.0122) Grad: 0.0440  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.0056(0.0119) Grad: 0.0369  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.0094(0.0117)

Epoch 1 - avg_train_loss: 0.0082  avg_val_loss: 0.0058  time: 622s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0082  avg_val_loss: 0.0058  time: 622s
Epoch 1 - Score: 0.3444
INFO:__main__:Epoch 1 - Score: 0.3444
Epoch 1 - Save Best Score: 0.4830 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4830 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 38m 4s) Loss: 0.0020(0.0020) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 47s) Loss: 0.0106(0.0055) Grad: 0.1489  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 35s) Loss: 0.0018(0.0055) Grad: 0.0313  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 22s) Loss: 0.0031(0.0058) Grad: 0.0652  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.0074(0.0058) Grad: 0.0596  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 8m 0s) Loss: 0.0015(0.0057) Grad: 0.0271  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 50s) Loss: 0.0031(0.0060) Grad: 0.1071  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.0061(0.0060) Grad: 0.0689  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 11s (remain 7m 31s) Loss: 0.0089(0.0059) Grad: 0.1605  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.0004(0.0058) 

Epoch 2 - avg_train_loss: 0.0056  avg_val_loss: 0.0046  time: 623s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0056  avg_val_loss: 0.0046  time: 623s
Epoch 2 - Score: 0.4699
INFO:__main__:Epoch 2 - Score: 0.4699
Epoch 2 - Save Best Score: 0.5251 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5251 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 40m 11s) Loss: 0.0015(0.0015) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 52s) Loss: 0.0002(0.0044) Grad: 0.0127  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 39s) Loss: 0.0046(0.0049) Grad: 0.0568  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 25s) Loss: 0.0069(0.0050) Grad: 0.0905  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 36s (remain 8m 13s) Loss: 0.0034(0.0053) Grad: 0.0555  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 44s (remain 8m 3s) Loss: 0.0009(0.0052) Grad: 0.0422  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 54s) Loss: 0.0053(0.0052) Grad: 0.1025  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 44s) Loss: 0.0077(0.0051) Grad: 0.1040  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 11s (remain 7m 34s) Loss: 0.0030(0.0052) Grad: 0.0410  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 20s (remain 7m 24s) Loss: 0.0002(0.0052)

Epoch 3 - avg_train_loss: 0.0046  avg_val_loss: 0.0042  time: 622s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0046  avg_val_loss: 0.0042  time: 622s
Epoch 3 - Score: 0.4763
INFO:__main__:Epoch 3 - Score: 0.4763
Epoch 3 - Save Best Score: 0.5426 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5426 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 39m 34s) Loss: 0.0041(0.0041) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 54s) Loss: 0.0116(0.0042) Grad: 0.1158  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 43s) Loss: 0.0047(0.0041) Grad: 0.0892  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 27s (remain 8m 27s) Loss: 0.0003(0.0039) Grad: 0.0194  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 36s (remain 8m 15s) Loss: 0.0040(0.0041) Grad: 0.0717  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 45s (remain 8m 4s) Loss: 0.0025(0.0040) Grad: 0.0446  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 54s) Loss: 0.0004(0.0041) Grad: 0.0263  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 43s) Loss: 0.0108(0.0041) Grad: 0.1292  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 11s (remain 7m 33s) Loss: 0.0021(0.0040) Grad: 0.0520  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.0074(0.0039)

Epoch 4 - avg_train_loss: 0.0040  avg_val_loss: 0.0042  time: 621s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0040  avg_val_loss: 0.0042  time: 621s
Epoch 4 - Score: 0.4888
INFO:__main__:Epoch 4 - Score: 0.4888


f1 score : 0.47628908676744663
recall score : 0.35243640821330063
precision score : 0.7343550446998723


Score: 0.4763
INFO:__main__:Score: 0.4763
F1 BEST Score: 0.5426
INFO:__main__:F1 BEST Score: 0.5426
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 40m 51s) Loss: 0.2208(0.2208) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 8m 55s) Loss: 0.0341(0.0221) Grad: 0.4081  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 18s (remain 8m 36s) Loss: 0.0162(0.0163) Grad: 0.3505  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 27s (remain 8m 22s) Loss: 0.0092(0.0143) Grad: 0.0300  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 36s (remain 8m 13s) Loss: 0.0073(0.0135) Grad: 0.0515  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.0105(0.0129) Grad: 0.0687  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 53s (remain 7m 52s) Loss: 0.0038(0.0124) Grad: 0.0830  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.0007(0.0119) Grad: 0.0309  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.0020(0.0117) Grad: 0.0817  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.0102(0.0116)

Epoch 1 - avg_train_loss: 0.0082  avg_val_loss: 0.0057  time: 621s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0082  avg_val_loss: 0.0057  time: 621s
Epoch 1 - Score: 0.3447
INFO:__main__:Epoch 1 - Score: 0.3447
Epoch 1 - Save Best Score: 0.4835 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4835 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 39m 6s) Loss: 0.0047(0.0047) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 48s) Loss: 0.0094(0.0063) Grad: 0.0867  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 36s) Loss: 0.0042(0.0067) Grad: 0.0373  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 21s) Loss: 0.0029(0.0062) Grad: 0.0858  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.0032(0.0060) Grad: 0.0432  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 8m 2s) Loss: 0.0071(0.0062) Grad: 0.0572  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 51s) Loss: 0.0044(0.0061) Grad: 0.1079  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.0050(0.0062) Grad: 0.0791  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 11s (remain 7m 31s) Loss: 0.0023(0.0063) Grad: 0.0438  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 19s (remain 7m 21s) Loss: 0.0142(0.0062) 

Epoch 2 - avg_train_loss: 0.0056  avg_val_loss: 0.0048  time: 621s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0056  avg_val_loss: 0.0048  time: 621s
Epoch 2 - Score: 0.4037
INFO:__main__:Epoch 2 - Score: 0.4037
Epoch 2 - Save Best Score: 0.5214 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5214 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 38m 6s) Loss: 0.0013(0.0013) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 50s) Loss: 0.0002(0.0045) Grad: 0.0043  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 40s) Loss: 0.0046(0.0046) Grad: 0.0545  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 23s) Loss: 0.0011(0.0046) Grad: 0.0200  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.0065(0.0047) Grad: 0.1210  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 44s (remain 8m 0s) Loss: 0.0064(0.0050) Grad: 0.1123  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 50s) Loss: 0.0043(0.0050) Grad: 0.0526  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 40s) Loss: 0.0113(0.0049) Grad: 0.1507  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 10s (remain 7m 30s) Loss: 0.0072(0.0049) Grad: 0.0878  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0246(0.0048) 

Epoch 3 - avg_train_loss: 0.0046  avg_val_loss: 0.0044  time: 621s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0046  avg_val_loss: 0.0044  time: 621s
Epoch 3 - Score: 0.4413
INFO:__main__:Epoch 3 - Score: 0.4413
Epoch 3 - Save Best Score: 0.5282 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5282 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 39m 56s) Loss: 0.0011(0.0011) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 49s) Loss: 0.0005(0.0043) Grad: 0.0184  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 37s) Loss: 0.0009(0.0040) Grad: 0.0178  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 26s (remain 8m 21s) Loss: 0.0026(0.0040) Grad: 0.0238  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 35s (remain 8m 9s) Loss: 0.0024(0.0041) Grad: 0.0495  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 44s (remain 7m 59s) Loss: 0.0071(0.0040) Grad: 0.0656  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 49s) Loss: 0.0035(0.0039) Grad: 0.0520  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 40s) Loss: 0.0041(0.0040) Grad: 0.0740  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 11s (remain 7m 31s) Loss: 0.0052(0.0039) Grad: 0.0703  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0015(0.0039)

Epoch 4 - avg_train_loss: 0.0040  avg_val_loss: 0.0043  time: 620s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0040  avg_val_loss: 0.0043  time: 620s
Epoch 4 - Score: 0.4787
INFO:__main__:Epoch 4 - Score: 0.4787
Epoch 4 - Save Best Score: 0.5345 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5345 Model


f1 score : 0.47867444001227366
recall score : 0.35856573705179284
precision score : 0.7197785296831745


Score: 0.4787
INFO:__main__:Score: 0.4787
F1 BEST Score: 0.5345
INFO:__main__:F1 BEST Score: 0.5345


f1 score : 0.4173678015606609
recall score : 0.2934019797125433
precision score : 0.7227296746433155


Score: 0.4174
INFO:__main__:Score: 0.4174
F1 BEST Score: 0.4846
INFO:__main__:F1 BEST Score: 0.4846


In [None]:
from google.colab import runtime
runtime.unassign()