In [1]:
# Colab-only: mount Google Drive at /content/drive so paths under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the extra packages this notebook needs.
# NOTE(review): versions are unpinned; the recorded run resolved transformers to 4.32.0 —
# consider pinning so a later re-run behaves the same.
!pip install transformers
!pip install datasets
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m121.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.1 MB/s[0m eta [36m0:00:

In [3]:
# Print the GPU/driver status of the current runtime.
!nvidia-smi

Mon Aug 28 05:08:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    43W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [5]:
import os

# Project root on the mounted Drive; inputs are read from <DIR>/input and
# artifacts are written under <DIR>/output.
DIR = "/content/drive/MyDrive/Competitions/Signate/MUFG2023"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

# Per-experiment artifact folder. The trailing slash is intentional: other cells
# build file names by plain string concatenation onto this value.
OUTPUT_EXP_DIR = DIR + '/output/EXP033/'

# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
# of `if not os.path.exists(...): os.makedirs(...)`.
for _dir in (OUTPUT_DIR, OUTPUT_EXP_DIR):
    os.makedirs(_dir, exist_ok=True)

In [6]:


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment settings, read as plain class attributes by the rest of the notebook."""

    # ---- run control ----
    debug = False
    train = True
    seed = 42
    print_freq = 100
    num_workers = 4
    apex = True  # passed as `enabled=` to the AMP autocast / GradScaler in train_fn

    # ---- backbone ----
    model = "microsoft/deberta-v3-base"
    # Other checkpoints that were listed as alternatives:
    #   microsoft/deberta-base, microsoft/deberta-large, microsoft/deberta-v3-large,
    #   microsoft/deberta-v2-xlarge, roberta-base, roberta-large, roberta-large-mnli,
    #   xlnet-large-cased, albert-base-v2, albert-large-v2, albert-xxlarge-v2,
    #   funnel-transformer/medium, funnel-transformer/large,
    #   google/electra-base-discriminator, google/electra-large-discriminator,
    #   facebook/bart-base, facebook/bart-large, facebook/bart-large-mnli
    gradient_checkpointing = False
    freezing = False
    fc_dropout = 0.2

    # ---- data / target ----
    target = "is_fraud?"
    target_size = 1
    max_len = 256  # overwritten later from the tokenised text lengths
    batch_size = 64

    # ---- cross-validation ----
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

    # ---- optimisation ----
    epochs = 4
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    nth_awp_start_epoch = 1

    # ---- learning-rate schedule ----
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0


# Debug mode shortens the run: two epochs on the first two folds only.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0, 1]

In [7]:
def get_score(labels, outputs):
    """Print F1, recall and precision at a fixed 0.5 threshold and return the F1.

    Args:
        labels: ground-truth binary labels (array-like).
        outputs: predicted scores; must support ``> float`` and ``.astype``
            (e.g. a numpy array).

    Returns:
        F1 score of ``outputs > 0.5`` against ``labels``.
    """
    thresh = 0.5
    # Binarise once and reuse the result for all three metrics.
    y_hat = (outputs > thresh).astype(int)
    f_score = f1_score(labels, y_hat)
    r_score = recall_score(labels, y_hat)
    p_score = precision_score(labels, y_hat)
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    # Return the value already computed instead of scoring a second time.
    return f_score

def get_f1_score(labels, outputs):
    """Return the highest F1 reachable by sweeping the decision threshold.

    Thresholds come from ``np.arange(0.1, 0.80, 0.01)``, each rounded to two
    decimals; a prediction is positive when ``outputs > threshold``.

    Args:
        labels: ground-truth binary labels (array-like).
        outputs: predicted scores; must support ``> float`` and ``.astype``.

    Returns:
        The best F1 found (0.0 when no threshold gives a positive F1).
    """
    best_score = 0.0
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = f1_score(labels, (outputs > thresh).astype(int))
        if score > best_score:
            best_score = score
    # The tracked maximum is exactly the F1 at the best threshold, so it is
    # returned directly rather than recomputed.
    return best_score


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Build the notebook logger that writes bare messages to the console and to a file.

    Args:
        filename: log path without extension; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under ``__name__`` with level INFO,
        one stream handler and one file handler.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # would otherwise stack another pair of handlers and repeat every message.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    # Do not forward records to the root logger: the recorded output shows each
    # message a second time as "INFO:__main__:...", i.e. an ancestor handler
    # was also emitting them.
    logger.propagate = False
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # NOTE(review): setting PYTHONHASHSEED here is expected to affect only
    # processes started afterwards, not this interpreter — confirm if hash
    # ordering matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuned kernel selection can differ between runs; pin it off explicitly.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """Turn off gradient tracking for every parameter of ``module``."""
    for _, param in module.named_parameters():
        param.requires_grad = False

def get_freezed_parameters(module):
    """List the names of ``module``'s parameters whose ``requires_grad`` is False."""
    return [
        name
        for name, parameter in module.named_parameters()
        if not parameter.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """Register an ``optim_bits`` override for the embedding weights found on ``embeddings_path``.

    Looks for ``word_embeddings``, ``position_embeddings`` and
    ``token_type_embeddings`` attributes and registers an override on the
    ``weight`` of each one that exists.

    Reference: https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): ``bnb`` is not imported in the cells visible here (presumably
    ``bitsandbytes``) — this raises NameError as soon as a matching attribute
    is found unless it is imported elsewhere.
    """
    for prefix in ("word", "position", "token_type"):
        attr_name = f"{prefix}_embeddings"
        if not hasattr(embeddings_path, attr_name):
            continue
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
import pandas as pd
import numpy as np


# Read the five competition files from INPUT_DIR; the sample submission has no header row.
train = pd.read_csv(os.path.join(INPUT_DIR,"train.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test.csv"))
card = pd.read_csv(os.path.join(INPUT_DIR, "card.csv"))
user = pd.read_csv(os.path.join(INPUT_DIR, "user.csv"))
sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submit.csv"), header=None)

# Show the shape and the first three rows of each frame, in load order.
for _frame in (train, test, card, user, sub):
    print(_frame.shape)
    display(_frame.head(3))

(471283, 12)


Unnamed: 0,index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,0,1721,0,$2.623,OK,0,209237,Joliet,IL,60436.0,5541,Swipe Transaction
1,1,1629,3,$6.4,OK,0,2568,Edgerton,WI,53534.0,5814,Swipe Transaction
2,2,655,3,$123.5,OK,0,345310,Ridgefield,WA,98642.0,7538,Swipe Transaction


(457958, 11)


Unnamed: 0,index,user_id,card_id,amount,errors?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,471283,541,3,$113.278,OK,324189,Orlando,FL,32821.0,4814,Swipe Transaction
1,471284,655,1,$293.944,OK,81219,Ridgefield,WA,98642.0,7538,Chip Transaction
2,471285,492,0,$47.4,OK,274755,Arlington Heights,IL,60004.0,5719,Swipe Transaction


(416, 10)


Unnamed: 0,user_id,card_id,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed
0,39,0,Visa,Debit,09/2021,YES,1,$17117,05/2007,2010
1,39,1,Amex,Credit,11/2024,YES,2,$5400,10/2015,2015
2,41,0,Discover,Credit,03/2022,YES,2,$14800,12/2010,2011


(97, 17)


Unnamed: 0,user_id,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,39,57,64,1962,12,Female,442 Burns Boulevard,Mansfield,MA,2048,42.02,-71.21,$37407,$76274,$102611,698,2
1,41,39,66,1980,10,Female,3863 River Avenue,Lincoln,CA,95648,38.93,-121.25,$21829,$44506,$57994,849,3
2,47,40,67,1979,5,Female,8799 Elm Avenue,Mckinney,TX,75069,33.2,-96.65,$24684,$50329,$76759,625,4


(457958, 2)


Unnamed: 0,0,1
0,471283,0
1,471284,1
2,471285,0


In [10]:
# Left-join card attributes (keyed by user and card) and then user attributes
# (keyed by user) onto every training transaction.
_card_keys = ["user_id", "card_id"]
train = train.merge(card, how="left", on=_card_keys)
train = train.merge(user, how="left", on="user_id")

In [11]:
# Two-digit month code ("01".."12") -> English month name.
_MONTH_NAMES = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
month_dict = {f"{number:02d}": name for number, name in enumerate(_MONTH_NAMES, start=1)}

def _split_month_year(df, column):
  """Split the "MM/YYYY" strings in ``df[column]`` into two new string columns.

  Adds ``<column>_month`` and ``<column>_years`` to ``df`` in place and
  returns ``df``. Raises ValueError if the split does not produce exactly two
  parts.
  """
  # str.split(expand=True) builds the two-column frame in one vectorised call,
  # instead of constructing a pd.Series per row via .apply(pd.Series).
  parts = df[column].str.split('/', expand=True)
  parts.columns = ["month","years"]
  df[f"{column}_month"] = parts["month"].astype(str)
  df[f"{column}_years"] = parts["years"].astype(str)
  return df

def get_expires_values(df):
  """Add ``expires_month`` / ``expires_years`` string columns derived from ``df["expires"]``.

  Mutates ``df`` in place and returns it.
  """
  return _split_month_year(df, "expires")

def get_acct_open_date_values(df):
  """Add ``acct_open_date_month`` / ``acct_open_date_years`` string columns derived from ``df["acct_open_date"]``.

  Mutates ``df`` in place and returns it.
  """
  return _split_month_year(df, "acct_open_date")

# Derive month/year columns for both card date fields, then replace the
# two-digit month codes with month names via month_dict.
train = get_acct_open_date_values(get_expires_values(train))
for _month_col in ("expires_month", "acct_open_date_month"):
    train[_month_col] = train[_month_col].map(month_dict)

In [12]:
# Replace every missing value with the literal 'unknown' so the string
# concatenations below never propagate NaN.
train = train.fillna('unknown')

# One text per transaction, made of three sections. Every "+" is written out
# explicitly (no implicit literal concatenation across line breaks).
_merchant_text = (
    "[MERCHANT]" + train["amount"]
    + "[SEP]" + train["errors?"]
    + "[SEP]" + train["merchant_city"]
    + "[SEP]" + train["merchant_state"]
    + "[SEP]" + train["use_chip"]
    + "[SEP]"
)
_card_text = (
    "[CARD]" + train["card_brand"]
    + "[SEP]" + train["card_type"]
    + "[SEP]" + train["expires_month"] + " " + train["expires_years"]
    + "[SEP]" + train["has_chip"]
    + "[SEP]" + train["acct_open_date_month"] + " " + train["acct_open_date_years"]
    + "[SEP]" + train["year_pin_last_changed"].astype(str)
    + "[SEP]"
)
_user_text = (
    "[USER]" + train["current_age"].astype(str) + " year old " + train["gender"]
    + "[SEP]" + "retired at age " + train["retirement_age"].astype(str)
    + "[SEP]" + train["address"]
    + "[SEP]" + train["city"]
    + "[SEP]" + train["state"]
    + "[SEP]" + train["per_capita_income_zipcode"]
    + "[SEP]" + train["yearly_income_person"]
    + "[SEP]" + train["total_debt"]
)
train["texts"] = _merchant_text + _card_text + _user_text

In [13]:
# Stratified fold assignment on the target column.
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
# skf.split yields *positional* row indices. Collect the fold ids in a plain
# array and attach it as a column, rather than writing them through the
# label-based .loc (which is only correct while the index is 0..n-1).
_fold_ids = np.full(len(train), -1, dtype=int)
for fold, ( _, val_) in enumerate(skf.split(train, train[CFG.target])):
    _fold_ids[val_] = fold
assert (_fold_ids >= 0).all(), "every row must belong to exactly one validation fold"
train["kfold"] = _fold_ids

# Debug mode: show fold sizes before and after shrinking to 500 random rows.
if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [14]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# Register the three section markers used in `texts` as additional special tokens.
# NOTE(review): no resize_token_embeddings call is visible in this notebook;
# confirm the new token ids fall inside the backbone's embedding table
# (the logged config reports vocab_size 128100).
special_tokens_dict = {'additional_special_tokens': ["[MERCHANT]", "[CARD]", "[USER]"]}
tokenizer.add_special_tokens(special_tokens_dict)
# Persist the extended tokenizer next to the other experiment artifacts.
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every training text, tokenised with add_special_tokens=False.
lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tqdm(train['texts'].fillna("").values, total=len(train))
]
# NOTE(review): the margin of 23 is labelled "cls" but its derivation is not
# visible here — confirm it is intentional.
CFG.max_len = max(lengths) + 23 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 471283/471283 [01:28<00:00, 5311.63it/s]
max_len: 94
INFO:__main__:max_len: 94


In [16]:
# ====================================================
# Dataset
# ====================================================
# def prepare_input(cfg, text):
#     inputs = cfg.tokenizer(text,
#                            add_special_tokens=True,
#                            max_length=cfg.max_len,
#                            padding="max_length",
#                            return_offsets_mapping=False,
#                            truncation=True)
#     for k, v in inputs.items():
#         inputs[k] = torch.tensor(v, dtype=torch.long)
#     return inputs

def prepare_input(cfg, text):
    """Tokenise one text into fixed-length ``torch.long`` tensors.

    Args:
        cfg: configuration object providing ``tokenizer`` and ``max_len``.
        text: the string to encode.

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor, padded/truncated to ``cfg.max_len``.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        # Read the length from the cfg that was passed in, not the global CFG.
        max_length=cfg.max_len,
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        padding="max_length",
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Training dataset: yields ``(tokenised inputs, half-precision label)`` per row."""

    def __init__(self, cfg, df):
        """
        Args:
            cfg: configuration object; ``cfg.target`` names the label column
                and ``cfg`` is forwarded to ``prepare_input``.
            df: DataFrame with a ``texts`` column and the label column.
        """
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Use the cfg passed in rather than the global CFG, so the dataset
        # honours whatever configuration it was built with.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        # Labels are emitted as float16 for the training loop.
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    """Crop every tensor in the batch along dim 1 to the largest per-row attention-mask sum.

    Mutates ``inputs`` in place and returns it.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

class ValidDataset(Dataset):
    """Validation dataset: yields ``(tokenised inputs, float32 label)`` per row."""

    def __init__(self, cfg, df):
        """
        Args:
            cfg: configuration object; ``cfg.target`` names the label column
                and ``cfg`` is forwarded to ``prepare_input``.
            df: DataFrame with a ``texts`` column and the label column.
        """
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Use the cfg passed in rather than the global CFG, so the dataset
        # honours whatever configuration it was built with.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    """Trim all batch tensors (dim 1) to the maximum attention-mask row sum; in place.

    NOTE(review): this cell defines ``collate`` twice with the same body; this
    later definition is the one in effect — one of the two can be removed.
    """
    keep = int(inputs["attention_mask"].sum(axis=1).max())
    inputs.update({name: tensor[:, :keep] for name, tensor in inputs.items()})
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of the hidden states over dim 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the trailing dimension so it lines up with the hidden states.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = (last_hidden_state * mask).sum(1)
        # Clamp the denominator so rows with an all-zero mask do not divide by zero.
        counts = mask.sum(1).clamp(min=1e-9)
        return summed / counts

class MaxPooling(nn.Module):
    """Element-wise max of the hidden states over dim 1, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Masked positions are pushed to -1e4 (on a copy) so they cannot win the max.
        filled = last_hidden_state.masked_fill(mask == 0, -1e4)
        return filled.max(dim=1).values


class CustomModel(nn.Module):
    """Transformer backbone followed by masked mean pooling, LayerNorm and a linear head.

    ``forward`` returns raw logits; no sigmoid is applied.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: configuration object; reads ``model``, ``gradient_checkpointing``,
                ``freezing`` and ``target_size``.
            config_path: optional path to a config object saved with
                ``torch.save``. When None, the config for ``cfg.model`` is
                fetched and all dropout probabilities are set to 0.
            pretrained: when True load the pretrained backbone weights; when
                False build the backbone from the config only.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Disable every dropout in the backbone.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE: torch.load unpickles arbitrary objects — only load config
            # files produced by this notebook.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; building an
            # architecture without pretrained weights goes through from_config.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # Lazy iterator over the parameters that remain trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # Kept as an attribute although forward() does not apply it.
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        """Initialise ``module`` by type: normal(0, initializer_range) weights for
        Linear/Embedding (zero bias / zero padding row), and unit weight with
        zero bias for LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and mean-pool its first output with the attention mask."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return logits computed as ``fc(layer_norm1(feature(inputs)))``."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value together with a count-weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (both truncated to integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <estimate>)'`` for work started at ``since``.

    ``percent`` is the completed fraction; the total duration is extrapolated
    as elapsed / percent.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch with AMP and return the average loss.

    Args:
        fold: fold id (unused here; kept for the caller's signature).
        train_loader: iterable of ``(inputs, labels)`` batches.
        model: module called as ``model(inputs)`` returning logits.
        criterion: loss taking ``(logits, labels)`` shaped ``(-1, 1)``.
        optimizer: optimizer stepped through the GradScaler.
        epoch: zero-based epoch index, used for progress output only.
        scheduler: LR scheduler, stepped per optimizer step when
            ``CFG.batch_scheduler`` is set.
        device: device the batch is moved to.

    Returns:
        The sample-weighted mean of the per-batch loss values.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    accum_steps = CFG.gradient_accumulation_steps
    # Reported until the first optimizer step has produced a real gradient norm.
    grad_norm = float('nan')
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        # Compute the loss in float32: under autocast the logits can be half
        # precision and the training labels are float16.
        loss = criterion(y_preds.view(-1, 1).float(), labels.view(-1, 1).float())
        if accum_steps > 1:
            loss = loss / accum_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % accum_steps == 0:
            # Unscale and clip once per optimizer step. Calling unscale_ on
            # every micro-batch raises when accumulating gradients, because it
            # may only be called once between scaler.update() calls.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the supported accessor for the current LR.
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader``.

    Args:
        valid_loader: iterable of ``(inputs, labels)`` batches.
        model: module called as ``model(inputs)`` returning logits.
        criterion: loss taking ``(logits, labels)`` shaped ``(-1, 1)``.
        device: device the batch is moved to.

    Returns:
        ``(average loss, predictions)`` where predictions is a flat numpy
        array of sigmoid probabilities in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        # The validation loss is reported as-is: gradient accumulation is a
        # training-only concern and must not rescale it.
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    # Stack the per-batch outputs and flatten to one value per prediction.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Return the sigmoid outputs of ``model`` for every batch of ``test_loader``, concatenated along axis 0."""
    batch_probs = []
    model.eval()
    model.to(device)
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    Rows with ``kfold != fold`` are used for training and rows with
    ``kfold == fold`` for validation. After every epoch the model is scored,
    and whenever the best-threshold F1 improves the model weights and the
    validation predictions are saved under OUTPUT_EXP_DIR.

    Args:
        folds: DataFrame containing ``texts``, the target column and ``kfold``.
        fold: id of the fold held out for validation.

    Returns:
        The validation rows with an added ``pred`` column holding the
        predictions of the best epoch.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: backbone with weight decay, backbone
        bias/LayerNorm without decay, and every parameter whose name does not
        contain "model" (the head) at ``decoder_lr`` without decay."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the warmup scheduler selected by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f"unsupported scheduler: {cfg.scheduler!r}")
        return scheduler

    # Count the optimizer steps that will really happen: the loader drops the
    # last partial batch, and one step is taken every
    # gradient_accumulation_steps batches.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = -1.
    best_predictions = None
    checkpoint_path = OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring: F1 at the fixed 0.5 threshold, and the best F1 over the threshold sweep.
        # (Named best_f1 so the sklearn `f1_score` function is not shadowed.)
        score = get_score(valid_labels, predictions)
        best_f1 = get_f1_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < best_f1:
            best_score = best_f1
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        checkpoint_path)

    # The best epoch's predictions are kept in memory, so the checkpoint (which
    # also holds all model weights) does not have to be read back.
    valid_folds['pred'] = best_predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [20]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the fixed-threshold F1 and the best-threshold F1 of the ``pred`` column against the target."""
        labels = oof_df[CFG.target].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        # Named best_f1 so the sklearn `f1_score` function is not shadowed.
        best_f1 = get_f1_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'F1 BEST Score: {best_f1:<.4f}')

    if CFG.train:
        # Collect the per-fold out-of-fold frames and concatenate once at the
        # end, instead of growing a DataFrame from an empty one inside the loop.
        oof_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(train, fold)
            oof_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
        oof_df = pd.concat(oof_frames).reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch: [1][0/5891] Elapsed 0m 3s (remain 373m 44s) Loss: 0.8848(0.8848) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 12s (remain 12m 3s) Loss: 0.2393(0.2644) Grad: 0.8625  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 21s (remain 10m 6s) Loss: 0.2352(0.2489) Grad: 0.7933  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 30s (remain 9m 22s) Loss: 0.2239(0.2399) Grad: 1.1491  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 39s (remain 8m 55s) Loss: 0.1743(0.2362) Grad: 0.9238  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 47s (remain 8m 35s) Loss: 0.2489(0.2325) Grad: 1.0065  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 56s (remain 8m 18s) Loss: 0.1338(0.2310) Grad: 0.8067  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 5s (remain 8m 4s) Loss: 0.2568(0.2297) Grad: 1.0734  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 14s (remain 7m 51s) Loss: 0.2959(0.2297) Grad: 0.7678  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 22s (remain 7m 39s) Loss: 0.2646(0.226

Epoch 1 - avg_train_loss: 0.1932  avg_val_loss: 0.1767  time: 621s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1932  avg_val_loss: 0.1767  time: 621s
Epoch 1 - Score: 0.3457
INFO:__main__:Epoch 1 - Score: 0.3457
Epoch 1 - Save Best Score: 0.4912 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4912 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 34m 32s) Loss: 0.2262(0.2262) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 48s) Loss: 0.0773(0.1576) Grad: 1.3671  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 34s) Loss: 0.1340(0.1688) Grad: 0.9092  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 26s (remain 8m 20s) Loss: 0.1851(0.1671) Grad: 1.1504  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 35s (remain 8m 8s) Loss: 0.0538(0.1667) Grad: 1.0122  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 7m 58s) Loss: 0.1209(0.1687) Grad: 0.4834  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 48s) Loss: 0.2646(0.1671) Grad: 1.0370  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 1s (remain 7m 38s) Loss: 0.1476(0.1677) Grad: 0.5801  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 10s (remain 7m 29s) Loss: 0.2115(0.1672) Grad: 1.2820  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 19s (remain 7m 20s) Loss: 0.2406(0.1673)

Epoch 2 - avg_train_loss: 0.1641  avg_val_loss: 0.1593  time: 617s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1641  avg_val_loss: 0.1593  time: 617s
Epoch 2 - Score: 0.4642
INFO:__main__:Epoch 2 - Score: 0.4642
Epoch 2 - Save Best Score: 0.5380 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5380 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 36m 39s) Loss: 0.2107(0.2107) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 48s) Loss: 0.1272(0.1436) Grad: 0.7240  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 38s) Loss: 0.1620(0.1477) Grad: 0.7011  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 22s) Loss: 0.2480(0.1499) Grad: 1.2341  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.1383(0.1544) Grad: 0.7264  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.0699(0.1545) Grad: 0.7444  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 50s) Loss: 0.1860(0.1532) Grad: 1.0439  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.2294(0.1543) Grad: 1.1175  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 11s (remain 7m 31s) Loss: 0.1454(0.1538) Grad: 0.5740  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 19s (remain 7m 21s) Loss: 0.2318(0.1534)

Epoch 3 - avg_train_loss: 0.1519  avg_val_loss: 0.1559  time: 616s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1519  avg_val_loss: 0.1559  time: 616s
Epoch 3 - Score: 0.4872
INFO:__main__:Epoch 3 - Score: 0.4872
Epoch 3 - Save Best Score: 0.5536 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5536 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 37m 22s) Loss: 0.0459(0.0459) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 51s) Loss: 0.1970(0.1495) Grad: 1.0647  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 39s) Loss: 0.1027(0.1422) Grad: 0.5164  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 27s (remain 8m 26s) Loss: 0.1041(0.1445) Grad: 0.6739  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 36s (remain 8m 14s) Loss: 0.2598(0.1409) Grad: 1.2162  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 44s (remain 8m 3s) Loss: 0.2125(0.1419) Grad: 0.8632  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 53s) Loss: 0.1005(0.1426) Grad: 1.6105  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.1749(0.1430) Grad: 1.4150  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.1542(0.1434) Grad: 0.8561  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0490(0.1433)

Epoch 4 - avg_train_loss: 0.1421  avg_val_loss: 0.1560  time: 618s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1421  avg_val_loss: 0.1560  time: 618s
Epoch 4 - Score: 0.5131
INFO:__main__:Epoch 4 - Score: 0.5131


f1 score : 0.4872346970163027
recall score : 0.36408213300643577
precision score : 0.7362875735977689


Score: 0.4872
INFO:__main__:Score: 0.4872
F1 BEST Score: 0.5536
INFO:__main__:F1 BEST Score: 0.5536
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 40m 45s) Loss: 0.6865(0.6865) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 8m 59s) Loss: 0.2625(0.2731) Grad: 1.1978  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 18s (remain 8m 35s) Loss: 0.1688(0.2532) Grad: 0.4189  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 27s (remain 8m 24s) Loss: 0.1315(0.2407) Grad: 0.7211  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 36s (remain 8m 15s) Loss: 0.2771(0.2329) Grad: 1.0342  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 45s (remain 8m 4s) Loss: 0.2830(0.2290) Grad: 0.8104  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 53s (remain 7m 53s) Loss: 0.1339(0.2219) Grad: 0.7344  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 2s (remain 7m 43s) Loss: 0.3621(0.2192) Grad: 1.1331  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 11s (remain 7m 34s) Loss: 0.1576(0.2168) Grad: 1.0213  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 20s (remain 7m 25s) Loss: 0.1765(0.2141)

Epoch 1 - avg_train_loss: 0.1882  avg_val_loss: 0.1689  time: 622s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1882  avg_val_loss: 0.1689  time: 622s
Epoch 1 - Score: 0.4351
INFO:__main__:Epoch 1 - Score: 0.4351
Epoch 1 - Save Best Score: 0.5114 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5114 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 41m 37s) Loss: 0.1236(0.1236) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 54s) Loss: 0.1635(0.1599) Grad: 1.4350  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 34s) Loss: 0.1113(0.1637) Grad: 0.6017  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 22s) Loss: 0.0909(0.1647) Grad: 0.4736  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.2600(0.1620) Grad: 0.7906  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.1187(0.1626) Grad: 0.6261  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 51s) Loss: 0.0698(0.1643) Grad: 0.8030  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 41s) Loss: 0.1182(0.1644) Grad: 0.4200  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.2878(0.1655) Grad: 1.4307  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.0881(0.1649)

Epoch 2 - avg_train_loss: 0.1624  avg_val_loss: 0.1598  time: 620s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1624  avg_val_loss: 0.1598  time: 620s
Epoch 2 - Score: 0.4640
INFO:__main__:Epoch 2 - Score: 0.4640
Epoch 2 - Save Best Score: 0.5389 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5389 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 41m 34s) Loss: 0.1956(0.1956) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 54s) Loss: 0.0668(0.1565) Grad: 0.9570  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 38s) Loss: 0.1880(0.1521) Grad: 0.8088  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 25s) Loss: 0.2062(0.1536) Grad: 0.9529  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 36s (remain 8m 13s) Loss: 0.1175(0.1531) Grad: 0.8823  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 44s (remain 8m 2s) Loss: 0.1276(0.1512) Grad: 1.0819  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 51s) Loss: 0.2700(0.1521) Grad: 1.8190  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.2693(0.1512) Grad: 1.8018  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.1161(0.1502) Grad: 0.6138  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 19s (remain 7m 22s) Loss: 0.0535(0.1510)

Epoch 3 - avg_train_loss: 0.1510  avg_val_loss: 0.1559  time: 618s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1510  avg_val_loss: 0.1559  time: 618s
Epoch 3 - Score: 0.4991
INFO:__main__:Epoch 3 - Score: 0.4991
Epoch 3 - Save Best Score: 0.5532 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5532 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 37m 57s) Loss: 0.0837(0.0837) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 49s) Loss: 0.1224(0.1424) Grad: 1.0889  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 36s) Loss: 0.0731(0.1390) Grad: 0.6585  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 27s (remain 8m 25s) Loss: 0.1285(0.1411) Grad: 0.6558  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 35s (remain 8m 12s) Loss: 0.1453(0.1427) Grad: 0.8279  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 44s (remain 8m 2s) Loss: 0.2289(0.1433) Grad: 1.0996  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 52s) Loss: 0.2335(0.1451) Grad: 1.5236  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.0936(0.1444) Grad: 0.8700  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 11s (remain 7m 33s) Loss: 0.1094(0.1444) Grad: 0.9235  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.1992(0.1439)

Epoch 4 - avg_train_loss: 0.1415  avg_val_loss: 0.1568  time: 622s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1415  avg_val_loss: 0.1568  time: 622s
Epoch 4 - Score: 0.5090
INFO:__main__:Epoch 4 - Score: 0.5090
Epoch 4 - Save Best Score: 0.5537 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5537 Model


f1 score : 0.5090444357058592
recall score : 0.39672080907140667
precision score : 0.7100932528798684


Score: 0.5090
INFO:__main__:Score: 0.5090
F1 BEST Score: 0.5537
INFO:__main__:F1 BEST Score: 0.5537
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 41m 31s) Loss: 0.4429(0.4429) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 9m 0s) Loss: 0.2064(0.2514) Grad: 0.5387  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 18s (remain 8m 41s) Loss: 0.1268(0.2405) Grad: 0.7900  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 27s (remain 8m 29s) Loss: 0.3472(0.2296) Grad: 1.5889  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 36s (remain 8m 16s) Loss: 0.2114(0.2262) Grad: 0.3918  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 45s (remain 8m 5s) Loss: 0.0967(0.2212) Grad: 1.5708  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 53s (remain 7m 55s) Loss: 0.1389(0.2218) Grad: 0.8548  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 2s (remain 7m 45s) Loss: 0.0706(0.2203) Grad: 0.8289  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 11s (remain 7m 35s) Loss: 0.2278(0.2173) Grad: 1.5615  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 20s (remain 7m 25s) Loss: 0.2710(0.2156) 

Epoch 1 - avg_train_loss: 0.1886  avg_val_loss: 0.1685  time: 621s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1886  avg_val_loss: 0.1685  time: 621s
Epoch 1 - Score: 0.4370
INFO:__main__:Epoch 1 - Score: 0.4370
Epoch 1 - Save Best Score: 0.5106 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5106 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 38m 15s) Loss: 0.0867(0.0867) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 51s) Loss: 0.1879(0.1729) Grad: 1.0137  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 35s) Loss: 0.1044(0.1698) Grad: 0.4768  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 24s) Loss: 0.0679(0.1673) Grad: 0.6711  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 36s (remain 8m 14s) Loss: 0.1185(0.1658) Grad: 0.5835  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 44s (remain 8m 3s) Loss: 0.1780(0.1665) Grad: 0.9377  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 53s) Loss: 0.1353(0.1664) Grad: 1.0990  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 44s) Loss: 0.1884(0.1659) Grad: 1.2226  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 11s (remain 7m 34s) Loss: 0.0671(0.1653) Grad: 0.7581  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 20s (remain 7m 25s) Loss: 0.2664(0.1661)

Epoch 2 - avg_train_loss: 0.1623  avg_val_loss: 0.1593  time: 621s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1623  avg_val_loss: 0.1593  time: 621s
Epoch 2 - Score: 0.4414
INFO:__main__:Epoch 2 - Score: 0.4414
Epoch 2 - Save Best Score: 0.5352 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5352 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 40m 23s) Loss: 0.0698(0.0698) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 48s) Loss: 0.1174(0.1446) Grad: 0.8527  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 42s) Loss: 0.1876(0.1507) Grad: 1.4889  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 34s) Loss: 0.1210(0.1478) Grad: 1.0454  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 36s (remain 8m 20s) Loss: 0.2415(0.1527) Grad: 0.7968  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 45s (remain 8m 8s) Loss: 0.3384(0.1529) Grad: 1.5579  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 54s (remain 7m 57s) Loss: 0.0978(0.1537) Grad: 0.7124  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 3s (remain 7m 47s) Loss: 0.1279(0.1554) Grad: 1.0416  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 12s (remain 7m 39s) Loss: 0.1315(0.1559) Grad: 1.2481  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 21s (remain 7m 30s) Loss: 0.0799(0.1550)

Epoch 3 - avg_train_loss: 0.1514  avg_val_loss: 0.1544  time: 622s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1514  avg_val_loss: 0.1544  time: 622s
Epoch 3 - Score: 0.4809
INFO:__main__:Epoch 3 - Score: 0.4809
Epoch 3 - Save Best Score: 0.5502 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5502 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 39m 43s) Loss: 0.1405(0.1405) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 9m 10s) Loss: 0.1536(0.1445) Grad: 0.9814  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 57s) Loss: 0.1387(0.1491) Grad: 0.8059  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 28s (remain 8m 42s) Loss: 0.1721(0.1486) Grad: 0.7292  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 37s (remain 8m 27s) Loss: 0.1946(0.1467) Grad: 1.2900  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 45s (remain 8m 13s) Loss: 0.0979(0.1452) Grad: 0.4940  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 54s (remain 8m 2s) Loss: 0.1467(0.1439) Grad: 1.0415  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 3s (remain 7m 51s) Loss: 0.1636(0.1440) Grad: 1.0207  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 12s (remain 7m 40s) Loss: 0.1350(0.1419) Grad: 8.1935  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 21s (remain 7m 30s) Loss: 0.1998(0.1414)

Epoch 4 - avg_train_loss: 0.1424  avg_val_loss: 0.1549  time: 623s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1424  avg_val_loss: 0.1549  time: 623s
Epoch 4 - Score: 0.5079
INFO:__main__:Epoch 4 - Score: 0.5079
Epoch 4 - Save Best Score: 0.5539 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5539 Model


f1 score : 0.5079022283302248
recall score : 0.3963536080894745
precision score : 0.7068306010928962


Score: 0.5079
INFO:__main__:Score: 0.5079
F1 BEST Score: 0.5539
INFO:__main__:F1 BEST Score: 0.5539
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 41m 38s) Loss: 0.5479(0.5479) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 9m 16s) Loss: 0.3242(0.2608) Grad: 1.2394  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 19s (remain 9m 1s) Loss: 0.1287(0.2391) Grad: 1.8245  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 28s (remain 8m 40s) Loss: 0.1863(0.2317) Grad: 0.8837  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 36s (remain 8m 26s) Loss: 0.2483(0.2284) Grad: 1.0017  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 45s (remain 8m 13s) Loss: 0.3113(0.2246) Grad: 0.8633  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 54s (remain 8m 2s) Loss: 0.1897(0.2224) Grad: 0.3128  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 3s (remain 7m 51s) Loss: 0.1852(0.2217) Grad: 0.9894  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 12s (remain 7m 41s) Loss: 0.1954(0.2194) Grad: 0.3461  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 21s (remain 7m 31s) Loss: 0.2107(0.2189) 

Epoch 1 - avg_train_loss: 0.2372  avg_val_loss: 0.2519  time: 625s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2372  avg_val_loss: 0.2519  time: 625s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 37m 24s) Loss: 0.2343(0.2343) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 53s) Loss: 0.2764(0.2294) Grad: 0.4476  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 37s) Loss: 0.1941(0.2379) Grad: 0.6678  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 29s) Loss: 0.1514(0.2434) Grad: 0.9862  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 36s (remain 8m 20s) Loss: 0.2343(0.2415) Grad: 0.0304  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 45s (remain 8m 8s) Loss: 0.1921(0.2421) Grad: 0.4993  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 54s (remain 7m 57s) Loss: 0.1503(0.2460) Grad: 0.9418  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 3s (remain 7m 47s) Loss: 0.2742(0.2472) Grad: 0.2179  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 12s (remain 7m 37s) Loss: 0.4380(0.2474) Grad: 2.0036  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 20s (remain 7m 28s) Loss: 0.1953(0.2485)

Epoch 2 - avg_train_loss: 0.2518  avg_val_loss: 0.2523  time: 623s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2518  avg_val_loss: 0.2523  time: 623s
Epoch 2 - Score: 0.0000
INFO:__main__:Epoch 2 - Score: 0.0000


Epoch: [3][0/5891] Elapsed 0m 0s (remain 38m 3s) Loss: 0.1978(0.1978) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 51s) Loss: 0.1080(0.2415) Grad: 1.3544  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 34s) Loss: 0.3159(0.2467) Grad: 0.6948  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 22s) Loss: 0.3137(0.2512) Grad: 0.5618  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 35s (remain 8m 11s) Loss: 0.2345(0.2535) Grad: 0.3078  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 44s (remain 8m 1s) Loss: 0.1129(0.2535) Grad: 1.5153  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 53s (remain 7m 51s) Loss: 0.2744(0.2556) Grad: 0.1343  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 2s (remain 7m 42s) Loss: 0.2749(0.2541) Grad: 0.2734  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 11s (remain 7m 32s) Loss: 0.2742(0.2561) Grad: 0.0862  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 20s (remain 7m 23s) Loss: 0.1538(0.2557) 

Epoch 3 - avg_train_loss: 0.2517  avg_val_loss: 0.2517  time: 623s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2517  avg_val_loss: 0.2517  time: 623s
Epoch 3 - Score: 0.0000
INFO:__main__:Epoch 3 - Score: 0.0000


Epoch: [4][0/5891] Elapsed 0m 0s (remain 38m 5s) Loss: 0.2747(0.2747) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 53s) Loss: 0.3145(0.2547) Grad: 0.6520  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 36s) Loss: 0.3950(0.2560) Grad: 1.5049  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 27s (remain 8m 24s) Loss: 0.1528(0.2521) Grad: 1.0548  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 36s (remain 8m 14s) Loss: 0.4407(0.2490) Grad: 2.0733  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 45s (remain 8m 4s) Loss: 0.2345(0.2513) Grad: 0.1990  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 53s (remain 7m 55s) Loss: 0.1136(0.2526) Grad: 1.5350  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 2s (remain 7m 45s) Loss: 0.3157(0.2523) Grad: 0.6747  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 11s (remain 7m 36s) Loss: 0.3584(0.2499) Grad: 1.2318  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 20s (remain 7m 26s) Loss: 0.2754(0.2504) 

Epoch 4 - avg_train_loss: 0.2517  avg_val_loss: 0.2517  time: 623s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2517  avg_val_loss: 0.2517  time: 623s
Epoch 4 - Score: 0.0000
INFO:__main__:Epoch 4 - Score: 0.0000


f1 score : 0.0
recall score : 0.0
precision score : 0.0


Score: 0.0000
INFO:__main__:Score: 0.0000
F1 BEST Score: 0.0000
INFO:__main__:F1 BEST Score: 0.0000
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 41m 55s) Loss: 0.7554(0.7554) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 9s (remain 8m 53s) Loss: 0.3694(0.2707) Grad: 2.2052  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 18s (remain 8m 34s) Loss: 0.2903(0.2439) Grad: 2.1884  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 27s (remain 8m 23s) Loss: 0.2378(0.2327) Grad: 0.7689  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 35s (remain 8m 12s) Loss: 0.2012(0.2298) Grad: 0.4638  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 44s (remain 8m 3s) Loss: 0.2426(0.2274) Grad: 0.9667  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 0m 53s (remain 7m 54s) Loss: 0.1608(0.2238) Grad: 1.1020  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 2s (remain 7m 44s) Loss: 0.0923(0.2207) Grad: 1.2714  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 11s (remain 7m 35s) Loss: 0.1227(0.2198) Grad: 1.4426  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 20s (remain 7m 26s) Loss: 0.2250(0.2199)

Epoch 1 - avg_train_loss: 0.1909  avg_val_loss: 0.1723  time: 624s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1909  avg_val_loss: 0.1723  time: 624s
Epoch 1 - Score: 0.3900
INFO:__main__:Epoch 1 - Score: 0.3900
Epoch 1 - Save Best Score: 0.4960 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4960 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 39m 11s) Loss: 0.1598(0.1598) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 9s (remain 8m 52s) Loss: 0.2207(0.1686) Grad: 0.7738  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 18s (remain 8m 42s) Loss: 0.1763(0.1711) Grad: 0.5409  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 27s (remain 8m 27s) Loss: 0.1218(0.1671) Grad: 0.8488  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 36s (remain 8m 17s) Loss: 0.1371(0.1659) Grad: 0.6175  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 45s (remain 8m 6s) Loss: 0.1876(0.1673) Grad: 0.8873  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 0m 53s (remain 7m 55s) Loss: 0.1906(0.1680) Grad: 1.3438  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 2s (remain 7m 44s) Loss: 0.1736(0.1687) Grad: 0.6384  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 11s (remain 7m 36s) Loss: 0.1431(0.1695) Grad: 0.8246  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 20s (remain 7m 26s) Loss: 0.2710(0.1689)

Epoch 2 - avg_train_loss: 0.1635  avg_val_loss: 0.1624  time: 624s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1635  avg_val_loss: 0.1624  time: 624s
Epoch 2 - Score: 0.4563
INFO:__main__:Epoch 2 - Score: 0.4563
Epoch 2 - Save Best Score: 0.5283 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5283 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 39m 2s) Loss: 0.1002(0.1002) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 9s (remain 8m 58s) Loss: 0.0443(0.1542) Grad: 0.4624  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 18s (remain 8m 46s) Loss: 0.1659(0.1519) Grad: 0.6900  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 27s (remain 8m 37s) Loss: 0.1026(0.1521) Grad: 0.4932  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 36s (remain 8m 22s) Loss: 0.2433(0.1530) Grad: 1.5564  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 45s (remain 8m 9s) Loss: 0.1833(0.1548) Grad: 1.6290  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 0m 54s (remain 7m 57s) Loss: 0.2048(0.1559) Grad: 0.8495  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 3s (remain 7m 47s) Loss: 0.2097(0.1552) Grad: 1.2535  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 11s (remain 7m 36s) Loss: 0.2064(0.1553) Grad: 0.7726  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 20s (remain 7m 27s) Loss: 0.2952(0.1547) 

Epoch 3 - avg_train_loss: 0.1522  avg_val_loss: 0.1591  time: 623s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1522  avg_val_loss: 0.1591  time: 623s
Epoch 3 - Score: 0.4729
INFO:__main__:Epoch 3 - Score: 0.4729
Epoch 3 - Save Best Score: 0.5413 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5413 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 39m 35s) Loss: 0.0914(0.0914) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 9s (remain 8m 58s) Loss: 0.0604(0.1482) Grad: 0.7479  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 18s (remain 8m 44s) Loss: 0.0938(0.1430) Grad: 0.5413  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 27s (remain 8m 34s) Loss: 0.1353(0.1420) Grad: 0.5954  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 36s (remain 8m 20s) Loss: 0.1422(0.1426) Grad: 0.8412  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 45s (remain 8m 9s) Loss: 0.2081(0.1419) Grad: 1.0069  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 0m 54s (remain 7m 58s) Loss: 0.1716(0.1422) Grad: 0.7565  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 3s (remain 7m 48s) Loss: 0.1427(0.1429) Grad: 1.2995  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 12s (remain 7m 38s) Loss: 0.1512(0.1428) Grad: 0.7697  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 21s (remain 7m 28s) Loss: 0.1116(0.1426)

Epoch 4 - avg_train_loss: 0.1431  avg_val_loss: 0.1593  time: 623s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1431  avg_val_loss: 0.1593  time: 623s
Epoch 4 - Score: 0.4908
INFO:__main__:Epoch 4 - Score: 0.4908


f1 score : 0.47289719626168225
recall score : 0.34891204413116766
precision score : 0.7335695876288659


Score: 0.4729
INFO:__main__:Score: 0.4729
F1 BEST Score: 0.5413
INFO:__main__:F1 BEST Score: 0.5413


f1 score : 0.4248724820610357
recall score : 0.3012166344886764
precision score : 0.7207596978807655


Score: 0.4249
INFO:__main__:Score: 0.4249
F1 BEST Score: 0.4886
INFO:__main__:F1 BEST Score: 0.4886


In [None]:
# Final cell: ask Colab to unassign this runtime once everything above has run.
# NOTE(review): presumably placed here so the GPU-backed session is released
# after the long training run instead of sitting idle — confirm this is intended,
# since anything not already persisted outside the runtime may be lost afterwards.
from google.colab import runtime
runtime.unassign()