In [1]:
# Mount Google Drive into the Colab runtime; every path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the Hugging Face stack (plus sentencepiece) into the runtime.
# NOTE(review): versions are not pinned; the captured output below shows
# transformers 4.33.1 being resolved for this run.
!pip install transformers
!pip install datasets
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.6 MB/s[0m eta [36m0:00:0

In [3]:
# Record which GPU (and driver / CUDA version) this run was executed on.
!nvidia-smi

Thu Sep  7 13:28:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    24W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import os

# Project root on the mounted Drive and its input/ and output/ sub-folders.
DIR = "/content/drive/MyDrive/Competitions/Signate/MUFG2023"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

# Per-experiment output folder. The trailing slash is significant: later cells
# build file names by plain string concatenation (e.g. OUTPUT_EXP_DIR+'train').
OUTPUT_EXP_DIR = DIR + '/output/EXP071/'

# exist_ok=True replaces the check-then-create pattern, so re-running the cell
# is safe and there is no window between the existence test and the creation.
for _directory in (OUTPUT_DIR, OUTPUT_EXP_DIR):
    os.makedirs(_directory, exist_ok=True)

In [6]:


# ====================================================
# CFG
# ====================================================
class CFG:
    """Static experiment configuration, read as class attributes by the cells below."""

    # --- run control -------------------------------------------------------
    debug = False
    train = True
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    print_freq = 100
    num_workers = 4

    # --- backbone ----------------------------------------------------------
    # Alternatives that were tried/considered for `model`:
    #   microsoft/deberta-v3-base, microsoft/deberta-base, roberta-base,
    #   roberta-large, roberta-large-mnli, xlnet-large-cased, albert-xxlarge-v2,
    #   microsoft/deberta-large, microsoft/deberta-v3-large,
    #   microsoft/deberta-v2-xlarge, funnel-transformer/large,
    #   funnel-transformer/medium, albert-base-v2, albert-large-v2,
    #   google/electra-large-discriminator, facebook/bart-large-mnli,
    #   facebook/bart-large, facebook/bart-base
    model = 'google/electra-base-discriminator'
    gradient_checkpointing = False
    freezing = False
    fc_dropout = 0.2

    # --- data / target -----------------------------------------------------
    target = "is_fraud?"
    target_size = 1
    max_len = 97  # overwritten later, once the tokenised lengths are measured
    batch_size = 64

    # --- optimisation ------------------------------------------------------
    apex = True  # toggles autocast / GradScaler in the training step
    epochs = 4
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    nth_awp_start_epoch = 1

    # --- learning-rate schedule --------------------------------------------
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0


# Debug runs are shortened to two epochs on the first two folds.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0, 1]

In [7]:
def get_score(labels, outputs):
    """Print F1 / recall / precision at a fixed 0.5 threshold and return the F1.

    Args:
        labels: ground-truth binary labels, passed to sklearn as ``y_true``.
        outputs: predicted scores; must support ``>`` and ``.astype`` (e.g. a
            NumPy array) because they are binarised as ``outputs > 0.5``.

    Returns:
        The F1 score of the thresholded predictions.
    """
    thresh = 0.5
    # Binarise once and reuse the result for all three metrics.
    y_pred = (outputs > thresh).astype(int)
    f_score = f1_score(labels, y_pred)
    r_score = recall_score(labels, y_pred)
    p_score = precision_score(labels, y_pred)
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    # Return the value already computed instead of scoring a second time.
    return f_score

def get_f1_score(labels, outputs):
    """Return the best F1 obtainable by scanning decision thresholds.

    Thresholds 0.10, 0.11, ..., 0.79 are tried; a prediction is positive when
    its score is strictly greater than the threshold.

    Args:
        labels: ground-truth binary labels.
        outputs: predicted scores (NumPy-array-like, see ``get_score``).

    Returns:
        The highest F1 found over the scanned thresholds (0.0 if no threshold
        produces a positive F1).
    """
    best_score = 0.0
    for thresh in np.arange(0.1, 0.80, 0.01):
        # arange accumulates float error; snap back to two decimals.
        thresh = np.round(thresh, 2)
        score = f1_score(labels, (outputs > thresh).astype(int))
        if score > best_score:
            best_score = score
    # The best score is tracked during the scan, so no final recomputation is needed.
    return best_score


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Create the notebook logger that writes to both the console and a file.

    Args:
        filename: path prefix of the log file; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under this module's ``__name__``,
        carrying exactly one stream handler and one file handler.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would otherwise stack another pair of handlers (every message printed
    # N times) and leave the previous log file handle open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # The captured outputs show each message a second time as "INFO:__main__:...",
    # i.e. the record also reaches a root-logger handler; stop that propagation.
    logger.propagate = False
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual seeding calls are independent of each other.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Turn off gradient tracking for every parameter reachable from `module`.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)

def get_freezed_parameters(module):
    """
    List the names of the parameters of `module` that do not require gradients,
    in the order `named_parameters()` yields them.
    """
    return [
        param_name
        for param_name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an `optim_bits` optimizer override for the `weight` of each
    word / position / token_type embedding found on `embeddings_path`.

    Attributes that are absent on `embeddings_path` are skipped.

    NOTE(review): relies on a global `bnb` that is not imported in the visible
    notebook cells -- presumably bitsandbytes; confirm before calling.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    candidate_names = [f"{kind}_embeddings" for kind in ("word", "position", "token_type")]
    for attr_name in candidate_names:
        if not hasattr(embeddings_path, attr_name):
            continue
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
import pandas as pd
import numpy as np


# Read every competition table from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR,"train.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test.csv"))
card = pd.read_csv(os.path.join(INPUT_DIR, "card.csv"))
user = pd.read_csv(os.path.join(INPUT_DIR, "user.csv"))
# The sample submission is read without a header row.
sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submit.csv"), header=None)

# Preview each table in load order: its shape, then its first three rows.
for _frame in (train, test, card, user, sub):
    print(_frame.shape)
    display(_frame.head(3))

(471283, 12)


Unnamed: 0,index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,0,1721,0,$2.623,OK,0,209237,Joliet,IL,60436.0,5541,Swipe Transaction
1,1,1629,3,$6.4,OK,0,2568,Edgerton,WI,53534.0,5814,Swipe Transaction
2,2,655,3,$123.5,OK,0,345310,Ridgefield,WA,98642.0,7538,Swipe Transaction


(457958, 11)


Unnamed: 0,index,user_id,card_id,amount,errors?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,471283,541,3,$113.278,OK,324189,Orlando,FL,32821.0,4814,Swipe Transaction
1,471284,655,1,$293.944,OK,81219,Ridgefield,WA,98642.0,7538,Chip Transaction
2,471285,492,0,$47.4,OK,274755,Arlington Heights,IL,60004.0,5719,Swipe Transaction


(416, 10)


Unnamed: 0,user_id,card_id,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed
0,39,0,Visa,Debit,09/2021,YES,1,$17117,05/2007,2010
1,39,1,Amex,Credit,11/2024,YES,2,$5400,10/2015,2015
2,41,0,Discover,Credit,03/2022,YES,2,$14800,12/2010,2011


(97, 17)


Unnamed: 0,user_id,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,39,57,64,1962,12,Female,442 Burns Boulevard,Mansfield,MA,2048,42.02,-71.21,$37407,$76274,$102611,698,2
1,41,39,66,1980,10,Female,3863 River Avenue,Lincoln,CA,95648,38.93,-121.25,$21829,$44506,$57994,849,3
2,47,40,67,1979,5,Female,8799 Elm Avenue,Mckinney,TX,75069,33.2,-96.65,$24684,$50329,$76759,625,4


(457958, 2)


Unnamed: 0,0,1
0,471283,0
1,471284,1
2,471285,0


In [10]:
# Maps a zero-padded two-digit month string ("01".."12") to its English name;
# applied below to the month halves of card["expires"] and card["acct_open_date"].
month_dict = {
   "01": "January",
   "02": "February",
   "03": "March",
   "04": "April",
   "05": "May",
   "06": "June",
   "07": "July",
   "08": "August",
   "09": "September",
   "10": "October",
   "11": "November",
   "12": "December"
}

def get_expires_values(df):
    """Split the "MM/YYYY" ``expires`` column into month and year columns.

    Adds ``expires_month`` and ``expires_years`` (both as strings) to ``df``
    in place and returns the same frame.

    Args:
        df: frame with a string column ``expires`` formatted as "MM/YYYY".

    Returns:
        ``df`` itself, with the two new columns.
    """
    # expand=True yields the two-column frame directly; it replaces the much
    # slower per-row `.apply(pd.Series)` construction.
    parts = df["expires"].str.split('/', expand=True)
    parts.columns = ["month", "years"]
    df["expires_month"] = parts["month"].astype(str)
    df["expires_years"] = parts["years"].astype(str)
    return df

def get_acct_open_date_values(df):
    """Split the "MM/YYYY" ``acct_open_date`` column into month and year columns.

    Adds ``acct_open_date_month`` and ``acct_open_date_years`` (both as
    strings) to ``df`` in place and returns the same frame.

    Args:
        df: frame with a string column ``acct_open_date`` formatted as "MM/YYYY".

    Returns:
        ``df`` itself, with the two new columns.
    """
    # expand=True yields the two-column frame directly; it replaces the much
    # slower per-row `.apply(pd.Series)` construction.
    parts = df["acct_open_date"].str.split('/', expand=True)
    parts.columns = ["month", "years"]
    df["acct_open_date_month"] = parts["month"].astype(str)
    df["acct_open_date_years"] = parts["years"].astype(str)
    return df


# Derive the month/year columns for both card dates (expiry first, then account
# opening), then replace the two-digit month codes with English month names.
card = get_acct_open_date_values(get_expires_values(card))
for _month_col in ("expires_month", "acct_open_date_month"):
    card[_month_col] = card[_month_col].map(month_dict)

In [11]:
# Left-join card attributes (keyed on user_id + card_id) and then user attributes
# (keyed on user_id) onto every training transaction.
train = train.merge(card, how="left", on=["user_id", "card_id"]).merge(user, how="left", on="user_id")

In [12]:
# Replace every missing value (in all columns) with the string 'unknown' so the
# string concatenation below never meets a NaN.
train.fillna('unknown', inplace = True)

# Build one text per transaction: a "merchant" section, a "card" section and a
# "user" section, with individual fields joined by the literal "[SEP]" marker.
# Columns that are not already strings are cast with .astype(str).
train["texts"] = "merchant" + "[SEP]" + train["amount"] + "[SEP]" + train["errors?"] + "[SEP]" + train["merchant_city"] + "[SEP]" + train["merchant_state"] + "[SEP]" + train["use_chip"] + "[SEP]" \
+ "card" + "[SEP]" + train["card_brand"] + "[SEP]" + train["card_type"] + "[SEP]" + train["expires_month"] + " " + train["expires_years"] + "[SEP]" + train["has_chip"] + "[SEP]" + train["acct_open_date_month"] + " " + train["acct_open_date_years"] + "[SEP]" + train["year_pin_last_changed"].astype(str) + "[SEP]" \
+ "user" + "[SEP]" + train["current_age"].astype(str) + " year old " + train["gender"] + "[SEP]" + "retired at age " + train["retirement_age"].astype(str) + "[SEP]" + train["address"] + "[SEP]" + train["city"] + "[SEP]" + train["state"] + "[SEP]" + train["per_capita_income_zipcode"] + "[SEP]" + train["yearly_income_person"] + "[SEP]" + train["total_debt"]

In [13]:
# Assign every training row to one of CFG.n_fold stratified validation folds.
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
# split() yields *positional* row indices, so collect the fold ids in a plain
# array and attach it positionally. This does not depend on `train` having a
# default RangeIndex and produces an integer column directly (no NaN/float
# intermediate that has to be cast afterwards).
fold_ids = np.full(len(train), -1, dtype=int)
for fold, ( _, val_) in enumerate(skf.split(train, train[CFG.target])):
    fold_ids[val_] = fold
train["kfold"] = fold_ids

# Debug runs work on a 500-row sample; show the fold sizes before and after.
if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [14]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that matches CFG.model, keep a copy next to the experiment
# outputs, and expose it on CFG so prepare_input() can reach it via cfg.tokenizer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every training text, measured without special tokens.
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tk0
]
# Longest text plus 23 tokens of head-room.
# NOTE(review): the original comment on this line only says "cls"; the reason
# for the constant 23 is not visible here -- TODO confirm.
CFG.max_len = max(lengths) + 23 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 471283/471283 [02:52<00:00, 2729.11it/s]
max_len: 104
INFO:__main__:max_len: 104


In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenise one text into fixed-length tensors.

    The text is encoded with ``cfg.tokenizer`` (special tokens added, padded
    and truncated to ``cfg.max_len``); every field of the encoding is then
    converted to a ``torch.long`` tensor.

    Args:
        cfg: object exposing ``tokenizer`` and ``max_len``.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping with tensor values.
    """
    encoded = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    # Snapshot the keys first; the values are replaced while looping.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Training dataset yielding ``(tokenised inputs, label)`` pairs.

    Args:
        cfg: configuration object; ``cfg.target`` names the label column and
            ``cfg`` is forwarded to ``prepare_input`` for tokenisation.
        df: frame with a ``texts`` column and the label column.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Read the label column from the cfg that was passed in rather than
        # from the global CFG, so the dataset honours its own argument.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        # Labels are emitted in half precision here (ValidDataset uses float).
        # NOTE(review): presumably chosen to match fp16 logits produced under
        # autocast in the training step -- confirm before changing.
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    The longest sequence is found from the row sums of
    ``inputs["attention_mask"]``; every tensor in ``inputs`` is then cut to
    that many columns. ``inputs`` is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for field in inputs.keys():
        inputs[field] = inputs[field][:, :longest]
    return inputs

class ValidDataset(Dataset):
    """Validation dataset yielding ``(tokenised inputs, label)`` pairs.

    Same layout as the training dataset, except that labels are returned as
    ``torch.float``.

    Args:
        cfg: configuration object; ``cfg.target`` names the label column and
            ``cfg`` is forwarded to ``prepare_input`` for tokenisation.
        df: frame with a ``texts`` column and the label column.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Read the label column from the cfg that was passed in rather than
        # from the global CFG, so the dataset honours its own argument.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    NOTE(review): this cell already defines an identical ``collate`` a few
    lines above; this second definition is the one that stays bound.

    Every tensor in ``inputs`` is cut to the largest row sum of
    ``inputs["attention_mask"]``. ``inputs`` is modified in place and returned.
    """
    keep = int(inputs["attention_mask"].sum(axis=1).max())
    for name in inputs.keys():
        inputs[name] = inputs[name][:, :keep]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, ignoring masked positions."""

    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean over dim 1 of ``last_hidden_state``.

        ``attention_mask`` is broadcast over the last dimension; the token
        count is clamped to at least 1e-9 so fully masked rows yield zeros
        instead of dividing by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_sum = (last_hidden_state * weights).sum(1)
        token_count = weights.sum(1).clamp(min=1e-9)
        return weighted_sum / token_count

class MaxPooling(nn.Module):
    """Take the element-wise maximum over the unmasked tokens of each sequence."""

    def __init__(self):
        super(MaxPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the max over dim 1 after setting masked positions to -1e4.

        The input tensor is left untouched; the fill happens on a copy.
        """
        is_padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()) == 0
        filled = last_hidden_state.masked_fill(is_padding, -1e4)
        return filled.max(dim=1).values


class CustomModel(nn.Module):
    """Transformer backbone + mean pooling + LayerNorm + linear head.

    ``forward`` returns raw logits of width ``cfg.target_size`` (the sigmoid
    is applied by the callers).

    Args:
        cfg: configuration object; ``model``, ``gradient_checkpointing``,
            ``freezing`` and ``target_size`` are read here.
        config_path: optional path to a backbone config saved with
            ``torch.save``. When ``None`` the config is fetched for
            ``cfg.model`` and all of its dropout rates are set to zero.
        pretrained: load pretrained backbone weights when ``True``; otherwise
            build a randomly initialised backbone from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Different architectures name their dropout settings differently,
            # so all known spellings are zeroed.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects -- only point
            # this at config files produced by this project.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; a weight-less model
            # has to be built from the config with `from_config`.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # Stored as a lazy, single-use `filter` iterator over the
            # parameters that are still trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # Kept for parity with the original definition; forward() returns logits
        # and does not apply it.
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        """Initialise ``module`` the way the backbone config prescribes.

        Linear / Embedding weights are drawn from N(0, initializer_range);
        biases and the padding row are zeroed; LayerNorm is reset to
        weight=1, bias=0. Other module types are left unchanged.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and mean-pool its first output over the attention mask."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return logits for a batch of tokenised ``inputs`` (a mapping of tensors)."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Running weighted average that also remembers the latest value.

    Attributes are ``val`` (last value), ``sum`` (weighted total),
    ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero); the
            total duration is extrapolated as ``elapsed / percent``.

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'``.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch with mixed precision and return the mean loss.

    Args:
        fold: fold number (accepted for interface compatibility; not used here).
        train_loader: loader yielding ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of tensors containing ``attention_mask``.
        model: the network; called as ``model(inputs)`` and expected to return logits.
        criterion: loss callable taking ``(logits, labels)``.
        optimizer: optimizer to step.
        epoch: zero-based epoch index, used for logging only.
        scheduler: LR scheduler, stepped after every optimizer step when
            ``CFG.batch_scheduler`` is set.
        device: device the batch is moved to.

    Returns:
        The sample-weighted average of the per-batch loss values.
    """
    model.train()
    # NOTE(review): a new GradScaler is created on every call, i.e. once per
    # epoch, so its loss scale restarts each epoch. The captured logs show
    # "Grad: nan" on the first step of every epoch, which is consistent with
    # the initial scale overflowing -- consider sharing one scaler across epochs.
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    n_batches = len(train_loader)
    # Reported until the first optimizer step has produced a real gradient norm.
    grad_norm = float('nan')
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        # Evaluate the loss in float32: outside autocast the fp16 logits/labels
        # would otherwise be reduced in half precision.
        loss = criterion(y_preds.view(-1, 1).float(), labels.view(-1, 1).float())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Unscale and clip only once per optimizer step, on the fully
            # accumulated gradients. Calling unscale_ on every micro-batch
            # raises as soon as gradient_accumulation_steps > 1.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (n_batches-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, n_batches,
                          remain=timeSince(start, float(step+1)/n_batches),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the supported accessor; get_lr()
                          # warns when called outside scheduler.step().
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader``.

    Args:
        valid_loader: loader yielding ``(inputs, labels)`` batches.
        model: the network; called as ``model(inputs)`` and expected to return logits.
        criterion: loss callable taking ``(logits, labels)``.
        device: device the batch is moved to.

    Returns:
        ``(avg_loss, predictions)`` where ``avg_loss`` is the sample-weighted
        mean loss and ``predictions`` is a flat NumPy array of sigmoid
        probabilities in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    n_batches = len(valid_loader)
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        # The validation loss is reported as-is: dividing by
        # gradient_accumulation_steps is a training-only rescaling and would
        # understate the validation loss whenever accumulation is enabled.
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (n_batches-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, n_batches,
                          loss=losses,
                          remain=timeSince(start, float(step+1)/n_batches)))
    # Stack the per-batch arrays and flatten to one probability per element.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Predict sigmoid probabilities for every batch of ``test_loader``.

    The loader is expected to yield only the tokenised inputs (no labels).
    The model is moved to ``device`` and put in eval mode first.

    Returns:
        The per-batch probability arrays concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_probs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for field in inputs.keys():
            inputs[field] = inputs[field].to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other
    rows are used for training. After every epoch the model is scored on the
    validation set and the checkpoint with the best threshold-optimised F1 is
    written to ``OUTPUT_EXP_DIR``.

    Args:
        folds: full training frame containing ``texts``, the target column
            and ``kfold``.
        fold: index of the fold to hold out.

    Returns:
        The validation rows with an added ``pred`` column holding the
        probabilities from the best epoch.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: decayed backbone weights, non-decayed
        backbone biases/LayerNorm weights, and the head (everything whose name
        does not contain "model")."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    # NOTE(review): the notebook imports AdamW from both torch.optim and
    # transformers; the later transformers import is the one bound here.
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the linear or cosine warm-up schedule selected by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    # Count the optimizer steps that will actually happen: the loader drops the
    # last partial batch and train_fn steps once per accumulation group.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = -1.
    best_predictions = None

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring: `score` is F1 at threshold 0.5; `best_thresh_f1` is the best
        # F1 over scanned thresholds and drives checkpoint selection. (The
        # local is deliberately not called `f1_score`, to avoid shadowing the
        # sklearn function of that name.)
        score = get_score(valid_labels, predictions)
        best_thresh_f1 = get_f1_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')


        if best_score < best_thresh_f1:
            best_score = best_thresh_f1
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    if best_predictions is None:
        # Only reachable when no epoch ran; fail loudly instead of silently
        # picking up a checkpoint left behind by an earlier run.
        raise RuntimeError(f"fold {fold}: no epoch was run, so there are no predictions to return")
    # The best epoch's predictions are kept in memory, so the full checkpoint
    # does not have to be read back from disk just to recover them.
    valid_folds['pred'] = best_predictions

    # Drop the references to the GPU-resident objects first; otherwise
    # empty_cache() cannot release their memory before the next fold starts.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [20]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the 0.5-threshold F1 and the best threshold-scanned F1 of ``oof_df``."""
        labels = oof_df[CFG.target].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        # Named so that it does not shadow sklearn's `f1_score`.
        best_thresh_f1 = get_f1_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'F1 BEST Score: {best_thresh_f1:<.4f}')

    if CFG.train:
        # Collect the per-fold out-of-fold frames and concatenate once at the
        # end, instead of repeatedly concatenating onto an empty DataFrame.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

INFO:__main__:ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraini

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch: [1][0/5891] Elapsed 0m 3s (remain 360m 51s) Loss: 0.4253(0.4253) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 14s (remain 14m 3s) Loss: 0.1174(0.2471) Grad: 0.9570  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 25s (remain 12m 9s) Loss: 0.0768(0.2357) Grad: 1.2253  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 34s (remain 10m 49s) Loss: 0.2695(0.2261) Grad: 1.0071  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 46s (remain 10m 30s) Loss: 0.1862(0.2229) Grad: 0.9000  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 56s (remain 10m 11s) Loss: 0.2620(0.2241) Grad: 1.2746  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 6s (remain 9m 48s) Loss: 0.3196(0.2230) Grad: 0.8637  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 18s (remain 9m 41s) Loss: 0.1350(0.2218) Grad: 1.1651  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 29s (remain 9m 26s) Loss: 0.2070(0.2195) Grad: 0.6830  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 39s (remain 9m 13s) Loss: 0.0578(0

Epoch 1 - avg_train_loss: 0.1915  avg_val_loss: 0.1726  time: 760s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1915  avg_val_loss: 0.1726  time: 760s
Epoch 1 - Score: 0.4115
INFO:__main__:Epoch 1 - Score: 0.4115
Epoch 1 - Save Best Score: 0.4952 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4952 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 79m 41s) Loss: 0.1460(0.1460) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 11s (remain 11m 7s) Loss: 0.2859(0.1803) Grad: 1.3629  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 23s (remain 11m 18s) Loss: 0.2039(0.1734) Grad: 1.7020  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 35s (remain 10m 51s) Loss: 0.1981(0.1740) Grad: 1.0610  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 45s (remain 10m 16s) Loss: 0.1381(0.1726) Grad: 0.9118  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 56s (remain 10m 8s) Loss: 0.1606(0.1734) Grad: 0.5558  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 7s (remain 9m 52s) Loss: 0.2961(0.1750) Grad: 2.1138  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 17s (remain 9m 36s) Loss: 0.2211(0.1723) Grad: 1.4743  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 29s (remain 9m 29s) Loss: 0.1918(0.1707) Grad: 1.3540  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 40s (remain 9m 17s) Loss: 0.0694(0.

Epoch 2 - avg_train_loss: 0.1637  avg_val_loss: 0.1618  time: 758s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1637  avg_val_loss: 0.1618  time: 758s
Epoch 2 - Score: 0.4990
INFO:__main__:Epoch 2 - Score: 0.4990
Epoch 2 - Save Best Score: 0.5336 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5336 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 51m 32s) Loss: 0.3936(0.3936) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 12s (remain 11m 30s) Loss: 0.1561(0.1487) Grad: 0.9699  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 22s (remain 10m 40s) Loss: 0.0811(0.1399) Grad: 1.0757  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 34s (remain 10m 35s) Loss: 0.2410(0.1418) Grad: 1.1609  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 46s (remain 10m 32s) Loss: 0.1292(0.1426) Grad: 0.8866  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 55s (remain 10m 0s) Loss: 0.1364(0.1445) Grad: 0.8449  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 7s (remain 9m 52s) Loss: 0.1592(0.1451) Grad: 0.9366  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 18s (remain 9m 39s) Loss: 0.1815(0.1463) Grad: 0.9274  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 28s (remain 9m 20s) Loss: 0.0349(0.1466) Grad: 0.9364  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 39s (remain 9m 12s) Loss: 0.2319(0

Epoch 3 - avg_train_loss: 0.1503  avg_val_loss: 0.1557  time: 733s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1503  avg_val_loss: 0.1557  time: 733s
Epoch 3 - Score: 0.4915
INFO:__main__:Epoch 3 - Score: 0.4915
Epoch 3 - Save Best Score: 0.5522 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5522 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 46m 42s) Loss: 0.1522(0.1522) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 10s (remain 9m 44s) Loss: 0.0855(0.1409) Grad: 1.1343  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 21s (remain 10m 17s) Loss: 0.1266(0.1365) Grad: 0.9742  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 32s (remain 10m 8s) Loss: 0.0534(0.1382) Grad: 1.4023  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 41s (remain 9m 34s) Loss: 0.0825(0.1398) Grad: 0.9010  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 53s (remain 9m 30s) Loss: 0.1293(0.1394) Grad: 1.2253  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 3s (remain 9m 19s) Loss: 0.1873(0.1392) Grad: 0.7951  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 13s (remain 9m 2s) Loss: 0.1404(0.1399) Grad: 1.2410  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 24s (remain 8m 55s) Loss: 0.1388(0.1403) Grad: 1.1754  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 34s (remain 8m 41s) Loss: 0.1572(0.139

Epoch 4 - avg_train_loss: 0.1389  avg_val_loss: 0.1571  time: 721s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1389  avg_val_loss: 0.1571  time: 721s
Epoch 4 - Score: 0.5086
INFO:__main__:Epoch 4 - Score: 0.5086
Epoch 4 - Save Best Score: 0.5532 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5532 Model


f1 score : 0.5085748078060318
recall score : 0.39534171008274593
precision score : 0.712707182320442


Score: 0.5086
INFO:__main__:Score: 0.5086
F1 BEST Score: 0.5532
INFO:__main__:F1 BEST Score: 0.5532
ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

INFO:__main__:ElectraConfig {
 

Epoch: [1][0/5891] Elapsed 0m 1s (remain 106m 13s) Loss: 0.9214(0.9214) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 11s (remain 10m 56s) Loss: 0.3650(0.2527) Grad: 1.7841  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 22s (remain 10m 34s) Loss: 0.1559(0.2298) Grad: 0.9031  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 32s (remain 10m 4s) Loss: 0.1576(0.2203) Grad: 0.5855  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 42s (remain 9m 41s) Loss: 0.1833(0.2178) Grad: 1.1438  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 53s (remain 9m 33s) Loss: 0.2688(0.2154) Grad: 0.7233  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 2s (remain 9m 12s) Loss: 0.1832(0.2132) Grad: 0.5445  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 13s (remain 9m 2s) Loss: 0.2367(0.2103) Grad: 1.3753  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 24s (remain 8m 55s) Loss: 0.0809(0.2092) Grad: 0.5848  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 33s (remain 8m 37s) Loss: 0.1665(0.2

Epoch 1 - avg_train_loss: 0.1844  avg_val_loss: 0.1686  time: 726s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1844  avg_val_loss: 0.1686  time: 726s
Epoch 1 - Score: 0.3989
INFO:__main__:Epoch 1 - Score: 0.3989
Epoch 1 - Save Best Score: 0.5155 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5155 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 57m 9s) Loss: 0.2285(0.2285) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 13s (remain 12m 26s) Loss: 0.1887(0.1633) Grad: 1.5774  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 22s (remain 10m 42s) Loss: 0.0815(0.1588) Grad: 0.6108  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 33s (remain 10m 22s) Loss: 0.1013(0.1626) Grad: 1.3830  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 44s (remain 10m 10s) Loss: 0.0981(0.1635) Grad: 1.3349  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 53s (remain 9m 39s) Loss: 0.2056(0.1669) Grad: 3.7748  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 4s (remain 9m 32s) Loss: 0.1920(0.1736) Grad: 0.8209  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 15s (remain 9m 22s) Loss: 0.3135(0.1803) Grad: 0.9467  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 25s (remain 9m 1s) Loss: 0.0770(0.1858) Grad: 0.9117  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 36s (remain 8m 54s) Loss: 0.1620(0.1

Epoch 2 - avg_train_loss: 0.1995  avg_val_loss: 0.1770  time: 730s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1995  avg_val_loss: 0.1770  time: 730s
Epoch 2 - Score: 0.3633
INFO:__main__:Epoch 2 - Score: 0.3633


Epoch: [3][0/5891] Elapsed 0m 0s (remain 52m 42s) Loss: 0.2122(0.2122) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 10s (remain 10m 11s) Loss: 0.1166(0.1563) Grad: 1.3305  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 20s (remain 9m 45s) Loss: 0.0720(0.1631) Grad: 1.6225  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 31s (remain 9m 46s) Loss: 0.3184(0.1673) Grad: 1.6475  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 41s (remain 9m 22s) Loss: 0.1725(0.1701) Grad: 2.4818  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 51s (remain 9m 16s) Loss: 0.2048(0.1696) Grad: 1.0732  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 2s (remain 9m 11s) Loss: 0.1180(0.1684) Grad: 0.9040  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 11s (remain 8m 51s) Loss: 0.1260(0.1683) Grad: 0.9437  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 22s (remain 8m 45s) Loss: 0.1383(0.1701) Grad: 1.4293  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 32s (remain 8m 34s) Loss: 0.2168(0.17

Epoch 3 - avg_train_loss: 0.1682  avg_val_loss: 0.1640  time: 722s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1682  avg_val_loss: 0.1640  time: 722s
Epoch 3 - Score: 0.4724
INFO:__main__:Epoch 3 - Score: 0.4724
Epoch 3 - Save Best Score: 0.5244 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5244 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 50m 15s) Loss: 0.3025(0.3025) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 12s (remain 11m 28s) Loss: 0.1764(0.1642) Grad: 1.3048  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 23s (remain 10m 56s) Loss: 0.1941(0.1584) Grad: 1.7177  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 33s (remain 10m 14s) Loss: 0.1105(0.1590) Grad: 1.5056  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 44s (remain 10m 4s) Loss: 0.1425(0.1571) Grad: 2.2564  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 53s (remain 9m 40s) Loss: 0.1121(0.1590) Grad: 1.2451  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 4s (remain 9m 26s) Loss: 0.1366(0.1584) Grad: 1.3920  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 15s (remain 9m 17s) Loss: 0.2163(0.1587) Grad: 1.5443  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 25s (remain 9m 0s) Loss: 0.2307(0.1583) Grad: 2.6176  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 35s (remain 8m 50s) Loss: 0.1486(0.1

Epoch 4 - avg_train_loss: 0.1537  avg_val_loss: 0.1608  time: 736s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1537  avg_val_loss: 0.1608  time: 736s
Epoch 4 - Score: 0.4825
INFO:__main__:Epoch 4 - Score: 0.4825
Epoch 4 - Save Best Score: 0.5351 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5351 Model


f1 score : 0.48250758341759353
recall score : 0.36561446521605884
precision score : 0.7092746730083235


Score: 0.4825
INFO:__main__:Score: 0.4825
F1 BEST Score: 0.5351
INFO:__main__:F1 BEST Score: 0.5351
ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

INFO:__main__:ElectraConfig {
 

Epoch: [1][0/5891] Elapsed 0m 0s (remain 66m 17s) Loss: 0.7178(0.7178) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 12s (remain 11m 37s) Loss: 0.2593(0.2749) Grad: 0.3186  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 22s (remain 10m 28s) Loss: 0.1782(0.2485) Grad: 0.7360  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 33s (remain 10m 19s) Loss: 0.4272(0.2392) Grad: 2.2993  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 43s (remain 9m 58s) Loss: 0.1277(0.2322) Grad: 0.5041  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 53s (remain 9m 40s) Loss: 0.0886(0.2266) Grad: 0.9785  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 5s (remain 9m 32s) Loss: 0.2499(0.2228) Grad: 0.6280  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 14s (remain 9m 14s) Loss: 0.2671(0.2204) Grad: 1.3697  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 25s (remain 9m 4s) Loss: 0.1776(0.2179) Grad: 1.3402  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 36s (remain 8m 56s) Loss: 0.1732(0.2

Epoch 1 - avg_train_loss: 0.1961  avg_val_loss: 0.1769  time: 738s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1961  avg_val_loss: 0.1769  time: 738s
Epoch 1 - Score: 0.4057
INFO:__main__:Epoch 1 - Score: 0.4057
Epoch 1 - Save Best Score: 0.4873 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4873 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 90m 31s) Loss: 0.1544(0.1544) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 12s (remain 12m 10s) Loss: 0.1073(0.1682) Grad: 1.0076  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 23s (remain 11m 16s) Loss: 0.1842(0.1659) Grad: 0.9089  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 33s (remain 10m 30s) Loss: 0.1753(0.1686) Grad: 2.0796  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 44s (remain 10m 8s) Loss: 0.1185(0.1714) Grad: 0.8560  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 55s (remain 9m 58s) Loss: 0.2379(0.1706) Grad: 1.4181  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 4s (remain 9m 31s) Loss: 0.1971(0.1711) Grad: 0.9883  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 16s (remain 9m 23s) Loss: 0.1821(0.1724) Grad: 1.7729  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 27s (remain 9m 13s) Loss: 0.1387(0.1727) Grad: 1.1626  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 36s (remain 8m 55s) Loss: 0.0963(0.

Epoch 2 - avg_train_loss: 0.1676  avg_val_loss: 0.1604  time: 742s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1676  avg_val_loss: 0.1604  time: 742s
Epoch 2 - Score: 0.4779
INFO:__main__:Epoch 2 - Score: 0.4779
Epoch 2 - Save Best Score: 0.5345 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5345 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 58m 33s) Loss: 0.1842(0.1842) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 11s (remain 11m 15s) Loss: 0.1204(0.1705) Grad: 0.8346  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 22s (remain 10m 43s) Loss: 0.1521(0.1585) Grad: 0.9831  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 34s (remain 10m 34s) Loss: 0.1350(0.1551) Grad: 1.1009  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 44s (remain 10m 13s) Loss: 0.1987(0.1575) Grad: 1.2104  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 54s (remain 9m 51s) Loss: 0.1342(0.1569) Grad: 2.0535  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 6s (remain 9m 42s) Loss: 0.1681(0.1562) Grad: 1.4853  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 16s (remain 9m 29s) Loss: 0.0573(0.1550) Grad: 0.9785  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 26s (remain 9m 12s) Loss: 0.1294(0.1545) Grad: 1.0194  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 38s (remain 9m 4s) Loss: 0.1338(0.

Epoch 3 - avg_train_loss: 0.1540  avg_val_loss: 0.1562  time: 747s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1540  avg_val_loss: 0.1562  time: 747s
Epoch 3 - Score: 0.5002
INFO:__main__:Epoch 3 - Score: 0.5002
Epoch 3 - Save Best Score: 0.5418 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5418 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 84m 17s) Loss: 0.1924(0.1924) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 12s (remain 11m 32s) Loss: 0.1025(0.1478) Grad: 1.0600  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 24s (remain 11m 27s) Loss: 0.0900(0.1466) Grad: 0.9565  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 35s (remain 10m 55s) Loss: 0.1858(0.1460) Grad: 1.1507  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 45s (remain 10m 20s) Loss: 0.1790(0.1452) Grad: 1.9675  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 56s (remain 10m 9s) Loss: 0.1185(0.1461) Grad: 1.2075  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 7s (remain 9m 52s) Loss: 0.2094(0.1457) Grad: 1.3007  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 17s (remain 9m 35s) Loss: 0.1689(0.1462) Grad: 1.1407  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 29s (remain 9m 25s) Loss: 0.1403(0.1448) Grad: 1.8485  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 39s (remain 9m 11s) Loss: 0.1328(0

Epoch 4 - avg_train_loss: 0.1423  avg_val_loss: 0.1552  time: 744s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1423  avg_val_loss: 0.1552  time: 744s
Epoch 4 - Score: 0.5069
INFO:__main__:Epoch 4 - Score: 0.5069
Epoch 4 - Save Best Score: 0.5485 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5485 Model


f1 score : 0.506904318871805
recall score : 0.39650681783361424
precision score : 0.7024972855591748


Score: 0.5069
INFO:__main__:Score: 0.5069
F1 BEST Score: 0.5485
INFO:__main__:F1 BEST Score: 0.5485
ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

INFO:__main__:ElectraConfig {
 

Epoch: [1][0/5891] Elapsed 0m 0s (remain 59m 35s) Loss: 0.8311(0.8311) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 13s (remain 12m 38s) Loss: 0.4119(0.2736) Grad: 2.6448  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 24s (remain 11m 24s) Loss: 0.4216(0.2553) Grad: 2.7739  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 33s (remain 10m 28s) Loss: 0.1687(0.2441) Grad: 0.2961  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 44s (remain 10m 15s) Loss: 0.2759(0.2367) Grad: 0.9344  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 55s (remain 9m 57s) Loss: 0.1647(0.2289) Grad: 0.5872  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 5s (remain 9m 36s) Loss: 0.1584(0.2202) Grad: 0.5110  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 16s (remain 9m 27s) Loss: 0.0925(0.2174) Grad: 0.5618  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 26s (remain 9m 11s) Loss: 0.1248(0.2153) Grad: 0.5826  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 36s (remain 8m 56s) Loss: 0.1637(0

Epoch 1 - avg_train_loss: 0.1870  avg_val_loss: 0.1845  time: 735s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1870  avg_val_loss: 0.1845  time: 735s
Epoch 1 - Score: 0.2459
INFO:__main__:Epoch 1 - Score: 0.2459
Epoch 1 - Save Best Score: 0.4931 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4931 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 83m 34s) Loss: 0.1678(0.1678) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 12s (remain 12m 0s) Loss: 0.1522(0.1837) Grad: 1.3373  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 24s (remain 11m 25s) Loss: 0.0972(0.1763) Grad: 1.1973  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 34s (remain 10m 31s) Loss: 0.1729(0.1747) Grad: 0.8450  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 44s (remain 10m 13s) Loss: 0.1383(0.1738) Grad: 1.4489  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 55s (remain 10m 2s) Loss: 0.0601(0.1720) Grad: 0.7513  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 5s (remain 9m 37s) Loss: 0.0857(0.1698) Grad: 0.8581  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 16s (remain 9m 27s) Loss: 0.1405(0.1697) Grad: 0.9577  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 27s (remain 9m 18s) Loss: 0.1965(0.1698) Grad: 1.1319  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 37s (remain 8m 59s) Loss: 0.2996(0.

Epoch 2 - avg_train_loss: 0.1630  avg_val_loss: 0.1610  time: 742s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1630  avg_val_loss: 0.1610  time: 742s
Epoch 2 - Score: 0.4228
INFO:__main__:Epoch 2 - Score: 0.4228
Epoch 2 - Save Best Score: 0.5382 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5382 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 57m 53s) Loss: 0.0569(0.0569) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 11s (remain 11m 20s) Loss: 0.1980(0.1532) Grad: 1.2244  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 22s (remain 10m 44s) Loss: 0.1425(0.1516) Grad: 0.7321  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 34s (remain 10m 31s) Loss: 0.0983(0.1534) Grad: 0.8642  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 44s (remain 10m 9s) Loss: 0.1934(0.1513) Grad: 1.1160  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 54s (remain 9m 48s) Loss: 0.2634(0.1535) Grad: 1.2931  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 5s (remain 9m 40s) Loss: 0.2742(0.1548) Grad: 1.7511  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 16s (remain 9m 23s) Loss: 0.1192(0.1560) Grad: 0.8197  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 26s (remain 9m 10s) Loss: 0.0583(0.1556) Grad: 1.0971  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 37s (remain 9m 0s) Loss: 0.2247(0.1

Epoch 3 - avg_train_loss: 0.1505  avg_val_loss: 0.1565  time: 747s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1505  avg_val_loss: 0.1565  time: 747s
Epoch 3 - Score: 0.5065
INFO:__main__:Epoch 3 - Score: 0.5065
Epoch 3 - Save Best Score: 0.5476 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5476 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 53m 59s) Loss: 0.1176(0.1176) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 15s (remain 14m 36s) Loss: 0.2998(0.1438) Grad: 2.1247  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 25s (remain 12m 14s) Loss: 0.1168(0.1443) Grad: 0.8416  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 36s (remain 11m 14s) Loss: 0.1246(0.1434) Grad: 1.0714  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 47s (remain 10m 51s) Loss: 0.1056(0.1408) Grad: 1.3501  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 57s (remain 10m 21s) Loss: 0.1421(0.1419) Grad: 1.1029  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 8s (remain 10m 2s) Loss: 0.2527(0.1406) Grad: 1.6000  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 19s (remain 9m 50s) Loss: 0.0988(0.1394) Grad: 7.6296  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 29s (remain 9m 30s) Loss: 0.1066(0.1393) Grad: 0.8221  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 40s (remain 9m 17s) Loss: 0.0567(

Epoch 4 - avg_train_loss: 0.1392  avg_val_loss: 0.1574  time: 760s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1392  avg_val_loss: 0.1574  time: 760s
Epoch 4 - Score: 0.5073
INFO:__main__:Epoch 4 - Score: 0.5073
Epoch 4 - Save Best Score: 0.5484 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5484 Model


f1 score : 0.5073313782991202
recall score : 0.3976402083971805
precision score : 0.7005939524838013


Score: 0.5073
INFO:__main__:Score: 0.5073
F1 BEST Score: 0.5484
INFO:__main__:F1 BEST Score: 0.5484
ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

INFO:__main__:ElectraConfig {
 

Epoch: [1][0/5891] Elapsed 0m 1s (remain 116m 38s) Loss: 0.9590(0.9590) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 12s (remain 12m 21s) Loss: 0.1392(0.2781) Grad: 0.6998  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 24s (remain 11m 26s) Loss: 0.3213(0.2524) Grad: 1.5475  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 34s (remain 10m 40s) Loss: 0.2057(0.2429) Grad: 0.7056  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 45s (remain 10m 20s) Loss: 0.3015(0.2414) Grad: 0.9043  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 0m 58s (remain 10m 29s) Loss: 0.1583(0.2352) Grad: 1.1137  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 8s (remain 10m 4s) Loss: 0.3921(0.2329) Grad: 1.7562  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 20s (remain 9m 52s) Loss: 0.2810(0.2275) Grad: 1.6245  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 30s (remain 9m 35s) Loss: 0.1099(0.2239) Grad: 1.0387  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 1m 40s (remain 9m 18s) Loss: 0.1636

Epoch 1 - avg_train_loss: 0.1903  avg_val_loss: 0.1727  time: 759s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1903  avg_val_loss: 0.1727  time: 759s
Epoch 1 - Score: 0.4315
INFO:__main__:Epoch 1 - Score: 0.4315
Epoch 1 - Save Best Score: 0.4956 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4956 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 85m 39s) Loss: 0.1192(0.1192) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 11s (remain 10m 36s) Loss: 0.1885(0.1630) Grad: 1.3999  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 23s (remain 11m 1s) Loss: 0.1814(0.1704) Grad: 1.4359  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 34s (remain 10m 34s) Loss: 0.1349(0.1692) Grad: 1.4780  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 43s (remain 10m 0s) Loss: 0.1914(0.1668) Grad: 1.8448  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 0m 54s (remain 9m 51s) Loss: 0.2188(0.1663) Grad: 0.9977  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 5s (remain 9m 38s) Loss: 0.2341(0.1674) Grad: 1.3181  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 17s (remain 9m 35s) Loss: 0.1006(0.1675) Grad: 1.0446  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 28s (remain 9m 25s) Loss: 0.1434(0.1678) Grad: 1.1726  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 1m 39s (remain 9m 9s) Loss: 0.1081(0.16

Epoch 2 - avg_train_loss: 0.1649  avg_val_loss: 0.1640  time: 749s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1649  avg_val_loss: 0.1640  time: 749s
Epoch 2 - Score: 0.4674
INFO:__main__:Epoch 2 - Score: 0.4674
Epoch 2 - Save Best Score: 0.5224 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5224 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 55m 58s) Loss: 0.1293(0.1293) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 11s (remain 11m 4s) Loss: 0.2595(0.1621) Grad: 1.1538  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 22s (remain 10m 48s) Loss: 0.2227(0.1590) Grad: 1.2431  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 34s (remain 10m 33s) Loss: 0.1957(0.1575) Grad: 0.9743  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 44s (remain 10m 8s) Loss: 0.0525(0.1567) Grad: 1.1915  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 0m 54s (remain 9m 50s) Loss: 0.2367(0.1560) Grad: 1.7856  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 6s (remain 9m 43s) Loss: 0.2091(0.1547) Grad: 1.6010  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 16s (remain 9m 24s) Loss: 0.1188(0.1552) Grad: 0.7870  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 29s (remain 9m 28s) Loss: 0.1016(0.1545) Grad: 0.9204  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 1m 40s (remain 9m 18s) Loss: 0.1333(0.1

Epoch 3 - avg_train_loss: 0.1510  avg_val_loss: 0.1597  time: 753s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

Epoch: [4][0/5891] Elapsed 0m 0s (remain 82m 14s) Loss: 0.1166(0.1166) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 12s (remain 11m 36s) Loss: 0.1716(0.1332) Grad: 1.2596  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 24s (remain 11m 20s) Loss: 0.2019(0.1335) Grad: 1.3964  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 33s (remain 10m 25s) Loss: 0.1692(0.1372) Grad: 1.1377  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 44s (remain 10m 9s) Loss: 0.0732(0.1395) Grad: 0.9249  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 0m 55s (remain 9m 59s) Loss: 0.0509(0.1404) Grad: 1.2756  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 5s (remain 9m 33s) Loss: 0.3013(0.1404) Grad: 1.5376  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 18s (remain 9m 40s) Loss: 0.0921(0.1405) Grad: 0.7346  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 29s (remain 9m 29s) Loss: 0.1829(0.1410) Grad: 0.9324  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 1m 39s (remain 9m 11s) Loss: 0.1438(0.

Epoch 4 - avg_train_loss: 0.1393  avg_val_loss: 0.1604  time: 750s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

f1 score : 0.500049062898636
recall score : 0.3904382470119522
precision score : 0.6952251023192361


Score: 0.5000
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self

f1 score : 0.501184179363701
recall score : 0.38910851644142075
precision score : 0.7039418972112879


Score: 0.5012
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self

In [21]:
# Final cell: sync outputs to Google Drive, then release the Colab runtime so
# the session stops holding the GPU once training has finished.
from google.colab import drive, runtime

try:
    # Wait for pending writes to the Drive mount (created in the first cell)
    # to be flushed before the runtime goes away; files still buffered
    # locally may otherwise never reach Drive.
    drive.flush_and_unmount()
except Exception as err:
    # Best-effort only: a broken or already-disconnected mount (see the
    # "Transport endpoint is not connected" errors in the training output)
    # must not prevent the runtime from being released.
    print(f"Drive flush failed, releasing runtime anyway: {err}")
finally:
    # Always release the runtime, whether or not the flush succeeded.
    runtime.unassign()