In [1]:
# Mount Google Drive at /content/drive so later cells can read inputs from it
# and write experiment outputs to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.5 MB/s[0m eta [36m0:00:0

In [3]:
# Show the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Thu Sep  7 06:42:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import os

# Project root on the mounted Drive, with its input and output folders.
DIR = "/content/drive/MyDrive/Competitions/Signate/MUFG2023"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

# Per-experiment output folder. The trailing slash is intentional: later cells
# build file paths with plain string concatenation (OUTPUT_EXP_DIR + 'name').
OUTPUT_EXP_DIR = DIR + '/output/EXP053/'

# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_EXP_DIR, exist_ok=True)

In [6]:


# ====================================================
# CFG
# ====================================================
class CFG:
    # Experiment configuration, read as class attributes by the cells below.
    debug=False  # when True: fewer epochs/folds (see below) and a 500-row subsample
    apex=True  # enables torch.cuda.amp autocast and GradScaler in train_fn
    print_freq=100  # log every N batches in the train/valid loops
    num_workers=4  # DataLoader worker processes
    # Backbone checkpoint; alternatives that were tried are kept commented out.
    # model="microsoft/deberta-v3-base"
    model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True  # step the LR scheduler after every optimizer step
    num_cycles=0.5  # passed to the cosine schedule
    num_warmup_steps=0  # passed to the linear/cosine schedule
    epochs=4
    encoder_lr=2e-5  # learning rate for the backbone (`model.model`) parameters
    decoder_lr=2e-5  # learning rate for every parameter outside the backbone
    min_lr=1e-6  # not referenced in the visible cells
    eps=1e-6  # AdamW epsilon
    betas=(0.9, 0.999)  # AdamW betas
    batch_size=64  # training batch size; validation uses twice this
    fc_dropout=0.2  # not referenced in the visible cells
    target="is_fraud?"  # label column
    target_size=1  # output width of the classification head
    max_len=97  # placeholder: overwritten from the tokenized text lengths in a later cell
    weight_decay=0.01  # applied to backbone weights except bias/LayerNorm parameters
    gradient_accumulation_steps=1
    max_grad_norm=1000  # gradient clipping threshold
    seed=42
    n_fold=5  # number of StratifiedKFold splits
    trn_fold=[0, 1, 2, 3, 4]  # folds that are actually trained
    train=True  # run the training loop in the final cell
    nth_awp_start_epoch=1  # not referenced in the visible cells
    gradient_checkpointing = False  # enable gradient checkpointing on the backbone
    freezing = False  # freeze the embeddings and the first two encoder layers

# Debug mode: shorten the run to two epochs on the first two folds.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    """Print F1, recall and precision at a fixed 0.5 cut-off and return the F1.

    Args:
        labels: ground-truth binary labels.
        outputs: predicted probabilities (array supporting `>` and `.astype`).

    Returns:
        The F1 score of `outputs > 0.5` against `labels`.
    """
    thresh = 0.5
    # Binarise once and reuse the hard predictions for all three metrics.
    hard_preds = (outputs > thresh).astype(int)
    f_score = f1_score(labels, hard_preds)
    r_score = recall_score(labels, hard_preds)
    p_score = precision_score(labels, hard_preds)
    for line in (f"f1 score : {f_score}",
                 f"recall score : {r_score}",
                 f"precision score : {p_score}"):
        print(line)
    return f_score

def get_f1_score(labels, outputs):
    """Return the F1 score at the best decision threshold.

    Thresholds from 0.10 up to (but excluding) 0.80 in steps of 0.01 are
    scanned; the first threshold reaching the highest F1 wins. When no
    threshold scores above zero the default threshold 0.5 is used.

    Args:
        labels: ground-truth binary labels.
        outputs: predicted probabilities (array supporting `>` and `.astype`).

    Returns:
        F1 score of `outputs` binarised at the selected threshold.
    """
    top_score, top_thresh = 0, 0.5
    candidates = (np.round(raw, 2) for raw in np.arange(0.1, 0.80, 0.01))
    for cut in candidates:
        current = f1_score(labels, (outputs > cut).astype(int))
        if current > top_score:
            top_score, top_thresh = current, cut
    return f1_score(labels, (outputs > top_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Create the notebook logger that writes to the console and to a log file.

    Args:
        filename: log file path without extension; ".log" is appended.

    Returns:
        The logger named after this module, at INFO level, with one stream
        handler and one file handler that both emit the bare message.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # would otherwise stack handlers and print every message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # Do not forward records to the root logger: when the root logger has its
    # own handler each message is echoed a second time as "INFO:__main__:...".
    logger.propagate = False
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Shared logger used by the cells below.
LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, the hash seed environment variable, NumPy and
    PyTorch (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuned kernel selection can differ between runs; keep it off so the
    # deterministic flag above is not undermined.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Turn off gradient tracking for every parameter of `module`.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)

def get_freezed_parameters(module):
    """
    List the names of `module`'s parameters that do not require gradients.
    """
    return [
        param_name
        for param_name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an optimizer override for the embedding weights of a model.

    For each of the word, position and token_type embeddings that exists as an
    attribute on `embeddings_path`, an override `{'optim_bits': optim_bits}` is
    registered for its 'weight' through `bnb.optim.GlobalOptimManager`.

    NOTE(review): `bnb` is not imported anywhere in the visible cells
    (presumably `import bitsandbytes as bnb` is intended); as written, calling
    this function raises NameError. It is not called in the visible cells.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """

    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"

        # Skip embedding kinds this architecture does not have.
        if hasattr(embeddings_path, attr_name):
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np


# Read the competition tables from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR,"train.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test.csv"))
card = pd.read_csv(os.path.join(INPUT_DIR, "card.csv"))
user = pd.read_csv(os.path.join(INPUT_DIR, "user.csv"))
# The sample submission is read without a header row.
sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submit.csv"), header=None)

# Preview each table in load order: its shape, then its first three rows.
for _table in (train, test, card, user, sub):
    print(_table.shape)
    display(_table.head(3))

(471283, 12)


Unnamed: 0,index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,0,1721,0,$2.623,OK,0,209237,Joliet,IL,60436.0,5541,Swipe Transaction
1,1,1629,3,$6.4,OK,0,2568,Edgerton,WI,53534.0,5814,Swipe Transaction
2,2,655,3,$123.5,OK,0,345310,Ridgefield,WA,98642.0,7538,Swipe Transaction


(457958, 11)


Unnamed: 0,index,user_id,card_id,amount,errors?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,471283,541,3,$113.278,OK,324189,Orlando,FL,32821.0,4814,Swipe Transaction
1,471284,655,1,$293.944,OK,81219,Ridgefield,WA,98642.0,7538,Chip Transaction
2,471285,492,0,$47.4,OK,274755,Arlington Heights,IL,60004.0,5719,Swipe Transaction


(416, 10)


Unnamed: 0,user_id,card_id,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed
0,39,0,Visa,Debit,09/2021,YES,1,$17117,05/2007,2010
1,39,1,Amex,Credit,11/2024,YES,2,$5400,10/2015,2015
2,41,0,Discover,Credit,03/2022,YES,2,$14800,12/2010,2011


(97, 17)


Unnamed: 0,user_id,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,39,57,64,1962,12,Female,442 Burns Boulevard,Mansfield,MA,2048,42.02,-71.21,$37407,$76274,$102611,698,2
1,41,39,66,1980,10,Female,3863 River Avenue,Lincoln,CA,95648,38.93,-121.25,$21829,$44506,$57994,849,3
2,47,40,67,1979,5,Female,8799 Elm Avenue,Mckinney,TX,75069,33.2,-96.65,$24684,$50329,$76759,625,4


(457958, 2)


Unnamed: 0,0,1
0,471283,0
1,471284,1
2,471285,0


In [10]:
# Maps a zero-padded two-digit month string to its English month name; applied
# below to the month part of the card date columns.
month_dict = {
   "01": "January",
   "02": "February",
   "03": "March",
   "04": "April",
   "05": "May",
   "06": "June",
   "07": "July",
   "08": "August",
   "09": "September",
   "10": "October",
   "11": "November",
   "12": "December"
}

def get_expires_values(df):
    """Split the "expires" column on '/' into month and year string columns.

    Adds "expires_month" (text before the slash) and "expires_years" (text
    after it) to `df` in place.

    Args:
        df: DataFrame with a string column "expires" such as "09/2021".

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # expand=True yields the two parts as columns directly, avoiding the
    # per-row pd.Series construction of `.apply(pd.Series)`.
    parts = df["expires"].str.split('/', expand=True)
    df["expires_month"] = parts[0].astype(str)
    df["expires_years"] = parts[1].astype(str)
    return df

def get_acct_open_date_values(df):
    """Split the "acct_open_date" column on '/' into month and year string columns.

    Adds "acct_open_date_month" (text before the slash) and
    "acct_open_date_years" (text after it) to `df` in place.

    Args:
        df: DataFrame with a string column "acct_open_date" such as "05/2007".

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # expand=True yields the two parts as columns directly, avoiding the
    # per-row pd.Series construction of `.apply(pd.Series)`.
    parts = df["acct_open_date"].str.split('/', expand=True)
    df["acct_open_date_month"] = parts[0].astype(str)
    df["acct_open_date_years"] = parts[1].astype(str)
    return df


# Split both card date columns into month/year parts, then replace the
# two-digit month with its English name via month_dict (months missing from
# the dictionary become NaN).
card = get_expires_values(card)
card = get_acct_open_date_values(card)
card["expires_month"] = card["expires_month"].map(month_dict)
card["acct_open_date_month"] = card["acct_open_date_month"].map(month_dict)

In [11]:
# Attach card attributes (matched on user_id + card_id) and then user
# attributes (matched on user_id) to every transaction; left joins keep all
# rows of train.
# NOTE: this cell is not idempotent - re-running it merges the tables again
# onto the already merged frame.
train = train.merge(card, how="left", on=["user_id", "card_id"]).merge(user, how="left", on="user_id")

In [12]:
# Replace every missing value so the string concatenation below never sees NaN.
train.fillna('unknown', inplace = True)

# Pieces of the model input in order; consecutive pieces are joined with "[SEP]".
_text_pieces = [
    # transaction / merchant section
    "merchant",
    train["amount"],
    train["errors?"],
    train["merchant_city"],
    train["merchant_state"],
    train["use_chip"],
    # card section
    "card",
    train["card_brand"],
    train["card_type"],
    train["expires_month"] + " " + train["expires_years"],
    train["has_chip"],
    train["acct_open_date_month"] + " " + train["acct_open_date_years"],
    train["year_pin_last_changed"].astype(str),
    # user section
    "user",
    train["current_age"].astype(str) + " year old " + train["gender"],
    "retired at age " + train["retirement_age"].astype(str),
    train["address"],
    train["city"],
    train["state"],
    train["per_capita_income_zipcode"],
    train["yearly_income_person"],
    train["total_debt"],
]

_texts = _text_pieces[0]
for _piece in _text_pieces[1:]:
    _texts = _texts + "[SEP]" + _piece
train["texts"] = _texts

In [13]:
# Assign each row to one of CFG.n_fold stratified validation folds.
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
# split() yields positional row indices. Collect them in a NumPy array so the
# assignment does not depend on the labels of train's index, and so the column
# is integer-typed from the start.
fold_ids = np.full(len(train), -1, dtype=int)
for fold, ( _, val_) in enumerate(skf.split(train, train[CFG.target])):
    fold_ids[val_] = fold

train["kfold"] = fold_ids

# Debug mode: show fold sizes before and after shrinking to 500 random rows.
if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [14]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that matches the configured backbone, keep a copy next to
# the experiment outputs, and expose it through CFG for the dataset code.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [15]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every training text, measured without special tokens.
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tk0
]
# Longest text plus a fixed margin of 23 tokens (originally annotated "cls").
CFG.max_len = max(lengths) + 23 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 471283/471283 [01:34<00:00, 4974.30it/s]
max_len: 105
INFO:__main__:max_len: 105


In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize one text into fixed-length long tensors.

    Args:
        cfg: object providing `tokenizer` and `max_len`.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping, with every value converted to a
        `torch.long` tensor. Sequences are padded/truncated to `cfg.max_len`.
    """
    tokenizer_kwargs = dict(
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    encoded = cfg.tokenizer(text, **tokenizer_kwargs)
    # Convert in place so the mapping type returned by the tokenizer is kept.
    for field, values in list(encoded.items()):
        encoded[field] = torch.tensor(values, dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Training dataset yielding (tokenized text, label) pairs.

    Args:
        cfg: configuration object; `cfg.target` names the label column and the
            object is forwarded to `prepare_input` for tokenization.
        df: DataFrame with a "texts" column and the label column.
    """
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Read the label column from the cfg passed in rather than the global
        # CFG, so the dataset honours whatever configuration it is given.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        # Labels are half precision here (the validation dataset uses float).
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    """Trim a padded batch to its longest real sequence.

    The longest row is found from the attention mask; every tensor in
    `inputs` is cut to that many columns in place.

    Returns:
        The same mapping object, with trimmed tensors.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for field in list(inputs.keys()):
        inputs[field] = inputs[field][:, :longest]
    return inputs

class ValidDataset(Dataset):
    """Validation dataset yielding (tokenized text, label) pairs.

    Args:
        cfg: configuration object; `cfg.target` names the label column and the
            object is forwarded to `prepare_input` for tokenization.
        df: DataFrame with a "texts" column and the label column.
    """
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        # Read the label column from the cfg passed in rather than the global
        # CFG, so the dataset honours whatever configuration it is given.
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        # Labels are full-precision floats here (the training dataset uses half).
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

# NOTE: this redefines the `collate` function declared earlier in this cell
# with the same behavior; this later definition is the one in effect.
def collate(inputs):
    """Cut every tensor in the batch down to the longest unpadded length.

    The length is the largest per-row sum of the attention mask. The mapping
    is modified in place and returned.
    """
    keep = int(inputs["attention_mask"].sum(axis=1).max())
    column_window = slice(None, keep)
    for name in tuple(inputs.keys()):
        inputs[name] = inputs[name][:, column_window]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over the sequence dimension."""
    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked-out positions
        # contribute nothing to the sum.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp the divisor so a fully masked row cannot divide by zero.
        token_counts = torch.clamp(weights.sum(1), min=1e-9)
        return torch.sum(last_hidden_state * weights, 1) / token_counts

class MaxPooling(nn.Module):
    """Element-wise maximum over the sequence dimension, ignoring masked positions."""
    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Work on a copy: masked positions get a large negative value so they
        # can never win the max, and the caller's tensor stays untouched.
        masked_states = last_hidden_state.masked_fill(mask == 0, -1e4)
        return torch.max(masked_states, dim=1).values


class CustomModel(nn.Module):
    """Transformer backbone with mean pooling, LayerNorm and a linear head.

    Args:
        cfg: configuration object (uses `model`, `gradient_checkpointing`,
            `freezing` and `target_size`).
        config_path: optional path to a backbone config saved with
            `torch.save`; when None the config is fetched for `cfg.model` with
            all dropout probabilities set to zero.
        pretrained: load pretrained backbone weights when True, otherwise
            build a randomly initialised backbone from the config.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Disable every dropout in the backbone.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE: torch.load unpickles the file; only load configs you trust.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated through its constructor; an
            # untrained model has to be built with from_config.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # Stored as a one-shot iterator over the still-trainable backbone parameters.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # Kept for optional use; forward() returns raw logits.
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        """Initialise `module` following the backbone's `initializer_range`."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and mean-pool its first output over unmasked tokens."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return logits (no sigmoid applied) for a batch of tokenized inputs."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric."""
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, running average, weighted sum and count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` observed over `n` samples and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed time and the projected time remaining.

    Args:
        since: start timestamp as returned by `time.time()`.
        percent: fraction of the work completed so far (must be non-zero).

    Returns:
        A string '<elapsed> (remain <remaining>)' formatted with `asMinutes`.
    """
    elapsed = time.time() - since
    # Projected total duration minus what has already passed.
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train `model` for one epoch with mixed precision.

    Args:
        fold: index of the current fold (not used inside the function).
        train_loader: DataLoader yielding (inputs, labels) batches.
        model: the model to train.
        criterion: loss applied to logits and labels reshaped to (-1, 1).
        optimizer: optimizer stepped every `CFG.gradient_accumulation_steps` batches.
        epoch: zero-based epoch index, used for logging only.
        scheduler: LR scheduler, stepped after each optimizer step when
            `CFG.batch_scheduler` is set.
        device: device the batch tensors are moved to.

    Returns:
        The sample-weighted average of the (accumulation-scaled) batch losses.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    accumulation_steps = CFG.gradient_accumulation_steps
    # Logged until the first optimizer step has produced a real gradient norm.
    grad_norm = float('nan')
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if accumulation_steps > 1:
            loss = loss / accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % accumulation_steps == 0:
            # Unscale and clip only right before the optimizer step:
            # GradScaler.unscale_ may be called once per optimizer step, and
            # clipping is meant to see the fully accumulated gradients.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          # get_last_lr() is the supported way to read the current LR.
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate `model` on the validation loader.

    Args:
        valid_loader: DataLoader yielding (inputs, labels) batches.
        model: the model to evaluate (switched to eval mode).
        criterion: loss applied to logits and labels reshaped to (-1, 1).
        device: device the batch tensors are moved to.

    Returns:
        (average loss, predictions), where predictions is a flat NumPy array of
        sigmoid probabilities in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        # Same scaling as in train_fn, so the two logged losses share a scale.
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    # Stack the per-batch arrays, then flatten in one step instead of
    # concatenating the stacked array row by row.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Predict sigmoid probabilities for every batch of `test_loader`.

    Args:
        test_loader: DataLoader yielding tokenized input batches (no labels).
        model: the model to run; moved to `device` and set to eval mode.
        device: device used for inference.

    Returns:
        NumPy array with the per-batch model outputs concatenated in loader order.
    """
    model.eval()
    model.to(device)
    batch_probs = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        batch = collate(batch)
        for field, tensor in batch.items():
            batch[field] = tensor.to(device)
        with torch.no_grad():
            logits = model(batch)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate the model for one cross-validation fold.

    Rows whose `kfold` equals `fold` form the validation set; all other rows
    are used for training. After every epoch the model is scored, and the
    checkpoint with the best threshold-tuned F1 is saved together with its
    validation predictions.

    Args:
        folds: DataFrame with "texts", the target column and "kfold".
        fold: index of the fold held out for validation.

    Returns:
        The validation rows with an added "pred" column holding the
        predictions of the best checkpoint.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build optimizer groups: decayed backbone weights, non-decayed
        backbone bias/LayerNorm parameters, and everything outside the backbone."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    # NOTE: `AdamW` is imported from both torch.optim and transformers in the
    # imports cell; the name resolves to whichever import ran last.
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the LR schedule selected by `cfg.scheduler`."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            raise ValueError(f"Unknown scheduler: {cfg.scheduler!r}")
        return scheduler

    # The scheduler advances once per optimizer step, so count the steps that
    # train_fn really performs: full batches only (drop_last=True), grouped by
    # the gradient accumulation factor.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring: F1 at the fixed 0.5 cut-off, and F1 at the best threshold
        score = get_score(valid_labels, predictions)
        tuned_f1 = get_f1_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')


        # Checkpoint selection uses the threshold-tuned F1, not the 0.5 score.
        if best_score < tuned_f1:
            best_score = tuned_f1
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    # Reload the predictions stored with the best checkpoint for the OOF frame.
    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [20]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the F1 at threshold 0.5 and the best threshold-tuned F1 of `oof_df`."""
        labels = oof_df[CFG.target].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        tuned_f1 = get_f1_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'F1 BEST Score: {tuned_f1:<.4f}')

    if CFG.train:
        # Collect the out-of-fold frames and concatenate once at the end
        # instead of growing a DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')
        else:
            # No fold of range(CFG.n_fold) is listed in CFG.trn_fold: there is
            # nothing to score or save.
            oof_df = pd.DataFrame()
            LOGGER.info("No folds were trained; skipping CV scoring.")

DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

INFO:__main__:DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropo

Downloading pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Epoch: [1][0/5891] Elapsed 0m 4s (remain 478m 15s) Loss: 0.8276(0.8276) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 18s (remain 17m 41s) Loss: 0.3313(0.2841) Grad: 1.7755  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 32s (remain 15m 9s) Loss: 0.2351(0.2675) Grad: 0.3507  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 45s (remain 14m 9s) Loss: 0.2791(0.2580) Grad: 0.9121  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 59s (remain 13m 34s) Loss: 0.2146(0.2548) Grad: 0.7683  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 13s (remain 13m 6s) Loss: 0.2644(0.2490) Grad: 1.0419  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 26s (remain 12m 43s) Loss: 0.3262(0.2432) Grad: 1.7449  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 40s (remain 12m 22s) Loss: 0.3232(0.2394) Grad: 1.5144  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 53s (remain 12m 3s) Loss: 0.1807(0.2348) Grad: 2.0245  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 7s (remain 11m 46s) Loss: 0.2012

Epoch 1 - avg_train_loss: 0.1944  avg_val_loss: 0.1736  time: 952s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1944  avg_val_loss: 0.1736  time: 952s
Epoch 1 - Score: 0.3136
INFO:__main__:Epoch 1 - Score: 0.3136
Epoch 1 - Save Best Score: 0.5062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5062 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 40m 5s) Loss: 0.2360(0.2360) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 14s (remain 13m 25s) Loss: 0.0753(0.1758) Grad: 1.4697  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 27s (remain 13m 5s) Loss: 0.1736(0.1738) Grad: 1.9996  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 41s (remain 12m 49s) Loss: 0.2410(0.1711) Grad: 0.9795  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 55s (remain 12m 33s) Loss: 0.1034(0.1694) Grad: 1.0205  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 8s (remain 12m 18s) Loss: 0.1292(0.1694) Grad: 1.2446  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 22s (remain 12m 4s) Loss: 0.1627(0.1677) Grad: 0.7665  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 35s (remain 11m 49s) Loss: 0.1267(0.1693) Grad: 1.4422  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 49s (remain 11m 36s) Loss: 0.2607(0.1701) Grad: 1.4931  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 3s (remain 11m 22s) Loss: 0.1796(

Epoch 2 - avg_train_loss: 0.1661  avg_val_loss: 0.1608  time: 947s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1661  avg_val_loss: 0.1608  time: 947s
Epoch 2 - Score: 0.4696
INFO:__main__:Epoch 2 - Score: 0.4696
Epoch 2 - Save Best Score: 0.5323 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5323 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 42m 35s) Loss: 0.1603(0.1603) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 14s (remain 13m 37s) Loss: 0.1777(0.1487) Grad: 1.2537  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 27s (remain 13m 12s) Loss: 0.0784(0.1565) Grad: 0.9029  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 41s (remain 12m 52s) Loss: 0.1001(0.1585) Grad: 0.7828  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 55s (remain 12m 35s) Loss: 0.2252(0.1556) Grad: 0.7893  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 8s (remain 12m 20s) Loss: 0.1398(0.1548) Grad: 1.3752  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 22s (remain 12m 5s) Loss: 0.1935(0.1551) Grad: 1.1626  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 36s (remain 11m 51s) Loss: 0.0834(0.1547) Grad: 1.3585  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 49s (remain 11m 36s) Loss: 0.0690(0.1547) Grad: 0.8778  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 3s (remain 11m 22s) Loss: 0.114

Epoch 3 - avg_train_loss: 0.1526  avg_val_loss: 0.1567  time: 948s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1526  avg_val_loss: 0.1567  time: 948s
Epoch 3 - Score: 0.4894
INFO:__main__:Epoch 3 - Score: 0.4894
Epoch 3 - Save Best Score: 0.5482 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5482 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 43m 17s) Loss: 0.3186(0.3186) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 14s (remain 13m 37s) Loss: 0.2306(0.1389) Grad: 1.0361  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 28s (remain 13m 14s) Loss: 0.2037(0.1416) Grad: 1.3166  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 41s (remain 12m 54s) Loss: 0.0644(0.1384) Grad: 0.3798  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 55s (remain 12m 38s) Loss: 0.1830(0.1402) Grad: 1.4394  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 8s (remain 12m 22s) Loss: 0.1147(0.1402) Grad: 1.1073  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 22s (remain 12m 7s) Loss: 0.1582(0.1404) Grad: 2.4766  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 36s (remain 11m 53s) Loss: 0.1602(0.1417) Grad: 1.7273  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 50s (remain 11m 39s) Loss: 0.0755(0.1425) Grad: 0.9171  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 3s (remain 11m 25s) Loss: 0.096

Epoch 4 - avg_train_loss: 0.1409  avg_val_loss: 0.1579  time: 948s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1409  avg_val_loss: 0.1579  time: 948s
Epoch 4 - Score: 0.4987
INFO:__main__:Epoch 4 - Score: 0.4987
Epoch 4 - Save Best Score: 0.5490 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5490 Model


f1 score : 0.498697133694127
recall score : 0.3812442537542139
precision score : 0.7207415990730012


Score: 0.4987
INFO:__main__:Score: 0.4987
F1 BEST Score: 0.5490
INFO:__main__:F1 BEST Score: 0.5490
DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

INFO:__main__:DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "

Epoch: [1][0/5891] Elapsed 0m 0s (remain 43m 32s) Loss: 0.6562(0.6562) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 14s (remain 13m 38s) Loss: 0.2808(0.2594) Grad: 1.2877  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 27s (remain 13m 12s) Loss: 0.2145(0.2419) Grad: 0.5710  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 41s (remain 12m 56s) Loss: 0.1682(0.2350) Grad: 0.6219  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 55s (remain 12m 39s) Loss: 0.1038(0.2301) Grad: 0.9904  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 9s (remain 12m 24s) Loss: 0.2181(0.2276) Grad: 1.2125  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 22s (remain 12m 9s) Loss: 0.1779(0.2254) Grad: 0.6568  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 36s (remain 11m 54s) Loss: 0.0991(0.2221) Grad: 1.4799  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 50s (remain 11m 39s) Loss: 0.2219(0.2214) Grad: 1.2044  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 3s (remain 11m 25s) Loss: 0.140

Epoch 1 - avg_train_loss: 0.1899  avg_val_loss: 0.1713  time: 949s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1899  avg_val_loss: 0.1713  time: 949s
Epoch 1 - Score: 0.4515
INFO:__main__:Epoch 1 - Score: 0.4515
Epoch 1 - Save Best Score: 0.5059 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5059 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 40m 24s) Loss: 0.3027(0.3027) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 14s (remain 13m 34s) Loss: 0.3059(0.1903) Grad: 1.4016  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 27s (remain 13m 9s) Loss: 0.0408(0.1763) Grad: 1.1977  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 41s (remain 12m 50s) Loss: 0.1345(0.1708) Grad: 0.8838  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 55s (remain 12m 35s) Loss: 0.0831(0.1735) Grad: 0.5403  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 8s (remain 12m 21s) Loss: 0.4119(0.1732) Grad: 2.1339  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 22s (remain 12m 7s) Loss: 0.1559(0.1710) Grad: 1.2346  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 36s (remain 11m 52s) Loss: 0.2201(0.1692) Grad: 1.1532  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 49s (remain 11m 38s) Loss: 0.1040(0.1692) Grad: 0.5573  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 3s (remain 11m 24s) Loss: 0.0914

Epoch 2 - avg_train_loss: 0.1641  avg_val_loss: 0.1612  time: 949s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1641  avg_val_loss: 0.1612  time: 949s
Epoch 2 - Score: 0.4450
INFO:__main__:Epoch 2 - Score: 0.4450
Epoch 2 - Save Best Score: 0.5366 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5366 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 41m 42s) Loss: 0.2388(0.2388) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 14s (remain 13m 49s) Loss: 0.1841(0.1529) Grad: 1.0628  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 28s (remain 13m 18s) Loss: 0.1250(0.1592) Grad: 0.8655  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 42s (remain 13m 0s) Loss: 0.0698(0.1583) Grad: 0.9869  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 55s (remain 12m 41s) Loss: 0.1394(0.1597) Grad: 0.8711  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 9s (remain 12m 25s) Loss: 0.1453(0.1578) Grad: 0.9390  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 22s (remain 12m 10s) Loss: 0.0963(0.1569) Grad: 0.5877  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 36s (remain 11m 54s) Loss: 0.0967(0.1562) Grad: 0.8577  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 50s (remain 11m 39s) Loss: 0.1482(0.1550) Grad: 1.2801  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 3s (remain 11m 25s) Loss: 0.118

Epoch 3 - avg_train_loss: 0.1515  avg_val_loss: 0.1562  time: 947s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1515  avg_val_loss: 0.1562  time: 947s
Epoch 3 - Score: 0.4922
INFO:__main__:Epoch 3 - Score: 0.4922
Epoch 3 - Save Best Score: 0.5527 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5527 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 40m 0s) Loss: 0.1273(0.1273) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 14s (remain 13m 31s) Loss: 0.2256(0.1546) Grad: 1.2909  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 27s (remain 13m 9s) Loss: 0.2542(0.1501) Grad: 1.6616  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 41s (remain 12m 50s) Loss: 0.1008(0.1492) Grad: 0.7915  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 55s (remain 12m 34s) Loss: 0.0688(0.1473) Grad: 1.1621  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 8s (remain 12m 19s) Loss: 0.1032(0.1454) Grad: 0.7176  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 22s (remain 12m 5s) Loss: 0.1707(0.1442) Grad: 1.1925  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 36s (remain 11m 50s) Loss: 0.0842(0.1429) Grad: 1.1783  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 49s (remain 11m 36s) Loss: 0.1923(0.1416) Grad: 1.4552  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 3s (remain 11m 22s) Loss: 0.1902(

Epoch 4 - avg_train_loss: 0.1396  avg_val_loss: 0.1574  time: 947s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1396  avg_val_loss: 0.1574  time: 947s
Epoch 4 - Score: 0.5158
INFO:__main__:Epoch 4 - Score: 0.5158
Epoch 4 - Save Best Score: 0.5559 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5559 Model


f1 score : 0.5157587548638132
recall score : 0.40622126877106957
precision score : 0.7061800745871071


Score: 0.5158
INFO:__main__:Score: 0.5158
F1 BEST Score: 0.5559
INFO:__main__:F1 BEST Score: 0.5559
DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

INFO:__main__:DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "

Epoch: [1][0/5891] Elapsed 0m 0s (remain 44m 19s) Loss: 1.1240(1.1240) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 14s (remain 13m 36s) Loss: 0.2825(0.2862) Grad: 1.7626  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 27s (remain 13m 11s) Loss: 0.2751(0.2701) Grad: 0.7446  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 41s (remain 12m 52s) Loss: 0.3625(0.2582) Grad: 2.2695  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 55s (remain 12m 35s) Loss: 0.3142(0.2487) Grad: 1.8777  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 8s (remain 12m 19s) Loss: 0.2006(0.2423) Grad: 0.8145  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 22s (remain 12m 5s) Loss: 0.2004(0.2405) Grad: 0.4787  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 36s (remain 11m 50s) Loss: 0.2418(0.2385) Grad: 1.7961  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 49s (remain 11m 36s) Loss: 0.2201(0.2346) Grad: 0.9911  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 3s (remain 11m 22s) Loss: 0.241

Epoch 1 - avg_train_loss: 0.1971  avg_val_loss: 0.1742  time: 947s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1971  avg_val_loss: 0.1742  time: 947s
Epoch 1 - Score: 0.3855
INFO:__main__:Epoch 1 - Score: 0.3855
Epoch 1 - Save Best Score: 0.4959 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4959 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 40m 27s) Loss: 0.1038(0.1038) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 14s (remain 13m 30s) Loss: 0.2115(0.1676) Grad: 1.0995  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 27s (remain 13m 7s) Loss: 0.1197(0.1681) Grad: 0.5841  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 41s (remain 12m 49s) Loss: 0.0806(0.1707) Grad: 1.2910  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 55s (remain 12m 34s) Loss: 0.2219(0.1718) Grad: 1.1772  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 8s (remain 12m 20s) Loss: 0.1284(0.1725) Grad: 0.8861  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 22s (remain 12m 5s) Loss: 0.2126(0.1729) Grad: 1.1554  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 36s (remain 11m 51s) Loss: 0.1057(0.1720) Grad: 0.7189  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 49s (remain 11m 37s) Loss: 0.1644(0.1720) Grad: 1.2818  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 3s (remain 11m 24s) Loss: 0.1041

Epoch 2 - avg_train_loss: 0.1678  avg_val_loss: 0.1636  time: 950s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1678  avg_val_loss: 0.1636  time: 950s
Epoch 2 - Score: 0.4467
INFO:__main__:Epoch 2 - Score: 0.4467
Epoch 2 - Save Best Score: 0.5224 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5224 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 41m 59s) Loss: 0.0456(0.0456) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 14s (remain 13m 36s) Loss: 0.2157(0.1670) Grad: 1.1462  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 28s (remain 13m 12s) Loss: 0.1373(0.1636) Grad: 1.0270  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 42s (remain 13m 0s) Loss: 0.0498(0.1627) Grad: 1.0469  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 55s (remain 12m 42s) Loss: 0.1885(0.1569) Grad: 1.6115  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 9s (remain 12m 26s) Loss: 0.2251(0.1556) Grad: 1.2940  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 23s (remain 12m 11s) Loss: 0.1892(0.1546) Grad: 1.5631  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 36s (remain 11m 57s) Loss: 0.1459(0.1531) Grad: 1.2010  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 50s (remain 11m 42s) Loss: 0.1829(0.1541) Grad: 1.3491  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 4s (remain 11m 27s) Loss: 0.122

Epoch 3 - avg_train_loss: 0.1554  avg_val_loss: 0.1579  time: 954s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1554  avg_val_loss: 0.1579  time: 954s
Epoch 3 - Score: 0.4833
INFO:__main__:Epoch 3 - Score: 0.4833
Epoch 3 - Save Best Score: 0.5374 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5374 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 41m 33s) Loss: 0.0848(0.0848) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 14s (remain 13m 34s) Loss: 0.1715(0.1450) Grad: 1.4324  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 27s (remain 13m 11s) Loss: 0.1057(0.1486) Grad: 1.3773  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 41s (remain 12m 55s) Loss: 0.1174(0.1482) Grad: 1.6458  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 55s (remain 12m 41s) Loss: 0.0773(0.1474) Grad: 1.4206  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 9s (remain 12m 27s) Loss: 0.0999(0.1458) Grad: 1.2844  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 23s (remain 12m 12s) Loss: 0.0872(0.1464) Grad: 0.8370  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 37s (remain 11m 58s) Loss: 0.0573(0.1483) Grad: 1.1984  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 50s (remain 11m 44s) Loss: 0.0713(0.1486) Grad: 1.0431  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 4s (remain 11m 30s) Loss: 0.22

Epoch 4 - avg_train_loss: 0.1461  avg_val_loss: 0.1575  time: 958s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1461  avg_val_loss: 0.1575  time: 958s
Epoch 4 - Score: 0.4979
INFO:__main__:Epoch 4 - Score: 0.4979
Epoch 4 - Save Best Score: 0.5422 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5422 Model


f1 score : 0.4978869778869778
recall score : 0.3880802819059292
precision score : 0.6943530701754386


Score: 0.4979
INFO:__main__:Score: 0.4979
F1 BEST Score: 0.5422
INFO:__main__:F1 BEST Score: 0.5422
DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.33.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

INFO:__main__:DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_dropout": 0.0,
  "

Epoch: [1][0/5891] Elapsed 0m 0s (remain 50m 17s) Loss: 0.7271(0.7271) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 14s (remain 13m 57s) Loss: 0.1943(0.2834) Grad: 0.9000  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 28s (remain 13m 23s) Loss: 0.2705(0.2668) Grad: 0.2353  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 42s (remain 13m 4s) Loss: 0.2332(0.2554) Grad: 1.3260  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 56s (remain 12m 47s) Loss: 0.1755(0.2506) Grad: 0.9465  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 9s (remain 12m 31s) Loss: 0.1655(0.2460) Grad: 0.8275  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 23s (remain 12m 16s) Loss: 0.1862(0.2410) Grad: 0.6854  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 37s (remain 12m 1s) Loss: 0.1593(0.2389) Grad: 1.6101  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 51s (remain 11m 48s) Loss: 0.0912(0.2351) Grad: 1.3201  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 5s (remain 11m 34s) Loss: 0.1923

Epoch 1 - avg_train_loss: 0.1934  avg_val_loss: 0.1741  time: 961s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1934  avg_val_loss: 0.1741  time: 961s
Epoch 1 - Score: 0.4624
INFO:__main__:Epoch 1 - Score: 0.4624
Epoch 1 - Save Best Score: 0.4975 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4975 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 47m 9s) Loss: 0.3818(0.3818) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 14s (remain 13m 43s) Loss: 0.2408(0.1653) Grad: 1.7195  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 28s (remain 13m 21s) Loss: 0.1881(0.1634) Grad: 1.6039  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 42s (remain 13m 3s) Loss: 0.2832(0.1646) Grad: 1.3350  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 55s (remain 12m 46s) Loss: 0.1855(0.1656) Grad: 1.3089  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 9s (remain 12m 30s) Loss: 0.1344(0.1642) Grad: 0.7507  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 23s (remain 12m 15s) Loss: 0.0931(0.1636) Grad: 0.6160  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 37s (remain 12m 1s) Loss: 0.0660(0.1649) Grad: 0.6459  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 51s (remain 11m 46s) Loss: 0.1063(0.1648) Grad: 0.9193  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 4s (remain 11m 32s) Loss: 0.0967(

Epoch 2 - avg_train_loss: 0.1648  avg_val_loss: 0.1622  time: 959s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1648  avg_val_loss: 0.1622  time: 959s
Epoch 2 - Score: 0.4857
INFO:__main__:Epoch 2 - Score: 0.4857
Epoch 2 - Save Best Score: 0.5299 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5299 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 45m 56s) Loss: 0.2133(0.2133) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 14s (remain 13m 56s) Loss: 0.2003(0.1562) Grad: 1.0258  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 29s (remain 13m 41s) Loss: 0.1306(0.1587) Grad: 1.5612  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 42s (remain 13m 15s) Loss: 0.1351(0.1585) Grad: 0.6047  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 56s (remain 12m 56s) Loss: 0.1522(0.1576) Grad: 1.2625  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 10s (remain 12m 39s) Loss: 0.1058(0.1585) Grad: 1.3314  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 24s (remain 12m 23s) Loss: 0.1938(0.1578) Grad: 0.9858  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 38s (remain 12m 7s) Loss: 0.2285(0.1579) Grad: 1.6875  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 52s (remain 11m 52s) Loss: 0.0536(0.1591) Grad: 1.0626  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 6s (remain 11m 38s) Loss: 0.22

Epoch 3 - avg_train_loss: 0.1515  avg_val_loss: 0.1580  time: 961s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

Epoch: [4][0/5891] Elapsed 0m 0s (remain 46m 53s) Loss: 0.1223(0.1223) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 14s (remain 13m 59s) Loss: 0.1273(0.1443) Grad: 1.2919  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 28s (remain 13m 36s) Loss: 0.1492(0.1435) Grad: 1.1382  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 42s (remain 13m 12s) Loss: 0.1736(0.1433) Grad: 1.1919  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 56s (remain 12m 53s) Loss: 0.0793(0.1411) Grad: 0.6893  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 10s (remain 12m 36s) Loss: 0.1337(0.1429) Grad: 1.3186  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 24s (remain 12m 20s) Loss: 0.0480(0.1421) Grad: 0.5619  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 37s (remain 12m 5s) Loss: 0.1425(0.1433) Grad: 1.3219  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 51s (remain 11m 50s) Loss: 0.1086(0.1420) Grad: 1.4986  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 5s (remain 11m 35s) Loss: 0.11

Epoch 4 - avg_train_loss: 0.1401  avg_val_loss: 0.1582  time: 958s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

f1 score : 0.5007895775759967
recall score : 0.38875268158136683
precision score : 0.7035496394897394


Score: 0.5008
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self

Epoch: [1][0/5891] Elapsed 0m 0s (remain 48m 18s) Loss: 0.6343(0.6343) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 14s (remain 13m 50s) Loss: 0.1226(0.2582) Grad: 2.1578  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 28s (remain 13m 19s) Loss: 0.2301(0.2531) Grad: 15.8501  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 42s (remain 13m 2s) Loss: 0.3501(0.2546) Grad: 0.9165  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 0m 56s (remain 12m 47s) Loss: 0.1350(0.2528) Grad: 0.6461  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 9s (remain 12m 31s) Loss: 0.2776(0.2454) Grad: 0.7633  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 23s (remain 12m 16s) Loss: 0.1487(0.2395) Grad: 0.8888  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 37s (remain 12m 1s) Loss: 0.2244(0.2374) Grad: 0.7041  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 1m 51s (remain 11m 47s) Loss: 0.2595(0.2358) Grad: 0.5601  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 5s (remain 11m 33s) Loss: 0.289

Epoch 1 - avg_train_loss: 0.1959  avg_val_loss: 0.1774  time: 958s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

Epoch: [2][0/5891] Elapsed 0m 0s (remain 46m 9s) Loss: 0.1923(0.1923) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 14s (remain 13m 41s) Loss: 0.1538(0.1748) Grad: 1.5196  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 28s (remain 13m 20s) Loss: 0.3408(0.1710) Grad: 2.0229  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 42s (remain 13m 2s) Loss: 0.1677(0.1736) Grad: 1.0721  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 0m 55s (remain 12m 45s) Loss: 0.2507(0.1751) Grad: 1.0541  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 9s (remain 12m 30s) Loss: 0.1935(0.1751) Grad: 0.8424  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 23s (remain 12m 15s) Loss: 0.0962(0.1738) Grad: 1.3422  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 37s (remain 12m 1s) Loss: 0.1266(0.1724) Grad: 0.4778  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 1m 51s (remain 11m 46s) Loss: 0.1594(0.1734) Grad: 1.1946  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 5s (remain 11m 32s) Loss: 0.1141(

Epoch 2 - avg_train_loss: 0.1653  avg_val_loss: 0.1647  time: 959s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

Epoch: [3][0/5891] Elapsed 0m 0s (remain 44m 22s) Loss: 0.0776(0.0776) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 14s (remain 13m 45s) Loss: 0.0882(0.1574) Grad: 0.4905  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 28s (remain 13m 25s) Loss: 0.2441(0.1532) Grad: 1.2488  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 42s (remain 13m 4s) Loss: 0.0496(0.1518) Grad: 0.7741  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 0m 56s (remain 12m 47s) Loss: 0.2649(0.1541) Grad: 2.1798  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 9s (remain 12m 31s) Loss: 0.0659(0.1545) Grad: 0.6755  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 23s (remain 12m 16s) Loss: 0.1150(0.1553) Grad: 0.9413  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 37s (remain 12m 2s) Loss: 0.0987(0.1546) Grad: 1.0377  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 1m 51s (remain 11m 48s) Loss: 0.1678(0.1548) Grad: 0.9864  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 5s (remain 11m 34s) Loss: 0.1957

Epoch 3 - avg_train_loss: 0.1526  avg_val_loss: 0.1592  time: 960s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

Epoch: [4][0/5891] Elapsed 0m 0s (remain 44m 15s) Loss: 0.0661(0.0661) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 14s (remain 13m 50s) Loss: 0.2013(0.1368) Grad: 1.0347  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 28s (remain 13m 26s) Loss: 0.1631(0.1379) Grad: 0.6474  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 42s (remain 13m 6s) Loss: 0.0771(0.1342) Grad: 1.6613  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 0m 56s (remain 12m 48s) Loss: 0.2006(0.1346) Grad: 2.1122  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 10s (remain 12m 33s) Loss: 0.2112(0.1365) Grad: 1.7112  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 23s (remain 12m 18s) Loss: 0.1177(0.1377) Grad: 1.1607  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 37s (remain 12m 3s) Loss: 0.1915(0.1375) Grad: 1.4108  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 1m 51s (remain 11m 48s) Loss: 0.0983(0.1374) Grad: 1.0148  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 5s (remain 11m 34s) Loss: 0.098

Epoch 4 - avg_train_loss: 0.1410  avg_val_loss: 0.1601  time: 959s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/torn

f1 score : 0.49622566547477154
recall score : 0.38277658596383696
precision score : 0.7052512704686618


Score: 0.4962
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self

f1 score : 0.5019256216301621
recall score : 0.38941497349146514
precision score : 0.7058660148872348


Score: 0.5019
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    self

In [None]:
# Release the Colab runtime once the cells above have finished, so the GPU
# VM is not left allocated after the training run.
# NOTE(review): the training output above contains repeated
# "OSError: [Errno 107] Transport endpoint is not connected" logging errors,
# which suggests the Drive mount dropped mid-run — confirm that checkpoints
# and logs were actually persisted before running this cell; files left only
# on the VM's local disk are presumably lost once the runtime is released.
from google.colab import runtime
runtime.unassign()  # per Colab docs, disconnects from and deletes the current runtime — TODO confirm