In [None]:
# Mount Google Drive at /content/drive; the data/output paths used below live under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.0 MB/s[0m eta [36m0:00:0

In [None]:
!nvidia-smi

Sat Aug 26 09:09:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    25W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import os

# Project root on the mounted Drive; inputs and outputs sit side by side beneath it.
DIR = "/content/drive/MyDrive/Competitions/Signate/MUFG2023"
INPUT_DIR = os.path.join(DIR, "input")
OUTPUT_DIR = os.path.join(DIR, "output")
# Keeps its trailing slash: later cells build file names by plain string concatenation.
OUTPUT_EXP_DIR = DIR + '/output/EXP026/'

# Create the output folders when they are missing (parent first, then the experiment folder).
for _out_dir in (OUTPUT_DIR, OUTPUT_EXP_DIR):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

In [None]:


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration; the rest of the notebook reads these as class attributes."""

    # ---- run control ----
    debug = False                    # True -> shortened run (see the override below the class)
    train = True                     # gate for the cross-validation training cell
    seed = 42
    print_freq = 100                 # progress line every N batches
    num_workers = 4                  # DataLoader worker processes

    # ---- backbone ----
    model = "microsoft/deberta-v3-base"
    # Alternatives that were listed here as commented-out options:
    #   microsoft/deberta-base, microsoft/deberta-large, microsoft/deberta-v3-large,
    #   microsoft/deberta-v2-xlarge, roberta-base, roberta-large, roberta-large-mnli,
    #   xlnet-large-cased, albert-base-v2, albert-large-v2, albert-xxlarge-v2,
    #   funnel-transformer/medium, funnel-transformer/large,
    #   google/electra-base-discriminator, google/electra-large-discriminator,
    #   facebook/bart-base, facebook/bart-large, facebook/bart-large-mnli
    gradient_checkpointing = False
    freezing = False                 # True -> CustomModel freezes embeddings and the first 2 encoder layers
    fc_dropout = 0.2                 # NOTE(review): not read by any cell visible here
    max_len = 256                    # overwritten later from the tokenised text lengths

    # ---- target ----
    target = "is_fraud?"
    target_size = 1

    # ---- optimisation ----
    apex = True                      # enables torch.cuda.amp autocast / GradScaler in train_fn
    epochs = 4
    batch_size = 64
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6                    # NOTE(review): not read by any cell visible here
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    nth_awp_start_epoch = 1          # NOTE(review): not read by any cell visible here

    # ---- LR schedule ----
    scheduler = 'cosine'             # ['linear', 'cosine']
    batch_scheduler = True           # step the scheduler per optimizer step
    num_cycles = 0.5
    num_warmup_steps = 0

    # ---- cross-validation ----
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]


# Debug runs use fewer epochs and only the first two folds.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0, 1]

In [None]:
def get_score(labels, outputs, thresh=0.5):
    """Print F1 / recall / precision at a fixed threshold and return the F1.

    Args:
        labels: ground-truth binary labels (array-like).
        outputs: predicted scores; must support ``> thresh`` and ``.astype``
            (e.g. a NumPy array).
        thresh: a score strictly above this value counts as the positive class.

    Returns:
        The F1 score of the thresholded predictions.
    """
    y_true = labels
    # Binarise once and reuse it for all three metrics.
    y_hat = (outputs > thresh).astype(int)
    f_score = f1_score(y_true, y_hat)
    r_score = recall_score(y_true, y_hat)
    p_score = precision_score(y_true, y_hat)
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return f_score

def get_f1_score(labels, outputs):
    """Return the best F1 obtainable by sweeping the decision threshold.

    Thresholds 0.10, 0.11, ... (step 0.01, upper bound 0.80 exclusive) are tried;
    a score strictly above the threshold counts as the positive class.

    Args:
        labels: ground-truth binary labels (array-like).
        outputs: predicted scores; must support ``> thresh`` and ``.astype``.

    Returns:
        The highest F1 found over the sweep.
    """
    thresholds = np.round(np.arange(0.1, 0.80, 0.01), 2)
    # The maximum over the sweep is exactly the F1 at the best threshold, so
    # there is no need to recompute it once the best threshold is known.
    return max(f1_score(labels, (outputs > thresh).astype(int)) for thresh in thresholds)


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Return the notebook logger, writing bare messages to the console and to a file.

    Args:
        filename: log file path without extension; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` named after this module, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would stack another pair of handlers and print each message once more.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    # Keep records away from the root logger; otherwise an environment with a
    # configured root handler prints every message a second time
    # (the "INFO:__main__:..." lines in the captured output).
    logger.propagate = False
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour.

    Args:
        seed: integer seed applied to every random number generator touched here.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may choose different kernels from run to run.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=CFG.seed)

In [None]:
def freeze(module):
    """
    Turn off gradient tracking for every parameter of ``module``.
    """
    for _, weight in module.named_parameters():
        weight.requires_grad = False

def get_freezed_parameters(module):
    """
    List the names of the parameters of ``module`` that do not require gradients.
    """
    return [
        param_name
        for param_name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an ``optim_bits`` override for the ``weight`` of each embedding
    table (word / position / token_type) found on ``embeddings_path``.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): relies on a module-level name ``bnb`` that is not imported in
    the cells visible here - confirm it is in scope before calling.
    """
    candidate_attrs = [f"{kind}_embeddings" for kind in ("word", "position", "token_type")]
    # Only tables that actually exist on the embeddings module are registered.
    for attr_name in filter(lambda name: hasattr(embeddings_path, name), candidate_attrs):
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [None]:
import pandas as pd
import numpy as np


# Read the competition tables from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))
card = pd.read_csv(os.path.join(INPUT_DIR, "card.csv"))
user = pd.read_csv(os.path.join(INPUT_DIR, "user.csv"))
# The sample submission is read without a header row.
sub = pd.read_csv(os.path.join(INPUT_DIR, "sample_submit.csv"), header=None)

# Show the shape and the first three rows of each table, in load order.
for _frame in (train, test, card, user, sub):
    print(_frame.shape)
    display(_frame.head(3))

(471283, 12)


Unnamed: 0,index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,0,1721,0,$2.623,OK,0,209237,Joliet,IL,60436.0,5541,Swipe Transaction
1,1,1629,3,$6.4,OK,0,2568,Edgerton,WI,53534.0,5814,Swipe Transaction
2,2,655,3,$123.5,OK,0,345310,Ridgefield,WA,98642.0,7538,Swipe Transaction


(457958, 11)


Unnamed: 0,index,user_id,card_id,amount,errors?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip
0,471283,541,3,$113.278,OK,324189,Orlando,FL,32821.0,4814,Swipe Transaction
1,471284,655,1,$293.944,OK,81219,Ridgefield,WA,98642.0,7538,Chip Transaction
2,471285,492,0,$47.4,OK,274755,Arlington Heights,IL,60004.0,5719,Swipe Transaction


(416, 10)


Unnamed: 0,user_id,card_id,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed
0,39,0,Visa,Debit,09/2021,YES,1,$17117,05/2007,2010
1,39,1,Amex,Credit,11/2024,YES,2,$5400,10/2015,2015
2,41,0,Discover,Credit,03/2022,YES,2,$14800,12/2010,2011


(97, 17)


Unnamed: 0,user_id,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards
0,39,57,64,1962,12,Female,442 Burns Boulevard,Mansfield,MA,2048,42.02,-71.21,$37407,$76274,$102611,698,2
1,41,39,66,1980,10,Female,3863 River Avenue,Lincoln,CA,95648,38.93,-121.25,$21829,$44506,$57994,849,3
2,47,40,67,1979,5,Female,8799 Elm Avenue,Mckinney,TX,75069,33.2,-96.65,$24684,$50329,$76759,625,4


(457958, 2)


Unnamed: 0,0,1
0,471283,0
1,471284,1
2,471285,0


In [None]:
# Left-join card attributes (keyed by user and card), then user attributes, onto every transaction.
train = train.merge(card, how="left", on=["user_id", "card_id"])
train = train.merge(user, how="left", on="user_id")

In [None]:
# Maps a zero-padded two-digit month string ("01".."12") to its English month name.
month_dict = {
   "01": "January",
   "02": "February",
   "03": "March",
   "04": "April",
   "05": "May",
   "06": "June",
   "07": "July",
   "08": "August",
   "09": "September",
   "10": "October",
   "11": "November",
   "12": "December"
}

def _split_month_year(df, source_col, prefix):
    """Split a "MM/YYYY" string column into ``<prefix>_month`` and ``<prefix>_years``.

    Uses the vectorised ``str.split(..., expand=True)`` rather than building one
    ``pd.Series`` per row. The frame is modified in place and also returned.
    Both new columns hold strings; a missing source value becomes ``"nan"``.
    Only the first ``/`` is used as the separator.
    """
    parts = df[source_col].str.split('/', n=1, expand=True)
    # Guarantee both columns exist even if no value contained a "/".
    parts = parts.reindex(columns=[0, 1])
    # expand=True pads short rows with None; normalise to NaN so that the
    # string form of a missing piece is always "nan".
    parts = parts.fillna(value=np.nan)
    df[f"{prefix}_month"] = parts[0].astype(str)
    df[f"{prefix}_years"] = parts[1].astype(str)
    return df


def get_expires_values(df):
    """Add ``expires_month`` / ``expires_years`` string columns parsed from ``df["expires"]``."""
    return _split_month_year(df, "expires", "expires")


def get_acct_open_date_values(df):
    """Add ``acct_open_date_month`` / ``acct_open_date_years`` parsed from ``df["acct_open_date"]``."""
    return _split_month_year(df, "acct_open_date", "acct_open_date")

# Derive month / year columns for both card dates, then spell the months out by name.
train = get_acct_open_date_values(get_expires_values(train))
for _month_col in ("expires_month", "acct_open_date_month"):
    train[_month_col] = train[_month_col].map(month_dict)

In [None]:
# Replace missing values with the literal token "unknown" (presumably so that the
# string concatenation below never propagates NaN).
train.fillna('unknown', inplace = True)


def build_texts(df, sep="[SEP]"):
    """Serialise each row into one ``sep``-delimited string of merchant, card and user facts.

    Args:
        df: merged transaction / card / user frame holding the columns referenced below.
        sep: separator inserted between consecutive fields.

    Returns:
        A Series of strings, one per row of ``df``.
    """
    fields = [
        # -- transaction / merchant --
        "merchant",
        df["amount"],
        df["errors?"],
        df["merchant_city"],
        df["merchant_state"],
        df["use_chip"],
        # -- card --
        "card",
        df["card_brand"],
        df["card_type"],
        df["expires_month"] + " " + df["expires_years"],
        df["has_chip"],
        df["acct_open_date_month"] + " " + df["acct_open_date_years"],
        df["year_pin_last_changed"].astype(str),
        # -- user --
        "user",
        df["current_age"].astype(str) + " year old " + df["gender"],
        "retired at age " + df["retirement_age"].astype(str),
        df["address"],
        df["city"],
        df["state"],
        df["per_capita_income_zipcode"],
        df["yearly_income_person"],
        df["total_debt"],
    ]
    # Every join is explicit; nothing relies on adjacent string literals being
    # fused across a line continuation.
    text = fields[0]
    for field in fields[1:]:
        text = text + sep + field
    return text


train["texts"] = build_texts(train)

In [None]:
# Stratified fold assignment on the target: each row gets the id of the fold in
# which it serves as validation data.
skf = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
fold_ids = np.full(len(train), -1, dtype=int)
for fold, (_, val_) in enumerate(skf.split(train, train[CFG.target])):
    # skf.split yields *positional* indices, so assign by position; this stays
    # correct even when the frame's index is not a default 0..n-1 range.
    fold_ids[val_] = fold
train["kfold"] = fold_ids

if CFG.debug:
    # Debug runs keep a 500-row sample; fold sizes are shown before and after sampling.
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that matches the backbone named in CFG.model.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# Save a copy next to the experiment outputs.
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
# Expose it on the config; prepare_input() reads it as cfg.tokenizer.
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every text (without special tokens); the longest one sets CFG.max_len.
# NOTE(review): the original comment on the margin said "cls", but 23 is more than
# the few special tokens a single sequence normally receives - confirm the intent.
# Kept unchanged so CFG.max_len stays the same.
MAX_LEN_MARGIN = 23
TOKENIZE_CHUNK_SIZE = 4096

lengths = []
_texts = train['texts'].fillna("").tolist()
# Tokenise in chunks: one tokenizer call per chunk instead of one per row.
for _begin in tqdm(range(0, len(_texts), TOKENIZE_CHUNK_SIZE)):
    _chunk_ids = tokenizer(_texts[_begin:_begin + TOKENIZE_CHUNK_SIZE],
                           add_special_tokens=False)['input_ids']
    lengths.extend(len(ids) for ids in _chunk_ids)
CFG.max_len = max(lengths) + MAX_LEN_MARGIN
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 471283/471283 [02:06<00:00, 3731.52it/s]
max_len: 97
INFO:__main__:max_len: 97


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenise ``text`` with ``cfg.tokenizer`` and turn every returned field into a long tensor.

    The text is padded / truncated to ``cfg.max_len`` tokens. The tokenizer's
    own return object is updated in place and returned.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    encoded = cfg.tokenizer(text, **tokenizer_options)
    for field_name in list(encoded.keys()):
        encoded[field_name] = torch.tensor(encoded[field_name], dtype=torch.long)
    return encoded


class _TextDataset(Dataset):
    """Shared implementation: yields ``(tokenised inputs, label)`` for one row.

    Subclasses differ only in ``label_dtype``. The label column is taken from
    the ``cfg`` passed in (``cfg.target``), not from a global.
    """

    label_dtype = torch.float

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df[cfg.target].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=self.label_dtype)
        return inputs, label


class TrainDataset(_TextDataset):
    """Training rows; labels are emitted as float16.

    NOTE(review): presumably fp16 so the labels match the logits produced under
    autocast in train_fn - confirm before changing.
    """

    label_dtype = torch.half


class ValidDataset(_TextDataset):
    """Validation rows; labels are emitted as float32."""

    label_dtype = torch.float


def collate(inputs):
    """Trim every tensor of a batch to the longest attended sequence in that batch.

    ``inputs`` is a mapping of 2-D tensors that includes ``attention_mask``.
    Columns beyond the largest per-row mask sum are dropped, which assumes the
    padding sits at the end of each row. The mapping is modified in place and
    returned.
    """
    mask_len = int(inputs["attention_mask"].sum(dim=1).max())
    for k, v in inputs.items():
        inputs[k] = v[:, :mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, counting only attended positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return token_sum / token_count

class MaxPooling(nn.Module):
    """Take the feature-wise maximum over the attended tokens of each sequence."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Masked positions get a large negative value so they never win the max;
        # masked_fill returns a new tensor, leaving the input untouched.
        masked_states = last_hidden_state.masked_fill(mask == 0, -1e4)
        return masked_states.max(dim=1).values


class CustomModel(nn.Module):
    """Transformer backbone + mean pooling + LayerNorm + linear head returning raw logits.

    Args:
        cfg: configuration object; reads ``model``, ``gradient_checkpointing``,
            ``freezing`` and ``target_size``.
        config_path: path of a backbone config saved with ``torch.save``; when
            ``None`` the config is fetched for ``cfg.model`` and all of its
            dropout probabilities are set to zero.
        pretrained: load pretrained backbone weights when ``True``; otherwise
            build the backbone from the config alone with freshly initialised weights.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Switch off every dropout in the backbone.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects - only point
            # config_path at files produced by this notebook.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # Auto classes cannot be instantiated directly; a backbone without
            # pretrained weights is built with from_config.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # Kept as an attribute; forward() returns logits and does not apply it.
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pool its first output using ``inputs['attention_mask']``."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return raw logits (no sigmoid applied) for a batch of tokenised inputs."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value together with a running weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (both truncated to integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe the time elapsed since ``since`` and the estimated time remaining.

    ``percent`` is the fraction of work completed (0 < percent <= 1); the total
    duration is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch with mixed precision and return the average loss.

    Args:
        fold: fold id (not used inside the function).
        train_loader: yields ``(inputs, labels)`` batches.
        model: network called as ``model(inputs)`` and returning logits.
        criterion: loss applied to logits and labels reshaped to ``(-1, 1)``.
        optimizer: optimizer stepped every ``CFG.gradient_accumulation_steps`` batches.
        epoch: zero-based epoch index, used only in the progress line.
        scheduler: LR scheduler, stepped after each optimizer step when
            ``CFG.batch_scheduler`` is set.
        device: device the batch tensors are moved to.

    Returns:
        Sample-weighted mean of the per-batch loss over the epoch (not divided
        by the accumulation factor).
    """
    model.train()
    # NOTE(review): a fresh GradScaler per epoch restarts loss-scale calibration,
    # which presumably explains the "Grad: nan" on the first step of each epoch
    # in the captured log.
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    accumulation_steps = CFG.gradient_accumulation_steps
    grad_norm = float('nan')  # reported until the first optimizer step has happened
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        # Compute the loss in float32: under autocast the logits can be half
        # precision, which is too coarse for a stable BCE-with-logits.
        loss = criterion(y_preds.view(-1, 1).float(), labels.view(-1, 1).float())
        # Track the true batch loss; the division below only rescales gradients.
        losses.update(loss.item(), batch_size)
        if accumulation_steps > 1:
            loss = loss / accumulation_steps
        scaler.scale(loss).backward()
        if (step + 1) % accumulation_steps == 0:
            # unscale_ may be called only once per optimizer step, so it (and the
            # clipping that needs unscaled gradients) belongs inside this branch.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on a validation loader.

    Args:
        valid_loader: yields ``(inputs, labels)`` batches.
        model: network called as ``model(inputs)`` and returning logits.
        criterion: loss applied to logits and labels reshaped to ``(-1, 1)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(average_loss, predictions)`` where ``predictions`` is a flat NumPy
        array of sigmoid probabilities in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            # Validation loss is reported as-is; gradient accumulation is a
            # training-only concept and must not rescale it.
            loss = criterion(y_preds.view(-1, 1).float(), labels.view(-1, 1).float())
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    # Stack the per-batch arrays and flatten in one step.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return the stacked sigmoid probabilities.

    Batches are trimmed with ``collate`` and moved to ``device``; per-batch
    outputs are concatenated along the first axis in loader order.
    """
    batch_probs = []
    model.eval()
    model.to(device)
    for batch in tqdm(test_loader, total=len(test_loader)):
        batch = collate(batch)
        for field_name, tensor in batch.items():
            batch[field_name] = tensor.to(device)
        with torch.no_grad():
            logits = model(batch)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    Args:
        folds: full training frame with ``texts``, the target column and ``kfold``.
        fold: id of the fold held out for validation.

    Returns:
        The validation rows of this fold with an added ``pred`` column holding
        the predictions of the best epoch (best = highest threshold-swept F1).

    Side effects:
        Saves the backbone config and, whenever the score improves, a checkpoint
        (model weights + validation predictions) under ``OUTPUT_EXP_DIR``.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Three parameter groups: decayed backbone weights, non-decayed backbone
        biases / LayerNorm parameters, and everything outside the backbone."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the warmup scheduler selected by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            raise ValueError(f"unknown scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")
        return scheduler

    # Count real optimizer steps: the loader drops the last partial batch and an
    # optimizer step happens once every `gradient_accumulation_steps` batches.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = -1.
    best_predictions = None
    best_path = OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring: F1 at the fixed threshold (logged) and best F1 over a
        # threshold sweep (used for model selection)
        score = get_score(valid_labels, predictions)
        swept_f1 = get_f1_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < swept_f1:
            best_score = swept_f1
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # The best epoch's predictions are already in memory; no need to read the
    # whole checkpoint back from disk just to recover them.
    valid_folds['pred'] = best_predictions

    # Drop the references before emptying the CUDA cache so the memory can
    # actually be released before the next fold starts.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the fixed-threshold F1 and the best threshold-swept F1 of an out-of-fold frame."""
        labels = oof_df[CFG.target].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        swept_f1 = get_f1_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'F1 BEST Score: {swept_f1:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')
        else:
            # Nothing to score or save when CFG.trn_fold selects no fold.
            oof_df = pd.DataFrame()
            LOGGER.info("No fold selected by CFG.trn_fold; skipping CV scoring.")

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch: [1][0/5891] Elapsed 0m 3s (remain 386m 58s) Loss: 0.8779(0.8779) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 19s (remain 18m 35s) Loss: 0.2460(0.2716) Grad: 1.1382  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 35s (remain 16m 32s) Loss: 0.2573(0.2529) Grad: 1.3086  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 50s (remain 15m 33s) Loss: 0.2310(0.2429) Grad: 1.3982  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 1m 5s (remain 14m 59s) Loss: 0.1750(0.2385) Grad: 0.9111  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 21s (remain 14m 35s) Loss: 0.2432(0.2340) Grad: 1.3765  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 36s (remain 14m 11s) Loss: 0.1420(0.2312) Grad: 1.2037  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 52s (remain 13m 50s) Loss: 0.2595(0.2287) Grad: 1.1543  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 2m 7s (remain 13m 30s) Loss: 0.2981(0.2276) Grad: 0.8151  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 25s (remain 13m 26s) Loss: 0.2

Epoch 1 - avg_train_loss: 0.1907  avg_val_loss: 0.1737  time: 1065s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1907  avg_val_loss: 0.1737  time: 1065s
Epoch 1 - Score: 0.4084
INFO:__main__:Epoch 1 - Score: 0.4084
Epoch 1 - Save Best Score: 0.5016 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5016 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 73m 56s) Loss: 0.2505(0.2505) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 18s (remain 17m 34s) Loss: 0.0740(0.1596) Grad: 1.4657  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 34s (remain 16m 12s) Loss: 0.1292(0.1704) Grad: 0.9309  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 49s (remain 15m 27s) Loss: 0.2126(0.1691) Grad: 1.0573  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 1m 5s (remain 14m 54s) Loss: 0.0588(0.1682) Grad: 1.0193  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 20s (remain 14m 27s) Loss: 0.1392(0.1702) Grad: 0.3424  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 36s (remain 14m 6s) Loss: 0.2793(0.1680) Grad: 1.2662  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 51s (remain 13m 45s) Loss: 0.1515(0.1686) Grad: 0.5364  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 2m 6s (remain 13m 25s) Loss: 0.2137(0.1677) Grad: 1.1241  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 22s (remain 13m 9s) Loss: 0.2289

Epoch 2 - avg_train_loss: 0.1640  avg_val_loss: 0.1587  time: 1088s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1640  avg_val_loss: 0.1587  time: 1088s
Epoch 2 - Score: 0.4729
INFO:__main__:Epoch 2 - Score: 0.4729
Epoch 2 - Save Best Score: 0.5337 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5337 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 59m 34s) Loss: 0.2137(0.2137) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 17s (remain 16m 34s) Loss: 0.1592(0.1453) Grad: 0.9910  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 33s (remain 15m 55s) Loss: 0.1791(0.1481) Grad: 1.0324  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 49s (remain 15m 16s) Loss: 0.2395(0.1503) Grad: 1.5315  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 1m 4s (remain 14m 48s) Loss: 0.1305(0.1543) Grad: 0.8272  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 20s (remain 14m 27s) Loss: 0.0743(0.1541) Grad: 0.9021  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 36s (remain 14m 6s) Loss: 0.1967(0.1531) Grad: 0.8713  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 51s (remain 13m 45s) Loss: 0.2288(0.1539) Grad: 0.8015  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 2m 9s (remain 13m 43s) Loss: 0.1584(0.1537) Grad: 0.6136  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 25s (remain 13m 25s) Loss: 0.226

Epoch 3 - avg_train_loss: 0.1523  avg_val_loss: 0.1563  time: 1073s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1523  avg_val_loss: 0.1563  time: 1073s
Epoch 3 - Score: 0.4893
INFO:__main__:Epoch 3 - Score: 0.4893
Epoch 3 - Save Best Score: 0.5496 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5496 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 56m 35s) Loss: 0.0557(0.0557) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 17s (remain 16m 32s) Loss: 0.1824(0.1510) Grad: 0.9307  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 33s (remain 15m 41s) Loss: 0.1097(0.1440) Grad: 0.5207  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 48s (remain 15m 1s) Loss: 0.1058(0.1465) Grad: 0.7026  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 1m 3s (remain 14m 35s) Loss: 0.2727(0.1425) Grad: 1.2320  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 19s (remain 14m 15s) Loss: 0.2173(0.1441) Grad: 0.7424  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 34s (remain 13m 55s) Loss: 0.0859(0.1444) Grad: 1.1496  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 50s (remain 13m 36s) Loss: 0.1646(0.1447) Grad: 0.9040  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 2m 5s (remain 13m 18s) Loss: 0.1671(0.1450) Grad: 0.9063  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 21s (remain 13m 3s) Loss: 0.0347

Epoch 4 - avg_train_loss: 0.1431  avg_val_loss: 0.1558  time: 1068s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1431  avg_val_loss: 0.1558  time: 1068s
Epoch 4 - Score: 0.5037
INFO:__main__:Epoch 4 - Score: 0.5037
Epoch 4 - Save Best Score: 0.5550 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5550 Model


f1 score : 0.5036948272418613
recall score : 0.38645418326693226
precision score : 0.7230504587155964


Score: 0.5037
INFO:__main__:Score: 0.5037
F1 BEST Score: 0.5550
INFO:__main__:F1 BEST Score: 0.5550
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 1s (remain 143m 11s) Loss: 0.6782(0.6782) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 19s (remain 18m 34s) Loss: 0.2499(0.2752) Grad: 0.9331  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 34s (remain 16m 19s) Loss: 0.1604(0.2529) Grad: 0.7039  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 50s (remain 15m 29s) Loss: 0.1444(0.2407) Grad: 0.6625  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 1m 5s (remain 14m 57s) Loss: 0.2632(0.2329) Grad: 1.1136  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 20s (remain 14m 31s) Loss: 0.2837(0.2293) Grad: 0.7751  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 36s (remain 14m 8s) Loss: 0.1332(0.2222) Grad: 0.4975  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 51s (remain 13m 48s) Loss: 0.3552(0.2198) Grad: 1.0047  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 2m 7s (remain 13m 29s) Loss: 0.1577(0.2173) Grad: 1.0632  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 23s (remain 13m 15s) Loss: 0.17

Epoch 1 - avg_train_loss: 0.1870  avg_val_loss: 0.1665  time: 1065s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1870  avg_val_loss: 0.1665  time: 1065s
Epoch 1 - Score: 0.4177
INFO:__main__:Epoch 1 - Score: 0.4177
Epoch 1 - Save Best Score: 0.5149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5149 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 61m 42s) Loss: 0.1142(0.1142) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 17s (remain 16m 26s) Loss: 0.1337(0.1582) Grad: 1.5159  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 32s (remain 15m 32s) Loss: 0.1069(0.1619) Grad: 0.5461  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 48s (remain 14m 54s) Loss: 0.0955(0.1626) Grad: 0.6463  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 1m 3s (remain 14m 27s) Loss: 0.2423(0.1607) Grad: 0.6304  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 18s (remain 14m 4s) Loss: 0.1184(0.1613) Grad: 0.3937  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 33s (remain 13m 44s) Loss: 0.0673(0.1632) Grad: 0.5698  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 48s (remain 13m 24s) Loss: 0.1065(0.1628) Grad: 0.5968  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 2m 5s (remain 13m 15s) Loss: 0.2849(0.1637) Grad: 1.2411  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 21s (remain 13m 4s) Loss: 0.0777

Epoch 2 - avg_train_loss: 0.1617  avg_val_loss: 0.1588  time: 1055s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1617  avg_val_loss: 0.1588  time: 1055s
Epoch 2 - Score: 0.4665
INFO:__main__:Epoch 2 - Score: 0.4665
Epoch 2 - Save Best Score: 0.5466 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5466 Model


Epoch: [3][0/5891] Elapsed 0m 1s (remain 98m 18s) Loss: 0.1791(0.1791) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 17s (remain 16m 18s) Loss: 0.0688(0.1573) Grad: 1.0158  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 33s (remain 15m 41s) Loss: 0.1897(0.1522) Grad: 0.9492  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 48s (remain 15m 2s) Loss: 0.2120(0.1535) Grad: 0.8954  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 1m 3s (remain 14m 33s) Loss: 0.1261(0.1524) Grad: 0.7488  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 18s (remain 14m 9s) Loss: 0.1099(0.1506) Grad: 0.5360  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 34s (remain 13m 49s) Loss: 0.2715(0.1519) Grad: 1.3450  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 49s (remain 13m 30s) Loss: 0.2329(0.1510) Grad: 1.6990  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 2m 4s (remain 13m 13s) Loss: 0.1120(0.1497) Grad: 0.5127  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 23s (remain 13m 15s) Loss: 0.0649

Epoch 3 - avg_train_loss: 0.1507  avg_val_loss: 0.1550  time: 1072s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1507  avg_val_loss: 0.1550  time: 1072s
Epoch 3 - Score: 0.5026
INFO:__main__:Epoch 3 - Score: 0.5026
Epoch 3 - Save Best Score: 0.5531 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5531 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 60m 8s) Loss: 0.0714(0.0714) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 17s (remain 17m 4s) Loss: 0.1526(0.1428) Grad: 1.2623  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 33s (remain 15m 55s) Loss: 0.0710(0.1396) Grad: 0.5867  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 49s (remain 15m 14s) Loss: 0.1229(0.1413) Grad: 0.6500  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 1m 4s (remain 14m 47s) Loss: 0.1248(0.1426) Grad: 0.8720  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 20s (remain 14m 24s) Loss: 0.2117(0.1435) Grad: 1.0172  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 35s (remain 14m 2s) Loss: 0.2375(0.1452) Grad: 1.6620  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 51s (remain 13m 44s) Loss: 0.0970(0.1445) Grad: 0.7865  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 2m 6s (remain 13m 25s) Loss: 0.1014(0.1444) Grad: 0.5606  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 25s (remain 13m 23s) Loss: 0.1884(

Epoch 4 - avg_train_loss: 0.1415  avg_val_loss: 0.1561  time: 1079s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1415  avg_val_loss: 0.1561  time: 1079s
Epoch 4 - Score: 0.5131
INFO:__main__:Epoch 4 - Score: 0.5131
Epoch 4 - Save Best Score: 0.5531 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5531 Model


f1 score : 0.5130869522595823
recall score : 0.4010113392583512
precision score : 0.712108843537415


Score: 0.5131
INFO:__main__:Score: 0.5131
F1 BEST Score: 0.5531
INFO:__main__:F1 BEST Score: 0.5531
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 0s (remain 63m 20s) Loss: 0.4268(0.4268) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 18s (remain 17m 50s) Loss: 0.2034(0.2501) Grad: 0.5860  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 34s (remain 16m 7s) Loss: 0.1707(0.2400) Grad: 1.8798  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 49s (remain 15m 23s) Loss: 0.3359(0.2288) Grad: 1.6633  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 1m 5s (remain 14m 50s) Loss: 0.2065(0.2253) Grad: 0.1656  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 20s (remain 14m 28s) Loss: 0.0904(0.2202) Grad: 1.4075  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 36s (remain 14m 5s) Loss: 0.1575(0.2211) Grad: 1.2565  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 51s (remain 13m 45s) Loss: 0.0775(0.2197) Grad: 0.9676  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 2m 7s (remain 13m 30s) Loss: 0.2375(0.2167) Grad: 1.5751  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 25s (remain 13m 26s) Loss: 0.2739

Epoch 1 - avg_train_loss: 0.1999  avg_val_loss: 0.1752  time: 1073s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1999  avg_val_loss: 0.1752  time: 1073s
Epoch 1 - Score: 0.3758
INFO:__main__:Epoch 1 - Score: 0.3758
Epoch 1 - Save Best Score: 0.4933 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4933 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 62m 54s) Loss: 0.1052(0.1052) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 16s (remain 16m 10s) Loss: 0.1895(0.1782) Grad: 0.5710  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 32s (remain 15m 29s) Loss: 0.1163(0.1743) Grad: 0.5199  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 48s (remain 14m 53s) Loss: 0.0478(0.1716) Grad: 0.7507  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 1m 3s (remain 14m 29s) Loss: 0.1160(0.1694) Grad: 0.6112  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 19s (remain 14m 10s) Loss: 0.1517(0.1703) Grad: 0.8029  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 34s (remain 13m 53s) Loss: 0.1674(0.1705) Grad: 1.0094  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 50s (remain 13m 37s) Loss: 0.2148(0.1701) Grad: 1.1134  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 2m 5s (remain 13m 18s) Loss: 0.0728(0.1697) Grad: 0.8866  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 23s (remain 13m 15s) Loss: 0.27

Epoch 2 - avg_train_loss: 0.1652  avg_val_loss: 0.1605  time: 1065s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1652  avg_val_loss: 0.1605  time: 1065s
Epoch 2 - Score: 0.4243
INFO:__main__:Epoch 2 - Score: 0.4243
Epoch 2 - Save Best Score: 0.5332 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5332 Model


Epoch: [3][0/5891] Elapsed 0m 0s (remain 64m 12s) Loss: 0.0753(0.0753) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 17s (remain 16m 54s) Loss: 0.1285(0.1461) Grad: 0.9974  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 33s (remain 15m 54s) Loss: 0.1888(0.1524) Grad: 1.5031  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 49s (remain 15m 12s) Loss: 0.1177(0.1500) Grad: 1.0957  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 1m 4s (remain 14m 49s) Loss: 0.2148(0.1542) Grad: 0.6620  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 20s (remain 14m 27s) Loss: 0.3372(0.1544) Grad: 1.3960  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 36s (remain 14m 7s) Loss: 0.1011(0.1551) Grad: 0.5530  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 51s (remain 13m 48s) Loss: 0.1385(0.1573) Grad: 1.1871  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 2m 7s (remain 13m 29s) Loss: 0.1154(0.1578) Grad: 1.1110  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 25s (remain 13m 24s) Loss: 0.079

Epoch 3 - avg_train_loss: 0.1532  avg_val_loss: 0.1551  time: 1074s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1532  avg_val_loss: 0.1551  time: 1074s
Epoch 3 - Score: 0.4887
INFO:__main__:Epoch 3 - Score: 0.4887
Epoch 3 - Save Best Score: 0.5494 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5494 Model


Epoch: [4][0/5891] Elapsed 0m 0s (remain 62m 12s) Loss: 0.1335(0.1335) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 21s (remain 20m 47s) Loss: 0.1476(0.1479) Grad: 0.9779  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 37s (remain 17m 37s) Loss: 0.1387(0.1526) Grad: 0.7977  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 52s (remain 16m 17s) Loss: 0.1864(0.1517) Grad: 0.9846  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 1m 7s (remain 15m 28s) Loss: 0.1831(0.1495) Grad: 1.1340  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 22s (remain 14m 51s) Loss: 0.0878(0.1482) Grad: 0.5445  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 37s (remain 14m 22s) Loss: 0.1320(0.1467) Grad: 1.0683  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 53s (remain 13m 58s) Loss: 0.1790(0.1471) Grad: 0.8514  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 2m 8s (remain 13m 36s) Loss: 0.1327(0.1450) Grad: 0.7393  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 23s (remain 13m 15s) Loss: 0.22

Epoch 4 - avg_train_loss: 0.1444  avg_val_loss: 0.1551  time: 1052s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1444  avg_val_loss: 0.1551  time: 1052s
Epoch 4 - Score: 0.5069
INFO:__main__:Epoch 4 - Score: 0.5069
Epoch 4 - Save Best Score: 0.5525 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5525 Model


f1 score : 0.5068870523415978
recall score : 0.3946683009039375
precision score : 0.7082760516909541


Score: 0.5069
INFO:__main__:Score: 0.5069
F1 BEST Score: 0.5525
INFO:__main__:F1 BEST Score: 0.5525
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.32.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__

Epoch: [1][0/5891] Elapsed 0m 1s (remain 102m 48s) Loss: 0.5527(0.5527) Grad: nan  LR: 0.00002000  
Epoch: [1][100/5891] Elapsed 0m 17s (remain 16m 37s) Loss: 0.3123(0.2600) Grad: 1.4306  LR: 0.00002000  
Epoch: [1][200/5891] Elapsed 0m 32s (remain 15m 16s) Loss: 0.1120(0.2404) Grad: 1.3881  LR: 0.00002000  
Epoch: [1][300/5891] Elapsed 0m 47s (remain 14m 39s) Loss: 0.2052(0.2329) Grad: 1.2028  LR: 0.00001999  
Epoch: [1][400/5891] Elapsed 1m 4s (remain 14m 37s) Loss: 0.2404(0.2298) Grad: 0.8147  LR: 0.00001999  
Epoch: [1][500/5891] Elapsed 1m 20s (remain 14m 29s) Loss: 0.3352(0.2265) Grad: 0.8317  LR: 0.00001998  
Epoch: [1][600/5891] Elapsed 1m 36s (remain 14m 5s) Loss: 0.1954(0.2246) Grad: 0.6389  LR: 0.00001997  
Epoch: [1][700/5891] Elapsed 1m 51s (remain 13m 43s) Loss: 0.1691(0.2245) Grad: 0.5348  LR: 0.00001996  
Epoch: [1][800/5891] Elapsed 2m 6s (remain 13m 23s) Loss: 0.1844(0.2223) Grad: 0.2452  LR: 0.00001994  
Epoch: [1][900/5891] Elapsed 2m 21s (remain 13m 3s) Loss: 0.222

Epoch 1 - avg_train_loss: 0.1913  avg_val_loss: 0.1776  time: 1051s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1913  avg_val_loss: 0.1776  time: 1051s
Epoch 1 - Score: 0.3751
INFO:__main__:Epoch 1 - Score: 0.3751
Epoch 1 - Save Best Score: 0.4869 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4869 Model


Epoch: [2][0/5891] Elapsed 0m 0s (remain 89m 49s) Loss: 0.1060(0.1060) Grad: nan  LR: 0.00001707  
Epoch: [2][100/5891] Elapsed 0m 17s (remain 16m 22s) Loss: 0.2073(0.1626) Grad: 0.7751  LR: 0.00001698  
Epoch: [2][200/5891] Elapsed 0m 32s (remain 15m 28s) Loss: 0.1410(0.1636) Grad: 0.7628  LR: 0.00001688  
Epoch: [2][300/5891] Elapsed 0m 48s (remain 14m 59s) Loss: 0.1379(0.1677) Grad: 0.7789  LR: 0.00001678  
Epoch: [2][400/5891] Elapsed 1m 3s (remain 14m 34s) Loss: 0.1859(0.1664) Grad: 0.7795  LR: 0.00001668  
Epoch: [2][500/5891] Elapsed 1m 19s (remain 14m 16s) Loss: 0.1138(0.1651) Grad: 0.6006  LR: 0.00001658  
Epoch: [2][600/5891] Elapsed 1m 35s (remain 14m 0s) Loss: 0.1056(0.1680) Grad: 0.8392  LR: 0.00001648  
Epoch: [2][700/5891] Elapsed 1m 52s (remain 13m 50s) Loss: 0.1796(0.1682) Grad: 0.6154  LR: 0.00001638  
Epoch: [2][800/5891] Elapsed 2m 9s (remain 13m 41s) Loss: 0.1862(0.1678) Grad: 1.4302  LR: 0.00001628  
Epoch: [2][900/5891] Elapsed 2m 25s (remain 13m 23s) Loss: 0.073

Epoch 2 - avg_train_loss: 0.1638  avg_val_loss: 0.1615  time: 1083s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tor

Epoch: [3][0/5891] Elapsed 0m 0s (remain 61m 3s) Loss: 0.1041(0.1041) Grad: nan  LR: 0.00001000  
Epoch: [3][100/5891] Elapsed 0m 21s (remain 20m 11s) Loss: 0.0437(0.1505) Grad: 1.0660  LR: 0.00000987  
Epoch: [3][200/5891] Elapsed 0m 36s (remain 17m 20s) Loss: 0.1584(0.1565) Grad: 0.9417  LR: 0.00000973  
Epoch: [3][300/5891] Elapsed 0m 51s (remain 16m 5s) Loss: 0.2031(0.1562) Grad: 0.9477  LR: 0.00000960  
Epoch: [3][400/5891] Elapsed 1m 7s (remain 15m 20s) Loss: 0.1356(0.1591) Grad: 0.6555  LR: 0.00000947  
Epoch: [3][500/5891] Elapsed 1m 22s (remain 14m 45s) Loss: 0.0690(0.1589) Grad: 1.2061  LR: 0.00000933  
Epoch: [3][600/5891] Elapsed 1m 37s (remain 14m 19s) Loss: 0.1558(0.1593) Grad: 1.2209  LR: 0.00000920  
Epoch: [3][700/5891] Elapsed 1m 52s (remain 13m 54s) Loss: 0.2118(0.1579) Grad: 0.8173  LR: 0.00000907  
Epoch: [3][800/5891] Elapsed 2m 7s (remain 13m 31s) Loss: 0.1526(0.1592) Grad: 1.2247  LR: 0.00000893  
Epoch: [3][900/5891] Elapsed 2m 23s (remain 13m 12s) Loss: 0.0330

Epoch 3 - avg_train_loss: 0.1509  avg_val_loss: 0.1573  time: 1046s
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 107] Transport endpoint is not connected
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tor

Epoch: [4][0/5891] Elapsed 0m 0s (remain 59m 58s) Loss: 0.1615(0.1615) Grad: nan  LR: 0.00000293  
Epoch: [4][100/5891] Elapsed 0m 16s (remain 15m 55s) Loss: 0.2340(0.1441) Grad: 1.0140  LR: 0.00000283  
Epoch: [4][200/5891] Elapsed 0m 32s (remain 15m 23s) Loss: 0.1562(0.1427) Grad: 2.7502  LR: 0.00000274  
Epoch: [4][300/5891] Elapsed 0m 48s (remain 14m 56s) Loss: 0.0551(0.1409) Grad: 0.8285  LR: 0.00000265  
Epoch: [4][400/5891] Elapsed 1m 6s (remain 15m 12s) Loss: 0.1566(0.1417) Grad: 1.1818  LR: 0.00000256  
Epoch: [4][500/5891] Elapsed 1m 22s (remain 14m 49s) Loss: 0.1167(0.1410) Grad: 0.5696  LR: 0.00000247  
Epoch: [4][600/5891] Elapsed 1m 38s (remain 14m 27s) Loss: 0.0549(0.1421) Grad: 1.1826  LR: 0.00000239  
Epoch: [4][700/5891] Elapsed 1m 53s (remain 14m 0s) Loss: 0.2333(0.1423) Grad: 1.0366  LR: 0.00000230  
Epoch: [4][800/5891] Elapsed 2m 8s (remain 13m 37s) Loss: 0.1268(0.1405) Grad: 1.3311  LR: 0.00000222  
Epoch: [4][900/5891] Elapsed 2m 23s (remain 13m 15s) Loss: 0.210

In [None]:
# Final cell: release the Colab runtime once training has finished.
# Google Drive is mounted in the first cell of this notebook, so flush it
# before the runtime is released; otherwise the last writes to the mount may
# not have been synced when the VM goes away.
from google.colab import drive, runtime

try:
    # Blocks until pending Drive writes are synced, then unmounts.
    drive.flush_and_unmount()
except Exception as exc:
    # Best effort: the training logs above show the mount can drop
    # mid-run (OSError 107). Report the failure, but never let it keep the
    # runtime alive.
    print(f"Drive flush failed; releasing the runtime anyway: {exc!r}")
finally:
    # Always release the runtime, even if the flush above failed.
    runtime.unassign()