# Increase the number of epochs and tune the hyperparameters

In [1]:
# Mount Google Drive at /content/drive; the input/output paths configured
# further down live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install Hugging Face transformers (unpinned; the recorded run below resolved to 4.21.1).
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 3.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [3]:
# Install sentencepiece (unpinned; the recorded run below resolved to 0.1.96).
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 3.2 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Locations on the mounted Google Drive.
# INPUT_DIR holds train.csv / test.csv / submit_sample.csv (read further down).
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
# OUTPUT_DIR is the root for the training log and all generated artefacts.
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
# NOTE(review): OUTPUT_SUB_DIR is not referenced in the visible part of the notebook.
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
# Tokenizer files, model config, per-fold checkpoints and OOF predictions go here.
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-base[ver2]/')

In [6]:
class CFG:
    """Experiment configuration, read as class attributes throughout the notebook."""

    # ---- run control / logging ----
    train = True              # run the cross-validation training loop
    debug = False             # not referenced in the visible code
    debug_ver2 = False        # not referenced in the visible code
    wandb = False             # when True, train_loop logs metrics through `wandb`
    apex = True               # not referenced in the visible code
    seed = 42                 # seed for the stratified fold split
    print_freq = 50           # progress line every `print_freq` batches

    # ---- cross-validation ----
    n_splits = 5              # number of StratifiedKFold splits
    n_fold = 5                # number of folds iterated by the training driver
    trn_fold = [0, 1, 2, 3, 4]  # folds that are actually trained

    # ---- model ----
    model = 'microsoft/deberta-v3-base'
    max_len = 1024            # tokenizer truncation length
    dropout = 0.1             # dropout applied before the classification head
    target_size = 4           # number of output classes
    gradient_checkpointing = True
    freezing = True           # freeze embeddings and the first two encoder layers

    # ---- optimisation ----
    epochs = 10
    batch_size = 8            # training batch size (validation uses twice this)
    n_accumulate = 1          # gradient accumulation steps
    num_workers = 2
    lr = 3e-5
    min_lr = 1e-6             # not referenced in the visible code
    weigth_decay = 0.01       # NOTE: misspelled name, kept because train_loop reads it
    scheduler = 'cosine'      # 'linear' or 'cosine'
    num_warmup_steps = 0
    num_cycles = 0.5          # passed to the cosine schedule

In [7]:
# Loss Func
def criterion(outputs, labels):
    """Return the mean cross-entropy loss between logits `outputs` and class indices `labels`."""
    return F.cross_entropy(outputs, labels)

def softmax(z):
    """Row-wise softmax of a 2-D array.

    The per-row maximum is subtracted before exponentiating so large
    values do not overflow.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 of the arg-max class of `outputs` against `labels`.

    Args:
        outputs: 2-D array-like of per-class scores, one row per sample.
        labels: 1-D array-like of true class indices.

    Returns:
        The macro F1 score as a float.
    """
    # Softmax is monotonic per row, so the arg-max of the raw scores is the
    # predicted class; this avoids the deprecated dim-less F.softmax call.
    predicted = np.argmax(np.asarray(outputs), axis=1)
    # scikit-learn expects (y_true, y_pred).
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Build the notebook logger that writes to the console and to `<filename>.log`.

    Args:
        filename: Log path without extension; ".log" is appended. The default
            is computed from OUTPUT_DIR when the function is defined.

    Returns:
        The logger named after this module, at INFO level, with exactly one
        stream handler and one file handler.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would otherwise stack handlers and duplicate every log line.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default argument it writes to OUTPUT_DIR + 'train.log'.
LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and current CUDA device).

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed from the shared config so the RNG seed and the fold-split seed cannot drift apart.
seed_everything(seed=CFG.seed)

def prepare_input(cfg, text, text_2=None):
    """Tokenize `text` (optionally paired with `text_2`) into long tensors.

    The encoding is padded and truncated to `cfg.max_len` with special tokens
    added. Every field returned by `cfg.tokenizer` is replaced in place by a
    `torch.long` tensor, and that same mapping is returned.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=cfg.max_len,
    )

    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded

In [8]:
def freeze(module):
    """Turn off gradient tracking for every parameter of `module`."""
    for weight in module.parameters():
        weight.requires_grad = False
        
def get_freezed_parameters(module):
    """List the names of `module`'s parameters that do not require gradients."""
    return [
        param_name
        for param_name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """Register an `optim_bits` optimizer override for each embedding table found.

    Looks for `word_embeddings`, `position_embeddings` and
    `token_type_embeddings` on `embeddings_path` and registers an override on
    the 'weight' of each one that exists.

    Source: https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): `bnb` is not imported anywhere in the visible notebook, so
    this raises NameError as soon as a matching attribute is found. It is
    presumably bitsandbytes; confirm and import it before calling.
    """
    for prefix in ("word", "position", "token_type"):
        attr_name = f"{prefix}_embeddings"
        if not hasattr(embeddings_path, attr_name):
            continue

        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
# Load the training set, the test set and the sample submission from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'))

# Quick sanity check: first rows and shape of each frame.
display(train.head())
print(train.shape)
display(test.head())
print(test.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


In [10]:
def remove_tag(x):
    """Return `x` with every `<...>` markup tag removed."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Strip HTML tags from each string in `texts` and return the results as a list.

    Only tags are removed; non-alphabetic characters are left untouched.
    """
    return [remove_tag(raw_text) for raw_text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending span as UTF-8 and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register the two handlers above under the names that
# resolve_encodings_and_normalize passes as `errors=...`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8/cp1252 text, then pass the result through `unidecode`.

    Each codec step uses one of the custom error handlers registered with
    `codecs.register_error`, so a failing span is converted with the other
    codec instead of raising.
    """
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

# Strip HTML tags from the raw descriptions (cleaning returns a plain list).
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
# Repair encodings and normalize; the resulting 'inputs' column is what the Dataset tokenizes.
train['inputs'] = train['description'].apply(lambda x : resolve_encodings_and_normalize(x))
test['inputs'] = test['description'].apply(lambda x : resolve_encodings_and_normalize(x))
# Rename the target column and remap class 4 to 0; other label values are kept.
# Presumably this makes the labels 0..3 to match CFG.target_size = 4 -- confirm against the data.
train = train.rename(columns = {"jobflag": "label"})
train["label"] = train["label"].apply(lambda x : 0 if x == 4 else x)
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,3,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",3,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,0,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",3,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,0,Quantify the resources required for a task/pro...
...,...,...,...,...
1511,1511,"Support detailed reporting, statistical analys...",1,"Support detailed reporting, statistical analys..."
1512,1512,Collaborate with teams to support the ML techn...,2,Collaborate with teams to support the ML techn...
1513,1513,Work with executives and other business leade...,1,Work with executives and other business leade...
1514,1514,Leading design ideation sessions to ensure we ...,3,Leading design ideation sessions to ensure we ...


In [11]:
# Assign each training row to a validation fold, stratified on the label.
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.label)):
    # Rows in this split's validation part get the fold id.
    train.loc[val_ , "kfold"] = int(fold)
    
# The column is created through .loc assignment above; cast it to a plain int column.
train["kfold"] = train["kfold"].astype(int)

In [12]:
# Load the tokenizer for CFG.model and save a copy next to the model outputs.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
# Expose the tokenizer through the config so later cells can read CFG.tokenizer.
CFG.tokenizer = tokenizer
# NOTE(review): SEP is not referenced in the visible part of the notebook.
SEP = tokenizer.sep_token

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding un-padded token ids plus the target for one row.

    The base class is named explicitly so that re-running this cell does not
    subclass the previous definition of this class, which shares its name
    with the imported torch `Dataset`.

    Args:
        df: DataFrame with an 'inputs' text column and a 'label' column.
        tokenizer: Tokenizer exposing `encode_plus`.
        max_length: Truncation length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Use the constructor arguments, not the global CFG / tokenizer, so the
        # dataset does what its signature says.
        self.max_len = max_length
        self.text = df['inputs'].values
        self.tokenizer = tokenizer
        self.targets = df['label'].values

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row `index`; padding is left to the collate function."""
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'target': self.targets[index]
        }

        # Only some tokenizers return segment ids; pass them through when present.
        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [14]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    """Collate function that pads each batch to its own longest sequence.

    Padding goes on the side given by `tokenizer.padding_side`, using
    `tokenizer.pad_token_id` for ids and 0 for the attention mask. When
    `isTrain` is true the per-sample 'target' values are batched as well.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        id_rows = [sample["input_ids"] for sample in batch]
        mask_rows = [sample["attention_mask"] for sample in batch]
        target_rows = [sample["target"] for sample in batch] if self.isTrain else None

        # Longest token sequence in this batch.
        longest = max(len(row) for row in id_rows)
        pad_on_right = self.tokenizer.padding_side == "right"

        def pad(row, filler):
            extra = (longest - len(row)) * [filler]
            return row + extra if pad_on_right else extra + row

        output = dict()
        output["input_ids"] = torch.tensor(
            [pad(row, self.tokenizer.pad_token_id) for row in id_rows], dtype=torch.long
        )
        output["attention_mask"] = torch.tensor(
            [pad(row, 0) for row in mask_rows], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(target_rows, dtype=torch.long)

        return output
    
# Shared collate function for the train and validation loaders (targets included).
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [15]:
class MeanPooling(nn.Module):
    """Average the hidden states over dim 1, counting only positions where the mask is 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight the states.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9 rather than zero.
        token_count = weights.sum(1).clamp(min=1e-9)
        return masked_total / token_count

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Pretrained backbone followed by mean pooling, dropout and a linear classifier."""

    def __init__(self, model_name):
        super().__init__()
        # Pretrained backbone
        self.model = AutoModel.from_pretrained(model_name)
        backbone = self.model

        # Optional gradient checkpointing
        if CFG.gradient_checkpointing:
            backbone.gradient_checkpointing_enable()

        # Optional freezing of the embeddings and the first two encoder layers
        if CFG.freezing:
            for frozen_part in (backbone.embeddings, backbone.encoder.layer[:2]):
                freeze(frozen_part)
            # One-shot lazy iterator over the parameters that remain trainable.
            CFG.after_freezed_parameters = filter(lambda p: p.requires_grad, backbone.parameters())

        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG.dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CFG.target_size)

    def forward(self, ids, mask):
        """Return the `CFG.target_size` class logits for token ids `ids` and attention mask `mask`."""
        backbone_out = self.model(input_ids=ids,
                                  attention_mask=mask,
                                  output_hidden_states=False)
        pooled = self.pooler(backbone_out.last_hidden_state, mask)
        return self.fc(self.drop(pooled))

In [17]:
def asMinutes(s):
    """Format a duration of `s` seconds as "<minutes>m <seconds>s" (seconds truncated)."""
    whole_minutes = math.floor(s / 60)
    leftover = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover)

def timeSince(since, percent):
    """Describe the time elapsed since `since` and the estimated time remaining.

    `percent` is the fraction of the work already done; the total duration is
    extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the learning-rate scheduler selected by `cfg.scheduler`.

    Args:
        cfg: Config exposing `scheduler` ('linear' or 'cosine'),
            `num_warmup_steps` and, for 'cosine', `num_cycles`.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps over the whole run.

    Returns:
        The scheduler built by the matching transformers helper.

    Raises:
        ValueError: If `cfg.scheduler` is not a supported name.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    # Fail with a clear message; the previous fall-through raised UnboundLocalError.
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'."
    )

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train `model` for one pass over `dataloader` and return the mean loss.

    Gradients are accumulated over `CFG.n_accumulate` batches. The optimizer
    (and the scheduler, if given) also steps on the final batch, so a trailing
    partial group is applied and does not leak into the next epoch.

    Args:
        model: Network called as `model(ids, mask)`.
        optimizer: Optimizer updating `model`.
        scheduler: Per-step LR scheduler, or None.
        dataloader: Yields dicts with 'input_ids', 'attention_mask', 'target'.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index, used only in the progress line.

    Returns:
        Sample-weighted mean of the unscaled batch losses (0.0 for an empty loader).
    """
    model.train()

    dataset_size = 0
    running_loss = 0
    epoch_loss = 0.0
    num_batches = len(dataloader)

    start = time.time()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        # Keep the unscaled value for reporting; only the backward pass is
        # divided by the number of accumulation steps.
        batch_loss = loss.item()

        (loss / CFG.n_accumulate).backward()

        is_last_batch = (step + 1) == num_batches
        if (step + 1) % CFG.n_accumulate == 0 or is_last_batch:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        running_loss += (batch_loss * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        if step % CFG.print_freq == 0 or step == (num_batches-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, num_batches,
                          remain=timeSince(start, float(step+1)/num_batches)))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate `model` on `dataloader` without tracking gradients.

    Returns:
        A tuple `(mean_loss, logits)`: the sample-weighted mean loss and the
        model outputs of all batches concatenated into one NumPy array.
    """
    model.eval()

    seen_samples = 0
    loss_total = 0
    batch_logits = []
    total_batches = len(dataloader)
    started = time.time()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        batch_logits.append(outputs.to('cpu').numpy())

        n_in_batch = ids.size(0)
        loss_total += (criterion(outputs, targets).item()* n_in_batch)
        seen_samples += n_in_batch

        if step % CFG.print_freq == 0 or step == (total_batches-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, total_batches,
                          remain=timeSince(started, float(step+1)/total_batches)))

    pred = np.concatenate(batch_logits)

    return loss_total / seen_samples, pred

In [18]:
def train_loop(fold):
    """Train and validate one cross-validation fold.

    Rows whose `kfold` differs from `fold` are used for training and the rest
    for validation. After every epoch the macro-F1 score is computed, and the
    weights plus validation outputs of the best epoch are saved under
    OUTPUT_MODEL_DIR.

    Args:
        fold: Index of the fold held out for validation.

    Returns:
        The validation DataFrame with one extra column per class holding the
        best epoch's raw model outputs (logits, not probabilities).
    """
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)

    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

    model = CustomModel(CFG.model)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    # Count optimizer steps from the loader itself: drop_last=True discards the
    # final partial batch and accumulation groups several batches per step, so
    # len(train_data) / batch_size overestimates the schedule length.
    steps_per_epoch = math.ceil(len(train_loader) / CFG.n_accumulate)
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    best_path = OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    # loop
    # Start below any reachable score so the first epoch always writes a
    # checkpoint; otherwise an all-zero run would load a stale or missing file.
    best_score = float('-inf')

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            # NOTE(review): `wandb` is not imported in the visible notebook;
            # import it before setting CFG.wandb = True.
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": train_epoch_loss,
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})

        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        best_path)

    # Reload the validation outputs stored with the best checkpoint.
    predictions = torch.load(best_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_data['Consultant'] = predictions[:, 0]
    valid_data['Data scientist'] = predictions[:, 1]
    valid_data['Machine learning engineer'] = predictions[:, 2]
    valid_data['Software engineer'] = predictions[:, 3]


    temp = valid_data[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    torch.cuda.empty_cache()
    gc.collect()

    return valid_data

In [19]:
# Driver: train every configured fold, log per-fold and overall scores,
# and persist the out-of-fold (OOF) predictions.
if __name__ == '__main__':
    
    def get_result(oof_df):
        """Log the score of the four class columns in `oof_df` against its 'label' column."""
        labels = oof_df['label'].values
        preds = oof_df[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        # Accumulates the validation frames returned by each fold.
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            # Only folds listed in CFG.trn_fold are trained.
            if fold in CFG.trn_fold:
                _oof_df = train_loop(fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        # Each fold frame carries its own 0-based index, so renumber after concatenation.
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        # Save the OOF predictions as both pickle and CSV.
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
        oof_df.to_csv(OUTPUT_MODEL_DIR+f'oof_df.csv', index=False)

-------------fold:0 training-------------


Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/151] Elapsed 0m 4s (remain 10m 9s) 
Epoch: [1][50/151] Elapsed 0m 22s (remain 0m 43s) 
Epoch: [1][100/151] Elapsed 0m 38s (remain 0m 19s) 
Epoch: [1][150/151] Elapsed 0m 55s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 1 - avg_train_loss: 0.9333  avg_val_loss: 0.8344  time: 60s
Epoch 1 - Score: 0.5196
Epoch 1 - Save Best Score: 0.5196 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 1m 4s) 
Epoch: [2][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [2][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [2][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 2 - avg_train_loss: 0.5791  avg_val_loss: 0.6335  time: 57s
Epoch 2 - Score: 0.6113
Epoch 2 - Save Best Score: 0.6113 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 1m 21s) 
Epoch: [3][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [3][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [3][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 3 - avg_train_loss: 0.4245  avg_val_loss: 0.6633  time: 58s
Epoch 3 - Score: 0.6799
Epoch 3 - Save Best Score: 0.6799 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 1m 10s) 
Epoch: [4][50/151] Elapsed 0m 19s (remain 0m 37s) 
Epoch: [4][100/151] Elapsed 0m 37s (remain 0m 18s) 
Epoch: [4][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 4 - avg_train_loss: 0.2863  avg_val_loss: 0.6578  time: 58s
Epoch 4 - Score: 0.7607
Epoch 4 - Save Best Score: 0.7607 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 2m 19s) 
Epoch: [5][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [5][100/151] Elapsed 0m 36s (remain 0m 17s) 
Epoch: [5][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 5 - avg_train_loss: 0.1688  avg_val_loss: 0.7163  time: 58s
Epoch 5 - Score: 0.7297


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/151] Elapsed 0m 0s (remain 2m 13s) 
Epoch: [6][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [6][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [6][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 6 - avg_train_loss: 0.0653  avg_val_loss: 0.8007  time: 57s
Epoch 6 - Score: 0.7592


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/151] Elapsed 0m 0s (remain 1m 25s) 
Epoch: [7][50/151] Elapsed 0m 17s (remain 0m 35s) 
Epoch: [7][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [7][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 7 - avg_train_loss: 0.0305  avg_val_loss: 0.8364  time: 57s
Epoch 7 - Score: 0.7549


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/151] Elapsed 0m 0s (remain 1m 7s) 
Epoch: [8][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [8][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [8][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 8 - avg_train_loss: 0.0108  avg_val_loss: 0.8457  time: 58s
Epoch 8 - Score: 0.7632
Epoch 8 - Save Best Score: 0.7632 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/151] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [9][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [9][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [9][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 9 - avg_train_loss: 0.0082  avg_val_loss: 0.8556  time: 58s
Epoch 9 - Score: 0.7509


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/151] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [10][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [10][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [10][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 10 - avg_train_loss: 0.0070  avg_val_loss: 0.8599  time: 58s
Epoch 10 - Score: 0.7509


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 


Score: 0.7632
-------------fold:1 training-------------


0.7632318732463171


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/151] Elapsed 0m 0s (remain 1m 18s) 
Epoch: [1][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [1][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [1][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 1 - avg_train_loss: 0.9985  avg_val_loss: 0.6970  time: 57s
Epoch 1 - Score: 0.5630
Epoch 1 - Save Best Score: 0.5630 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 0m 55s) 
Epoch: [2][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [2][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [2][150/151] Elapsed 0m 54s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 2 - avg_train_loss: 0.6154  avg_val_loss: 0.7007  time: 58s
Epoch 2 - Score: 0.6753
Epoch 2 - Save Best Score: 0.6753 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 1m 22s) 
Epoch: [3][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [3][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [3][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 3 - avg_train_loss: 0.4948  avg_val_loss: 0.6335  time: 56s
Epoch 3 - Score: 0.6731


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 0m 47s) 
Epoch: [4][50/151] Elapsed 0m 17s (remain 0m 33s) 
Epoch: [4][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [4][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 4 - avg_train_loss: 0.3415  avg_val_loss: 0.6557  time: 57s
Epoch 4 - Score: 0.7048
Epoch 4 - Save Best Score: 0.7048 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 0m 52s) 
Epoch: [5][50/151] Elapsed 0m 20s (remain 0m 39s) 
Epoch: [5][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [5][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 5 - avg_train_loss: 0.2272  avg_val_loss: 0.7381  time: 56s
Epoch 5 - Score: 0.6982


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [6][0/151] Elapsed 0m 0s (remain 1m 15s) 
Epoch: [6][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [6][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [6][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 6 - avg_train_loss: 0.1399  avg_val_loss: 0.8322  time: 58s
Epoch 6 - Score: 0.7016


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [7][0/151] Elapsed 0m 0s (remain 1m 21s) 
Epoch: [7][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [7][100/151] Elapsed 0m 37s (remain 0m 18s) 
Epoch: [7][150/151] Elapsed 0m 54s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 7 - avg_train_loss: 0.0608  avg_val_loss: 0.8522  time: 58s
Epoch 7 - Score: 0.6933


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [8][0/151] Elapsed 0m 0s (remain 0m 58s) 
Epoch: [8][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [8][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [8][150/151] Elapsed 0m 54s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 8 - avg_train_loss: 0.0334  avg_val_loss: 0.9058  time: 58s
Epoch 8 - Score: 0.6994


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [9][0/151] Elapsed 0m 0s (remain 0m 58s) 
Epoch: [9][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [9][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [9][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 9 - avg_train_loss: 0.0191  avg_val_loss: 0.9320  time: 57s
Epoch 9 - Score: 0.7293
Epoch 9 - Save Best Score: 0.7293 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [10][0/151] Elapsed 0m 0s (remain 0m 57s) 
Epoch: [10][50/151] Elapsed 0m 17s (remain 0m 33s) 
Epoch: [10][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [10][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 10 - avg_train_loss: 0.0183  avg_val_loss: 0.9434  time: 56s
Epoch 10 - Score: 0.6994


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 


Score: 0.7293


0.7293167677468589


-------------fold:2 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

Epoch: [1][0/151] Elapsed 0m 0s (remain 1m 8s) 
Epoch: [1][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [1][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [1][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 1 - avg_train_loss: 0.9741  avg_val_loss: 0.6511  time: 56s
Epoch 1 - Score: 0.6886
Epoch 1 - Save Best Score: 0.6886 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 0m 59s) 
Epoch: [2][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [2][100/151] Elapsed 0m 36s (remain 0m 17s) 
Epoch: [2][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 2 - avg_train_loss: 0.6390  avg_val_loss: 0.6070  time: 56s
Epoch 2 - Score: 0.6948
Epoch 2 - Save Best Score: 0.6948 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 1m 26s) 
Epoch: [3][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [3][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [3][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 3 - avg_train_loss: 0.4504  avg_val_loss: 0.6280  time: 56s
Epoch 3 - Score: 0.6262


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 0m 53s) 
Epoch: [4][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [4][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [4][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 4 - avg_train_loss: 0.2987  avg_val_loss: 0.7204  time: 57s
Epoch 4 - Score: 0.6977
Epoch 4 - Save Best Score: 0.6977 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 1m 3s) 
Epoch: [5][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [5][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [5][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 5 - avg_train_loss: 0.1844  avg_val_loss: 0.8088  time: 56s
Epoch 5 - Score: 0.6779


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/151] Elapsed 0m 0s (remain 1m 7s) 
Epoch: [6][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [6][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [6][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 6 - avg_train_loss: 0.1082  avg_val_loss: 0.9025  time: 56s
Epoch 6 - Score: 0.6739


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/151] Elapsed 0m 0s (remain 1m 42s) 
Epoch: [7][50/151] Elapsed 0m 16s (remain 0m 31s) 
Epoch: [7][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [7][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 7 - avg_train_loss: 0.0429  avg_val_loss: 1.0117  time: 55s
Epoch 7 - Score: 0.6703


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/151] Elapsed 0m 0s (remain 0m 53s) 
Epoch: [8][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [8][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [8][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 8 - avg_train_loss: 0.0310  avg_val_loss: 0.9817  time: 56s
Epoch 8 - Score: 0.6868


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/151] Elapsed 0m 0s (remain 0m 52s) 
Epoch: [9][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [9][100/151] Elapsed 0m 34s (remain 0m 16s) 
Epoch: [9][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 9 - avg_train_loss: 0.0206  avg_val_loss: 1.0077  time: 56s
Epoch 9 - Score: 0.6716


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/151] Elapsed 0m 0s (remain 1m 10s) 
Epoch: [10][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [10][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [10][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 10 - avg_train_loss: 0.0163  avg_val_loss: 1.0180  time: 55s
Epoch 10 - Score: 0.6798


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
0.6976686328965201


Score: 0.6977
-------------fold:3 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Ber

Epoch: [1][0/151] Elapsed 0m 0s (remain 1m 20s) 
Epoch: [1][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [1][100/151] Elapsed 0m 36s (remain 0m 17s) 
Epoch: [1][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 1 - avg_train_loss: 0.9742  avg_val_loss: 0.7066  time: 57s
Epoch 1 - Score: 0.5556
Epoch 1 - Save Best Score: 0.5556 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 2m 2s) 
Epoch: [2][50/151] Elapsed 0m 19s (remain 0m 37s) 
Epoch: [2][100/151] Elapsed 0m 38s (remain 0m 18s) 
Epoch: [2][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 2 - avg_train_loss: 0.6416  avg_val_loss: 0.6694  time: 58s
Epoch 2 - Score: 0.6483
Epoch 2 - Save Best Score: 0.6483 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 1m 9s) 
Epoch: [3][50/151] Elapsed 0m 20s (remain 0m 39s) 
Epoch: [3][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [3][150/151] Elapsed 0m 54s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 3 - avg_train_loss: 0.4642  avg_val_loss: 0.6512  time: 58s
Epoch 3 - Score: 0.6908
Epoch 3 - Save Best Score: 0.6908 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [4][50/151] Elapsed 0m 17s (remain 0m 33s) 
Epoch: [4][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [4][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 4 - avg_train_loss: 0.3485  avg_val_loss: 0.5903  time: 58s
Epoch 4 - Score: 0.7065
Epoch 4 - Save Best Score: 0.7065 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 1s (remain 2m 54s) 
Epoch: [5][50/151] Elapsed 0m 19s (remain 0m 38s) 
Epoch: [5][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [5][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 5 - avg_train_loss: 0.2050  avg_val_loss: 0.7770  time: 56s
Epoch 5 - Score: 0.6931


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [6][0/151] Elapsed 0m 0s (remain 0m 57s) 
Epoch: [6][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [6][100/151] Elapsed 0m 36s (remain 0m 17s) 
Epoch: [6][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 6 - avg_train_loss: 0.1101  avg_val_loss: 0.8596  time: 56s
Epoch 6 - Score: 0.7233
Epoch 6 - Save Best Score: 0.7233 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [7][0/151] Elapsed 0m 0s (remain 1m 10s) 
Epoch: [7][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [7][100/151] Elapsed 0m 34s (remain 0m 16s) 
Epoch: [7][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 7 - avg_train_loss: 0.0778  avg_val_loss: 0.8499  time: 57s
Epoch 7 - Score: 0.7271
Epoch 7 - Save Best Score: 0.7271 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [8][0/151] Elapsed 0m 0s (remain 1m 13s) 
Epoch: [8][50/151] Elapsed 0m 19s (remain 0m 38s) 
Epoch: [8][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [8][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 8 - avg_train_loss: 0.0293  avg_val_loss: 0.8885  time: 56s
Epoch 8 - Score: 0.7334
Epoch 8 - Save Best Score: 0.7334 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [9][0/151] Elapsed 0m 0s (remain 1m 16s) 
Epoch: [9][50/151] Elapsed 0m 20s (remain 0m 40s) 
Epoch: [9][100/151] Elapsed 0m 37s (remain 0m 18s) 
Epoch: [9][150/151] Elapsed 0m 54s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 9 - avg_train_loss: 0.0194  avg_val_loss: 0.9130  time: 58s
Epoch 9 - Score: 0.7282


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [10][0/151] Elapsed 0m 0s (remain 1m 23s) 
Epoch: [10][50/151] Elapsed 0m 20s (remain 0m 39s) 
Epoch: [10][100/151] Elapsed 0m 36s (remain 0m 17s) 
Epoch: [10][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 10 - avg_train_loss: 0.0149  avg_val_loss: 0.9179  time: 57s
Epoch 10 - Score: 0.7233


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
0.73338688449166


Score: 0.7334
-------------fold:4 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Ber

Epoch: [1][0/151] Elapsed 0m 0s (remain 1m 29s) 
Epoch: [1][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [1][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [1][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 1 - avg_train_loss: 1.0259  avg_val_loss: 0.7582  time: 55s
Epoch 1 - Score: 0.5456
Epoch 1 - Save Best Score: 0.5456 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 1m 16s) 
Epoch: [2][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [2][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [2][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 2 - avg_train_loss: 0.6883  avg_val_loss: 0.6893  time: 56s
Epoch 2 - Score: 0.6253
Epoch 2 - Save Best Score: 0.6253 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 0m 48s) 
Epoch: [3][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [3][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [3][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 3 - avg_train_loss: 0.5520  avg_val_loss: 0.6974  time: 57s
Epoch 3 - Score: 0.6598
Epoch 3 - Save Best Score: 0.6598 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 1m 1s) 
Epoch: [4][50/151] Elapsed 0m 18s (remain 0m 35s) 
Epoch: [4][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [4][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 4 - avg_train_loss: 0.4001  avg_val_loss: 0.6016  time: 55s
Epoch 4 - Score: 0.7305
Epoch 4 - Save Best Score: 0.7305 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 1m 10s) 
Epoch: [5][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [5][100/151] Elapsed 0m 36s (remain 0m 17s) 
Epoch: [5][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 5 - avg_train_loss: 0.2805  avg_val_loss: 0.6555  time: 56s
Epoch 5 - Score: 0.7053


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [6][0/151] Elapsed 0m 0s (remain 0m 52s) 
Epoch: [6][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [6][100/151] Elapsed 0m 34s (remain 0m 16s) 
Epoch: [6][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 6 - avg_train_loss: 0.1842  avg_val_loss: 0.7138  time: 56s
Epoch 6 - Score: 0.7296


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [7][0/151] Elapsed 0m 0s (remain 0m 58s) 
Epoch: [7][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [7][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [7][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 7 - avg_train_loss: 0.1045  avg_val_loss: 0.8909  time: 56s
Epoch 7 - Score: 0.6954


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [8][0/151] Elapsed 0m 0s (remain 0m 52s) 
Epoch: [8][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [8][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [8][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 8 - avg_train_loss: 0.0673  avg_val_loss: 0.8810  time: 57s
Epoch 8 - Score: 0.7381
Epoch 8 - Save Best Score: 0.7381 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/151] Elapsed 0m 0s (remain 1m 6s) 
Epoch: [9][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [9][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [9][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 9 - avg_train_loss: 0.0563  avg_val_loss: 0.9134  time: 56s
Epoch 9 - Score: 0.7387
Epoch 9 - Save Best Score: 0.7387 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/151] Elapsed 0m 0s (remain 1m 37s) 
Epoch: [10][50/151] Elapsed 0m 17s (remain 0m 33s) 
Epoch: [10][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [10][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 10 - avg_train_loss: 0.0464  avg_val_loss: 0.9164  time: 56s
Epoch 10 - Score: 0.7364


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 


Score: 0.7387


0.7386594806051077


Score: 0.7327


In [20]:
# Reload the out-of-fold (OOF) prediction table that the training run wrote
# to disk, then preview its first rows as the cell's output.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# CSV was saved with its index; `index_col=0` would absorb it -- confirm that
# no later cell relies on the current column positions before changing this.
oof_csv_path = OUTPUT_MODEL_DIR + 'oof_df.csv'
A = pd.read_csv(oof_csv_path)
A.head()

Unnamed: 0,id,description,label,inputs,kfold,Consultant,Data scientist,Machine learning engineer,Software engineer
0,1,"Designs and develops high quality, scalable a...",3,"Designs and develops high quality, scalable a...",0,-1.282729,-2.773179,-2.872591,5.333655
1,9,Maintain and improve existing predictive model...,1,Maintain and improve existing predictive model...,0,-0.521084,5.992557,-2.142767,-3.16655
2,10,Optimize deep learning frameworks like Tensor...,2,Optimize deep learning frameworks like Tensor...,0,-2.937326,-0.239947,5.709708,-0.98237
3,26,Explore and evaluate new ML algorithms to opti...,1,Explore and evaluate new ML algorithms to opti...,0,-3.754522,3.016727,3.990207,-2.201664
4,32,Optimizing our ML model and methods for deter...,2,Optimizing our ML model and methods for deter...,0,-3.863037,3.374876,3.804641,-1.566969
