# Remove duplicates, add pooling layer

In [None]:
# Mount Google Drive at /content/drive; the INPUT_DIR / OUTPUT_DIR paths used
# by this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Folder the train/test/sample-submission CSV files are read from.
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
# Root folder for everything this notebook writes (the default log file goes here).
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
# NOTE(review): OUTPUT_SUB_DIR is not referenced in the visible cells.
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
# Folder for the tokenizer copy, model config, per-fold checkpoints and OOF files.
# The trailing slash matters: later cells build paths by string concatenation.
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-v3-large[ver3]/')

In [None]:
class CFG:
    """Experiment configuration, read as class attributes by the cells below.

    A ``tokenizer`` attribute is attached later, after the tokenizer is loaded.
    """
    wandb = False  # when True, train_loop logs per-epoch metrics through `wandb`
    apex = True  # NOTE(review): not referenced in the visible cells
    model = 'microsoft/deberta-v3-large'  # checkpoint name for the tokenizer and the backbone
    seed = 42  # default seed of seed_everything and random_state of the fold split
    n_splits = 5  # number of StratifiedKFold splits
    max_len = 1024  # maximum token length used when tokenizing
    dropout = 0.2  # NOTE(review): not referenced in the visible cells
    target_size=4  # number of classes (output size of the final linear layer)
    n_accumulate=1  # number of batches accumulated per optimizer step
    print_freq = 50  # progress is printed every `print_freq` steps
    min_lr=1e-6  # NOTE(review): not referenced in the visible cells
    scheduler = 'cosine'  # 'linear' or 'cosine'; selected in get_scheduler
    batch_size = 16  # training batch size (validation uses twice this)
    num_workers = 2  # DataLoader worker processes
    lr = 3e-5  # optimizer learning rate
    weigth_decay = 0.01  # optimizer weight decay; name is misspelled but is what train_loop reads
    epochs = 10  # epochs per fold
    n_fold = 5  # number of fold ids iterated by the training driver
    trn_fold = [0, 1, 2, 3, 4]  # fold ids that are actually trained
    train = True  # run the training driver when True
    num_warmup_steps = 0  # warm-up steps passed to the LR scheduler
    num_cycles=0.5  # `num_cycles` passed to the cosine scheduler
    debug = False  # NOTE(review): not referenced in the visible cells
    debug_ver2 = False  # NOTE(review): not referenced in the visible cells
    gradient_checkpointing = True  # enable gradient checkpointing on the backbone
    freezing = True  # freeze the embeddings and the first two encoder layers

In [None]:
# Loss Func
def criterion(outputs, labels):
    """Mean cross-entropy between the logits ``outputs`` and the class ids ``labels``."""
    return F.cross_entropy(outputs, labels)

def softmax(z):
    """Row-wise softmax of a 2-D array.

    The row maximum is subtracted before exponentiating so that large values
    do not overflow.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True)  # kept 2-D so it broadcasts per row
    exponentials = np.exp(z - row_max)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_totals
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 of the arg-max class against the true labels.

    Args:
        outputs: 2-D array-like of per-class scores (rows are samples).
        labels: 1-D array-like of true class ids.

    Returns:
        The macro F1 score as a float.
    """
    # Softmax is monotonic per row, so the arg-max of the raw scores is the
    # predicted class; this also avoids the deprecated implicit-dim softmax.
    predicted = np.argmax(np.asarray(outputs), axis=1)
    # scikit-learn's documented argument order is (y_true, y_pred).
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing bare messages to console and file.

    Args:
        filename: log file path without extension; ".log" is appended.

    Returns:
        The logger named after ``__name__`` with exactly one stream handler
        and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would otherwise stack handlers and emit each message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# Notebook-wide logger; with the default argument it writes to the console and
# to OUTPUT_DIR + 'train.log'.
LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch and request deterministic cuDNN kernels.

    Args:
        seed: integer seed applied to every random number generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs; keep it off.
    torch.backends.cudnn.benchmark = False
    
# Take the seed from the configuration instead of repeating the literal.
seed_everything(seed=CFG.seed)

def prepare_input(cfg, text, text_2=None):
    """Tokenize ``text`` (optionally paired with ``text_2``) into long tensors.

    The text is padded to ``cfg.max_len`` and truncated by ``cfg.tokenizer``;
    every field of the tokenizer output is then replaced in place by a
    ``torch.long`` tensor and the same object is returned.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        padding="max_length",
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )

    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded

In [None]:
def freeze(module):
    """Turn off gradient tracking for every parameter of ``module``."""
    for param in module.parameters():
        param.requires_grad_(False)
        
def get_freezed_parameters(module):
    """List the names of ``module``'s parameters that do not require gradients."""
    return [
        name
        for name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an ``optim_bits`` optimizer override for embedding weights.

    For each of the ``word_embeddings``, ``position_embeddings`` and
    ``token_type_embeddings`` attributes that exists on ``embeddings_path``,
    an override ``{'optim_bits': optim_bits}`` is registered for its
    ``weight`` through ``bnb.optim.GlobalOptimManager``.

    NOTE(review): ``bnb`` is not imported in the visible cells (presumably
    ``bitsandbytes`` -- confirm); calling this as-is raises NameError as soon
    as one of the attributes is found.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        # Attributes that are missing on this embeddings module are skipped.
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [None]:
# Read the three competition CSV files from INPUT_DIR.
train, test, submission_df = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'submit_sample.csv')
)

# Preview the first rows and the shape of the train and test frames.
for frame in (train, test):
    display(frame.head())
    print(frame.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


In [None]:
def remove_tag(x):
    """Return ``x`` with every ``<...>`` tag-like span removed."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Return a list with the HTML tags stripped from each text in ``texts``."""
    # Only tag removal is applied; replacing non-alphabetic characters with
    # spaces (re.sub(r'[^a-zA-Z]', ' ', text)) is intentionally left disabled.
    return [remove_tag(text) for text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the unencodable slice as UTF-8 bytes and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the undecodable slice as cp1252 and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register the two custom codec error handlers under the names that
# resolve_encodings_and_normalize passes as `errors=` to encode()/decode().
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 encoding damage, then transliterate with unidecode.

    Relies on the "replace_decoding_with_cp1252" and
    "replace_encoding_with_utf8" codec error handlers being registered.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

# Strip HTML tags, then repair/normalize the text that is fed to the model.
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
train['inputs'] = train['description'].apply(resolve_encodings_and_normalize)
test['inputs'] = test['description'].apply(resolve_encodings_and_normalize)

# Keep the first occurrence of each distinct cleaned text.
train = train.drop_duplicates(subset=['inputs'], keep='first').reset_index(drop=True)

# Rename the target and shift it to start at 0. Guarded so that re-running
# this cell does not subtract 1 from the labels a second time.
if 'jobflag' in train.columns:
    train = train.rename(columns={"jobflag": "label"})
    train["label"] = train["label"] - 1
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,2,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,3,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,3,Quantify the resources required for a task/pro...
...,...,...,...,...
1499,1511,"Support detailed reporting, statistical analys...",0,"Support detailed reporting, statistical analys..."
1500,1512,Collaborate with teams to support the ML techn...,1,Collaborate with teams to support the ML techn...
1501,1513,Work with executives and other business leade...,0,Work with executives and other business leade...
1502,1514,Leading design ideation sessions to ensure we ...,2,Leading design ideation sessions to ensure we ...


In [None]:
# Stratified fold assignment: every row receives the id of the fold in which
# it is part of the validation split.
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
fold_of_row = np.zeros(len(train), dtype=int)
for fold, (_, val_) in enumerate(skf.split(train, train.label)):
    fold_of_row[val_] = fold

train["kfold"] = fold_of_row

In [None]:
# Load the tokenizer of the configured checkpoint and save a copy under the
# model output folder.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
# Expose the tokenizer through CFG for the dataset / collate code.
CFG.tokenizer = tokenizer
# NOTE(review): SEP is not referenced in the visible cells.
SEP = tokenizer.sep_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Tokenizes the ``inputs`` column of a DataFrame one row at a time.

    The base class is spelled out in full because this class reuses the name
    ``Dataset``; ``class Dataset(Dataset)`` would subclass the previous
    definition of itself whenever the cell is run again.

    Args:
        df: frame with an ``inputs`` text column and, optionally, an integer
            ``label`` column.
        tokenizer: object providing ``encode_plus``.
        max_length: maximum number of tokens kept per text (longer texts are
            truncated by the tokenizer).
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Use the constructor arguments rather than module-level globals.
        self.max_len = max_length
        self.text = df['inputs'].values
        self.tokenizer = tokenizer
        # Unlabeled frames (e.g. test data) simply yield no 'target'.
        self.targets = df['label'].values if 'label' in df.columns else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the un-padded token ids / mask (and target) for one row."""
        inputs = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        if self.targets is not None:
            samples['target'] = self.targets[index]

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [None]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    """Pads a list of samples to the longest sequence of the batch.

    Padding is added on the side given by ``tokenizer.padding_side`` using
    ``tokenizer.pad_token_id`` for the ids and 0 for the attention mask.
    With ``isTrain=True`` the per-sample ``target`` values are batched too.
    All returned values are ``torch.long`` tensors.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        id_rows = [sample["input_ids"] for sample in batch]
        mask_rows = [sample["attention_mask"] for sample in batch]
        target_rows = [sample["target"] for sample in batch] if self.isTrain else None

        # Longest token sequence in this batch.
        longest = max(len(row) for row in id_rows)
        pad_on_right = self.tokenizer.padding_side == "right"

        def pad(row, fill):
            filler = (longest - len(row)) * [fill]
            return row + filler if pad_on_right else filler + row

        output = dict()
        output["input_ids"] = torch.tensor(
            [pad(row, self.tokenizer.pad_token_id) for row in id_rows], dtype=torch.long
        )
        output["attention_mask"] = torch.tensor(
            [pad(row, 0) for row in mask_rows], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(target_rows, dtype=torch.long)

        return output
    
# Collate function handed to both DataLoaders in train_loop; isTrain=True
# makes it batch the targets as well.
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over dimension 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the feature dimension so masked positions
        # contribute nothing to the sum.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        # Clamp the count so a fully masked row does not divide by zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_total / token_count

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Pretrained backbone + masked mean pooling + multi-sample-dropout head.

    The pooled representation is layer-normalised, passed through five
    dropout layers with rates 0.1 ... 0.5, each result is fed to the same
    linear classifier, and the five logit sets are averaged.

    Args:
        model_name: checkpoint name passed to ``AutoModel`` / ``AutoConfig``.
    """

    def __init__(self, model_name):
        super(CustomModel, self).__init__()

        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7

        self.model = AutoModel.from_pretrained(model_name)

        # NOTE(review): this config object is loaded separately and is not
        # handed to from_pretrained above, so the overrides below are only
        # visible through self.config (head size, initializer range, the
        # config file saved by train_loop) and not to the backbone itself.
        # Confirm whether that is intended.
        self.config = AutoConfig.from_pretrained(model_name)

        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": CFG.target_size,
            }
        )

        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.pooler = MeanPooling()
        self.output = nn.Linear(self.config.hidden_size, CFG.target_size)
        self._init_weights(self.output)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)

        # Gradient checkpointing
        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if CFG.freezing:
            # Freeze the embeddings and the first 2 encoder layers.
            freeze(self.model.embeddings)
            freeze(self.model.encoder.layer[:2])
            # Stored as a list: a bare filter() object is a one-shot iterator
            # that is empty after the first pass over it.
            CFG.after_freezed_parameters = [
                parameter for parameter in self.model.parameters() if parameter.requires_grad
            ]

    def _init_weights(self, module):
        """Initialise ``module`` with the scheme selected by its layer type."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask):
        """Return the averaged class logits for a batch of ids and its mask."""
        transformer_out = self.model(input_ids=ids, attention_mask=mask)
        pooled = self.pooler(transformer_out.last_hidden_state, mask)
        pooled = self.layer_norm1(pooled)

        # Multi-sample dropout: same classifier, five dropout rates, averaged.
        # The heads are applied in the same order as before so the random
        # number stream is consumed identically.
        heads = (self.dropout1, self.dropout2, self.dropout3, self.dropout4, self.dropout5)
        logits = sum(self.output(head(pooled)) for head in heads) / len(heads)

        return logits

In [None]:
def asMinutes(s):
    """Format a duration in seconds as "<minutes>m <seconds>s" (seconds truncated)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "{:d}m {:d}s".format(whole_minutes, int(leftover_seconds))

def timeSince(since, percent):
    """Describe elapsed time since ``since`` and the projected remaining time.

    ``percent`` is the completed fraction of the work; the total duration is
    extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return "{} (remain {})".format(asMinutes(elapsed), asMinutes(remaining))

def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: configuration providing ``scheduler`` ('linear' or 'cosine'),
            ``num_warmup_steps`` and, for 'cosine', ``num_cycles``.
        optimizer: optimizer whose learning rate is scheduled.
        num_train_steps: total number of scheduler steps.

    Returns:
        The scheduler created by the matching transformers helper.

    Raises:
        ValueError: if ``cfg.scheduler`` is not a supported name.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    # Previously an unknown name fell through to an UnboundLocalError.
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
    )

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Run one training epoch and return the sample-weighted mean loss.

    Gradients are accumulated over ``CFG.n_accumulate`` batches per optimizer
    step; the scheduler (if any) is stepped together with the optimizer.

    Args:
        model: module called as ``model(ids, mask)`` returning logits.
        optimizer: optimizer updating ``model``.
        scheduler: LR scheduler stepped after each optimizer step, or None.
        dataloader: yields dicts with 'input_ids', 'attention_mask', 'target'.
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index, used only for the progress print.

    Returns:
        Mean loss over all samples seen (0.0 for an empty dataloader).
    """
    model.train()
    # Start from clean gradients regardless of what ran before.
    optimizer.zero_grad()

    num_steps = len(dataloader)
    dataset_size = 0
    running_loss = 0
    epoch_loss = 0.0

    start = time.time()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        # Only the back-propagated value is scaled for accumulation; the
        # reported loss below stays on the un-scaled, per-batch scale.
        (loss / CFG.n_accumulate).backward()

        # Also step on the final batch so an incomplete accumulation group is
        # applied instead of leaking its gradients into the next epoch.
        if (step + 1) % CFG.n_accumulate == 0 or (step + 1) == num_steps:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        if step % CFG.print_freq == 0 or step == (num_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, num_steps,
                          remain=timeSince(start, float(step+1)/num_steps)))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        A tuple ``(mean_loss, pred)`` where ``mean_loss`` is the
        sample-weighted mean loss and ``pred`` is a NumPy array with the
        model outputs of all batches concatenated in loader order.
    """
    model.eval()

    sample_count = 0
    loss_total = 0
    began = time.time()
    collected = []

    for step, batch in enumerate(dataloader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['target'].to(device, dtype=torch.long)

        rows_in_batch = ids.size(0)
        logits = model(ids, mask)
        batch_loss = criterion(logits, targets)
        collected.append(logits.to('cpu').numpy())

        loss_total += (batch_loss.item() * rows_in_batch)
        sample_count += rows_in_batch

        # Progress line every CFG.print_freq steps and on the last step.
        if step % CFG.print_freq == 0 or step == (len(dataloader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, len(dataloader),
                          remain=timeSince(began, float(step+1)/len(dataloader))))

    pred = np.concatenate(collected)

    return loss_total / sample_count, pred

In [None]:
def train_loop(fold):
    """Train and validate one cross-validation fold.

    Rows of the global ``train`` frame whose ``kfold`` equals ``fold`` form
    the validation set; all other rows are used for training. After every
    epoch the macro-F1 on the validation set is computed, and the model
    state and validation outputs of the best epoch are saved to
    OUTPUT_MODEL_DIR.

    Args:
        fold: id of the fold held out for validation.

    Returns:
        The validation DataFrame with four added columns ('Data scientist',
        'Machine learning engineer', 'Software engineer', 'Consultant')
        holding the raw model outputs of the best epoch.

    Raises:
        RuntimeError: if no epoch was run (``CFG.epochs`` < 1).
    """
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)

    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

    os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)
    best_path = OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    model = CustomModel(CFG.model)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    # Count scheduler steps from the loader itself: it already reflects
    # drop_last, and one step is taken per accumulation group. Rounding up
    # keeps the schedule from ever being stepped past its end.
    num_train_steps = math.ceil(len(train_loader) / CFG.n_accumulate) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # Start below any reachable score so the first epoch is always saved and
    # a checkpoint left over from an earlier run is never picked up.
    best_score = -np.inf
    best_pred = None

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": train_epoch_loss,
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})

        if score > best_score:
            best_score = score
            best_pred = pred
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        best_path)

    if best_pred is None:
        raise RuntimeError("train_loop ran no epochs; CFG.epochs must be at least 1")

    # The best-epoch outputs are kept in memory, so the checkpoint does not
    # have to be read back just to recover them.
    predictions = best_pred
    class_columns = ['Data scientist', 'Machine learning engineer', 'Software engineer', 'Consultant']
    for column_index, column_name in enumerate(class_columns):
        valid_data[column_name] = predictions[:, column_index]

    temp = valid_data[class_columns].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    # Drop the references first; otherwise the memory of this fold's model is
    # still in use when the CUDA cache is emptied.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    return valid_data

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the macro-F1 of the out-of-fold outputs stored in ``oof_df``."""
        labels = oof_df['label'].values
        preds = oof_df[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead
        # of growing a DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)

        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info("========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
            oof_df.to_csv(OUTPUT_MODEL_DIR+'oof_df.csv', index=False)
        else:
            # Nothing to score or save when CFG.trn_fold selects no fold.
            LOGGER.info("No fold was trained; skipping CV score and OOF export")

-------------fold:0 training-------------
INFO:__main__:-------------fold:0 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a Ber

Epoch: [1][0/75] Elapsed 0m 10s (remain 13m 22s) 
Epoch: [1][50/75] Elapsed 2m 1s (remain 0m 56s) 
Epoch: [1][74/75] Elapsed 2m 59s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 1 - avg_train_loss: 0.9876  avg_val_loss: 0.6685  time: 193s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9876  avg_val_loss: 0.6685  time: 193s
Epoch 1 - Score: 0.5950
INFO:__main__:Epoch 1 - Score: 0.5950
Epoch 1 - Save Best Score: 0.5950 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5950 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 2s (remain 2m 47s) 
Epoch: [2][50/75] Elapsed 2m 2s (remain 0m 57s) 
Epoch: [2][74/75] Elapsed 2m 59s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 13s) 


Epoch 2 - avg_train_loss: 0.5951  avg_val_loss: 0.7215  time: 194s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5951  avg_val_loss: 0.7215  time: 194s
Epoch 2 - Score: 0.6630
INFO:__main__:Epoch 2 - Score: 0.6630
Epoch 2 - Save Best Score: 0.6630 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6630 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 1s (remain 2m 19s) 
Epoch: [3][50/75] Elapsed 2m 1s (remain 0m 57s) 
Epoch: [3][74/75] Elapsed 2m 55s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 3 - avg_train_loss: 0.4400  avg_val_loss: 0.5790  time: 190s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4400  avg_val_loss: 0.5790  time: 190s
Epoch 3 - Score: 0.7029
INFO:__main__:Epoch 3 - Score: 0.7029
Epoch 3 - Save Best Score: 0.7029 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7029 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 3s (remain 3m 48s) 
Epoch: [4][50/75] Elapsed 2m 4s (remain 0m 58s) 
Epoch: [4][74/75] Elapsed 3m 0s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 4 - avg_train_loss: 0.2298  avg_val_loss: 0.7041  time: 195s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2298  avg_val_loss: 0.7041  time: 195s
Epoch 4 - Score: 0.6177
INFO:__main__:Epoch 4 - Score: 0.6177


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 1s (remain 2m 24s) 
Epoch: [5][50/75] Elapsed 2m 3s (remain 0m 57s) 
Epoch: [5][74/75] Elapsed 2m 59s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 5 - avg_train_loss: 0.1208  avg_val_loss: 0.7873  time: 194s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1208  avg_val_loss: 0.7873  time: 194s
Epoch 5 - Score: 0.6900
INFO:__main__:Epoch 5 - Score: 0.6900


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 1s (remain 2m 20s) 
Epoch: [6][50/75] Elapsed 1m 50s (remain 0m 51s) 
Epoch: [6][74/75] Elapsed 2m 55s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 6 - avg_train_loss: 0.0487  avg_val_loss: 0.8020  time: 190s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0487  avg_val_loss: 0.8020  time: 190s
Epoch 6 - Score: 0.7016
INFO:__main__:Epoch 6 - Score: 0.7016


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 1s (remain 2m 20s) 
Epoch: [7][50/75] Elapsed 2m 0s (remain 0m 56s) 
Epoch: [7][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 7 - avg_train_loss: 0.0211  avg_val_loss: 0.9874  time: 191s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0211  avg_val_loss: 0.9874  time: 191s
Epoch 7 - Score: 0.6724
INFO:__main__:Epoch 7 - Score: 0.6724


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 1s (remain 1m 50s) 
Epoch: [8][50/75] Elapsed 2m 0s (remain 0m 56s) 
Epoch: [8][74/75] Elapsed 2m 55s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 8 - avg_train_loss: 0.0156  avg_val_loss: 0.9740  time: 190s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0156  avg_val_loss: 0.9740  time: 190s
Epoch 8 - Score: 0.7005
INFO:__main__:Epoch 8 - Score: 0.7005


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 2s (remain 3m 28s) 
Epoch: [9][50/75] Elapsed 2m 0s (remain 0m 56s) 
Epoch: [9][74/75] Elapsed 2m 54s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 14s) 


Epoch 9 - avg_train_loss: 0.0075  avg_val_loss: 0.9782  time: 189s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0075  avg_val_loss: 0.9782  time: 189s
Epoch 9 - Score: 0.6929
INFO:__main__:Epoch 9 - Score: 0.6929


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 2s (remain 3m 21s) 
Epoch: [10][50/75] Elapsed 1m 54s (remain 0m 53s) 
Epoch: [10][74/75] Elapsed 2m 51s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 13s) 


Epoch 10 - avg_train_loss: 0.0061  avg_val_loss: 0.9817  time: 186s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0061  avg_val_loss: 0.9817  time: 186s
Epoch 10 - Score: 0.6952
INFO:__main__:Epoch 10 - Score: 0.6952


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
0.7029117774880488


Score: 0.7029
INFO:__main__:Score: 0.7029
-------------fold:1 training-------------
INFO:__main__:-------------fold:1 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect t

Epoch: [1][0/75] Elapsed 0m 2s (remain 3m 34s) 
Epoch: [1][50/75] Elapsed 2m 3s (remain 0m 58s) 
Epoch: [1][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 1 - avg_train_loss: 0.8950  avg_val_loss: 0.6688  time: 190s
INFO:__main__:Epoch 1 - avg_train_loss: 0.8950  avg_val_loss: 0.6688  time: 190s
Epoch 1 - Score: 0.6824
INFO:__main__:Epoch 1 - Score: 0.6824
Epoch 1 - Save Best Score: 0.6824 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6824 Model


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 1s (remain 2m 0s) 
Epoch: [2][50/75] Elapsed 2m 12s (remain 1m 2s) 
Epoch: [2][74/75] Elapsed 3m 2s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 2 - avg_train_loss: 0.5791  avg_val_loss: 0.5609  time: 195s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5791  avg_val_loss: 0.5609  time: 195s
Epoch 2 - Score: 0.6968
INFO:__main__:Epoch 2 - Score: 0.6968
Epoch 2 - Save Best Score: 0.6968 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6968 Model


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 1s (remain 1m 58s) 
Epoch: [3][50/75] Elapsed 2m 1s (remain 0m 57s) 
Epoch: [3][74/75] Elapsed 2m 58s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 3 - avg_train_loss: 0.3366  avg_val_loss: 0.6548  time: 191s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3366  avg_val_loss: 0.6548  time: 191s
Epoch 3 - Score: 0.6617
INFO:__main__:Epoch 3 - Score: 0.6617


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 1s (remain 1m 57s) 
Epoch: [4][50/75] Elapsed 2m 2s (remain 0m 57s) 
Epoch: [4][74/75] Elapsed 3m 2s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 4 - avg_train_loss: 0.1534  avg_val_loss: 0.7815  time: 196s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1534  avg_val_loss: 0.7815  time: 196s
Epoch 4 - Score: 0.7729
INFO:__main__:Epoch 4 - Score: 0.7729
Epoch 4 - Save Best Score: 0.7729 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7729 Model


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 2s (remain 2m 44s) 
Epoch: [5][50/75] Elapsed 2m 7s (remain 1m 0s) 
Epoch: [5][74/75] Elapsed 3m 3s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 5 - avg_train_loss: 0.0782  avg_val_loss: 0.7867  time: 196s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0782  avg_val_loss: 0.7867  time: 196s
Epoch 5 - Score: 0.7152
INFO:__main__:Epoch 5 - Score: 0.7152


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 1s (remain 2m 20s) 
Epoch: [6][50/75] Elapsed 1m 57s (remain 0m 55s) 
Epoch: [6][74/75] Elapsed 2m 59s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 6 - avg_train_loss: 0.0191  avg_val_loss: 0.9027  time: 192s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0191  avg_val_loss: 0.9027  time: 192s
Epoch 6 - Score: 0.7572
INFO:__main__:Epoch 6 - Score: 0.7572


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 5s (remain 6m 16s) 
Epoch: [7][50/75] Elapsed 2m 0s (remain 0m 56s) 
Epoch: [7][74/75] Elapsed 3m 2s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 7 - avg_train_loss: 0.0071  avg_val_loss: 1.0078  time: 195s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0071  avg_val_loss: 1.0078  time: 195s
Epoch 7 - Score: 0.7282
INFO:__main__:Epoch 7 - Score: 0.7282


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 3s (remain 3m 44s) 
Epoch: [8][50/75] Elapsed 1m 54s (remain 0m 53s) 
Epoch: [8][74/75] Elapsed 2m 55s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 8 - avg_train_loss: 0.0020  avg_val_loss: 1.0072  time: 188s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0020  avg_val_loss: 1.0072  time: 188s
Epoch 8 - Score: 0.7417
INFO:__main__:Epoch 8 - Score: 0.7417


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 7s (remain 9m 0s) 
Epoch: [9][50/75] Elapsed 2m 8s (remain 1m 0s) 
Epoch: [9][74/75] Elapsed 3m 1s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 9 - avg_train_loss: 0.0019  avg_val_loss: 1.0107  time: 194s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0019  avg_val_loss: 1.0107  time: 194s
Epoch 9 - Score: 0.7391
INFO:__main__:Epoch 9 - Score: 0.7391


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 1s (remain 2m 17s) 
Epoch: [10][50/75] Elapsed 1m 46s (remain 0m 50s) 
Epoch: [10][74/75] Elapsed 2m 56s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


Epoch 10 - avg_train_loss: 0.0014  avg_val_loss: 1.0118  time: 189s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0014  avg_val_loss: 1.0118  time: 189s
Epoch 10 - Score: 0.7417
INFO:__main__:Epoch 10 - Score: 0.7417


EVAL: [9/10] Elapsed 0m 12s (remain 0m 0s) 


Score: 0.7729
INFO:__main__:Score: 0.7729
-------------fold:2 training-------------
INFO:__main__:-------------fold:2 training-------------


0.7729376990983609


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 2s (remain 3m 38s) 
Epoch: [1][50/75] Elapsed 2m 10s (remain 1m 1s) 
Epoch: [1][74/75] Elapsed 3m 5s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 1 - avg_train_loss: 0.9596  avg_val_loss: 0.6579  time: 196s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9596  avg_val_loss: 0.6579  time: 196s
Epoch 1 - Score: 0.6789
INFO:__main__:Epoch 1 - Score: 0.6789
Epoch 1 - Save Best Score: 0.6789 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6789 Model


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 1s (remain 2m 21s) 
Epoch: [2][50/75] Elapsed 2m 6s (remain 0m 59s) 
Epoch: [2][74/75] Elapsed 3m 4s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 2 - avg_train_loss: 0.5582  avg_val_loss: 0.6108  time: 196s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5582  avg_val_loss: 0.6108  time: 196s
Epoch 2 - Score: 0.7281
INFO:__main__:Epoch 2 - Score: 0.7281
Epoch 2 - Save Best Score: 0.7281 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7281 Model


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 3s (remain 4m 11s) 
Epoch: [3][50/75] Elapsed 2m 2s (remain 0m 57s) 
Epoch: [3][74/75] Elapsed 3m 2s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 3 - avg_train_loss: 0.3552  avg_val_loss: 0.6904  time: 194s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3552  avg_val_loss: 0.6904  time: 194s
Epoch 3 - Score: 0.7126
INFO:__main__:Epoch 3 - Score: 0.7126


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 2s (remain 3m 12s) 
Epoch: [4][50/75] Elapsed 2m 7s (remain 0m 59s) 
Epoch: [4][74/75] Elapsed 3m 2s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 4 - avg_train_loss: 0.1941  avg_val_loss: 0.7232  time: 194s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1941  avg_val_loss: 0.7232  time: 194s
Epoch 4 - Score: 0.7819
INFO:__main__:Epoch 4 - Score: 0.7819
Epoch 4 - Save Best Score: 0.7819 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7819 Model


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 3s (remain 3m 47s) 
Epoch: [5][50/75] Elapsed 2m 8s (remain 1m 0s) 
Epoch: [5][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 5 - avg_train_loss: 0.1271  avg_val_loss: 0.8520  time: 189s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1271  avg_val_loss: 0.8520  time: 189s
Epoch 5 - Score: 0.7185
INFO:__main__:Epoch 5 - Score: 0.7185


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 4s (remain 5m 16s) 
Epoch: [6][50/75] Elapsed 2m 12s (remain 1m 2s) 
Epoch: [6][74/75] Elapsed 3m 4s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 6 - avg_train_loss: 0.0833  avg_val_loss: 0.7833  time: 196s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0833  avg_val_loss: 0.7833  time: 196s
Epoch 6 - Score: 0.7389
INFO:__main__:Epoch 6 - Score: 0.7389


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 2s (remain 3m 2s) 
Epoch: [7][50/75] Elapsed 2m 4s (remain 0m 58s) 
Epoch: [7][74/75] Elapsed 3m 4s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 7 - avg_train_loss: 0.0212  avg_val_loss: 1.0020  time: 196s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0212  avg_val_loss: 1.0020  time: 196s
Epoch 7 - Score: 0.7519
INFO:__main__:Epoch 7 - Score: 0.7519


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 5s (remain 6m 16s) 
Epoch: [8][50/75] Elapsed 2m 18s (remain 1m 5s) 
Epoch: [8][74/75] Elapsed 3m 5s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 8 - avg_train_loss: 0.0100  avg_val_loss: 0.9599  time: 197s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0100  avg_val_loss: 0.9599  time: 197s
Epoch 8 - Score: 0.7732
INFO:__main__:Epoch 8 - Score: 0.7732


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 1s (remain 1m 57s) 
Epoch: [9][50/75] Elapsed 2m 3s (remain 0m 57s) 
Epoch: [9][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 9 - avg_train_loss: 0.0065  avg_val_loss: 0.9685  time: 189s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0065  avg_val_loss: 0.9685  time: 189s
Epoch 9 - Score: 0.7732
INFO:__main__:Epoch 9 - Score: 0.7732


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 1s (remain 1m 50s) 
Epoch: [10][50/75] Elapsed 2m 1s (remain 0m 57s) 
Epoch: [10][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 10 - avg_train_loss: 0.0046  avg_val_loss: 0.9710  time: 189s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0046  avg_val_loss: 0.9710  time: 189s
Epoch 10 - Score: 0.7732
INFO:__main__:Epoch 10 - Score: 0.7732


EVAL: [9/10] Elapsed 0m 11s (remain 0m 0s) 
0.7818889992196895


Score: 0.7819
INFO:__main__:Score: 0.7819
-------------fold:3 training-------------
INFO:__main__:-------------fold:3 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect t

Epoch: [1][0/75] Elapsed 0m 1s (remain 2m 7s) 
Epoch: [1][50/75] Elapsed 1m 56s (remain 0m 54s) 
Epoch: [1][74/75] Elapsed 2m 55s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 1 - avg_train_loss: 0.9437  avg_val_loss: 0.6642  time: 191s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9437  avg_val_loss: 0.6642  time: 191s
Epoch 1 - Score: 0.5994
INFO:__main__:Epoch 1 - Score: 0.5994
Epoch 1 - Save Best Score: 0.5994 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5994 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 1s (remain 2m 13s) 
Epoch: [2][50/75] Elapsed 1m 52s (remain 0m 52s) 
Epoch: [2][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 2 - avg_train_loss: 0.5307  avg_val_loss: 0.6817  time: 192s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5307  avg_val_loss: 0.6817  time: 192s
Epoch 2 - Score: 0.6297
INFO:__main__:Epoch 2 - Score: 0.6297
Epoch 2 - Save Best Score: 0.6297 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6297 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 2s (remain 3m 39s) 
Epoch: [3][50/75] Elapsed 2m 0s (remain 0m 56s) 
Epoch: [3][74/75] Elapsed 2m 54s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 3 - avg_train_loss: 0.3219  avg_val_loss: 0.8392  time: 189s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3219  avg_val_loss: 0.8392  time: 189s
Epoch 3 - Score: 0.6794
INFO:__main__:Epoch 3 - Score: 0.6794
Epoch 3 - Save Best Score: 0.6794 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6794 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 1s (remain 2m 24s) 
Epoch: [4][50/75] Elapsed 1m 57s (remain 0m 55s) 
Epoch: [4][74/75] Elapsed 3m 1s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 4 - avg_train_loss: 0.1577  avg_val_loss: 0.8560  time: 196s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1577  avg_val_loss: 0.8560  time: 196s
Epoch 4 - Score: 0.7113
INFO:__main__:Epoch 4 - Score: 0.7113
Epoch 4 - Save Best Score: 0.7113 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7113 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 2s (remain 3m 24s) 
Epoch: [5][50/75] Elapsed 2m 1s (remain 0m 56s) 
Epoch: [5][74/75] Elapsed 2m 56s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 5 - avg_train_loss: 0.0814  avg_val_loss: 1.0155  time: 191s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0814  avg_val_loss: 1.0155  time: 191s
Epoch 5 - Score: 0.7236
INFO:__main__:Epoch 5 - Score: 0.7236
Epoch 5 - Save Best Score: 0.7236 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7236 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 3s (remain 4m 42s) 
Epoch: [6][50/75] Elapsed 1m 57s (remain 0m 55s) 
Epoch: [6][74/75] Elapsed 2m 53s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 6 - avg_train_loss: 0.0278  avg_val_loss: 1.0057  time: 188s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0278  avg_val_loss: 1.0057  time: 188s
Epoch 6 - Score: 0.7516
INFO:__main__:Epoch 6 - Score: 0.7516
Epoch 6 - Save Best Score: 0.7516 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7516 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 2s (remain 3m 17s) 
Epoch: [7][50/75] Elapsed 1m 54s (remain 0m 53s) 
Epoch: [7][74/75] Elapsed 2m 58s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 7 - avg_train_loss: 0.0114  avg_val_loss: 1.0075  time: 193s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0114  avg_val_loss: 1.0075  time: 193s
Epoch 7 - Score: 0.7525
INFO:__main__:Epoch 7 - Score: 0.7525
Epoch 7 - Save Best Score: 0.7525 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.7525 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 4s (remain 5m 1s) 
Epoch: [8][50/75] Elapsed 2m 2s (remain 0m 57s) 
Epoch: [8][74/75] Elapsed 2m 58s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 8 - avg_train_loss: 0.0030  avg_val_loss: 1.0546  time: 194s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0030  avg_val_loss: 1.0546  time: 194s
Epoch 8 - Score: 0.7441
INFO:__main__:Epoch 8 - Score: 0.7441


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 1s (remain 2m 10s) 
Epoch: [9][50/75] Elapsed 2m 3s (remain 0m 57s) 
Epoch: [9][74/75] Elapsed 2m 56s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 9 - avg_train_loss: 0.0033  avg_val_loss: 1.0648  time: 191s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0033  avg_val_loss: 1.0648  time: 191s
Epoch 9 - Score: 0.7491
INFO:__main__:Epoch 9 - Score: 0.7491


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 3s (remain 3m 48s) 
Epoch: [10][50/75] Elapsed 2m 3s (remain 0m 58s) 
Epoch: [10][74/75] Elapsed 2m 55s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 10 - avg_train_loss: 0.0046  avg_val_loss: 1.0684  time: 190s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0046  avg_val_loss: 1.0684  time: 190s
Epoch 10 - Score: 0.7467
INFO:__main__:Epoch 10 - Score: 0.7467


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
0.7525221784428456


Score: 0.7525
INFO:__main__:Score: 0.7525
-------------fold:4 training-------------
INFO:__main__:-------------fold:4 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect t

Epoch: [1][0/75] Elapsed 0m 7s (remain 9m 10s) 
Epoch: [1][50/75] Elapsed 2m 5s (remain 0m 59s) 
Epoch: [1][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 1 - avg_train_loss: 0.9512  avg_val_loss: 0.6553  time: 192s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9512  avg_val_loss: 0.6553  time: 192s
Epoch 1 - Score: 0.5811
INFO:__main__:Epoch 1 - Score: 0.5811
Epoch 1 - Save Best Score: 0.5811 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5811 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 4s (remain 5m 17s) 
Epoch: [2][50/75] Elapsed 1m 55s (remain 0m 54s) 
Epoch: [2][74/75] Elapsed 2m 51s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 2 - avg_train_loss: 0.5571  avg_val_loss: 0.6605  time: 186s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5571  avg_val_loss: 0.6605  time: 186s
Epoch 2 - Score: 0.5886
INFO:__main__:Epoch 2 - Score: 0.5886
Epoch 2 - Save Best Score: 0.5886 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5886 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 2s (remain 2m 55s) 
Epoch: [3][50/75] Elapsed 1m 56s (remain 0m 55s) 
Epoch: [3][74/75] Elapsed 2m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 3 - avg_train_loss: 0.4157  avg_val_loss: 0.6315  time: 192s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4157  avg_val_loss: 0.6315  time: 192s
Epoch 3 - Score: 0.7088
INFO:__main__:Epoch 3 - Score: 0.7088
Epoch 3 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7088 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 1s (remain 1m 50s) 
Epoch: [4][50/75] Elapsed 1m 57s (remain 0m 55s) 
Epoch: [4][74/75] Elapsed 2m 55s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 4 - avg_train_loss: 0.1908  avg_val_loss: 0.7960  time: 190s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1908  avg_val_loss: 0.7960  time: 190s
Epoch 4 - Score: 0.6845
INFO:__main__:Epoch 4 - Score: 0.6845


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 1s (remain 2m 25s) 
Epoch: [5][50/75] Elapsed 1m 59s (remain 0m 56s) 
Epoch: [5][74/75] Elapsed 2m 53s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 5 - avg_train_loss: 0.1002  avg_val_loss: 0.9267  time: 188s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1002  avg_val_loss: 0.9267  time: 188s
Epoch 5 - Score: 0.6977
INFO:__main__:Epoch 5 - Score: 0.6977


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 1s (remain 2m 25s) 
Epoch: [6][50/75] Elapsed 1m 48s (remain 0m 50s) 
Epoch: [6][74/75] Elapsed 2m 52s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 6 - avg_train_loss: 0.0497  avg_val_loss: 1.0046  time: 186s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0497  avg_val_loss: 1.0046  time: 186s
Epoch 6 - Score: 0.6758
INFO:__main__:Epoch 6 - Score: 0.6758


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 1s (remain 2m 10s) 
Epoch: [7][50/75] Elapsed 1m 57s (remain 0m 55s) 
Epoch: [7][74/75] Elapsed 2m 54s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 7 - avg_train_loss: 0.0215  avg_val_loss: 1.0484  time: 189s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0215  avg_val_loss: 1.0484  time: 189s
Epoch 7 - Score: 0.7160
INFO:__main__:Epoch 7 - Score: 0.7160
Epoch 7 - Save Best Score: 0.7160 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.7160 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 4s (remain 5m 5s) 
Epoch: [8][50/75] Elapsed 1m 56s (remain 0m 54s) 
Epoch: [8][74/75] Elapsed 2m 54s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 8 - avg_train_loss: 0.0156  avg_val_loss: 1.1156  time: 189s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0156  avg_val_loss: 1.1156  time: 189s
Epoch 8 - Score: 0.7014
INFO:__main__:Epoch 8 - Score: 0.7014


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 1s (remain 1m 46s) 
Epoch: [9][50/75] Elapsed 2m 2s (remain 0m 57s) 
Epoch: [9][74/75] Elapsed 3m 0s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 9 - avg_train_loss: 0.0081  avg_val_loss: 1.0904  time: 194s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0081  avg_val_loss: 1.0904  time: 194s
Epoch 9 - Score: 0.7163
INFO:__main__:Epoch 9 - Score: 0.7163
Epoch 9 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.7163 Model


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 4s (remain 5m 50s) 
Epoch: [10][50/75] Elapsed 1m 57s (remain 0m 55s) 
Epoch: [10][74/75] Elapsed 2m 56s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 10s) 


Epoch 10 - avg_train_loss: 0.0081  avg_val_loss: 1.0897  time: 190s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0081  avg_val_loss: 1.0897  time: 190s
Epoch 10 - Score: 0.7161
INFO:__main__:Epoch 10 - Score: 0.7161


EVAL: [9/10] Elapsed 0m 14s (remain 0m 0s) 
0.716301113779758


Score: 0.7163
INFO:__main__:Score: 0.7163
Score: 0.7460
INFO:__main__:Score: 0.7460


In [None]:
# Reload the out-of-fold (OOF) predictions file from the model output
# directory and preview its first rows as a sanity check.
A = pd.read_csv(os.path.join(OUTPUT_MODEL_DIR, 'oof_df.csv'))
A.head()

Unnamed: 0,id,description,label,inputs,kfold,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a...",0,-0.700954,-1.415321,2.447985,-0.536542
1,4,Quantify the resources required for a task/pro...,3,Quantify the resources required for a task/pro...,0,1.259189,-2.867341,-0.881958,2.686974
2,7,"Facilitate pre-sales initiatives, such as live...",3,"Facilitate pre-sales initiatives, such as live...",0,0.746555,-3.894152,-1.631799,5.290353
3,9,Maintain and improve existing predictive model...,0,Maintain and improve existing predictive model...,0,4.39866,-0.758748,-2.899684,-0.179692
4,12,Project manage complex implementation engageme...,3,Project manage complex implementation engageme...,0,0.410145,-3.569713,-1.376815,5.136912
