# Pseudo labeling

In [1]:
# Mount Google Drive at /content/drive (Colab only); the input/output paths
# configured later in the notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 56.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 76.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 4.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Locations on the mounted Google Drive.
# INPUT_DIR: competition CSVs (train/test/sample submission) are read from here.
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
# OUTPUT_DIR: root for the pseudo-label CSV, the training log and model artefacts.
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
# Sub-folder for submissions (not written to in the visible part of the notebook).
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
# Per-experiment folder for the tokenizer, config, checkpoints and OOF predictions.
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-base[ver5]/')

In [6]:
class CFG:
    # Experiment configuration, read as plain class attributes across the notebook.

    # --- run control / logging ---
    seed = 42
    train = True
    debug = False
    debug_ver2 = False
    wandb = False
    apex = True
    print_freq = 50          # progress line every N steps

    # --- backbone and head ---
    model = 'microsoft/deberta-v3-base'
    max_len = 1024           # truncation length passed to the tokenizer
    dropout = 0.1
    target_size = 4          # number of output classes
    gradient_checkpointing = True
    freezing = True          # freeze embeddings + first encoder layers in CustomModel

    # --- cross-validation ---
    n_splits = 5             # used when assigning folds
    n_fold = 5               # used when looping over folds
    trn_fold = [0, 1, 2, 3, 4]

    # --- optimisation ---
    epochs = 10
    batch_size = 8
    num_workers = 2
    n_accumulate = 1         # gradient accumulation steps
    lr = 3e-5
    min_lr = 1e-6
    weigth_decay = 0.01      # (sic) spelling kept: train_loop reads CFG.weigth_decay
    scheduler = 'cosine'
    num_warmup_steps = 0
    num_cycles = 0.5

In [7]:
# Loss Func
def criterion(outputs, labels):
    """Cross-entropy loss (default mean reduction) between logits and class indices."""
    return F.cross_entropy(outputs, labels)

def softmax(z):
    """Row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating so large logits do
    not overflow; the result is unchanged mathematically.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True)   # keepdims lets the rows broadcast
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 of the arg-max class against the true labels.

    Args:
        outputs: 2-D array-like of per-class scores (logits), one row per sample.
        labels: 1-D array-like of true class indices.

    Returns:
        The macro F1 score as computed by ``sklearn.metrics.f1_score``.
    """
    # Softmax is monotonic within a row, so the arg-max of the raw scores is the
    # arg-max of the probabilities; skipping it avoids the implicit-`dim`
    # `F.softmax` call and a round trip through torch.
    predicted = np.argmax(np.asarray(outputs), axis=1)
    # f1_score expects (y_true, y_pred); the per-class F1 is symmetric in the two,
    # so this ordering documents intent without changing the value.
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Create the notebook logger writing bare messages to the console and a file.

    Args:
        filename: Path prefix of the log file; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` named after ``__name__``, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so re-running the cell
    # would otherwise stack handlers and print each message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # Keep records away from the root logger, which otherwise re-emits every
    # message a second time in its own "INFO:__main__:..." format.
    logger.propagate = False

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects Python processes started after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs; disable it as well.
    torch.backends.cudnn.benchmark = False
    
seed_everything(seed=42)

def prepare_input(cfg, text, text_2=None):
    """Tokenize ``text`` (optionally paired with ``text_2``) to fixed-length tensors.

    The tokenizer stored on ``cfg`` is asked to pad/truncate to ``cfg.max_len``;
    every field of its result is then replaced in place by a ``torch.long`` tensor.

    Returns:
        The tokenizer's result object with tensor values.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        padding="max_length",
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )

    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded

In [8]:
def freeze(module):
    """
    Turns off gradient tracking for every parameter of the given module.
    """
    for param in module.parameters():
        param.requires_grad_(False)
        
def get_freezed_parameters(module):
    """
    Lists the names of the module's parameters that do not require gradients,
    in ``named_parameters()`` order.
    """
    return [
        name
        for name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Registers an optimizer-precision override for each embedding table found on
    ``embeddings_path`` (word / position / token_type), setting ``optim_bits``
    for its ``weight``.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    # NOTE(review): `bnb` is not imported anywhere in the visible notebook;
    # presumably `import bitsandbytes as bnb` is expected. Confirm before calling,
    # otherwise the first matching attribute raises NameError.
    for prefix in ("word", "position", "token_type"):
        attribute = f"{prefix}_embeddings"
        if not hasattr(embeddings_path, attribute):
            continue

        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, attribute), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
# Load the competition files from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'))
# Pseudo-labelled rows are read from OUTPUT_DIR; the file must already exist there
# (it is not created in the visible part of this notebook).
pseudo = pd.read_csv(os.path.join(OUTPUT_DIR,"pseudo_df.csv"))

# Quick look at the first rows and the shape of each frame.
display(train.head())
print(train.shape)
display(test.head())
print(test.shape)
display(pseudo.head())
print(pseudo.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


Unnamed: 0,id,description,jobflag
0,1516,<li>Building decision-making models and propos...,1
1,1517,<li>Educate homeowners on the benefits of sola...,4
2,1518,"<li><span>Design, develop, document, and imple...",3
3,1519,<li>Apply advanced technical expertise and ski...,4
4,1521,<li>Model and execute on concurrent Product A/...,1


(1051, 3)


In [10]:
def remove_tag(x):
    """Delete every ``<...>`` span (HTML-style tag) from the string ``x``."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Return a new list with HTML tags stripped from each text in ``texts``."""
    # Only tag removal is applied. Replacing every non-alphabetic character with
    # a space was tried here previously and is intentionally disabled.
    return [remove_tag(text) for text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the offending slice as UTF-8 bytes and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    # Round-trip through bytes; the custom error handlers registered under these
    # names take over wherever a strict utf-8 / cp1252 conversion would fail.
    escaped = text.encode("raw_unicode_escape")
    first_pass = escaped.decode("utf-8", errors="replace_decoding_with_cp1252")
    as_cp1252 = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = as_cp1252.decode("utf-8", errors="replace_decoding_with_cp1252")
    # Finally pass the repaired text through unidecode.
    return unidecode(repaired)

# Append the pseudo-labelled rows to the labelled training data.
# NOTE(review): the pseudo-labelled rows stay in `train`, so they are later
# assigned to validation folds as well and count towards the CV score —
# confirm this is intended.
train = pd.concat([train,pseudo],axis=0)
# Strip HTML tags, then repair/normalize encodings into the `inputs` column.
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
train['inputs'] = train['description'].apply(lambda x : resolve_encodings_and_normalize(x))
test['inputs'] = test['description'].apply(lambda x : resolve_encodings_and_normalize(x))
# Keep the first occurrence of each cleaned text; labelled rows precede pseudo
# rows in the concat above, so they win over duplicates among the pseudo rows.
train.drop_duplicates(subset=['inputs'],keep='first',inplace=True)
# Rebuild a 0..n-1 index (the concat left duplicate index values).
train = train.reset_index(drop=True)
train = train.rename(columns = {"jobflag": "label"})
# Shift labels down by one so they start at 0 (jobflag values in the data start at 1).
train["label"] = train["label"] -1
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,2,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,3,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,3,Quantify the resources required for a task/pro...
...,...,...,...,...
2512,3026,"Write, validate and maintain programs in Pytho...",0,"Write, validate and maintain programs in Pytho..."
2513,3027,Completes programming and performs testing and...,2,Completes programming and performs testing and...
2514,3028,Manages the development of interface requirem...,2,Manages the development of interface requirem...
2515,3029,Lead the implementation of new statistical mod...,0,Lead the implementation of new statistical mod...


In [11]:
# Assign each row a validation fold id, stratified on the label.
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.label)):
    # `val_` holds positional indices; they match the index labels used by .loc
    # only because `train` was reset to a 0..n-1 index beforehand.
    train.loc[val_ , "kfold"] = int(fold)
    
# The column is created through .loc on a missing column, so cast it to int explicitly.
train["kfold"] = train["kfold"].astype(int)

In [12]:
# Load the tokenizer for the configured backbone and save a copy next to the
# model artefacts.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
# Stored on CFG so the dataset, collator and helpers can reach it.
CFG.tokenizer = tokenizer
SEP = tokenizer.sep_token

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
class Dataset(torch.utils.data.Dataset):
    """Tokenizes the ``inputs`` column of a frame one row at a time.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``: subclassing the bare name would make a
    re-run of the cell inherit from the previous definition of itself.

    Args:
        df: DataFrame with an ``inputs`` text column and, optionally, a ``label``
            column of class indices.
        tokenizer: Object exposing ``encode_plus(text, truncation=...,
            add_special_tokens=..., max_length=...)``.
        max_length: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Use the arguments that were passed in instead of silently reading the
        # CFG / module-level globals; existing callers pass those same values.
        self.max_len = max_length
        self.text = df['inputs'].values
        self.tokenizer = tokenizer
        # Frames without labels (e.g. a test set) are supported: no target is emitted.
        self.targets = df['label'].values if 'label' in df.columns else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return unpadded token ids and mask (plus target when labels exist)."""
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        if self.targets is not None:
            samples['target'] = self.targets[index]

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [14]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    """Batch collator that pads every sample to the longest sequence in the batch."""

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        """Pad ``input_ids`` / ``attention_mask`` and stack them into long tensors.

        ``target`` is included only when the collator was built with ``isTrain=True``.
        """
        token_ids = [item["input_ids"] for item in batch]
        masks = [item["attention_mask"] for item in batch]
        targets = [item["target"] for item in batch] if self.isTrain else None

        # Longest sequence in this batch decides the padded width.
        width = max([len(seq) for seq in token_ids])
        pad_on_right = self.tokenizer.padding_side == "right"

        def pad(seq, fill):
            # Put the filler on the side the tokenizer pads on.
            filler = (width - len(seq)) * [fill]
            return seq + filler if pad_on_right else filler + seq

        padded_ids = [pad(seq, self.tokenizer.pad_token_id) for seq in token_ids]
        padded_masks = [pad(seq, 0) for seq in masks]

        output = dict()
        output["input_ids"] = torch.tensor(padded_ids, dtype=torch.long)
        output["attention_mask"] = torch.tensor(padded_masks, dtype=torch.long)
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output
    
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [15]:
class MeanPooling(nn.Module):
    """Average of token embeddings over the positions the attention mask keeps."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask across the hidden dimension so masked tokens contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return token_sum / token_count

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Pretrained backbone + masked mean pooling + dropout + linear classifier."""

    def __init__(self, model_name):
        super().__init__()
        # Backbone loaded from the pretrained checkpoint.
        self.model = AutoModel.from_pretrained(model_name)

        # Optionally enable gradient checkpointing on the backbone.
        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Optionally freeze the embeddings and the first two encoder layers.
        if CFG.freezing:
            freeze(self.model.embeddings)
            freeze(self.model.encoder.layer[:2])
            # Lazy, single-use iterator over the backbone parameters that remain trainable.
            CFG.after_freezed_parameters = filter(lambda p: p.requires_grad, self.model.parameters())

        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG.dropout)
        self.pooler = MeanPooling()
        # One output per class (CFG.target_size).
        self.fc = nn.Linear(self.config.hidden_size, CFG.target_size)

    def forward(self, ids, mask):
        """Return the classifier outputs for token ids ``ids`` and attention mask ``mask``."""
        backbone_out = self.model(input_ids=ids,
                                  attention_mask=mask,
                                  output_hidden_states=False)
        pooled = self.pooler(backbone_out.last_hidden_state, mask)
        pooled = self.drop(pooled)
        return self.fc(pooled)

In [17]:
def asMinutes(s):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"`` (values truncated)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return "%dm %ds" % (minutes, seconds)

def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: Start timestamp from ``time.time()``.
        percent: Fraction of the work finished so far; must be non-zero.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration linearly from the fraction completed.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: Config exposing ``scheduler`` ('linear' or 'cosine'),
            ``num_warmup_steps`` and, for the cosine schedule, ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps planned for training.

    Returns:
        The scheduler created by the matching transformers helper.

    Raises:
        ValueError: If ``cfg.scheduler`` is not a supported name (previously this
            surfaced as an UnboundLocalError at the return statement).
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'."
    )

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Run one training pass over ``dataloader``.

    Args:
        model: Module called as ``model(ids, mask)``.
        optimizer: Optimizer stepped every ``CFG.n_accumulate`` batches.
        scheduler: Optional LR scheduler stepped together with the optimizer.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask``, ``target``.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index, used only in the progress line.

    Returns:
        The sample-weighted mean training loss (0.0 for an empty loader).
    """
    model.train()

    dataset_size = 0
    running_loss = 0
    epoch_loss = 0.0  # defined up front so an empty loader cannot leave it unbound
    num_steps = len(dataloader)

    start = time.time()

    # Start from clean gradients in case a previous epoch left a partial window.
    optimizer.zero_grad()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        # Scale only the gradient contribution; `loss` itself stays unscaled so
        # the reported value does not shrink when n_accumulate > 1.
        (loss / CFG.n_accumulate).backward()

        # Also step on the final batch so a trailing partial accumulation window
        # is applied rather than dropped or carried into the next epoch.
        is_last_step = (step + 1) == num_steps
        if (step + 1) % CFG.n_accumulate == 0 or is_last_step:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        if step % CFG.print_freq == 0 or step == (num_steps-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, num_steps,
                          remain=timeSince(start, float(step+1)/num_steps)))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        A tuple ``(mean_loss, predictions)``: the sample-weighted mean loss and the
        model outputs of all batches concatenated into one NumPy array.
        ``epoch`` is accepted but not used.
    """
    model.eval()

    seen_samples = 0
    loss_sum = 0
    began = time.time()
    batch_outputs = []

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        n_in_batch = ids.size(0)
        outputs = model(ids, mask)
        batch_loss = criterion(outputs, targets)
        # Keep the raw outputs on the CPU; they are concatenated after the loop.
        batch_outputs.append(outputs.to('cpu').numpy())

        loss_sum += (batch_loss.item()* n_in_batch)
        seen_samples += n_in_batch

        epoch_loss = loss_sum / seen_samples

        is_last = step == (len(dataloader)-1)
        if step % CFG.print_freq == 0 or is_last:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, len(dataloader),
                          remain=timeSince(began, float(step+1)/len(dataloader))))

    pred = np.concatenate(batch_outputs)

    return epoch_loss, pred

In [18]:
def train_loop(fold):
    """Train and validate one cross-validation fold.

    Rows of the global ``train`` frame with ``kfold == fold`` form the validation
    set and the rest the training set. The model state and validation outputs of
    the best-scoring epoch are written to
    ``OUTPUT_MODEL_DIR/<model>_fold<fold>_best.pth``.

    Args:
        fold: Fold id to hold out for validation.

    Returns:
        The validation frame with one column of raw model outputs per class,
        taken from the best-scoring epoch.

    Raises:
        RuntimeError: If no epoch was run, so no predictions exist.
    """
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)

    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

    model = CustomModel(CFG.model)
    # Make sure the artefact directory exists before anything is written to it.
    os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    # Count the optimizer steps that can actually happen: the loader drops the
    # last incomplete batch and the optimizer only steps once per accumulation
    # window, so len(train_data) / batch_size overestimates the schedule length.
    steps_per_epoch = math.ceil(len(train_loader) / CFG.n_accumulate)
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    checkpoint_path = OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    # Start below any reachable score so the first epoch is always saved; with 0,
    # a run whose score never exceeds 0 would save nothing and then pick up a
    # stale checkpoint from an earlier run (or fail on a missing file).
    best_score = -np.inf
    best_pred = None

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": train_epoch_loss,
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})

        if score > best_score:
            best_score = score
            best_pred = pred
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        checkpoint_path)

    if best_pred is None:
        raise RuntimeError(f"fold {fold}: no epoch was run, so there are no predictions to report")

    # The best predictions are kept in memory, so the checkpoint (which also holds
    # the full model weights) does not need to be read back from disk.
    predictions = best_pred
    valid_data['Consultant'] = predictions[:, 0]
    valid_data['Data scientist'] = predictions[:, 1]
    valid_data['Machine learning engineer'] = predictions[:, 2]
    valid_data['Software engineer'] = predictions[:, 3]


    temp = valid_data[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    # Drop the references first; otherwise the cached GPU memory is still in use
    # and empty_cache() cannot release it before the next fold starts.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    return valid_data

In [19]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the macro-F1 of the out-of-fold predictions held in ``oof_df``."""
        labels = oof_df['label'].values
        preds = oof_df[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead of
        # growing a DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)

        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
            oof_df.to_csv(OUTPUT_MODEL_DIR+f'oof_df.csv', index=False)
        else:
            # No fold selected: there is nothing to score or save (scoring an
            # empty frame would fail on the missing 'label' column).
            oof_df = pd.DataFrame()
            LOGGER.warning("No fold in CFG.trn_fold was trained; skipping CV scoring and OOF export.")

-------------fold:0 training-------------
INFO:__main__:-------------fold:0 training-------------


Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/251] Elapsed 0m 3s (remain 14m 52s) 
Epoch: [1][50/251] Elapsed 0m 21s (remain 1m 24s) 
Epoch: [1][100/251] Elapsed 0m 41s (remain 1m 1s) 
Epoch: [1][150/251] Elapsed 0m 58s (remain 0m 38s) 
Epoch: [1][200/251] Elapsed 1m 14s (remain 0m 18s) 
Epoch: [1][250/251] Elapsed 1m 31s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 10s) 


Epoch 1 - avg_train_loss: 0.7552  avg_val_loss: 0.3950  time: 99s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7552  avg_val_loss: 0.3950  time: 99s
Epoch 1 - Score: 0.7800
INFO:__main__:Epoch 1 - Score: 0.7800
Epoch 1 - Save Best Score: 0.7800 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7800 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [2][0/251] Elapsed 0m 0s (remain 2m 54s) 
Epoch: [2][50/251] Elapsed 0m 16s (remain 1m 5s) 
Epoch: [2][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [2][150/251] Elapsed 0m 52s (remain 0m 35s) 
Epoch: [2][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [2][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 2 - avg_train_loss: 0.4288  avg_val_loss: 0.3929  time: 95s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4288  avg_val_loss: 0.3929  time: 95s
Epoch 2 - Score: 0.7571
INFO:__main__:Epoch 2 - Score: 0.7571


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [3][0/251] Elapsed 0m 0s (remain 2m 15s) 
Epoch: [3][50/251] Elapsed 0m 18s (remain 1m 12s) 
Epoch: [3][100/251] Elapsed 0m 35s (remain 0m 53s) 
Epoch: [3][150/251] Elapsed 0m 52s (remain 0m 35s) 
Epoch: [3][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [3][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 3 - avg_train_loss: 0.3014  avg_val_loss: 0.3559  time: 95s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3014  avg_val_loss: 0.3559  time: 95s
Epoch 3 - Score: 0.7985
INFO:__main__:Epoch 3 - Score: 0.7985
Epoch 3 - Save Best Score: 0.7985 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7985 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [4][0/251] Elapsed 0m 0s (remain 3m 39s) 
Epoch: [4][50/251] Elapsed 0m 17s (remain 1m 9s) 
Epoch: [4][100/251] Elapsed 0m 32s (remain 0m 48s) 
Epoch: [4][150/251] Elapsed 0m 50s (remain 0m 33s) 
Epoch: [4][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [4][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 4 - avg_train_loss: 0.1895  avg_val_loss: 0.4517  time: 94s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1895  avg_val_loss: 0.4517  time: 94s
Epoch 4 - Score: 0.7713
INFO:__main__:Epoch 4 - Score: 0.7713


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [5][0/251] Elapsed 0m 0s (remain 1m 15s) 
Epoch: [5][50/251] Elapsed 0m 18s (remain 1m 10s) 
Epoch: [5][100/251] Elapsed 0m 34s (remain 0m 51s) 
Epoch: [5][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [5][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [5][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 5 - avg_train_loss: 0.1185  avg_val_loss: 0.5010  time: 96s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1185  avg_val_loss: 0.5010  time: 96s
Epoch 5 - Score: 0.7882
INFO:__main__:Epoch 5 - Score: 0.7882


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [6][0/251] Elapsed 0m 0s (remain 1m 23s) 
Epoch: [6][50/251] Elapsed 0m 17s (remain 1m 7s) 
Epoch: [6][100/251] Elapsed 0m 34s (remain 0m 50s) 
Epoch: [6][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [6][200/251] Elapsed 1m 11s (remain 0m 17s) 
Epoch: [6][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 6 - avg_train_loss: 0.0449  avg_val_loss: 0.5352  time: 96s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0449  avg_val_loss: 0.5352  time: 96s
Epoch 6 - Score: 0.7985
INFO:__main__:Epoch 6 - Score: 0.7985


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [7][0/251] Elapsed 0m 0s (remain 2m 4s) 
Epoch: [7][50/251] Elapsed 0m 18s (remain 1m 12s) 
Epoch: [7][100/251] Elapsed 0m 36s (remain 0m 53s) 
Epoch: [7][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [7][200/251] Elapsed 1m 11s (remain 0m 17s) 
Epoch: [7][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 7 - avg_train_loss: 0.0234  avg_val_loss: 0.5349  time: 96s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0234  avg_val_loss: 0.5349  time: 96s
Epoch 7 - Score: 0.7855
INFO:__main__:Epoch 7 - Score: 0.7855


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [8][0/251] Elapsed 0m 0s (remain 1m 18s) 
Epoch: [8][50/251] Elapsed 0m 19s (remain 1m 15s) 
Epoch: [8][100/251] Elapsed 0m 37s (remain 0m 55s) 
Epoch: [8][150/251] Elapsed 0m 56s (remain 0m 37s) 
Epoch: [8][200/251] Elapsed 1m 12s (remain 0m 18s) 
Epoch: [8][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 8 - avg_train_loss: 0.0082  avg_val_loss: 0.5681  time: 95s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0082  avg_val_loss: 0.5681  time: 95s
Epoch 8 - Score: 0.7894
INFO:__main__:Epoch 8 - Score: 0.7894


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [9][0/251] Elapsed 0m 0s (remain 1m 49s) 
Epoch: [9][50/251] Elapsed 0m 17s (remain 1m 7s) 
Epoch: [9][100/251] Elapsed 0m 33s (remain 0m 49s) 
Epoch: [9][150/251] Elapsed 0m 51s (remain 0m 33s) 
Epoch: [9][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [9][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 9 - avg_train_loss: 0.0052  avg_val_loss: 0.5759  time: 94s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0052  avg_val_loss: 0.5759  time: 94s
Epoch 9 - Score: 0.7926
INFO:__main__:Epoch 9 - Score: 0.7926


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [10][0/251] Elapsed 0m 0s (remain 2m 32s) 
Epoch: [10][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [10][100/251] Elapsed 0m 32s (remain 0m 48s) 
Epoch: [10][150/251] Elapsed 0m 49s (remain 0m 32s) 
Epoch: [10][200/251] Elapsed 1m 7s (remain 0m 16s) 
Epoch: [10][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) 


Epoch 10 - avg_train_loss: 0.0049  avg_val_loss: 0.5769  time: 95s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0049  avg_val_loss: 0.5769  time: 95s
Epoch 10 - Score: 0.7926
INFO:__main__:Epoch 10 - Score: 0.7926


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 


Score: 0.7985
INFO:__main__:Score: 0.7985
-------------fold:1 training-------------
INFO:__main__:-------------fold:1 training-------------


0.7984983795523621


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/251] Elapsed 0m 0s (remain 1m 27s) 
Epoch: [1][50/251] Elapsed 0m 16s (remain 1m 5s) 
Epoch: [1][100/251] Elapsed 0m 34s (remain 0m 51s) 
Epoch: [1][150/251] Elapsed 0m 51s (remain 0m 33s) 
Epoch: [1][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [1][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 1 - avg_train_loss: 0.7140  avg_val_loss: 0.5170  time: 93s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7140  avg_val_loss: 0.5170  time: 93s
Epoch 1 - Score: 0.7383
INFO:__main__:Epoch 1 - Score: 0.7383
Epoch 1 - Save Best Score: 0.7383 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7383 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [2][0/251] Elapsed 0m 0s (remain 1m 45s) 
Epoch: [2][50/251] Elapsed 0m 18s (remain 1m 11s) 
Epoch: [2][100/251] Elapsed 0m 34s (remain 0m 51s) 
Epoch: [2][150/251] Elapsed 0m 50s (remain 0m 33s) 
Epoch: [2][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [2][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 2 - avg_train_loss: 0.4205  avg_val_loss: 0.6040  time: 93s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4205  avg_val_loss: 0.6040  time: 93s
Epoch 2 - Score: 0.7341
INFO:__main__:Epoch 2 - Score: 0.7341


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [3][0/251] Elapsed 0m 0s (remain 1m 14s) 
Epoch: [3][50/251] Elapsed 0m 17s (remain 1m 10s) 
Epoch: [3][100/251] Elapsed 0m 36s (remain 0m 53s) 
Epoch: [3][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [3][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [3][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 3 - avg_train_loss: 0.2892  avg_val_loss: 0.4577  time: 94s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2892  avg_val_loss: 0.4577  time: 94s
Epoch 3 - Score: 0.7699
INFO:__main__:Epoch 3 - Score: 0.7699
Epoch 3 - Save Best Score: 0.7699 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7699 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [4][0/251] Elapsed 0m 0s (remain 2m 8s) 
Epoch: [4][50/251] Elapsed 0m 17s (remain 1m 10s) 
Epoch: [4][100/251] Elapsed 0m 34s (remain 0m 51s) 
Epoch: [4][150/251] Elapsed 0m 51s (remain 0m 34s) 
Epoch: [4][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [4][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 4 - avg_train_loss: 0.2092  avg_val_loss: 0.4446  time: 94s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2092  avg_val_loss: 0.4446  time: 94s
Epoch 4 - Score: 0.8129
INFO:__main__:Epoch 4 - Score: 0.8129
Epoch 4 - Save Best Score: 0.8129 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.8129 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [5][0/251] Elapsed 0m 0s (remain 2m 49s) 
Epoch: [5][50/251] Elapsed 0m 19s (remain 1m 14s) 
Epoch: [5][100/251] Elapsed 0m 35s (remain 0m 53s) 
Epoch: [5][150/251] Elapsed 0m 51s (remain 0m 33s) 
Epoch: [5][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [5][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 5 - avg_train_loss: 0.1359  avg_val_loss: 0.4924  time: 94s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1359  avg_val_loss: 0.4924  time: 94s
Epoch 5 - Score: 0.7999
INFO:__main__:Epoch 5 - Score: 0.7999


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [6][0/251] Elapsed 0m 0s (remain 1m 38s) 
Epoch: [6][50/251] Elapsed 0m 16s (remain 1m 2s) 
Epoch: [6][100/251] Elapsed 0m 34s (remain 0m 51s) 
Epoch: [6][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [6][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [6][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 6 - avg_train_loss: 0.0605  avg_val_loss: 0.5841  time: 94s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0605  avg_val_loss: 0.5841  time: 94s
Epoch 6 - Score: 0.8114
INFO:__main__:Epoch 6 - Score: 0.8114


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [7][0/251] Elapsed 0m 0s (remain 1m 13s) 
Epoch: [7][50/251] Elapsed 0m 17s (remain 1m 9s) 
Epoch: [7][100/251] Elapsed 0m 34s (remain 0m 51s) 
Epoch: [7][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [7][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [7][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 7 - avg_train_loss: 0.0309  avg_val_loss: 0.6195  time: 95s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0309  avg_val_loss: 0.6195  time: 95s
Epoch 7 - Score: 0.8250
INFO:__main__:Epoch 7 - Score: 0.8250
Epoch 7 - Save Best Score: 0.8250 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.8250 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [8][0/251] Elapsed 0m 0s (remain 1m 12s) 
Epoch: [8][50/251] Elapsed 0m 18s (remain 1m 13s) 
Epoch: [8][100/251] Elapsed 0m 36s (remain 0m 54s) 
Epoch: [8][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [8][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [8][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 8 - avg_train_loss: 0.0177  avg_val_loss: 0.6484  time: 95s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0177  avg_val_loss: 0.6484  time: 95s
Epoch 8 - Score: 0.8208
INFO:__main__:Epoch 8 - Score: 0.8208


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [9][0/251] Elapsed 0m 0s (remain 2m 38s) 
Epoch: [9][50/251] Elapsed 0m 16s (remain 1m 2s) 
Epoch: [9][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [9][150/251] Elapsed 0m 51s (remain 0m 34s) 
Epoch: [9][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [9][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 9 - avg_train_loss: 0.0136  avg_val_loss: 0.6585  time: 94s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0136  avg_val_loss: 0.6585  time: 94s
Epoch 9 - Score: 0.8294
INFO:__main__:Epoch 9 - Score: 0.8294
Epoch 9 - Save Best Score: 0.8294 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.8294 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [10][0/251] Elapsed 0m 0s (remain 1m 46s) 
Epoch: [10][50/251] Elapsed 0m 17s (remain 1m 7s) 
Epoch: [10][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [10][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [10][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [10][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 10 - avg_train_loss: 0.0122  avg_val_loss: 0.6590  time: 93s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0122  avg_val_loss: 0.6590  time: 93s
Epoch 10 - Score: 0.8294
INFO:__main__:Epoch 10 - Score: 0.8294


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 


Score: 0.8294
INFO:__main__:Score: 0.8294
-------------fold:2 training-------------
INFO:__main__:-------------fold:2 training-------------


0.8294207531881913


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/251] Elapsed 0m 0s (remain 1m 36s) 
Epoch: [1][50/251] Elapsed 0m 18s (remain 1m 12s) 
Epoch: [1][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [1][150/251] Elapsed 0m 55s (remain 0m 36s) 
Epoch: [1][200/251] Elapsed 1m 12s (remain 0m 17s) 
Epoch: [1][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 1 - avg_train_loss: 0.7532  avg_val_loss: 0.5740  time: 95s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7532  avg_val_loss: 0.5740  time: 95s
Epoch 1 - Score: 0.6257
INFO:__main__:Epoch 1 - Score: 0.6257
Epoch 1 - Save Best Score: 0.6257 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6257 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [2][0/251] Elapsed 0m 0s (remain 3m 7s) 
Epoch: [2][50/251] Elapsed 0m 15s (remain 0m 59s) 
Epoch: [2][100/251] Elapsed 0m 30s (remain 0m 44s) 
Epoch: [2][150/251] Elapsed 0m 48s (remain 0m 31s) 
Epoch: [2][200/251] Elapsed 1m 6s (remain 0m 16s) 
Epoch: [2][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 2 - avg_train_loss: 0.4367  avg_val_loss: 0.4213  time: 93s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4367  avg_val_loss: 0.4213  time: 93s
Epoch 2 - Score: 0.7006
INFO:__main__:Epoch 2 - Score: 0.7006
Epoch 2 - Save Best Score: 0.7006 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7006 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [3][0/251] Elapsed 0m 0s (remain 1m 58s) 
Epoch: [3][50/251] Elapsed 0m 16s (remain 1m 2s) 
Epoch: [3][100/251] Elapsed 0m 33s (remain 0m 49s) 
Epoch: [3][150/251] Elapsed 0m 51s (remain 0m 33s) 
Epoch: [3][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [3][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 3 - avg_train_loss: 0.3085  avg_val_loss: 0.4152  time: 94s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3085  avg_val_loss: 0.4152  time: 94s
Epoch 3 - Score: 0.7888
INFO:__main__:Epoch 3 - Score: 0.7888
Epoch 3 - Save Best Score: 0.7888 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7888 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [4][0/251] Elapsed 0m 0s (remain 1m 43s) 
Epoch: [4][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [4][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [4][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [4][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [4][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 4 - avg_train_loss: 0.2144  avg_val_loss: 0.4290  time: 95s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2144  avg_val_loss: 0.4290  time: 95s
Epoch 4 - Score: 0.7771
INFO:__main__:Epoch 4 - Score: 0.7771


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [5][0/251] Elapsed 0m 0s (remain 3m 47s) 
Epoch: [5][50/251] Elapsed 0m 16s (remain 1m 3s) 
Epoch: [5][100/251] Elapsed 0m 36s (remain 0m 53s) 
Epoch: [5][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [5][200/251] Elapsed 1m 11s (remain 0m 17s) 
Epoch: [5][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 5 - avg_train_loss: 0.1375  avg_val_loss: 0.4237  time: 93s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1375  avg_val_loss: 0.4237  time: 93s
Epoch 5 - Score: 0.8259
INFO:__main__:Epoch 5 - Score: 0.8259
Epoch 5 - Save Best Score: 0.8259 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.8259 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [6][0/251] Elapsed 0m 0s (remain 1m 36s) 
Epoch: [6][50/251] Elapsed 0m 16s (remain 1m 3s) 
Epoch: [6][100/251] Elapsed 0m 33s (remain 0m 50s) 
Epoch: [6][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [6][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [6][250/251] Elapsed 1m 25s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 6 - avg_train_loss: 0.0728  avg_val_loss: 0.5240  time: 92s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0728  avg_val_loss: 0.5240  time: 92s
Epoch 6 - Score: 0.8247
INFO:__main__:Epoch 6 - Score: 0.8247


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [7][0/251] Elapsed 0m 0s (remain 1m 53s) 
Epoch: [7][50/251] Elapsed 0m 18s (remain 1m 12s) 
Epoch: [7][100/251] Elapsed 0m 33s (remain 0m 50s) 
Epoch: [7][150/251] Elapsed 0m 50s (remain 0m 33s) 
Epoch: [7][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [7][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 7 - avg_train_loss: 0.0345  avg_val_loss: 0.5622  time: 94s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0345  avg_val_loss: 0.5622  time: 94s
Epoch 7 - Score: 0.8129
INFO:__main__:Epoch 7 - Score: 0.8129


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [8][0/251] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [8][50/251] Elapsed 0m 15s (remain 0m 59s) 
Epoch: [8][100/251] Elapsed 0m 33s (remain 0m 49s) 
Epoch: [8][150/251] Elapsed 0m 50s (remain 0m 33s) 
Epoch: [8][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [8][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 8 - avg_train_loss: 0.0191  avg_val_loss: 0.5778  time: 94s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0191  avg_val_loss: 0.5778  time: 94s
Epoch 8 - Score: 0.8081
INFO:__main__:Epoch 8 - Score: 0.8081


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [9][0/251] Elapsed 0m 0s (remain 2m 47s) 
Epoch: [9][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [9][100/251] Elapsed 0m 33s (remain 0m 49s) 
Epoch: [9][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [9][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [9][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 9 - avg_train_loss: 0.0115  avg_val_loss: 0.5980  time: 93s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0115  avg_val_loss: 0.5980  time: 93s
Epoch 9 - Score: 0.8048
INFO:__main__:Epoch 9 - Score: 0.8048


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [10][0/251] Elapsed 0m 0s (remain 1m 16s) 
Epoch: [10][50/251] Elapsed 0m 20s (remain 1m 19s) 
Epoch: [10][100/251] Elapsed 0m 36s (remain 0m 54s) 
Epoch: [10][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [10][200/251] Elapsed 1m 11s (remain 0m 17s) 
Epoch: [10][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 9s) 


Epoch 10 - avg_train_loss: 0.0109  avg_val_loss: 0.5988  time: 95s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0109  avg_val_loss: 0.5988  time: 95s
Epoch 10 - Score: 0.8065
INFO:__main__:Epoch 10 - Score: 0.8065


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 


Score: 0.8259
INFO:__main__:Score: 0.8259
-------------fold:3 training-------------
INFO:__main__:-------------fold:3 training-------------


0.8259415582950171


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/251] Elapsed 0m 0s (remain 2m 14s) 
Epoch: [1][50/251] Elapsed 0m 18s (remain 1m 13s) 
Epoch: [1][100/251] Elapsed 0m 36s (remain 0m 53s) 
Epoch: [1][150/251] Elapsed 0m 55s (remain 0m 36s) 
Epoch: [1][200/251] Elapsed 1m 11s (remain 0m 17s) 
Epoch: [1][250/251] Elapsed 1m 29s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) 


Epoch 1 - avg_train_loss: 0.7748  avg_val_loss: 0.5419  time: 96s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7748  avg_val_loss: 0.5419  time: 96s
Epoch 1 - Score: 0.6431
INFO:__main__:Epoch 1 - Score: 0.6431
Epoch 1 - Save Best Score: 0.6431 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6431 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [2][0/251] Elapsed 0m 0s (remain 1m 12s) 
Epoch: [2][50/251] Elapsed 0m 17s (remain 1m 9s) 
Epoch: [2][100/251] Elapsed 0m 34s (remain 0m 50s) 
Epoch: [2][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [2][200/251] Elapsed 1m 11s (remain 0m 17s) 
Epoch: [2][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) 


Epoch 2 - avg_train_loss: 0.4076  avg_val_loss: 0.6433  time: 95s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4076  avg_val_loss: 0.6433  time: 95s
Epoch 2 - Score: 0.7333
INFO:__main__:Epoch 2 - Score: 0.7333
Epoch 2 - Save Best Score: 0.7333 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7333 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [3][0/251] Elapsed 0m 0s (remain 2m 7s) 
Epoch: [3][50/251] Elapsed 0m 19s (remain 1m 14s) 
Epoch: [3][100/251] Elapsed 0m 36s (remain 0m 54s) 
Epoch: [3][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [3][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [3][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) 


Epoch 3 - avg_train_loss: 0.2905  avg_val_loss: 0.5030  time: 95s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2905  avg_val_loss: 0.5030  time: 95s
Epoch 3 - Score: 0.7818
INFO:__main__:Epoch 3 - Score: 0.7818
Epoch 3 - Save Best Score: 0.7818 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7818 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [4][0/251] Elapsed 0m 0s (remain 1m 35s) 
Epoch: [4][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [4][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [4][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [4][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [4][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) 


Epoch 4 - avg_train_loss: 0.2120  avg_val_loss: 0.5155  time: 94s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2120  avg_val_loss: 0.5155  time: 94s
Epoch 4 - Score: 0.7852
INFO:__main__:Epoch 4 - Score: 0.7852
Epoch 4 - Save Best Score: 0.7852 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7852 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [5][0/251] Elapsed 0m 0s (remain 2m 51s) 
Epoch: [5][50/251] Elapsed 0m 17s (remain 1m 7s) 
Epoch: [5][100/251] Elapsed 0m 33s (remain 0m 50s) 
Epoch: [5][150/251] Elapsed 0m 51s (remain 0m 34s) 
Epoch: [5][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [5][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) 


Epoch 5 - avg_train_loss: 0.1207  avg_val_loss: 0.5422  time: 93s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1207  avg_val_loss: 0.5422  time: 93s
Epoch 5 - Score: 0.7782
INFO:__main__:Epoch 5 - Score: 0.7782


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [6][0/251] Elapsed 0m 0s (remain 1m 12s) 
Epoch: [6][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [6][100/251] Elapsed 0m 36s (remain 0m 53s) 
Epoch: [6][150/251] Elapsed 0m 54s (remain 0m 36s) 
Epoch: [6][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [6][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) 


Epoch 6 - avg_train_loss: 0.0748  avg_val_loss: 0.6123  time: 95s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0748  avg_val_loss: 0.6123  time: 95s
Epoch 6 - Score: 0.7780
INFO:__main__:Epoch 6 - Score: 0.7780


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [7][0/251] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [7][50/251] Elapsed 0m 15s (remain 1m 1s) 
Epoch: [7][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [7][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [7][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [7][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) 


Epoch 7 - avg_train_loss: 0.0365  avg_val_loss: 0.6597  time: 94s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0365  avg_val_loss: 0.6597  time: 94s
Epoch 7 - Score: 0.7915
INFO:__main__:Epoch 7 - Score: 0.7915
Epoch 7 - Save Best Score: 0.7915 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.7915 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [8][0/251] Elapsed 0m 0s (remain 2m 29s) 
Epoch: [8][50/251] Elapsed 0m 16s (remain 1m 4s) 
Epoch: [8][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [8][150/251] Elapsed 0m 51s (remain 0m 33s) 
Epoch: [8][200/251] Elapsed 1m 10s (remain 0m 17s) 
Epoch: [8][250/251] Elapsed 1m 29s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) 


Epoch 8 - avg_train_loss: 0.0179  avg_val_loss: 0.7127  time: 96s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0179  avg_val_loss: 0.7127  time: 96s
Epoch 8 - Score: 0.7951
INFO:__main__:Epoch 8 - Score: 0.7951
Epoch 8 - Save Best Score: 0.7951 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.7951 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [9][0/251] Elapsed 0m 0s (remain 1m 13s) 
Epoch: [9][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [9][100/251] Elapsed 0m 34s (remain 0m 50s) 
Epoch: [9][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [9][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [9][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) 


Epoch 9 - avg_train_loss: 0.0161  avg_val_loss: 0.7256  time: 94s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0161  avg_val_loss: 0.7256  time: 94s
Epoch 9 - Score: 0.7995
INFO:__main__:Epoch 9 - Score: 0.7995
Epoch 9 - Save Best Score: 0.7995 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.7995 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [10][0/251] Elapsed 0m 0s (remain 2m 32s) 
Epoch: [10][50/251] Elapsed 0m 17s (remain 1m 9s) 
Epoch: [10][100/251] Elapsed 0m 34s (remain 0m 50s) 
Epoch: [10][150/251] Elapsed 0m 51s (remain 0m 34s) 
Epoch: [10][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [10][250/251] Elapsed 1m 28s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) 


Epoch 10 - avg_train_loss: 0.0127  avg_val_loss: 0.7286  time: 95s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0127  avg_val_loss: 0.7286  time: 95s
Epoch 10 - Score: 0.7983
INFO:__main__:Epoch 10 - Score: 0.7983


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) 


Score: 0.7995
INFO:__main__:Score: 0.7995
-------------fold:4 training-------------
INFO:__main__:-------------fold:4 training-------------


0.7994527612230691


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/251] Elapsed 0m 0s (remain 1m 18s) 
Epoch: [1][50/251] Elapsed 0m 16s (remain 1m 3s) 
Epoch: [1][100/251] Elapsed 0m 32s (remain 0m 48s) 
Epoch: [1][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [1][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [1][250/251] Elapsed 1m 24s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 1 - avg_train_loss: 0.7091  avg_val_loss: 0.4542  time: 92s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7091  avg_val_loss: 0.4542  time: 92s
Epoch 1 - Score: 0.7911
INFO:__main__:Epoch 1 - Score: 0.7911
Epoch 1 - Save Best Score: 0.7911 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7911 Model


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [2][0/251] Elapsed 0m 0s (remain 1m 47s) 
Epoch: [2][50/251] Elapsed 0m 19s (remain 1m 14s) 
Epoch: [2][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [2][150/251] Elapsed 0m 51s (remain 0m 33s) 
Epoch: [2][200/251] Elapsed 1m 6s (remain 0m 16s) 
Epoch: [2][250/251] Elapsed 1m 25s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 2 - avg_train_loss: 0.4097  avg_val_loss: 0.3863  time: 93s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4097  avg_val_loss: 0.3863  time: 93s
Epoch 2 - Score: 0.8033
INFO:__main__:Epoch 2 - Score: 0.8033
Epoch 2 - Save Best Score: 0.8033 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.8033 Model


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [3][0/251] Elapsed 0m 0s (remain 1m 38s) 
Epoch: [3][50/251] Elapsed 0m 16s (remain 1m 5s) 
Epoch: [3][100/251] Elapsed 0m 32s (remain 0m 48s) 
Epoch: [3][150/251] Elapsed 0m 48s (remain 0m 32s) 
Epoch: [3][200/251] Elapsed 1m 6s (remain 0m 16s) 
Epoch: [3][250/251] Elapsed 1m 23s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 3 - avg_train_loss: 0.2798  avg_val_loss: 0.4446  time: 91s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2798  avg_val_loss: 0.4446  time: 91s
Epoch 3 - Score: 0.7982
INFO:__main__:Epoch 3 - Score: 0.7982


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [4][0/251] Elapsed 0m 0s (remain 1m 20s) 
Epoch: [4][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [4][100/251] Elapsed 0m 35s (remain 0m 53s) 
Epoch: [4][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [4][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [4][250/251] Elapsed 1m 26s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 4 - avg_train_loss: 0.1951  avg_val_loss: 0.4423  time: 94s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1951  avg_val_loss: 0.4423  time: 94s
Epoch 4 - Score: 0.7738
INFO:__main__:Epoch 4 - Score: 0.7738


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [5][0/251] Elapsed 0m 0s (remain 1m 28s) 
Epoch: [5][50/251] Elapsed 0m 18s (remain 1m 13s) 
Epoch: [5][100/251] Elapsed 0m 37s (remain 0m 55s) 
Epoch: [5][150/251] Elapsed 0m 53s (remain 0m 35s) 
Epoch: [5][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [5][250/251] Elapsed 1m 25s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 5 - avg_train_loss: 0.1094  avg_val_loss: 0.4682  time: 93s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1094  avg_val_loss: 0.4682  time: 93s
Epoch 5 - Score: 0.8022
INFO:__main__:Epoch 5 - Score: 0.8022


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [6][0/251] Elapsed 0m 0s (remain 1m 3s) 
Epoch: [6][50/251] Elapsed 0m 18s (remain 1m 10s) 
Epoch: [6][100/251] Elapsed 0m 35s (remain 0m 51s) 
Epoch: [6][150/251] Elapsed 0m 50s (remain 0m 33s) 
Epoch: [6][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [6][250/251] Elapsed 1m 25s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 6 - avg_train_loss: 0.0549  avg_val_loss: 0.5455  time: 93s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0549  avg_val_loss: 0.5455  time: 93s
Epoch 6 - Score: 0.7714
INFO:__main__:Epoch 6 - Score: 0.7714


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [7][0/251] Elapsed 0m 0s (remain 1m 25s) 
Epoch: [7][50/251] Elapsed 0m 15s (remain 1m 2s) 
Epoch: [7][100/251] Elapsed 0m 34s (remain 0m 50s) 
Epoch: [7][150/251] Elapsed 0m 50s (remain 0m 33s) 
Epoch: [7][200/251] Elapsed 1m 8s (remain 0m 16s) 
Epoch: [7][250/251] Elapsed 1m 25s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 10s) 


Epoch 7 - avg_train_loss: 0.0233  avg_val_loss: 0.5764  time: 93s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0233  avg_val_loss: 0.5764  time: 93s
Epoch 7 - Score: 0.7901
INFO:__main__:Epoch 7 - Score: 0.7901


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [8][0/251] Elapsed 0m 0s (remain 1m 8s) 
Epoch: [8][50/251] Elapsed 0m 17s (remain 1m 9s) 
Epoch: [8][100/251] Elapsed 0m 35s (remain 0m 52s) 
Epoch: [8][150/251] Elapsed 0m 52s (remain 0m 34s) 
Epoch: [8][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [8][250/251] Elapsed 1m 25s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 8 - avg_train_loss: 0.0140  avg_val_loss: 0.5655  time: 93s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0140  avg_val_loss: 0.5655  time: 93s
Epoch 8 - Score: 0.7892
INFO:__main__:Epoch 8 - Score: 0.7892


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [9][0/251] Elapsed 0m 0s (remain 1m 18s) 
Epoch: [9][50/251] Elapsed 0m 16s (remain 1m 4s) 
Epoch: [9][100/251] Elapsed 0m 33s (remain 0m 49s) 
Epoch: [9][150/251] Elapsed 0m 52s (remain 0m 35s) 
Epoch: [9][200/251] Elapsed 1m 9s (remain 0m 17s) 
Epoch: [9][250/251] Elapsed 1m 27s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 9 - avg_train_loss: 0.0083  avg_val_loss: 0.5647  time: 94s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0083  avg_val_loss: 0.5647  time: 94s
Epoch 9 - Score: 0.7975
INFO:__main__:Epoch 9 - Score: 0.7975


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [10][0/251] Elapsed 0m 0s (remain 1m 48s) 
Epoch: [10][50/251] Elapsed 0m 17s (remain 1m 8s) 
Epoch: [10][100/251] Elapsed 0m 34s (remain 0m 51s) 
Epoch: [10][150/251] Elapsed 0m 49s (remain 0m 32s) 
Epoch: [10][200/251] Elapsed 1m 8s (remain 0m 17s) 
Epoch: [10][250/251] Elapsed 1m 25s (remain 0m 0s) 
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) 


Epoch 10 - avg_train_loss: 0.0075  avg_val_loss: 0.5599  time: 93s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0075  avg_val_loss: 0.5599  time: 93s
Epoch 10 - Score: 0.7944
INFO:__main__:Epoch 10 - Score: 0.7944


EVAL: [31/32] Elapsed 0m 7s (remain 0m 0s) 


Score: 0.8033
INFO:__main__:Score: 0.8033
Score: 0.8107
INFO:__main__:Score: 0.8107


0.8032584336848235


In [20]:
# Reload the saved `oof_df.csv` from the model output directory and preview
# its first rows (presumably the out-of-fold predictions written by the
# training run above -- TODO confirm against the cell that saves it).
# NOTE(review): the preview shows an `Unnamed: 0` column, which suggests the
# CSV was written together with its index; passing `index_col=0` would drop it,
# but only do so if no later cell relies on the current column layout.
oof_csv_path = OUTPUT_MODEL_DIR + 'oof_df.csv'
A = pd.read_csv(oof_csv_path)
A.head()

Unnamed: 0,id,description,label,inputs,kfold,Consultant,Data scientist,Machine learning engineer,Software engineer
0,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re...",0,-2.078508,-0.325869,4.206412,-1.603805
1,13,"Research, prototype, identify, and build predi...",1,"Research, prototype, identify, and build predi...",0,2.557664,0.811217,-2.674249,-2.269534
2,21,Execute on assigned project tasksPrepare mater...,3,Execute on assigned project tasksPrepare mater...,0,3.048291,-2.608134,-3.304976,1.603433
3,27,The successful candidate will:Perform industry...,0,The successful candidate will:Perform industry...,0,3.363289,-2.211976,-3.463783,0.708751
4,28,Build network within the Capgemini group and ...,3,Build network within the Capgemini group and ...,0,-0.646065,-2.87035,-1.895262,4.406855
