In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Un

In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 4.2 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-large[ver2]/')

In [6]:
class CFG:
    wandb = False
    apex = True
    model = 'microsoft/deberta-large'
    seed = 42
    n_splits = 5
    max_len = 1024
    dropout = 0.2
    target_size=4
    n_accumulate=1
    print_freq = 50
    min_lr=1e-6
    scheduler = 'cosine'
    batch_size = 16
    num_workers = 2
    lr = 3e-5
    weigth_decay = 0.01
    epochs = 10
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    train = True 
    num_warmup_steps = 0
    num_cycles=0.5
    debug = False
    debug_ver2 = False
    gradient_checkpointing = True
    freezing = True

In [7]:
# Loss Func
def criterion(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)

def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis] # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis] # dito
    return e_x / div
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    outputs = F.softmax(torch.tensor(outputs)).numpy()
    return f1_score(np.argmax(outputs,axis=1),labels ,average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

def prepare_input(cfg, text, text_2=None):
    inputs = cfg.tokenizer(text, text_2,
                           padding="max_length",
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           truncation=True)

    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
        
    return inputs

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'))
pseudo = pd.read_csv(os.path.join(OUTPUT_DIR,"pseudo_df.csv"))

display(train.head())
print(train.shape)
display(test.head())
print(test.shape)
display(pseudo)
print(pseudo.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


Unnamed: 0,id,description,jobflag
0,1516,<li>Building decision-making models and propos...,1
1,1517,<li>Educate homeowners on the benefits of sola...,4
2,1518,"<li><span>Design, develop, document, and imple...",3
3,1519,<li>Apply advanced technical expertise and ski...,4
4,1521,<li>Model and execute on concurrent Product A/...,1
...,...,...,...
1046,3026,"<li>Write, validate and maintain programs in P...",1
1047,3027,<li>Completes programming and performs testing...,3
1048,3028,<li> Manages the development of interface requ...,3
1049,3029,<li>Lead the implementation of new statistical...,1


(1051, 3)


In [10]:
def remove_tag(x):
    p = re.compile(r"<[^>]*?>")
    return p.sub('',x)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_tag(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        clean_texts.append(text)
    return clean_texts



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

train = pd.concat([train,pseudo],axis=0)
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
train['inputs'] = train['description'].apply(lambda x : resolve_encodings_and_normalize(x))
test['inputs'] = test['description'].apply(lambda x : resolve_encodings_and_normalize(x))
train.drop_duplicates(subset=['inputs'],keep='first',inplace=True)
train = train.reset_index(drop=True)
train = train.rename(columns = {"jobflag": "label"})
train["label"] = train["label"] -1
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,2,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,3,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,3,Quantify the resources required for a task/pro...
...,...,...,...,...
2512,3026,"Write, validate and maintain programs in Pytho...",0,"Write, validate and maintain programs in Pytho..."
2513,3027,Completes programming and performs testing and...,2,Completes programming and performs testing and...
2514,3028,Manages the development of interface requirem...,2,Manages the development of interface requirem...
2515,3029,Lead the implementation of new statistical mod...,0,Lead the implementation of new statistical mod...


In [11]:
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.label)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

In [12]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer
SEP = tokenizer.sep_token

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [13]:
class Dataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = CFG.max_len
        self.text = df['inputs'].values
        self.tokenizer = CFG.tokenizer
        self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'target': self.targets[index]
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
            
        return samples

In [14]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain
        # self.args = args

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        if self.isTrain:
            output["target"] = [sample["target"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
        else:
            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
        if self.isTrain:
            output["target"] = torch.tensor(output["target"], dtype=torch.long)

        return output
    
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [15]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9) #
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, model_name):
        super(CustomModel, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            CFG.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG.dropout)
        #self.pooler = MeanPooling()
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Sequential( nn.Linear(self.config.hidden_size, CFG.target_size) )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def forward(self, ids, mask):        
        output = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        output = output[0][:, 0, :]
        logits1 = self.output(self.dropout1(output))
        logits2 = self.output(self.dropout2(output))
        logits3 = self.output(self.dropout3(output))
        logits4 = self.output(self.dropout4(output))
        logits5 = self.output(self.dropout5(output))
        outputs = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return outputs

In [17]:
def asMinutes(s):
    m = math.floor(s/60)
    s -= m * 60
    return "%dm %ds" % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    return scheduler

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()

    dataset_size = 0
    running_loss = 0

    start = end = time.time()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)
        
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        #accumulate
        loss = loss / CFG.n_accumulate 
        loss.backward()
        if (step +1) % CFG.n_accumulate == 0:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        end = time.time()
        
        if step % CFG.print_freq == 0 or step == (len(dataloader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, len(dataloader), 
                          remain=timeSince(start, float(step+1)/len(dataloader))))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()

    dataset_size = 0
    running_loss = 0

    start = end = time.time()
    pred = []

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        pred.append(outputs.to('cpu').numpy())

        running_loss += (loss.item()* batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        end = time.time()

        if step % CFG.print_freq == 0 or step == (len(dataloader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, len(dataloader),
                          remain=timeSince(start, float(step+1)/len(dataloader))))
            
    pred = np.concatenate(pred)
            
    return epoch_loss, pred

In [18]:
def train_loop(fold):
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)
    
    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)
    
    model = CustomModel(CFG.model)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    num_train_steps = int(len(train_data) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # loop
    best_score = 0

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": train_epoch_loss, 
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})
            
        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
            
    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_data['Data scientist'] = predictions[:, 0]
    valid_data['Machine learning engineer'] = predictions[:, 1]
    valid_data['Software engineer'] = predictions[:, 2]
    valid_data['Consultant'] = predictions[:, 3]
    
    
    temp = valid_data[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_data

In [19]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['label'].values
        preds = oof_df[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
        oof_df.to_csv(OUTPUT_MODEL_DIR+f'oof_df.csv', index=False)

-------------fold:0 training-------------
INFO:__main__:-------------fold:0 training-------------


Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/125] Elapsed 0m 8s (remain 16m 48s) 
Epoch: [1][50/125] Elapsed 2m 12s (remain 3m 12s) 
Epoch: [1][100/125] Elapsed 4m 34s (remain 1m 5s) 
Epoch: [1][124/125] Elapsed 5m 35s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 1 - avg_train_loss: 0.6679  avg_val_loss: 0.4641  time: 363s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6679  avg_val_loss: 0.4641  time: 363s
Epoch 1 - Score: 0.7593
INFO:__main__:Epoch 1 - Score: 0.7593
Epoch 1 - Save Best Score: 0.7593 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7593 Model


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [2][0/125] Elapsed 0m 8s (remain 16m 42s) 
Epoch: [2][50/125] Elapsed 2m 27s (remain 3m 34s) 
Epoch: [2][100/125] Elapsed 4m 36s (remain 1m 5s) 
Epoch: [2][124/125] Elapsed 5m 33s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 2 - avg_train_loss: 0.5043  avg_val_loss: 0.4709  time: 361s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5043  avg_val_loss: 0.4709  time: 361s
Epoch 2 - Score: 0.7801
INFO:__main__:Epoch 2 - Score: 0.7801
Epoch 2 - Save Best Score: 0.7801 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7801 Model


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [3][0/125] Elapsed 0m 3s (remain 6m 55s) 
Epoch: [3][50/125] Elapsed 2m 26s (remain 3m 32s) 
Epoch: [3][100/125] Elapsed 4m 34s (remain 1m 5s) 
Epoch: [3][124/125] Elapsed 5m 33s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 3 - avg_train_loss: 0.2837  avg_val_loss: 0.5431  time: 362s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2837  avg_val_loss: 0.5431  time: 362s
Epoch 3 - Score: 0.7529
INFO:__main__:Epoch 3 - Score: 0.7529


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [4][0/125] Elapsed 0m 3s (remain 7m 40s) 
Epoch: [4][50/125] Elapsed 2m 15s (remain 3m 16s) 
Epoch: [4][100/125] Elapsed 4m 32s (remain 1m 4s) 
Epoch: [4][124/125] Elapsed 5m 36s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 4 - avg_train_loss: 0.1922  avg_val_loss: 0.4926  time: 364s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1922  avg_val_loss: 0.4926  time: 364s
Epoch 4 - Score: 0.7638
INFO:__main__:Epoch 4 - Score: 0.7638


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [5][0/125] Elapsed 0m 3s (remain 7m 39s) 
Epoch: [5][50/125] Elapsed 2m 25s (remain 3m 30s) 
Epoch: [5][100/125] Elapsed 4m 42s (remain 1m 7s) 
Epoch: [5][124/125] Elapsed 5m 41s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 32s) 


Epoch 5 - avg_train_loss: 0.1167  avg_val_loss: 0.6059  time: 369s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1167  avg_val_loss: 0.6059  time: 369s
Epoch 5 - Score: 0.7733
INFO:__main__:Epoch 5 - Score: 0.7733


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [6][0/125] Elapsed 0m 3s (remain 7m 32s) 
Epoch: [6][50/125] Elapsed 2m 8s (remain 3m 6s) 
Epoch: [6][100/125] Elapsed 4m 36s (remain 1m 5s) 
Epoch: [6][124/125] Elapsed 5m 44s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 6 - avg_train_loss: 0.0868  avg_val_loss: 0.6219  time: 372s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0868  avg_val_loss: 0.6219  time: 372s
Epoch 6 - Score: 0.7852
INFO:__main__:Epoch 6 - Score: 0.7852
Epoch 6 - Save Best Score: 0.7852 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7852 Model


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [7][0/125] Elapsed 0m 1s (remain 3m 58s) 
Epoch: [7][50/125] Elapsed 2m 6s (remain 3m 3s) 
Epoch: [7][100/125] Elapsed 4m 33s (remain 1m 4s) 
Epoch: [7][124/125] Elapsed 5m 36s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 7 - avg_train_loss: 0.0348  avg_val_loss: 0.6325  time: 364s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0348  avg_val_loss: 0.6325  time: 364s
Epoch 7 - Score: 0.7736
INFO:__main__:Epoch 7 - Score: 0.7736


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [8][0/125] Elapsed 0m 2s (remain 5m 10s) 
Epoch: [8][50/125] Elapsed 2m 21s (remain 3m 25s) 
Epoch: [8][100/125] Elapsed 4m 31s (remain 1m 4s) 
Epoch: [8][124/125] Elapsed 5m 33s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 8 - avg_train_loss: 0.0244  avg_val_loss: 0.6385  time: 362s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0244  avg_val_loss: 0.6385  time: 362s
Epoch 8 - Score: 0.7962
INFO:__main__:Epoch 8 - Score: 0.7962
Epoch 8 - Save Best Score: 0.7962 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.7962 Model


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [9][0/125] Elapsed 0m 2s (remain 6m 0s) 
Epoch: [9][50/125] Elapsed 2m 22s (remain 3m 27s) 
Epoch: [9][100/125] Elapsed 4m 36s (remain 1m 5s) 
Epoch: [9][124/125] Elapsed 5m 38s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 32s) 


Epoch 9 - avg_train_loss: 0.0117  avg_val_loss: 0.6525  time: 366s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0117  avg_val_loss: 0.6525  time: 366s
Epoch 9 - Score: 0.8094
INFO:__main__:Epoch 9 - Score: 0.8094
Epoch 9 - Save Best Score: 0.8094 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.8094 Model


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 
Epoch: [10][0/125] Elapsed 0m 2s (remain 5m 7s) 
Epoch: [10][50/125] Elapsed 2m 21s (remain 3m 25s) 
Epoch: [10][100/125] Elapsed 4m 33s (remain 1m 4s) 
Epoch: [10][124/125] Elapsed 5m 34s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 31s) 


Epoch 10 - avg_train_loss: 0.0099  avg_val_loss: 0.6539  time: 362s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0099  avg_val_loss: 0.6539  time: 362s
Epoch 10 - Score: 0.8093
INFO:__main__:Epoch 10 - Score: 0.8093


EVAL: [15/16] Elapsed 0m 27s (remain 0m 0s) 


Score: 0.8094
INFO:__main__:Score: 0.8094
-------------fold:1 training-------------
INFO:__main__:-------------fold:1 training-------------


0.8094458272424374


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/125] Elapsed 0m 2s (remain 5m 19s) 
Epoch: [1][50/125] Elapsed 2m 13s (remain 3m 14s) 
Epoch: [1][100/125] Elapsed 4m 27s (remain 1m 3s) 
Epoch: [1][124/125] Elapsed 5m 29s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 1 - avg_train_loss: 0.8233  avg_val_loss: 0.4872  time: 356s
INFO:__main__:Epoch 1 - avg_train_loss: 0.8233  avg_val_loss: 0.4872  time: 356s
Epoch 1 - Score: 0.6381
INFO:__main__:Epoch 1 - Score: 0.6381
Epoch 1 - Save Best Score: 0.6381 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6381 Model


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [2][0/125] Elapsed 0m 2s (remain 4m 23s) 
Epoch: [2][50/125] Elapsed 2m 13s (remain 3m 13s) 
Epoch: [2][100/125] Elapsed 4m 31s (remain 1m 4s) 
Epoch: [2][124/125] Elapsed 5m 31s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 2 - avg_train_loss: 0.4543  avg_val_loss: 0.4756  time: 357s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4543  avg_val_loss: 0.4756  time: 357s
Epoch 2 - Score: 0.7790
INFO:__main__:Epoch 2 - Score: 0.7790
Epoch 2 - Save Best Score: 0.7790 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7790 Model


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [3][0/125] Elapsed 0m 2s (remain 5m 12s) 
Epoch: [3][50/125] Elapsed 2m 22s (remain 3m 27s) 
Epoch: [3][100/125] Elapsed 4m 42s (remain 1m 7s) 
Epoch: [3][124/125] Elapsed 5m 34s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 3 - avg_train_loss: 0.2962  avg_val_loss: 0.4721  time: 361s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2962  avg_val_loss: 0.4721  time: 361s
Epoch 3 - Score: 0.8098
INFO:__main__:Epoch 3 - Score: 0.8098
Epoch 3 - Save Best Score: 0.8098 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8098 Model


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [4][0/125] Elapsed 0m 2s (remain 4m 34s) 
Epoch: [4][50/125] Elapsed 2m 20s (remain 3m 23s) 
Epoch: [4][100/125] Elapsed 4m 22s (remain 1m 2s) 
Epoch: [4][124/125] Elapsed 5m 37s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 4 - avg_train_loss: 0.1814  avg_val_loss: 0.4884  time: 364s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1814  avg_val_loss: 0.4884  time: 364s
Epoch 4 - Score: 0.8261
INFO:__main__:Epoch 4 - Score: 0.8261
Epoch 4 - Save Best Score: 0.8261 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.8261 Model


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [5][0/125] Elapsed 0m 8s (remain 17m 0s) 
Epoch: [5][50/125] Elapsed 2m 32s (remain 3m 41s) 
Epoch: [5][100/125] Elapsed 4m 26s (remain 1m 3s) 
Epoch: [5][124/125] Elapsed 5m 36s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 5 - avg_train_loss: 0.0719  avg_val_loss: 0.5202  time: 362s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0719  avg_val_loss: 0.5202  time: 362s
Epoch 5 - Score: 0.8204
INFO:__main__:Epoch 5 - Score: 0.8204


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [6][0/125] Elapsed 0m 1s (remain 3m 50s) 
Epoch: [6][50/125] Elapsed 2m 24s (remain 3m 28s) 
Epoch: [6][100/125] Elapsed 4m 28s (remain 1m 3s) 
Epoch: [6][124/125] Elapsed 5m 29s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 6 - avg_train_loss: 0.0315  avg_val_loss: 0.5813  time: 356s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0315  avg_val_loss: 0.5813  time: 356s
Epoch 6 - Score: 0.7982
INFO:__main__:Epoch 6 - Score: 0.7982


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [7][0/125] Elapsed 0m 1s (remain 3m 30s) 
Epoch: [7][50/125] Elapsed 2m 22s (remain 3m 26s) 
Epoch: [7][100/125] Elapsed 4m 15s (remain 1m 0s) 
Epoch: [7][124/125] Elapsed 5m 30s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 7 - avg_train_loss: 0.0099  avg_val_loss: 0.6349  time: 357s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0099  avg_val_loss: 0.6349  time: 357s
Epoch 7 - Score: 0.8099
INFO:__main__:Epoch 7 - Score: 0.8099


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [8][0/125] Elapsed 0m 2s (remain 5m 10s) 
Epoch: [8][50/125] Elapsed 2m 26s (remain 3m 32s) 
Epoch: [8][100/125] Elapsed 4m 47s (remain 1m 8s) 
Epoch: [8][124/125] Elapsed 5m 45s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 8 - avg_train_loss: 0.0048  avg_val_loss: 0.6662  time: 372s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0048  avg_val_loss: 0.6662  time: 372s
Epoch 8 - Score: 0.8225
INFO:__main__:Epoch 8 - Score: 0.8225


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [9][0/125] Elapsed 0m 3s (remain 6m 58s) 
Epoch: [9][50/125] Elapsed 2m 9s (remain 3m 7s) 
Epoch: [9][100/125] Elapsed 4m 30s (remain 1m 4s) 
Epoch: [9][124/125] Elapsed 5m 30s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 9 - avg_train_loss: 0.0086  avg_val_loss: 0.6683  time: 356s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0086  avg_val_loss: 0.6683  time: 356s
Epoch 9 - Score: 0.8163
INFO:__main__:Epoch 9 - Score: 0.8163


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [10][0/125] Elapsed 0m 4s (remain 8m 34s) 
Epoch: [10][50/125] Elapsed 2m 21s (remain 3m 25s) 
Epoch: [10][100/125] Elapsed 4m 38s (remain 1m 6s) 
Epoch: [10][124/125] Elapsed 5m 30s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 20s) 


Epoch 10 - avg_train_loss: 0.0025  avg_val_loss: 0.6698  time: 357s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0025  avg_val_loss: 0.6698  time: 357s
Epoch 10 - Score: 0.8162
INFO:__main__:Epoch 10 - Score: 0.8162


EVAL: [15/16] Elapsed 0m 26s (remain 0m 0s) 


Score: 0.8261
INFO:__main__:Score: 0.8261
-------------fold:2 training-------------
INFO:__main__:-------------fold:2 training-------------


0.8260576223194503


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/125] Elapsed 0m 1s (remain 3m 27s) 
Epoch: [1][50/125] Elapsed 2m 26s (remain 3m 32s) 
Epoch: [1][100/125] Elapsed 4m 27s (remain 1m 3s) 
Epoch: [1][124/125] Elapsed 5m 31s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 1 - avg_train_loss: 0.6582  avg_val_loss: 0.4496  time: 356s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6582  avg_val_loss: 0.4496  time: 356s
Epoch 1 - Score: 0.7978
INFO:__main__:Epoch 1 - Score: 0.7978
Epoch 1 - Save Best Score: 0.7978 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7978 Model


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [2][0/125] Elapsed 0m 1s (remain 3m 31s) 
Epoch: [2][50/125] Elapsed 2m 12s (remain 3m 12s) 
Epoch: [2][100/125] Elapsed 4m 37s (remain 1m 5s) 
Epoch: [2][124/125] Elapsed 5m 49s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 2 - avg_train_loss: 0.3631  avg_val_loss: 0.4807  time: 373s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3631  avg_val_loss: 0.4807  time: 373s
Epoch 2 - Score: 0.7841
INFO:__main__:Epoch 2 - Score: 0.7841


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [3][0/125] Elapsed 0m 1s (remain 3m 17s) 
Epoch: [3][50/125] Elapsed 2m 27s (remain 3m 33s) 
Epoch: [3][100/125] Elapsed 4m 33s (remain 1m 4s) 
Epoch: [3][124/125] Elapsed 5m 46s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 3 - avg_train_loss: 0.2040  avg_val_loss: 0.4683  time: 370s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2040  avg_val_loss: 0.4683  time: 370s
Epoch 3 - Score: 0.7758
INFO:__main__:Epoch 3 - Score: 0.7758


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [4][0/125] Elapsed 0m 2s (remain 5m 50s) 
Epoch: [4][50/125] Elapsed 2m 22s (remain 3m 26s) 
Epoch: [4][100/125] Elapsed 4m 38s (remain 1m 6s) 
Epoch: [4][124/125] Elapsed 5m 58s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 4 - avg_train_loss: 0.0954  avg_val_loss: 0.6205  time: 382s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0954  avg_val_loss: 0.6205  time: 382s
Epoch 4 - Score: 0.7570
INFO:__main__:Epoch 4 - Score: 0.7570


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [5][0/125] Elapsed 0m 1s (remain 3m 9s) 
Epoch: [5][50/125] Elapsed 2m 22s (remain 3m 26s) 
Epoch: [5][100/125] Elapsed 4m 34s (remain 1m 5s) 
Epoch: [5][124/125] Elapsed 5m 43s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 5 - avg_train_loss: 0.0513  avg_val_loss: 0.7163  time: 368s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0513  avg_val_loss: 0.7163  time: 368s
Epoch 5 - Score: 0.7649
INFO:__main__:Epoch 5 - Score: 0.7649


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [6][0/125] Elapsed 0m 2s (remain 4m 45s) 
Epoch: [6][50/125] Elapsed 2m 27s (remain 3m 33s) 
Epoch: [6][100/125] Elapsed 4m 48s (remain 1m 8s) 
Epoch: [6][124/125] Elapsed 5m 51s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 6 - avg_train_loss: 0.0176  avg_val_loss: 0.8163  time: 375s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0176  avg_val_loss: 0.8163  time: 375s
Epoch 6 - Score: 0.7692
INFO:__main__:Epoch 6 - Score: 0.7692


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [7][0/125] Elapsed 0m 2s (remain 5m 47s) 
Epoch: [7][50/125] Elapsed 2m 13s (remain 3m 13s) 
Epoch: [7][100/125] Elapsed 4m 33s (remain 1m 5s) 
Epoch: [7][124/125] Elapsed 5m 39s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 7 - avg_train_loss: 0.0101  avg_val_loss: 0.7333  time: 364s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0101  avg_val_loss: 0.7333  time: 364s
Epoch 7 - Score: 0.8159
INFO:__main__:Epoch 7 - Score: 0.8159
Epoch 7 - Save Best Score: 0.8159 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.8159 Model


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [8][0/125] Elapsed 0m 4s (remain 9m 0s) 
Epoch: [8][50/125] Elapsed 2m 13s (remain 3m 13s) 
Epoch: [8][100/125] Elapsed 4m 36s (remain 1m 5s) 
Epoch: [8][124/125] Elapsed 5m 36s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 8 - avg_train_loss: 0.0041  avg_val_loss: 0.7342  time: 361s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0041  avg_val_loss: 0.7342  time: 361s
Epoch 8 - Score: 0.8184
INFO:__main__:Epoch 8 - Score: 0.8184
Epoch 8 - Save Best Score: 0.8184 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.8184 Model


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [9][0/125] Elapsed 0m 2s (remain 5m 14s) 
Epoch: [9][50/125] Elapsed 2m 32s (remain 3m 41s) 
Epoch: [9][100/125] Elapsed 4m 33s (remain 1m 4s) 
Epoch: [9][124/125] Elapsed 5m 47s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 9 - avg_train_loss: 0.0023  avg_val_loss: 0.7363  time: 372s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0023  avg_val_loss: 0.7363  time: 372s
Epoch 9 - Score: 0.8155
INFO:__main__:Epoch 9 - Score: 0.8155


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [10][0/125] Elapsed 0m 5s (remain 10m 25s) 
Epoch: [10][50/125] Elapsed 2m 10s (remain 3m 8s) 
Epoch: [10][100/125] Elapsed 4m 35s (remain 1m 5s) 
Epoch: [10][124/125] Elapsed 5m 37s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 18s) 


Epoch 10 - avg_train_loss: 0.0015  avg_val_loss: 0.7374  time: 362s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0015  avg_val_loss: 0.7374  time: 362s
Epoch 10 - Score: 0.8170
INFO:__main__:Epoch 10 - Score: 0.8170


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 


Score: 0.8184
INFO:__main__:Score: 0.8184
-------------fold:3 training-------------
INFO:__main__:-------------fold:3 training-------------


0.8184176672155139


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/125] Elapsed 0m 1s (remain 3m 52s) 
Epoch: [1][50/125] Elapsed 2m 21s (remain 3m 24s) 
Epoch: [1][100/125] Elapsed 4m 33s (remain 1m 5s) 
Epoch: [1][124/125] Elapsed 5m 40s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 40s) 


Epoch 1 - avg_train_loss: 1.2922  avg_val_loss: 1.2360  time: 365s
INFO:__main__:Epoch 1 - avg_train_loss: 1.2922  avg_val_loss: 1.2360  time: 365s
Epoch 1 - Score: 0.1143
INFO:__main__:Epoch 1 - Score: 0.1143
Epoch 1 - Save Best Score: 0.1143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.1143 Model


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [2][0/125] Elapsed 0m 5s (remain 11m 56s) 
Epoch: [2][50/125] Elapsed 2m 23s (remain 3m 28s) 
Epoch: [2][100/125] Elapsed 4m 44s (remain 1m 7s) 
Epoch: [2][124/125] Elapsed 5m 44s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 37s) 


Epoch 2 - avg_train_loss: 1.2458  avg_val_loss: 1.2341  time: 369s
INFO:__main__:Epoch 2 - avg_train_loss: 1.2458  avg_val_loss: 1.2341  time: 369s
Epoch 2 - Score: 0.1246
INFO:__main__:Epoch 2 - Score: 0.1246
Epoch 2 - Save Best Score: 0.1246 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.1246 Model


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [3][0/125] Elapsed 0m 1s (remain 3m 5s) 
Epoch: [3][50/125] Elapsed 2m 23s (remain 3m 28s) 
Epoch: [3][100/125] Elapsed 4m 49s (remain 1m 8s) 
Epoch: [3][124/125] Elapsed 5m 44s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 3 - avg_train_loss: 1.2387  avg_val_loss: 1.2293  time: 368s
INFO:__main__:Epoch 3 - avg_train_loss: 1.2387  avg_val_loss: 1.2293  time: 368s
Epoch 3 - Score: 0.1246
INFO:__main__:Epoch 3 - Score: 0.1246


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [4][0/125] Elapsed 0m 2s (remain 5m 36s) 
Epoch: [4][50/125] Elapsed 2m 30s (remain 3m 37s) 
Epoch: [4][100/125] Elapsed 4m 45s (remain 1m 7s) 
Epoch: [4][124/125] Elapsed 5m 37s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 4 - avg_train_loss: 1.2403  avg_val_loss: 1.2300  time: 362s
INFO:__main__:Epoch 4 - avg_train_loss: 1.2403  avg_val_loss: 1.2300  time: 362s
Epoch 4 - Score: 0.1246
INFO:__main__:Epoch 4 - Score: 0.1246


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [5][0/125] Elapsed 0m 3s (remain 6m 25s) 
Epoch: [5][50/125] Elapsed 2m 19s (remain 3m 21s) 
Epoch: [5][100/125] Elapsed 4m 41s (remain 1m 6s) 
Epoch: [5][124/125] Elapsed 5m 41s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 5 - avg_train_loss: 1.2325  avg_val_loss: 1.2445  time: 365s
INFO:__main__:Epoch 5 - avg_train_loss: 1.2325  avg_val_loss: 1.2445  time: 365s
Epoch 5 - Score: 0.1143
INFO:__main__:Epoch 5 - Score: 0.1143


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [6][0/125] Elapsed 0m 3s (remain 7m 0s) 
Epoch: [6][50/125] Elapsed 2m 21s (remain 3m 24s) 
Epoch: [6][100/125] Elapsed 4m 30s (remain 1m 4s) 
Epoch: [6][124/125] Elapsed 5m 42s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 6 - avg_train_loss: 1.2178  avg_val_loss: 1.2341  time: 366s
INFO:__main__:Epoch 6 - avg_train_loss: 1.2178  avg_val_loss: 1.2341  time: 366s
Epoch 6 - Score: 0.1235
INFO:__main__:Epoch 6 - Score: 0.1235


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [7][0/125] Elapsed 0m 4s (remain 8m 30s) 
Epoch: [7][50/125] Elapsed 2m 12s (remain 3m 12s) 
Epoch: [7][100/125] Elapsed 4m 29s (remain 1m 4s) 
Epoch: [7][124/125] Elapsed 5m 43s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 7 - avg_train_loss: 1.2128  avg_val_loss: 1.2377  time: 368s
INFO:__main__:Epoch 7 - avg_train_loss: 1.2128  avg_val_loss: 1.2377  time: 368s
Epoch 7 - Score: 0.1143
INFO:__main__:Epoch 7 - Score: 0.1143


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [8][0/125] Elapsed 0m 2s (remain 5m 16s) 
Epoch: [8][50/125] Elapsed 2m 21s (remain 3m 25s) 
Epoch: [8][100/125] Elapsed 4m 39s (remain 1m 6s) 
Epoch: [8][124/125] Elapsed 5m 46s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 8 - avg_train_loss: 1.1838  avg_val_loss: 1.2442  time: 370s
INFO:__main__:Epoch 8 - avg_train_loss: 1.1838  avg_val_loss: 1.2442  time: 370s
Epoch 8 - Score: 0.1235
INFO:__main__:Epoch 8 - Score: 0.1235


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [9][0/125] Elapsed 0m 2s (remain 4m 49s) 
Epoch: [9][50/125] Elapsed 2m 27s (remain 3m 34s) 
Epoch: [9][100/125] Elapsed 4m 47s (remain 1m 8s) 
Epoch: [9][124/125] Elapsed 5m 49s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 9 - avg_train_loss: 1.1987  avg_val_loss: 1.2404  time: 373s
INFO:__main__:Epoch 9 - avg_train_loss: 1.1987  avg_val_loss: 1.2404  time: 373s
Epoch 9 - Score: 0.1235
INFO:__main__:Epoch 9 - Score: 0.1235


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [10][0/125] Elapsed 0m 2s (remain 4m 34s) 
Epoch: [10][50/125] Elapsed 2m 19s (remain 3m 22s) 
Epoch: [10][100/125] Elapsed 4m 32s (remain 1m 4s) 
Epoch: [10][124/125] Elapsed 5m 43s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 2s (remain 0m 38s) 


Epoch 10 - avg_train_loss: 1.1944  avg_val_loss: 1.2419  time: 367s
INFO:__main__:Epoch 10 - avg_train_loss: 1.1944  avg_val_loss: 1.2419  time: 367s
Epoch 10 - Score: 0.1235
INFO:__main__:Epoch 10 - Score: 0.1235


EVAL: [15/16] Elapsed 0m 23s (remain 0m 0s) 


Score: 0.1246
INFO:__main__:Score: 0.1246
-------------fold:4 training-------------
INFO:__main__:-------------fold:4 training-------------


0.1246268656716418


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/125] Elapsed 0m 4s (remain 9m 16s) 
Epoch: [1][50/125] Elapsed 2m 20s (remain 3m 24s) 
Epoch: [1][100/125] Elapsed 4m 29s (remain 1m 3s) 
Epoch: [1][124/125] Elapsed 5m 28s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 1 - avg_train_loss: 0.7757  avg_val_loss: 0.4700  time: 358s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7757  avg_val_loss: 0.4700  time: 358s
Epoch 1 - Score: 0.7856
INFO:__main__:Epoch 1 - Score: 0.7856
Epoch 1 - Save Best Score: 0.7856 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7856 Model


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [2][0/125] Elapsed 0m 2s (remain 4m 8s) 
Epoch: [2][50/125] Elapsed 2m 20s (remain 3m 23s) 
Epoch: [2][100/125] Elapsed 4m 31s (remain 1m 4s) 
Epoch: [2][124/125] Elapsed 5m 31s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 2 - avg_train_loss: 0.4103  avg_val_loss: 0.5354  time: 360s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4103  avg_val_loss: 0.5354  time: 360s
Epoch 2 - Score: 0.7063
INFO:__main__:Epoch 2 - Score: 0.7063


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [3][0/125] Elapsed 0m 1s (remain 3m 31s) 
Epoch: [3][50/125] Elapsed 2m 13s (remain 3m 13s) 
Epoch: [3][100/125] Elapsed 4m 22s (remain 1m 2s) 
Epoch: [3][124/125] Elapsed 5m 33s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 3 - avg_train_loss: 0.2762  avg_val_loss: 0.4727  time: 362s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2762  avg_val_loss: 0.4727  time: 362s
Epoch 3 - Score: 0.7830
INFO:__main__:Epoch 3 - Score: 0.7830


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [4][0/125] Elapsed 0m 2s (remain 4m 49s) 
Epoch: [4][50/125] Elapsed 2m 22s (remain 3m 26s) 
Epoch: [4][100/125] Elapsed 4m 25s (remain 1m 3s) 
Epoch: [4][124/125] Elapsed 5m 20s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 4 - avg_train_loss: 0.1608  avg_val_loss: 0.4335  time: 350s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1608  avg_val_loss: 0.4335  time: 350s
Epoch 4 - Score: 0.7958
INFO:__main__:Epoch 4 - Score: 0.7958
Epoch 4 - Save Best Score: 0.7958 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7958 Model


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [5][0/125] Elapsed 0m 6s (remain 14m 3s) 
Epoch: [5][50/125] Elapsed 2m 5s (remain 3m 2s) 
Epoch: [5][100/125] Elapsed 4m 25s (remain 1m 3s) 
Epoch: [5][124/125] Elapsed 5m 29s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 5 - avg_train_loss: 0.0728  avg_val_loss: 0.5243  time: 358s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0728  avg_val_loss: 0.5243  time: 358s
Epoch 5 - Score: 0.7512
INFO:__main__:Epoch 5 - Score: 0.7512


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [6][0/125] Elapsed 0m 2s (remain 5m 51s) 
Epoch: [6][50/125] Elapsed 2m 13s (remain 3m 13s) 
Epoch: [6][100/125] Elapsed 4m 30s (remain 1m 4s) 
Epoch: [6][124/125] Elapsed 5m 29s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 6 - avg_train_loss: 0.0272  avg_val_loss: 0.4723  time: 358s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0272  avg_val_loss: 0.4723  time: 358s
Epoch 6 - Score: 0.7849
INFO:__main__:Epoch 6 - Score: 0.7849


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [7][0/125] Elapsed 0m 2s (remain 5m 26s) 
Epoch: [7][50/125] Elapsed 2m 24s (remain 3m 29s) 
Epoch: [7][100/125] Elapsed 4m 32s (remain 1m 4s) 
Epoch: [7][124/125] Elapsed 5m 23s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 7 - avg_train_loss: 0.0050  avg_val_loss: 0.5260  time: 353s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0050  avg_val_loss: 0.5260  time: 353s
Epoch 7 - Score: 0.8032
INFO:__main__:Epoch 7 - Score: 0.8032
Epoch 7 - Save Best Score: 0.8032 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.8032 Model


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [8][0/125] Elapsed 0m 1s (remain 2m 54s) 
Epoch: [8][50/125] Elapsed 2m 4s (remain 3m 0s) 
Epoch: [8][100/125] Elapsed 4m 16s (remain 1m 0s) 
Epoch: [8][124/125] Elapsed 5m 24s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 8 - avg_train_loss: 0.0028  avg_val_loss: 0.5544  time: 353s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0028  avg_val_loss: 0.5544  time: 353s
Epoch 8 - Score: 0.7877
INFO:__main__:Epoch 8 - Score: 0.7877


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [9][0/125] Elapsed 0m 2s (remain 5m 51s) 
Epoch: [9][50/125] Elapsed 2m 5s (remain 3m 2s) 
Epoch: [9][100/125] Elapsed 4m 30s (remain 1m 4s) 
Epoch: [9][124/125] Elapsed 5m 28s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 9 - avg_train_loss: 0.0033  avg_val_loss: 0.5540  time: 357s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0033  avg_val_loss: 0.5540  time: 357s
Epoch 9 - Score: 0.7919
INFO:__main__:Epoch 9 - Score: 0.7919


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 
Epoch: [10][0/125] Elapsed 0m 3s (remain 7m 27s) 
Epoch: [10][50/125] Elapsed 2m 12s (remain 3m 11s) 
Epoch: [10][100/125] Elapsed 4m 38s (remain 1m 6s) 
Epoch: [10][124/125] Elapsed 5m 37s (remain 0m 0s) 
EVAL: [0/16] Elapsed 0m 1s (remain 0m 23s) 


Epoch 10 - avg_train_loss: 0.0017  avg_val_loss: 0.5546  time: 367s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0017  avg_val_loss: 0.5546  time: 367s
Epoch 10 - Score: 0.7919
INFO:__main__:Epoch 10 - Score: 0.7919


EVAL: [15/16] Elapsed 0m 28s (remain 0m 0s) 


Score: 0.8032
INFO:__main__:Score: 0.8032
Score: 0.7150
INFO:__main__:Score: 0.7150


0.8032122646409309


In [20]:
A = pd.read_csv(OUTPUT_MODEL_DIR+'oof_df.csv')
A.head()

Unnamed: 0,id,description,label,inputs,kfold,Data scientist,Machine learning engineer,Software engineer,Consultant
0,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re...",0,-5.266695,-1.041337,7.242693,-1.686741
1,13,"Research, prototype, identify, and build predi...",1,"Research, prototype, identify, and build predi...",0,6.823453,0.385432,-4.989508,-2.207274
2,21,Execute on assigned project tasksPrepare mater...,3,Execute on assigned project tasksPrepare mater...,0,-0.024108,-5.392136,-2.679445,7.485084
3,27,The successful candidate will:Perform industry...,0,The successful candidate will:Perform industry...,0,6.419773,-4.326248,-3.910087,2.441678
4,28,Build network within the Capgemini group and ...,3,Build network within the Capgemini group and ...,0,-2.738999,-4.729551,-0.555396,7.123335
