In [1]:
# Mount Google Drive: the input/output directories configured below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 6.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1


In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.3 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Competition data is read from INPUT_DIR (train.csv, test.csv, submit_sample.csv).
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
# Root for everything this notebook writes; the training log goes directly under it.
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
# NOTE(review): not referenced by any cell visible in this notebook section.
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
# Tokenizer files, model config, per-fold checkpoints and out-of-fold predictions are saved here.
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-large[ver3]/')

In [6]:
class CFG:
    """Experiment configuration, read as class attributes (``CFG.<name>``) throughout the notebook."""

    wandb = False                  # when True, train_loop calls wandb.log after every epoch
    apex = True                    # NOTE(review): not read by any code visible in this notebook
    model = 'microsoft/deberta-large'  # checkpoint name for AutoTokenizer / AutoModel / AutoConfig
    seed = 42                      # default for seed_everything and random_state of the fold split
    n_splits = 5                   # number of StratifiedKFold splits
    max_len = 1024                 # tokenizer truncation length
    dropout = 0.2                  # p of CustomModel.drop
    target_size = 4                # number of output classes of the classification head
    n_accumulate = 1               # batches accumulated per optimizer step
    print_freq = 50                # print training/eval progress every this many steps
    min_lr = 1e-6                  # NOTE(review): not read by any code visible in this notebook
    scheduler = 'cosine'           # 'linear' or 'cosine', selected in get_scheduler
    batch_size = 16                # training batch size; validation uses twice this
    num_workers = 2                # DataLoader worker processes
    lr = 3e-5                      # AdamW learning rate
    weight_decay = 0.01            # AdamW weight decay
    weigth_decay = weight_decay    # original (misspelled) name, kept so existing references keep working
    epochs = 10                    # training epochs per fold
    n_fold = 5                     # folds iterated by the driver cell; should match n_splits
    trn_fold = [0, 1, 2, 3, 4]     # subset of folds that are actually trained
    train = True                   # gate for the training driver cell
    num_warmup_steps = 0           # warm-up steps passed to the LR schedule
    num_cycles = 0.5               # cosine cycles passed to the cosine schedule
    debug = False                  # NOTE(review): not read by any code visible in this notebook
    debug_ver2 = False             # NOTE(review): not read by any code visible in this notebook
    gradient_checkpointing = True  # CustomModel enables gradient checkpointing on the backbone
    freezing = True                # CustomModel freezes the embeddings and the first 2 encoder layers

In [7]:
# Loss Func
def criterion(outputs, labels):
    """Cross-entropy loss (default mean reduction) of class logits ``outputs`` against ``labels``."""
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)

def softmax(z):
    """Row-wise softmax of a 2-D NumPy array.

    Each row's maximum is subtracted before exponentiating so that large
    inputs do not overflow; the result has the same shape as ``z``.
    """
    assert len(z.shape) == 2
    # [:, np.newaxis] turns the per-row vectors into columns so they broadcast across each row.
    shifted = z - np.max(z, axis=1)[:, np.newaxis]
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=1)[:, np.newaxis]
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 of the arg-max class of ``outputs`` against ``labels``.

    Args:
        outputs: 2-D array-like of per-class scores, one row per sample
            (raw logits are fine).
        labels: 1-D array-like of integer class ids, one per row of ``outputs``.

    Returns:
        The macro F1 score.
    """
    # Softmax is strictly increasing within a row, so the arg-max of the raw
    # scores is already the predicted class. Skipping it also avoids calling
    # F.softmax without an explicit ``dim``.
    predicted = np.argmax(np.asarray(outputs), axis=1)
    # scikit-learn's signature is f1_score(y_true, y_pred).
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing bare messages to the console and to ``<filename>.log``.

    Safe to call repeatedly (e.g. when the cell is re-run): handlers attached
    by an earlier call are removed and closed first, so every message is
    emitted once per destination.

    Args:
        filename: Log file path without the ``.log`` extension.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object for the same name, so without this
    # every re-run would stack another pair of handlers on it.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    # Keep records away from the root logger's handlers; otherwise each
    # message is printed a second time with an "INFO:<name>:" prefix.
    logger.propagate = False

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and torch (CPU and every CUDA device) and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child Python processes; the running interpreter's hash
    # seed was already fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # also covers additional GPUs, not just the current one
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=CFG.seed)

def prepare_input(cfg, text, text_2=None):
    """Tokenize ``text`` (optionally paired with ``text_2``) into ``torch.long`` tensors.

    The text is encoded with ``cfg.tokenizer``, padded and truncated to
    ``cfg.max_len``. Every field of the returned encoding is replaced by a
    long tensor, and that same encoding object is returned.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        truncation=True,
        max_length=cfg.max_len,
        add_special_tokens=True,
        padding="max_length",
    )

    # Iterate over a snapshot of the keys because the values are overwritten in place.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded

In [8]:
def freeze(module):
    """Turn off gradient tracking for every parameter of ``module``."""
    for weight in module.parameters():
        weight.requires_grad = False
        
def get_freezed_parameters(module):
    """List the names of ``module``'s parameters that do not require gradients."""
    return [
        name
        for name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an optimizer override of ``optim_bits`` for the ``weight`` of each
    ``word_embeddings`` / ``position_embeddings`` / ``token_type_embeddings``
    attribute that exists on ``embeddings_path``; missing attributes are skipped.

    NOTE(review): ``bnb`` is not imported in any cell visible here (presumably
    ``import bitsandbytes as bnb`` -- confirm), so calling this raises NameError
    as soon as one of the attributes is present. Nothing visible calls it.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
# Load the competition files from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'))

# Quick look at the first rows and the (rows, columns) shape of each frame.
display(train.head())
print(train.shape)
display(test.head())
print(test.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


In [10]:
def remove_tag(x):
    """Return ``x`` with every ``<...>`` tag removed (non-greedy match up to the next ``>``)."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Strip HTML tags from every string in ``texts`` and return the results as a new list."""
    # A further step that replaced every non-alphabetic character with a space
    # (re.sub(r'[^a-zA-Z]', ' ', text)) is left disabled.
    return [remove_tag(text) for text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the UTF-8 bytes of the span that failed to encode.

    Returns the replacement bytes and the position at which encoding resumes.
    """
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the span that failed to decode as cp1252 instead.

    Returns the replacement text and the position at which decoding resumes.
    """
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register the two fallback handlers under the names that
# resolve_encodings_and_normalize passes as `errors=` when it encodes to
# cp1252 and decodes from utf-8.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 text and transliterate the result with ``unidecode``.

    The text is round-tripped through bytes twice. Spans that are not valid
    utf-8 are decoded as cp1252, and characters that cp1252 cannot represent
    are encoded as utf-8, via the error handlers registered under those names.
    """
    escaped = text.encode("raw_unicode_escape")
    first_pass = escaped.decode("utf-8", errors="replace_decoding_with_cp1252")
    recoded = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = recoded.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

# Strip HTML tags, then repair encoding problems and normalize the text the model will see.
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
train['inputs'] = train['description'].apply(resolve_encodings_and_normalize)
test['inputs'] = test['description'].apply(resolve_encodings_and_normalize)

# Keep the first occurrence of each distinct cleaned text. The index is reset
# so that positional indices from the fold splitter can be used as row labels.
train = train.drop_duplicates(subset=['inputs'], keep='first').reset_index(drop=True)

# jobflag is 1-based; the loss expects 0-based class ids. The guard makes the
# cell safe to re-run: without it a second run would shift the labels again.
if 'jobflag' in train.columns:
    train = train.rename(columns={"jobflag": "label"})
    train["label"] = train["label"] - 1
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,2,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,3,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,3,Quantify the resources required for a task/pro...
...,...,...,...,...
1499,1511,"Support detailed reporting, statistical analys...",0,"Support detailed reporting, statistical analys..."
1500,1512,Collaborate with teams to support the ML techn...,1,Collaborate with teams to support the ML techn...
1501,1513,Work with executives and other business leade...,0,Work with executives and other business leade...
1502,1514,Leading design ideation sessions to ensure we ...,2,Leading design ideation sessions to ensure we ...


In [11]:
# Assign every training row to exactly one validation fold, stratified by label.
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.label)):
    # val_ holds positional indices; using them with .loc relies on `train`
    # having a default 0..n-1 index (it is reset in the preprocessing cell).
    train.loc[val_ , "kfold"] = int(fold)
    
# Cast the new column to int once every row has been assigned.
train["kfold"] = train["kfold"].astype(int)

In [12]:
# Load the tokenizer that matches the backbone and keep a copy next to the model outputs.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
# Stored on CFG so the dataset/collate code can reach it through the config.
CFG.tokenizer = tokenizer
# NOTE(review): SEP is not used by any cell visible in this notebook section.
SEP = tokenizer.sep_token

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [13]:
class Dataset(torch.utils.data.Dataset):
    """Tokenizes the ``inputs`` column of a DataFrame one row at a time.

    Samples are returned un-padded (plain Python lists); padding to a common
    length is left to the collate function.

    The base class is named explicitly because this class reuses the name
    ``Dataset``: with ``class Dataset(Dataset)`` a re-run of the cell would
    inherit from the previous definition of this class instead of torch's.

    Args:
        df: DataFrame with an ``inputs`` (text) and a ``label`` column.
        tokenizer: Object exposing ``encode_plus(text, truncation=...,
            add_special_tokens=..., max_length=...)``.
        max_length: Truncation length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Use the arguments the caller passed rather than module-level globals.
        self.max_len = max_length
        self.text = df['inputs'].values
        self.tokenizer = tokenizer
        self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask``, ``target`` and,
        when the tokenizer produces them, ``token_type_ids``."""
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'target': self.targets[index]
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [14]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    """Pads a list of tokenized samples to the longest sequence in the batch.

    Padding follows ``tokenizer.padding_side``: ``input_ids`` are filled with
    ``tokenizer.pad_token_id`` and ``attention_mask`` with 0. The result is a
    dict of ``torch.long`` tensors; ``target`` is included only when
    ``isTrain`` is true.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        token_ids = [sample["input_ids"] for sample in batch]
        masks = [sample["attention_mask"] for sample in batch]
        targets = [sample["target"] for sample in batch] if self.isTrain else None

        # Pad only up to the longest sequence of this batch.
        longest = max(len(seq) for seq in token_ids)
        pad_on_right = self.tokenizer.padding_side == "right"

        def _pad(seq, fill):
            filler = (longest - len(seq)) * [fill]
            return seq + filler if pad_on_right else filler + seq

        output = dict()
        output["input_ids"] = torch.tensor(
            [_pad(seq, self.tokenizer.pad_token_id) for seq in token_ids], dtype=torch.long
        )
        output["attention_mask"] = torch.tensor(
            [_pad(seq, 0) for seq in masks], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output
    
# Module-level collate function; train_loop passes it to both the training and validation DataLoader.
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [15]:
class MeanPooling(nn.Module):
    """Mean of the hidden states over dim 1, counting only positions where the mask is 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * weights).sum(1)
        # The clamp keeps the division finite when a row's mask is all zeros.
        counts = weights.sum(1).clamp(min=1e-9)
        return masked_sum / counts

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Pretrained transformer backbone with a linear classification head.

    ``forward`` takes the hidden state at sequence position 0, passes it
    through the same linear head under five dropout rates (0.1 to 0.5) and
    returns the average of the five logit tensors.
    """

    def __init__(self, model_name):
        super(CustomModel, self).__init__()
        # Backbone loaded by checkpoint name.
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # NOTE(review): this stores a one-shot `filter` iterator on the global
            # CFG; nothing visible in this notebook consumes it.
            CFG.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        # Config is fetched separately by name; only hidden_size and
        # initializer_range are read from it in this class.
        self.config = AutoConfig.from_pretrained(model_name)
        # NOTE(review): self.drop is created but not used in forward().
        self.drop = nn.Dropout(p=CFG.dropout)
        #self.pooler = MeanPooling()
        # Five dropout rates applied to the same pooled vector in forward().
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Sequential( nn.Linear(self.config.hidden_size, CFG.target_size) )

    # NOTE(review): _init_weights is never called within this class, so the
    # head keeps nn.Linear's default initialization unless a caller applies it.
    def _init_weights(self, module):
        """Initialize Linear / Embedding weights from N(0, initializer_range) and reset LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def forward(self, ids, mask):        
        """Return averaged class logits for token ids ``ids`` and attention mask ``mask``."""
        output = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        # First element of the backbone output, sliced at sequence position 0.
        output = output[0][:, 0, :]
        # Same head, five dropout rates; the logits are averaged below.
        logits1 = self.output(self.dropout1(output))
        logits2 = self.output(self.dropout2(output))
        logits3 = self.output(self.dropout3(output))
        logits4 = self.output(self.dropout4(output))
        logits5 = self.output(self.dropout5(output))
        outputs = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return outputs

In [17]:
def asMinutes(s):
    """Format a duration of ``s`` seconds as ``"<minutes>m <seconds>s"``, both shown as integers."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "{:d}m {:d}s".format(int(whole_minutes), int(leftover_seconds))

def timeSince(since, percent):
    """Describe the time elapsed since ``since`` and the estimated time remaining.

    ``percent`` is the fraction of work already done. The total is
    extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the learning-rate schedule selected by ``cfg.scheduler``.

    Args:
        cfg: Config exposing ``scheduler`` ('linear' or 'cosine'),
            ``num_warmup_steps`` and, for the cosine schedule, ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps for the whole run.

    Returns:
        The scheduler created by the matching transformers helper.

    Raises:
        ValueError: If ``cfg.scheduler`` is not one of the supported names.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    # Fail with a clear message instead of an UnboundLocalError further down.
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'."
    )

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` and return the mean per-sample loss.

    Gradients are accumulated over ``CFG.n_accumulate`` batches before each
    optimizer (and scheduler) step.

    Args:
        model: Module called as ``model(ids, mask)`` and returning class logits.
        optimizer: Optimizer stepped every ``CFG.n_accumulate`` batches.
        scheduler: LR scheduler stepped together with the optimizer, or None.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and ``target``.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index, used only for progress output.

    Returns:
        Sample-weighted mean of the batch losses, or 0.0 if the dataloader
        yields no batches.
    """
    model.train()

    dataset_size = 0
    running_loss = 0
    epoch_loss = 0.0  # stays defined even when the dataloader yields no batches
    num_steps = len(dataloader)

    start = time.time()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        # Only the gradient is scaled for accumulation; the reported loss
        # keeps its true magnitude regardless of CFG.n_accumulate.
        (loss / CFG.n_accumulate).backward()
        if (step + 1) % CFG.n_accumulate == 0:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        if step % CFG.print_freq == 0 or step == (num_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, num_steps,
                          remain=timeSince(start, float(step+1)/num_steps)))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Returns a tuple ``(mean per-sample loss, logits)`` where the logits of all
    batches are concatenated, in dataloader order, into one NumPy array.
    """
    model.eval()

    total_batches = len(dataloader)
    samples_seen = 0
    loss_sum = 0
    batch_logits = []
    began = time.time()

    for step, batch in enumerate(dataloader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['target'].to(device, dtype=torch.long)
        n_samples = ids.size(0)

        logits = model(ids, mask)
        batch_loss = criterion(logits, targets)
        batch_logits.append(logits.to('cpu').numpy())

        # Weight by batch size so a smaller final batch does not skew the mean.
        loss_sum += (batch_loss.item() * n_samples)
        samples_seen += n_samples

        is_last = step == (total_batches - 1)
        if step % CFG.print_freq == 0 or is_last:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, total_batches,
                          remain=timeSince(began, float(step+1)/total_batches)))

    stacked = np.concatenate(batch_logits)

    return loss_sum / samples_seen, stacked

In [18]:
def train_loop(fold):
    """Train and validate one cross-validation fold.

    Rows with ``kfold == fold`` form the validation set and all other rows the
    training set. After every epoch the macro-F1 on the validation set is
    computed. Whenever it improves, the model weights and validation logits
    are saved to ``OUTPUT_MODEL_DIR``.

    Args:
        fold: Index of the fold held out for validation.

    Returns:
        The validation DataFrame with one extra column per class holding the
        logits of the best-scoring epoch.

    Raises:
        RuntimeError: If no epoch produced predictions (e.g. ``CFG.epochs`` is 0).
    """
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)

    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

    model = CustomModel(CFG.model)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    # The scheduler is stepped once per optimizer step, so count those:
    # len(train_loader) already reflects drop_last, and an optimizer step
    # happens once every n_accumulate batches within an epoch.
    num_train_steps = (len(train_loader) // CFG.n_accumulate) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    checkpoint_path = OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    # Start below any reachable score so the first epoch is always saved and a
    # checkpoint left over from an earlier run can never be picked up instead.
    best_score = -np.inf
    best_pred = None

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            # NOTE(review): wandb is not imported in any cell visible here.
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": train_epoch_loss,
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})

        if score > best_score:
            best_score = score
            best_pred = pred
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        checkpoint_path)

    if best_pred is None:
        raise RuntimeError(f"fold {fold}: no epoch produced validation predictions")

    # The best predictions are kept in memory, so the checkpoint does not have
    # to be read back from disk just to recover them.
    predictions = best_pred
    valid_data['Data scientist'] = predictions[:, 0]
    valid_data['Machine learning engineer'] = predictions[:, 1]
    valid_data['Software engineer'] = predictions[:, 2]
    valid_data['Consultant'] = predictions[:, 3]

    temp = valid_data[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    # Drop the references first; otherwise the memory is still in use when the
    # cache is emptied.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    return valid_data

In [19]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the macro-F1 of the per-class prediction columns of ``oof_df`` against its labels."""
        labels = oof_df['label'].values
        preds = oof_df[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead
        # of growing a DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)

        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info("========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
            oof_df.to_csv(OUTPUT_MODEL_DIR+'oof_df.csv', index=False)
        else:
            # With no fold selected there is nothing to score or save.
            LOGGER.warning("No fold in range(CFG.n_fold) is listed in CFG.trn_fold; nothing was trained.")

-------------fold:0 training-------------
INFO:__main__:-------------fold:0 training-------------


Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 7s (remain 8m 57s) 
Epoch: [1][50/75] Elapsed 3m 46s (remain 1m 46s) 
Epoch: [1][74/75] Elapsed 5m 48s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 1 - avg_train_loss: 0.9592  avg_val_loss: 0.6611  time: 379s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9592  avg_val_loss: 0.6611  time: 379s
Epoch 1 - Score: 0.5669
INFO:__main__:Epoch 1 - Score: 0.5669
Epoch 1 - Save Best Score: 0.5669 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5669 Model


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 3s (remain 4m 14s) 
Epoch: [2][50/75] Elapsed 3m 54s (remain 1m 50s) 
Epoch: [2][74/75] Elapsed 5m 45s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 2 - avg_train_loss: 0.6021  avg_val_loss: 0.5999  time: 376s
INFO:__main__:Epoch 2 - avg_train_loss: 0.6021  avg_val_loss: 0.5999  time: 376s
Epoch 2 - Score: 0.7109
INFO:__main__:Epoch 2 - Score: 0.7109
Epoch 2 - Save Best Score: 0.7109 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7109 Model


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 3s (remain 4m 14s) 
Epoch: [3][50/75] Elapsed 4m 7s (remain 1m 56s) 
Epoch: [3][74/75] Elapsed 5m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 3 - avg_train_loss: 0.3940  avg_val_loss: 0.6996  time: 388s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3940  avg_val_loss: 0.6996  time: 388s
Epoch 3 - Score: 0.7393
INFO:__main__:Epoch 3 - Score: 0.7393
Epoch 3 - Save Best Score: 0.7393 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7393 Model


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 7s (remain 9m 19s) 
Epoch: [4][50/75] Elapsed 3m 53s (remain 1m 49s) 
Epoch: [4][74/75] Elapsed 5m 39s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 4 - avg_train_loss: 0.1801  avg_val_loss: 0.8315  time: 370s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1801  avg_val_loss: 0.8315  time: 370s
Epoch 4 - Score: 0.6946
INFO:__main__:Epoch 4 - Score: 0.6946


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 7s (remain 9m 19s) 
Epoch: [5][50/75] Elapsed 4m 1s (remain 1m 53s) 
Epoch: [5][74/75] Elapsed 5m 52s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 5 - avg_train_loss: 0.0808  avg_val_loss: 0.9803  time: 383s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0808  avg_val_loss: 0.9803  time: 383s
Epoch 5 - Score: 0.7057
INFO:__main__:Epoch 5 - Score: 0.7057


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 2s (remain 3m 20s) 
Epoch: [6][50/75] Elapsed 4m 2s (remain 1m 54s) 
Epoch: [6][74/75] Elapsed 5m 51s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 30s) 


Epoch 6 - avg_train_loss: 0.0435  avg_val_loss: 0.9514  time: 382s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0435  avg_val_loss: 0.9514  time: 382s
Epoch 6 - Score: 0.7325
INFO:__main__:Epoch 6 - Score: 0.7325


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 2s (remain 3m 7s) 
Epoch: [7][50/75] Elapsed 4m 0s (remain 1m 53s) 
Epoch: [7][74/75] Elapsed 5m 36s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 7 - avg_train_loss: 0.0070  avg_val_loss: 1.0592  time: 366s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0070  avg_val_loss: 1.0592  time: 366s
Epoch 7 - Score: 0.7558
INFO:__main__:Epoch 7 - Score: 0.7558
Epoch 7 - Save Best Score: 0.7558 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.7558 Model


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 4s (remain 5m 6s) 
Epoch: [8][50/75] Elapsed 3m 58s (remain 1m 52s) 
Epoch: [8][74/75] Elapsed 5m 50s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 8 - avg_train_loss: 0.0034  avg_val_loss: 1.0893  time: 381s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0034  avg_val_loss: 1.0893  time: 381s
Epoch 8 - Score: 0.7385
INFO:__main__:Epoch 8 - Score: 0.7385


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 6s (remain 8m 5s) 
Epoch: [9][50/75] Elapsed 3m 49s (remain 1m 47s) 
Epoch: [9][74/75] Elapsed 5m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 28s) 


Epoch 9 - avg_train_loss: 0.0023  avg_val_loss: 1.0966  time: 387s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0023  avg_val_loss: 1.0966  time: 387s
Epoch 9 - Score: 0.7385
INFO:__main__:Epoch 9 - Score: 0.7385


EVAL: [9/10] Elapsed 0m 29s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 5s (remain 6m 31s) 
Epoch: [10][50/75] Elapsed 4m 13s (remain 1m 59s) 
Epoch: [10][74/75] Elapsed 5m 50s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 3s (remain 0m 29s) 


Epoch 10 - avg_train_loss: 0.0023  avg_val_loss: 1.0955  time: 380s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0023  avg_val_loss: 1.0955  time: 380s
Epoch 10 - Score: 0.7360
INFO:__main__:Epoch 10 - Score: 0.7360


EVAL: [9/10] Elapsed 0m 30s (remain 0m 0s) 
0.7558494527348809


Score: 0.7558
INFO:__main__:Score: 0.7558
-------------fold:1 training-------------
INFO:__main__:-------------fold:1 training-------------
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 3s (remain 4m 31s) 
Epoch: [1][50/75] Elapsed 4m 10s (remain 1m 57s) 
Epoch: [1][74/75] Elapsed 6m 8s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 1 - avg_train_loss: 0.9803  avg_val_loss: 1.0936  time: 395s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9803  avg_val_loss: 1.0936  time: 395s
Epoch 1 - Score: 0.5789
INFO:__main__:Epoch 1 - Score: 0.5789
Epoch 1 - Save Best Score: 0.5789 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5789 Model


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 2s (remain 3m 1s) 
Epoch: [2][50/75] Elapsed 3m 57s (remain 1m 51s) 
Epoch: [2][74/75] Elapsed 5m 59s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 2 - avg_train_loss: 0.5408  avg_val_loss: 0.6502  time: 386s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5408  avg_val_loss: 0.6502  time: 386s
Epoch 2 - Score: 0.6489
INFO:__main__:Epoch 2 - Score: 0.6489
Epoch 2 - Save Best Score: 0.6489 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6489 Model


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 5s (remain 7m 2s) 
Epoch: [3][50/75] Elapsed 3m 53s (remain 1m 50s) 
Epoch: [3][74/75] Elapsed 5m 51s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 3 - avg_train_loss: 0.2875  avg_val_loss: 0.6477  time: 379s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2875  avg_val_loss: 0.6477  time: 379s
Epoch 3 - Score: 0.6989
INFO:__main__:Epoch 3 - Score: 0.6989
Epoch 3 - Save Best Score: 0.6989 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6989 Model


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 4s (remain 6m 5s) 
Epoch: [4][50/75] Elapsed 3m 46s (remain 1m 46s) 
Epoch: [4][74/75] Elapsed 6m 2s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 4 - avg_train_loss: 0.1720  avg_val_loss: 0.7172  time: 389s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1720  avg_val_loss: 0.7172  time: 389s
Epoch 4 - Score: 0.7243
INFO:__main__:Epoch 4 - Score: 0.7243
Epoch 4 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7243 Model


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 5s (remain 6m 31s) 
Epoch: [5][50/75] Elapsed 4m 10s (remain 1m 58s) 
Epoch: [5][74/75] Elapsed 6m 1s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 5 - avg_train_loss: 0.0566  avg_val_loss: 0.8998  time: 388s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0566  avg_val_loss: 0.8998  time: 388s
Epoch 5 - Score: 0.7125
INFO:__main__:Epoch 5 - Score: 0.7125


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 3s (remain 4m 5s) 
Epoch: [6][50/75] Elapsed 3m 48s (remain 1m 47s) 
Epoch: [6][74/75] Elapsed 5m 47s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 6 - avg_train_loss: 0.0108  avg_val_loss: 0.9588  time: 375s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0108  avg_val_loss: 0.9588  time: 375s
Epoch 6 - Score: 0.7410
INFO:__main__:Epoch 6 - Score: 0.7410
Epoch 6 - Save Best Score: 0.7410 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7410 Model


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 2s (remain 3m 5s) 
Epoch: [7][50/75] Elapsed 3m 59s (remain 1m 52s) 
Epoch: [7][74/75] Elapsed 5m 49s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 7 - avg_train_loss: 0.0192  avg_val_loss: 0.9981  time: 376s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0192  avg_val_loss: 0.9981  time: 376s
Epoch 7 - Score: 0.7155
INFO:__main__:Epoch 7 - Score: 0.7155


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 3s (remain 4m 49s) 
Epoch: [8][50/75] Elapsed 4m 9s (remain 1m 57s) 
Epoch: [8][74/75] Elapsed 5m 52s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 8 - avg_train_loss: 0.0068  avg_val_loss: 1.0144  time: 380s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0068  avg_val_loss: 1.0144  time: 380s
Epoch 8 - Score: 0.7217
INFO:__main__:Epoch 8 - Score: 0.7217


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 5s (remain 7m 7s) 
Epoch: [9][50/75] Elapsed 4m 17s (remain 2m 1s) 
Epoch: [9][74/75] Elapsed 6m 4s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 9 - avg_train_loss: 0.0039  avg_val_loss: 1.0195  time: 391s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0039  avg_val_loss: 1.0195  time: 391s
Epoch 9 - Score: 0.7215
INFO:__main__:Epoch 9 - Score: 0.7215


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 2s (remain 3m 37s) 
Epoch: [10][50/75] Elapsed 4m 5s (remain 1m 55s) 
Epoch: [10][74/75] Elapsed 5m 57s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 4s (remain 0m 42s) 


Epoch 10 - avg_train_loss: 0.0032  avg_val_loss: 1.0205  time: 384s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0032  avg_val_loss: 1.0205  time: 384s
Epoch 10 - Score: 0.7215
INFO:__main__:Epoch 10 - Score: 0.7215


EVAL: [9/10] Elapsed 0m 26s (remain 0m 0s) 


Score: 0.7410
INFO:__main__:Score: 0.7410
-------------fold:2 training-------------
INFO:__main__:-------------fold:2 training-------------


0.7409758232826322


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 2s (remain 3m 2s) 
Epoch: [1][50/75] Elapsed 3m 55s (remain 1m 50s) 
Epoch: [1][74/75] Elapsed 5m 43s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 1 - avg_train_loss: 0.9811  avg_val_loss: 0.6581  time: 367s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9811  avg_val_loss: 0.6581  time: 367s
Epoch 1 - Score: 0.6305
INFO:__main__:Epoch 1 - Score: 0.6305
Epoch 1 - Save Best Score: 0.6305 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6305 Model


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 2s (remain 2m 42s) 
Epoch: [2][50/75] Elapsed 3m 38s (remain 1m 42s) 
Epoch: [2][74/75] Elapsed 5m 47s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 2 - avg_train_loss: 0.5909  avg_val_loss: 0.7291  time: 371s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5909  avg_val_loss: 0.7291  time: 371s
Epoch 2 - Score: 0.5658
INFO:__main__:Epoch 2 - Score: 0.5658


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 2s (remain 3m 16s) 
Epoch: [3][50/75] Elapsed 3m 56s (remain 1m 51s) 
Epoch: [3][74/75] Elapsed 5m 54s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 3 - avg_train_loss: 0.4139  avg_val_loss: 0.6849  time: 378s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4139  avg_val_loss: 0.6849  time: 378s
Epoch 3 - Score: 0.7289
INFO:__main__:Epoch 3 - Score: 0.7289
Epoch 3 - Save Best Score: 0.7289 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7289 Model


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 4s (remain 4m 58s) 
Epoch: [4][50/75] Elapsed 4m 11s (remain 1m 58s) 
Epoch: [4][74/75] Elapsed 6m 9s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 4 - avg_train_loss: 0.2279  avg_val_loss: 0.7824  time: 393s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2279  avg_val_loss: 0.7824  time: 393s
Epoch 4 - Score: 0.7301
INFO:__main__:Epoch 4 - Score: 0.7301
Epoch 4 - Save Best Score: 0.7301 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7301 Model


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 5s (remain 6m 29s) 
Epoch: [5][50/75] Elapsed 4m 8s (remain 1m 57s) 
Epoch: [5][74/75] Elapsed 5m 56s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 5 - avg_train_loss: 0.0858  avg_val_loss: 0.9044  time: 380s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0858  avg_val_loss: 0.9044  time: 380s
Epoch 5 - Score: 0.7232
INFO:__main__:Epoch 5 - Score: 0.7232


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [6][0/75] Elapsed 0m 3s (remain 4m 13s) 
Epoch: [6][50/75] Elapsed 4m 5s (remain 1m 55s) 
Epoch: [6][74/75] Elapsed 6m 0s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 6 - avg_train_loss: 0.0389  avg_val_loss: 0.9918  time: 384s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0389  avg_val_loss: 0.9918  time: 384s
Epoch 6 - Score: 0.7411
INFO:__main__:Epoch 6 - Score: 0.7411
Epoch 6 - Save Best Score: 0.7411 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7411 Model


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [7][0/75] Elapsed 0m 10s (remain 12m 37s) 
Epoch: [7][50/75] Elapsed 4m 16s (remain 2m 0s) 
Epoch: [7][74/75] Elapsed 6m 6s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 7 - avg_train_loss: 0.0170  avg_val_loss: 1.1382  time: 390s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0170  avg_val_loss: 1.1382  time: 390s
Epoch 7 - Score: 0.6948
INFO:__main__:Epoch 7 - Score: 0.6948


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [8][0/75] Elapsed 0m 7s (remain 8m 54s) 
Epoch: [8][50/75] Elapsed 4m 6s (remain 1m 56s) 
Epoch: [8][74/75] Elapsed 6m 0s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 8 - avg_train_loss: 0.0054  avg_val_loss: 1.1033  time: 384s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0054  avg_val_loss: 1.1033  time: 384s
Epoch 8 - Score: 0.7420
INFO:__main__:Epoch 8 - Score: 0.7420
Epoch 8 - Save Best Score: 0.7420 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.7420 Model


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [9][0/75] Elapsed 0m 3s (remain 4m 13s) 
Epoch: [9][50/75] Elapsed 4m 36s (remain 2m 9s) 
Epoch: [9][74/75] Elapsed 6m 15s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 9 - avg_train_loss: 0.0042  avg_val_loss: 1.1188  time: 399s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0042  avg_val_loss: 1.1188  time: 399s
Epoch 9 - Score: 0.7420
INFO:__main__:Epoch 9 - Score: 0.7420


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 
Epoch: [10][0/75] Elapsed 0m 5s (remain 7m 1s) 
Epoch: [10][50/75] Elapsed 4m 29s (remain 2m 6s) 
Epoch: [10][74/75] Elapsed 6m 21s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 22s) 


Epoch 10 - avg_train_loss: 0.0034  avg_val_loss: 1.1206  time: 405s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0034  avg_val_loss: 1.1206  time: 405s
Epoch 10 - Score: 0.7420
INFO:__main__:Epoch 10 - Score: 0.7420


EVAL: [9/10] Elapsed 0m 23s (remain 0m 0s) 


Score: 0.7420
INFO:__main__:Score: 0.7420
-------------fold:3 training-------------
INFO:__main__:-------------fold:3 training-------------


0.7420202660606812


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 5s (remain 6m 34s) 
Epoch: [1][50/75] Elapsed 4m 8s (remain 1m 56s) 
Epoch: [1][74/75] Elapsed 5m 52s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 2s (remain 0m 18s) 


RuntimeError: ignored

In [None]:
# Read the 'oof_df.csv' file located under OUTPUT_MODEL_DIR back into a DataFrame
# (presumably the out-of-fold predictions written by the training loop — confirm
# against the cell that saves it), then preview its first rows.
A = pd.read_csv(f"{OUTPUT_MODEL_DIR}oof_df.csv")
A.head(n=5)