Reference: https://www.kaggle.com/code/kojimar/fb3-single-pytorch-model-train

In [None]:
# Mount Google Drive so files under MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')
# Make the notebook folder the working directory; relative paths used later are resolved from here.
%cd drive/MyDrive/kaggle/notebook

Mounted at /content/drive/
/content/drive/MyDrive/kaggle/notebook


In [None]:
%%capture
# Install pinned library versions; the %%capture cell magic above hides pip's output.
!pip install transformers==4.20.1
!pip install tokenizers==0.12.1
!pip install sentencepiece==0.1.97

In [None]:
import os
import gc
import sys

import time
import datetime
import math
import random

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint

import transformers
import tokenizers
print(f'transformers.__version__: {transformers.__version__}')
print(f'tokenizers.__version__: {tokenizers.__version__}')
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
os.environ['TOKENIZERS_PARALLELISM']='true'

transformers.__version__: 4.20.1
tokenizers.__version__: 0.12.1


In [None]:
class CFG:
    """Run configuration, used as a plain namespace of class attributes.

    Later cells read these values and also attach further attributes to the
    class (for example the tokenizer, data frames and output paths).
    """
    # Timestamp taken once, when the class body is executed; used in the run identifier.
    str_now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    train = True

    models_path = 'FB3-models'
    epochs = 5
    save_all_models = False
    competition = 'FB3'
    # Passed as `enabled=` to torch.cuda.amp.GradScaler / autocast in train_fn.
    apex = True
    print_freq = 20
    num_workers = 4
    model = 'microsoft/deberta-v3-base' #If you want to train on the kaggle platform, v3-base is realistic. v3-large will time out.

    gradient_checkpointing = True
    scheduler = 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0

    #Layer-Wise Learning Rate Decay
    llrd = True
    layerwise_lr = 5e-5
    layerwise_lr_decay = 0.9
    layerwise_weight_decay = 0.01
    layerwise_adam_epsilon = 1e-6
    layerwise_use_bertadam = False
    #pooling
    pooling = 'mean' # mean, max, min, attention, weightedlayer

    #init_weight
    init_weight = 'normal' # normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal
    #re-init
    reinit = True
    reinit_n = 1

    # When True, gradients are unscaled before clipping in train_fn.
    unscale = False
    eps = 1e-6
    betas = (0.9, 0.999)
    max_len = 512
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed = 42
    cv_seed = 42
    n_fold = 5
    trn_fold = list(range(n_fold))
    batch_size = 8
    # Derived from target_cols so the regression head can never disagree with the label columns.
    n_targets = len(target_cols)
    gpu_id = 0
    device = f'cuda:{gpu_id}'
    train_file = '../input/feedback-prize-english-language-learning/train.csv'
    # One pseudo-label file per fold.
    pseudo_files  = [f'../input/fb1-pseudo-label/{i}_pseudo_label.csv' for i in range(n_fold)]
    test_file = '../input/feedback-prize-english-language-learning/test.csv'
    submission_file = '../input/feedback-prize-english-language-learning/sample_submission.csv'

In [None]:
# Unique run name: <timestamp>-<model name>-<suffix>
plus = 'pseudo-simul'
model_parts = CFG.model.split("/")
# For an "<org>/<name>" id keep only <name>; any other form is used verbatim.
model_tag = model_parts[1] if len(model_parts) == 2 else CFG.model
CFG.identifier = f'{CFG.str_now}-{model_tag}-{plus}'

print(CFG.identifier)

20221127-071113-deberta-v3-base-pseudo-simul


In [None]:

# Load the labelled training data and one pseudo-labelled frame per configured file.
CFG.df_train = pd.read_csv(CFG.train_file)
CFG.pseudo_train_list = [pd.read_csv(file_name) for file_name in CFG.pseudo_files]
# All artefacts of this run go into a directory named after the run identifier.
CFG.OUTPUT_DIR = f'./{CFG.identifier}/'
CFG.log_filename = CFG.OUTPUT_DIR + 'train'

# Installed here (not in the top pip cell) because the import on the next lines needs it.
os.system('pip install iterative-stratification==0.1.7')
#CV
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold    
# Stratify on all target columns jointly; the validation fold index is stored in a 'fold' column.
Fold = MultilabelStratifiedKFold(n_splits = CFG.n_fold, shuffle = True, random_state = CFG.cv_seed)
for n, (train_index, val_index) in enumerate(Fold.split(CFG.df_train, CFG.df_train[CFG.target_cols])):
    CFG.df_train.loc[val_index, 'fold'] = int(n)
# Cast the newly created column to int once every row has been assigned a fold.
CFG.df_train['fold'] = CFG.df_train['fold'].astype(int)


# The directory must exist before the file logger (CFG.log_filename) is created.
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)    
print(CFG.OUTPUT_DIR)

./20221127-071113-deberta-v3-base-pseudo-simul/


In [None]:
# Build one training frame per fold: the labelled data plus that fold's pseudo-labelled rows.
pseudo_folds = []
df_train_set = set(CFG.df_train['text_id'].values)
for fold in range(CFG.n_fold):
    cur_pseudo_df = CFG.pseudo_train_list[fold]

    # Drop pseudo-labelled rows whose text_id already exists in the labelled training set.
    new_df = cur_pseudo_df[~cur_pseudo_df['text_id'].isin(df_train_set)]
    # Columns are dropped by position (0 and 9).
    # NOTE(review): presumably an index column and one extra non-target column - confirm against the CSV schema.
    new_df = new_df.drop([cur_pseudo_df.columns[0], cur_pseudo_df.columns[9]], axis = 1)
    # Report the number of labelled training rows for the *current* fold.
    print("Before: ", len(cur_pseudo_df), " After: ", len(new_df), " each fold: ", len(CFG.df_train[CFG.df_train['fold'] != fold]))
    # fold + 1 never equals `fold`, so train_loop's `folds['fold'] != fold` filter
    # keeps every pseudo-labelled row in the training split.
    new_df['fold'] = fold+1
    pseudo_folds.append(pd.concat([CFG.df_train, new_df], axis=0))
    print("Final: ", len(pseudo_folds[fold]))

Before:  15594  After:  15142  each fold:  3129
Final:  19053
Before:  15594  After:  15142  each fold:  3129
Final:  19053
Before:  15594  After:  15142  each fold:  3129
Final:  19053
Before:  15594  After:  15142  each fold:  3129
Final:  19053
Before:  15594  After:  15142  each fold:  3129
Final:  19053


In [None]:
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Computed with NumPy directly so it does not depend on the ``squared``
    argument of ``sklearn.metrics.mean_squared_error``, which is deprecated
    and removed in newer scikit-learn releases.

    Args:
        y_trues: 2-D array-like of true values, one column per target.
        y_preds: 2-D array-like of predictions with the same shape.

    Returns:
        Tuple ``(mcrmse_score, scores)``: the mean of the per-column RMSEs as a
        float, and the per-column RMSEs as a list of floats.
    """
    y_trues = np.asarray(y_trues, dtype = float)
    y_preds = np.asarray(y_preds, dtype = float)
    # RMSE of each target column (reduce over the sample axis).
    rmse_per_column = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis = 0))
    scores = rmse_per_column.tolist()
    mcrmse_score = float(np.mean(rmse_per_column))
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    """Score predictions with MCRMSE.

    Returns:
        The ``(mcrmse_score, scores)`` pair produced by ``MCRMSE``.
    """
    overall, per_target = MCRMSE(y_trues, y_preds)
    return overall, per_target

def get_logger(filename = CFG.log_filename):
    """Create the module logger, writing to the console and to ``<filename>.log``.

    Handlers already attached to the logger are removed first. Without this,
    re-running the cell would add a second pair of handlers to the same named
    logger, duplicating every message and leaving the old log file open.

    Args:
        filename: Log file path without the ``.log`` extension.

    Returns:
        The configured ``logging.Logger`` (level INFO, message-only format).
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Detach and close handlers from any earlier call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter('%(message)s'))
    handler2 = FileHandler(filename = f'{filename}.log')
    handler2.setFormatter(Formatter('%(message)s'))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

def prepare_input(cfg, text):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    The text is truncated and padded to ``cfg.max_len`` so every sample has
    the same length. Uses ``padding='max_length'`` in place of the deprecated
    ``pad_to_max_length=True`` flag.

    Args:
        cfg: Object exposing ``tokenizer`` and ``max_len``.
        text: The raw text to encode.

    Returns:
        The tokenizer's output mapping with every value converted to a
        ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        padding = 'max_length',
        truncation = True
    )
    # Only existing keys are reassigned, so the mapping keeps its original type.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

def collate(inputs):
    """Trim a padded batch to its longest real sequence.

    The longest sequence is the largest row sum of ``attention_mask``. Every
    entry of ``inputs`` is sliced to that length along dimension 1. The
    mapping is modified in place and also returned.
    """
    longest = int(inputs['attention_mask'].sum(axis = 1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

class AverageMeter(object):
    """Keeps the latest value of a metric and its running weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, sum, count and average back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n = 1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    leftover_seconds = s - minutes * 60
    return '{}m {}s'.format(int(minutes), int(leftover_seconds))

def timeSince(since, percent):
    """Describe elapsed time and a linear estimate of the time remaining.

    Args:
        since: Start timestamp, as returned by ``time.time()``.
        percent: Fraction of the work completed so far (must be non-zero).

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return f'{asMinutes(elapsed)} (remain {asMinutes(remaining)})'

def seed_everything(seed = 42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_rng(seed)
    # Deterministic cuDNN kernels, with the auto-tuner switched off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Seed every RNG once, up front, with the configured seed.
seed_everything(CFG.seed)

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over dimension 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average the positions whose mask is non-zero.

        The mask is broadcast to the shape of ``last_hidden_state``. The
        divisor is clamped to 1e-9 so a fully masked row yields zeros rather
        than a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        mask_total = mask.sum(1).clamp(min = 1e-9)
        return masked_sum / mask_total

class MaxPooling(nn.Module):
    # NOTE(review): only the constructor is visible here and no forward() is defined,
    # so the class looks truncated. It is not used by the model code in this notebook.
    def __init__(self):
        super(MaxPooling, self).__init__()


In [None]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train the model for one epoch with optional mixed precision.

    Args:
        fold: Index of the current fold (unused here; kept for the call signature).
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        model: Module called as ``model(inputs)``.
        criterion: Loss called as ``criterion(y_preds, labels)``.
        optimizer: Optimizer stepped every ``CFG.gradient_accumulation_steps`` batches.
        epoch: Zero-based epoch index, used only for progress output.
        scheduler: LR scheduler, stepped after each optimizer step when
            ``CFG.batch_scheduler`` is true.
        device: Device that the batch tensors are moved to.

    Returns:
        The sample-weighted average of the (accumulation-scaled) batch losses.
    """
    losses = AverageMeter()
    model.train()
    # A fresh scaler is created per call, so the loss scale restarts every epoch.
    scaler = torch.cuda.amp.GradScaler(enabled = CFG.apex)
    start = time.time()
    # Shown in the progress line until the first optimizer step has produced a norm.
    grad_norm = float('nan')

    for step, (inputs, labels) in enumerate(train_loader):
        # Trim padding to the longest sequence in the batch before moving it to the device.
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled = CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Unscale and clip only once per optimizer step, on the fully accumulated
            # gradients; unscale_ may be called at most once between scaler.step calls.
            # When CFG.unscale is False the norm is measured on the still-scaled gradients.
            if CFG.unscale:
                scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()

        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f} '
                  'LR: {lr:.8f} '
                  .format(epoch + 1, step, len(train_loader), remain = timeSince(start, float(step + 1)/len(train_loader)),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported way to read the current LR.
                          lr = scheduler.get_last_lr()[0]
                         )
                 )
    return losses.avg

In [None]:
def valid_fn(valid_loader, model, criterion, device):
    """Evaluate the model on a validation loader without gradient tracking.

    The loss is reported as computed by ``criterion``. It is not divided by
    ``CFG.gradient_accumulation_steps``, which is a training-only setting and
    must not change the reported validation loss.

    Args:
        valid_loader: Iterable yielding ``(inputs, labels)`` batches.
        model: Module called as ``model(inputs)``.
        criterion: Loss called as ``criterion(y_preds, labels)``.
        device: Device that the batch tensors are moved to.

    Returns:
        Tuple ``(average_loss, predictions)``; predictions are the per-batch
        outputs concatenated into one NumPy array, in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss = losses,
                          remain = timeSince(start, float(step + 1) / len(valid_loader))
                         )
                 )
    return losses.avg, np.concatenate(preds)

In [None]:
# Module-wide logger; get_logger's default log file is derived from CFG.log_filename.
LOGGER = get_logger()
LOGGER.info(f'OUTPUT_DIR: {CFG.OUTPUT_DIR}')

OUTPUT_DIR: ./20221127-071113-deberta-v3-base-pseudo-simul/
INFO:__main__:OUTPUT_DIR: ./20221127-071113-deberta-v3-base-pseudo-simul/


In [None]:
# Load the tokenizer for the configured model and keep a copy with the run outputs.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer.save_pretrained(CFG.OUTPUT_DIR + 'tokenizer')

# max_len: longest tokenized training text (counted without special tokens) plus 2
# (presumably room for the two special tokens added at encoding time - confirm).
train_texts = CFG.df_train['full_text'].fillna('').values
lengths = [
    len(CFG.tokenizer(text, add_special_tokens = False)['input_ids'])
    for text in tqdm(train_texts, total = len(CFG.df_train))
]
CFG.max_len = max(lengths) + 2

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3911 [00:00<?, ?it/s]

In [None]:
class FB3TrainDataset(Dataset):
    """Dataset pairing each ``full_text`` row with its target values."""

    def __init__(self, cfg, df):
        """Keep the texts and the ``cfg.target_cols`` columns of ``df`` as arrays."""
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``(encoded_inputs, float_label_tensor)`` for row ``item``."""
        encoded = prepare_input(self.cfg, self.texts[item])
        target = torch.tensor(self.labels[item], dtype = torch.float)
        return encoded, target

In [None]:
class FB3Model(nn.Module):
    """Transformer backbone, mean pooling and a linear regression head.

    Args:
        CFG: Configuration namespace (reads ``model``, ``OUTPUT_DIR``,
            ``gradient_checkpointing`` and ``n_targets``).
        config_path: Optional path to a config object saved with ``torch.save``.
            When ``None`` the config is fetched for ``CFG.model`` and all
            dropout probabilities are set to zero.
        pretrained: When true, load pretrained backbone weights; otherwise
            build the backbone from the config with freshly initialised weights.
    """
    def __init__(self, CFG, config_path = None, pretrained = False):
        super().__init__()
        self.CFG = CFG
        if config_path is None:
            # The keyword used to be misspelled ("ouput_hidden_states") and was silently ignored.
            # Only the last hidden state is consumed by feature() below.
            self.config = AutoConfig.from_pretrained(CFG.model, output_hidden_states = True)
            # NOTE(review): the config is saved before dropout is zeroed below, so the
            # copy on disk keeps the original dropout values - confirm this is intended.
            self.config.save_pretrained(CFG.OUTPUT_DIR + 'config')
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            # NOTE(review): torch.load unpickles the file; only load configs from trusted sources.
            self.config = torch.load(config_path)
            
        LOGGER.info(self.config)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(CFG.model, config=self.config)
            self.model.save_pretrained(CFG.OUTPUT_DIR + 'model')
        else:
            # AutoModel cannot be instantiated directly; from_config builds the
            # architecture from the config without loading pretrained weights.
            self.model = AutoModel.from_config(self.config)
            
        if self.CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
            
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, self.CFG.n_targets)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module.

        Linear and Embedding weights are drawn from a normal distribution with
        std ``config.initializer_range``; biases and the padding row are
        zeroed. LayerNorm is reset to weight 1 and bias 0.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
                
            if module.bias is not None:
                module.bias.data.zero_()

        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
               
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def feature(self, inputs):
        """Return the mean-pooled last hidden state for a batch of encoded inputs."""
        outputs = self.model(**inputs)

        # The first element of the backbone output is the last hidden state.
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
            
        return feature
    
    def forward(self, inputs):
        """Predict ``CFG.n_targets`` values per sample."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [None]:
def re_initializing_layer(model, config, layer_num):
    """Re-initialise the last ``layer_num`` encoder layers of the backbone.

    Linear and Embedding weights are redrawn from a normal distribution with
    std ``config.initializer_range`` (biases and the padding row are zeroed);
    LayerNorm modules are reset to weight 1 and bias 0.

    Args:
        model: Wrapper exposing the backbone layers at ``model.model.encoder.layer``.
        config: Object providing ``initializer_range``.
        layer_num: Number of layers to reset, counted from the top.

    Returns:
        The same ``model`` object, modified in place.
    """
    # A slice of [-0:] would select *every* layer, so a non-positive count resets nothing.
    if layer_num <= 0:
        return model

    for module in model.model.encoder.layer[-layer_num:].modules():
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=config.initializer_range)

            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=config.initializer_range)

            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    return model

def train_loop(folds, org_folds, fold):
    """Train and validate one cross-validation fold.

    Training rows are those of ``folds`` whose ``fold`` value differs from
    ``fold``; validation rows are those of ``org_folds`` whose ``fold`` value
    equals it. The checkpoint with the lowest validation MCRMSE is saved under
    ``CFG.OUTPUT_DIR``.

    Args:
        folds: DataFrame used for training (has ``full_text``, the target
            columns and a ``fold`` column).
        org_folds: DataFrame the validation rows are taken from.
        fold: Index of the fold held out for validation.

    Returns:
        Tuple ``(best_train_loss, best_val_loss, valid_folds, epoch_table)``.
        ``valid_folds`` is the validation frame with ``pred_<target>`` columns
        from the best epoch. ``epoch_table`` has one row per epoch with the
        MCRMSE, the losses and the per-target scores.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")
    print("len pseudo folds: ", len(folds))
    print("len_org_folds: ", len(org_folds))

    train_folds = folds[folds['fold'] != fold].reset_index(drop = True)
    valid_folds = org_folds[org_folds['fold'] == fold].reset_index(drop = True)


    print("len_train_folds: ", len(train_folds))
    print("len_valid_folds: ", len(valid_folds))
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = FB3TrainDataset(CFG, train_folds)
    valid_dataset = FB3TrainDataset(CFG, valid_folds)
    
    train_loader = DataLoader(train_dataset,
                              batch_size = CFG.batch_size,
                              shuffle = True, 
                              num_workers = CFG.num_workers,
                              pin_memory = True, 
                              drop_last = True
                             )
    valid_loader = DataLoader(valid_dataset,
                              batch_size = CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True, 
                              drop_last=False)

    model = FB3Model(CFG, config_path = None, pretrained = True)
    if CFG.reinit:
        model = re_initializing_layer(model, model.config, CFG.reinit_n)
        
    #os.makedirs(CFG.OUTPUT_DIR + 'config/', exist_ok = True)
    #torch.save(model.config, CFG.OUTPUT_DIR + 'config/config.pth')
    model.to(CFG.device)
    
    #llrd
    def get_optimizer_grouped_parameters(model, 
                                         layerwise_lr,
                                         layerwise_weight_decay,
                                         layerwise_lr_decay):
        """Build optimizer parameter groups with layer-wise LR decay.

        The head gets ``layerwise_lr`` with no weight decay. Encoder layers get
        an LR that decays by ``layerwise_lr_decay`` per layer from the top
        layer down to the embeddings. Bias and LayerNorm weights are excluded
        from weight decay.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        # initialize lr for task specific layer
        optimizer_grouped_parameters = [{"params": [p for n, p in model.named_parameters() if "model" not in n],
                                         "weight_decay": 0.0,
                                         "lr": layerwise_lr,
                                        },]
        # initialize lrs for every layer
        layers = [model.model.embeddings] + list(model.model.encoder.layer)
        layers.reverse()
        lr = layerwise_lr
        lowest_lr = layerwise_lr
        for layer in layers:
            optimizer_grouped_parameters += [{"params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                                              "weight_decay": layerwise_weight_decay,
                                              "lr": lr,
                                             },
                                             {"params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                                              "weight_decay": 0.0,
                                              "lr": lr,
                                             },]
            lowest_lr = lr
            lr *= layerwise_lr_decay

        # Backbone parameters that live outside `embeddings` and `encoder.layer`
        # (e.g. DeBERTa's encoder-level relative position embeddings and LayerNorm)
        # would otherwise never reach the optimizer and would stay frozen.
        # They are trained at the lowest (embedding-level) learning rate.
        covered = {id(p) for group in optimizer_grouped_parameters for p in group["params"]}
        leftovers = [(n, p) for n, p in model.named_parameters() if id(p) not in covered]
        if leftovers:
            optimizer_grouped_parameters += [{"params": [p for n, p in leftovers if not any(nd in n for nd in no_decay)],
                                              "weight_decay": layerwise_weight_decay,
                                              "lr": lowest_lr,
                                             },
                                             {"params": [p for n, p in leftovers if any(nd in n for nd in no_decay)],
                                              "weight_decay": 0.0,
                                              "lr": lowest_lr,
                                             },]
        return optimizer_grouped_parameters
    

    # transformers' AdamW (not torch.optim.AdamW) is used because of its `correct_bias`
    # option; this import relies on the pinned transformers version still providing it.
    from transformers import AdamW
    grouped_optimizer_params = get_optimizer_grouped_parameters(model, 
                                                                CFG.layerwise_lr, 
                                                                CFG.layerwise_weight_decay, 
                                                                CFG.layerwise_lr_decay)
    optimizer = AdamW(grouped_optimizer_params,
                      lr = CFG.layerwise_lr,
                      eps = CFG.layerwise_adam_epsilon,
                      correct_bias = not CFG.layerwise_use_bertadam)

    
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the LR scheduler selected by ``cfg.scheduler`` ('linear' or 'cosine')."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, 
                num_warmup_steps = cfg.num_warmup_steps, 
                num_training_steps = num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, 
                num_warmup_steps = cfg.num_warmup_steps, 
                num_training_steps = num_train_steps,
                num_cycles = cfg.num_cycles
            )
        else:
            raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")
        return scheduler
    
    # Count the optimizer steps that will really be taken: the loader drops the last
    # partial batch, and one step is taken per `gradient_accumulation_steps` batches.
    num_train_steps = (len(train_loader) // CFG.gradient_accumulation_steps) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    
    criterion = nn.SmoothL1Loss(reduction='mean')

    best_score = np.inf
    best_train_loss = np.inf
    best_val_loss = np.inf
    # Predictions of the best epoch are kept in memory, so the checkpoint does not
    # have to be read back from disk just to recover them.
    best_predictions = None
    
    epoch_list = []
    epoch_avg_loss_list = []
    epoch_avg_val_loss_list = []
    epoch_score_list = []
    epoch_scores_list = []

    for epoch in range(CFG.epochs):
        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, CFG.device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, CFG.device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time
        
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        
        epoch_list.append(epoch+1)
        epoch_avg_loss_list.append(avg_loss)
        epoch_avg_val_loss_list.append(avg_val_loss)
        epoch_score_list.append(score)
        epoch_scores_list.append(scores)
        
        if best_score > score:
            best_score = score
            best_train_loss = avg_loss
            best_val_loss = avg_val_loss
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        CFG.OUTPUT_DIR + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
            
        if CFG.save_all_models:
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        CFG.OUTPUT_DIR + f"{CFG.model.replace('/', '-')}_fold{fold}_epoch{epoch + 1}.pth")

    if best_predictions is None:
        # No epoch beat the initial score (e.g. the metric was NaN), so no "best"
        # checkpoint exists; fall back to the last epoch instead of failing on a missing file.
        LOGGER.warning(f'fold {fold}: no epoch improved the score; using last-epoch predictions')
        best_predictions = predictions
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = best_predictions
    
    df_epoch = pd.DataFrame({'epoch' : epoch_list,
                             'MCRMSE' : epoch_score_list,
                             'train_loss' : epoch_avg_loss_list, 
                             'val_loss' : epoch_avg_val_loss_list})
    df_scores = pd.DataFrame(epoch_scores_list)
    df_scores.columns = CFG.target_cols
    
    
    # Drop the references first so the cached GPU memory can be released before the next fold.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()
    
    return best_train_loss, best_val_loss, valid_folds, pd.concat([df_epoch, df_scores], axis = 1)

In [None]:
def get_result(oof_df, fold, best_train_loss, best_val_loss):
    """Score out-of-fold predictions and return a one-row summary frame.

    Args:
        oof_df: Frame holding the target columns and matching ``pred_<target>`` columns.
        fold: Fold label written to the summary row.
        best_train_loss: Training loss to record.
        best_val_loss: Validation loss to record.

    Returns:
        A single-row DataFrame with run metadata, the MCRMSE, both losses and
        the per-target scores.
    """
    pred_cols = [f"pred_{c}" for c in CFG.target_cols]
    score, scores = get_score(oof_df[CFG.target_cols].values, oof_df[pred_cols].values)
    LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    meta_names = ['file', 'model', 'cv_seed', 'seed', 'fold', 'epoch', 'MCRMSE', 'train_loss', 'val_loss']
    meta_values = [CFG.identifier, CFG.model, CFG.cv_seed, CFG.seed, fold, 'best', score, best_train_loss, best_val_loss]
    # Build a single column and transpose it into one row.
    _output_log = pd.DataFrame(meta_values + scores).T
    _output_log.columns = meta_names + CFG.target_cols
    return _output_log

# Cross-validation driver: train every selected fold, collect the out-of-fold
# predictions and write a per-epoch / per-fold / overall score log.
if CFG.train:
    output_log = pd.DataFrame()
    oof_df = pd.DataFrame()
    train_loss_list = []
    val_loss_list = []
    for fold in range(CFG.n_fold):
        # Only the folds listed in CFG.trn_fold are trained.
        if fold in CFG.trn_fold:
            # Train on this fold's augmented frame; validate on the original labelled data.
            best_train_loss, best_val_loss, _oof_df, df_epoch_scores = train_loop(folds = pseudo_folds[fold], org_folds = CFG.df_train, fold = fold)
            train_loss_list.append(best_train_loss)
            val_loss_list.append(best_val_loss)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")

            # Tag the per-epoch rows with run metadata and put the columns in log order.
            df_epoch_scores['file'] = CFG.identifier
            df_epoch_scores['model'] = CFG.model
            df_epoch_scores['cv_seed'] = CFG.cv_seed
            df_epoch_scores['seed'] = CFG.seed
            df_epoch_scores['fold'] = fold
            df_epoch_scores = df_epoch_scores[['file', 'model', 'cv_seed', 'seed', 'fold', 'epoch', 'MCRMSE', 'train_loss', 'val_loss'] + CFG.target_cols]

            # One summary row for this fold's best epoch.
            _output_log = get_result(_oof_df, fold, best_train_loss, best_val_loss)
            output_log = pd.concat([output_log, df_epoch_scores, _output_log])

    # Overall score across all out-of-fold predictions, with the mean of the per-fold best losses.
    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    _output_log = get_result(oof_df, 'OOF', np.mean(train_loss_list), np.mean(val_loss_list))
    output_log = pd.concat([output_log, _output_log])
    # The score log goes to the working directory; the OOF frame goes to the run's output directory.
    output_log.to_csv(f'{CFG.identifier}.csv', index=False)
    oof_df.to_pickle(CFG.OUTPUT_DIR+'oof_df.pkl', protocol = 4)



len pseudo folds:  19053
len_org_folds:  3911
len_train_folds:  18271
len_valid_folds:  782


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0

Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2283] Elapsed 0m 6s (remain 252m 9s) Loss: 3.4162(3.4162) Grad: inf LR: 0.00005000 
Epoch: [1][20/2283] Elapsed 0m 49s (remain 88m 21s) Loss: 0.3282(1.8460) Grad: 225819.2500 LR: 0.00005000 
Epoch: [1][40/2283] Elapsed 1m 18s (remain 71m 18s) Loss: 0.2303(1.0411) Grad: 91145.9219 LR: 0.00005000 
Epoch: [1][60/2283] Elapsed 1m 56s (remain 70m 48s) Loss: 0.0373(0.7294) Grad: 93803.4062 LR: 0.00005000 
Epoch: [1][80/2283] Elapsed 2m 33s (remain 69m 40s) Loss: 0.0839(0.5594) Grad: 163021.0938 LR: 0.00004999 
Epoch: [1][100/2283] Elapsed 3m 8s (remain 68m 0s) Loss: 0.0210(0.4566) Grad: 59573.4453 LR: 0.00004999 
Epoch: [1][120/2283] Elapsed 3m 45s (remain 67m 14s) Loss: 0.0169(0.3874) Grad: 50552.6211 LR: 0.00004999 
Epoch: [1][140/2283] Elapsed 4m 23s (remain 66m 49s) Loss: 0.0194(0.3372) Grad: 56247.7383 LR: 0.00004998 
Epoch: [1][160/2283] Elapsed 4m 57s (remain 65m 25s) Loss: 0.0146(0.2986) Grad: 38078.0469 LR: 0.00004998 
Epoch: [1][180/2283] Elapsed 5m 30s (remain 63m 57s

Epoch 1 - avg_train_loss: 0.0434  avg_val_loss: 0.1116  time: 4421s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0434  avg_val_loss: 0.1116  time: 4421s
Epoch 1 - Score: 0.4725  Scores: [0.5307354763645933, 0.478920852627887, 0.41890723674078406, 0.4876202599791557, 0.4693006827046313, 0.44923968411073834]
INFO:__main__:Epoch 1 - Score: 0.4725  Scores: [0.5307354763645933, 0.478920852627887, 0.41890723674078406, 0.4876202599791557, 0.4693006827046313, 0.44923968411073834]
Epoch 1 - Save Best Score: 0.4725 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4725 Model


EVAL: [48/49] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1184(0.1116) 
Epoch: [2][0/2283] Elapsed 0m 1s (remain 72m 52s) Loss: 0.0370(0.0370) Grad: 97320.7891 LR: 0.00004522 
Epoch: [2][20/2283] Elapsed 0m 39s (remain 70m 8s) Loss: 0.0027(0.0155) Grad: 37061.3008 LR: 0.00004514 
Epoch: [2][40/2283] Elapsed 1m 16s (remain 69m 41s) Loss: 0.0160(0.0146) Grad: 54252.5625 LR: 0.00004506 
Epoch: [2][60/2283] Elapsed 1m 54s (remain 69m 33s) Loss: 0.0013(0.0155) Grad: 25746.4141 LR: 0.00004498 
Epoch: [2][80/2283] Elapsed 2m 32s (remain 69m 3s) Loss: 0.0125(0.0168) Grad: 62234.1836 LR: 0.00004490 
Epoch: [2][100/2283] Elapsed 3m 13s (remain 69m 39s) Loss: 0.0013(0.0166) Grad: 50134.9844 LR: 0.00004481 
Epoch: [2][120/2283] Elapsed 3m 52s (remain 69m 13s) Loss: 0.0359(0.0174) Grad: 262531.0312 LR: 0.00004473 
Epoch: [2][140/2283] Elapsed 4m 34s (remain 69m 22s) Loss: 0.0584(0.0178) Grad: 217219.8438 LR: 0.00004464 
Epoch: [2][160/2283] Elapsed 5m 11s (remain 68m 23s) Loss: 0.0274(0.0184) Grad: 11523

Epoch 2 - avg_train_loss: 0.0183  avg_val_loss: 0.1006  time: 4419s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0183  avg_val_loss: 0.1006  time: 4419s
Epoch 2 - Score: 0.4489  Scores: [0.4796153913304552, 0.45654387101035676, 0.4048814693945213, 0.4530342112235974, 0.4585768023929215, 0.44056050101466127]
INFO:__main__:Epoch 2 - Score: 0.4489  Scores: [0.4796153913304552, 0.45654387101035676, 0.4048814693945213, 0.4530342112235974, 0.4585768023929215, 0.44056050101466127]
Epoch 2 - Save Best Score: 0.4489 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4489 Model


EVAL: [48/49] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1173(0.1006) 
Epoch: [3][0/2283] Elapsed 0m 1s (remain 73m 15s) Loss: 0.0014(0.0014) Grad: 41786.2930 LR: 0.00003273 
Epoch: [3][20/2283] Elapsed 0m 38s (remain 68m 16s) Loss: 0.0081(0.0186) Grad: 188851.5781 LR: 0.00003260 
Epoch: [3][40/2283] Elapsed 1m 16s (remain 70m 2s) Loss: 0.0324(0.0198) Grad: 72616.4375 LR: 0.00003247 
Epoch: [3][60/2283] Elapsed 1m 51s (remain 67m 31s) Loss: 0.0025(0.0167) Grad: 33001.1406 LR: 0.00003234 
Epoch: [3][80/2283] Elapsed 2m 31s (remain 68m 31s) Loss: 0.0174(0.0173) Grad: 93571.1328 LR: 0.00003220 
Epoch: [3][100/2283] Elapsed 3m 4s (remain 66m 26s) Loss: 0.0287(0.0168) Grad: 56618.0156 LR: 0.00003207 
Epoch: [3][120/2283] Elapsed 3m 38s (remain 64m 59s) Loss: 0.0102(0.0173) Grad: 79624.3203 LR: 0.00003194 
Epoch: [3][140/2283] Elapsed 4m 16s (remain 65m 0s) Loss: 0.0048(0.0176) Grad: 47263.4609 LR: 0.00003181 
Epoch: [3][160/2283] Elapsed 4m 59s (remain 65m 42s) Loss: 0.0136(0.0173) Grad: 101225.

Epoch 3 - avg_train_loss: 0.0161  avg_val_loss: 0.1028  time: 4451s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0161  avg_val_loss: 0.1028  time: 4451s
Epoch 3 - Score: 0.4536  Scores: [0.48852379142617874, 0.46346628803128415, 0.40550629548094663, 0.4571205414593978, 0.46311902336686317, 0.44376748304757735]
INFO:__main__:Epoch 3 - Score: 0.4536  Scores: [0.48852379142617874, 0.46346628803128415, 0.40550629548094663, 0.4571205414593978, 0.46311902336686317, 0.44376748304757735]


EVAL: [48/49] Elapsed 1m 48s (remain 0m 0s) Loss: 0.1168(0.1028) 
Epoch: [4][0/2283] Elapsed 0m 1s (remain 50m 4s) Loss: 0.0529(0.0529) Grad: 56754.5117 LR: 0.00001728 
Epoch: [4][20/2283] Elapsed 0m 40s (remain 72m 57s) Loss: 0.0108(0.0095) Grad: 53837.6484 LR: 0.00001715 
Epoch: [4][40/2283] Elapsed 1m 14s (remain 67m 43s) Loss: 0.0169(0.0102) Grad: 38357.5742 LR: 0.00001702 
Epoch: [4][60/2283] Elapsed 1m 52s (remain 68m 16s) Loss: 0.0111(0.0117) Grad: 76140.6016 LR: 0.00001689 
Epoch: [4][80/2283] Elapsed 2m 32s (remain 69m 8s) Loss: 0.0161(0.0121) Grad: 52459.8047 LR: 0.00001676 
Epoch: [4][100/2283] Elapsed 3m 7s (remain 67m 39s) Loss: 0.0099(0.0126) Grad: 67546.1016 LR: 0.00001663 
Epoch: [4][120/2283] Elapsed 3m 44s (remain 66m 46s) Loss: 0.0148(0.0124) Grad: 21636.9082 LR: 0.00001650 
Epoch: [4][140/2283] Elapsed 4m 25s (remain 67m 11s) Loss: 0.0143(0.0130) Grad: 69288.8906 LR: 0.00001637 
Epoch: [4][160/2283] Elapsed 5m 2s (remain 66m 26s) Loss: 0.0152(0.0130) Grad: 54670.000

In [None]:
# epoch 7 / lr 5e-5 / len: max_len_of train / 