Reference: https://www.kaggle.com/code/kojimar/fb3-single-pytorch-model-train

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
%cd drive/MyDrive/kaggle/notebook

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/kaggle/notebook


In [2]:
%%capture
!pip install transformers==4.20.1
!pip install tokenizers==0.12.1
!pip install sentencepiece==0.1.97

In [3]:
import os
import gc
import time
import math
import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import LambdaLR


import transformers
import tokenizers
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

os.system('pip install iterative-stratification==0.1.7')
os.environ['TOKENIZERS_PARALLELISM']='true'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
class CFG:
    model_name  = "microsoft/deberta-v3-base"
    
    base        = "../input/feedback-prize-english-language-learning/"
    train       = base + "train.csv"
    test        = base + "test.csv"
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    num_targets = 6
    
    # Backbone
    max_length  = 512
    hidden_dims = 768
    output_hidden_states = True
    pool = 'mean'
    reinit   = True
    reinit_n = 1
    
    # loss
    loss = 'L1smooth'
    apex = True
    max_norm = 1000
    
    # optimizer
    default_lr   = 1e-5
    head_lr      = 1e-4
    adam_eps     = 1e-6
    lr_decay     = 0.9
    weight_decay = 0.01
    
    # scheduler
    scheduler = 'lambda'
    lr_sch_decay_steps = 1600
    num_warmup_steps = 0.0
    num_cycles = 0.5
    
    # CV
    n_folds = 5
    seed   = 42
    epochs = 5
    
    # Loader
    batch_size  = 4
    num_workers = 2
    print_freq = 20

    str_now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

In [5]:
if len(CFG.model_name.split("/")) == 2:
   CFG.identifier = f"{CFG.str_now}-{CFG.model_name.split('/')[1]}"
else:
   CFG.identifier = f"{CFG.str_now}-{CFG.model_name}"

CFG.OUTPUT_DIR = f'./{CFG.identifier}/'
CFG.log_file   = CFG.OUTPUT_DIR + 'train'
os.makedirs(CFG.OUTPUT_DIR, exist_ok = True)

In [6]:
df_train = pd.read_csv(CFG.train)

In [7]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [8]:
fold = MultilabelStratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
df   = df_train.copy()
y    = pd.get_dummies(data = df[CFG.target_cols], columns = CFG.target_cols)

for idx, (train_idx, valid_idx) in enumerate(fold.split(df_train, y)):
    df_train.loc[valid_idx, 'fold'] = idx

In [9]:
class AverageMeter(object):
    def __init__(self):
        self.reset()
             
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        
    def update(self, val, n = 1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return f'{int(m)}m {int(s)}s'
        
def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    
    return f'{str(asMinutes(s))} (remain {str(asMinutes(rs))})'
        
def prepare_inputs(texts):
    inputs = []
    for text in texts:
        tokens = tokenizer.encode_plus(text,
                                       add_special_tokens = True,
                                       max_length = CFG.max_length,
                                       pad_to_max_length = True,
                                       truncation = True,
                                       return_attention_mask = True
                                       )
        
        for key, vals in tokens.items():
            tokens[key] = torch.tensor(vals, dtype = torch.long)
        inputs.append(tokens)
    
    return inputs

class MeanPooling(nn.Module):
      def __init__(self):
          super(MeanPooling, self).__init__()

      def forward(self, inputs, attention_masks):
          #print("inputs.shape: ", inputs.shape, " masks.shape: ", attention_masks.shape)
          attention_masks = attention_masks.unsqueeze(-1).float()
          inputs   = torch.sum(inputs*attention_masks, dim = 1)
          num_masks = torch.sum(attention_masks == 1.0, dim = 1)
          torch.clamp(num_masks, min = 1e-9)
          inputs   = (inputs / num_masks)

          return inputs

class AttentionPooling(nn.Module):
      def __init__(self, hidden_size):
          super(AttentionPooling, self).__init__()
          self.attention = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                         nn.GELU(),
                                         nn.Linear(hidden_size, 1)
                                         )

      def forward(self, hidden_state, attention_mask):
          weights = self.attention(hidden_state)
          weights[attention_mask == 0] = float("-inf")
          weights = torch.softmax(weights, dim = 1)
          context = torch.sum(hidden_state * weights, dim = 1)

          return context

def MCRMSE(labels, preds):
    scores = []
    num_targets = labels.shape[1]
    
    for i in range(num_targets):
        pred  = preds[:,i]
        label = labels[:,i]
        score = mean_squared_error(label, pred, squared = False)
        scores.append(score)
    
    mcrmse = np.mean(scores)
    
    return mcrmse, scores 

def get_score(labels, preds):
    return MCRMSE(labels, preds)

def get_logger(log_file = CFG.log_file):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter('%(message)s'))
    handler2 = FileHandler(filename = f'{log_file}.log')
    handler2.setFormatter(Formatter('%(message)s'))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    logger.propagate = False
    
    return logger
        

In [10]:
LOGGER = get_logger()
LOGGER.info(f'OUTPUT_DIR: {CFG.OUTPUT_DIR}')

OUTPUT_DIR: ./20221213-000822-deberta-v3-base/


In [11]:
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
#tokenizer.add_tokens(["\n"], special_tokens=True)

CFG.max_len = 1429
LOGGER.info(f'max_len: {CFG.max_len}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
max_len: 1428


In [12]:
class FB3Dataset(Dataset):
    def __init__(self, df_train):
        self.inputs = prepare_inputs(df_train['full_text'])
        self.labels = torch.tensor(df_train[CFG.target_cols].values, dtype = torch.float)
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

In [13]:
class FB3Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.config     = AutoConfig.from_pretrained(cfg.model_name, output_hidden_states = cfg.output_hidden_states)
        self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.
        self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.

        self.model      = AutoModel.from_pretrained(cfg.model_name, 
                                                    config = self.config)
        self.cfg        = cfg

        LOGGER.info(self.cfg)

        self.pool       = MeanPooling()
        self.classifier = nn.Linear(cfg.hidden_dims, cfg.num_targets)
        self._init_weights(self.classifier)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        feature = self.pool(outputs.last_hidden_state, inputs['attention_mask'])
 
        return feature

    def _init_weights(self, module):
        module.weight.data.normal_(mean = 0.0, std = 0.02)

        if module.bias is not None:
            module.bias.data.zero_()

    def reinitialize_layer(self, layer_num):
        for layer in self.model.encoder.layer[-layer_num:].modules():
            if isinstance(layer, nn.Linear):
               layer.weight.data.normal_(mean = 0.0, std = 0.02)

               if layer.bias is not None:
                  layer.bias.data.zero_()

            elif isinstance(layer, nn.Embedding):
               layer.weight.data.normal_(mean = 0.0, std = 0.02)

               if layer.padding_idx is not None:
                  layer.weight.data[layer.padding_idx].zero_()

            elif isinstance(layer, nn.LayerNorm):
                layer.bias.data.zero_()
                layer.weight.data.fill_(1.0)
          
    def forward(self, inputs):

        feature = self.feature(inputs)
        outputs = self.classifier(feature)
        
        return outputs

In [14]:
def train_fn(model, criterion, optimizer, scheduler, train_loader, epoch, cfg):
    losses = AverageMeter()
    total_steps = len(train_loader)
    start = end = time.time()
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.apex)
    
    for step, (inputs, labels) in enumerate(train_loader):
        #print("inputs.shape: ", inputs['input_ids'].shape)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        
        with torch.cuda.amp.autocast(enabled = cfg.apex):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
        losses.update(loss.item(), labels.shape[0])
        
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_norm)
          
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        
        end = time.time()
        
        if (step + 1) % cfg.print_freq == 0:
            print("[{0}][{1}/{2}] "
                  "Elapsed: {remain:s} "
                  "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                  "Grad: {grad_norm} "
                  "LR: {lr:.8f} "
                  .format(epoch, step, total_steps, remain = timeSince(start, float(step+1)/total_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          lr = scheduler.get_lr()[0]
                          )
                 )
    
    return losses.avg

def valid_fn(model, criterion, valid_loader, epoch, cfg):
    losses = AverageMeter()
    predictions = []
    total_steps = len(valid_loader) - 1
    model.eval()
    start = end = time.time()
    
    for step, (inputs, labels) in enumerate(valid_loader):
        
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        
        with torch.no_grad():
            preds = model(inputs)
        
        loss = criterion(preds, labels)
        losses.update(loss.item(), labels.shape[0])
        
        predictions.append(preds.detach().cpu().numpy())
        
        if (step + 1) % cfg.print_freq == 0 or step == total_steps:
            print("[{0}][{1}/{2}] "
                  "Elapsed: {remain:s} "
                  "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                  .format(epoch+1, step, total_steps, remain = timeSince(start, float(step+1)/total_steps),
                          loss = losses,
                          )
                 )
    
    predictions = np.concatenate(predictions, axis = 0)
    
    return losses.avg, predictions

In [15]:
from collections import defaultdict

def train_loop(folds, fold, cfg):
    LOGGER.info(f"========== fold: {fold} training ==========")
    train_folds  = folds[folds['fold'] != fold]
    valid_folds  = folds[folds['fold'] == fold]
    valid_labels = valid_folds[cfg.target_cols].values
    
    train_dataset = FB3Dataset(train_folds)
    valid_dataset = FB3Dataset(valid_folds)
    
    train_loader  = DataLoader(train_dataset, 
                               batch_size  = cfg.batch_size,
                               shuffle = True,
                               pin_memory = True,
                               num_workers = cfg.num_workers)
    
    valid_loader  = DataLoader(valid_dataset, 
                               batch_size = cfg.batch_size,
                               shuffle = False,
                               pin_memory = True,
                               num_workers = cfg.num_workers)
    
    def get_optimizer(cfg, model):
        
        all_parameters = list(model.named_parameters())
        used_name_parameters = set()
        params = []
        no_wd = ['word_embeddings', 'bias', 'LayerNorm.weight']

        head   = [(n, p) for n, p in model.named_parameters() if 'model' not in n]
        for n, _ in head: used_name_parameters.add(n)
  
        params.append({"params": [p for n, p in head if not any(no in n for no in no_wd)], "weight_decay": 0.01, "lr": cfg.head_lr})
        params.append({"params": [p for n, p in head if any(no in n for no in no_wd)], "weight_decay": 0.0, "lr": cfg.head_lr})
        
        lr = cfg.default_lr
        decay = 0.9

        layer_list = [model.model.embeddings] + list(model.model.encoder.layer)
        layer_list.reverse()

        params.append(
            {
                "params": list(model.model.encoder.rel_embeddings.parameters()),
                "lr": lr,
                "weight_decay": 0.0
            }
        )

        params.append(
            {
                "params": list(model.model.encoder.LayerNorm.parameters()),
                "lr": lr,
                "weight_decay": 0.0
            }
        )
      
        for i, layer in enumerate(layer_list):
            
            parameters = [(n, p) for n, p in layer.named_parameters()]
            for n, _ in parameters: used_name_parameters.add(n)

            params.append({"params": [p for n, p in parameters if not any(no in n for no in no_wd)], 'weight_decay': 0.01, "lr": lr})
            params.append({"params": [p for n, p in parameters if any(no in n for no in no_wd)], 'weight_decay': 0.0, "lr": lr})

            lr = lr * decay

        optimizer = torch.optim.AdamW(params, eps = cfg.adam_eps)

        return optimizer

    def get_scheduler(cfg, optimizer, num_train_steps):
        warmup_start = int(cfg.epochs * num_train_steps)
        print("warmup start: ", warmup_start)

        def decay_func(step):
            return 0.3 ** (float(step) / float(cfg.lr_sch_decay_steps))

        if cfg.scheduler == 'linear':
           scheduler = transformers.get_scheduler(cfg.scheduler, optimizer, warmup_start, num_train_steps)
        elif cfg.scheduler == 'cosine':
           scheduler = get_cosine_schedule_with_warmup(
               optimizer,
               num_warmup_steps = cfg.num_warmup_steps,
               num_training_steps = num_train_steps,
               num_cycles = cfg.num_cycles
           )
        elif cfg.scheduler == 'lambda':
           scheduler = LambdaLR(optimizer, lr_lambda = decay_func)

        return scheduler
    
    model = FB3Model(cfg).to(device)
    if CFG.reinit:
       model.reinitialize_layer(CFG.reinit_n)
    criterion = nn.SmoothL1Loss(reduction = 'mean')
    optimizer = get_optimizer(cfg, model)
    num_train_steps = int(len(train_folds) / cfg.batch_size * cfg.epochs)
    print("[DEBUG] num_train_steps: ", num_train_steps, " len train loader: ", len(train_loader))
    scheduler = get_scheduler(cfg, optimizer, num_train_steps)

    best_score = np.inf
    best_val_loss = np.inf
    best_train_loss = np.inf

    for epoch in range(cfg.epochs):
        
        start = time.time()
        
        avg_loss = train_fn(model, criterion, optimizer, scheduler, train_loader, epoch, cfg)
        
        avg_val_loss, preds = valid_fn(model, criterion, valid_loader, epoch, cfg)
        score, scores = get_score(preds, valid_labels)
        
        elapsed = time.time() - start
        
        LOGGER.info(f"Epoch: {epoch+1} - avg_train_loss = {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.4f}s")
        LOGGER.info(f"Epoch: {epoch+1} - Score: {score:.4f}, Scores: {scores}")

        if best_score > score:
           best_score = score
           best_train_loss = avg_loss
           bset_val_loss = avg_val_loss
           LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')

           torch.save({'model': model.state_dict(),
                       'predictions': preds},
                      CFG.OUTPUT_DIR + f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")
    
    predictions = torch.load(CFG.OUTPUT_DIR + f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location = torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return best_train_loss, best_val_loss, valid_folds, best_score

In [16]:
def get_result(oof_df, fold, best_train_loss, best_val_loss):
    labels = oof_df[CFG.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    return None

oof_df = pd.DataFrame()
train_loss_list = []
val_loss_list = []

for fold in range(CFG.n_folds):
    best_train_loss, best_val_loss, _oof_df, best_score = train_loop(df_train, fold, CFG)
    oof_df = pd.concat([oof_df, _oof_df])
    train_loss_list.append(best_train_loss)
    val_loss_list.append(best_val_loss)

    LOGGER.info(f"========== fold: {fold} result ==========")

    get_result(_oof_df, fold, best_train_loss, best_val_loss)

oof_df = oof_df.reset_index(drop = True)
LOGGER.info(f"========== CV ==========")
get_result(oof_df, "OOF", np.mean(train_loss_list), np.mean(val_loss_list))
oof_df.to_csv(CFG.OUTPUT_DIR + 'oof_df.csv')

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
<class '__main__.CFG'

[DEBUG] num_train_steps:  3912  len train loader:  783
warmup start:  19560




[0][19/783] Elapsed: 0m 7s (remain 4m 53s) Loss: 0.3765(1.5168) Grad: 246857.6875 LR: 0.00009851 
[0][39/783] Elapsed: 0m 14s (remain 4m 24s) Loss: 0.2607(0.9170) Grad: 88933.359375 LR: 0.00009703 
[0][59/783] Elapsed: 0m 20s (remain 4m 10s) Loss: 0.1563(0.6692) Grad: 186682.390625 LR: 0.00009559 
[0][79/783] Elapsed: 0m 27s (remain 4m 0s) Loss: 0.0779(0.5441) Grad: 141349.421875 LR: 0.00009416 
[0][99/783] Elapsed: 0m 33s (remain 3m 51s) Loss: 0.0964(0.4679) Grad: 80614.7578125 LR: 0.00009275 
[0][119/783] Elapsed: 0m 40s (remain 3m 44s) Loss: 0.2041(0.4099) Grad: 165196.90625 LR: 0.00009137 
[0][139/783] Elapsed: 0m 47s (remain 3m 37s) Loss: 0.1572(0.3700) Grad: 154415.484375 LR: 0.00009000 
[0][159/783] Elapsed: 0m 53s (remain 3m 30s) Loss: 0.1772(0.3404) Grad: 227742.5 LR: 0.00008866 
[0][179/783] Elapsed: 1m 0s (remain 3m 22s) Loss: 0.0742(0.3158) Grad: 98222.4296875 LR: 0.00008733 
[0][199/783] Elapsed: 1m 7s (remain 3m 15s) Loss: 0.1501(0.2959) Grad: 117917.921875 LR: 0.00008603

Epoch: 1 - avg_train_loss = 0.1631 avg_val_loss: 0.1074 time: 300.7663s
Epoch: 1 - Score: 0.4651, Scores: [0.48553763825112545, 0.46286388668305734, 0.43294210874559136, 0.46273514523289316, 0.47782467958336794, 0.46845511347539165]
Epoch 1 - Save Best Score: 0.4651 Model


[1][195/195] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0707(0.1074) 




[1][19/783] Elapsed: 0m 6s (remain 4m 15s) Loss: 0.1168(0.1105) Grad: 262935.25 LR: 0.00005465 
[1][39/783] Elapsed: 0m 13s (remain 4m 6s) Loss: 0.0476(0.1062) Grad: 182202.75 LR: 0.00005383 
[1][59/783] Elapsed: 0m 19s (remain 4m 0s) Loss: 0.0518(0.1079) Grad: 100504.7578125 LR: 0.00005303 
[1][79/783] Elapsed: 0m 26s (remain 3m 53s) Loss: 0.1218(0.1056) Grad: 189165.421875 LR: 0.00005224 
[1][99/783] Elapsed: 0m 33s (remain 3m 46s) Loss: 0.0493(0.1031) Grad: 157897.578125 LR: 0.00005146 
[1][119/783] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.0883(0.1027) Grad: 157243.03125 LR: 0.00005069 
[1][139/783] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.1516(0.1030) Grad: 522948.0625 LR: 0.00004993 
[1][159/783] Elapsed: 0m 52s (remain 3m 26s) Loss: 0.0484(0.1035) Grad: 152926.53125 LR: 0.00004918 
[1][179/783] Elapsed: 0m 59s (remain 3m 19s) Loss: 0.1407(0.1035) Grad: 444762.1875 LR: 0.00004845 
[1][199/783] Elapsed: 1m 6s (remain 3m 12s) Loss: 0.0367(0.1036) Grad: 163598.59375 LR: 0.00004773 
[1]

Epoch: 2 - avg_train_loss = 0.1051 avg_val_loss: 0.1036 time: 299.7450s
Epoch: 2 - Score: 0.4563, Scores: [0.47959054715540494, 0.4576673340095501, 0.4225161352271177, 0.4540978044273171, 0.47833399622320216, 0.4457512265344438]
Epoch 2 - Save Best Score: 0.4563 Model


[2][195/195] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0528(0.1036) 




[2][19/783] Elapsed: 0m 6s (remain 4m 15s) Loss: 0.0967(0.0965) Grad: 182077.71875 LR: 0.00003032 
[2][39/783] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.1183(0.1027) Grad: 245811.015625 LR: 0.00002986 
[2][59/783] Elapsed: 0m 19s (remain 4m 0s) Loss: 0.0952(0.1001) Grad: 250169.78125 LR: 0.00002942 
[2][79/783] Elapsed: 0m 26s (remain 3m 53s) Loss: 0.1269(0.0993) Grad: 252557.671875 LR: 0.00002898 
[2][99/783] Elapsed: 0m 33s (remain 3m 46s) Loss: 0.0619(0.0991) Grad: 156488.6875 LR: 0.00002855 
[2][119/783] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.1486(0.1015) Grad: 499653.625 LR: 0.00002812 
[2][139/783] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.0590(0.1003) Grad: 169920.75 LR: 0.00002770 
[2][159/783] Elapsed: 0m 52s (remain 3m 26s) Loss: 0.1014(0.1000) Grad: 293314.5625 LR: 0.00002729 
[2][179/783] Elapsed: 0m 59s (remain 3m 19s) Loss: 0.0479(0.0994) Grad: 151032.078125 LR: 0.00002688 
[2][199/783] Elapsed: 1m 6s (remain 3m 12s) Loss: 0.0788(0.0989) Grad: 341305.96875 LR: 0.00002648 
[2]

Epoch: 3 - avg_train_loss = 0.0994 avg_val_loss: 0.1024 time: 299.6775s
Epoch: 3 - Score: 0.4536, Scores: [0.47988349236460764, 0.456898197846496, 0.41880411861646316, 0.4546648135030823, 0.4690435416606141, 0.4423078494213756]
Epoch 3 - Save Best Score: 0.4536 Model


[3][195/195] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0521(0.1024) 




[3][19/783] Elapsed: 0m 6s (remain 4m 15s) Loss: 0.0736(0.0904) Grad: 192430.859375 LR: 0.00001682 
[3][39/783] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.0591(0.0875) Grad: 187311.453125 LR: 0.00001657 
[3][59/783] Elapsed: 0m 19s (remain 4m 0s) Loss: 0.1073(0.0934) Grad: 313251.125 LR: 0.00001632 
[3][79/783] Elapsed: 0m 26s (remain 3m 53s) Loss: 0.0766(0.0976) Grad: 182752.765625 LR: 0.00001608 
[3][99/783] Elapsed: 0m 33s (remain 3m 46s) Loss: 0.1534(0.0980) Grad: 430150.59375 LR: 0.00001584 
[3][119/783] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.0874(0.0970) Grad: 315770.53125 LR: 0.00001560 
[3][139/783] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.0554(0.0949) Grad: 136960.0 LR: 0.00001537 
[3][159/783] Elapsed: 0m 52s (remain 3m 26s) Loss: 0.1938(0.0961) Grad: 176613.53125 LR: 0.00001514 
[3][179/783] Elapsed: 0m 59s (remain 3m 19s) Loss: 0.0690(0.0963) Grad: 174154.421875 LR: 0.00001491 
[3][199/783] Elapsed: 1m 6s (remain 3m 12s) Loss: 0.0802(0.0964) Grad: 163898.6875 LR: 0.00001469 
[3

Epoch: 4 - avg_train_loss = 0.0964 avg_val_loss: 0.1018 time: 299.6495s
Epoch: 4 - Score: 0.4524, Scores: [0.47640881263829066, 0.45776162460157954, 0.41868910358923933, 0.452380555922179, 0.4684776153382562, 0.44044009373976073]
Epoch 4 - Save Best Score: 0.4524 Model


[4][195/195] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0497(0.1018) 




[4][19/783] Elapsed: 0m 6s (remain 4m 16s) Loss: 0.1713(0.1073) Grad: 461288.875 LR: 0.00000933 
[4][39/783] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.0435(0.0990) Grad: 178609.609375 LR: 0.00000919 
[4][59/783] Elapsed: 0m 19s (remain 4m 0s) Loss: 0.0456(0.1013) Grad: 125355.609375 LR: 0.00000905 
[4][79/783] Elapsed: 0m 26s (remain 3m 53s) Loss: 0.0667(0.1006) Grad: 285112.96875 LR: 0.00000892 
[4][99/783] Elapsed: 0m 33s (remain 3m 46s) Loss: 0.0486(0.0977) Grad: 133471.875 LR: 0.00000879 
[4][119/783] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.0942(0.0972) Grad: 368474.25 LR: 0.00000865 
[4][139/783] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.1282(0.0992) Grad: 161010.375 LR: 0.00000853 
[4][159/783] Elapsed: 0m 52s (remain 3m 26s) Loss: 0.1457(0.0978) Grad: 254464.921875 LR: 0.00000840 
[4][179/783] Elapsed: 0m 59s (remain 3m 19s) Loss: 0.1563(0.0965) Grad: 447202.09375 LR: 0.00000827 
[4][199/783] Elapsed: 1m 6s (remain 3m 12s) Loss: 0.1197(0.0965) Grad: 215566.671875 LR: 0.00000815 
[4][

Epoch: 5 - avg_train_loss = 0.0949 avg_val_loss: 0.1014 time: 299.6372s
Epoch: 5 - Score: 0.4513, Scores: [0.47486252547268554, 0.4562591933219615, 0.41928850554311603, 0.4507207335035314, 0.46708362500722833, 0.4396209436226126]
Epoch 5 - Save Best Score: 0.4513 Model


[5][195/195] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0508(0.1014) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
Score: 0.4513  Scores: [0.47486252547268554, 0.4562591933219615, 0.41928850554311603, 0.4507207335035314, 0.46708362500722833, 0.4396209436226126]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializin

[DEBUG] num_train_steps:  3905  len train loader:  781
warmup start:  19525




[0][19/781] Elapsed: 0m 6s (remain 4m 13s) Loss: 1.2630(1.7638) Grad: 313708.8125 LR: 0.00009851 
[0][39/781] Elapsed: 0m 13s (remain 4m 4s) Loss: 0.1302(1.0261) Grad: 117226.03125 LR: 0.00009703 
[0][59/781] Elapsed: 0m 19s (remain 3m 57s) Loss: 0.1133(0.7463) Grad: 77318.34375 LR: 0.00009559 
[0][79/781] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.1897(0.5965) Grad: 111742.7109375 LR: 0.00009416 
[0][99/781] Elapsed: 0m 33s (remain 3m 44s) Loss: 0.1449(0.5019) Grad: 170520.1875 LR: 0.00009275 
[0][119/781] Elapsed: 0m 39s (remain 3m 38s) Loss: 0.1044(0.4413) Grad: 63116.296875 LR: 0.00009137 
[0][139/781] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.0739(0.3984) Grad: 90030.2578125 LR: 0.00009000 
[0][159/781] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.1339(0.3655) Grad: 119022.3671875 LR: 0.00008866 
[0][179/781] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.1199(0.3394) Grad: 90602.6015625 LR: 0.00008733 
[0][199/781] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.1030(0.3182) Grad: 98825.1171875 LR: 0.0000

Epoch: 1 - avg_train_loss = 0.1697 avg_val_loss: 0.1060 time: 297.9898s
Epoch: 1 - Score: 0.4617, Scores: [0.48580829367790657, 0.4323346213225305, 0.45937869810183185, 0.46092950899927326, 0.47410185252611525, 0.4577047211419799]
Epoch 1 - Save Best Score: 0.4617 Model


[1][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0871(0.1060) 




[1][19/781] Elapsed: 0m 6s (remain 4m 12s) Loss: 0.0736(0.0984) Grad: 69701.5625 LR: 0.00005473 
[1][39/781] Elapsed: 0m 13s (remain 4m 4s) Loss: 0.0791(0.1020) Grad: 65164.17578125 LR: 0.00005391 
[1][59/781] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.0904(0.1043) Grad: 114009.03125 LR: 0.00005311 
[1][79/781] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.0785(0.1038) Grad: 108404.0703125 LR: 0.00005231 
[1][99/781] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.0655(0.1007) Grad: 57650.828125 LR: 0.00005153 
[1][119/781] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.1576(0.1022) Grad: 104791.9765625 LR: 0.00005076 
[1][139/781] Elapsed: 0m 46s (remain 3m 30s) Loss: 0.0730(0.1008) Grad: 105765.296875 LR: 0.00005001 
[1][159/781] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.1208(0.1002) Grad: 210340.734375 LR: 0.00004926 
[1][179/781] Elapsed: 0m 59s (remain 3m 17s) Loss: 0.0928(0.1009) Grad: 86361.015625 LR: 0.00004852 
[1][199/781] Elapsed: 1m 5s (remain 3m 10s) Loss: 0.0742(0.1012) Grad: 62239.95703125 LR: 0.

Epoch: 2 - avg_train_loss = 0.1050 avg_val_loss: 0.1072 time: 297.7937s
Epoch: 2 - Score: 0.4634, Scores: [0.48680434513929766, 0.44718165982252017, 0.425854656271397, 0.4492660048552112, 0.5305862891019537, 0.44054315927742455]


[2][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0617(0.1072) 




[2][19/781] Elapsed: 0m 6s (remain 4m 9s) Loss: 0.0823(0.0860) Grad: 85592.6328125 LR: 0.00003041 
[2][39/781] Elapsed: 0m 13s (remain 4m 2s) Loss: 0.1045(0.1055) Grad: 144852.40625 LR: 0.00002995 
[2][59/781] Elapsed: 0m 19s (remain 3m 56s) Loss: 0.0678(0.0983) Grad: 89756.9453125 LR: 0.00002951 
[2][79/781] Elapsed: 0m 26s (remain 3m 49s) Loss: 0.0782(0.0971) Grad: 61842.3125 LR: 0.00002907 
[2][99/781] Elapsed: 0m 32s (remain 3m 43s) Loss: 0.0707(0.0980) Grad: 103962.0546875 LR: 0.00002863 
[2][119/781] Elapsed: 0m 39s (remain 3m 36s) Loss: 0.1133(0.0992) Grad: 129055.6796875 LR: 0.00002820 
[2][139/781] Elapsed: 0m 45s (remain 3m 30s) Loss: 0.1922(0.1003) Grad: 289774.15625 LR: 0.00002778 
[2][159/781] Elapsed: 0m 52s (remain 3m 23s) Loss: 0.1165(0.0994) Grad: 248712.734375 LR: 0.00002737 
[2][179/781] Elapsed: 0m 58s (remain 3m 16s) Loss: 0.0927(0.0996) Grad: 77442.484375 LR: 0.00002696 
[2][199/781] Elapsed: 1m 5s (remain 3m 10s) Loss: 0.0504(0.0997) Grad: 50791.35546875 LR: 0.00

Epoch: 3 - avg_train_loss = 0.0987 avg_val_loss: 0.0990 time: 297.7434s
Epoch: 3 - Score: 0.4456, Scores: [0.4790586452255965, 0.43055562430017624, 0.42132747486038635, 0.4449015651666129, 0.46431622975409964, 0.4333692833062872]
Epoch 3 - Save Best Score: 0.4456 Model


[3][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0706(0.0990) 




[3][19/781] Elapsed: 0m 6s (remain 4m 15s) Loss: 0.1152(0.0922) Grad: 327664.375 LR: 0.00001690 
[3][39/781] Elapsed: 0m 13s (remain 4m 4s) Loss: 0.0616(0.0950) Grad: 66711.359375 LR: 0.00001664 
[3][59/781] Elapsed: 0m 19s (remain 3m 57s) Loss: 0.0600(0.0928) Grad: 80202.484375 LR: 0.00001639 
[3][79/781] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.0757(0.0910) Grad: 95514.890625 LR: 0.00001615 
[3][99/781] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.1236(0.0923) Grad: 119882.7890625 LR: 0.00001591 
[3][119/781] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.0654(0.0931) Grad: 150375.25 LR: 0.00001567 
[3][139/781] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.1172(0.0917) Grad: 102906.328125 LR: 0.00001544 
[3][159/781] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.2946(0.0927) Grad: 175897.296875 LR: 0.00001521 
[3][179/781] Elapsed: 0m 59s (remain 3m 17s) Loss: 0.1075(0.0933) Grad: 102878.9296875 LR: 0.00001498 
[3][199/781] Elapsed: 1m 5s (remain 3m 10s) Loss: 0.1521(0.0942) Grad: 160999.96875 LR: 0.0000147

Epoch: 4 - avg_train_loss = 0.0946 avg_val_loss: 0.0983 time: 297.7130s
Epoch: 4 - Score: 0.4441, Scores: [0.4736707829906585, 0.42938793687023075, 0.4202417231161282, 0.44035164741174343, 0.4667670276135549, 0.4341776676975357]
Epoch 4 - Save Best Score: 0.4441 Model


[4][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0614(0.0983) 




[4][19/781] Elapsed: 0m 6s (remain 4m 11s) Loss: 0.0829(0.1037) Grad: 90863.09375 LR: 0.00000939 
[4][39/781] Elapsed: 0m 13s (remain 4m 3s) Loss: 0.0479(0.1060) Grad: 49258.28515625 LR: 0.00000925 
[4][59/781] Elapsed: 0m 19s (remain 3m 57s) Loss: 0.0521(0.0972) Grad: 118525.171875 LR: 0.00000911 
[4][79/781] Elapsed: 0m 26s (remain 3m 50s) Loss: 0.0981(0.0964) Grad: 159583.515625 LR: 0.00000897 
[4][99/781] Elapsed: 0m 32s (remain 3m 43s) Loss: 0.0740(0.0971) Grad: 65310.47265625 LR: 0.00000884 
[4][119/781] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.0663(0.0970) Grad: 76519.796875 LR: 0.00000871 
[4][139/781] Elapsed: 0m 46s (remain 3m 30s) Loss: 0.1329(0.0960) Grad: 201872.25 LR: 0.00000858 
[4][159/781] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.0401(0.0962) Grad: 48993.49609375 LR: 0.00000845 
[4][179/781] Elapsed: 0m 59s (remain 3m 17s) Loss: 0.0941(0.0964) Grad: 150804.921875 LR: 0.00000832 
[4][199/781] Elapsed: 1m 5s (remain 3m 10s) Loss: 0.0612(0.0951) Grad: 114337.953125 LR: 0.00

Epoch: 5 - avg_train_loss = 0.0929 avg_val_loss: 0.0978 time: 297.8464s
Epoch: 5 - Score: 0.4430, Scores: [0.4734103256591635, 0.42552829638480516, 0.4182834201781216, 0.4423315380546638, 0.4627444445106863, 0.4357072294566703]
Epoch 5 - Save Best Score: 0.4430 Model


[5][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0674(0.0978) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
Score: 0.4430  Scores: [0.4734103256591635, 0.42552829638480516, 0.4182834201781216, 0.4423315380546638, 0.4627444445106863, 0.4357072294566703]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing 

[DEBUG] num_train_steps:  3907  len train loader:  782
warmup start:  19535




[0][19/782] Elapsed: 0m 6s (remain 4m 14s) Loss: 0.2763(1.6272) Grad: 139481.015625 LR: 0.00009851 
[0][39/782] Elapsed: 0m 13s (remain 4m 5s) Loss: 0.1594(0.9347) Grad: 162874.875 LR: 0.00009703 
[0][59/782] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.1380(0.6809) Grad: 126774.734375 LR: 0.00009559 
[0][79/782] Elapsed: 0m 26s (remain 3m 52s) Loss: 0.1328(0.5568) Grad: 119748.4921875 LR: 0.00009416 
[0][99/782] Elapsed: 0m 33s (remain 3m 45s) Loss: 0.1152(0.4767) Grad: 138125.171875 LR: 0.00009275 
[0][119/782] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.0790(0.4177) Grad: 141898.109375 LR: 0.00009137 
[0][139/782] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.1572(0.3735) Grad: 108409.9921875 LR: 0.00009000 
[0][159/782] Elapsed: 0m 52s (remain 3m 25s) Loss: 0.0918(0.3425) Grad: 77075.0546875 LR: 0.00008866 
[0][179/782] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.0949(0.3220) Grad: 102633.984375 LR: 0.00008733 
[0][199/782] Elapsed: 1m 6s (remain 3m 12s) Loss: 0.0642(0.3017) Grad: 104605.8828125 LR: 

Epoch: 1 - avg_train_loss = 0.1654 avg_val_loss: 0.1110 time: 298.3020s
Epoch: 1 - Score: 0.4724, Scores: [0.5000862978257119, 0.48761364342271896, 0.4364122382913792, 0.45969248007341557, 0.46273176630682505, 0.4880066643466378]
Epoch 1 - Save Best Score: 0.4724 Model


[1][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1111(0.1110) 




[1][19/782] Elapsed: 0m 6s (remain 4m 14s) Loss: 0.0769(0.1150) Grad: 357628.96875 LR: 0.00005469 
[1][39/782] Elapsed: 0m 13s (remain 4m 5s) Loss: 0.0884(0.1144) Grad: 308793.8125 LR: 0.00005387 
[1][59/782] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.1151(0.1127) Grad: 116407.8984375 LR: 0.00005307 
[1][79/782] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.0897(0.1120) Grad: 308237.6875 LR: 0.00005228 
[1][99/782] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.1243(0.1114) Grad: 368467.28125 LR: 0.00005149 
[1][119/782] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.1760(0.1116) Grad: 385197.65625 LR: 0.00005073 
[1][139/782] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.1045(0.1094) Grad: 209357.78125 LR: 0.00004997 
[1][159/782] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.0846(0.1095) Grad: 248270.40625 LR: 0.00004922 
[1][179/782] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.1336(0.1078) Grad: 318971.46875 LR: 0.00004849 
[1][199/782] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.0830(0.1069) Grad: 154737.609375 LR: 0.0000477

Epoch: 2 - avg_train_loss = 0.1039 avg_val_loss: 0.1029 time: 298.0412s
Epoch: 2 - Score: 0.4543, Scores: [0.4900083865375496, 0.45070331378141276, 0.42236506296865556, 0.45445004293205954, 0.45352492803203986, 0.45495289515646203]
Epoch 2 - Save Best Score: 0.4543 Model


[2][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1404(0.1029) 




[2][19/782] Elapsed: 0m 6s (remain 4m 16s) Loss: 0.1759(0.1156) Grad: 190636.609375 LR: 0.00003036 
[2][39/782] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.1160(0.1111) Grad: 394310.65625 LR: 0.00002991 
[2][59/782] Elapsed: 0m 19s (remain 3m 59s) Loss: 0.0911(0.1076) Grad: 262988.65625 LR: 0.00002946 
[2][79/782] Elapsed: 0m 26s (remain 3m 52s) Loss: 0.1639(0.1100) Grad: 489335.09375 LR: 0.00002902 
[2][99/782] Elapsed: 0m 33s (remain 3m 45s) Loss: 0.1334(0.1061) Grad: 174252.4375 LR: 0.00002859 
[2][119/782] Elapsed: 0m 39s (remain 3m 38s) Loss: 0.0742(0.1064) Grad: 227706.171875 LR: 0.00002816 
[2][139/782] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.0701(0.1054) Grad: 145591.9375 LR: 0.00002774 
[2][159/782] Elapsed: 0m 52s (remain 3m 25s) Loss: 0.1010(0.1032) Grad: 151774.765625 LR: 0.00002733 
[2][179/782] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.0944(0.1003) Grad: 213569.65625 LR: 0.00002692 
[2][199/782] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.1055(0.1008) Grad: 108302.375 LR: 0.00002652 

Epoch: 3 - avg_train_loss = 0.0991 avg_val_loss: 0.1033 time: 298.1390s
Epoch: 3 - Score: 0.4553, Scores: [0.4917058424104645, 0.46389898009901875, 0.4238800920005655, 0.45125950044803037, 0.4532839105008448, 0.44749010839546305]


[3][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1103(0.1033) 




[3][19/782] Elapsed: 0m 6s (remain 4m 12s) Loss: 0.0605(0.0894) Grad: 145400.921875 LR: 0.00001686 
[3][39/782] Elapsed: 0m 13s (remain 4m 4s) Loss: 0.0591(0.0894) Grad: 216544.984375 LR: 0.00001661 
[3][59/782] Elapsed: 0m 19s (remain 3m 57s) Loss: 0.1292(0.0927) Grad: 153514.546875 LR: 0.00001636 
[3][79/782] Elapsed: 0m 26s (remain 3m 50s) Loss: 0.0797(0.0969) Grad: 203419.421875 LR: 0.00001611 
[3][99/782] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.0987(0.0963) Grad: 215018.734375 LR: 0.00001587 
[3][119/782] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.0799(0.0963) Grad: 174714.234375 LR: 0.00001564 
[3][139/782] Elapsed: 0m 45s (remain 3m 30s) Loss: 0.0781(0.0953) Grad: 299232.53125 LR: 0.00001540 
[3][159/782] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.0553(0.0956) Grad: 98104.8125 LR: 0.00001517 
[3][179/782] Elapsed: 0m 59s (remain 3m 17s) Loss: 0.1411(0.0963) Grad: 118730.90625 LR: 0.00001495 
[3][199/782] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.1170(0.0968) Grad: 238887.53125 LR: 0.0000

Epoch: 4 - avg_train_loss = 0.0965 avg_val_loss: 0.1031 time: 298.3916s
Epoch: 4 - Score: 0.4547, Scores: [0.4943872017772749, 0.4502623733334951, 0.42539271280034296, 0.4522721873940964, 0.4552923051960827, 0.45040881957766005]


[4][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1318(0.1031) 




[4][19/782] Elapsed: 0m 6s (remain 4m 13s) Loss: 0.0997(0.0939) Grad: 241405.6875 LR: 0.00000936 
[4][39/782] Elapsed: 0m 13s (remain 4m 5s) Loss: 0.1038(0.1007) Grad: 205772.8125 LR: 0.00000922 
[4][59/782] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.0722(0.1032) Grad: 254737.15625 LR: 0.00000908 
[4][79/782] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.2643(0.1010) Grad: 417431.28125 LR: 0.00000895 
[4][99/782] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.0770(0.0992) Grad: 162549.578125 LR: 0.00000881 
[4][119/782] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.1249(0.0999) Grad: 194257.0625 LR: 0.00000868 
[4][139/782] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.0807(0.0972) Grad: 302809.8125 LR: 0.00000855 
[4][159/782] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.1122(0.0965) Grad: 170424.078125 LR: 0.00000842 
[4][179/782] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.0419(0.0950) Grad: 182939.765625 LR: 0.00000830 
[4][199/782] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.2080(0.0954) Grad: 377484.4375 LR: 0.00000817 


Epoch: 5 - avg_train_loss = 0.0947 avg_val_loss: 0.1028 time: 298.9048s
Epoch: 5 - Score: 0.4543, Scores: [0.4891071863534419, 0.4500509331251393, 0.42075863414852477, 0.46153468289346894, 0.45529486499908906, 0.4490171107808261]
Epoch 5 - Save Best Score: 0.4543 Model


[5][196/196] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1142(0.1028) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
Score: 0.4543  Scores: [0.4891071863534419, 0.4500509331251393, 0.42075863414852477, 0.46153468289346894, 0.45529486499908906, 0.4490171107808261]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializin

[DEBUG] num_train_steps:  3913  len train loader:  783
warmup start:  19565




[0][19/783] Elapsed: 0m 6s (remain 4m 14s) Loss: 0.7854(1.9731) Grad: 289419.5625 LR: 0.00009851 
[0][39/783] Elapsed: 0m 13s (remain 4m 5s) Loss: 0.4203(1.1510) Grad: 360795.125 LR: 0.00009703 
[0][59/783] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.1294(0.8301) Grad: 109793.9140625 LR: 0.00009559 
[0][79/783] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.1230(0.6610) Grad: 122384.921875 LR: 0.00009416 
[0][99/783] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.1458(0.5595) Grad: 110559.9453125 LR: 0.00009275 
[0][119/783] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.1158(0.4865) Grad: 188259.9375 LR: 0.00009137 
[0][139/783] Elapsed: 0m 45s (remain 3m 30s) Loss: 0.2285(0.4338) Grad: 178789.296875 LR: 0.00009000 
[0][159/783] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.0807(0.3959) Grad: 52241.5 LR: 0.00008866 
[0][179/783] Elapsed: 0m 59s (remain 3m 17s) Loss: 0.1681(0.3654) Grad: 118276.34375 LR: 0.00008733 
[0][199/783] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.1345(0.3401) Grad: 239503.15625 LR: 0.00008603 
[

Epoch: 1 - avg_train_loss = 0.1723 avg_val_loss: 0.1116 time: 298.0691s
Epoch: 1 - Score: 0.4732, Scores: [0.5072817129384404, 0.46210065919791316, 0.4237463020105452, 0.4790552254104742, 0.4940056783751439, 0.47330054804732125]
Epoch 1 - Save Best Score: 0.4732 Model


[1][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1217(0.1116) 




[1][19/783] Elapsed: 0m 6s (remain 4m 16s) Loss: 0.1337(0.0923) Grad: 406471.875 LR: 0.00005465 
[1][39/783] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.0615(0.0918) Grad: 155397.59375 LR: 0.00005383 
[1][59/783] Elapsed: 0m 19s (remain 3m 59s) Loss: 0.1862(0.0953) Grad: 570134.1875 LR: 0.00005303 
[1][79/783] Elapsed: 0m 26s (remain 3m 52s) Loss: 0.1171(0.0972) Grad: 134765.78125 LR: 0.00005224 
[1][99/783] Elapsed: 0m 32s (remain 3m 45s) Loss: 0.1341(0.0946) Grad: 104220.296875 LR: 0.00005146 
[1][119/783] Elapsed: 0m 39s (remain 3m 38s) Loss: 0.1559(0.0988) Grad: 188759.484375 LR: 0.00005069 
[1][139/783] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.1245(0.1021) Grad: 144619.40625 LR: 0.00004993 
[1][159/783] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.0851(0.1019) Grad: 167935.859375 LR: 0.00004918 
[1][179/783] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.1136(0.1020) Grad: 60241.53125 LR: 0.00004845 
[1][199/783] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.0877(0.1015) Grad: 97674.953125 LR: 0.00004773 

Epoch: 2 - avg_train_loss = 0.1034 avg_val_loss: 0.1105 time: 298.2368s
Epoch: 2 - Score: 0.4709, Scores: [0.5097393994866717, 0.4583003592146541, 0.42167372214148174, 0.47475991841713316, 0.4971775498916156, 0.4636462175937214]
Epoch 2 - Save Best Score: 0.4709 Model


[2][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1193(0.1105) 




[2][19/783] Elapsed: 0m 6s (remain 4m 15s) Loss: 0.0904(0.0877) Grad: 331471.6875 LR: 0.00003032 
[2][39/783] Elapsed: 0m 13s (remain 4m 6s) Loss: 0.0811(0.0905) Grad: 139117.1875 LR: 0.00002986 
[2][59/783] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.0311(0.0893) Grad: 105921.2890625 LR: 0.00002942 
[2][79/783] Elapsed: 0m 26s (remain 3m 52s) Loss: 0.1118(0.0890) Grad: 233044.46875 LR: 0.00002898 
[2][99/783] Elapsed: 0m 32s (remain 3m 45s) Loss: 0.1266(0.0893) Grad: 336077.03125 LR: 0.00002855 
[2][119/783] Elapsed: 0m 39s (remain 3m 38s) Loss: 0.0718(0.0901) Grad: 156779.109375 LR: 0.00002812 
[2][139/783] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.0903(0.0917) Grad: 195690.0625 LR: 0.00002770 
[2][159/783] Elapsed: 0m 52s (remain 3m 25s) Loss: 0.0870(0.0947) Grad: 336737.09375 LR: 0.00002729 
[2][179/783] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.0592(0.0961) Grad: 124671.0390625 LR: 0.00002688 
[2][199/783] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.0694(0.0954) Grad: 180973.984375 LR: 0.00002

Epoch: 3 - avg_train_loss = 0.0959 avg_val_loss: 0.1100 time: 298.2288s
Epoch: 3 - Score: 0.4698, Scores: [0.5018428062861521, 0.46051859801144474, 0.4207361539767949, 0.4803043873235634, 0.4910628834216279, 0.46456895700258427]
Epoch 3 - Save Best Score: 0.4698 Model


[3][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1282(0.1100) 




[3][19/783] Elapsed: 0m 6s (remain 4m 17s) Loss: 0.0840(0.0893) Grad: 258840.171875 LR: 0.00001682 
[3][39/783] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.0463(0.0876) Grad: 227343.984375 LR: 0.00001657 
[3][59/783] Elapsed: 0m 19s (remain 3m 59s) Loss: 0.1491(0.0894) Grad: 193837.609375 LR: 0.00001632 
[3][79/783] Elapsed: 0m 26s (remain 3m 52s) Loss: 0.0485(0.0860) Grad: 114184.0625 LR: 0.00001608 
[3][99/783] Elapsed: 0m 33s (remain 3m 45s) Loss: 0.0891(0.0852) Grad: 317885.28125 LR: 0.00001584 
[3][119/783] Elapsed: 0m 39s (remain 3m 38s) Loss: 0.0964(0.0865) Grad: 347556.03125 LR: 0.00001560 
[3][139/783] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.0765(0.0875) Grad: 137640.484375 LR: 0.00001537 
[3][159/783] Elapsed: 0m 52s (remain 3m 25s) Loss: 0.0997(0.0883) Grad: 144660.96875 LR: 0.00001514 
[3][179/783] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.0431(0.0886) Grad: 76030.2109375 LR: 0.00001491 
[3][199/783] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.0578(0.0879) Grad: 182684.453125 LR: 0.000

Epoch: 4 - avg_train_loss = 0.0936 avg_val_loss: 0.1101 time: 298.3316s
Epoch: 4 - Score: 0.4701, Scores: [0.5008543184146947, 0.4595366433153847, 0.4202019839327752, 0.47672153035588816, 0.49889343217902177, 0.46417296878183667]


[4][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1272(0.1101) 




[4][19/783] Elapsed: 0m 6s (remain 4m 14s) Loss: 0.0687(0.1025) Grad: 180960.5 LR: 0.00000933 
[4][39/783] Elapsed: 0m 13s (remain 4m 5s) Loss: 0.1016(0.0935) Grad: 311858.78125 LR: 0.00000919 
[4][59/783] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.0820(0.0917) Grad: 218155.21875 LR: 0.00000905 
[4][79/783] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.0348(0.0904) Grad: 121595.4765625 LR: 0.00000892 
[4][99/783] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.0770(0.0903) Grad: 264955.46875 LR: 0.00000879 
[4][119/783] Elapsed: 0m 39s (remain 3m 38s) Loss: 0.0658(0.0923) Grad: 93278.140625 LR: 0.00000865 
[4][139/783] Elapsed: 0m 46s (remain 3m 31s) Loss: 0.1984(0.0940) Grad: 215305.890625 LR: 0.00000853 
[4][159/783] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.0840(0.0946) Grad: 126004.7578125 LR: 0.00000840 
[4][179/783] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.1420(0.0945) Grad: 163811.53125 LR: 0.00000827 
[4][199/783] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.0714(0.0937) Grad: 327789.46875 LR: 0.0000081

Epoch: 5 - avg_train_loss = 0.0921 avg_val_loss: 0.1089 time: 298.7101s
Epoch: 5 - Score: 0.4674, Scores: [0.4983256182987699, 0.4586314146066606, 0.41808478689213147, 0.47783451718474224, 0.489226655992209, 0.46228337125976426]
Epoch 5 - Save Best Score: 0.4674 Model


[5][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.1290(0.1089) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
Score: 0.4674  Scores: [0.4983256182987699, 0.4586314146066606, 0.41808478689213147, 0.47783451718474224, 0.489226655992209, 0.46228337125976426]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing

[DEBUG] num_train_steps:  3916  len train loader:  784
warmup start:  19580




[0][19/784] Elapsed: 0m 6s (remain 4m 13s) Loss: 0.9150(1.8874) Grad: 306832.59375 LR: 0.00009851 
[0][39/784] Elapsed: 0m 13s (remain 4m 5s) Loss: 0.1798(1.0772) Grad: 103197.2265625 LR: 0.00009703 
[0][59/784] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.2238(0.7794) Grad: 113743.96875 LR: 0.00009559 
[0][79/784] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.1607(0.6291) Grad: 53743.42578125 LR: 0.00009416 
[0][99/784] Elapsed: 0m 32s (remain 3m 44s) Loss: 0.1921(0.5336) Grad: 177327.703125 LR: 0.00009275 
[0][119/784] Elapsed: 0m 39s (remain 3m 37s) Loss: 0.1657(0.4686) Grad: 193106.0 LR: 0.00009137 
[0][139/784] Elapsed: 0m 45s (remain 3m 31s) Loss: 0.1172(0.4208) Grad: 107261.2421875 LR: 0.00009000 
[0][159/784] Elapsed: 0m 52s (remain 3m 24s) Loss: 0.2447(0.3834) Grad: 138078.109375 LR: 0.00008866 
[0][179/784] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.1039(0.3538) Grad: 73933.2734375 LR: 0.00008733 
[0][199/784] Elapsed: 1m 5s (remain 3m 11s) Loss: 0.0937(0.3318) Grad: 143227.546875 LR: 0.00

Epoch: 1 - avg_train_loss = 0.1705 avg_val_loss: 0.1217 time: 298.4078s
Epoch: 1 - Score: 0.4946, Scores: [0.512257296535541, 0.46932700299306623, 0.47766331451387317, 0.47242218452273343, 0.4888122638558455, 0.5472840672497724]
Epoch 1 - Save Best Score: 0.4946 Model


[1][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0741(0.1217) 




[1][19/784] Elapsed: 0m 6s (remain 4m 16s) Loss: 0.0817(0.1303) Grad: 170529.453125 LR: 0.00005461 
[1][39/784] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.2020(0.1146) Grad: 271820.03125 LR: 0.00005379 
[1][59/784] Elapsed: 0m 19s (remain 3m 59s) Loss: 0.0852(0.1101) Grad: 174002.9375 LR: 0.00005299 
[1][79/784] Elapsed: 0m 26s (remain 3m 52s) Loss: 0.1564(0.1093) Grad: 273114.5 LR: 0.00005220 
[1][99/784] Elapsed: 0m 33s (remain 3m 45s) Loss: 0.1063(0.1090) Grad: 181513.0625 LR: 0.00005142 
[1][119/784] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.0784(0.1081) Grad: 223358.328125 LR: 0.00005065 
[1][139/784] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.0907(0.1081) Grad: 194976.453125 LR: 0.00004989 
[1][159/784] Elapsed: 0m 52s (remain 3m 25s) Loss: 0.0884(0.1085) Grad: 249102.328125 LR: 0.00004915 
[1][179/784] Elapsed: 0m 59s (remain 3m 19s) Loss: 0.1112(0.1060) Grad: 195232.890625 LR: 0.00004841 
[1][199/784] Elapsed: 1m 5s (remain 3m 12s) Loss: 0.0555(0.1063) Grad: 143412.671875 LR: 0.00004769

Epoch: 2 - avg_train_loss = 0.1027 avg_val_loss: 0.1081 time: 299.1191s
Epoch: 2 - Score: 0.4657, Scores: [0.4956899493437762, 0.448770801376623, 0.41927404626869436, 0.4629767982400117, 0.4842734326208937, 0.4832995788499807]
Epoch 2 - Save Best Score: 0.4657 Model


[2][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0919(0.1081) 




[2][19/784] Elapsed: 0m 6s (remain 4m 16s) Loss: 0.1359(0.0930) Grad: 150641.84375 LR: 0.00003027 
[2][39/784] Elapsed: 0m 13s (remain 4m 7s) Loss: 0.0676(0.0945) Grad: 174614.46875 LR: 0.00002982 
[2][59/784] Elapsed: 0m 19s (remain 4m 0s) Loss: 0.0877(0.0940) Grad: 226332.6875 LR: 0.00002937 
[2][79/784] Elapsed: 0m 26s (remain 3m 53s) Loss: 0.0962(0.0974) Grad: 180798.25 LR: 0.00002894 
[2][99/784] Elapsed: 0m 33s (remain 3m 46s) Loss: 0.2150(0.0997) Grad: 501478.84375 LR: 0.00002850 
[2][119/784] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.1920(0.1001) Grad: 250127.828125 LR: 0.00002808 
[2][139/784] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.0442(0.1000) Grad: 220569.421875 LR: 0.00002766 
[2][159/784] Elapsed: 0m 52s (remain 3m 26s) Loss: 0.0673(0.0987) Grad: 185182.75 LR: 0.00002725 
[2][179/784] Elapsed: 0m 59s (remain 3m 19s) Loss: 0.0531(0.0992) Grad: 171411.421875 LR: 0.00002684 
[2][199/784] Elapsed: 1m 5s (remain 3m 12s) Loss: 0.0704(0.0985) Grad: 129025.3359375 LR: 0.00002644 
[

Epoch: 3 - avg_train_loss = 0.0972 avg_val_loss: 0.1059 time: 299.2936s
Epoch: 3 - Score: 0.4608, Scores: [0.4859723639517774, 0.4464138210746739, 0.4177311566650099, 0.4650511648552315, 0.486525219443456, 0.46285465918229857]
Epoch 3 - Save Best Score: 0.4608 Model


[3][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0778(0.1059) 




[3][19/784] Elapsed: 0m 6s (remain 4m 17s) Loss: 0.0751(0.0933) Grad: 176688.625 LR: 0.00001678 
[3][39/784] Elapsed: 0m 13s (remain 4m 8s) Loss: 0.0624(0.0909) Grad: 189520.34375 LR: 0.00001653 
[3][59/784] Elapsed: 0m 19s (remain 4m 0s) Loss: 0.1054(0.0911) Grad: 310128.71875 LR: 0.00001628 
[3][79/784] Elapsed: 0m 26s (remain 3m 53s) Loss: 0.1177(0.0926) Grad: 199889.15625 LR: 0.00001604 
[3][99/784] Elapsed: 0m 33s (remain 3m 46s) Loss: 0.2259(0.0944) Grad: 321449.5625 LR: 0.00001580 
[3][119/784] Elapsed: 0m 39s (remain 3m 39s) Loss: 0.1211(0.0933) Grad: 210641.453125 LR: 0.00001557 
[3][139/784] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.0345(0.0926) Grad: 150149.53125 LR: 0.00001533 
[3][159/784] Elapsed: 0m 52s (remain 3m 26s) Loss: 0.0744(0.0934) Grad: 160239.21875 LR: 0.00001510 
[3][179/784] Elapsed: 0m 59s (remain 3m 19s) Loss: 0.1161(0.0943) Grad: 149232.59375 LR: 0.00001488 
[3][199/784] Elapsed: 1m 5s (remain 3m 12s) Loss: 0.1455(0.0948) Grad: 204573.4375 LR: 0.00001466 
[3

Epoch: 4 - avg_train_loss = 0.0936 avg_val_loss: 0.1089 time: 299.3237s
Epoch: 4 - Score: 0.4673, Scores: [0.49827034911734924, 0.4621370416893894, 0.4208501849435237, 0.46857882700439024, 0.4850973720746817, 0.469016626991304]


[4][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0712(0.1089) 




[4][19/784] Elapsed: 0m 6s (remain 4m 14s) Loss: 0.0925(0.0964) Grad: 200985.03125 LR: 0.00000930 
[4][39/784] Elapsed: 0m 13s (remain 4m 5s) Loss: 0.0773(0.0922) Grad: 82997.375 LR: 0.00000916 
[4][59/784] Elapsed: 0m 19s (remain 3m 58s) Loss: 0.0655(0.0924) Grad: 116106.9921875 LR: 0.00000903 
[4][79/784] Elapsed: 0m 26s (remain 3m 51s) Loss: 0.0605(0.0913) Grad: 61096.40234375 LR: 0.00000889 
[4][99/784] Elapsed: 0m 32s (remain 3m 45s) Loss: 0.0519(0.0920) Grad: 85458.9140625 LR: 0.00000876 
[4][119/784] Elapsed: 0m 39s (remain 3m 38s) Loss: 0.0801(0.0924) Grad: 189732.9375 LR: 0.00000863 
[4][139/784] Elapsed: 0m 46s (remain 3m 32s) Loss: 0.1062(0.0908) Grad: 148996.5625 LR: 0.00000850 
[4][159/784] Elapsed: 0m 52s (remain 3m 25s) Loss: 0.0471(0.0893) Grad: 80307.5546875 LR: 0.00000837 
[4][179/784] Elapsed: 0m 59s (remain 3m 18s) Loss: 0.0547(0.0886) Grad: 72383.2578125 LR: 0.00000825 
[4][199/784] Elapsed: 1m 5s (remain 3m 12s) Loss: 0.0653(0.0882) Grad: 87686.078125 LR: 0.000008

Epoch: 5 - avg_train_loss = 0.0933 avg_val_loss: 0.1049 time: 299.1705s
Epoch: 5 - Score: 0.4587, Scores: [0.4854830223515189, 0.4438361967028614, 0.41789888625461663, 0.46396889856107, 0.48186613262388917, 0.45892999541854645]
Epoch 5 - Save Best Score: 0.4587 Model


[5][194/194] Elapsed: 0m 41s (remain -1m 59s) Loss: 0.0867(0.1049) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
Score: 0.4587  Scores: [0.4854830223515189, 0.4438361967028614, 0.41789888625461663, 0.46396889856107, 0.48186613262388917, 0.45892999541854645]
Score: 0.4550  Scores: [0.4843101272329265, 0.4469884731288726, 0.41886618468616554, 0.4594054979227242, 0.4713665400029151, 0.4492003125655974]
