In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd drive/MyDrive/kaggle/notebook

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive/kaggle/notebook'
/content/drive/MyDrive/kaggle/notebook


In [None]:
%%capture
!pip install transformers==4.20.1
!pip install tokenizers==0.12.1
!pip install sentencepiece==0.1.97

In [None]:
import os
import gc
import time
import math
import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


import transformers
import tokenizers
from transformers import AutoModel, AutoTokenizer
from transformers import get_cosine_schedule_with_warmup

# iterative-stratification supplies `iterstrat`, which a later cell imports.
# Fail right here if the install did not succeed instead of surfacing later
# as an ImportError far away from the cause.
if os.system('pip install iterative-stratification==0.1.7') != 0:
    raise RuntimeError("pip install iterative-stratification==0.1.7 failed")

os.environ['TOKENIZERS_PARALLELISM']='true'

# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class CFG:
    """Central experiment configuration, read as plain class attributes."""

    # Pretrained checkpoint used for both the tokenizer and the backbone.
    model_name = "microsoft/deberta-v3-large"

    # --- data -------------------------------------------------------------
    base = "../input/feedback-prize-english-language-learning/"
    train = f"{base}train.csv"
    test = f"{base}test.csv"
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    num_targets = len(target_cols)  # one regression output per target column

    # --- backbone / head --------------------------------------------------
    max_length = 512                # token length every text is padded/truncated to
    hidden_dims = 1024              # feature width used when building the head
    output_hidden_states = True     # forwarded to AutoModel.from_pretrained
    pool = 'attention'              # 'mean' or 'attention'

    # --- loss / mixed precision ------------------------------------------
    loss = 'L1smooth'
    apex = True                     # toggles autocast and GradScaler in train_fn
    max_norm = 300                  # threshold handed to clip_grad_norm_

    # --- optimizer --------------------------------------------------------
    default_lr = 2e-5
    head_lr = 3e-4
    adam_eps = 1e-5

    # --- scheduler --------------------------------------------------------
    scheduler = 'linear'            # name handed to transformers.get_scheduler
    num_warmup_steps = 0.333333

    # --- cross-validation -------------------------------------------------
    n_folds = 5
    seed = 42
    epochs = 5

    # --- data loading / logging ------------------------------------------
    batch_size = 2
    num_workers = 2
    print_freq = 20                 # print progress every `print_freq` steps

    # Timestamp taken once, when the class body is executed.
    str_now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

In [None]:
# Tag the run with its start time plus the model name; for an "org/name"
# checkpoint only the part after the slash is used.
CFG.identifier = "{}-{}".format(
    CFG.str_now,
    CFG.model_name.split("/")[1] if CFG.model_name.count("/") == 1 else CFG.model_name,
)

# Directory that receives this run's checkpoints (created if missing).
CFG.OUTPUT_DIR = './' + CFG.identifier + '/'
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)

In [None]:
# Load the training table from the CSV path configured in CFG.train.
df_train = pd.read_csv(CFG.train)

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# Split the rows into CFG.n_folds validation folds, stratified jointly on all
# target columns (each distinct score becomes its own indicator column in `y`).
fold = MultilabelStratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
df   = df_train.copy()
y    = pd.get_dummies(data = df[CFG.target_cols], columns = CFG.target_cols)

# Start from an integer sentinel so the column keeps an integer dtype; creating
# it through partial assignment would leave NaN gaps and turn it into floats.
df_train['fold'] = -1

for idx, (train_idx, valid_idx) in enumerate(fold.split(df_train, y)):
    # The splitter yields positional indices, so assign by position rather
    # than by index label.
    df_train.iloc[valid_idx, df_train.columns.get_loc('fold')] = idx

assert (df_train['fold'] >= 0).all(), "some rows were not assigned to a validation fold"

In [None]:
# Load the tokenizer that matches the backbone checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
# Register the newline character as an additional special token.
# NOTE(review): FB3Model in this notebook never calls resize_token_embeddings;
# confirm the new token id fits the backbone's embedding matrix.
tokenizer.add_tokens(["\n"], special_tokens=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [None]:
class AverageMeter(object):
    """Keeps the most recent value of a metric together with its running mean.

    Attributes:
        val:   value passed to the latest `update` call.
        sum:   weighted sum of every value seen since the last reset.
        count: total weight seen since the last reset.
        avg:   `sum / count`.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Drop everything accumulated so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n = 1):
        """Record `val`, weighted as `n` observations, and refresh the mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration given in seconds as the string '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '{}m {}s'.format(int(minutes), int(seconds))
        
def timeSince(since, percent):
    """Describe elapsed and estimated remaining time of a running job.

    Args:
        since:   `time.time()` timestamp taken when the job started.
        percent: fraction of the job finished so far; must be non-zero.

    Returns:
        A string of the form '<elapsed> (remain <remaining>)', where the
        remaining time is extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '{} (remain {})'.format(asMinutes(elapsed), asMinutes(remaining))
        
def prepare_inputs(texts):
    """Tokenize every text into fixed-length tensors.

    Uses the notebook-level `tokenizer` and pads/truncates each text to
    `CFG.max_length` tokens.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one tokenizer output per text; every value in it has been
        converted to a `torch.long` tensor.
    """
    inputs = []
    for text in texts:
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        tokens = tokenizer.encode_plus(text,
                                       add_special_tokens = True,
                                       max_length = CFG.max_length,
                                       padding = 'max_length',
                                       truncation = True,
                                       return_attention_mask = True
                                       )

        # Existing keys are only overwritten, so the mapping keeps its size
        # while it is being iterated.
        for key, vals in tokens.items():
            tokens[key] = torch.tensor(vals, dtype = torch.long)
        inputs.append(tokens)

    return inputs

class MeanPooling(nn.Module):
    """Mask-aware average over the sequence dimension (dim 1)."""

    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, inputs, attention_masks):
        """Average `inputs` over the positions whose mask entry is 1.

        Args:
            inputs:          tensor indexed as (batch, position, feature).
            attention_masks: 0/1 tensor indexed as (batch, position).

        Returns:
            Tensor indexed as (batch, feature). A row whose mask is all zeros
            yields zeros instead of NaN.
        """
        mask = attention_masks.unsqueeze(-1).float()
        summed = torch.sum(inputs * mask, dim = 1)
        # torch.clamp is not in-place: its result must be kept, otherwise a
        # fully masked row divides by zero.
        counts = torch.clamp(mask.sum(dim = 1), min = 1e-9)

        return summed / counts

class AttentionPooling(nn.Module):
    """Learned weighted sum over the sequence dimension.

    A small MLP scores every position, masked positions are excluded, and the
    softmax-normalised scores weight the hidden states.
    """

    def __init__(self, hidden_size):
        super(AttentionPooling, self).__init__()
        # Scorer: hidden_size -> hidden_size -> 1 with a GELU in between.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, hidden_state, attention_mask):
        """Pool `hidden_state` (batch, position, feature) into (batch, feature).

        Positions where `attention_mask` (batch, position) is 0 get a score of
        -inf and therefore a softmax weight of zero.
        """
        scores = self.attention(hidden_state)
        padded = (attention_mask == 0).unsqueeze(-1)
        scores = scores.masked_fill(padded, float("-inf"))
        probs = torch.softmax(scores, dim = 1)

        return (hidden_state * probs).sum(dim = 1)

def MCRMSE(labels, preds):
    """Mean column-wise root mean squared error.

    The RMSE is computed with numpy so the result does not depend on the
    `squared=False` argument of `sklearn.metrics.mean_squared_error`, which
    newer scikit-learn releases no longer accept.

    Args:
        labels: 2-D array-like indexed as (sample, target).
        preds:  2-D array-like with the same shape as `labels`.

    Returns:
        (mcrmse, scores): the mean of the per-column RMSEs as a float, and the
        list of per-column RMSEs.

    Raises:
        ValueError: if the inputs are not 2-D, differ in shape, or are empty.
    """
    labels = np.asarray(labels, dtype = np.float64)
    preds = np.asarray(preds, dtype = np.float64)

    if labels.ndim != 2 or preds.ndim != 2:
        raise ValueError("labels and preds must be 2-D (sample, target) arrays")
    if labels.shape != preds.shape:
        raise ValueError(f"shape mismatch: labels {labels.shape} vs preds {preds.shape}")
    if labels.size == 0:
        raise ValueError("labels and preds must not be empty")

    # One RMSE per target column, averaged afterwards.
    scores = np.sqrt(np.mean((labels - preds) ** 2, axis = 0)).tolist()
    mcrmse = float(np.mean(scores))

    return mcrmse, scores

def get_score(labels, preds):
    """Score predictions against labels; delegates to `MCRMSE` and returns its result unchanged."""
    return MCRMSE(labels, preds)
        

In [None]:
class FB3Dataset(Dataset):
    """Dataset pairing each tokenized essay with its target scores.

    All texts are tokenized once, up front, when the dataset is built.
    """

    def __init__(self, df_train):
        """Tokenize the `full_text` column and tensorise the `CFG.target_cols` columns."""
        texts = df_train['full_text']
        targets = df_train[CFG.target_cols].values
        self.inputs = prepare_inputs(texts)
        self.labels = torch.tensor(targets, dtype = torch.float)

    def __len__(self):
        """Number of samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the `(inputs, labels)` pair stored at position `idx`."""
        sample = (self.inputs[idx], self.labels[idx])
        return sample

In [None]:
class FB3Model(nn.Module):
    """Pretrained transformer backbone followed by a pooling layer and a linear head.

    Two heads are supported, selected by `cfg.pool`:

    * 'mean'      - masked mean of the last hidden state, then a linear layer.
    * 'attention' - the last four hidden states concatenated along the feature
                    axis, passed through a bidirectional LSTM, attention-pooled,
                    then a linear layer.
    """

    def __init__(self, cfg):
        """Build the backbone and the head selected by `cfg.pool`.

        Raises:
            ValueError: if `cfg.pool` is neither 'mean' nor 'attention'.
        """
        super().__init__()
        # Reject unknown pooling names before loading any weights; otherwise the
        # model would be built without a head and only fail in the forward pass.
        if cfg.pool not in ('mean', 'attention'):
            raise ValueError(f"Unsupported pool type: {cfg.pool!r} (expected 'mean' or 'attention')")

        self.model = AutoModel.from_pretrained(cfg.model_name,
                                               output_hidden_states=cfg.output_hidden_states)
        self.cfg = cfg

        if self.cfg.pool == 'mean':
            self.pool = MeanPooling()
            self.classifier = nn.Linear(cfg.hidden_dims, cfg.num_targets)
        else:
            # Four hidden states are concatenated in `feature`, hence 4x width.
            hidden_size = cfg.hidden_dims * 4
            # Bidirectional with half-width directions, so the LSTM output
            # width equals its input width.
            self.lstm = nn.LSTM(input_size = hidden_size,
                                hidden_size = hidden_size // 2,
                                batch_first = True,
                                bidirectional = True)
            self.pool = AttentionPooling(hidden_size)
            self.classifier = nn.Linear(hidden_size, cfg.num_targets)

    def feature(self, inputs):
        """Run the backbone and pool its output into one vector per sample.

        Args:
            inputs: mapping of tokenizer outputs; must contain 'attention_mask'
                and is unpacked into the backbone call.

        Raises:
            ValueError: if `cfg.pool` was changed to an unsupported value after
                construction.
        """
        outputs = self.model(**inputs)

        if self.cfg.pool == 'mean':
            feature = self.pool(outputs.last_hidden_state, inputs['attention_mask'])
        elif self.cfg.pool == 'attention':
            hidden_states = outputs.hidden_states
            # Last four hidden states, most recent first, joined on the feature axis.
            stacked_outputs = torch.cat([hidden_states[-i-1] for i in range(4)], dim = -1)
            feature, _ = self.lstm(stacked_outputs)
            feature = self.pool(feature, inputs['attention_mask'])
        else:
            raise ValueError(f"Unsupported pool type: {self.cfg.pool!r} (expected 'mean' or 'attention')")

        return feature

    def forward(self, inputs):
        """Return one prediction per target (`cfg.num_targets` values) for each sample."""
        feature = self.feature(inputs)
        outputs = self.classifier(feature)

        return outputs
    

In [None]:
def train_fn(model, criterion, optimizer, scheduler, train_loader, epoch, cfg):
    """Train `model` for one epoch with optional mixed precision.

    Args:
        model:        network called as `model(inputs)`.
        criterion:    loss called as `criterion(outputs, labels)`.
        optimizer:    optimizer stepped once per batch.
        scheduler:    LR scheduler stepped once per batch.
        train_loader: iterable yielding `(inputs, labels)`; `inputs` is a
            mapping of tensors.
        epoch:        epoch index, used only in the progress line.
        cfg:          config providing `apex`, `max_norm` and `print_freq`.

    Returns:
        The sample-weighted average training loss of the epoch.
    """
    losses = AverageMeter()
    total_steps = len(train_loader)
    start = time.time()
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.apex)

    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)

        with torch.cuda.amp.autocast(enabled = cfg.apex):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        losses.update(loss.item(), labels.shape[0])

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale at this point.
        # Unscale them first so `max_norm` (and the logged norm) refer to the
        # true gradient magnitude; this is a no-op when the scaler is disabled.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_norm)

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

        if (step + 1) % cfg.print_freq == 0:
            print("[{0}][{1}/{2}] "
                  "Elapsed: {remain:s} "
                  "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                  "Grad: {grad_norm} "
                  "LR: {lr:.8f} "
                  .format(epoch, step, total_steps, remain = timeSince(start, float(step+1)/total_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported way to read the LR
                          # outside of scheduler.step(); index 0 is the first
                          # parameter group.
                          lr = scheduler.get_last_lr()[0]
                          )
                 )

    return losses.avg

def valid_fn(model, criterion, valid_loader, epoch, cfg):
    """Evaluate `model` on the validation loader without gradient tracking.

    Args:
        model:        network called as `model(inputs)`.
        criterion:    loss called as `criterion(preds, labels)`.
        valid_loader: iterable yielding `(inputs, labels)`; `inputs` is a
            mapping of tensors.
        epoch:        epoch index, used only in the progress line.
        cfg:          config providing `print_freq`.

    Returns:
        `(avg_loss, predictions)`: the sample-weighted average loss and a numpy
        array with the predictions of every batch concatenated along axis 0.
    """
    losses = AverageMeter()
    predictions = []
    num_batches = len(valid_loader)
    last_step = num_batches - 1  # index of the final batch, shown in the log
    model.eval()
    start = time.time()

    for step, (inputs, labels) in enumerate(valid_loader):

        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            preds = model(inputs)
            loss = criterion(preds, labels)

        losses.update(loss.item(), labels.shape[0])

        predictions.append(preds.detach().cpu().numpy())

        if (step + 1) % cfg.print_freq == 0 or step == last_step:
            # Progress is measured against the number of batches: dividing by
            # `num_batches - 1` fails for a single-batch loader and reports
            # more than 100% on the last step.
            print("[{0}][{1}/{2}] "
                  "Elapsed: {remain:s} "
                  "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                  .format(epoch, step, last_step, remain = timeSince(start, float(step+1)/num_batches),
                          loss = losses,
                          )
                 )

    predictions = np.concatenate(predictions, axis = 0)

    return losses.avg, predictions

In [None]:
from collections import defaultdict

def train_loop(folds, fold, cfg):
    """Train and validate the model on one cross-validation fold.

    Rows whose 'fold' column equals `fold` form the validation set; all other
    rows are used for training. A checkpoint (model weights and validation
    predictions) is written every time the validation score improves.

    Args:
        folds: DataFrame with 'full_text', the `cfg.target_cols` columns and a
            'fold' column.
        fold:  id of the fold held out for validation.
        cfg:   experiment configuration.

    Returns:
        `(best_train_loss, best_val_loss, best_score)` taken from the epoch
        with the lowest validation score.
    """
    train_folds  = folds[folds['fold'] != fold]
    valid_folds  = folds[folds['fold'] == fold]
    valid_labels = valid_folds[cfg.target_cols].values

    train_dataset = FB3Dataset(train_folds)
    valid_dataset = FB3Dataset(valid_folds)

    train_loader  = DataLoader(train_dataset,
                               batch_size  = cfg.batch_size,
                               shuffle = True,
                               pin_memory = True,
                               num_workers = cfg.num_workers)

    valid_loader  = DataLoader(valid_dataset,
                               batch_size = cfg.batch_size,
                               shuffle = False,
                               pin_memory = True,
                               num_workers = cfg.num_workers)

    def get_optimizer(cfg, model):
        """Build AdamW with per-group learning rates and selective weight decay."""
        all_parameters = list(model.named_parameters())
        used_name_parameters = set()
        params = []
        # Parameters whose name contains one of these get no weight decay.
        no_wd = ['word_embeddings', 'bias', 'LayerNorm.weight']

        def is_no_wd(name):
            return any(no in name for no in no_wd)

        def add_param_groups(named_parameters, lr):
            # Two optimizer groups per call: with and without weight decay.
            used_name_parameters.update(n for n, _ in named_parameters)
            params.append({"params": [p for n, p in named_parameters if not is_no_wd(n)],
                           "weight_decay": 0.01, "lr": lr})
            params.append({"params": [p for n, p in named_parameters if is_no_wd(n)],
                           "weight_decay": 0.0, "lr": lr})

        # Head = everything outside the `model` (backbone) attribute. It is
        # added first, so optimizer group 0 carries the head learning rate.
        head = [(n, p) for n, p in all_parameters if 'model' not in n]
        add_param_groups(head, cfg.head_lr)

        backbone = [(n, p) for n, p in all_parameters if 'model' in n]

        # Name fragments -> learning rate, applied to backbone parameters.
        # NOTE(review): the last encoder layer trains at cfg.head_lr and
        # cfg.default_lr is not used here - confirm this is intended.
        groups = [
            [ [".embeddings."],                           1e-6],
            [ ["encoder.LayerNorm", "rel_embeddings"],    1e-6],
            [ ["." + str(i) + "." for i in range(0,6)],   1e-8],
            [ ["." + str(i) + "." for i in range(6,12)],  1e-7],
            [ ["." + str(i) + "." for i in range(12,23)], 1e-6],
            [ [".23."], cfg.head_lr]
        ]

        for names, lr in groups:
            print("names: ", names)
            matched = [(n, p) for n, p in backbone if any(name in n for name in names)]
            add_param_groups(matched, lr)

        # Report any parameter that ended up in more than one group.
        param_dict = defaultdict(int)
        for param in params:
            for _param in param['params']:
                param_dict[id(_param)] += 1

        for key, val in param_dict.items():
            if val > 1: print("key: ", key, " val: ", val)

        # Every model parameter must belong to some group.
        missing = {n for n, _ in all_parameters} - used_name_parameters
        assert len(missing) == 0, f"Missing parameters: {str(missing)}"

        return torch.optim.AdamW(params, eps = cfg.adam_eps)

    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the LR schedule spanning all epochs.

        `num_train_steps` is the number of optimizer steps in one epoch.
        """
        total_steps = int(cfg.epochs * num_train_steps)
        # NOTE(review): a value below 1 is treated as a fraction of the total
        # number of steps, anything else as an absolute step count - confirm.
        if cfg.num_warmup_steps < 1:
            warmup_steps = int(cfg.num_warmup_steps * total_steps)
        else:
            warmup_steps = int(cfg.num_warmup_steps)
        print("warmup steps: ", warmup_steps, " total steps: ", total_steps)
        # Keyword arguments keep warm-up and training length from being mixed
        # up; passing the total as warm-up makes the LR ramp up for the whole
        # run and never decay.
        scheduler = transformers.get_scheduler(cfg.scheduler, optimizer,
                                               num_warmup_steps = warmup_steps,
                                               num_training_steps = total_steps)

        return scheduler

    model = FB3Model(cfg).to(device)
    criterion = nn.SmoothL1Loss(reduction = 'mean')
    optimizer = get_optimizer(cfg, model)
    scheduler = get_scheduler(cfg, optimizer, len(train_loader))

    best_score = np.inf
    best_val_loss = np.inf
    best_train_loss = np.inf

    for epoch in range(cfg.epochs):

        start = time.time()

        avg_loss = train_fn(model, criterion, optimizer, scheduler, train_loader, epoch, cfg)

        avg_val_loss, preds = valid_fn(model, criterion, valid_loader, epoch, cfg)
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start

        print(f"Epoch: {epoch+1} - avg_train_loss = {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.4f}s")
        print(f"Epoch: {epoch+1} - Score: {score:.4f}, Scores: {scores}")

        if best_score > score:
            best_score = score
            best_train_loss = avg_loss
            best_val_loss = avg_val_loss

            torch.save({'model': model.state_dict(),
                        'predictions': preds},
                       CFG.OUTPUT_DIR + f"{CFG.model_name.replace('/', '-')}_fold{fold}_epoch{epoch+1}.pth")

    # Drop the references first so the cached GPU memory can really be released.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

    return best_train_loss, best_val_loss, best_score

In [None]:
# Train every fold and keep each fold's best metrics, so the cross-validation
# result is not lost when the loop variables are overwritten by the next fold.
fold_results = []
for fold in range(CFG.n_folds):
    best_train_loss, best_val_loss, best_score = train_loop(df_train, fold, CFG)
    fold_results.append({'fold': fold,
                         'train_loss': best_train_loss,
                         'val_loss': best_val_loss,
                         'score': best_score})

if fold_results:
    cv_score = float(np.mean([result['score'] for result in fold_results]))
    print(f"CV score over {len(fold_results)} folds: {cv_score:.4f}")

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


names:  ['.embeddings.']
names:  ['encoder.LayerNorm', 'rel_embeddings']
names:  ['.0.', '.1.', '.2.', '.3.', '.4.', '.5.']
names:  ['.6.', '.7.', '.8.', '.9.', '.10.', '.11.']
names:  ['.12.', '.13.', '.14.', '.15.', '.16.', '.17.', '.18.', '.19.', '.20.', '.21.', '.22.']
names:  ['.23.']
warmup start:  7825




[0][19/1565] Elapsed: 0m 20s (remain 26m 59s) Loss: 2.6929(2.4295) Grad: 419478.875 LR: 0.00000077 
[0][39/1565] Elapsed: 0m 41s (remain 26m 23s) Loss: 2.3067(2.4609) Grad: 417741.875 LR: 0.00000153 
[0][59/1565] Elapsed: 1m 1s (remain 25m 53s) Loss: 2.1510(2.4950) Grad: 419321.65625 LR: 0.00000230 
[0][79/1565] Elapsed: 1m 22s (remain 25m 28s) Loss: 2.6312(2.4824) Grad: 415949.15625 LR: 0.00000307 
[0][99/1565] Elapsed: 1m 42s (remain 25m 6s) Loss: 2.5555(2.4301) Grad: 401013.4375 LR: 0.00000383 
[0][119/1565] Elapsed: 2m 3s (remain 24m 45s) Loss: 2.0924(2.4091) Grad: 415186.6875 LR: 0.00000460 
[0][139/1565] Elapsed: 2m 23s (remain 24m 23s) Loss: 2.8557(2.3796) Grad: 419205.03125 LR: 0.00000537 
[0][159/1565] Elapsed: 2m 44s (remain 24m 2s) Loss: 1.7566(2.3601) Grad: 414584.375 LR: 0.00000613 
[0][179/1565] Elapsed: 3m 4s (remain 23m 41s) Loss: 2.2624(2.3205) Grad: 435310.8125 LR: 0.00000690 
[0][199/1565] Elapsed: 3m 25s (remain 23m 20s) Loss: 1.8345(2.2596) Grad: 424898.3125 LR: 0.



[1][19/1565] Elapsed: 0m 20s (remain 26m 39s) Loss: 0.0684(0.1264) Grad: 164123.140625 LR: 0.00006077 
[1][39/1565] Elapsed: 0m 41s (remain 26m 14s) Loss: 0.2191(0.1222) Grad: 239148.359375 LR: 0.00006153 
[1][59/1565] Elapsed: 1m 1s (remain 25m 50s) Loss: 0.0749(0.1193) Grad: 147209.296875 LR: 0.00006230 
[1][79/1565] Elapsed: 1m 22s (remain 25m 26s) Loss: 0.2401(0.1169) Grad: 169666.5625 LR: 0.00006307 
[1][99/1565] Elapsed: 1m 42s (remain 25m 4s) Loss: 0.1901(0.1178) Grad: 354363.4375 LR: 0.00006383 
[1][119/1565] Elapsed: 2m 3s (remain 24m 43s) Loss: 0.1935(0.1147) Grad: 418134.875 LR: 0.00006460 
[1][139/1565] Elapsed: 2m 23s (remain 24m 22s) Loss: 0.1543(0.1148) Grad: 150467.625 LR: 0.00006537 
[1][159/1565] Elapsed: 2m 44s (remain 24m 1s) Loss: 0.0521(0.1148) Grad: 124627.578125 LR: 0.00006613 
[1][179/1565] Elapsed: 3m 4s (remain 23m 40s) Loss: 0.0815(0.1141) Grad: 204087.84375 LR: 0.00006690 
[1][199/1565] Elapsed: 3m 25s (remain 23m 19s) Loss: 0.0338(0.1121) Grad: 219677.7812



[2][19/1565] Elapsed: 0m 20s (remain 26m 41s) Loss: 0.0647(0.0769) Grad: 132411.796875 LR: 0.00012077 
[2][39/1565] Elapsed: 0m 41s (remain 26m 16s) Loss: 0.1076(0.0963) Grad: 185489.09375 LR: 0.00012153 
[2][59/1565] Elapsed: 1m 1s (remain 25m 50s) Loss: 0.0946(0.1057) Grad: 287660.25 LR: 0.00012230 
[2][79/1565] Elapsed: 1m 22s (remain 25m 26s) Loss: 0.2240(0.1096) Grad: 670586.5 LR: 0.00012307 
[2][99/1565] Elapsed: 1m 42s (remain 25m 3s) Loss: 0.0957(0.1117) Grad: 182390.015625 LR: 0.00012383 
[2][119/1565] Elapsed: 2m 3s (remain 24m 42s) Loss: 0.0594(0.1094) Grad: 169186.71875 LR: 0.00012460 
[2][139/1565] Elapsed: 2m 23s (remain 24m 21s) Loss: 0.0797(0.1097) Grad: 180845.984375 LR: 0.00012537 
[2][159/1565] Elapsed: 2m 44s (remain 24m 0s) Loss: 0.0723(0.1094) Grad: 249915.421875 LR: 0.00012613 
[2][179/1565] Elapsed: 3m 4s (remain 23m 39s) Loss: 0.0386(0.1086) Grad: 123311.9921875 LR: 0.00012690 
[2][199/1565] Elapsed: 3m 24s (remain 23m 18s) Loss: 0.0393(0.1088) Grad: 189097.203



[3][19/1565] Elapsed: 0m 20s (remain 26m 30s) Loss: 0.0971(0.1152) Grad: 253376.15625 LR: 0.00018077 
[3][39/1565] Elapsed: 0m 41s (remain 26m 5s) Loss: 0.2359(0.1188) Grad: 587455.6875 LR: 0.00018153 
[3][59/1565] Elapsed: 1m 1s (remain 25m 42s) Loss: 0.0727(0.1060) Grad: 159275.59375 LR: 0.00018230 
[3][79/1565] Elapsed: 1m 22s (remain 25m 22s) Loss: 0.0776(0.1073) Grad: 225189.984375 LR: 0.00018307 
[3][99/1565] Elapsed: 1m 42s (remain 25m 1s) Loss: 0.1114(0.1057) Grad: 380754.3125 LR: 0.00018383 
[3][119/1565] Elapsed: 2m 3s (remain 24m 42s) Loss: 0.1409(0.1054) Grad: 292008.34375 LR: 0.00018460 
[3][139/1565] Elapsed: 2m 23s (remain 24m 21s) Loss: 0.0873(0.1057) Grad: 187182.359375 LR: 0.00018537 
[3][159/1565] Elapsed: 2m 44s (remain 24m 0s) Loss: 0.1578(0.1061) Grad: 246976.03125 LR: 0.00018613 
[3][179/1565] Elapsed: 3m 4s (remain 23m 40s) Loss: 0.0098(0.1067) Grad: 59560.61328125 LR: 0.00018690 
[3][199/1565] Elapsed: 3m 25s (remain 23m 19s) Loss: 0.1819(0.1085) Grad: 319569.7



[4][19/1565] Elapsed: 0m 20s (remain 26m 35s) Loss: 0.1068(0.1393) Grad: 428486.5 LR: 0.00024077 
[4][39/1565] Elapsed: 0m 41s (remain 26m 13s) Loss: 0.1615(0.1349) Grad: 379528.6875 LR: 0.00024153 
[4][59/1565] Elapsed: 1m 1s (remain 25m 48s) Loss: 0.2405(0.1392) Grad: 549583.0625 LR: 0.00024230 
[4][79/1565] Elapsed: 1m 22s (remain 25m 25s) Loss: 0.0942(0.1326) Grad: 265684.5625 LR: 0.00024307 
[4][99/1565] Elapsed: 1m 42s (remain 25m 3s) Loss: 0.1987(0.1321) Grad: 197711.421875 LR: 0.00024383 
[4][119/1565] Elapsed: 2m 3s (remain 24m 42s) Loss: 0.0650(0.1327) Grad: 110972.03125 LR: 0.00024460 
[4][139/1565] Elapsed: 2m 23s (remain 24m 21s) Loss: 0.0368(0.1262) Grad: 72889.9140625 LR: 0.00024537 
[4][159/1565] Elapsed: 2m 44s (remain 24m 0s) Loss: 0.0497(0.1255) Grad: 97169.609375 LR: 0.00024613 
[4][179/1565] Elapsed: 3m 4s (remain 23m 39s) Loss: 0.1176(0.1246) Grad: 291356.65625 LR: 0.00024690 
[4][199/1565] Elapsed: 3m 25s (remain 23m 19s) Loss: 0.0813(0.1235) Grad: 230435.796875 

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


names:  ['.embeddings.']
names:  ['encoder.LayerNorm', 'rel_embeddings']
names:  ['.0.', '.1.', '.2.', '.3.', '.4.', '.5.']
names:  ['.6.', '.7.', '.8.', '.9.', '.10.', '.11.']
names:  ['.12.', '.13.', '.14.', '.15.', '.16.', '.17.', '.18.', '.19.', '.20.', '.21.', '.22.']
names:  ['.23.']
warmup start:  7810




[0][19/1562] Elapsed: 0m 20s (remain 26m 31s) Loss: 2.5822(2.5861) Grad: 421879.21875 LR: 0.00000077 
[0][39/1562] Elapsed: 0m 41s (remain 26m 11s) Loss: 2.2719(2.5632) Grad: 412118.90625 LR: 0.00000154 
[0][59/1562] Elapsed: 1m 1s (remain 25m 45s) Loss: 2.1032(2.5009) Grad: 405502.125 LR: 0.00000230 
[0][79/1562] Elapsed: 1m 22s (remain 25m 21s) Loss: 2.8923(2.5036) Grad: 424057.34375 LR: 0.00000307 
[0][99/1562] Elapsed: 1m 42s (remain 24m 59s) Loss: 3.0062(2.4763) Grad: 408744.0 LR: 0.00000384 
[0][119/1562] Elapsed: 2m 3s (remain 24m 38s) Loss: 2.8028(2.4590) Grad: 422890.65625 LR: 0.00000461 
[0][139/1562] Elapsed: 2m 23s (remain 24m 17s) Loss: 2.1468(2.4378) Grad: 429266.90625 LR: 0.00000538 
[0][159/1562] Elapsed: 2m 43s (remain 23m 56s) Loss: 2.0887(2.3841) Grad: 422323.53125 LR: 0.00000615 
[0][179/1562] Elapsed: 3m 4s (remain 23m 35s) Loss: 2.4856(2.3521) Grad: 425937.46875 LR: 0.00000691 
[0][199/1562] Elapsed: 3m 24s (remain 23m 14s) Loss: 1.8583(2.3019) Grad: 430980.5 LR: 



[1][19/1562] Elapsed: 0m 20s (remain 26m 34s) Loss: 0.0825(0.1125) Grad: 135373.21875 LR: 0.00006077 
[1][39/1562] Elapsed: 0m 41s (remain 26m 12s) Loss: 0.2375(0.1240) Grad: 586580.125 LR: 0.00006154 
[1][59/1562] Elapsed: 1m 1s (remain 25m 47s) Loss: 0.0353(0.1282) Grad: 100453.2421875 LR: 0.00006230 
[1][79/1562] Elapsed: 1m 22s (remain 25m 23s) Loss: 0.0683(0.1283) Grad: 100715.7421875 LR: 0.00006307 
[1][99/1562] Elapsed: 1m 42s (remain 25m 1s) Loss: 0.0503(0.1236) Grad: 144491.828125 LR: 0.00006384 
[1][119/1562] Elapsed: 2m 3s (remain 24m 40s) Loss: 0.2626(0.1250) Grad: 615623.5 LR: 0.00006461 
[1][139/1562] Elapsed: 2m 23s (remain 24m 18s) Loss: 0.0913(0.1230) Grad: 137272.453125 LR: 0.00006538 
[1][159/1562] Elapsed: 2m 44s (remain 23m 58s) Loss: 0.1474(0.1235) Grad: 452404.875 LR: 0.00006615 
[1][179/1562] Elapsed: 3m 4s (remain 23m 37s) Loss: 0.1129(0.1228) Grad: 327447.09375 LR: 0.00006691 
[1][199/1562] Elapsed: 3m 25s (remain 23m 16s) Loss: 0.1275(0.1209) Grad: 328846.75 



[2][19/1562] Elapsed: 0m 20s (remain 26m 38s) Loss: 0.0887(0.1197) Grad: 344986.3125 LR: 0.00012077 
[2][39/1562] Elapsed: 0m 41s (remain 26m 15s) Loss: 0.1320(0.1113) Grad: 193154.1875 LR: 0.00012154 
[2][59/1562] Elapsed: 1m 1s (remain 25m 50s) Loss: 0.1323(0.1100) Grad: 396117.90625 LR: 0.00012230 
[2][79/1562] Elapsed: 1m 22s (remain 25m 25s) Loss: 0.1283(0.1066) Grad: 199982.796875 LR: 0.00012307 
[2][99/1562] Elapsed: 1m 42s (remain 25m 3s) Loss: 0.0469(0.1060) Grad: 189191.84375 LR: 0.00012384 
[2][119/1562] Elapsed: 2m 3s (remain 24m 42s) Loss: 0.1210(0.1078) Grad: 365705.125 LR: 0.00012461 
[2][139/1562] Elapsed: 2m 23s (remain 24m 20s) Loss: 0.1263(0.1116) Grad: 393103.03125 LR: 0.00012538 
[2][159/1562] Elapsed: 2m 44s (remain 23m 59s) Loss: 0.0402(0.1116) Grad: 120760.046875 LR: 0.00012615 
[2][179/1562] Elapsed: 3m 4s (remain 23m 38s) Loss: 0.0664(0.1105) Grad: 140190.9375 LR: 0.00012691 
[2][199/1562] Elapsed: 3m 25s (remain 23m 17s) Loss: 0.0378(0.1108) Grad: 101865.5625



[3][19/1562] Elapsed: 0m 20s (remain 26m 35s) Loss: 0.1054(0.0979) Grad: 231275.40625 LR: 0.00018077 
[3][39/1562] Elapsed: 0m 41s (remain 26m 11s) Loss: 0.0851(0.1128) Grad: 207987.71875 LR: 0.00018154 
[3][59/1562] Elapsed: 1m 1s (remain 25m 47s) Loss: 0.1515(0.1128) Grad: 305321.25 LR: 0.00018230 
[3][79/1562] Elapsed: 1m 22s (remain 25m 22s) Loss: 0.0876(0.1050) Grad: 319365.0625 LR: 0.00018307 
[3][99/1562] Elapsed: 1m 42s (remain 25m 1s) Loss: 0.0912(0.1078) Grad: 185644.890625 LR: 0.00018384 
[3][119/1562] Elapsed: 2m 3s (remain 24m 40s) Loss: 0.0938(0.1060) Grad: 261269.9375 LR: 0.00018461 
[3][139/1562] Elapsed: 2m 23s (remain 24m 19s) Loss: 0.0640(0.1049) Grad: 209501.859375 LR: 0.00018538 
[3][159/1562] Elapsed: 2m 44s (remain 23m 57s) Loss: 0.0826(0.1041) Grad: 243573.140625 LR: 0.00018615 
[3][179/1562] Elapsed: 3m 4s (remain 23m 36s) Loss: 0.0378(0.1044) Grad: 117632.9921875 LR: 0.00018691 
[3][199/1562] Elapsed: 3m 25s (remain 23m 16s) Loss: 0.1053(0.1059) Grad: 210351.7



[4][19/1562] Elapsed: 0m 20s (remain 26m 27s) Loss: 0.1178(0.0959) Grad: 324035.75 LR: 0.00024077 
[4][39/1562] Elapsed: 0m 41s (remain 26m 2s) Loss: 0.1291(0.1048) Grad: 177356.75 LR: 0.00024154 
[4][59/1562] Elapsed: 1m 1s (remain 25m 41s) Loss: 0.0852(0.1131) Grad: 156685.03125 LR: 0.00024230 
[4][79/1562] Elapsed: 1m 22s (remain 25m 19s) Loss: 0.0959(0.1099) Grad: 162044.1875 LR: 0.00024307 
[4][99/1562] Elapsed: 1m 42s (remain 24m 59s) Loss: 0.1848(0.1140) Grad: 386182.0 LR: 0.00024384 
[4][119/1562] Elapsed: 2m 3s (remain 24m 38s) Loss: 0.1241(0.1192) Grad: 198847.515625 LR: 0.00024461 
[4][139/1562] Elapsed: 2m 23s (remain 24m 17s) Loss: 0.0856(0.1161) Grad: 164805.3125 LR: 0.00024538 
[4][159/1562] Elapsed: 2m 43s (remain 23m 56s) Loss: 0.1367(0.1152) Grad: 344367.8125 LR: 0.00024615 
[4][179/1562] Elapsed: 3m 4s (remain 23m 35s) Loss: 0.3160(0.1155) Grad: 707532.8125 LR: 0.00024691 
[4][199/1562] Elapsed: 3m 24s (remain 23m 15s) Loss: 0.0820(0.1154) Grad: 136909.703125 LR: 0.0

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


names:  ['.embeddings.']
names:  ['encoder.LayerNorm', 'rel_embeddings']
names:  ['.0.', '.1.', '.2.', '.3.', '.4.', '.5.']
names:  ['.6.', '.7.', '.8.', '.9.', '.10.', '.11.']
names:  ['.12.', '.13.', '.14.', '.15.', '.16.', '.17.', '.18.', '.19.', '.20.', '.21.', '.22.']
names:  ['.23.']
warmup start:  7815




[0][19/1563] Elapsed: 0m 20s (remain 26m 39s) Loss: 2.3902(2.6684) Grad: 417293.1875 LR: 0.00000077 
[0][39/1563] Elapsed: 0m 41s (remain 26m 18s) Loss: 2.2176(2.6371) Grad: 425452.375 LR: 0.00000154 
[0][59/1563] Elapsed: 1m 1s (remain 25m 52s) Loss: 2.3879(2.5949) Grad: 420452.40625 LR: 0.00000230 
[0][79/1563] Elapsed: 1m 22s (remain 25m 26s) Loss: 2.4751(2.5634) Grad: 415011.21875 LR: 0.00000307 
[0][99/1563] Elapsed: 1m 42s (remain 25m 4s) Loss: 2.8357(2.5895) Grad: 420136.0625 LR: 0.00000384 
[0][119/1563] Elapsed: 2m 3s (remain 24m 42s) Loss: 1.8722(2.5649) Grad: 420157.8125 LR: 0.00000461 
[0][139/1563] Elapsed: 2m 23s (remain 24m 21s) Loss: 2.1325(2.5393) Grad: 419238.34375 LR: 0.00000537 
[0][159/1563] Elapsed: 2m 44s (remain 24m 0s) Loss: 2.2890(2.5024) Grad: 426907.03125 LR: 0.00000614 
[0][179/1563] Elapsed: 3m 4s (remain 23m 38s) Loss: 3.2267(2.4874) Grad: 442663.75 LR: 0.00000691 
[0][199/1563] Elapsed: 3m 25s (remain 23m 18s) Loss: 2.0252(2.4412) Grad: 422285.21875 LR: 



[1][19/1563] Elapsed: 0m 20s (remain 26m 40s) Loss: 0.1015(0.1312) Grad: 304177.4375 LR: 0.00006077 
[1][39/1563] Elapsed: 0m 41s (remain 26m 15s) Loss: 0.1321(0.1303) Grad: 282355.78125 LR: 0.00006154 
[1][59/1563] Elapsed: 1m 1s (remain 25m 51s) Loss: 0.1165(0.1239) Grad: 165624.734375 LR: 0.00006230 
[1][79/1563] Elapsed: 1m 22s (remain 25m 26s) Loss: 0.2344(0.1251) Grad: 500392.84375 LR: 0.00006307 
[1][99/1563] Elapsed: 1m 42s (remain 25m 4s) Loss: 0.1202(0.1224) Grad: 173147.0 LR: 0.00006384 
[1][119/1563] Elapsed: 2m 3s (remain 24m 43s) Loss: 0.0626(0.1236) Grad: 210051.03125 LR: 0.00006461 
[1][139/1563] Elapsed: 2m 23s (remain 24m 21s) Loss: 0.1253(0.1234) Grad: 289981.625 LR: 0.00006537 
[1][159/1563] Elapsed: 2m 44s (remain 24m 0s) Loss: 0.1231(0.1246) Grad: 331749.71875 LR: 0.00006614 
[1][179/1563] Elapsed: 3m 4s (remain 23m 39s) Loss: 0.1693(0.1241) Grad: 500883.0 LR: 0.00006691 
[1][199/1563] Elapsed: 3m 25s (remain 23m 18s) Loss: 0.1509(0.1227) Grad: 446165.0625 LR: 0.0



[2][19/1563] Elapsed: 0m 20s (remain 26m 33s) Loss: 0.0805(0.1308) Grad: 244140.953125 LR: 0.00012077 
[2][39/1563] Elapsed: 0m 41s (remain 26m 5s) Loss: 0.1591(0.1211) Grad: 306919.90625 LR: 0.00012154 
[2][59/1563] Elapsed: 1m 1s (remain 25m 42s) Loss: 0.2723(0.1227) Grad: 621501.125 LR: 0.00012230 
[2][79/1563] Elapsed: 1m 22s (remain 25m 20s) Loss: 0.0687(0.1246) Grad: 137161.484375 LR: 0.00012307 
[2][99/1563] Elapsed: 1m 42s (remain 24m 59s) Loss: 0.0655(0.1241) Grad: 191670.859375 LR: 0.00012384 
[2][119/1563] Elapsed: 2m 2s (remain 24m 38s) Loss: 0.1266(0.1235) Grad: 352398.90625 LR: 0.00012461 
[2][139/1563] Elapsed: 2m 23s (remain 24m 17s) Loss: 0.0931(0.1215) Grad: 329270.84375 LR: 0.00012537 
[2][159/1563] Elapsed: 2m 43s (remain 23m 56s) Loss: 0.1328(0.1218) Grad: 432949.125 LR: 0.00012614 
[2][179/1563] Elapsed: 3m 4s (remain 23m 35s) Loss: 0.1233(0.1200) Grad: 204926.59375 LR: 0.00012691 
[2][199/1563] Elapsed: 3m 24s (remain 23m 15s) Loss: 0.0635(0.1182) Grad: 184977.57