In [1]:
from __future__ import annotations

import warnings
import logging
import colorlog
from multiprocessing import cpu_count

from pathlib import Path
from functools import partial

import pandas as pd
import numpy as np

import hydra
from omegaconf import DictConfig, OmegaConf

from torch.utils.data import DataLoader

from fastcore.xtras import Path  # for ls

import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.data.data_collator import default_data_collator

from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torchmetrics import PearsonCorrCoef, MeanSquaredError
from composer.models.huggingface import HuggingFaceModel
from composer.loggers import WandBLogger
from composer import Trainer

In [2]:
warnings.filterwarnings("ignore")

In [3]:
param = {
    'apex': True,
    'awp_eps': 1e-2,
    'awp_lr': 1e-4,
    'batch_size': 32, # 2
    'batch_scheduler': False,
    'betas': (0.9, 0.999),
    'ckpt_name': 'deberta_v3_small',
    'debug': True, # False
    'decoder_lr': 1e-5,
    'encoder_lr': 1e-5,
    'eps': 1e-6,
    'fc_dropout':0.2,
    'max_grad_norm': 1000,
    'max_len': 400, # 512
    'min_lr': 1e-7,
    'model_name': 'microsoft/deberta-v3-small',
    'n_cycles': 0.5,
    'n_epochs': 4, # 12
    'n_eval_steps': 100,
    'n_folds': 2, # 4
    'n_gradient_accumulation_steps': 1,
    'n_warmup_steps': 0,
    'n_workers': 0,
    'nth_awp_start_epoch': 6, # 4
    'print_freq': 100,
    'scheduler_name': 'cosine',
    'seed': 42,
    'output_dir': 'output',
    'tar_token': '[TAR]',
    'wandb': False,
    'weight_decay': 0.01,
}

In [4]:
path = Path("dataset")
output_path = Path("output")

In [5]:
class Config:
    def __init__(self, d: dict) -> None:
        for k,v in d.items():
            setattr(self, k, v)

cfg = Config(d=param)

In [6]:
if not output_path.exists():
    output_path.mkdir()

In [7]:
train_df = pd.read_csv(path/"train.csv")
cpc_titles_df = pd.read_csv(path/"cpc_titles.csv")

In [8]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [9]:
cpc_titles_df.head()

Unnamed: 0,context,context_text
0,A01,HUMAN NECESSITIES. GRICULTURE; FORESTRY; ANIMA...
1,A21,HUMAN NECESSITIES. BAKING; EDIBLE DOUGHS
2,A22,HUMAN NECESSITIES. BUTCHERING; MEAT TREATMENT;...
3,A23,HUMAN NECESSITIES. FOODS OR FOODSTUFFS; TREATM...
4,A24,HUMAN NECESSITIES. TOBACCO; CIGARS; CIGARETTES...


In [10]:
train_df = train_df.merge(cpc_titles_df, on="context", how="left")

In [11]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


In [12]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
special_tokens_dict = {'additional_special_tokens': [cfg.tar_token]}
tokenizer.add_special_tokens(special_tokens_dict)
tar_token_id = tokenizer(f'[{cfg.tar_token}]', add_special_tokens=False)['input_ids'][0]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
tar_token_id

647

In [14]:
setattr(tokenizer, 'tar_token', f'{cfg.tar_token}')
setattr(tokenizer, 'tar_token_id', tar_token_id)

In [15]:
tokenizer.tar_token_id, tokenizer.tar_token

(647, '[TAR]')

In [16]:
tokenizer.all_special_tokens_extended

['[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]', '[TAR]']

In [17]:
train_df['text'] = train_df['anchor'] + '[SEP]' + train_df['target'] + '[SEP]'  + train_df['context_text']

In [18]:
import torch
from torch.utils.data import Dataset

In [19]:
cfg.max_len = 133
cfg.tokenizer = tokenizer

In [20]:
train_df

Unnamed: 0,id,anchor,target,context,score,context_text,text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]abatement of pollution[SEP]HUMAN...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]act of abating[SEP]HUMAN NECESSI...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]active catalyst[SEP]HUMAN NECESS...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]eliminating process[SEP]HUMAN NE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[SEP]forest region[SEP]HUMAN NECESSIT...
...,...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00,PERFORMING OPERATIONS; TRANSPORTING. DECORATIV...,wood article[SEP]wooden article[SEP]PERFORMING...
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50,PERFORMING OPERATIONS; TRANSPORTING. DECORATIV...,wood article[SEP]wooden box[SEP]PERFORMING OPE...
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50,PERFORMING OPERATIONS; TRANSPORTING. DECORATIV...,wood article[SEP]wooden handle[SEP]PERFORMING ...
36471,756ec035e694722b,wood article,wooden material,B44,0.75,PERFORMING OPERATIONS; TRANSPORTING. DECORATIV...,wood article[SEP]wooden material[SEP]PERFORMIN...


In [21]:
from sklearn.model_selection import train_test_split

def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df, is_valid=False):
        self.cfg = cfg

        if is_valid:
            _, df = train_test_split(df, test_size=0.2, random_state=cfg.seed)
            self.valid_scores = df['score'].explode().to_numpy()
        else:
            df, _ = train_test_split(df, test_size=0.2, random_state=cfg.seed)
        self.texts = df['text'].values
        self.labels = df['score'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

In [22]:
train_dataset = TrainDataset(cfg, train_df)
inputs, label = train_dataset[0]
print(inputs['input_ids'].shape)
print(inputs['attention_mask'].shape)
print(label.shape)

torch.Size([133])
torch.Size([133])
torch.Size([])


In [23]:
val_dataset = TrainDataset(cfg, train_df, is_valid=True)
inputs, label = val_dataset[1]
print(inputs)
print(label)

{'input_ids': tensor([     1,  12462,   4844,      2,  11994,   3036,      2,  97623,   4479,
        109320,    346, 112822,   4479,    260,  59248,    430,   3078,  43799,
             2,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,   

In [24]:
len(train_dataset), len(val_dataset)

(29178, 7295)

In [25]:
# create training dataloader and get one batch
train_loader = DataLoader(train_dataset,
                            batch_size=cfg.batch_size,
                            shuffle=True,
                            num_workers=cfg.n_workers,
                            pin_memory=True,
                            drop_last=True)

inputs, label = next(iter(train_loader))

print(inputs['input_ids'].shape)
print(label.shape)

torch.Size([32, 133])
torch.Size([32])


In [26]:
from torch import Tensor
from torch.nn import Module
import torch.nn as nn
from transformers import AutoModel, AutoConfig

class CustomModel(Module):
    def __init__(self, model_name: str, n_vocabs: int) -> None:
        super().__init__()
        self.cfg = cfg
        self.model_config = AutoConfig.from_pretrained(
            model_name, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(
            model_name, config=self.model_config)
        self.model.resize_token_embeddings(n_vocabs)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.model_config.hidden_size, 1)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.model_config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [27]:
from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
from torch.nn.modules.loss import _Loss

class AWP:
    def __init__(
        self,
        model: Module,
        criterion: _Loss,
        optimizer: Optimizer,
        apex: bool,
        adv_param: str="weight",
        adv_lr: float=1.0,
        adv_eps: float=0.01
    ) -> None:
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.apex = apex
        self.backup = {}
        self.backup_eps = {}

    def attack_backward(self, inputs: dict, label: Tensor) -> Tensor:
        with torch.cuda.amp.autocast(enabled=self.apex):
            self._save()
            self._attack_step()
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss

    def _attack_step(self) -> None:
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(
                            param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self) -> None:
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self) -> None:
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [28]:
# ====================================================
# Helper functions
# ====================================================
import math
import time
from tqdm.notebook import tqdm

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [38]:
from numpy import ndarray
import scipy as sp
from torch.profiler import profile, record_function, ProfilerActivity


def train_fn(train_loader, model, awp, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    tot_loss = 0
    if not epoch < cfg.nth_awp_start_epoch:
        print(f'AWP training with epoch {epoch+1}')
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    
        if cfg.n_gradient_accumulation_steps > 1:
            loss = loss / cfg.n_gradient_accumulation_steps

        scaler.scale(loss).backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        
        # awp stuff
        if cfg.nth_awp_start_epoch <= epoch:
            loss = awp.attack_backward(inputs, labels)
            scaler.scale(loss).backward()
            awp._restore()
        
        losses.update(loss.item(), batch_size)

        tot_loss += loss.item()
        end = time.time()
        if (step + 1) % cfg.n_gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if cfg.batch_scheduler:
                scheduler.step()

        if step % cfg.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                'Elapsed {remain:s} '
                'Loss: {loss.val:.4f}({avg_loss:.4f}) '
                'Grad: {grad_norm:.4f}  '
                'LR: {lr:.8f}  '
                .format(epoch+1, step, len(train_loader), 
                        remain=timeSince(start, float(step+1)/len(train_loader)),
                        loss=losses,
                        avg_loss=tot_loss/(step+1),
                        grad_norm=grad_norm,
                        lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if cfg.n_gradient_accumulation_steps > 1:
            loss = loss / cfg.n_gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % cfg.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

def get_score(y_true: ndarray, y_pred: ndarray) -> float:
    score = sp.stats.pearsonr(y_true, y_pred)[0]
    return score

In [34]:
import wandb
import gc

from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# ====================================================
# train loop
# ====================================================
def train_loop(train_df, awp=False):
    if awp:
        cfg.nth_awp_start_epoch = 1
    train_dataset = TrainDataset(cfg, train_df)
    valid_dataset = TrainDataset(cfg, train_df, is_valid=True)

    train_loader = DataLoader(train_dataset,
                              batch_size=cfg.batch_size,
                              shuffle=True,
                              num_workers=cfg.n_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=cfg.batch_size,
                              shuffle=False,
                              num_workers=cfg.n_workers, pin_memory=True, drop_last=False)
    valid_labels = valid_dataset.valid_scores

    # ====================================================
    #  tokenzier & model & optimizer
    # ====================================================
    model = CustomModel(cfg.model_name, n_vocabs=len(cfg.tokenizer))
    # torch.save(model.model_config, f'{cfg.output_dir}config.pth')
    model.cuda()
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=cfg.encoder_lr, 
                                                decoder_lr=cfg.decoder_lr,
                                                weight_decay=cfg.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler_name == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler_name == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.n_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.n_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_dataset) // cfg.batch_size * cfg.n_epochs)
    scheduler = get_scheduler(cfg, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")

    # ====================================================
    # awp
    # ====================================================
    awp = AWP(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        apex=cfg.apex,
        adv_lr=cfg.awp_lr,
        adv_eps=cfg.awp_eps,
    )
    
    best_score = 0.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for epoch in range(cfg.n_epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, awp, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - Score: {score:.4f}')
        if cfg.wandb:
            wandb.log({f"epoch": epoch+1, 
                       f"avg_train_loss": avg_loss, 
                       f"avg_val_loss": avg_val_loss,
                       f"score": score})
        
        if best_score < score:
            best_score = score
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            # torch.save({'model': model.state_dict(),
            #             'predictions': predictions},
            #             cfg.output_dir+f"{cfg.model.replace('/', '-')}_best.pth")
        
        if not cfg.batch_scheduler:
            scheduler.step()

    # predictions = torch.load(cfg.output_dir+f"{cfg.model.replace('/', '-')}_best.pth", 
    #                          map_location=torch.device('cpu'))['predictions']
    
    torch.cuda.empty_cache()
    gc.collect()


In [31]:
# without awp
train_loop(train_df)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 1s (remain 21m 54s) Loss: 0.8752(0.8752) Grad: 545037.3125  LR: 0.00001000  
Epoch: [1][100/911] Elapsed 0m 33s (remain 4m 32s) Loss: 0.6190(0.6654) Grad: 40512.6719  LR: 0.00001000  
Epoch: [1][200/911] Elapsed 1m 6s (remain 3m 55s) Loss: 0.5877(0.6424) Grad: 54439.4766  LR: 0.00001000  
Epoch: [1][300/911] Elapsed 1m 39s (remain 3m 21s) Loss: 0.6557(0.6283) Grad: 103767.4219  LR: 0.00001000  
Epoch: [1][400/911] Elapsed 2m 12s (remain 2m 48s) Loss: 0.5348(0.6186) Grad: 79035.4141  LR: 0.00001000  
Epoch: [1][500/911] Elapsed 2m 45s (remain 2m 15s) Loss: 0.5374(0.6126) Grad: 123632.1016  LR: 0.00001000  
Epoch: [1][600/911] Elapsed 3m 18s (remain 1m 42s) Loss: 0.5955(0.6078) Grad: 94763.8125  LR: 0.00001000  
Epoch: [1][700/911] Elapsed 3m 51s (remain 1m 9s) Loss: 0.5996(0.6032) Grad: 112590.0781  LR: 0.00001000  
Epoch: [1][800/911] Elapsed 4m 23s (remain 0m 36s) Loss: 0.6070(0.5987) Grad: 191803.1406  LR: 0.00001000  
Epoch: [1][900/911] Elapsed 4m 56s (

In [35]:
# with awp
train_loop(train_df, awp=True)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 0s (remain 4m 38s) Loss: 0.6674(0.6674) Grad: 161547.9062  LR: 0.00001000  
Epoch: [1][100/911] Elapsed 0m 33s (remain 4m 25s) Loss: 0.6023(0.6613) Grad: 208309.6250  LR: 0.00001000  
Epoch: [1][200/911] Elapsed 1m 5s (remain 3m 52s) Loss: 0.6154(0.6494) Grad: 55361.1289  LR: 0.00001000  
Epoch: [1][300/911] Elapsed 1m 38s (remain 3m 20s) Loss: 0.5816(0.6342) Grad: 71358.9062  LR: 0.00001000  
Epoch: [1][400/911] Elapsed 2m 11s (remain 2m 47s) Loss: 0.5448(0.6262) Grad: 181526.3125  LR: 0.00001000  
Epoch: [1][500/911] Elapsed 2m 44s (remain 2m 14s) Loss: 0.5799(0.6174) Grad: 76826.3203  LR: 0.00001000  
Epoch: [1][600/911] Elapsed 3m 17s (remain 1m 41s) Loss: 0.5512(0.6101) Grad: 156022.9219  LR: 0.00001000  
Epoch: [1][700/911] Elapsed 3m 50s (remain 1m 9s) Loss: 0.6321(0.6044) Grad: 88221.1953  LR: 0.00001000  
Epoch: [1][800/911] Elapsed 4m 23s (remain 0m 36s) Loss: 0.5716(0.6004) Grad: 151570.7500  LR: 0.00001000  
Epoch: [1][900/911] Elapsed 4m 56s (r

In [37]:
# awp experimental
train_loop(train_df, awp=True)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 0s (remain 4m 37s) Loss: 0.7636(0.7636) Grad: 509024.6250  LR: 0.00001000  
Epoch: [1][100/911] Elapsed 0m 32s (remain 4m 21s) Loss: 0.6684(0.6632) Grad: 56533.6484  LR: 0.00001000  
Epoch: [1][200/911] Elapsed 1m 4s (remain 3m 49s) Loss: 0.6121(0.6444) Grad: 82577.4766  LR: 0.00001000  
Epoch: [1][300/911] Elapsed 1m 37s (remain 3m 17s) Loss: 0.6381(0.6294) Grad: 98416.9062  LR: 0.00001000  
Epoch: [1][400/911] Elapsed 2m 10s (remain 2m 45s) Loss: 0.6236(0.6204) Grad: 97415.6719  LR: 0.00001000  
Epoch: [1][500/911] Elapsed 2m 42s (remain 2m 13s) Loss: 0.6316(0.6118) Grad: 126235.6016  LR: 0.00001000  
Epoch: [1][600/911] Elapsed 3m 15s (remain 1m 40s) Loss: 0.5452(0.6052) Grad: 106304.9766  LR: 0.00001000  
Epoch: [1][700/911] Elapsed 3m 48s (remain 1m 8s) Loss: 0.5044(0.6008) Grad: 95169.6250  LR: 0.00001000  
Epoch: [1][800/911] Elapsed 4m 21s (remain 0m 35s) Loss: 0.5103(0.5976) Grad: 92704.4688  LR: 0.00001000  
Epoch: [1][900/911] Elapsed 4m 54s (rem