In [1]:
!pip install transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 15.1 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 52.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 94.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers, sentencepiece
Successfully installed huggingface-hub-0.10.1 sentencepiece-0.1.97 tokenizers-0.13.1 transformers-4

In [2]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 16.1 MB/s 
[?25hCollecting importlib-metadata<5.0.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 87.2 MB/s 
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 11.0 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.3-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 9.0 MB/s 
Collecting autopage>=0.4.0
  Downloading autopage-0.5.1-py3-none-any.whl (29 kB)
Collecting cmd2>=1.0.0
  Do

In [3]:
import torch
import torch.nn as nn
import numpy as np
import os
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_cosine_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
import pandas as pd
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import mean_squared_error
import random
import time
from torch.utils import checkpoint
import math
import gc
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
import warnings
import torch.nn.functional as F
import optuna

In [4]:
#######
#### BEST PARAMS FOR THE DEBERTA V3 LARGE MODEL
# {'learning_rate': 1.7027701828938127e-05,
#  'layer_wise_learning_rate_decay': 0.8780935762069765,
#  'learning_rate_schduler': 'polynomial',
#  'reinit_layers': 1}
#######


#####
#### BEST PARAMS FOR THE DEBERTA V3 BASE MODEL
# {'learning_rate': 0.0002634969863920811, 
#  'layer_wise_learning_rate_decay': 0.7867664854455205, 
#  'learning_rate_schduler': 'polynomial', 
#  'reinit_layers': 3}
#####

In [5]:
# The package itself is imported (in addition to the names pulled in with
# `from transformers import ...` earlier) to reach its logging controls.
import transformers
# Only error-level messages from the transformers library are shown from here on.
transformers.logging.set_verbosity_error()

In [6]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.simplefilter('ignore')

In [7]:
# Colab-only: mount Google Drive at /content/drive. CFG.train_file points at a
# path below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Covers every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [9]:
class CFG:
    """Run configuration, kept as plain class attributes.

    Building the class downloads/loads the tokenizer for ``model_name``.
    """

    # ---- data ----
    train_file = "/content/drive/MyDrive/Kaggle/FeedbackPrize3/train_folds.csv"
    target_columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    fold = 0

    # ---- backbone and tokenisation ----
    model_name = "microsoft/deberta-v3-large"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    specific_max_len = 512  # truncation length used by Dataset; None disables truncation
    hidden_dropout_prob = 0.0
    gradient_checkpointing_enable = True
    reinit_weights = False
    reinit_layers = 1

    # ---- token-level augmentation (applied by Dataset on training samples) ----
    token_dropout = False
    token_dropout_prob = 0.1

    # ---- optimisation ----
    epochs = 3
    batch_size = 8
    gradient_accumulation_steps = 2
    lr = 2e-5
    llrd = 0.9
    warmup_ratio = 0.0
    max_grad_norm = 10

    # ---- adversarial weight perturbation ----
    use_awp = False
    adv_lr = 0.0002
    adv_eps = 0.001

    # ---- runtime ----
    device = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers = 4
    print_freq = 20

    # ---- checkpoint output ----
    save_dir = "deberta-v3-large"
    save_model_name = "deberta-v3-large"

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [10]:
#Preprocessing Functions

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the unencodable slice as UTF-8 bytes.

    Returns the replacement bytes and the index at which encoding resumes.
    """
    resume_at = error.end
    offending = error.object[error.start:resume_at]
    return offending.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the undecodable byte slice as cp1252.

    Returns the replacement text and the index at which decoding resumes.
    """
    resume_at = error.end
    offending = error.object[error.start:resume_at]
    return offending.decode("cp1252"), resume_at

# Register the two handlers under the names that
# `resolve_encodings_and_normalize` passes as `errors=` to encode()/decode().
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 text and transliterate it with ``unidecode``.

    The text is round-tripped through bytes twice, using the custom codec
    error handlers registered under "replace_decoding_with_cp1252" and
    "replace_encoding_with_utf8" wherever a step cannot be applied cleanly.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

In [11]:
#Utility functions
class AverageMeter(object):
    """Running weighted mean of a stream of values.

    Attributes:
        val: most recent value passed to ``update``.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return elapsed time since ``since`` plus a remaining-time estimate.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far; must be non-zero.

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total time = elapsed / fraction done.
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))
        
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    The RMSE is computed directly with NumPy rather than through
    ``sklearn.metrics.mean_squared_error(..., squared=False)``, whose
    ``squared`` argument is deprecated and absent from newer scikit-learn
    releases.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets).
        y_preds: 2-D array-like with the same shape as ``y_trues``.

    Returns:
        Tuple ``(mcrmse_score, scores)``: the mean of the per-column RMSEs and
        the list of per-column RMSEs, in column order.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    # float64 keeps the reduction accurate even when predictions arrive as float16.
    y_trues = np.asarray(y_trues, dtype=np.float64)
    y_preds = np.asarray(y_preds, dtype=np.float64)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            "MCRMSE expects two 2-D arrays of identical shape, got "
            f"{y_trues.shape} and {y_preds.shape}"
        )

    squared_error = (y_trues - y_preds) ** 2
    scores = np.sqrt(squared_error.mean(axis=0)).tolist()  # one RMSE per target column
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Score predictions with ``MCRMSE``.

    Returns:
        Tuple ``(overall score, per-target scores)`` exactly as produced by
        ``MCRMSE``.
    """
    overall, per_target = MCRMSE(y_trues, y_preds)
    return overall, per_target

In [12]:
class Collate:
    """Batch collator that pads every sample to the longest one in the batch.

    Padding is added on the side given by ``tokenizer.padding_side`` using
    ``tokenizer.pad_token_id`` for ids and 0 for the attention mask.
    ``max_length`` is stored but not consulted when padding.
    """
    def __init__(self, tokenizer, max_length = CFG.specific_max_len):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Turn a list of ``{"ids", "mask", "targets"}`` samples into tensors."""
        longest = max(len(sample["ids"]) for sample in batch)
        pad_on_right = self.tokenizer.padding_side == "right"

        def pad(sequence, fill_value):
            filler = [fill_value] * (longest - len(sequence))
            return sequence + filler if pad_on_right else filler + sequence

        padded_ids = [pad(sample["ids"], self.tokenizer.pad_token_id) for sample in batch]
        padded_mask = [pad(sample["mask"], 0) for sample in batch]
        targets = [sample["targets"] for sample in batch]

        return {
            "ids": torch.tensor(padded_ids, dtype = torch.long),
            "mask": torch.tensor(padded_mask, dtype = torch.long),
            "targets": torch.tensor(targets, dtype = torch.float32),
        }

In [13]:
class Dataset:
    """Tokenises one text per item and pairs it with its target vector.

    Args:
        texts: indexable collection of input strings.
        targets: indexable collection of targets, aligned with ``texts``.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask``.
        is_train: when true (and ``CFG.token_dropout`` is set) random tokens
            are replaced by the tokenizer's mask token.
    """
    def __init__(self, texts, targets, tokenizer, is_train = True):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.is_train = is_train

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{"ids", "mask", "targets"}`` for sample ``idx`` (unpadded lists)."""
        text = self.texts[idx]
        targets = self.targets[idx]

        tokenizer_kwargs = {"add_special_tokens": True}
        if CFG.specific_max_len is not None:
            # Truncate to the configured length; padding is left to the collator.
            tokenizer_kwargs["max_length"] = CFG.specific_max_len
            tokenizer_kwargs["padding"] = False
            tokenizer_kwargs["truncation"] = 'longest_first'
        encoding = self.tokenizer(text, **tokenizer_kwargs)

        if CFG.token_dropout and self.is_train:
            n_tokens = len(encoding["input_ids"])
            # The first and last positions are never candidates for masking.
            candidates = np.arange(1, n_tokens - 1)
            n_to_mask = int(CFG.token_dropout_prob * n_tokens)
            masked_positions = np.random.choice(candidates, size = n_to_mask, replace = False)
            token_ids = np.array(encoding["input_ids"])
            token_ids[masked_positions] = self.tokenizer.mask_token_id
            encoding["input_ids"] = token_ids.tolist()

        return {
            "ids": encoding["input_ids"],
            "mask": encoding["attention_mask"],
            "targets": targets,
        }

In [14]:
class RMSELoss(nn.Module):
    """Element-wise ``sqrt(squared error + eps)`` with an optional reduction.

    Args:
        reduction: ``'mean'`` or ``'sum'`` reduce the element-wise values; any
            other value (including ``'none'``) returns them unreduced.
        eps: constant added under the square root.
    """
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        elementwise = (self.mse(y_pred, y_true) + self.eps).sqrt()
        if self.reduction == 'sum':
            return elementwise.sum()
        if self.reduction == 'mean':
            return elementwise.mean()
        return elementwise

class SmoothRMSELoss(nn.Module):
    """Element-wise ``sqrt(SmoothL1 + eps)`` with an optional reduction.

    Args:
        reduction: ``'mean'`` or ``'sum'`` reduce the element-wise values; any
            other value (including ``'none'``) returns them unreduced.
        eps: constant added under the square root.
    """
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        # Kept under the name ``mse`` although it holds a SmoothL1 criterion.
        self.mse = nn.SmoothL1Loss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        elementwise = (self.mse(y_pred, y_true) + self.eps).sqrt()
        if self.reduction == 'sum':
            return elementwise.sum()
        if self.reduction == 'mean':
            return elementwise.mean()
        return elementwise

class SmoothRMSEComp(nn.Module):
    """Column-wise root of the batch-mean SmoothL1 loss, averaged over columns.

    For inputs of shape (batch, n_targets) the SmoothL1 values are averaged
    over the batch dimension, square-rooted per column and then averaged.

    Args:
        reduction: accepted for signature parity with the sibling losses; the
            result is always the mean over columns.
        eps: constant added under the square root. Without it a column whose
            loss is exactly zero produces NaN gradients, because the
            derivative of ``sqrt`` is infinite at 0.
    """
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.SmoothL1Loss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        per_column = self.mse(y_pred, y_true).mean(dim = 0)
        loss = torch.sqrt(per_column + self.eps).mean(dim = 0)
        return loss


class MeanPooling(nn.Module):
    """Attention-mask-aware mean over the sequence dimension."""
    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` along dim 1 over positions where the mask is set."""
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_count

class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states from ``layer_start`` onwards.

    Args:
        num_hidden_layers: number of encoder layers; one weight is created for
            each of the ``num_hidden_layers + 1 - layer_start`` pooled states.
        layer_start: index of the first hidden state included in the average.
        layer_weights: optional tensor of weights used as given; when omitted
            a learnable all-ones parameter is created.
    """
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, features):
        """Pool a sequence of equally shaped 3-D hidden-state tensors into one."""
        stacked = torch.stack(features)[self.layer_start:, :, :, :]
        # Broadcast one weight per layer across the remaining three dimensions.
        weights = self.layer_weights[..., None, None, None].expand(stacked.size())
        weighted_sum = (weights * stacked).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

In [15]:
#AWP
class AWP:
    """Adversarial weight perturbation helper.

    Temporarily nudges the selected parameters along their current gradient,
    runs a forward/backward pass on the perturbed weights, then restores the
    original weights.

    Args:
        model: module called as ``model(ids=..., mask=..., targets=...)`` and
            returning ``(loss, logits)``.
        optimizer: optimizer whose gradients are cleared before the
            adversarial backward pass.
        adv_param: substring a parameter name must contain to be perturbed.
        adv_lr: step size of the perturbation; 0 disables the attack.
        adv_eps: maximum relative deviation of each weight from its backup.
        start_epoch: first epoch in which the attack is applied.
        adv_step: number of perturbation steps per call.
        scaler: gradient scaler used for the adversarial backward pass.
    """
    def __init__(
        self,
        model,
        optimizer,
        adv_param="weight",
        adv_lr=1,
        adv_eps=0.2,
        start_epoch=0,
        adv_step=1,
        scaler=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.scaler = scaler
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.adv_step = adv_step
        self.start_epoch = start_epoch
        self.backup = {}
        self.backup_eps = {}

    def attack_backward(self, x, y, attention_mask,epoch):
        """Perturb, back-propagate the adversarial loss, and restore the weights."""
        attack_disabled = (self.adv_lr == 0) or (epoch < self.start_epoch)
        if attack_disabled:
            return None

        self._save()
        for _ in range(self.adv_step):
            self._attack_step()
            with torch.cuda.amp.autocast():
                adv_loss, tr_logits = self.model(ids=x, mask=attention_mask, targets=y)
                adv_loss = adv_loss.mean()
            # Gradients present before this point are discarded here.
            self.optimizer.zero_grad()
            self.scaler.scale(adv_loss).backward()

        self._restore()

    def _attack_step(self):
        """Move each selected weight along its gradient, clipped to the saved bounds."""
        tiny = 1e-6
        for name, param in self.model.named_parameters():
            if not param.requires_grad or param.grad is None or self.adv_param not in name:
                continue
            grad_norm = torch.norm(param.grad)
            weight_norm = torch.norm(param.data.detach())
            if grad_norm == 0 or torch.isnan(grad_norm):
                continue
            # Step proportional to the weight norm, in the gradient direction.
            step = self.adv_lr * param.grad / (grad_norm + tiny) * (weight_norm + tiny)
            param.data.add_(step)
            lower, upper = self.backup_eps[name]
            param.data = torch.min(torch.max(param.data, lower), upper)

    def _save(self):
        """Back up the selected weights and pre-compute their allowed range."""
        for name, param in self.model.named_parameters():
            selected = param.requires_grad and param.grad is not None and self.adv_param in name
            if not selected or name in self.backup:
                continue
            original = param.data.clone()
            radius = self.adv_eps * param.abs().detach()
            self.backup[name] = original
            self.backup_eps[name] = (original - radius, original + radius)

    def _restore(self,):
        """Put the backed-up weights back and forget the backups."""
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup, self.backup_eps = {}, {}

In [16]:
class Model(nn.Module):
    """Pretrained transformer encoder with a 6-output linear regression head.

    The head reads the last-layer hidden state of the first token; the
    training loss is SmoothL1 against ``targets``.

    Args:
        model_name: identifier passed to ``AutoConfig`` / ``AutoModel``.
        params: optional dict of tuned hyper-parameters. ``"reinit_layers"``
            is read here when ``CFG.reinit_weights`` is set; ``"lr"``,
            ``"llrd"`` and ``"scheduler"`` are read by
            ``get_grouped_llrd_optimizer_scheduler``.
    """

    def __init__(self, model_name, params = None):
        super(Model, self).__init__()

        self.model_name = model_name

        hidden_dropout_prob: float = CFG.hidden_dropout_prob
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(model_name)

        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                # Attention dropout is set to the same value as hidden dropout.
                "attention_probs_dropout_prob" : hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": 6,
            }
        )

        self.config = config
        self.params = params

        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        if CFG.gradient_checkpointing_enable:
            self.transformer.gradient_checkpointing_enable()

        self.output = nn.Linear(config.hidden_size, 6)
        self.loss = nn.SmoothL1Loss(reduction = "mean")

        if CFG.reinit_weights:
            # Fall back to the configured default when no tuned params are given,
            # instead of failing on ``None["reinit_layers"]``.
            reinit_layers = CFG.reinit_layers if params is None else params["reinit_layers"]
            self.init_weights_(reinit_layers)

    def init_weights_(self, reinit_layers):
        """Re-initialise the last ``reinit_layers`` encoder layers in place.

        Linear and embedding weights are drawn from
        ``N(0, config.initializer_range)``, biases are zeroed and LayerNorm is
        reset to weight 1 / bias 0. A count of zero (or less) is a no-op.
        """
        if reinit_layers is None or reinit_layers <= 0:
            # ``layer[-0:]`` is the whole list, so zero must be handled
            # explicitly or every encoder layer would be reset.
            return
        std = self.config.initializer_range
        for layer in self.transformer.encoder.layer[-reinit_layers:]:
            for module in layer.modules():
                if isinstance(module, nn.Linear):
                    module.weight.data.normal_(mean=0.0, std=std)
                    if module.bias is not None:
                        module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                    module.weight.data.normal_(mean=0.0, std=std)
                    if module.padding_idx is not None:
                        module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)

    @staticmethod
    def _decay_groups(named_params, lr=None):
        """Split parameters into a weight-decayed and a non-decayed optimizer group."""
        named_params = list(named_params)
        no_decay = ["bias", "LayerNorm.weight"]

        def skips_decay(name):
            return any(nd in name for nd in no_decay)

        groups = [
            {
                "params": [p for n, p in named_params if not skips_decay(n)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in named_params if skips_decay(n)],
                "weight_decay": 0.0,
            },
        ]
        if lr is not None:
            for group in groups:
                group["lr"] = lr
        return groups

    @staticmethod
    def _build_scheduler(opt, num_train_steps, name):
        """Create the warmup scheduler called ``name``; unknown names give cosine."""
        factories = {
            "polynomial": get_polynomial_decay_schedule_with_warmup,
            "linear": get_linear_schedule_with_warmup,
        }
        factory = factories.get(name, get_cosine_schedule_with_warmup)
        return factory(
            opt,
            num_warmup_steps=int(num_train_steps * CFG.warmup_ratio),
            num_training_steps=num_train_steps,
            last_epoch=-1,
        )

    def get_grouped_llrd_optimizer_scheduler(self, num_train_steps, params):
        """Build AdamW with layer-wise learning-rate decay plus its scheduler.

        The head and the encoder-level parameters use ``params["lr"]``. Going
        from the last encoder layer down to the embeddings, each layer's rate
        is the previous one multiplied by ``params["llrd"]``.

        Args:
            num_train_steps: total number of scheduler steps.
            params: dict with keys ``"lr"``, ``"llrd"`` and ``"scheduler"``
                (``"polynomial"``, ``"linear"``; anything else selects cosine).

        Returns:
            Tuple ``(optimizer, scheduler)``.
        """
        base_lr = params["lr"]

        # Task-specific head.
        optimizer_grouped_parameters = self._decay_groups(self.output.named_parameters(), base_lr)

        # Encoder-level parameters that do not belong to any single layer.
        encoder_level_names = [
            "transformer.encoder.rel_embeddings.weight",
            "transformer.encoder.LayerNorm.weight",
            "transformer.encoder.LayerNorm.bias",
        ]
        bottom_params = [(n, p) for n, p in self.named_parameters() if n in encoder_level_names]
        optimizer_grouped_parameters += self._decay_groups(bottom_params, base_lr)

        # Walk from the top encoder layer down to the embeddings, decaying as we go.
        layers = [self.transformer.embeddings] + list(self.transformer.encoder.layer)
        lr = base_lr
        for layer in reversed(layers):
            lr *= params["llrd"]
            optimizer_grouped_parameters += self._decay_groups(layer.named_parameters(), lr)

        opt = torch.optim.AdamW(optimizer_grouped_parameters)
        sch = self._build_scheduler(opt, num_train_steps, params["scheduler"])
        return opt, sch

    def get_optimizer_scheduler(self, num_train_steps):
        """Build AdamW with a single rate (``CFG.lr``) and a polynomial-decay scheduler."""
        optimizer_parameters = self._decay_groups(self.named_parameters())
        opt = torch.optim.AdamW(optimizer_parameters, lr=CFG.lr)
        sch = self._build_scheduler(opt, num_train_steps, "polynomial")
        return opt, sch

    def forward(self, ids, mask, token_type_ids=None, targets=None):
        """Run the encoder and the head.

        Args:
            ids: token ids passed to the encoder.
            mask: attention mask passed to the encoder.
            token_type_ids: optional segment ids, forwarded when given.
            targets: optional regression targets.

        Returns:
            Tuple ``(loss, logits)``. ``loss`` is 0 when ``targets`` is None,
            which allows the model to be called for inference.
        """
        if token_type_ids is not None:
            transformer_out = self.transformer( ids, mask, token_type_ids )
        else:
            transformer_out = self.transformer(ids, mask)
        # Hidden state of the first token of every sequence.
        sequence_output = transformer_out.last_hidden_state[:,0,:]

        logits = self.output(sequence_output)
        loss = self.loss(logits, targets) if targets is not None else 0

        return loss, logits

In [17]:
class Model2(nn.Module):
    """Pretrained transformer encoder with a mean+max pooled, multi-dropout head.

    The last hidden states are mean- and max-pooled over the non-padded
    positions, concatenated, and passed through one shared linear layer under
    three different dropout rates; the three outputs (and losses) are averaged.

    Args:
        model_name: identifier passed to ``AutoConfig`` / ``AutoModel``.
        params: optional dict of tuned hyper-parameters. When given, its
            ``"reinit_layers"`` entry controls how many top encoder layers are
            re-initialised; ``"lr"``, ``"llrd"`` and ``"scheduler"`` are read by
            ``get_grouped_llrd_optimizer_scheduler``.
    """

    def __init__(self, model_name, params = None):
        super(Model2, self).__init__()

        self.model_name = model_name

        hidden_dropout_prob: float = CFG.hidden_dropout_prob
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(model_name)

        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                # Attention dropout is set to the same value as hidden dropout.
                "attention_probs_dropout_prob" : hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": 6,
            }
        )

        self.config = config
        self.params = params

        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        if CFG.gradient_checkpointing_enable:
            self.transformer.gradient_checkpointing_enable()

        # The head consumes the concatenation of mean- and max-pooled states.
        self.output = nn.Linear(config.hidden_size * 2, 6)
        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.2)
        self.drop3 = nn.Dropout(0.3)
        self.loss = nn.SmoothL1Loss(reduction = "mean")

        if self.params is not None:
            self.init_weights_(params["reinit_layers"])

    def init_weights_(self, reinit_layers):
        """Re-initialise the last ``reinit_layers`` encoder layers in place.

        Linear and embedding weights are drawn from
        ``N(0, config.initializer_range)``, biases are zeroed and LayerNorm is
        reset to weight 1 / bias 0. A count of zero (or less) is a no-op.
        """
        if reinit_layers is None or reinit_layers <= 0:
            # ``layer[-0:]`` is the whole list, so zero must be handled
            # explicitly or every encoder layer would be reset.
            return
        std = self.config.initializer_range
        for layer in self.transformer.encoder.layer[-reinit_layers:]:
            for module in layer.modules():
                if isinstance(module, nn.Linear):
                    module.weight.data.normal_(mean=0.0, std=std)
                    if module.bias is not None:
                        module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                    module.weight.data.normal_(mean=0.0, std=std)
                    if module.padding_idx is not None:
                        module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)

    @staticmethod
    def _decay_groups(named_params, lr=None):
        """Split parameters into a weight-decayed and a non-decayed optimizer group."""
        named_params = list(named_params)
        no_decay = ["bias", "LayerNorm.weight"]

        def skips_decay(name):
            return any(nd in name for nd in no_decay)

        groups = [
            {
                "params": [p for n, p in named_params if not skips_decay(n)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in named_params if skips_decay(n)],
                "weight_decay": 0.0,
            },
        ]
        if lr is not None:
            for group in groups:
                group["lr"] = lr
        return groups

    @staticmethod
    def _build_scheduler(opt, num_train_steps, name):
        """Create the warmup scheduler called ``name``; unknown names give cosine."""
        factories = {
            "polynomial": get_polynomial_decay_schedule_with_warmup,
            "linear": get_linear_schedule_with_warmup,
        }
        factory = factories.get(name, get_cosine_schedule_with_warmup)
        return factory(
            opt,
            num_warmup_steps=int(num_train_steps * CFG.warmup_ratio),
            num_training_steps=num_train_steps,
            last_epoch=-1,
        )

    def get_grouped_llrd_optimizer_scheduler(self, num_train_steps, params):
        """Build AdamW with layer-wise learning-rate decay plus its scheduler.

        The head and the encoder-level parameters use ``params["lr"]``. Going
        from the last encoder layer down to the embeddings, each layer's rate
        is the previous one multiplied by ``params["llrd"]``.

        Args:
            num_train_steps: total number of scheduler steps.
            params: dict with keys ``"lr"``, ``"llrd"`` and ``"scheduler"``
                (``"polynomial"``, ``"linear"``; anything else selects cosine).

        Returns:
            Tuple ``(optimizer, scheduler)``.
        """
        base_lr = params["lr"]

        # Task-specific head.
        optimizer_grouped_parameters = self._decay_groups(self.output.named_parameters(), base_lr)

        # Encoder-level parameters that do not belong to any single layer.
        encoder_level_names = [
            "transformer.encoder.rel_embeddings.weight",
            "transformer.encoder.LayerNorm.weight",
            "transformer.encoder.LayerNorm.bias",
        ]
        bottom_params = [(n, p) for n, p in self.named_parameters() if n in encoder_level_names]
        optimizer_grouped_parameters += self._decay_groups(bottom_params, base_lr)

        # Walk from the top encoder layer down to the embeddings, decaying as we go.
        layers = [self.transformer.embeddings] + list(self.transformer.encoder.layer)
        lr = base_lr
        for layer in reversed(layers):
            lr *= params["llrd"]
            optimizer_grouped_parameters += self._decay_groups(layer.named_parameters(), lr)

        opt = torch.optim.AdamW(optimizer_grouped_parameters)
        sch = self._build_scheduler(opt, num_train_steps, params["scheduler"])
        return opt, sch

    def get_optimizer_scheduler(self, num_train_steps):
        """Build AdamW with a single rate (``CFG.lr``) and a polynomial-decay scheduler."""
        optimizer_parameters = self._decay_groups(self.named_parameters())
        opt = torch.optim.AdamW(optimizer_parameters, lr=CFG.lr)
        sch = self._build_scheduler(opt, num_train_steps, "polynomial")
        return opt, sch

    def forward(self, ids, mask, token_type_ids=None, targets=None):
        """Run the encoder and the multi-dropout head.

        Args:
            ids: token ids passed to the encoder.
            mask: attention mask passed to the encoder; positions where it is
                0 are excluded from both poolings.
            token_type_ids: optional segment ids, forwarded when given.
            targets: optional regression targets.

        Returns:
            Tuple ``(loss, logits)``: the mean of the three SmoothL1 losses
            (0 when ``targets`` is None) and the mean of the three outputs.
        """
        if token_type_ids is not None:
            transformer_out = self.transformer( ids, mask, token_type_ids )
        else:
            transformer_out = self.transformer(ids, mask)

        last_hidden_state = transformer_out.last_hidden_state

        # Pool over real tokens only, so the result does not depend on how much
        # padding the rest of the batch forced onto this sequence.
        token_mask = mask.unsqueeze(-1).to(last_hidden_state.dtype)
        token_count = token_mask.sum(1).clamp(min=1.0)
        mean_pool_output = (last_hidden_state * token_mask).sum(1) / token_count
        lowest = torch.finfo(last_hidden_state.dtype).min
        max_pool_output = last_hidden_state.masked_fill(token_mask == 0, lowest).max(1)[0]
        sequence_output = torch.cat((mean_pool_output, max_pool_output), 1)

        # Same linear head evaluated under three dropout rates.
        logits1 = self.output(self.drop1(sequence_output))
        logits2 = self.output(self.drop2(sequence_output))
        logits3 = self.output(self.drop3(sequence_output))

        loss = 0

        if targets is not None:
            loss += self.loss(logits1, targets)
            loss += self.loss(logits2, targets)
            loss += self.loss(logits3, targets)

            loss /= 3

        logits = (logits1 + logits2 + logits3) / 3

        return loss, logits

In [18]:
def train(epoch, model, train_loader, valid_loader, optimizer, scheduler, device, awp, scaler, best_loss, fold):
    """Train for one epoch with AMP and gradient accumulation, then validate.

    After the last batch the model is validated with ``valid``; when the
    validation score improves on ``best_loss`` the state dict is written to
    ``<CFG.save_dir>/<CFG.save_model_name>_fold_<fold>.pth`` (or to the
    working directory when ``CFG.save_dir`` is None).

    Args:
        epoch: zero-based epoch index (printed as ``epoch + 1``).
        model: module called as ``model(**batch)`` returning ``(loss, logits)``.
        train_loader: iterable of batches, each a dict of tensors that includes
            a ``"targets"`` entry.
        valid_loader: loader handed to ``valid``.
        optimizer: optimizer stepped every ``CFG.gradient_accumulation_steps``
            batches.
        scheduler: learning-rate scheduler stepped with the optimizer, or None.
        device: device the batch tensors are moved to.
        awp: adversarial-weight-perturbation helper, used when ``CFG.use_awp``
            is set; may be None otherwise.
        scaler: gradient scaler used for mixed-precision training.
        best_loss: best validation score seen so far.
        fold: fold identifier, used in the checkpoint file name.

    Returns:
        The (possibly improved) best validation score.
    """
    model.train()
    losses = AverageMeter()
    start = time.time()
    num_batches = len(train_loader)
    # Validate every ``val_steps`` batches; ``// 1`` means once per epoch.
    val_steps = num_batches // 1
    # Defined up front so the progress line can be printed even before the
    # first optimizer step has produced a gradient norm.
    grad_norm = float("nan")
    # NOTE(review): if num_batches is not a multiple of the accumulation steps,
    # the gradients of the trailing batches are not stepped within this call.
    for step, x in enumerate(train_loader):
        for k,v in x.items():
            x[k] = v.to(device)

        with autocast():
            loss, logits = model(**x)

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps

        # Weight the running mean by the real batch size; the last batch of an
        # epoch can be smaller than CFG.batch_size.
        losses.update(loss.item() * CFG.gradient_accumulation_steps, x["targets"].size(0))
        scaler.scale(loss).backward()
        if CFG.use_awp and awp is not None:
            awp.attack_backward(x["ids"],x["targets"],x["mask"],epoch)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Unscale first so clipping operates on the true gradient values.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        if ((step + 1) % CFG.print_freq == 0) or (step == (num_batches-1)):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step + 1, num_batches,
                          remain=timeSince(start, float(step+1)/num_batches),
                          loss=losses,
                          grad_norm= grad_norm,
                          # Read from the optimizer: works without a scheduler and
                          # avoids the deprecated scheduler.get_lr().
                          lr=optimizer.param_groups[0]["lr"]))

        if ((step + 1) % val_steps == 0) or ((step + 1) == num_batches):
            print("\nVALID LOOP\n")
            valid_loss = valid(epoch, model, valid_loader, device)
            print(f"\nThe valid loss for the current epoch is {valid_loss}\n")
            torch.cuda.empty_cache()
            gc.collect()
            if valid_loss < best_loss:
                best_loss = valid_loss
                if CFG.save_dir is not None:
                    os.makedirs(CFG.save_dir, exist_ok=True)
                    save_path = os.path.join(CFG.save_dir, f"{CFG.save_model_name}_fold_{fold}.pth")
                else:
                    save_path = f"{CFG.save_model_name}_fold_{fold}.pth"
                torch.save(model.state_dict(), save_path)
            # valid() switched the model to eval mode; switch back before continuing.
            model.train()
    return best_loss

def valid(epoch, model, valid_loader, device):
    """Evaluate ``model`` on ``valid_loader`` and return its MCRMSE score.

    Predictions are clipped to the range [1.0, 5.0] before scoring.

    Args:
        epoch: zero-based epoch index (printed as ``epoch + 1``).
        model: module called as ``model(**batch)`` returning ``(loss, logits)``.
        valid_loader: iterable of batches, each a dict of tensors that includes
            a ``"targets"`` entry.
        device: device the batch tensors are moved to.

    Returns:
        The first element of ``get_score(targets, clipped predictions)``.
    """
    model.eval()
    all_targets = []
    all_outputs = []
    losses = AverageMeter()
    num_batches = len(valid_loader)
    with torch.no_grad():
        for step, x in enumerate(valid_loader):

            for k, v in x.items():
                x[k] = v.to(device)
            with autocast():
                loss, logits = model(**x)

            # Weight by the real batch size; the last batch can be smaller than
            # CFG.batch_size.
            losses.update(loss.item(), x["targets"].size(0))
            targets = x["targets"].cpu().numpy()
            # Cast to float32 first: autocast may yield a reduced-precision
            # dtype that NumPy cannot represent.
            outputs = logits.float().cpu().numpy()

            all_targets.append(targets)
            all_outputs.append(np.clip(outputs, 1.0, 5.0))

            if ((step + 1) % CFG.print_freq == 0) or (step == (num_batches-1)):
                print('Epoch: [{0}][{1}/{2}] '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      .format(epoch+1, step + 1, num_batches, loss=losses))

    score = get_score(np.vstack(all_targets), np.vstack(all_outputs))[0]
    return score

In [19]:
def main(fold, params):
    """Train on every row whose ``kfold`` differs from `fold`, validating on `fold`.

    Args:
        fold: Value of the ``kfold`` column that marks the held-out rows.
        params: Hyper-parameter dict handed to ``Model`` and to
            ``model.get_grouped_llrd_optimizer_scheduler``.

    Returns:
        The best loss reported by ``train`` across all ``CFG.epochs`` epochs.
    """
    torch.cuda.empty_cache()
    frame = pd.read_csv(CFG.train_file)

    # Rows tagged with `fold` are held out for validation; the rest are used to fit.
    fit_df = frame.loc[frame.kfold != fold]
    holdout_df = frame.loc[frame.kfold == fold]

    fit_texts = fit_df["full_text"].apply(resolve_encodings_and_normalize).to_list()
    holdout_texts = holdout_df["full_text"].apply(resolve_encodings_and_normalize).to_list()

    fit_targets = fit_df[CFG.target_columns].values.tolist()
    holdout_targets = holdout_df[CFG.target_columns].values.tolist()

    fit_ds = Dataset(fit_texts, fit_targets, CFG.tokenizer)
    holdout_ds = Dataset(holdout_texts, holdout_targets, CFG.tokenizer, is_train=False)

    # Both loaders share everything except shuffling.
    collate_fn = Collate(CFG.tokenizer)
    loader_kwargs = dict(batch_size=CFG.batch_size,
                         collate_fn=collate_fn,
                         num_workers=CFG.num_workers)
    train_loader = torch.utils.data.DataLoader(fit_ds, shuffle=True, **loader_kwargs)
    valid_loader = torch.utils.data.DataLoader(holdout_ds, shuffle=False, **loader_kwargs)

    model = Model(CFG.model_name, params)

    # NoisyTune ("A Little Noise Can Help You Finetune Pretrained Language Models
    # Better") is currently disabled. The disabled variant added
    # (torch.rand(param.size()) - 0.5) * 0.15 * torch.std(param) to every
    # parameter in the state dict and then reloaded it into the model.

    # Total step count given to the scheduler: fractional steps per epoch
    # times the number of epochs, truncated to an int.
    steps_per_epoch = len(fit_ds) / CFG.batch_size / CFG.gradient_accumulation_steps
    num_train_steps = int(steps_per_epoch * CFG.epochs)
    optimizer, scheduler = model.get_grouped_llrd_optimizer_scheduler(num_train_steps, params)

    model = model.to(CFG.device)
    best_loss = np.inf
    scaler = GradScaler()

    awp = None
    if CFG.use_awp:
        print('Enable AWP')
        awp = AWP(model,
                  optimizer,
                  adv_lr=CFG.adv_lr,
                  adv_eps=CFG.adv_eps,
                  start_epoch=2,
                  scaler=scaler)

    for epoch in range(CFG.epochs):
        print("\nTRAIN LOOP\n")
        # train() receives valid_loader, best_loss and fold, and is expected to
        # run validation and save the best checkpoint itself; it hands back the
        # updated best loss.
        best_loss = train(epoch, model, train_loader, valid_loader, optimizer,
                          scheduler, CFG.device, awp, scaler, best_loss, fold)

    # Drop the large local references before collecting garbage.
    del model, optimizer, scheduler, train_loader, valid_loader, fit_df, holdout_df
    gc.collect()

    return best_loss

In [None]:
# Train one model per cross-validation fold (0-4) with a fixed set of
# hyper-parameters and report the best loss reached on each fold.
for fold in range(5):
  print("-----"*30, f"\nRUNNING FOLD {fold}\n", "-----"*30, sep="\n")
  # Rebuilt on every iteration so each call to main() gets its own dict.
  params = dict(lr=5e-5,
                llrd=0.9,
                scheduler='polynomial',
                reinit_layers=4)
  best_loss = main(fold, params)
  print("######", f"The best loss for fold {fold} is {best_loss}", "######", sep="\n")
  gc.collect()

------------------------------------------------------------------------------------------------------------------------------------------------------

RUNNING FOLD 0

------------------------------------------------------------------------------------------------------------------------------------------------------


Downloading:   0%|          | 0.00/874M [00:00<?, ?B/s]


TRAIN LOOP

Epoch: [1][20/392] Elapsed 0m 36s (remain 11m 21s) Loss: 0.4230(1.3046) Grad: 2.6483  LR: 0.00004915  
Epoch: [1][40/392] Elapsed 1m 10s (remain 10m 22s) Loss: 0.2401(0.7478) Grad: 8.4716  LR: 0.00004829  
Epoch: [1][60/392] Elapsed 1m 45s (remain 9m 45s) Loss: 0.1414(0.5591) Grad: 3.5440  LR: 0.00004744  
Epoch: [1][80/392] Elapsed 2m 21s (remain 9m 13s) Loss: 0.0770(0.4489) Grad: 1.5898  LR: 0.00004659  
Epoch: [1][100/392] Elapsed 2m 57s (remain 8m 38s) Loss: 0.1291(0.3855) Grad: 6.1771  LR: 0.00004573  
Epoch: [1][120/392] Elapsed 3m 33s (remain 8m 2s) Loss: 0.0924(0.3399) Grad: 3.5196  LR: 0.00004488  
Epoch: [1][140/392] Elapsed 4m 9s (remain 7m 28s) Loss: 0.1289(0.3123) Grad: 4.9873  LR: 0.00004403  
Epoch: [1][160/392] Elapsed 4m 44s (remain 6m 53s) Loss: 0.1364(0.2887) Grad: 1.3655  LR: 0.00004317  
Epoch: [1][180/392] Elapsed 5m 20s (remain 6m 17s) Loss: 0.0897(0.2695) Grad: 1.4337  LR: 0.00004232  
Epoch: [1][200/392] Elapsed 5m 56s (remain 5m 41s) Loss: 0.1186(

In [None]:
# Hyper-parameter search with Optuna on a single fold.
# NOTE(review): the fold passed to main() is index 1, while the training loop
# numbers folds from 0 -- confirm this is the fold intended for tuning.
def objective(trial):
  """Optuna objective: sample hyper-parameters and train on fold 1.

  Args:
      trial: Optuna trial used to sample the learning rate (log scale),
          the layer-wise learning-rate decay, the scheduler type and the
          number of re-initialised layers.

  Returns:
      The best loss returned by ``main(1, params)`` for the sampled params.
  """
  # suggest_loguniform / suggest_uniform are deprecated since Optuna 3.0;
  # suggest_float (with log=True for the log-scale case) is the replacement.
  # The parameter name strings are left untouched (including the "schduler"
  # spelling) because they are the keys reported in study.best_trial.params.
  params = {
      "lr": trial.suggest_float("learning_rate", 1e-7, 1e-4, log=True),
      "llrd": trial.suggest_float("layer_wise_learning_rate_decay", 0.7, 1.0),
      "scheduler": trial.suggest_categorical("learning_rate_schduler", ["polynomial", "cosine", "linear"]),
      "reinit_layers": trial.suggest_int("reinit_layers", 0, 4),
  }
  best_loss = main(1, params)
  return best_loss


# Search for the hyper-parameters that minimise the value returned by
# `objective`, using 20 trials in an in-memory study.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=20)

[32m[I 2022-09-15 18:26:18,743][0m A new study created in memory with name: no-name-d9193ad8-4b94-4d31-a28c-de8c5b753dcc[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 33s) Loss: 1.3281(2.1507) Grad: 16.2040  LR: 0.00000612  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 0.2657(1.2626) Grad: 3.9198  LR: 0.00001225  
Epoch: [1][60/196] Elapsed 3m 35s (remain 8m 7s) Loss: 0.1242(0.8913) Grad: 3.8266  LR: 0.00001837  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.1186(0.6998) Grad: 4.7077  LR: 0.00002450  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.0962(0.5867) Grad: 1.9993  LR: 0.00003062  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.1393(0.5106) Grad: 2.8769  LR: 0.00003583  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1014(0.4560) Grad: 3.6234  LR: 0.00003562  
Epoch: [1][160/196] Elapsed 9m 32s (remain 2m 8s) Loss: 0.0952(0.4140) Grad: 2.8452  LR: 0.00003509  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1466(0.3822) Grad: 3.5936  LR: 0.00003426  
Epoch: [1][196/196] Elapsed 11m 39s (remain 0m 0s) Loss: 0.1468(

[32m[I 2022-09-15 19:04:12,212][0m Trial 0 finished with value: 0.45831298828125 and parameters: {'learning_rate': 3.5828994394331454e-05, 'layer_wise_learning_rate_decay': 0.9767672591137755, 'learning_rate_schduler': 'cosine', 'reinit_layers': 1}. Best is trial 0 with value: 0.45831298828125.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 32s) Loss: 0.6627(1.9961) Grad: 12.6625  LR: 0.00001022  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 18s) Loss: 0.1617(1.1272) Grad: 5.9817  LR: 0.00002043  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 0.1523(0.8059) Grad: 6.1189  LR: 0.00003065  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 54s) Loss: 0.1614(0.6430) Grad: 4.9825  LR: 0.00004086  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.0882(0.5389) Grad: 2.2667  LR: 0.00005108  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.1492(0.4709) Grad: 5.0348  LR: 0.00005975  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.0770(0.4227) Grad: 1.9766  LR: 0.00005941  
Epoch: [1][160/196] Elapsed 9m 32s (remain 2m 8s) Loss: 0.1981(0.3885) Grad: 4.4461  LR: 0.00005853  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1491(0.3605) Grad: 3.6420  LR: 0.00005714  
Epoch: [1][196/196] Elapsed 11m 39s (remain 0m 0s) Loss: 0.1232(

[32m[I 2022-09-15 19:42:05,320][0m Trial 1 finished with value: 0.45570382475852966 and parameters: {'learning_rate': 5.975913614980313e-05, 'layer_wise_learning_rate_decay': 0.9419119594261866, 'learning_rate_schduler': 'cosine', 'reinit_layers': 1}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 26s) Loss: 3.0075(2.8926) Grad: 45.9391  LR: 0.00000002  
Epoch: [1][40/196] Elapsed 2m 22s (remain 9m 15s) Loss: 2.9578(2.8690) Grad: 46.0174  LR: 0.00000004  
Epoch: [1][60/196] Elapsed 3m 33s (remain 8m 4s) Loss: 2.8359(2.8478) Grad: 45.5045  LR: 0.00000007  
Epoch: [1][80/196] Elapsed 4m 45s (remain 6m 53s) Loss: 2.4535(2.8280) Grad: 45.7833  LR: 0.00000009  
Epoch: [1][100/196] Elapsed 5m 56s (remain 5m 42s) Loss: 2.6267(2.7785) Grad: 45.9548  LR: 0.00000011  
Epoch: [1][120/196] Elapsed 7m 7s (remain 4m 30s) Loss: 2.3203(2.7316) Grad: 45.5919  LR: 0.00000013  
Epoch: [1][140/196] Elapsed 8m 18s (remain 3m 19s) Loss: 2.2931(2.6779) Grad: 45.9473  LR: 0.00000013  
Epoch: [1][160/196] Elapsed 9m 30s (remain 2m 8s) Loss: 2.2342(2.6223) Grad: 45.3126  LR: 0.00000013  
Epoch: [1][180/196] Elapsed 10m 41s (remain 0m 57s) Loss: 1.8138(2.5486) Grad: 44.5934  LR: 0.00000013  
Epoch: [1][196/196] Elapsed 11m 37s (remain 0m 0s) Loss:

[32m[I 2022-09-15 20:19:57,130][0m Trial 2 finished with value: 1.2976852655410767 and parameters: {'learning_rate': 1.3139112976803736e-07, 'layer_wise_learning_rate_decay': 0.7741365547424885, 'learning_rate_schduler': 'cosine', 'reinit_layers': 0}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 36s) Loss: 1.6320(2.3574) Grad: 42.3980  LR: 0.00000258  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 20s) Loss: 0.2700(1.5041) Grad: 10.6155  LR: 0.00000515  
Epoch: [1][60/196] Elapsed 3m 35s (remain 8m 8s) Loss: 0.2152(1.0883) Grad: 5.6037  LR: 0.00000773  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.2636(0.8729) Grad: 4.9757  LR: 0.00001031  
Epoch: [1][100/196] Elapsed 5m 58s (remain 5m 43s) Loss: 0.2555(0.7395) Grad: 7.7593  LR: 0.00001288  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 32s) Loss: 0.2169(0.6504) Grad: 7.0170  LR: 0.00001498  
Epoch: [1][140/196] Elapsed 8m 21s (remain 3m 20s) Loss: 0.3085(0.5887) Grad: 5.4635  LR: 0.00001433  
Epoch: [1][160/196] Elapsed 9m 32s (remain 2m 8s) Loss: 0.1893(0.5397) Grad: 5.4492  LR: 0.00001369  
Epoch: [1][180/196] Elapsed 10m 44s (remain 0m 57s) Loss: 0.1760(0.5016) Grad: 8.2680  LR: 0.00001305  
Epoch: [1][196/196] Elapsed 11m 39s (remain 0m 0s) Loss: 0.1654

[32m[I 2022-09-15 20:57:45,530][0m Trial 3 finished with value: 0.5547553300857544 and parameters: {'learning_rate': 1.5072591229583353e-05, 'layer_wise_learning_rate_decay': 0.735996067546315, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 0}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 34s) Loss: 2.9079(2.8963) Grad: 19.5235  LR: 0.00000007  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 3.0929(2.9001) Grad: 20.1187  LR: 0.00000013  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 7s) Loss: 2.9493(2.8764) Grad: 19.9003  LR: 0.00000020  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 2.6498(2.8356) Grad: 19.7370  LR: 0.00000026  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 2.4036(2.7994) Grad: 19.7950  LR: 0.00000033  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 2.2564(2.7526) Grad: 20.0148  LR: 0.00000038  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 2.3973(2.6889) Grad: 19.9308  LR: 0.00000037  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 2.2436(2.6279) Grad: 20.1031  LR: 0.00000035  
Epoch: [1][180/196] Elapsed 10m 42s (remain 0m 57s) Loss: 1.9258(2.5645) Grad: 19.6313  LR: 0.00000033  
Epoch: [1][196/196] Elapsed 11m 37s (remain 0m 0s) Loss:

[32m[I 2022-09-15 21:35:29,369][0m Trial 4 finished with value: 1.177595853805542 and parameters: {'learning_rate': 3.8498404377033904e-07, 'layer_wise_learning_rate_decay': 0.7221476129529402, 'learning_rate_schduler': 'linear', 'reinit_layers': 2}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 35s) Loss: 2.2792(2.4612) Grad: 21.3833  LR: 0.00000020  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 20s) Loss: 2.5460(2.4187) Grad: 21.7220  LR: 0.00000041  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 7s) Loss: 1.9550(2.3407) Grad: 21.4315  LR: 0.00000061  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 1.6546(2.2281) Grad: 21.5187  LR: 0.00000081  
Epoch: [1][100/196] Elapsed 5m 58s (remain 5m 43s) Loss: 1.3316(2.0808) Grad: 20.3413  LR: 0.00000102  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.8517(1.9025) Grad: 18.6450  LR: 0.00000119  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.4417(1.7069) Grad: 11.7520  LR: 0.00000118  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.3019(1.5293) Grad: 8.5100  LR: 0.00000117  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.2264(1.3873) Grad: 3.2246  LR: 0.00000114  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0

[32m[I 2022-09-15 22:13:21,780][0m Trial 5 finished with value: 0.5984465479850769 and parameters: {'learning_rate': 1.1896075168370157e-06, 'layer_wise_learning_rate_decay': 0.7212806415658086, 'learning_rate_schduler': 'cosine', 'reinit_layers': 3}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 35s) Loss: 2.4248(2.5588) Grad: 22.4338  LR: 0.00000004  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 2.7910(2.5485) Grad: 22.1282  LR: 0.00000008  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 2.4471(2.5460) Grad: 22.0588  LR: 0.00000012  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 2.1733(2.5048) Grad: 22.1435  LR: 0.00000016  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 2.3596(2.4730) Grad: 22.2139  LR: 0.00000020  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 2.0947(2.4229) Grad: 22.2017  LR: 0.00000023  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 1.9021(2.3670) Grad: 22.2261  LR: 0.00000023  
Epoch: [1][160/196] Elapsed 9m 32s (remain 2m 8s) Loss: 1.6768(2.3123) Grad: 21.6222  LR: 0.00000022  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 1.4424(2.2496) Grad: 21.6740  LR: 0.00000022  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss:

[32m[I 2022-09-15 22:51:14,205][0m Trial 6 finished with value: 1.1141407489776611 and parameters: {'learning_rate': 2.2922957603103856e-07, 'layer_wise_learning_rate_decay': 0.7872546732613366, 'learning_rate_schduler': 'cosine', 'reinit_layers': 3}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 39s) Loss: 2.2866(2.2442) Grad: 25.0934  LR: 0.00000055  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 21s) Loss: 1.6449(2.0800) Grad: 24.1665  LR: 0.00000110  
Epoch: [1][60/196] Elapsed 3m 35s (remain 8m 7s) Loss: 0.8250(1.7940) Grad: 20.3056  LR: 0.00000166  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.2524(1.4649) Grad: 7.4933  LR: 0.00000221  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.1931(1.2125) Grad: 3.9911  LR: 0.00000276  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.2164(1.0484) Grad: 2.5684  LR: 0.00000321  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.3320(0.9262) Grad: 3.9656  LR: 0.00000307  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.2017(0.8324) Grad: 4.7449  LR: 0.00000293  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1928(0.7586) Grad: 3.0671  LR: 0.00000280  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.126

[32m[I 2022-09-15 23:29:06,692][0m Trial 7 finished with value: 0.5056667923927307 and parameters: {'learning_rate': 3.2291716682826884e-06, 'layer_wise_learning_rate_decay': 0.7547056904289499, 'learning_rate_schduler': 'linear', 'reinit_layers': 4}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 37s) Loss: 0.3207(1.3670) Grad: 10.5144  LR: 0.00000891  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 20s) Loss: 0.2283(0.8161) Grad: 2.8694  LR: 0.00001781  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 7s) Loss: 0.2030(0.6093) Grad: 4.4690  LR: 0.00002672  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.1216(0.4948) Grad: 3.6565  LR: 0.00003563  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.1262(0.4214) Grad: 4.3316  LR: 0.00004453  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.1467(0.3738) Grad: 3.4700  LR: 0.00005177  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1391(0.3393) Grad: 4.8795  LR: 0.00004955  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1094(0.3154) Grad: 3.3148  LR: 0.00004733  
Epoch: [1][180/196] Elapsed 10m 42s (remain 0m 57s) Loss: 0.1142(0.2938) Grad: 2.3177  LR: 0.00004511  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.1448(

[32m[I 2022-09-16 00:06:42,044][0m Trial 8 finished with value: 0.4658414423465729 and parameters: {'learning_rate': 5.210470632097099e-05, 'layer_wise_learning_rate_decay': 0.9815766262345241, 'learning_rate_schduler': 'linear', 'reinit_layers': 3}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 33s) Loss: 2.4134(2.3962) Grad: 42.6502  LR: 0.00000033  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 1.9811(2.2472) Grad: 41.9844  LR: 0.00000065  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 1.2317(2.0337) Grad: 38.3061  LR: 0.00000098  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 54s) Loss: 0.4153(1.7435) Grad: 20.5296  LR: 0.00000130  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.2168(1.4496) Grad: 4.8794  LR: 0.00000163  
Epoch: [1][120/196] Elapsed 7m 8s (remain 4m 31s) Loss: 0.2251(1.2432) Grad: 7.4450  LR: 0.00000189  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.3085(1.0970) Grad: 5.9484  LR: 0.00000181  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1537(0.9886) Grad: 4.0654  LR: 0.00000173  
Epoch: [1][180/196] Elapsed 10m 42s (remain 0m 57s) Loss: 0.1554(0.8985) Grad: 4.1655  LR: 0.00000165  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.15

[32m[I 2022-09-16 00:44:35,900][0m Trial 9 finished with value: 0.6222785711288452 and parameters: {'learning_rate': 1.906783363311194e-06, 'layer_wise_learning_rate_decay': 0.7418116385385011, 'learning_rate_schduler': 'linear', 'reinit_layers': 0}. Best is trial 1 with value: 0.45570382475852966.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 33s) Loss: 2.6041(2.7005) Grad: 17.3163  LR: 0.00000175  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 18s) Loss: 1.3273(2.3319) Grad: 16.5500  LR: 0.00000350  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 0.2629(1.7385) Grad: 4.1191  LR: 0.00000524  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 54s) Loss: 0.1850(1.3510) Grad: 2.8408  LR: 0.00000699  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.1194(1.1085) Grad: 4.0295  LR: 0.00000874  
Epoch: [1][120/196] Elapsed 7m 8s (remain 4m 31s) Loss: 0.1170(0.9457) Grad: 4.0105  LR: 0.00001016  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.0956(0.8292) Grad: 1.9154  LR: 0.00000972  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1318(0.7405) Grad: 3.9856  LR: 0.00000929  
Epoch: [1][180/196] Elapsed 10m 42s (remain 0m 57s) Loss: 0.0797(0.6709) Grad: 1.9032  LR: 0.00000885  
Epoch: [1][196/196] Elapsed 11m 37s (remain 0m 0s) Loss: 0.1035

[32m[I 2022-09-16 01:22:27,296][0m Trial 10 finished with value: 0.4555629789829254 and parameters: {'learning_rate': 1.0223764573961562e-05, 'layer_wise_learning_rate_decay': 0.9089745817743605, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 1}. Best is trial 10 with value: 0.4555629789829254.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 34s) Loss: 2.3743(2.8769) Grad: 17.4244  LR: 0.00000177  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 1.2638(2.4915) Grad: 16.5317  LR: 0.00000354  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 0.3078(1.8601) Grad: 5.0017  LR: 0.00000531  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.1489(1.4458) Grad: 5.7934  LR: 0.00000708  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.1392(1.1840) Grad: 3.4416  LR: 0.00000884  
Epoch: [1][120/196] Elapsed 7m 8s (remain 4m 31s) Loss: 0.1509(1.0075) Grad: 6.5264  LR: 0.00001028  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1047(0.8811) Grad: 4.3910  LR: 0.00000984  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1322(0.7849) Grad: 2.5689  LR: 0.00000940  
Epoch: [1][180/196] Elapsed 10m 42s (remain 0m 57s) Loss: 0.1091(0.7113) Grad: 2.3364  LR: 0.00000896  
Epoch: [1][196/196] Elapsed 11m 37s (remain 0m 0s) Loss: 0.1441

[32m[I 2022-09-16 02:00:17,411][0m Trial 11 finished with value: 0.4552156627178192 and parameters: {'learning_rate': 1.0348080335764067e-05, 'layer_wise_learning_rate_decay': 0.9154759345739282, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 1}. Best is trial 11 with value: 0.4552156627178192.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 33s) Loss: 2.2095(2.6464) Grad: 17.1960  LR: 0.00000144  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 1.7338(2.3753) Grad: 16.3909  LR: 0.00000288  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 0.2608(1.8709) Grad: 3.4001  LR: 0.00000432  
Epoch: [1][80/196] Elapsed 4m 45s (remain 6m 54s) Loss: 0.1422(1.4588) Grad: 2.9088  LR: 0.00000576  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 42s) Loss: 0.1322(1.2014) Grad: 3.4412  LR: 0.00000721  
Epoch: [1][120/196] Elapsed 7m 8s (remain 4m 31s) Loss: 0.1136(1.0239) Grad: 3.8666  LR: 0.00000838  
Epoch: [1][140/196] Elapsed 8m 19s (remain 3m 19s) Loss: 0.1123(0.8947) Grad: 2.4485  LR: 0.00000802  
Epoch: [1][160/196] Elapsed 9m 30s (remain 2m 8s) Loss: 0.0813(0.7970) Grad: 1.4254  LR: 0.00000766  
Epoch: [1][180/196] Elapsed 10m 42s (remain 0m 57s) Loss: 0.1009(0.7210) Grad: 2.8761  LR: 0.00000730  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.1665

[32m[I 2022-09-16 02:38:07,724][0m Trial 12 finished with value: 0.4534367322921753 and parameters: {'learning_rate': 8.430514132964566e-06, 'layer_wise_learning_rate_decay': 0.8931288973170695, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 1}. Best is trial 12 with value: 0.4534367322921753.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 35s) Loss: 2.2832(2.5220) Grad: 19.9522  LR: 0.00000120  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 0.9825(2.1572) Grad: 18.0501  LR: 0.00000240  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 0.2484(1.6441) Grad: 6.3193  LR: 0.00000360  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 54s) Loss: 0.2434(1.2891) Grad: 7.0590  LR: 0.00000480  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.1364(1.0664) Grad: 1.7462  LR: 0.00000600  
Epoch: [1][120/196] Elapsed 7m 8s (remain 4m 31s) Loss: 0.1184(0.9148) Grad: 2.1593  LR: 0.00000698  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1196(0.8032) Grad: 2.8526  LR: 0.00000668  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1685(0.7180) Grad: 2.9121  LR: 0.00000638  
Epoch: [1][180/196] Elapsed 10m 42s (remain 0m 57s) Loss: 0.1005(0.6521) Grad: 2.3067  LR: 0.00000608  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.0842

[32m[I 2022-09-16 03:15:59,833][0m Trial 13 finished with value: 0.4558442533016205 and parameters: {'learning_rate': 7.0242596937025054e-06, 'layer_wise_learning_rate_decay': 0.8529699795029209, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 2}. Best is trial 12 with value: 0.4534367322921753.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 33s) Loss: 2.1400(2.5713) Grad: 17.4885  LR: 0.00000291  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 18s) Loss: 0.6804(1.9668) Grad: 13.8776  LR: 0.00000582  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 0.1551(1.3973) Grad: 3.0829  LR: 0.00000873  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 54s) Loss: 0.1315(1.0872) Grad: 2.0704  LR: 0.00001164  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.1088(0.8941) Grad: 3.2032  LR: 0.00001455  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.1265(0.7645) Grad: 3.1294  LR: 0.00001692  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1299(0.6719) Grad: 3.3602  LR: 0.00001619  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1288(0.6026) Grad: 5.2275  LR: 0.00001547  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1380(0.5489) Grad: 3.1907  LR: 0.00001474  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.1286

[32m[I 2022-09-16 03:53:53,804][0m Trial 14 finished with value: 0.4489534795284271 and parameters: {'learning_rate': 1.7027701828938127e-05, 'layer_wise_learning_rate_decay': 0.8780935762069765, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 1}. Best is trial 14 with value: 0.4489534795284271.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 33s) Loss: 1.9800(2.4633) Grad: 18.8299  LR: 0.00000371  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 18s) Loss: 0.2941(1.6175) Grad: 4.2475  LR: 0.00000741  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 6s) Loss: 0.2280(1.1579) Grad: 6.2789  LR: 0.00001112  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 54s) Loss: 0.1413(0.9121) Grad: 2.9273  LR: 0.00001483  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.1081(0.7558) Grad: 4.2234  LR: 0.00001853  
Epoch: [1][120/196] Elapsed 7m 8s (remain 4m 31s) Loss: 0.1060(0.6517) Grad: 1.9142  LR: 0.00002154  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1474(0.5752) Grad: 4.3052  LR: 0.00002062  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1363(0.5175) Grad: 2.5704  LR: 0.00001970  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1022(0.4732) Grad: 2.9929  LR: 0.00001877  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.1170(

[32m[I 2022-09-16 04:31:47,643][0m Trial 15 finished with value: 0.45275118947029114 and parameters: {'learning_rate': 2.168343678854504e-05, 'layer_wise_learning_rate_decay': 0.8571961513207124, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 2}. Best is trial 14 with value: 0.4489534795284271.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 11s (remain 10m 33s) Loss: 1.6403(2.2064) Grad: 18.6384  LR: 0.00000407  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 19s) Loss: 0.3045(1.4024) Grad: 6.8586  LR: 0.00000814  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 7s) Loss: 0.2466(1.0238) Grad: 7.4947  LR: 0.00001221  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.1050(0.8146) Grad: 3.3463  LR: 0.00001627  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.0963(0.6806) Grad: 2.7758  LR: 0.00002034  
Epoch: [1][120/196] Elapsed 7m 8s (remain 4m 31s) Loss: 0.1422(0.5868) Grad: 7.0914  LR: 0.00002365  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.0890(0.5190) Grad: 2.9379  LR: 0.00002263  
Epoch: [1][160/196] Elapsed 9m 31s (remain 2m 8s) Loss: 0.1512(0.4687) Grad: 6.9918  LR: 0.00002162  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1182(0.4297) Grad: 2.8159  LR: 0.00002060  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.1106(

[32m[I 2022-09-16 05:09:42,784][0m Trial 16 finished with value: 0.454280823469162 and parameters: {'learning_rate': 2.3800759968097607e-05, 'layer_wise_learning_rate_decay': 0.8363549913652786, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 2}. Best is trial 14 with value: 0.4489534795284271.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 37s) Loss: 0.3605(1.4878) Grad: 9.8687  LR: 0.00001317  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 21s) Loss: 0.2062(0.9130) Grad: 3.3334  LR: 0.00002635  
Epoch: [1][60/196] Elapsed 3m 35s (remain 8m 8s) Loss: 0.1867(0.6777) Grad: 5.4035  LR: 0.00003952  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.1797(0.5494) Grad: 4.0744  LR: 0.00005270  
Epoch: [1][100/196] Elapsed 5m 58s (remain 5m 43s) Loss: 0.1112(0.4646) Grad: 3.0191  LR: 0.00006587  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.0880(0.4108) Grad: 4.0185  LR: 0.00007658  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1163(0.3732) Grad: 3.6453  LR: 0.00007329  
Epoch: [1][160/196] Elapsed 9m 32s (remain 2m 8s) Loss: 0.1144(0.3429) Grad: 3.1968  LR: 0.00007001  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1454(0.3208) Grad: 3.1167  LR: 0.00006672  
Epoch: [1][196/196] Elapsed 11m 39s (remain 0m 0s) Loss: 0.0859(0

[32m[I 2022-09-16 05:47:37,855][0m Trial 17 finished with value: 0.45891880989074707 and parameters: {'learning_rate': 7.707324684449108e-05, 'layer_wise_learning_rate_decay': 0.8339476407176097, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 4}. Best is trial 14 with value: 0.4489534795284271.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 36s) Loss: 2.6135(2.7590) Grad: 19.8303  LR: 0.00000063  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 20s) Loss: 2.0224(2.5872) Grad: 19.9114  LR: 0.00000127  
Epoch: [1][60/196] Elapsed 3m 34s (remain 8m 7s) Loss: 1.1023(2.2502) Grad: 18.6274  LR: 0.00000190  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 55s) Loss: 0.2280(1.8308) Grad: 6.7808  LR: 0.00000254  
Epoch: [1][100/196] Elapsed 5m 57s (remain 5m 43s) Loss: 0.2152(1.5102) Grad: 3.8052  LR: 0.00000317  
Epoch: [1][120/196] Elapsed 7m 9s (remain 4m 31s) Loss: 0.2582(1.2887) Grad: 2.1512  LR: 0.00000369  
Epoch: [1][140/196] Elapsed 8m 20s (remain 3m 20s) Loss: 0.1642(1.1284) Grad: 2.3194  LR: 0.00000353  
Epoch: [1][160/196] Elapsed 9m 32s (remain 2m 8s) Loss: 0.1995(1.0067) Grad: 3.5046  LR: 0.00000337  
Epoch: [1][180/196] Elapsed 10m 43s (remain 0m 57s) Loss: 0.1004(0.9093) Grad: 2.2256  LR: 0.00000321  
Epoch: [1][196/196] Elapsed 11m 38s (remain 0m 0s) Loss: 0.110

[32m[I 2022-09-16 06:25:32,764][0m Trial 18 finished with value: 0.46429410576820374 and parameters: {'learning_rate': 3.712200194342814e-06, 'layer_wise_learning_rate_decay': 0.873797238234208, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 2}. Best is trial 14 with value: 0.4489534795284271.[0m



TRAIN LOOP

Epoch: [1][20/196] Elapsed 1m 12s (remain 10m 36s) Loss: 1.6407(2.3579) Grad: 19.6849  LR: 0.00000342  
Epoch: [1][40/196] Elapsed 2m 23s (remain 9m 21s) Loss: 0.1692(1.6092) Grad: 2.6283  LR: 0.00000683  
Epoch: [1][60/196] Elapsed 3m 35s (remain 8m 8s) Loss: 0.1660(1.1529) Grad: 1.6743  LR: 0.00001025  
Epoch: [1][80/196] Elapsed 4m 46s (remain 6m 56s) Loss: 0.1442(0.9106) Grad: 2.9029  LR: 0.00001366  
Epoch: [1][100/196] Elapsed 5m 58s (remain 5m 44s) Loss: 0.1284(0.7553) Grad: 5.7241  LR: 0.00001708  
Epoch: [1][120/196] Elapsed 7m 10s (remain 4m 32s) Loss: 0.0961(0.6498) Grad: 6.2203  LR: 0.00001985  
Epoch: [1][140/196] Elapsed 8m 21s (remain 3m 20s) Loss: 0.0909(0.5737) Grad: 2.3681  LR: 0.00001900  
Epoch: [1][160/196] Elapsed 9m 33s (remain 2m 8s) Loss: 0.0931(0.5165) Grad: 2.3086  LR: 0.00001815  
Epoch: [1][180/196] Elapsed 10m 44s (remain 0m 57s) Loss: 0.1277(0.4712) Grad: 2.2002  LR: 0.00001730  
Epoch: [1][196/196] Elapsed 11m 40s (remain 0m 0s) Loss: 0.1350

[32m[I 2022-09-16 07:03:31,355][0m Trial 19 finished with value: 0.45386430621147156 and parameters: {'learning_rate': 1.9979077386798717e-05, 'layer_wise_learning_rate_decay': 0.8086618256858717, 'learning_rate_schduler': 'polynomial', 'reinit_layers': 2}. Best is trial 14 with value: 0.4489534795284271.[0m


In [None]:
# Show the hyper-parameters of the best trial found by the study.
study.best_params

{'learning_rate': 1.7027701828938127e-05,
 'layer_wise_learning_rate_decay': 0.8780935762069765,
 'learning_rate_schduler': 'polynomial',
 'reinit_layers': 1}

In [None]:
# Recursively copy the local deberta-v3-large/ directory into the Drive folder.
# Assumes Google Drive is already mounted at /content/drive -- TODO confirm.
# Presumably this directory holds the saved fold checkpoints; verify against CFG.save_dir.
!cp -r deberta-v3-large/ /content/drive/MyDrive/Kaggle/FeedbackPrize3/