In [1]:
!pip install transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 31.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 78.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 95.2 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 86.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers, sentencepiece
Successfully installed huggingface-hub-0.10.0 sentencepiece-0.1.97 tokenizers-0.12.1 transformers-4.22.2


In [2]:
import torch
import torch.nn as nn
import numpy as np
import os
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_polynomial_decay_schedule_with_warmup, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
import pandas as pd
from transformers.optimization import Adafactor, AdafactorSchedule
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import mean_squared_error
import random
import time
from torch.utils import checkpoint
import math
import gc
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
import warnings
import torch.nn.functional as F

In [3]:
import transformers
transformers.logging.set_verbosity_error()

In [4]:
warnings.simplefilter('ignore')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!nvidia-smi

Sun Oct  2 17:48:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [8]:
#####
#### BEST PARAMS FOR THE DEBERTA V3 BASE MODEL
# params = {'learning_rate': 0.0002634969863920811, 
#             'layer_wise_learning_rate_decay': 0.7867664854455205, 
#             'learning_rate_schduler': 'polynomial', 
#             'reinit_layers': 3}
#####

In [9]:
class CFG:
    train_file = "/content/drive/MyDrive/Kaggle/FeedbackPrize3/train_folds.csv"
    fold = 0
    batch_size = 16
    num_workers = 4
    target_columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    hidden_dropout_prob = 0.0
    reinit_weights = False
    reinit_layers = 1
    lr = 0.001
    llrd = 0.9
    warmup_ratio = 0
    use_awp = False
    adv_lr = 0.0002
    adv_eps = 0.001
    model_name = "microsoft/deberta-v3-large"
    gradient_accumulation_steps = 1
    max_grad_norm = 10
    print_freq = 20
    epochs = 3
    n_tokens = 40
    specific_max_len = 768 - n_tokens
    token_dropout = False
    token_dropout_prob = 0.15
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    gradient_checkpointing_enable = True
    save_dir = "deberta-v3-large"
    save_model_name = "deberta-v3-large"

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [10]:
#Preprocessing Functions

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [11]:
#Utiliy functions 
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))
        
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores

In [12]:
class Collate:
    def __init__(self, tokenizer, max_length = CFG.specific_max_len):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __call__(self, batch):
        
        batch_len = max([len(sample["ids"]) for sample in batch])
        
        output = dict()
        output["ids"] = [sample["ids"] for sample in batch]
        output["mask"] = [sample["mask"] for sample in batch]
        output["targets"] = [sample["targets"] for sample in batch]
        
        if self.tokenizer.padding_side == "right":
            output["ids"] = [s + [self.tokenizer.pad_token_id] * (batch_len - len(s)) for s in output["ids"]]
            output["mask"] = [s + [0] * (batch_len - len(s)) for s in output["mask"]]
        else:
            output["ids"] = [[self.tokenizer.pad_token_id] * (batch_len - len(s)) + s for s in output["ids"]]
            output["mask"] = [[0] * (batch_len - len(s)) + s for s in output["mask"]]
            
            
        output["ids"] = torch.tensor(output["ids"], dtype = torch.long)
        output["mask"] = torch.tensor(output["mask"], dtype = torch.long)
        output["targets"] = torch.tensor(output["targets"], dtype = torch.float32)
        
        return output

In [13]:
class Dataset:
    def __init__(self, texts, targets, tokenizer, is_train = True):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.is_train = is_train
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        
        text = self.texts[idx]
        targets = self.targets[idx]
        
        
        if CFG.specific_max_len is not None:
            encoding = self.tokenizer(text, add_special_tokens = True, max_length = CFG.specific_max_len, padding = False, truncation = 'longest_first')
        else:
            encoding = self.tokenizer(text, add_special_tokens = True)
        
        sample = dict()

        if CFG.token_dropout and self.is_train:
            print("Running token dropout")
            idxs = np.random.choice(np.arange(1, len(encoding["input_ids"]) - 1), size = int(CFG.token_dropout_prob * len(encoding["input_ids"])), replace = False)
            ids = np.array(encoding["input_ids"])
            ids[idxs] = self.tokenizer.mask_token_id
            encoding["input_ids"] = ids.tolist()
          
        sample["ids"] = [50256] * CFG.n_tokens + encoding["input_ids"]
        sample["mask"] = [1] * CFG.n_tokens + encoding["attention_mask"]  
        sample["targets"] = targets
        
        return sample

In [14]:
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

class SmoothRMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.SmoothL1Loss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

class SmoothRMSEComp(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.SmoothL1Loss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true).mean(dim = 0)).mean(dim = 0)
        return loss  


class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )

    def forward(self, features):

        all_layer_embedding = torch.stack(features)
        all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :]

        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()

        return weighted_average


class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

In [15]:
#AWP
class AWP:
    def __init__(
        self,
        model,
        optimizer,
        adv_param="weight",
        adv_lr=1,
        adv_eps=0.2,
        start_epoch=0,
        adv_step=1,
        scaler=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.scaler = scaler

    def attack_backward(self, x, y, attention_mask,epoch):
        if (self.adv_lr == 0) or (epoch < self.start_epoch):
            return None

        self._save() 
        for i in range(self.adv_step):
            self._attack_step() 
            with torch.cuda.amp.autocast():
                adv_loss, tr_logits = self.model(ids=x, mask=attention_mask, targets=y)
                adv_loss = adv_loss.mean()
            self.optimizer.zero_grad()
            self.scaler.scale(adv_loss).backward()
            
        self._restore()

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [16]:
#Getting the prompt tuning soft embeddings
class SoftEmbedding(nn.Module):
    def __init__(self, 
                wte: nn.Embedding,
                n_tokens: int = 10, 
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        """appends learned embedding to 
        Args:
            wte (nn.Embedding): original transformer word embedding
            n_tokens (int, optional): number of tokens for task. Defaults to 10.
            random_range (float, optional): range to init embedding (if not initialize from vocab). Defaults to 0.5.
            initialize_from_vocab (bool, optional): initalizes from default vocab. Defaults to True.
        """
        super(SoftEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens, 
                                                                               random_range, 
                                                                               initialize_from_vocab))
            
    def initialize_embedding(self, 
                             wte: nn.Embedding,
                             n_tokens: int = 10, 
                             random_range: float = 0.5, 
                             initialize_from_vocab: bool = True):
        """initializes learned embedding
        Args:
            same as __init__
        Returns:
            torch.float: initialized using original schemes
        """
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(n_tokens, wte.weight.size(1)).uniform_(-random_range, random_range)
            
    def forward(self, tokens):
        """run forward pass
        Args:
            tokens (torch.long): input tokens before encoding
        Returns:
            torch.float: encoding of text concatenated with learned task specifc embedding
        """
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

In [17]:
class Model(nn.Module):
    def __init__(self, model_name):
        super(Model, self).__init__()
        
        self.model_name = model_name

        hidden_dropout_prob: float = CFG.hidden_dropout_prob
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(model_name)

        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "attention_probs_dropout_prob" : hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": 6,
            }
        )
        
        self.config = config
        
        #Using Prompt Tuning
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        s_wte = SoftEmbedding(self.transformer.get_input_embeddings(), 
                      n_tokens=CFG.n_tokens, 
                      initialize_from_vocab=True)
        self.transformer.set_input_embeddings(s_wte)
        
        if CFG.gradient_checkpointing_enable:
            self.transformer.gradient_checkpointing_enable()
        
        self.layer_nums = list(range(self.config.num_hidden_layers - 6, self.config.num_hidden_layers))
            
        self.freeze()
        
        self.output = nn.Linear(config.hidden_size, 6)
        self.loss = nn.SmoothL1Loss(reduction = "mean")

        if CFG.reinit_weights:
            self.init_weights_(CFG.reinit_layers)

    def init_weights_(self, reinit_layers):
        for layer in self.transformer.encoder.layer[-reinit_layers:]:
            for module in layer.modules():
                if isinstance(module, nn.Linear):
                    module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                    if module.bias is not None:
                        module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                    module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                    if module.padding_idx is not None:
                        module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)

    def get_grouped_llrd_optimizer_scheduler(self, num_train_steps):
        no_decay = ["bias", "LayerNorm.weight"]
        top_params = [('learned_embedding', self.transformer.embeddings.word_embeddings.learned_embedding)] #+ list(self.output.named_parameters())
      # initialize lr for task specific layer
        optimizer_grouped_parameters = [
              {
                  "params": [p for n, p in top_params if not any(nd in n for nd in no_decay)],
                  "weight_decay": 0.001,
                  "lr": CFG.lr,
              },
              {
                  "params": [p for n, p in top_params if any(nd in n for nd in no_decay)],
                  "weight_decay": 0.0,
                  "lr": CFG.lr,
              },
          ]
      # initialize lrs for every layer
        num_layers = self.config.num_hidden_layers
        layers = [self.transformer.embeddings] + list(self.transformer.encoder.layer)
        layers.reverse()
        layers = layers[:6]
        lr = 2e-5
        for layer in layers:
            lr *= CFG.llrd
            optimizer_grouped_parameters += [
              {
                  "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                  "weight_decay": 0.001,
                  "lr": lr,
              },
              {
                  "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                  "weight_decay": 0.0,
                  "lr": lr,
              },
          ]
        opt = torch.optim.AdamW(optimizer_grouped_parameters)
        sch = get_polynomial_decay_schedule_with_warmup(
          opt,
          num_warmup_steps=int(num_train_steps * CFG.warmup_ratio),
          num_training_steps=num_train_steps,
          last_epoch=-1,
          )
        return opt, sch
    
    
    def freeze(self):
        for n,param in self.transformer.named_parameters():
            if "learned_embedding" not in n and not any([str(ln) in n for ln in self.layer_nums]):
                param.requires_grad = False
        
    
    def get_optimizer_scheduler(self, num_train_steps):
        param_optimizer = [('learned_embedding', self.transformer.embeddings.word_embeddings.learned_embedding)] + list(self.output.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        transformer_parms = [(n,p) for n,p in self.transformer.named_parameters() if any([str(ln) in n for ln in self.layer_nums])] #+ [(n,p) for n,p in self.named_parameters() if n in ["transformer.encoder.LayerNorm.weight","transformer.encoder.LayerNorm.bias"]]
        optimizer_parameters += [
            {
                "params": [p for n, p in transformer_parms if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
                "lr" : 2e-5
            },
            {
                "params": [p for n, p in transformer_parms if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr" : 2e-5
            },
        ]
        opt = torch.optim.AdamW(optimizer_parameters, lr=CFG.lr)

        sch = get_polynomial_decay_schedule_with_warmup(
          opt,
          num_warmup_steps=int(num_train_steps * CFG.warmup_ratio),
          num_training_steps=num_train_steps,
          last_epoch=-1,
          )
        return opt, sch

    def forward(self, ids, mask, token_type_ids=None, targets=None):
        if token_type_ids is not None:
            transformer_out = self.transformer( ids, mask, token_type_ids )
        else:
            transformer_out = self.transformer(ids, mask)
        sequence_output = transformer_out.last_hidden_state[:,0,:]
        logits = self.output(sequence_output)
        loss = self.loss(logits, targets)

        return loss, logits

In [18]:
def train(epoch, model, train_loader, valid_loader, optimizer, scheduler, device, awp, scaler, best_loss, fold):
    model.train()
    losses = AverageMeter()
    start = end = time.time()
    val_steps = len(train_loader) // 1
    for step, x in enumerate(train_loader):
        for k,v in x.items():
            x[k] = v.to(device)
        
        with autocast():    
            loss, logits = model(**x)
            
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        
        losses.update(loss.item() * CFG.gradient_accumulation_steps, CFG.batch_size)
        scaler.scale(loss).backward()
        if CFG.use_awp:
            awp.attack_backward(x["ids"],x["targets"],x["mask"],epoch)
        
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        end = time.time()
        
        if ((step + 1) % CFG.print_freq == 0) or (step == (len(train_loader)-1)):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step + 1, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm= grad_norm,
                          lr=scheduler.get_lr()[0] if scheduler is not None else CFG.lr))
        
        if ((step + 1) % val_steps == 0) or ((step + 1) == len(train_loader)):
            print("\nVALID LOOP\n")
            valid_loss = valid(epoch, model, valid_loader, device)
            print(f"\nThe valid loss for the current epoch is {valid_loss}\n")
            torch.cuda.empty_cache()
            gc.collect()
            if valid_loss < best_loss:
                best_loss = valid_loss
                if CFG.save_dir is not None:
                    if not os.path.exists(CFG.save_dir):
                        os.mkdir(CFG.save_dir)
                    save_path = os.path.join(CFG.save_dir, f"{CFG.save_model_name}_fold_{fold}.pth")
                else:
                    save_path = f"{CFG.save_model_name}_fold_{fold}.pth"
                torch.save(model.state_dict(), save_path)
        model.train()

def valid(epoch, model, valid_loader, device):
    model.eval()
    all_targets = []
    all_outputs = []
    losses = AverageMeter()
    with torch.no_grad():
        for step, x in enumerate(valid_loader):

            for k, v in x.items():
                x[k] = v.to(device)
          
            loss, logits = model(**x)

            losses.update(loss.item(), CFG.batch_size)
            targets = x["targets"].cpu().numpy()
            outputs = logits.cpu().numpy()

            all_targets.append(targets)
            all_outputs.append(np.clip(outputs, 1.0, 5.0))

            if ((step + 1) % CFG.print_freq == 0) or (step == (len(valid_loader)-1)):
                print('Epoch: [{0}][{1}/{2}] '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      .format(epoch+1, step + 1, len(valid_loader), loss=losses))
        
    
    all_targets = np.vstack(all_targets)
    all_outputs = np.vstack(all_outputs)
    loss = get_score(all_targets, all_outputs)[0]
    
    del all_targets, all_outputs;
    return loss

In [19]:
def main(fold):
    torch.cuda.empty_cache()
    df = pd.read_csv(CFG.train_file)
    
    train_df = df.loc[df.kfold != fold]
    valid_df = df.loc[df.kfold == fold]
    
    train_texts = train_df["full_text"].apply(resolve_encodings_and_normalize).to_list()
    valid_texts = valid_df["full_text"].apply(resolve_encodings_and_normalize).to_list()
    
    train_targets = train_df[CFG.target_columns].values.tolist()
    
    valid_targets = valid_df[CFG.target_columns].values.tolist()
    
    train_ds = Dataset(train_texts, train_targets, CFG.tokenizer)
    valid_ds = Dataset(valid_texts, valid_targets, CFG.tokenizer, is_train = False)

    collate_fn = Collate(CFG.tokenizer)
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size = CFG.batch_size, shuffle = True, collate_fn = collate_fn, num_workers = CFG.num_workers)
    valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size = CFG.batch_size, shuffle = False, collate_fn = collate_fn, num_workers = CFG.num_workers)
    
    model = Model(CFG.model_name)
    num_train_steps = int(len(train_ds) / CFG.batch_size / CFG.gradient_accumulation_steps * CFG.epochs)
    optimizer, scheduler = model.get_optimizer_scheduler(num_train_steps)
    
    model = model.to(CFG.device)
    best_loss = np.inf
    scaler = GradScaler()
    if CFG.use_awp:
        print('ENABLE AWP')
        awp = AWP(model,
          optimizer,
          adv_lr=CFG.adv_lr,
          adv_eps=CFG.adv_eps,
          start_epoch=3,
          scaler=scaler)
    else:
        awp = None
    for epoch in range(CFG.epochs):
        print("\nTRAIN LOOP\n")
        train(epoch, model, train_loader, valid_loader, optimizer, scheduler, CFG.device, awp, scaler, best_loss, fold) # None for the scaler parameter
        # print("\nVALID LOOP\n")
        # valid_loss = valid(epoch, model, valid_loader, CFG.device)
        
        # print(f"\nThe valid loss for the current epoch is {valid_loss}\n")
        # torch.cuda.empty_cache()
        # gc.collect()
        # if valid_loss < best_loss:
        #     best_loss = valid_loss
        #     if CFG.save_dir is not None:
        #       if not os.path.exists(CFG.save_dir):
        #         os.mkdir(CFG.save_dir)
        #       save_path = os.path.join(CFG.save_dir, f"{CFG.save_model_name}_fold_{fold}.pth")
        #     else:
        #       save_path = f"{CFG.save_model_name}_fold_{fold}.pth"
        #     torch.save(model.state_dict(), save_path)
            
    del model, optimizer, scheduler, train_loader, valid_loader, train_df, valid_df;
    gc.collect()

In [None]:
for fold in range(0,5):
    print("-----"*20)
    print(f"\nRUNNING FOLD {fold}\n")
    print("-----"*20)
    main(fold)
    gc.collect()

----------------------------------------------------------------------------------------------------

RUNNING FOLD 0

----------------------------------------------------------------------------------------------------


Downloading:   0%|          | 0.00/874M [00:00<?, ?B/s]


TRAIN LOOP

Epoch: [1][20/196] Elapsed 2m 2s (remain 17m 54s) Loss: 0.2197(0.5643) Grad: 6.4194  LR: 0.00096587  
Epoch: [1][40/196] Elapsed 4m 5s (remain 15m 59s) Loss: 0.0952(0.3805) Grad: 3.4756  LR: 0.00093175  
Epoch: [1][60/196] Elapsed 6m 7s (remain 13m 53s) Loss: 0.0882(0.2973) Grad: 1.7023  LR: 0.00089762  
Epoch: [1][80/196] Elapsed 8m 15s (remain 11m 57s) Loss: 0.1489(0.2547) Grad: 2.5205  LR: 0.00086349  
Epoch: [1][100/196] Elapsed 10m 23s (remain 9m 58s) Loss: 0.1246(0.2273) Grad: 1.8776  LR: 0.00082937  
Epoch: [1][120/196] Elapsed 12m 28s (remain 7m 53s) Loss: 0.0842(0.2093) Grad: 3.1683  LR: 0.00079524  
Epoch: [1][140/196] Elapsed 14m 33s (remain 5m 49s) Loss: 0.1000(0.1963) Grad: 2.4861  LR: 0.00076112  
Epoch: [1][160/196] Elapsed 16m 36s (remain 3m 44s) Loss: 0.0910(0.1867) Grad: 1.8027  LR: 0.00072699  
Epoch: [1][180/196] Elapsed 18m 43s (remain 1m 39s) Loss: 0.1283(0.1770) Grad: 1.9926  LR: 0.00069286  
Epoch: [1][196/196] Elapsed 20m 19s (remain 0m 0s) Loss: 0

In [None]:
!cp -r deberta-v3-large/ /content/drive/MyDrive/Kaggle/FeedbackPrize3/