In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Thu Sep 15 05:32:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0    42W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    debug2 = False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = True
    freezing = True
    clean_content = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

if CFG.debug2:
    CFG.epochs = 3
    CFG.trn_fold = [0]

In [6]:
DIR = '/content/drive/MyDrive/Competitions/Signate/MUFJ'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'submission')
#OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'model')
OUTPUT_MODEL_DIR = DIR + '/output/model/EXP21/'
if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    return f1_score(y_true, (y_pred>thresh).astype(int))


def get_logger(filename=OUTPUT_MODEL_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(CFG.seed)

<torch._C.Generator at 0x7ff00f150ed0>

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
train = pd.read_csv(os.path.join(INPUT_DIR,'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR,'test.csv'))
sub = pd.read_csv(os.path.join(INPUT_DIR,'sample_submit.csv'),header=None)
sub.columns = ['id','state']

In [10]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub('',text)

def remove_html(text):
    html=re.compile(r"<[^>]*?>")
    return html.sub('',text)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_URL(text)
        text = remove_html(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        #改行削除
        #text = text.replace("\n","")
        clean_texts.append(text)
    return clean_texts

def get_goal_values(df):
  df["goal"].replace("100000+","100000-100000",inplace=True)
  _df = df["goal"].str.split('-').apply(pd.Series).astype(float)
  _df.columns = ["goal_max","goal_min"]
  df["goal_max"] = _df["goal_max"].astype(str)
  df["goal_min"] = _df["goal_min"].astype(str)
  df["goal_median"] = _df[["goal_max","goal_min"]].median(axis=1)
  df["goal_median"] = df["goal_median"].astype(int)
  return df

if CFG.clean_content==True:
    train['html_content'] = train['html_content'].map(lambda x: str(x))
    train['html_content'] = train['html_content'].apply(html.unescape)
    p = re.compile(r"<[^>]*?>|&amp;|[/'’\"”]")
    train['html_content'] = train['html_content'].map(lambda x: p.sub("", x))
    train['html_content'] = train['html_content'].map(lambda x: x.lstrip())
    train['html_content'] = train['html_content'].fillna('missing')

train = get_goal_values(train)
train['inputs'] = train.goal_median.astype(str) + ' [SEP] ' + train.duration.astype(str) + ' [SEP] ' + train.country + ' [SEP] ' + train.category1 + ' [SEP] ' + train.category2 + ' [SEP] ' + train.html_content

In [11]:
train

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_max,goal_min,goal_median,inputs
0,train_00000,20001-21000,US,45,art,mixed media,"http:dummy.comIn its first year, The Shillitos...",1,20001.0,21000.0,20500,20500 [SEP] 45 [SEP] US [SEP] art [SEP] mixed ...
1,train_00001,19001-20000,US,59,food,restaurants,Cultural Pretzel Sports Bar is a place where p...,0,19001.0,20000.0,19500,19500 [SEP] 59 [SEP] US [SEP] food [SEP] resta...
2,train_00002,2001-3000,US,38,art,performance art,"I want to perform this piece guerilla style, o...",0,2001.0,3000.0,2500,2500 [SEP] 38 [SEP] US [SEP] art [SEP] perform...
3,train_00003,1001-2000,US,30,art,mixed media,"Canyon de Chelley, Dine (Navajo) Reservation, ...",1,1001.0,2000.0,1500,1500 [SEP] 30 [SEP] US [SEP] art [SEP] mixed m...
4,train_00004,1001-2000,US,29,film & video,webseries,"The story of the show, both on and off screen,...",1,1001.0,2000.0,1500,1500 [SEP] 29 [SEP] US [SEP] film & video [SEP...
...,...,...,...,...,...,...,...,...,...,...,...,...
9786,train_09786,1-1000,US,15,music,electronic music,So the story behind this is that Ive been maki...,0,1.0,1000.0,500,500 [SEP] 15 [SEP] US [SEP] music [SEP] electr...
9787,train_09787,3001-4000,CA,30,fashion,ready-to-wear,THE HIGH CLOTHINGMy vision is to create high q...,0,3001.0,4000.0,3500,3500 [SEP] 30 [SEP] CA [SEP] fashion [SEP] rea...
9788,train_09788,100000-100000,GB,30,technology,software,We dont think anybody looks forward to filling...,0,100000.0,100000.0,100000,100000 [SEP] 30 [SEP] GB [SEP] technology [SEP...
9789,train_09789,79001-80000,US,35,technology,gadgets,What is Droplet?\nDroplet is a wireless button...,1,79001.0,80000.0,79500,79500 [SEP] 35 [SEP] US [SEP] technology [SEP]...


In [12]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.state)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [13]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['inputs'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 6 # cls & sep & sep
#LOGGER.info(f"max_len: {CFG.max_len}")

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['inputs'].values
        self.labels = df['state'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        #self.pool = MeanPooling()
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [17]:
class AWP:
    def __init__(
        self,
        model,
        optimizer,
        criterion,
        adv_param="weight",
        adv_lr=1e-4,
        adv_eps=1e-2,
        start_epoch=0,
        adv_step=1,
        device="cpu",
    ):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.device = device

    def attack_backward(self, inputs, label):
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            self.save()
            self.attack_step() # モデルを近傍の悪い方へ改変
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss
        
        

    def attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    #if not epoch < CFG.nth_awp_start_epoch:
    #    LOGGER.info(f'AWP training with epoch {epoch+1}')
    model.train()
    #awp = AWP(model=model,
    #          optimizer=optimizer,
    #          criterion=criterion,
    #          adv_eps=0.01, 
    #          device=device)
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    #tot_loss = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        #if CFG.nth_awp_start_epoch <= epoch:
        #      loss = awp.attack_backward(inputs, labels)
        #      scaler.scale(loss).backward()
        #      awp.restore()
        #tot_loss += loss.item()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg
    #model.train()
    #return tot_loss/(step+1)


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['state'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [20]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['state'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.22.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/458] Elapsed 0m 2s (remain 19m 8s) Loss: 0.6573(0.6573) Grad: 169677.0469  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 9s (remain 4m 4s) Loss: 0.6308(0.6440) Grad: 40969.8047  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 2m 15s (remain 2m 53s) Loss: 0.3900(0.6167) Grad: 23699.0684  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 3m 22s (remain 1m 45s) Loss: 0.5316(0.5896) Grad: 96095.6250  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 4m 28s (remain 0m 38s) Loss: 0.5468(0.5749) Grad: 31599.9746  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 5m 6s (remain 0m 0s) Loss: 0.6925(0.5656) Grad: 101732.7656  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 11s) Loss: 0.4907(0.4907) 


Epoch 1 - avg_train_loss: 0.5656  avg_val_loss: 0.5984  time: 333s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5656  avg_val_loss: 0.5984  time: 333s
Epoch 1 - Score: 0.7608
INFO:__main__:Epoch 1 - Score: 0.7608
Epoch 1 - Save Best Score: 0.7608 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7608 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.6720(0.5984) 
Epoch: [2][0/458] Elapsed 0m 1s (remain 9m 7s) Loss: 0.4288(0.4288) Grad: 396539.3438  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 8s (remain 4m 0s) Loss: 0.3709(0.4361) Grad: 80739.0312  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 2m 14s (remain 2m 52s) Loss: 0.3444(0.4156) Grad: 60957.7227  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 3m 21s (remain 1m 44s) Loss: 0.2358(0.4072) Grad: 63545.6875  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 4m 27s (remain 0m 38s) Loss: 0.4177(0.4073) Grad: 56432.6367  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.6131(0.4063) Grad: 100983.8750  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 11s) Loss: 0.4337(0.4337) 


Epoch 2 - avg_train_loss: 0.4063  avg_val_loss: 0.4377  time: 332s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4063  avg_val_loss: 0.4377  time: 332s
Epoch 2 - Score: 0.7911
INFO:__main__:Epoch 2 - Score: 0.7911
Epoch 2 - Save Best Score: 0.7911 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7911 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.5965(0.4377) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 8m 58s) Loss: 0.4162(0.4162) Grad: inf  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 7s (remain 3m 59s) Loss: 0.1596(0.3259) Grad: 44991.8711  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 2m 14s (remain 2m 51s) Loss: 0.1919(0.3110) Grad: 78931.9062  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.0792(0.3099) Grad: 24117.0078  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 4m 27s (remain 0m 37s) Loss: 0.1384(0.3039) Grad: 59220.4219  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.6416(0.2993) Grad: 69694.6094  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 8s) Loss: 0.3960(0.3960) 


Epoch 3 - avg_train_loss: 0.2993  avg_val_loss: 0.4479  time: 331s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2993  avg_val_loss: 0.4479  time: 331s
Epoch 3 - Score: 0.7987
INFO:__main__:Epoch 3 - Score: 0.7987
Epoch 3 - Save Best Score: 0.7987 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7987 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.6474(0.4479) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 8m 55s) Loss: 0.1651(0.1651) Grad: 323343.6250  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 7s (remain 3m 59s) Loss: 0.0766(0.1987) Grad: 93782.8047  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 2m 14s (remain 2m 51s) Loss: 0.1534(0.1967) Grad: 73292.6562  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.1084(0.1945) Grad: 99828.2891  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 4m 27s (remain 0m 38s) Loss: 0.1605(0.1937) Grad: 100499.7969  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.4019(0.1938) Grad: 125410.1953  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 14s) Loss: 0.4489(0.4489) 


Epoch 4 - avg_train_loss: 0.1938  avg_val_loss: 0.4824  time: 332s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1938  avg_val_loss: 0.4824  time: 332s
Epoch 4 - Score: 0.7948
INFO:__main__:Epoch 4 - Score: 0.7948


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.7348(0.4824) 


Score: 0.7987
INFO:__main__:Score: 0.7987
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.22.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v

Epoch: [1][0/458] Elapsed 0m 1s (remain 8m 15s) Loss: 1.0017(1.0017) Grad: inf  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 7s (remain 3m 58s) Loss: 0.3922(0.6421) Grad: 33988.6406  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 2m 13s (remain 2m 51s) Loss: 0.5850(0.6034) Grad: 83063.1328  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.3123(0.5909) Grad: 40023.3203  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 4m 26s (remain 0m 37s) Loss: 0.6796(0.5759) Grad: 102541.1562  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 5m 4s (remain 0m 0s) Loss: 0.3997(0.5713) Grad: 21613.6348  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 6s) Loss: 0.4530(0.4530) 


Epoch 1 - avg_train_loss: 0.5713  avg_val_loss: 0.4723  time: 331s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5713  avg_val_loss: 0.4723  time: 331s
Epoch 1 - Score: 0.7502
INFO:__main__:Epoch 1 - Score: 0.7502
Epoch 1 - Save Best Score: 0.7502 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7502 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.4514(0.4723) 
Epoch: [2][0/458] Elapsed 0m 1s (remain 8m 16s) Loss: 0.3423(0.3423) Grad: 168927.0625  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 7s (remain 3m 58s) Loss: 0.3691(0.4206) Grad: 31367.8828  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 2m 13s (remain 2m 51s) Loss: 0.4162(0.4342) Grad: 38044.8125  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.5409(0.4344) Grad: 69104.8359  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 4m 26s (remain 0m 37s) Loss: 0.3376(0.4266) Grad: 39911.4648  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 5m 4s (remain 0m 0s) Loss: 0.3330(0.4240) Grad: 56903.5977  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 16s) Loss: 0.5066(0.5066) 


Epoch 2 - avg_train_loss: 0.4240  avg_val_loss: 0.4677  time: 331s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4240  avg_val_loss: 0.4677  time: 331s
Epoch 2 - Score: 0.8085
INFO:__main__:Epoch 2 - Score: 0.8085
Epoch 2 - Save Best Score: 0.8085 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.8085 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.6174(0.4677) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 8m 54s) Loss: 0.3301(0.3301) Grad: inf  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 7s (remain 3m 59s) Loss: 0.1212(0.3059) Grad: 95876.9062  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 2m 13s (remain 2m 51s) Loss: 0.3672(0.3037) Grad: 183662.3750  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.2966(0.2931) Grad: 160161.1875  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 4m 26s (remain 0m 37s) Loss: 0.3152(0.2790) Grad: 197982.4219  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 5m 4s (remain 0m 0s) Loss: 0.5343(0.2766) Grad: 347487.8438  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 13s) Loss: 0.5352(0.5352) 


Epoch 3 - avg_train_loss: 0.2766  avg_val_loss: 0.4537  time: 331s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2766  avg_val_loss: 0.4537  time: 331s
Epoch 3 - Score: 0.8045
INFO:__main__:Epoch 3 - Score: 0.8045


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.3834(0.4537) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 8m 29s) Loss: 0.0496(0.0496) Grad: 215977.7812  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 7s (remain 3m 59s) Loss: 0.1931(0.2197) Grad: 309369.5625  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 2m 14s (remain 2m 51s) Loss: 0.2413(0.2094) Grad: 218032.8281  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.0834(0.2040) Grad: 63619.0156  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 4m 27s (remain 0m 38s) Loss: 0.0563(0.2012) Grad: 76167.1016  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.5009(0.1993) Grad: 90341.3906  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 5s) Loss: 0.5710(0.5710) 


Epoch 4 - avg_train_loss: 0.1993  avg_val_loss: 0.4713  time: 332s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1993  avg_val_loss: 0.4713  time: 332s
Epoch 4 - Score: 0.8055
INFO:__main__:Epoch 4 - Score: 0.8055


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.4169(0.4713) 


Score: 0.8085
INFO:__main__:Score: 0.8085
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.22.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v

Epoch: [1][0/458] Elapsed 0m 1s (remain 8m 46s) Loss: 0.6499(0.6499) Grad: 195783.0781  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 7s (remain 3m 59s) Loss: 0.5673(0.6440) Grad: 33576.0820  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 2m 14s (remain 2m 51s) Loss: 0.3423(0.6103) Grad: 10809.3701  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.7294(0.5837) Grad: 42733.2461  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 4m 27s (remain 0m 37s) Loss: 0.5022(0.5745) Grad: 25594.4961  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.4886(0.5617) Grad: 23760.6699  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 15s) Loss: 0.6371(0.6371) 


Epoch 1 - avg_train_loss: 0.5617  avg_val_loss: 0.5077  time: 332s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5617  avg_val_loss: 0.5077  time: 332s
Epoch 1 - Score: 0.7773
INFO:__main__:Epoch 1 - Score: 0.7773
Epoch 1 - Save Best Score: 0.7773 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7773 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.4244(0.5077) 
Epoch: [2][0/458] Elapsed 0m 1s (remain 8m 55s) Loss: 0.3149(0.3149) Grad: inf  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 7s (remain 4m 0s) Loss: 0.2684(0.3836) Grad: 71954.2500  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 2m 14s (remain 2m 51s) Loss: 0.4814(0.3949) Grad: 74330.7188  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 3m 21s (remain 1m 44s) Loss: 0.3734(0.3918) Grad: 29405.5430  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 4m 27s (remain 0m 38s) Loss: 0.3328(0.3839) Grad: 38736.5820  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.3236(0.3840) Grad: 20348.8594  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 11s) Loss: 0.4483(0.4483) 


Epoch 2 - avg_train_loss: 0.3840  avg_val_loss: 0.4611  time: 332s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3840  avg_val_loss: 0.4611  time: 332s
Epoch 2 - Score: 0.7764
INFO:__main__:Epoch 2 - Score: 0.7764


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.2037(0.4611) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 9m 7s) Loss: 0.1639(0.1639) Grad: inf  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 7s (remain 3m 59s) Loss: 0.3973(0.2980) Grad: 84281.4609  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 2m 14s (remain 2m 51s) Loss: 0.0705(0.2936) Grad: 54911.3398  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.3204(0.2901) Grad: 149446.4531  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 4m 27s (remain 0m 38s) Loss: 0.1876(0.2853) Grad: 200413.1875  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.2541(0.2811) Grad: 121664.0938  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 9s) Loss: 0.4765(0.4765) 


Epoch 3 - avg_train_loss: 0.2811  avg_val_loss: 0.5008  time: 332s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2811  avg_val_loss: 0.5008  time: 332s
Epoch 3 - Score: 0.8036
INFO:__main__:Epoch 3 - Score: 0.8036
Epoch 3 - Save Best Score: 0.8036 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8036 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.4207(0.5008) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 9m 9s) Loss: 0.2022(0.2022) Grad: 352247.6875  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 7s (remain 3m 59s) Loss: 0.2100(0.2078) Grad: 222714.8281  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 2m 14s (remain 2m 51s) Loss: 0.2128(0.2195) Grad: 116179.0391  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 3m 20s (remain 1m 44s) Loss: 0.0667(0.2249) Grad: 41725.5234  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 4m 27s (remain 0m 38s) Loss: 0.2250(0.2227) Grad: 165062.0625  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 5m 5s (remain 0m 0s) Loss: 0.1024(0.2213) Grad: 96645.0781  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 12s) Loss: 0.4485(0.4485) 


Epoch 4 - avg_train_loss: 0.2213  avg_val_loss: 0.5020  time: 332s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2213  avg_val_loss: 0.5020  time: 332s
Epoch 4 - Score: 0.7929
INFO:__main__:Epoch 4 - Score: 0.7929


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.3485(0.5020) 


Score: 0.8036
INFO:__main__:Score: 0.8036
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.22.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v

Epoch: [1][0/459] Elapsed 0m 1s (remain 8m 34s) Loss: 0.8723(0.8723) Grad: inf  LR: 0.00002000  
Epoch: [1][100/459] Elapsed 1m 7s (remain 3m 59s) Loss: 0.6418(0.6367) Grad: 29832.8789  LR: 0.00001985  
Epoch: [1][200/459] Elapsed 2m 13s (remain 2m 51s) Loss: 0.6591(0.5996) Grad: 56452.5938  LR: 0.00001941  
Epoch: [1][300/459] Elapsed 3m 20s (remain 1m 45s) Loss: 0.5200(0.5825) Grad: 28776.4941  LR: 0.00001870  
Epoch: [1][400/459] Elapsed 4m 26s (remain 0m 38s) Loss: 0.5090(0.5633) Grad: 28020.2480  LR: 0.00001774  
Epoch: [1][458/459] Elapsed 5m 5s (remain 0m 0s) Loss: 0.3253(0.5584) Grad: 27315.6230  LR: 0.00001707  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 15s) Loss: 0.4348(0.4348) 


Epoch 1 - avg_train_loss: 0.5584  avg_val_loss: 0.4515  time: 332s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5584  avg_val_loss: 0.4515  time: 332s
Epoch 1 - Score: 0.7933
INFO:__main__:Epoch 1 - Score: 0.7933
Epoch 1 - Save Best Score: 0.7933 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7933 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.4114(0.4515) 
Epoch: [2][0/459] Elapsed 0m 1s (remain 9m 14s) Loss: 0.4591(0.4591) Grad: 210883.2500  LR: 0.00001706  
Epoch: [2][100/459] Elapsed 1m 7s (remain 3m 59s) Loss: 0.4513(0.3827) Grad: 80603.7188  LR: 0.00001575  
Epoch: [2][200/459] Elapsed 2m 14s (remain 2m 52s) Loss: 1.1233(0.3986) Grad: 203492.5312  LR: 0.00001427  
Epoch: [2][300/459] Elapsed 3m 20s (remain 1m 45s) Loss: 0.2948(0.4023) Grad: 83219.9062  LR: 0.00001267  
Epoch: [2][400/459] Elapsed 4m 26s (remain 0m 38s) Loss: 0.6591(0.4080) Grad: 64826.6133  LR: 0.00001099  
Epoch: [2][458/459] Elapsed 5m 5s (remain 0m 0s) Loss: 0.2838(0.4048) Grad: 77169.2422  LR: 0.00001000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 12s) Loss: 0.3603(0.3603) 


Epoch 2 - avg_train_loss: 0.4048  avg_val_loss: 0.4023  time: 332s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4048  avg_val_loss: 0.4023  time: 332s
Epoch 2 - Score: 0.8111
INFO:__main__:Epoch 2 - Score: 0.8111
Epoch 2 - Save Best Score: 0.8111 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.8111 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.3621(0.4023) 
Epoch: [3][0/459] Elapsed 0m 1s (remain 8m 48s) Loss: 0.2315(0.2315) Grad: 260940.9062  LR: 0.00000998  
Epoch: [3][100/459] Elapsed 1m 7s (remain 3m 59s) Loss: 0.2790(0.2860) Grad: 116433.9766  LR: 0.00000828  
Epoch: [3][200/459] Elapsed 2m 13s (remain 2m 51s) Loss: 0.2592(0.2720) Grad: 86943.9531  LR: 0.00000663  
Epoch: [3][300/459] Elapsed 3m 20s (remain 1m 45s) Loss: 0.4075(0.2659) Grad: 199688.5469  LR: 0.00000507  
Epoch: [3][400/459] Elapsed 4m 26s (remain 0m 38s) Loss: 0.2695(0.2657) Grad: 102898.4297  LR: 0.00000366  
Epoch: [3][458/459] Elapsed 5m 5s (remain 0m 0s) Loss: 0.1922(0.2644) Grad: 111710.4062  LR: 0.00000293  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 10s) Loss: 0.4005(0.4005) 


Epoch 3 - avg_train_loss: 0.2644  avg_val_loss: 0.4301  time: 332s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2644  avg_val_loss: 0.4301  time: 332s
Epoch 3 - Score: 0.8205
INFO:__main__:Epoch 3 - Score: 0.8205
Epoch 3 - Save Best Score: 0.8205 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8205 Model


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.3739(0.4301) 
Epoch: [4][0/459] Elapsed 0m 1s (remain 8m 42s) Loss: 0.2867(0.2867) Grad: 688704.9375  LR: 0.00000292  
Epoch: [4][100/459] Elapsed 1m 7s (remain 3m 59s) Loss: 0.1239(0.2032) Grad: 87385.5469  LR: 0.00000182  
Epoch: [4][200/459] Elapsed 2m 13s (remain 2m 51s) Loss: 0.1245(0.1948) Grad: 171430.0312  LR: 0.00000096  
Epoch: [4][300/459] Elapsed 3m 20s (remain 1m 45s) Loss: 0.1996(0.1902) Grad: 85315.9609  LR: 0.00000036  
Epoch: [4][400/459] Elapsed 4m 26s (remain 0m 38s) Loss: 0.1133(0.1865) Grad: 151616.4531  LR: 0.00000005  
Epoch: [4][458/459] Elapsed 5m 4s (remain 0m 0s) Loss: 0.3781(0.1903) Grad: 312059.5938  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 9s) Loss: 0.4346(0.4346) 


Epoch 4 - avg_train_loss: 0.1903  avg_val_loss: 0.4853  time: 331s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1903  avg_val_loss: 0.4853  time: 331s
Epoch 4 - Score: 0.8120
INFO:__main__:Epoch 4 - Score: 0.8120


EVAL: [76/77] Elapsed 0m 26s (remain 0m 0s) Loss: 0.4331(0.4853) 


Score: 0.8205
INFO:__main__:Score: 0.8205
Score: 0.8079
INFO:__main__:Score: 0.8079


In [21]:
A = pd.read_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
A.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_max,goal_min,goal_median,inputs,kfold,pred
0,train_00002,2001-3000,US,38,art,performance art,"I want to perform this piece guerilla style, o...",0,2001.0,3000.0,2500,2500 [SEP] 38 [SEP] US [SEP] art [SEP] perform...,0,0.680858
1,train_00006,2001-3000,CA,30,music,classical music,The Crimson String Quartet (CSQ) is returning ...,1,2001.0,3000.0,2500,2500 [SEP] 30 [SEP] CA [SEP] music [SEP] class...,0,0.905334
2,train_00016,3001-4000,GB,30,journalism,web,I am interested in setting up a small news web...,0,3001.0,4000.0,3500,3500 [SEP] 30 [SEP] GB [SEP] journalism [SEP] ...,0,0.004502
3,train_00017,11001-12000,US,30,food,drinks,Why Schenk-Atwood?Our neighborhood has just ab...,1,11001.0,12000.0,11500,11500 [SEP] 30 [SEP] US [SEP] food [SEP] drink...,0,0.141276
4,train_00018,12001-13000,US,28,journalism,web,Mega Visions app has been approved on all majo...,1,12001.0,13000.0,12500,12500 [SEP] 28 [SEP] US [SEP] journalism [SEP]...,0,0.216301
