In [None]:
! nvidia-smi

Sat May 28 17:42:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os

class Config:
    AUTHOR = "kuruton"

    NAME = "USP-" + "Exp105-deberta-large-kf-mse"
    MODEL_PATH = "microsoft/deberta-large"
    DATASET_PATH = [
        "yasufuminakama/cpc-data"
    ]

    COMPETITION = "us-patent-phrase-to-phrase-matching"
    COLAB_PATH = "/content/drive/Shareddrives/USPatent" 
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)

    api_path = "/content/drive/MyDrive/kaggle.json"

    seed = 42
    num_fold = 4
    trn_fold = [0, 1, 2, 3]
    batch_size = 32
    n_epochs = 8
    max_len = 256

    weight_decay = 2e-5
    beta = (0.9, 0.98)
    lr = 2e-5
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1
    num_eval = 3

    upload_from_colab = True

In [None]:
# ========================================
# Library
# ========================================
import os
import gc
import re
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy 
import itertools
from pathlib import Path
from glob import glob
from tqdm.auto import tqdm
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold
)
from sklearn.metrics import (
    accuracy_score, 
    f1_score,
    roc_auc_score,
)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler

from google.colab import drive
if not os.path.isdir('/content/drive'):
    drive.mount('/content/drive') 

Mounted at /content/drive


In [None]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if cfg.COLAB:
        print('This environment is Google Colab')

        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 

        # pip install
        # ! pip install -q torch==1.10.0
        ! pip install -q transformers
        ! pip install -q sentencepiece

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.DRIVE_PATH
        cfg.EXP = (cfg.NAME if cfg.NAME is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.DATASET_PATH:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.COMPETITION}'
        cfg.EXP = cfg.NAME
        cfg.OUTPUT_EXP = cfg.NAME
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [None]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

# collatte
def collatte(inputs, labels=None):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    if not labels is None:
        inputs = {
            "input_ids" : inputs['input_ids'][:,:mask_len],
            "attention_mask" : inputs['attention_mask'][:,:mask_len],
        }
        labels =  labels[:,:mask_len]
        return inputs, labels, mask_len
                
    else:
        inputs = {
            "input_ids" : inputs['input_ids'][:,:mask_len],
            "attention_mask" : inputs['attention_mask'][:,:mask_len],
        }
        return inputs, mask_len

# =====================
# CPC Data
# =====================
def get_cpc_texts(cfg):
    contexts = []
    pattern = '[A-Z]\d+'
    for file_name in os.listdir(os.path.join(cfg.DATASET, 'cpc-data/CPCSchemeXML202105')):
        result = re.findall(pattern, file_name)
        if result:
            contexts.append(result)
    contexts = sorted(set(sum(contexts, [])))
    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(os.path.join(cfg.DATASET, f'cpc-data/CPCTitleList202202/cpc-section-{cpc}_20220201.txt')) as f:
            s = f.read()
        pattern = f'{cpc}\t\t.+'
        result = re.findall(pattern, s)
        cpc_result = result[0].lstrip(pattern)
        for context in [c for c in contexts if c[0] == cpc]:
            pattern = f'{context}\t\t.+'
            result = re.findall(pattern, s)
            results[context] = cpc_result + ". " + result[0].lstrip(pattern)
    for key, val in results.items():
        results[key] = val.lower()
    return results


def get_optimizer_params(model, lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    layer_num = len(model.backbone.encoder.layer)
    layer1_num = layer_num // 3
    layer2_num = layer_num // 3
    layer3_num = layer_num - (layer1_num + layer2_num)
    group1 = ['layer.' + str(i) + '.' for i in range(layer1_num)]
    group2 = ['layer.' + str(i) + '.' for i in range(layer1_num, layer1_num + layer2_num)]
    group3 = ['layer.' + str(i) + '.' for i in range(layer1_num + layer2_num, layer1_num + layer2_num + layer3_num)]
    group_all = group1 + group2 + group3
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.backbone.named_parameters() if (not any(nd in n for nd in no_decay)) and (not any(nd in n for nd in group2)) and (not any(nd in n for nd in group3))] ,
          'lr': lr * 0.25, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay) and (not any(nd in n for nd in group2)) and (not any(nd in n for nd in group3))],
          'lr': lr * 0.25, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "backbone" not in n],
          'lr': lr * 0.2, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in group2) and (not any(nd in n for nd in no_decay))],
        'lr': lr * 0.5, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in group2) and any(nd in n for nd in no_decay)],
        'lr': lr * 0.5, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in group3) and (not any(nd in n for nd in no_decay))],
        'lr': lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in group3) and any(nd in n for nd in no_decay)],
        'lr': lr, 'weight_decay': 0.0}
    ]
    return optimizer_grouped_parameters

In [None]:
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df['score'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.texts[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label
    
    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            return_offsets_mapping=False
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

class CustomModel(nn.Module):
    def __init__(self, cfg, criterion):
        super().__init__()
        self.cfg = cfg
        self.criterion = criterion
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.attention_probs_dropout_prob = 0.0
        self.config.hidden_dropout_prob = 0.0
        self.backbone = AutoModel.from_pretrained(
            cfg.MODEL_PATH, 
            config=self.config
        )
        self.fc = nn.Sequential(
            nn.Linear(self.config.hidden_size, 1),
        )

    def forward(self, inputs, labels=None):
        outputs = self.backbone(**inputs)["last_hidden_state"]
        outputs = outputs[:, 0, :]
        logits = self.fc(outputs).flatten()
        loss = 0
        if labels is not None:
            loss = self.criterion(logits, labels)
        return logits, loss


class CustomModelDropout(nn.Module):
    def __init__(self, cfg, criterion):
        super().__init__()
        self.cfg = cfg
        self.criterion = criterion
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.backbone = AutoModel.from_pretrained(
            cfg.MODEL_PATH, 
            config=self.config
        )
        self.fc = nn.Sequential(
            nn.Linear(self.config.hidden_size, 1),
        )

        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)

    def forward(self, inputs, labels=None):
        outputs = self.backbone(**inputs)["last_hidden_state"]
        outputs = outputs[:, 0, :]
        logits1 = self.fc(self.dropout1(outputs)).flatten()
        logits2 = self.fc(self.dropout2(outputs)).flatten()
        logits4 = self.fc(self.dropout3(outputs)).flatten()
        logits3 = self.fc(self.dropout4(outputs)).flatten()
        logits5 = self.fc(self.dropout5(outputs)).flatten()
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        
        loss = 0
        if labels is not None:
            loss1 = self.criterion(logits1, labels,)
            loss2 = self.criterion(logits2, labels)
            loss3 = self.criterion(logits3, labels)
            loss4 = self.criterion(logits4, labels)
            loss5 = self.criterion(logits5, labels)
            loss = (loss1 + loss2 + loss3 + loss4 + loss5) / 5

        return logits, loss

In [None]:
# evaluating
def evaluating(cfg, valid_loader, model, criterion, valid_df, fold, best_val_preds, best_val_score):
    # evaluating
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast():
                    output, loss = model(inputs, labels)
                output = output.detach().cpu().numpy()
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))
                pbar.set_postfix({
                    'val_loss': loss.item()
                })

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    corr_score = np.corrcoef(val_preds, valid_df['score'])[0, 1]

    val_log = {
        'val_loss': val_loss,
        'score': corr_score,
    }
    display(val_log)

    if best_val_score < corr_score:
        print("save model weight")
        best_val_preds = val_preds
        best_val_score = corr_score
        torch.save(
            model.state_dict(), 
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
    return best_val_preds, best_val_score


def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros(len(train), dtype=np.float32)
    for fold in cfg.trn_fold:
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # reverse anchor_target
        # rev_train_df = train_df.copy()
        # rev_train_df[['anchor', 'target']] = rev_train_df[['target', 'anchor']].to_numpy()
        # train_df = pd.concat([train_df, rev_train_df]).reset_index(drop=True)

        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size, 
            shuffle=True,
            pin_memory=True,
            drop_last=True
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )
        # model-training
        # criterion = nn.BCEWithLogitsLoss(reduction="mean")
        criterion = nn.MSELoss()
        best_val_preds = None
        best_val_score = -1

        # model
        model = CustomModel(cfg, criterion)
        model = model.to(cfg.device)


        # optimizer, scheduler
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
                'weight_decay': cfg.weight_decay
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
                'weight_decay': 0.0
            }
        ]

        optimizer_grouped_parameters = get_optimizer_params(
            model,
            lr=cfg.lr,
            weight_decay=cfg.weight_decay
        )

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=cfg.lr,
            betas=cfg.beta,
            weight_decay=cfg.weight_decay,
        )
        num_train_optimization_steps = int(
            len(train_loader) * cfg.n_epochs // cfg.gradient_accumulation_steps
        )
        num_warmup_steps = int(num_train_optimization_steps * cfg.num_warmup_steps_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps
        )
        num_eval_step = len(train_loader) // cfg.num_eval + cfg.num_eval
        
        for epoch in range(cfg.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch} ============== #")
            model.train() 
            val_losses_batch = []
            scaler = GradScaler()
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs, max_len = collatte(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)

                    optimizer.zero_grad()
                    with autocast():
                        output, loss = model(inputs, labels)
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()
                    if cfg.clip_grad_norm is not None:
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(), 
                            cfg.clip_grad_norm
                        )
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        scheduler.step()

                    if step % num_eval_step == 0 and step != 0:
                        tmp_num_eval = step // num_eval_step
                        print(f'fold: {fold}, epoch: {epoch}, num_eval: {tmp_num_eval}')
                        best_val_preds, best_val_score = evaluating(
                            cfg, valid_loader,
                            model,
                            criterion,
                            valid_df,
                            fold,
                            best_val_preds,
                            best_val_score
                        )
                        model.train() 
            
            print(f'fold: {fold}, epoch: {epoch}, last')
            best_val_preds, best_val_score = evaluating(
                cfg, valid_loader,
                model,
                criterion,
                valid_df,
                fold,
                best_val_preds,
                best_val_score
            )

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        del model; gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    corr_score = np.corrcoef(oof_pred, train['score'])[0, 1]
    print('CV:', round(corr_score, 5))
    return corr_score

In [8]:
# =====================
# Main
# =====================
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
print(cfg.tokenizer.sep_token)

cpc_texts = get_cpc_texts(cfg)
torch.save(cpc_texts, os.path.join(cfg.EXP_PREDS, "cpc_texts.pth"))
train['context_text'] = train['context'].map(cpc_texts)
train['text'] = train['anchor'] + cfg.tokenizer.sep_token + train['target'] + cfg.tokenizer.sep_token + train['context_text']

# cfg.folds = get_groupkfold(train, 'score', 'anchor', cfg.num_fold)
cfg.folds = get_kfold(train, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

score = training(cfg, train)
if cfg.upload_from_colab and cfg.COLAB:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

This environment is Google Colab
[K     |████████████████████████████████| 4.2 MB 15.0 MB/s 
[K     |████████████████████████████████| 86 kB 5.5 MB/s 
[K     |████████████████████████████████| 596 kB 62.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 56.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 14.8 MB/s 
[?25h

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

[SEP]


Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 0, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.7636506013340406, 'val_loss': 0.029529737235087877}

save model weight
fold: 0, epoch: 0, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.806304642710747, 'val_loss': 0.026253265114130387}

save model weight
fold: 0, epoch: 0, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8141405567623384, 'val_loss': 0.02737372180777679}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 1, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8256771245118589, 'val_loss': 0.021571432184442275}

save model weight
fold: 0, epoch: 1, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8246952477610761, 'val_loss': 0.021591467782650774}

fold: 0, epoch: 1, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8390390701282063, 'val_loss': 0.01981259909432824}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 2, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.839358934179264, 'val_loss': 0.019768935415600885}

save model weight
fold: 0, epoch: 2, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8394848819359829, 'val_loss': 0.02075128964136104}

save model weight
fold: 0, epoch: 2, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8461021405780061, 'val_loss': 0.019197558942899942}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 3, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8429780441797725, 'val_loss': 0.019685063907805733}

fold: 0, epoch: 3, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8430850965483158, 'val_loss': 0.01949971154720663}

fold: 0, epoch: 3, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.844281208329992, 'val_loss': 0.019559813175693174}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 4, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8433223744893494, 'val_loss': 0.019458476674415876}

fold: 0, epoch: 4, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8419721728636674, 'val_loss': 0.019925260072657155}

fold: 0, epoch: 4, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8451492487538244, 'val_loss': 0.019283137492768526}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 5, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8425551741243653, 'val_loss': 0.020027093518529857}

fold: 0, epoch: 5, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8411782323041036, 'val_loss': 0.020332137361938514}

fold: 0, epoch: 5, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8437793677211439, 'val_loss': 0.0197352760857761}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 6, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8443151636050594, 'val_loss': 0.019667964264131983}

fold: 0, epoch: 6, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.843953747017532, 'val_loss': 0.019877816032226513}

fold: 0, epoch: 6, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8429344994397813, 'val_loss': 0.019701136423499355}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 0, epoch: 7, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8436603436463003, 'val_loss': 0.01972797494951551}

fold: 0, epoch: 7, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.84358876016921, 'val_loss': 0.019656399278514956}

fold: 0, epoch: 7, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8434538971379273, 'val_loss': 0.019752126730315053}

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 0, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.7722832864794249, 'val_loss': 0.03665107625051859}

save model weight
fold: 1, epoch: 0, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.812002738056248, 'val_loss': 0.023012184186377502}

save model weight
fold: 1, epoch: 0, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8287245040132769, 'val_loss': 0.022637614331948285}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 1, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8348122328031992, 'val_loss': 0.021003084298203648}

save model weight
fold: 1, epoch: 1, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8312266509582129, 'val_loss': 0.020807084739396162}

fold: 1, epoch: 1, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8400132572548985, 'val_loss': 0.019627690832489126}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 2, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8415997631251929, 'val_loss': 0.019517995750506632}

save model weight
fold: 1, epoch: 2, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.841745600871051, 'val_loss': 0.019474721933182074}

save model weight
fold: 1, epoch: 2, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8431068231852881, 'val_loss': 0.01925160395000706}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 3, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8453860610122007, 'val_loss': 0.01922035273687473}

save model weight
fold: 1, epoch: 3, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8428648706169642, 'val_loss': 0.02039020560803572}

fold: 1, epoch: 3, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8459243396641212, 'val_loss': 0.02040088855883915}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 4, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8390851831474965, 'val_loss': 0.02014859066612203}

fold: 1, epoch: 4, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8424334825931373, 'val_loss': 0.019888359559142203}

fold: 1, epoch: 4, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8458559305239689, 'val_loss': 0.019665538239590847}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 5, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8420623234302799, 'val_loss': 0.019729985635303555}

fold: 1, epoch: 5, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8430233359686939, 'val_loss': 0.019346502491271556}

fold: 1, epoch: 5, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8421534681758457, 'val_loss': 0.019753718811607618}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 6, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8409199024057711, 'val_loss': 0.01974543323178587}

fold: 1, epoch: 6, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8392975910945725, 'val_loss': 0.020331583702236082}

fold: 1, epoch: 6, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8399784253872667, 'val_loss': 0.02000751159615352}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 1, epoch: 7, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8415223453004648, 'val_loss': 0.019872106306939617}

fold: 1, epoch: 7, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8414183127970293, 'val_loss': 0.01974940342233353}

fold: 1, epoch: 7, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8415936346021301, 'val_loss': 0.019806020617579835}

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 0, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.7616410968793464, 'val_loss': 0.02852276569985329}

save model weight
fold: 2, epoch: 0, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.7984938919100285, 'val_loss': 0.02584731700865094}

save model weight
fold: 2, epoch: 0, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8211799275273725, 'val_loss': 0.02270183480302018}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 1, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8248304254930394, 'val_loss': 0.0224247509397094}

save model weight
fold: 2, epoch: 1, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8307413384725995, 'val_loss': 0.02168423031465144}

save model weight
fold: 2, epoch: 1, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8320014299985526, 'val_loss': 0.020980922783083313}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 2, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8374235350840582, 'val_loss': 0.021050974248860863}

save model weight
fold: 2, epoch: 2, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8355692424385486, 'val_loss': 0.020579727563903267}

fold: 2, epoch: 2, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8411745506100529, 'val_loss': 0.020159693787017003}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 3, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8374853626350502, 'val_loss': 0.020456348496413028}

fold: 2, epoch: 3, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8391154396192888, 'val_loss': 0.020695169004022317}

fold: 2, epoch: 3, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8396891906887641, 'val_loss': 0.020594063249481407}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 4, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8376198379614285, 'val_loss': 0.021670121527770276}

fold: 2, epoch: 4, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8402013095240388, 'val_loss': 0.020308504748683367}

fold: 2, epoch: 4, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8412578544511632, 'val_loss': 0.020002496897206807}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 5, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8393381854874356, 'val_loss': 0.020313940177148444}

fold: 2, epoch: 5, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8387745431064838, 'val_loss': 0.02060421214735797}

fold: 2, epoch: 5, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8400651966936553, 'val_loss': 0.020306185093802512}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 6, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8402213116508809, 'val_loss': 0.020357925809689405}

fold: 2, epoch: 6, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8388018534747821, 'val_loss': 0.02048954629741918}

fold: 2, epoch: 6, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8400626803401358, 'val_loss': 0.020209969255691636}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 2, epoch: 7, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8393522614604796, 'val_loss': 0.020404009658060468}

fold: 2, epoch: 7, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8401937924628436, 'val_loss': 0.020315126609179758}

fold: 2, epoch: 7, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.839611099369179, 'val_loss': 0.020455667685924078}

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 0, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.7754137492396533, 'val_loss': 0.02650772458067796}

save model weight
fold: 3, epoch: 0, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8031370240962092, 'val_loss': 0.026845914604034855}

save model weight
fold: 3, epoch: 0, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8180909440332352, 'val_loss': 0.023527642632735535}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 1, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8335672499277395, 'val_loss': 0.02048401473946812}

save model weight
fold: 3, epoch: 1, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8384451788275027, 'val_loss': 0.019701266036999933}

save model weight
fold: 3, epoch: 1, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8407108906416659, 'val_loss': 0.019502079515528147}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 2, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8402818226077328, 'val_loss': 0.019822228136103674}

fold: 3, epoch: 2, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8423829608757291, 'val_loss': 0.019431769909358815}

save model weight
fold: 3, epoch: 2, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8491062131641522, 'val_loss': 0.018459444872121405}

save model weight


  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 3, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8420404441610231, 'val_loss': 0.01916094517601271}

fold: 3, epoch: 3, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8455395891448301, 'val_loss': 0.018826081037127507}

fold: 3, epoch: 3, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8469168230775467, 'val_loss': 0.018791786124952258}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 4, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8483059547772904, 'val_loss': 0.018686840028987625}

fold: 3, epoch: 4, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8455809124894859, 'val_loss': 0.019639143732117573}

fold: 3, epoch: 4, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.842603305957464, 'val_loss': 0.01978599793970402}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 5, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8447057952859331, 'val_loss': 0.019180002699933585}

fold: 3, epoch: 5, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8450244513286037, 'val_loss': 0.019263785339182642}

fold: 3, epoch: 5, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.845325613753695, 'val_loss': 0.018871199036269146}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 6, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8459354984487938, 'val_loss': 0.01906152582173972}

fold: 3, epoch: 6, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8456138058222492, 'val_loss': 0.019026680601902518}

fold: 3, epoch: 6, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8462119149723246, 'val_loss': 0.01934553919873607}



  0%|          | 0/854 [00:00<?, ?it/s]

fold: 3, epoch: 7, num_eval: 1


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8459726245483457, 'val_loss': 0.01920932164047192}

fold: 3, epoch: 7, num_eval: 2


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8459155553739844, 'val_loss': 0.01908272715196762}

fold: 3, epoch: 7, last


  0%|          | 0/285 [00:00<?, ?it/s]

{'score': 0.8459731006733066, 'val_loss': 0.01906155507025305}

CV: 0.84291
Starting upload for file model.tar


100%|██████████| 6.04G/6.04G [03:09<00:00, 34.2MB/s]


Upload successful: model.tar (6GB)
Starting upload for file fig.tar


100%|██████████| 10.0k/10.0k [00:02<00:00, 4.74kB/s]


Upload successful: fig.tar (10KB)
Starting upload for file preds.tar


100%|██████████| 510k/510k [00:02<00:00, 235kB/s]


Upload successful: preds.tar (510KB)
