ソース： https://qiita.com/colum2131/items/b2e5e86f1b98330cb851

In [None]:
# Print the GPU attached to this runtime (driver / CUDA version, memory usage).
! nvidia-smi

Sun Jul 31 08:59:31 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install transformers and sentencepiece quietly (-q).
# NOTE(review): versions are not pinned, so a fresh runtime may resolve newer
# releases than the ones this notebook was originally executed with.
! pip install -q transformers
! pip install -q sentencepiece

[K     |████████████████████████████████| 4.7 MB 7.6 MB/s 
[K     |████████████████████████████████| 101 kB 12.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 57.4 MB/s 
[K     |████████████████████████████████| 596 kB 61.7 MB/s 
[K     |████████████████████████████████| 1.2 MB 7.1 MB/s 
[?25h

In [None]:
import os

class Config:
    """Static experiment settings, read everywhere through class attributes."""

    # ---- identity ----------------------------------------------------
    AUTHOR = "Datascientist"
    # Name of the output folder for this experiment.
    NAME = "Exp001-roberta-base-epoch10"

    # ---- model -------------------------------------------------------
    # Model identifier taken from https://huggingface.co/
    # (for example "microsoft/deberta-base").
    MODEL_PATH = "roberta-base"

    # ---- paths -------------------------------------------------------
    # Base directory on the mounted Google Drive; everything lives under
    # a per-author sub-folder.
    COLAB_PATH = "/content/drive/MyDrive/Competition/SIGNATE"
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)

    # ---- reproducibility / cross-validation --------------------------
    # Random seed.
    seed = 42
    # Number of cross-validation splits.
    num_fold = 4
    # Folds that are actually trained.
    trn_fold = [0, 1, 2, 3]

    # ---- data loading ------------------------------------------------
    # Mini-batch size.
    batch_size = 16
    # Maximum number of tokens per example.
    max_len = 128

    # ---- optimisation ------------------------------------------------
    # Number of training epochs.
    n_epochs = 10
    # Learning rate.
    lr = 2e-5
    # Optimizer hyper-parameters.
    weight_decay = 2e-5
    beta = (0.9, 0.98)
    # Fraction of the optimisation steps spent on linear warm-up.
    num_warmup_steps_rate = 0.01
    # Maximum gradient norm; None disables gradient clipping.
    clip_grad_norm = None
    # Number of micro-batches per optimizer step.
    gradient_accumulation_steps = 1
    # Presumably the number of evaluations per epoch -- TODO confirm.
    num_eval = 1

In [None]:
# ========================================
# Library
# ========================================
import os
import gc
import re
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy 
import itertools
from pathlib import Path
from glob import glob
from tqdm.auto import tqdm
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold,
)
from sklearn.metrics import (
    accuracy_score, 
    f1_score,
    roc_auc_score,
)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler

from google.colab import drive
# Mount Google Drive only when /content/drive does not exist yet, so that
# re-running this cell does not trigger a second mount.
if not os.path.isdir('/content/drive'):
    drive.mount('/content/drive') 

In [None]:
def setup(cfg):
    """Prepare the runtime: select the device, mount Drive, derive and create folders.

    Args:
        cfg: Configuration object exposing ``NAME`` and ``DRIVE_PATH``. It is
            mutated in place: ``device``, ``DRIVE``, ``EXP``, ``INPUT``,
            ``OUTPUT``, ``DATASET``, ``OUTPUT_EXP``, ``EXP_MODEL``, ``EXP_FIG``
            and ``EXP_PREDS`` are attached to it.

    Returns:
        The same ``cfg`` object that was passed in.

    Raises:
        requests.RequestException: only when ``cfg.NAME`` is None and the
            notebook-session endpoint cannot be reached or answers with an
            HTTP error.
    """
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # mount
    from google.colab import drive
    if not os.path.isdir('/content/drive'):
        drive.mount('/content/drive')

    # set dirs
    cfg.DRIVE = cfg.DRIVE_PATH
    if cfg.NAME is not None:
        cfg.EXP = cfg.NAME
    else:
        # Fall back to the name reported by the local sessions endpoint.
        # A timeout keeps an unreachable endpoint from hanging the cell forever,
        # and the status check surfaces HTTP errors instead of a JSON decode error.
        response = requests.get('http://172.28.0.2:9000/api/sessions', timeout=10)
        response.raise_for_status()
        # Drops the last six characters -- presumably the ".ipynb" suffix.
        cfg.EXP = response.json()[0]['name'][:-6]
    cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
    cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
    cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP)
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # make dirs (cfg.DATASET and cfg.OUTPUT itself are only derived, not created here;
    # OUTPUT is created implicitly as a parent of the experiment folders)
    for d in [cfg.INPUT, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    return cfg

In [None]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

# KFold
def get_stratifiedkfold(train, target_col, n_splits, seed):
    """Assign every row of ``train`` to one stratified validation fold.

    Args:
        train: DataFrame to split.
        target_col: Name of the column used for stratification.
        n_splits: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        A Series of fold numbers, indexed by the positional row numbers
        produced by the splitter and sorted by that index.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    per_fold = [
        pd.Series(fold_id, index=valid_rows)
        for fold_id, (_, valid_rows) in enumerate(
            splitter.split(train, train[target_col])
        )
    ]
    return pd.concat(per_fold).sort_index()

# collatte
def collatte(inputs, labels=None):
    """Trim a padded batch down to its longest real sequence.

    Args:
        inputs: Mapping holding 2-D ``input_ids`` and ``attention_mask``
            tensors; any other keys are dropped from the result.
        labels: Optional 2-D tensor that is trimmed along its second axis
            in the same way.

    Returns:
        ``(inputs, mask_len)`` when ``labels`` is None, otherwise
        ``(inputs, labels, mask_len)``. ``mask_len`` is the largest row sum
        of ``attention_mask``.
    """
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    trimmed = {
        key: inputs[key][:, :mask_len]
        for key in ("input_ids", "attention_mask")
    }
    if labels is None:
        return trimmed, mask_len
    return trimmed, labels[:, :mask_len], mask_len

In [None]:
# =====================
# Dataset & Model
# =====================
class BERTDataset(Dataset):
    """Dataset that tokenizes one text per item with ``cfg.tokenizer``."""

    def __init__(self, cfg, texts, labels=None):
        """Store the config, the texts and (optionally) their labels."""
        self.cfg = cfg
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the encoded text, paired with its int64 label when labels exist."""
        encoded = self.prepare_input(self.cfg, self.texts[index])
        if self.labels is None:
            return encoded
        target = torch.tensor(self.labels[index], dtype=torch.int64)
        return encoded, target

    @staticmethod
    def prepare_input(cfg, text):
        """Tokenize ``text`` padded/truncated to ``cfg.max_len`` and tensorize the fields."""
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False,
        )
        encoded = cfg.tokenizer(text, **tokenizer_kwargs)
        # Convert every field in place so the tokenizer's container type is kept.
        for field in list(encoded.keys()):
            encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
        return encoded


class BERTModel(nn.Module):
    """Pretrained backbone followed by a linear head with four outputs."""

    def __init__(self, cfg, criterion=None):
        """Load the backbone named by ``cfg.MODEL_PATH`` and build the head.

        Args:
            cfg: Configuration object providing ``MODEL_PATH``.
            criterion: Loss callable used when ``forward`` receives labels.
        """
        super().__init__()
        self.cfg = cfg
        self.criterion = criterion
        self.config = AutoConfig.from_pretrained(cfg.MODEL_PATH, output_hidden_states=True)
        self.backbone = AutoModel.from_pretrained(cfg.MODEL_PATH, config=self.config)
        head = nn.Linear(self.config.hidden_size, 4)
        self.fc = nn.Sequential(head)

    def forward(self, inputs, labels=None):
        """Classify from the hidden state of the first token.

        Returns:
            ``logits`` when ``labels`` is None, otherwise ``(logits, loss)``
            where the loss comes from ``self.criterion``.
        """
        hidden = self.backbone(**inputs)["last_hidden_state"]
        first_token = hidden[:, 0, :]
        logits = self.fc(first_token)
        if labels is None:
            return logits
        return logits, self.criterion(logits, labels)

In [None]:
def training(cfg, train):
    """Fine-tune one model per fold and score the out-of-fold predictions.

    For every fold in ``cfg.trn_fold`` a fresh ``BERTModel`` is trained for
    ``cfg.n_epochs`` epochs with mixed precision. After each epoch the fold's
    validation rows are scored with macro F1; the best-scoring weights are
    written to ``cfg.EXP_MODEL/fold{fold}.pth`` and the matching validation
    probabilities to ``cfg.EXP_PREDS``.

    Args:
        cfg: Configuration object. Reads ``seed``, ``trn_fold``, ``folds``,
            ``batch_size``, ``device``, ``weight_decay``, ``lr``, ``beta``,
            ``n_epochs``, ``gradient_accumulation_steps``,
            ``num_warmup_steps_rate``, ``clip_grad_norm``, ``EXP_MODEL`` and
            ``EXP_PREDS``.
        train: DataFrame with a ``description`` text column and a ``jobflag``
            label column holding class ids 0..3. Its row order must match
            ``cfg.folds``.

    Returns:
        Macro F1 of the stacked out-of-fold predictions over all rows of
        ``train``. Rows of folds that were not trained keep all-zero
        predictions.
    """
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 4), dtype=np.float32)
    accum_steps = cfg.gradient_accumulation_steps

    # Loss function
    criterion = nn.CrossEntropyLoss()

    for fold in cfg.trn_fold:
        # Dataset / DataLoader.
        # Work with a positional boolean mask so the row selection and the
        # write-back into ``oof_pred`` agree even if ``train`` does not carry
        # a default 0..n-1 index.
        valid_mask = (cfg.folds == fold).to_numpy()
        train_df = train.loc[~valid_mask]
        valid_df = train.loc[valid_mask]
        valid_idx = np.flatnonzero(valid_mask)

        train_dataset = BERTDataset(
            cfg,
            train_df['description'].to_numpy(),
            train_df['jobflag'].to_numpy(),
        )
        valid_dataset = BERTDataset(
            cfg,
            valid_df['description'].to_numpy(),
            valid_df['jobflag'].to_numpy()
        )
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

        # Best-epoch bookkeeping
        best_val_preds = None
        best_val_score = -1

        # Model
        model = BERTModel(cfg, criterion)
        model = model.to(cfg.device)

        # Optimizer and scheduler: no weight decay on biases / LayerNorm.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': cfg.weight_decay
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=cfg.lr,
            betas=cfg.beta,
            # Pinned explicitly so the update does not depend on the default of
            # whichever AdamW implementation is in scope.
            eps=1e-6,
            weight_decay=cfg.weight_decay,
        )
        # Micro-batches left over at the end of an epoch are discarded, so the
        # scheduler is sized from whole optimizer steps per epoch.
        steps_per_epoch = len(train_loader) // accum_steps
        num_train_optimization_steps = steps_per_epoch * cfg.n_epochs
        num_warmup_steps = int(num_train_optimization_steps * cfg.num_warmup_steps_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps
        )
        # One scaler per fold: re-creating it every epoch would throw away the
        # loss scale it has adapted so far.
        scaler = GradScaler()

        for epoch in range(cfg.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch} ============== #")
            model.train()
            # Start each epoch from clean gradients; within the epoch gradients
            # are only cleared after an optimizer step so that accumulation
            # over several micro-batches actually happens.
            optimizer.zero_grad()
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs, _ = collatte(inputs)
                    inputs = {k: v.to(cfg.device) for k, v in inputs.items()}
                    labels = labels.to(cfg.device)

                    with autocast():
                        _, loss = model(inputs, labels)
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_last_lr()[0]
                    })

                    if accum_steps > 1:
                        loss = loss / accum_steps
                    scaler.scale(loss).backward()

                    if (step + 1) % accum_steps == 0:
                        if cfg.clip_grad_norm is not None:
                            # Gradients carry the loss scale until unscaled;
                            # clipping them before that compares the wrong norm.
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(
                                model.parameters(),
                                cfg.clip_grad_norm
                            )
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

            # evaluating
            val_preds = []
            val_losses = []
            val_nums = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader, total=len(valid_loader)) as pbar:
                    for (inputs, labels) in pbar:
                        inputs, _ = collatte(inputs)
                        inputs = {k: v.to(cfg.device) for k, v in inputs.items()}
                        labels = labels.to(cfg.device)
                        with autocast():
                            output, loss = model(inputs, labels)
                        # Softmax matches both the cross-entropy objective and
                        # the probabilities produced at inference time.
                        output = output.softmax(dim=1).detach().cpu().numpy()
                        val_preds.append(output)
                        # Weight by batch size: the last batch may be smaller.
                        val_losses.append(loss.item() * len(labels))
                        val_nums.append(len(labels))
                        pbar.set_postfix({
                            'val_loss': loss.item()
                        })

            val_preds = np.concatenate(val_preds)
            val_loss = sum(val_losses) / sum(val_nums)
            # scikit-learn metrics take (y_true, y_pred).
            score = f1_score(valid_df['jobflag'], np.argmax(val_preds, axis=1), average='macro')
            val_log = {
                'val_loss': val_loss,
                'score': score,
            }
            display(val_log)
            if best_val_score < score:
                print("save model weight")
                best_val_preds = val_preds
                best_val_score = score
                torch.save(
                    model.state_dict(),
                    os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
                )

        # ``best_val_preds`` stays None only when no epoch ran (n_epochs == 0).
        if best_val_preds is not None:
            oof_pred[valid_idx] = best_val_preds.astype(np.float32)
            np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        # The optimizer and scheduler also reference the parameters; drop them
        # too so the next fold starts with the memory actually released.
        del model, optimizer, scheduler
        gc.collect()
        torch.cuda.empty_cache()

    # scoring
    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)
    score = f1_score(train['jobflag'], np.argmax(oof_pred, axis=1), average='macro')
    print('CV:', round(score, 5))
    return score

In [None]:
def inferring(cfg, test):
    """Average the class probabilities of every saved fold model on ``test``.

    Args:
        cfg: Configuration object. Reads ``model_weights`` (list of checkpoint
            paths), ``batch_size``, ``device`` and ``EXP_PREDS``.
        test: DataFrame with a ``description`` text column.

    Returns:
        Float32 array of shape ``(len(test), 4)`` holding the mean softmax
        probabilities over all checkpoints. Per-fold predictions and the
        average are also saved as ``.npy`` files under ``cfg.EXP_PREDS``.

    Raises:
        ValueError: if ``cfg.model_weights`` is empty; averaging zero models
            would otherwise return all-zero predictions without any warning.
    """
    if len(cfg.model_weights) == 0:
        raise ValueError("cfg.model_weights is empty: no checkpoint to run inference with")

    print('\n'.join(cfg.model_weights))
    sub_pred = np.zeros((len(test), 4), dtype=np.float32)

    # dataset, dataloader: identical for every fold, so build them once
    test_dataset = BERTDataset(
        cfg,
        test['description'].to_numpy()
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=cfg.batch_size,
        shuffle=False,
        pin_memory=True
    )

    for fold, model_weight in enumerate(cfg.model_weights):
        model = BERTModel(cfg)
        # Load onto the CPU first so checkpoints written on a GPU runtime can
        # still be restored when no GPU is available.
        model.load_state_dict(torch.load(model_weight, map_location='cpu'))
        model = model.to(cfg.device)

        model.eval()
        fold_pred = []
        with torch.no_grad():
            for inputs in tqdm(test_loader, total=len(test_loader)):
                inputs, _ = collatte(inputs)
                inputs = {k: v.to(cfg.device) for k, v in inputs.items()}
                with autocast():
                    output = model(inputs)
                output = output.softmax(dim=1).detach().cpu().numpy()
                fold_pred.append(output)
        fold_pred = np.concatenate(fold_pred)
        np.save(os.path.join(cfg.EXP_PREDS, f'sub_pred_fold{fold}.npy'), fold_pred)
        sub_pred += fold_pred / len(cfg.model_weights)
        del model; gc.collect()
    np.save(os.path.join(cfg.EXP_PREDS, 'sub_pred.npy'), sub_pred)
    return sub_pred

In [None]:
# =====================
# Main
# =====================
# Setup: select the device, mount Drive and create the experiment folders
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
try:
    from transformers import AdamW
except ImportError:
    # transformers.AdamW is deprecated in favour of the PyTorch optimizer and
    # is missing from recent releases; the install above is not pinned, so
    # fall back to keep the name ``AdamW`` importable.
    from torch.optim import AdamW

# Load the data
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'submit_sample.csv'), header=None)

# Target preprocessing: shift the labels down by one (undone again before submission)
train['jobflag'] -= 1

# Load the tokenizer
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)

# Validation split
cfg.folds = get_stratifiedkfold(train, 'jobflag', cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

# Train
score = training(cfg, train)

# Inference with every fold checkpoint found in the experiment's model folder
cfg.model_weights = sorted(glob(os.path.join(cfg.EXP_MODEL, 'fold*.pth')))
sub_pred = inferring(cfg, test)
# Shift the predicted class ids back up by one to undo the target preprocessing
sub[1] = np.argmax(sub_pred, axis=1)
sub[1] = sub[1].astype(int) + 1

# Submission file
sub.to_csv(os.path.join(cfg.EXP_PREDS, 'roberta_baseline01.csv'), index=False, header=False)

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.5488482570061517, 'val_loss': 0.7563184276419768}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6760048377535997, 'val_loss': 0.7400663668058793}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6391140474251733, 'val_loss': 0.6831712842302147}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7175772894550503, 'val_loss': 0.7792199460057281}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6588336836731921, 'val_loss': 0.8704566321775592}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.726514685295597, 'val_loss': 0.9231994230703188}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7072676924576006, 'val_loss': 1.089707575246967}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7310214349028236, 'val_loss': 1.0650930291429987}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7492164144009421, 'val_loss': 1.1258886505242702}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7332933920850331, 'val_loss': 1.1408548738836928}

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.547169239679973, 'val_loss': 0.8522979807413348}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6774919336464328, 'val_loss': 0.6341648947909514}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6642842692392428, 'val_loss': 0.6583287586951948}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6637625045344545, 'val_loss': 0.812864665462977}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6933090992335376, 'val_loss': 0.8359989009620646}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6749135124135124, 'val_loss': 0.9501388934797221}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6737541925629367, 'val_loss': 1.0549909675341482}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6848867545635606, 'val_loss': 1.0797976302597327}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6808398464280818, 'val_loss': 1.1352235091393101}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6834050403484477, 'val_loss': 1.1638860510647455}

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.560717543326239, 'val_loss': 0.7418224163609004}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.63972924267482, 'val_loss': 0.7117200053461623}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7038005251871867, 'val_loss': 0.6157739676712056}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.710894398001394, 'val_loss': 0.6323842878077465}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7136462416815375, 'val_loss': 0.6590713764243515}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6992288023339719, 'val_loss': 0.7500852525706027}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6547758712269924, 'val_loss': 0.8525900024537361}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.673278667008522, 'val_loss': 0.8302823199760316}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.7089377618159766, 'val_loss': 0.8399198671758646}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6998372529798793, 'val_loss': 0.8377208576038834}

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.5519697089242624, 'val_loss': 0.7418493903721228}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6686937676411548, 'val_loss': 0.6800396595749817}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6912748771812517, 'val_loss': 0.6132892014011544}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6935525250749395, 'val_loss': 0.6806066504097237}

save model weight


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6730660389528816, 'val_loss': 0.7521464113431116}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6625467973037471, 'val_loss': 0.9338824516235996}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.668138134833725, 'val_loss': 0.8727236751830673}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6814748466367421, 'val_loss': 0.9320644387744348}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.6792644007654999, 'val_loss': 0.9878498723528945}



  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'score': 0.674446740596288, 'val_loss': 0.9960521028074866}

CV: 0.71242
/content/drive/MyDrive/Competition/SIGNATE/Datascientist/Output/Exp001-roberta-base-epoch10/model/fold0.pth
/content/drive/MyDrive/Competition/SIGNATE/Datascientist/Output/Exp001-roberta-base-epoch10/model/fold1.pth
/content/drive/MyDrive/Competition/SIGNATE/Datascientist/Output/Exp001-roberta-base-epoch10/model/fold2.pth
/content/drive/MyDrive/Competition/SIGNATE/Datascientist/Output/Exp001-roberta-base-epoch10/model/fold3.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/95 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/95 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/95 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/95 [00:00<?, ?it/s]