### Setting.py

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 47.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [None]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [None]:
# Random Seed Fix
import random
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

In [None]:
def load_data(path):
  TRAIN = os.path.join(path, 'benchmark_train_data.csv')
  TEST = os.path.join(path, 'test_data.csv')
  SS = os.path.join(path, 'sample_submission.csv')
  label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
  train = pd.read_csv(TRAIN)
  test = pd.read_csv(TEST)
  sample_submission = pd.read_csv(SS)
  train['label'] = train['label'].map(label_dict)

  return train,test,sample_submission

def text_clean(df):
  #=df["premise_"] = "<s>" + df["premise"].astype(str) + "[SEP]"
  df["premise_"] = "[CLS]"+df["premise"].astype(str)
  #df["hypothesis_"] = df["hypothesis"].astype(str) + "[SEP]"
  df["hypothesis_"] = df["hypothesis"].astype(str) + "[SEP]"
  df["text_sum"] = df.premise_ + "[SEP]" + df.hypothesis_
  df = df[['text_sum','label']]
  return df 

def random_deletion(sentence, p=0.2):
    words = sentence.split ()
    n = len (words)
    if n == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))
    #print (remaining) 
    if len(remaining) == 0: # if not left, sample a random word
        return ' '.join ([random.choice(words)])
    else:
        return ' '.join (remaining)

def random_swap(sentence, n=2):
    sentence = sentence.split () 
    length = range(len(sentence))
    swapped = []
    if len(sentence) >2:
      for _ in range(n):
          idx1, idx2 = random.sample(length, 2)
          swapped.append ([sentence[idx1], sentence[idx2]])
          sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return ' '.join (sentence)

def eda_aug(df):

    cache = {'premise':[], 'hypothesis':[], 'label':[]}
    for idx in tqdm(range(len(df))):
        premise = df.iloc[idx]['premise']
        hypothesis = df.iloc[idx]['hypothesis']
        label = df.iloc[idx]['label']
        cache['premise'].append(premise)
        cache['hypothesis'].append(hypothesis)
        cache['label'].append(label)
        flag = random.randrange(10)
        if flag < 2:
          cache['premise'].append(random_deletion(premise))
          cache['hypothesis'].append(random_deletion(hypothesis))
          cache['label'].append(label)
          cache['premise'].append(random_swap(premise))
          cache['hypothesis'].append(random_swap(hypothesis))
          cache['label'].append(label)
    
    return pd.DataFrame(cache)

### Dataset.py

In [None]:
#ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI'
DATA = '/content/drive/Shareddrives/Dacon/hongsun/data'
train,test,sample_submission = load_data(DATA)
###### AUGMENTATION ######
#train = eda_aug(train)
###### AUGMENTATION ######

clean_train,clean_test  = text_clean(train),text_clean(test)
display(clean_train)

Unnamed: 0,text_sum,label
0,"[CLS]씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나...",1
1,[CLS]삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 ...,1
2,[CLS]이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.[SEP]예측적 범...,0
3,[CLS]광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민...,2
4,"[CLS]진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 ...",2
...,...,...
27993,[CLS]흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재...,1
27994,[CLS]흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재...,2
27995,[CLS]흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재...,0
27996,[CLS]흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다.[SEP]비...,2


In [None]:

tokenizer_roberta = AutoTokenizer.from_pretrained("klue/roberta-base")

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
def roberta_transform(text):
  transform = tokenizer_roberta(text,
                                pad_to_max_length=True,
                               truncation=True,
                               max_length=256,
                               return_tensors='pt',
                                add_special_tokens=False)
  return transform

In [None]:
class customDataset(Dataset):
  def __init__(self,dataset,mode='train',transform=roberta_transform):
    super(customDataset, self).__init__()
    self.mode = mode
    self.dataset = dataset.reset_index(drop=True)
    self.transform = transform

  def __getitem__(self, idx):
    
    text = self.dataset['text_sum'][idx]
    tokens = self.transform(text)
    token_ids = tokens['input_ids'][0]  # tensor of token ids
    attn_masks = tokens['attention_mask'][0]  # binary tensor with "0" for padded values and "1" for the other values
    token_type_ids = tokens['token_type_ids'][0]  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

    if self.mode == 'test':
      return token_ids,attn_masks,token_type_ids
    else: 
      labels = self.dataset['label'][idx]
      return token_ids,attn_masks,token_type_ids, labels
  
  def __len__(self):
    return(len(self.dataset))

### Model.py

In [None]:
class ROBERTaClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768, #BERT base hidden-size
                 num_classes=3,   ##클래스 수 조정##
                 params=None,
                 freeze_bert=False):
        super(ROBERTaClassifier, self).__init__()
        self.bert = bert
        self.freeze_bert=freeze_bert

        if self.freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

                 
        self.classifier = nn.Linear(hidden_size , 256)
        self.dropout = nn.Dropout(p=0.5)
        self.fc_layer1 = nn.Linear(256,128)
        self.fc_layer2 = nn.Linear(128,num_classes)
    

    def forward(self, input_ids, attn_masks):
        
        _,pooler = self.bert(input_ids, attn_masks, return_dict=False)
        output1 = self.classifier(pooler)
        output2 = self.fc_layer1(output1)
        output3 = self.fc_layer2(self.dropout(output2))
        return (output3)

### Train.py

In [None]:
device = torch.device("cuda")
!nvidia-smi

Fri Feb 25 22:40:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
###### HYPERPARMS ######
lr = 2e-5

batch_size= 32
warmup_ratio = 0.06
num_epochs = 10

log_interval = 500
#learning_rate = 5e-6

In [None]:
############### CV ################
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5,shuffle=True,random_state=42)
folds=[]
for trn_idx,val_idx in skf.split(clean_train['text_sum'], train['label']):
    folds.append((trn_idx,val_idx))

In [None]:


############### CV Training ###############
best_models = []


for i,fold in enumerate(range(1,5)):
    print('===============',i+1,'fold start===============')
    model_roberta = AutoModel.from_pretrained("klue/roberta-base")
    model = ROBERTaClassifier(model_roberta).to(device)
    model=nn.DataParallel(model).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    train_idx = folds[fold][0]
    valid_idx = folds[fold][1]
    train_data = train.loc[train_idx]
    val_data = train.loc[valid_idx]
    train_dataset = customDataset(train_data,'train')
    valid_dataset = customDataset(val_data,'train')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    total_steps = len(train_loader) * num_epochs
    warmup_step = int(total_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=5, num_training_steps=total_steps)

    
    for epoch in range(num_epochs):
        batches = 0
        total_loss = 0.0
        correct = 0
        total =0
        model.train()
        
        for input_ids_batch, attention_masks_batch, token_type_ids_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device),attn_masks= attention_masks_batch.to(device))
            loss = F.cross_entropy(y_pred, y_batch)
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            _, predicted = torch.max(y_pred, 1)
            correct += (predicted == y_batch).sum()
            total += len(y_batch)
            batches += 1
            if batches % log_interval == 0:
                print("Batch Loss: ", total_loss / batches, "Accuracy: ", correct.float() / total)
      
        val_loss = []
        val_acc = []
        """
        for input_ids_batch, attention_masks_batch, token_type_ids, y_batch in tqdm(valid_loader):
            
            model.eval()
            with torch.no_grad():
                
                y_pred = model(input_ids_batch.to(device),attn_masks= attention_masks_batch.to(device))
                valid_loss = F.cross_entropy(y_pred,y_batch.to(device)).cpu().detach().numpy()

                preds = torch.argmax(y_pred,1)
                preds = preds.cpu().detach().numpy()
                y_batch = y_batch.cpu().detach().numpy()
                batch_acc = (preds==y_batch).mean()
                val_loss.append(valid_loss)
                val_acc.append(batch_acc)
        """     
                
        val_loss = np.mean(val_loss)
        val_acc = np.mean(val_acc)
        
        print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
        print(optimizer.param_groups[0]["lr"])
        if epoch  == 9:
            valid_acc_max = val_acc
            best_models.append(model)
            torch.save(model.state_dict(), f'/content/drive/Shareddrives/Dacon/hongsun/ROBERTa_base_fold_{fold}.pth') 
            print('model save, model val acc : ',val_acc)
            print('best_models size : ',len(best_models))



Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

Batch Loss:  0.6990929025113582 Accuracy:  tensor(0.6882, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 0 - valid Loss: nan - valid_acc : nan
1.951679108887946e-05


 71%|███████▏  | 500/700 [06:26<02:34,  1.29it/s]

Batch Loss:  0.32249695986509325 Accuracy:  tensor(0.8853, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 1 - valid Loss: nan - valid_acc : nan
1.8100716321851976e-05


 71%|███████▏  | 500/700 [06:26<02:34,  1.29it/s]

Batch Loss:  0.18043044014275075 Accuracy:  tensor(0.9386, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 2 - valid Loss: nan - valid_acc : nan
1.5890562348677304e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.11627758835442364 Accuracy:  tensor(0.9599, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 3 - valid Loss: nan - valid_acc : nan
1.3102981282915746e-05


 71%|███████▏  | 500/700 [06:26<02:34,  1.29it/s]

Batch Loss:  0.07578935714717955 Accuracy:  tensor(0.9774, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 4 - valid Loss: nan - valid_acc : nan
1.0011227991399215e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.04472270631766878 Accuracy:  tensor(0.9861, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 5 - valid Loss: nan - valid_acc : nan
6.918374067032124e-06


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.030576591667952017 Accuracy:  tensor(0.9919, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 6 - valid Loss: nan - valid_acc : nan
4.127598993135665e-06


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.017965578875970094 Accuracy:  tensor(0.9945, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 7 - valid Loss: nan - valid_acc : nan
1.9124707317475876e-06


 71%|███████▏  | 500/700 [06:27<02:34,  1.29it/s]

Batch Loss:  0.011381420003715902 Accuracy:  tensor(0.9966, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 8 - valid Loss: nan - valid_acc : nan
4.901290050147345e-07


 71%|███████▏  | 500/700 [06:27<02:34,  1.29it/s]

Batch Loss:  0.00941792278480716 Accuracy:  tensor(0.9974, device='cuda:0')


100%|██████████| 700/700 [09:01<00:00,  1.29it/s]


Epoch: 9 - valid Loss: nan - valid_acc : nan
0.0
model save, model val acc :  nan
best_models size :  1


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

Batch Loss:  0.6895628114938736 Accuracy:  tensor(0.6962, device='cuda:0')


100%|██████████| 700/700 [09:03<00:00,  1.29it/s]


Epoch: 0 - valid Loss: nan - valid_acc : nan
1.951679108887946e-05


 71%|███████▏  | 500/700 [06:27<02:34,  1.29it/s]

Batch Loss:  0.3222522528171539 Accuracy:  tensor(0.8868, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 1 - valid Loss: nan - valid_acc : nan
1.8100716321851976e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.18123372930660844 Accuracy:  tensor(0.9397, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 2 - valid Loss: nan - valid_acc : nan
1.5890562348677304e-05


 71%|███████▏  | 500/700 [06:27<02:34,  1.29it/s]

Batch Loss:  0.1093151182308793 Accuracy:  tensor(0.9638, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 3 - valid Loss: nan - valid_acc : nan
1.3102981282915746e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.28it/s]

Batch Loss:  0.06850112859904767 Accuracy:  tensor(0.9781, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 4 - valid Loss: nan - valid_acc : nan
1.0011227991399215e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.03868255938403308 Accuracy:  tensor(0.9886, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 5 - valid Loss: nan - valid_acc : nan
6.918374067032124e-06


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.024515475361375137 Accuracy:  tensor(0.9932, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 6 - valid Loss: nan - valid_acc : nan
4.127598993135665e-06


 71%|███████▏  | 500/700 [06:27<02:34,  1.29it/s]

Batch Loss:  0.0156935687311925 Accuracy:  tensor(0.9956, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 7 - valid Loss: nan - valid_acc : nan
1.9124707317475876e-06


 71%|███████▏  | 500/700 [06:27<02:34,  1.29it/s]

Batch Loss:  0.010769643500854726 Accuracy:  tensor(0.9974, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 8 - valid Loss: nan - valid_acc : nan
4.901290050147345e-07


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.008959507852210663 Accuracy:  tensor(0.9971, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 9 - valid Loss: nan - valid_acc : nan
0.0
model save, model val acc :  nan
best_models size :  2


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

Batch Loss:  0.7176843285858631 Accuracy:  tensor(0.6694, device='cuda:0')


100%|██████████| 700/700 [09:03<00:00,  1.29it/s]


Epoch: 0 - valid Loss: nan - valid_acc : nan
1.951679108887946e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.3335082470178604 Accuracy:  tensor(0.8804, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 1 - valid Loss: nan - valid_acc : nan
1.8100716321851976e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.19443064701929688 Accuracy:  tensor(0.9343, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 2 - valid Loss: nan - valid_acc : nan
1.5890562348677304e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.11364980144053698 Accuracy:  tensor(0.9636, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 3 - valid Loss: nan - valid_acc : nan
1.3102981282915746e-05


 71%|███████▏  | 500/700 [06:27<02:35,  1.29it/s]

Batch Loss:  0.06659582748636603 Accuracy:  tensor(0.9811, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 4 - valid Loss: nan - valid_acc : nan
1.0011227991399215e-05


 71%|███████▏  | 500/700 [06:27<02:34,  1.29it/s]

Batch Loss:  0.04471017389977351 Accuracy:  tensor(0.9873, device='cuda:0')


100%|██████████| 700/700 [09:02<00:00,  1.29it/s]


Epoch: 5 - valid Loss: nan - valid_acc : nan
6.918374067032124e-06


 71%|███████▏  | 500/700 [06:28<02:35,  1.29it/s]

Batch Loss:  0.031563786261016503 Accuracy:  tensor(0.9919, device='cuda:0')


100%|██████████| 700/700 [09:03<00:00,  1.29it/s]


Epoch: 6 - valid Loss: nan - valid_acc : nan
4.127598993135665e-06


 71%|███████▏  | 500/700 [06:28<02:35,  1.29it/s]

Batch Loss:  0.01974055906315334 Accuracy:  tensor(0.9955, device='cuda:0')


100%|██████████| 700/700 [09:03<00:00,  1.29it/s]


Epoch: 7 - valid Loss: nan - valid_acc : nan
1.9124707317475876e-06


 71%|███████▏  | 500/700 [06:28<02:35,  1.28it/s]

Batch Loss:  0.015871648362488486 Accuracy:  tensor(0.9964, device='cuda:0')


100%|██████████| 700/700 [09:03<00:00,  1.29it/s]


Epoch: 8 - valid Loss: nan - valid_acc : nan
4.901290050147345e-07


 71%|███████▏  | 500/700 [06:28<02:35,  1.29it/s]

Batch Loss:  0.013792287093237973 Accuracy:  tensor(0.9969, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.29it/s]


Epoch: 9 - valid Loss: nan - valid_acc : nan
0.0
model save, model val acc :  nan
best_models size :  3


Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

Batch Loss:  0.7085235356986522 Accuracy:  tensor(0.6868, device='cuda:0')


100%|██████████| 700/700 [09:05<00:00,  1.28it/s]


Epoch: 0 - valid Loss: nan - valid_acc : nan
1.951679108887946e-05


 71%|███████▏  | 500/700 [06:29<02:36,  1.28it/s]

Batch Loss:  0.32689891186356546 Accuracy:  tensor(0.8808, device='cuda:0')


100%|██████████| 700/700 [09:05<00:00,  1.28it/s]


Epoch: 1 - valid Loss: nan - valid_acc : nan
1.8100716321851976e-05


 71%|███████▏  | 500/700 [06:29<02:36,  1.28it/s]

Batch Loss:  0.18543627540394664 Accuracy:  tensor(0.9384, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.28it/s]


Epoch: 2 - valid Loss: nan - valid_acc : nan
1.5890562348677304e-05


 71%|███████▏  | 500/700 [06:29<02:35,  1.28it/s]

Batch Loss:  0.10660494090430439 Accuracy:  tensor(0.9649, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.28it/s]


Epoch: 3 - valid Loss: nan - valid_acc : nan
1.3102981282915746e-05


 71%|███████▏  | 500/700 [06:29<02:36,  1.28it/s]

Batch Loss:  0.06141594324586913 Accuracy:  tensor(0.9809, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.28it/s]


Epoch: 4 - valid Loss: nan - valid_acc : nan
1.0011227991399215e-05


 71%|███████▏  | 500/700 [06:29<02:36,  1.28it/s]

Batch Loss:  0.041564452693331984 Accuracy:  tensor(0.9868, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.28it/s]


Epoch: 5 - valid Loss: nan - valid_acc : nan
6.918374067032124e-06


 71%|███████▏  | 500/700 [06:29<02:35,  1.28it/s]

Batch Loss:  0.02308630574506242 Accuracy:  tensor(0.9929, device='cuda:0')


100%|██████████| 700/700 [09:05<00:00,  1.28it/s]


Epoch: 6 - valid Loss: nan - valid_acc : nan
4.127598993135665e-06


 71%|███████▏  | 500/700 [06:29<02:35,  1.28it/s]

Batch Loss:  0.014367283396306448 Accuracy:  tensor(0.9955, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.28it/s]


Epoch: 7 - valid Loss: nan - valid_acc : nan
1.9124707317475876e-06


 71%|███████▏  | 500/700 [06:29<02:35,  1.28it/s]

Batch Loss:  0.010533124544366728 Accuracy:  tensor(0.9974, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.28it/s]


Epoch: 8 - valid Loss: nan - valid_acc : nan
4.901290050147345e-07


 71%|███████▏  | 500/700 [06:28<02:35,  1.29it/s]

Batch Loss:  0.008341697230818682 Accuracy:  tensor(0.9978, device='cuda:0')


100%|██████████| 700/700 [09:04<00:00,  1.29it/s]


Epoch: 9 - valid Loss: nan - valid_acc : nan
0.0
model save, model val acc :  nan
best_models size :  4
