# 설정, 설치 임포트

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.chdir('/content/drive/MyDrive/015GithubRepos/Dacon_sentence_classification')

In [4]:
pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 26.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [5]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split
#%% Seed Fix

import random
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

#%%

device = torch.device("cuda")

# local = 'C:/Users/posick/Desktop/Dacon/open/'
# local2 = 'C:/Users/201/Desktop/Dacon/'
suv = 'data/'
# colab = '/content/drive/MyDrive/Dacon/'

#%% 
num_epochs = 10
batch_size =128
lr = 0.00001
pretrain = "monologg/koelectra-base-v3-discriminator"

#%% data load 

def load_data(path):
    train = pd.read_csv(path+'train_data.csv')
    label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
    train['label'] = train['label'].map(label_dict)
    test = pd.read_csv(path+'test_data.csv')
    sample_submission = pd.read_csv(path+'sample_submission.csv')
    
    return train,test,sample_submission 

def text_clean(df):
    df["premise_"] = "[CLS]" + df["premise"] + "[SEP]"
    df["hypothesis_"] = df["hypothesis"] + "[SEP]"
    df["text_sum"] = df.premise_ + " " + df.hypothesis_
    df = df[['text_sum','label']]
    return df 

train,test,sample_submission = load_data(suv)
clean_train,clean_test  = text_clean(train),text_clean(test)


#%%data loader 

class CustomDataset(Dataset):
  
  def __init__(self,dataset,option):
    
    self.dataset = dataset 
    self.option = option
    self.tokenizer = AutoTokenizer.from_pretrained(pretrain)

  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 0:2].values
    text = row[0]
    #y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=70,
        pad_to_max_length=True,
        add_special_tokens=False
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]
    
    if self.option =='train':
        y =row[1]
        return input_ids,attention_mask,y

    return input_ids, attention_mask


#%% Cross validation 

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5,shuffle=True,random_state=42)
folds=[]
for trn_idx,val_idx in skf.split(clean_train['text_sum'],clean_train['label']):
    folds.append((trn_idx,val_idx))
    
    
#%%
#model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels=3).to(device)
#%%
#model

#n=0
#for name, child in model.named_children():
#    if n==0:
#      h=0
#      for param in child.parameters():
#        if h<=328: #이부분 숫자 조절로 fine-tuning => Roberta229: h=229
#          param.requires_grad = False
#        h+=1
#    n+=1
    

In [6]:
best_models = []

for i,fold in enumerate(range(5)):
    print('===============',i+1,'fold start===============')
    model = ElectraForSequenceClassification.from_pretrained(pretrain,num_labels=3).to(device)
    model=nn.DataParallel(model).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    
    
    train_idx = folds[fold][0]
    valid_idx = folds[fold][1]
    train_data = clean_train.loc[trn_idx]
    val_data = clean_train.loc[valid_idx]
    train_dataset = CustomDataset(train_data,'train')
    valid_dataset = CustomDataset(val_data,'train')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    warmup_ratio = 0.1
    total_steps = len(train_loader) * num_epochs
    warmup_step = int(total_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=total_steps)
    valid_loss_min = 0.4
    valid_acc_max = 0.8
    
    for epoch in range(num_epochs):
        batches = 0
        total_loss = 0.0
        correct = 0
        total =0
        model.train()
        
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_mask = attention_masks_batch.to(device))[0]
            loss = F.cross_entropy(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(y_pred, 1)
            correct += (predicted == y_batch).sum()
            total += len(y_batch)
            batches += 1
            if batches % 100 == 0:
                print("Batch Loss: ", total_loss, "Accuracy: ", correct.float() / total)
      
        val_loss = []
        val_acc = []
        
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(valid_loader):
            
            model.eval()
            with torch.no_grad():
                
                y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
                valid_loss = F.cross_entropy(y_pred,y_batch.to(device)).cpu().detach().numpy()

                preds = torch.argmax(y_pred,1)
                preds = preds.cpu().detach().numpy()
                y_batch = y_batch.cpu().detach().numpy()
                batch_acc = (preds==y_batch).mean()
                val_loss.append(valid_loss)
                val_acc.append(batch_acc)
                
                
        val_loss = np.mean(val_loss)
        val_acc = np.mean(val_acc)
        scheduler.step()
        print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
        print(optimizer.param_groups[0]["lr"])
        if valid_acc_max < val_acc:
            valid_acc_max = val_acc
            best_models.append(model)
            print('model save, model val acc : ',val_acc)
            print('best_models size : ',len(best_models))




Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

 64%|██████▎   | 100/157 [04:44<02:41,  2.83s/it]

Batch Loss:  109.72513401508331 Accuracy:  tensor(0.3592, device='cuda:0')


100%|██████████| 157/157 [07:23<00:00,  2.82s/it]
100%|██████████| 40/40 [00:44<00:00,  1.10s/it]


Epoch: 0 - valid Loss: 1.095447 - valid_acc : 0.391797
1e-05


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  84.93394750356674 Accuracy:  tensor(0.6548, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:44<00:00,  1.10s/it]


Epoch: 1 - valid Loss: 0.375572 - valid_acc : 0.877344
9.999989977092514e-06
model save, model val acc :  0.87734375
best_models size :  1


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  38.87187622487545 Accuracy:  tensor(0.8676, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 2 - valid Loss: 0.227332 - valid_acc : 0.928711
9.999959908410236e-06
model save, model val acc :  0.9287109375
best_models size :  2


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  28.896242022514343 Accuracy:  tensor(0.9054, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:44<00:00,  1.10s/it]


Epoch: 3 - valid Loss: 0.151015 - valid_acc : 0.955273
9.999909794073715e-06
model save, model val acc :  0.9552734375
best_models size :  3


 64%|██████▎   | 100/157 [04:42<02:40,  2.82s/it]

Batch Loss:  21.035183809697628 Accuracy:  tensor(0.9318, device='cuda:0')


100%|██████████| 157/157 [07:21<00:00,  2.81s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 4 - valid Loss: 0.103415 - valid_acc : 0.972266
9.99983963428387e-06
model save, model val acc :  0.972265625
best_models size :  4


 64%|██████▎   | 100/157 [04:43<02:41,  2.84s/it]

Batch Loss:  16.350761868059635 Accuracy:  tensor(0.9497, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 5 - valid Loss: 0.080190 - valid_acc : 0.979102
9.999749429321982e-06
model save, model val acc :  0.9791015625
best_models size :  5


 64%|██████▎   | 100/157 [04:43<02:41,  2.83s/it]

Batch Loss:  13.364772409200668 Accuracy:  tensor(0.9600, device='cuda:0')


100%|██████████| 157/157 [07:23<00:00,  2.82s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 6 - valid Loss: 0.049668 - valid_acc : 0.988281
9.999639179549699e-06
model save, model val acc :  0.98828125
best_models size :  6


 64%|██████▎   | 100/157 [04:45<02:42,  2.86s/it]

Batch Loss:  9.881972555071115 Accuracy:  tensor(0.9706, device='cuda:0')


100%|██████████| 157/157 [07:26<00:00,  2.85s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 7 - valid Loss: 0.034038 - valid_acc : 0.992969
9.999508885409028e-06
model save, model val acc :  0.99296875
best_models size :  7


 64%|██████▎   | 100/157 [04:45<02:43,  2.87s/it]

Batch Loss:  7.731785601004958 Accuracy:  tensor(0.9769, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 8 - valid Loss: 0.025126 - valid_acc : 0.994336
9.999358547422342e-06
model save, model val acc :  0.9943359375
best_models size :  8


 64%|██████▎   | 100/157 [04:45<02:43,  2.87s/it]

Batch Loss:  7.021880384534597 Accuracy:  tensor(0.9808, device='cuda:0')


100%|██████████| 157/157 [07:26<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 9 - valid Loss: 0.020542 - valid_acc : 0.996289
9.999188166192368e-06
model save, model val acc :  0.9962890625
best_models size :  9


Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

Batch Loss:  109.83966147899628 Accuracy:  tensor(0.3388, device='cuda:0')


100%|██████████| 157/157 [07:26<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 0 - valid Loss: 1.098895 - valid_acc : 0.317773
1e-05


 64%|██████▎   | 100/157 [04:46<02:43,  2.87s/it]

Batch Loss:  91.56615382432938 Accuracy:  tensor(0.5885, device='cuda:0')


100%|██████████| 157/157 [07:27<00:00,  2.85s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 1 - valid Loss: 0.387070 - valid_acc : 0.872461
9.999989977092514e-06
model save, model val acc :  0.8724609375
best_models size :  10


 64%|██████▎   | 100/157 [04:44<02:42,  2.85s/it]

Batch Loss:  40.36764547228813 Accuracy:  tensor(0.8585, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 2 - valid Loss: 0.244025 - valid_acc : 0.923633
9.999959908410236e-06
model save, model val acc :  0.9236328125
best_models size :  11


 64%|██████▎   | 100/157 [04:43<02:41,  2.83s/it]

Batch Loss:  27.737796157598495 Accuracy:  tensor(0.9063, device='cuda:0')


100%|██████████| 157/157 [07:23<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 3 - valid Loss: 0.166544 - valid_acc : 0.950781
9.999909794073715e-06
model save, model val acc :  0.95078125
best_models size :  12


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  19.715318977832794 Accuracy:  tensor(0.9373, device='cuda:0')


100%|██████████| 157/157 [07:21<00:00,  2.81s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 4 - valid Loss: 0.117789 - valid_acc : 0.968164
9.99983963428387e-06
model save, model val acc :  0.9681640625
best_models size :  13


 64%|██████▎   | 100/157 [04:43<02:41,  2.83s/it]

Batch Loss:  15.99383869022131 Accuracy:  tensor(0.9525, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 5 - valid Loss: 0.082058 - valid_acc : 0.979688
9.999749429321982e-06
model save, model val acc :  0.9796875
best_models size :  14


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  11.793493317440152 Accuracy:  tensor(0.9656, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:44<00:00,  1.12s/it]


Epoch: 6 - valid Loss: 0.056030 - valid_acc : 0.986914
9.999639179549699e-06
model save, model val acc :  0.9869140625
best_models size :  15


 64%|██████▎   | 100/157 [04:43<02:41,  2.84s/it]

Batch Loss:  9.298936219885945 Accuracy:  tensor(0.9734, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 7 - valid Loss: 0.043384 - valid_acc : 0.989648
9.999508885409028e-06
model save, model val acc :  0.9896484375
best_models size :  16


 64%|██████▎   | 100/157 [04:43<02:41,  2.83s/it]

Batch Loss:  7.848069893196225 Accuracy:  tensor(0.9764, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:44<00:00,  1.10s/it]


Epoch: 8 - valid Loss: 0.032630 - valid_acc : 0.992383
9.999358547422342e-06
model save, model val acc :  0.9923828125
best_models size :  17


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  5.9981485633179545 Accuracy:  tensor(0.9824, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 9 - valid Loss: 0.023265 - valid_acc : 0.995313
9.999188166192368e-06
model save, model val acc :  0.9953125
best_models size :  18


Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

Batch Loss:  110.14424085617065 Accuracy:  tensor(0.3241, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 0 - valid Loss: 1.101079 - valid_acc : 0.320898
1e-05


 64%|██████▎   | 100/157 [04:45<02:42,  2.85s/it]

Batch Loss:  88.23669734597206 Accuracy:  tensor(0.6259, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 1 - valid Loss: 0.408190 - valid_acc : 0.865234
9.999989977092514e-06
model save, model val acc :  0.865234375
best_models size :  19


 64%|██████▎   | 100/157 [04:45<02:42,  2.85s/it]

Batch Loss:  39.407834976911545 Accuracy:  tensor(0.8649, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 2 - valid Loss: 0.248094 - valid_acc : 0.920898
9.999959908410236e-06
model save, model val acc :  0.9208984375
best_models size :  20


 64%|██████▎   | 100/157 [04:45<02:42,  2.85s/it]

Batch Loss:  28.281151354312897 Accuracy:  tensor(0.9083, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 3 - valid Loss: 0.172162 - valid_acc : 0.950195
9.999909794073715e-06
model save, model val acc :  0.9501953125
best_models size :  21


 64%|██████▎   | 100/157 [04:44<02:42,  2.85s/it]

Batch Loss:  21.664254255592823 Accuracy:  tensor(0.9321, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 4 - valid Loss: 0.123012 - valid_acc : 0.967773
9.99983963428387e-06
model save, model val acc :  0.9677734375
best_models size :  22


 64%|██████▎   | 100/157 [04:44<02:42,  2.85s/it]

Batch Loss:  16.9687410145998 Accuracy:  tensor(0.9470, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.11s/it]


Epoch: 5 - valid Loss: 0.089705 - valid_acc : 0.976367
9.999749429321982e-06
model save, model val acc :  0.9763671875
best_models size :  23


 64%|██████▎   | 100/157 [04:44<02:41,  2.83s/it]

Batch Loss:  12.101420897990465 Accuracy:  tensor(0.9638, device='cuda:0')


100%|██████████| 157/157 [07:23<00:00,  2.83s/it]
100%|██████████| 40/40 [00:44<00:00,  1.10s/it]


Epoch: 6 - valid Loss: 0.060783 - valid_acc : 0.985547
9.999639179549699e-06
model save, model val acc :  0.985546875
best_models size :  24


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  10.2985326834023 Accuracy:  tensor(0.9705, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 7 - valid Loss: 0.044113 - valid_acc : 0.991211
9.999508885409028e-06
model save, model val acc :  0.9912109375
best_models size :  25


 64%|██████▎   | 100/157 [04:42<02:41,  2.83s/it]

Batch Loss:  7.967084295116365 Accuracy:  tensor(0.9771, device='cuda:0')


100%|██████████| 157/157 [07:22<00:00,  2.82s/it]
100%|██████████| 40/40 [00:43<00:00,  1.10s/it]


Epoch: 8 - valid Loss: 0.031831 - valid_acc : 0.993164
9.999358547422342e-06
model save, model val acc :  0.9931640625
best_models size :  26


 64%|██████▎   | 100/157 [04:44<02:43,  2.87s/it]

Batch Loss:  6.659367703832686 Accuracy:  tensor(0.9801, device='cuda:0')


100%|██████████| 157/157 [07:25<00:00,  2.84s/it]
100%|██████████| 40/40 [00:44<00:00,  1.12s/it]


Epoch: 9 - valid Loss: 0.027649 - valid_acc : 0.993359
9.999188166192368e-06
model save, model val acc :  0.993359375
best_models size :  27


Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

RuntimeError: ignored

In [None]:
y_batch.shape

torch.Size([128])

In [7]:
temp

NameError: ignored

In [None]:
torch.load()