### Setting.py

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 4.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 47.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [4]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import ElectraModel, ElectraTokenizer, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [5]:
device = torch.device("cuda")
!nvidia-smi

Sun Feb 27 11:19:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
# Random Seed Fix
import random
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

In [7]:
def load_data(path):
  TRAIN = os.path.join(path, 'benchmark_train_data.csv')
  TEST = os.path.join(path, 'test_data.csv')
  SS = os.path.join(path, 'sample_submission.csv')
  AUG = os.path.join(path, 'aug_train_data.csv')
  label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
  train = pd.read_csv(TRAIN)
  test = pd.read_csv(TEST)
  sample_submission = pd.read_csv(SS)
  aug_train = pd.read_csv(AUG)
  train = pd.concat([train, aug_train.iloc[:10000]])
  #display(train)
  train['label'] = train['label'].map(label_dict)

  return train,test,sample_submission

def text_clean(df):
  df["premise_"] = "[CLS]" + df["premise"].astype(str) + "[SEP]"
  #df["premise_"] = df["premise"].astype(str)
  #df["hypothesis_"] = df["hypothesis"].astype(str) + "[SEP]"
  df["hypothesis_"] = df["hypothesis"].astype(str)
  df["text_sum"] = df.premise_ + " " + df.hypothesis_
  df = df[['text_sum','label']]
  df = df.reset_index()
  df = df.drop('index', axis = 1)
  return df 

def random_deletion(sentence, p=0.2):
    words = sentence.split ()
    n = len (words)
    if n == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))
    #print (remaining) 
    if len(remaining) == 0: # if not left, sample a random word
        return ' '.join ([random.choice(words)])
    else:
        return ' '.join (remaining)

def random_swap(sentence, n=2):
    sentence = sentence.split () 
    length = range(len(sentence))
    swapped = []
    if len(sentence) >2:
      for _ in range(n):
          idx1, idx2 = random.sample(length, 2)
          swapped.append ([sentence[idx1], sentence[idx2]])
          sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return ' '.join (sentence)

def eda_aug(df):

    cache = {'premise':[], 'hypothesis':[], 'label':[]}
    for idx in tqdm(range(len(df))):
        premise = df.iloc[idx]['premise']
        hypothesis = df.iloc[idx]['hypothesis']
        label = df.iloc[idx]['label']
        cache['premise'].append(premise)
        cache['hypothesis'].append(hypothesis)
        cache['label'].append(label)
        flag = random.randrange(10)
        if flag < 2:
          cache['premise'].append(random_deletion(premise))
          cache['hypothesis'].append(random_deletion(hypothesis))
          cache['label'].append(label)
          cache['premise'].append(random_swap(premise))
          cache['hypothesis'].append(random_swap(hypothesis))
          cache['label'].append(label)
    
    return pd.DataFrame(cache)

### Dataset.py

In [8]:
#ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI'
ROOT = '/content/drive/Shareddrives/Dacon/data'
train,test,sample_submission = load_data(ROOT)
###### AUGMENTATION ######
#train = eda_aug(train)
###### AUGMENTATION ######

clean_train,clean_test  = text_clean(train),text_clean(test)
display(clean_train)

Unnamed: 0,text_sum,label
0,"[CLS]씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나...",1
1,[CLS]삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 ...,1
2,[CLS]이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.[SEP] 예측적 ...,0
3,[CLS]광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민...,2
4,"[CLS]진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 ...",2
...,...,...
37993,[CLS]2000년 집권한 로랑 그바그보 대통령은 지난해 11월 대선에서 패배해 재...,2
37994,[CLS]시부야와 롯데로 이사하는 것이 좋습니다.[SEP] 시부야나 롯데로만 이동할...,2
37995,"[CLS]다만 지리산, 설악산, 덕유산 등 전국 국립공원 내 쉼터 14곳은 탐방객의...",1
37996,[CLS]이것은 현대 세계에서 고군분투하는 한 청소년에 관한 영화입니다.[SEP] ...,0


In [9]:
model_electra = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tokenizer_electra =  ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

In [10]:
def electra_transform(text):
  transform = tokenizer_electra(text,
                               padding='max_length',
                               truncation=True,
                               max_length=256,
                               return_tensors='pt',
                                add_special_tokens=True)
  return transform

In [11]:
class customDataset(Dataset):
  def __init__(self,dataset,mode='train',transform=electra_transform):
    super(customDataset, self).__init__()
    self.mode = mode
    self.dataset = dataset
    self.transform = transform

  def __getitem__(self,idx):
    row = self.dataset.iloc[idx, 0:2].values
    text = row[0]
    tokens = self.transform(text)
    token_ids = tokens['input_ids'][0] # tensor of token ids
    attn_masks = tokens['attention_mask'][0]  # binary tensor with "0" for padded values and "1" for the other values
    token_type_ids = tokens['token_type_ids'][0]  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

    if self.mode == 'test':
      return token_ids,attn_masks,token_type_ids
    else: 
      labels = row[1]
      return token_ids,attn_masks,token_type_ids, labels
  
  def __len__(self):
    return(len(self.dataset))

### Model.py

In [12]:
class electraClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3,   ##클래스 수 조정##
                 params=None,
                 freeze_bert=False):
        super(electraClassifier, self).__init__()
        self.bert = bert
        self.freeze_bert=freeze_bert

        if self.freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        '''
        self.conv1 = nn.Conv1d(hidden_size, 2, 2, padding = 1)
        self.conv2 = nn.Conv1d(hidden_size, 2, 3, padding =2)
        self.conv3 = nn.Conv1d(hidden_size, 2, 4, padding = 3)
        self.batchnorm = nn.BatchNorm1d(2)
        self.activation = nn.Sigmoid()
        self.maxpool = nn.MaxPool1d(256)

        #self.classifier = nn.Linear(hidden_size , 256)
        self.dropout = nn.Dropout(p=0.5)
        #self.fc_layer1 = nn.Linear(256,128)
        self.fc_layer = nn.Linear(6,num_classes)
        '''
        self.classifier = nn.Linear(hidden_size , 256)
        self.dropout = nn.Dropout(p=0.5)
        self.fc_layer1 = nn.Linear(256,128)
        self.fc_layer2 = nn.Linear(128,num_classes)
    

    def forward(self, input_ids, attn_masks, token_type_ids):
        
        pooler = self.bert(input_ids, attn_masks, token_type_ids, return_dict=False)[0]
        '''
        pooler = torch.transpose(pooler, 1, 2)
        output1 = torch.squeeze(self.maxpool(self.conv1(self.dropout(pooler))))
        output2 = torch.squeeze(self.maxpool(self.conv2(self.dropout(pooler))))
        output3 = torch.squeeze(self.maxpool(self.conv3(self.dropout(pooler))))
        output1 = self.activation(self.batchnorm(output1))
        output2 = self.activation(self.batchnorm(output2))
        output3 = self.activation(self.batchnorm(output3))
        output = torch.concat([output1, output2, output3], axis = 1)
      

        output = self.fc_layer(output)
        return output
        '''
        output1 = self.classifier(pooler)
        output2 = self.fc_layer1(output1)
        output3 = self.fc_layer2(self.dropout(output2))
        return output3[:,-1]

### Train.py

In [13]:
###### HYPERPARMS ######
lr = 2e-5
batch_size=32
warmup_ratio = 0.06
num_epochs = 10
max_grad_norm = 1
log_interval = 200

In [14]:
############### CV ################
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5,shuffle=True,random_state=42)
folds=[]
for trn_idx,val_idx in skf.split(clean_train['text_sum'],clean_train['label']):
    folds.append((trn_idx,val_idx))

############### CV Training Config for WanDB###############
'''
import wandb
config = config = {
    'batch_size' : batch_size,
    'warmup_ratio' : warmup_ratio,
    'num_epochs' : num_epochs, 
    'max_grad_norm' : max_grad_norm,
    'log_interval' : log_interval,
    'learning_rate' : lr
}
wandb.init(
        project="YBIGTA20 DACON NLI",
        config = config
)
'''
########## START #############
best_models = []

for i,fold in enumerate(range(3,5)):
    model_electra = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
    print('===============',i+1,'fold start===============')
    model = electraClassifier(model_electra).to(device)
    model=nn.DataParallel(model).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    train_idx = folds[fold][0]
    valid_idx = folds[fold][1]
    train_data = clean_train.loc[train_idx]
    val_data = clean_train.loc[valid_idx]
    train_dataset = customDataset(train_data,'train')
    valid_dataset = customDataset(val_data,'train')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    total_steps = len(train_loader) * num_epochs
    warmup_step = int(total_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=total_steps)
    valid_acc_max = 0.8
    valid_loss_min = 0.4
    
    for epoch in range(num_epochs):
        batches = 0
        total_loss = 0.0
        correct = 0
        total =0
        train_acc = 0.0
        model.train()
        
        for input_ids_batch, attention_masks_batch, token_type_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
            #y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
            #y_pred = torch.mean(torch.transpose(model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device)),0,1),0)
            
            loss1 = F.cross_entropy(y_pred, y_batch)
            one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
            loss2 = F.mse_loss(y_pred, one_hot)
            loss3= F.hinge_embedding_loss(y_pred, one_hot)
            loss = 0.7*loss1+0.15*loss2+0.15*loss3
            '''
            loss = F.cross_entropy(y_pred,y_batch)
            '''
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            _, predicted = torch.max(y_pred, 1)
            batch_correct = (predicted == y_batch).sum()
            batch_acc = batch_correct.float() / len(y_batch)
            train_acc += batch_acc
            correct += batch_correct
            total += len(y_batch)
            batches += 1
            if batches % log_interval == 0:
                print("Batch Loss: ", total_loss / batches, "Accuracy: ", correct.float() / total)

      
        val_loss = []
        val_acc = []
        
        for input_ids_batch, attention_masks_batch, token_type_batch, y_batch in tqdm(valid_loader):
            
            model.eval()
            with torch.no_grad():
                y_batch = y_batch.to(device)
                y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
                #y_pred = torch.mean(torch.transpose(model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device)),0,1),0)
                
                valid_loss1 = F.cross_entropy(y_pred, y_batch)
                valid_one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
                valid_loss2 = F.mse_loss(y_pred, valid_one_hot)
                valid_loss3= F.hinge_embedding_loss(y_pred, valid_one_hot)
                valid_loss = 0.7*valid_loss1+0.15*valid_loss2+0.15*valid_loss3
                
                '''
                valid_loss = F.cross_entropy(y_pred,y_batch)
                '''
                valid_loss = valid_loss.cpu().detach().numpy()

                preds = torch.argmax(y_pred,1)
                preds = preds.cpu().detach().numpy()
                y_batch = y_batch.cpu().detach().numpy()
                batch_acc = (preds==y_batch).mean()
                val_loss.append(valid_loss)
                val_acc.append(batch_acc)
                
                
        val_loss = np.mean(val_loss)
        val_acc = np.mean(val_acc)

        print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
        #wandb.log({"train": {"loss": total_loss/batches, "acc": train_acc/batches}, "val" : {"loss": val_loss, "acc": val_acc}})
        print("-"*20)

        if valid_acc_max < val_acc:
            valid_acc_max = val_acc
            best_models.append(model)
            torch.save(model.state_dict(), f'/content/drive/Shareddrives/Dacon/saved_models/Junha/electraFC_5CV_customloss_BTAug/electra-fold{i+1}-epoch{epoch+1}.pth') 
            print('model save, model val acc : ',val_acc)
            print('best_models size : ',len(best_models))

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




 21%|██        | 200/950 [02:41<10:02,  1.25it/s]

Batch Loss:  0.9203529638051987 Accuracy:  tensor(0.3486, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:22,  1.24it/s]

Batch Loss:  0.8348684325814247 Accuracy:  tensor(0.5096, device='cuda:0')


 63%|██████▎   | 600/950 [08:02<04:41,  1.24it/s]

Batch Loss:  0.764901961684227 Accuracy:  tensor(0.6050, device='cuda:0')


 84%|████████▍ | 800/950 [10:43<02:00,  1.24it/s]

Batch Loss:  0.7219964623078704 Accuracy:  tensor(0.6597, device='cuda:0')


100%|██████████| 950/950 [12:44<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 0 - valid Loss: 0.565031 - valid_acc : 0.844223
--------------------
model save, model val acc :  0.8442226890756303
best_models size :  1


 21%|██        | 200/950 [02:40<10:01,  1.25it/s]

Batch Loss:  0.5297258861362935 Accuracy:  tensor(0.8836, device='cuda:0')


 42%|████▏     | 400/950 [05:21<07:23,  1.24it/s]

Batch Loss:  0.5267803380638361 Accuracy:  tensor(0.8841, device='cuda:0')


 63%|██████▎   | 600/950 [08:02<04:42,  1.24it/s]

Batch Loss:  0.5247862008710702 Accuracy:  tensor(0.8848, device='cuda:0')


 84%|████████▍ | 800/950 [10:43<02:00,  1.24it/s]

Batch Loss:  0.5229328293725848 Accuracy:  tensor(0.8859, device='cuda:0')


100%|██████████| 950/950 [12:44<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 1 - valid Loss: 0.551647 - valid_acc : 0.864855
--------------------
model save, model val acc :  0.8648546918767507
best_models size :  2


 21%|██        | 200/950 [02:40<10:04,  1.24it/s]

Batch Loss:  0.4616241391003132 Accuracy:  tensor(0.9397, device='cuda:0')


 42%|████▏     | 400/950 [05:21<07:22,  1.24it/s]

Batch Loss:  0.4628842809051275 Accuracy:  tensor(0.9380, device='cuda:0')


 63%|██████▎   | 600/950 [08:02<04:41,  1.24it/s]

Batch Loss:  0.46342336212595303 Accuracy:  tensor(0.9369, device='cuda:0')


 84%|████████▍ | 800/950 [10:43<02:00,  1.24it/s]

Batch Loss:  0.4633980384096503 Accuracy:  tensor(0.9377, device='cuda:0')


100%|██████████| 950/950 [12:44<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 2 - valid Loss: 0.549441 - valid_acc : 0.873162
--------------------
model save, model val acc :  0.8731617647058824
best_models size :  3


 21%|██        | 200/950 [02:40<10:04,  1.24it/s]

Batch Loss:  0.42882747530937193 Accuracy:  tensor(0.9664, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.4287276566773653 Accuracy:  tensor(0.9664, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.43058550213774044 Accuracy:  tensor(0.9636, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.4313053910806775 Accuracy:  tensor(0.9636, device='cuda:0')


100%|██████████| 950/950 [12:44<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 3 - valid Loss: 0.554972 - valid_acc : 0.873801
--------------------
model save, model val acc :  0.8738007703081233
best_models size :  4


 21%|██        | 200/950 [02:41<10:03,  1.24it/s]

Batch Loss:  0.4150963301956654 Accuracy:  tensor(0.9752, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.41547978229820726 Accuracy:  tensor(0.9752, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.41532094329595565 Accuracy:  tensor(0.9754, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.415462678745389 Accuracy:  tensor(0.9757, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 4 - valid Loss: 0.553537 - valid_acc : 0.881434
--------------------
model save, model val acc :  0.8814338235294118
best_models size :  5


 21%|██        | 200/950 [02:41<10:03,  1.24it/s]

Batch Loss:  0.4058632355928421 Accuracy:  tensor(0.9830, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:22,  1.24it/s]

Batch Loss:  0.4057289867848158 Accuracy:  tensor(0.9825, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.40724503189325334 Accuracy:  tensor(0.9817, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.40672916136682036 Accuracy:  tensor(0.9821, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.43it/s]


Epoch: 5 - valid Loss: 0.549307 - valid_acc : 0.884979
--------------------
model save, model val acc :  0.8849789915966386
best_models size :  6


 21%|██        | 200/950 [02:41<10:04,  1.24it/s]

Batch Loss:  0.3980135786533356 Accuracy:  tensor(0.9895, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.39848914474248887 Accuracy:  tensor(0.9887, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:41,  1.24it/s]

Batch Loss:  0.3985163414478302 Accuracy:  tensor(0.9888, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:01,  1.24it/s]

Batch Loss:  0.3990337575599551 Accuracy:  tensor(0.9884, device='cuda:0')


100%|██████████| 950/950 [12:44<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.42it/s]


Epoch: 6 - valid Loss: 0.546291 - valid_acc : 0.887605
--------------------
model save, model val acc :  0.8876050420168067
best_models size :  7


 21%|██        | 200/950 [02:41<10:04,  1.24it/s]

Batch Loss:  0.3957314100861549 Accuracy:  tensor(0.9895, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.3972774823755026 Accuracy:  tensor(0.9889, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.3967348971962929 Accuracy:  tensor(0.9895, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.39610439255833624 Accuracy:  tensor(0.9900, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.42it/s]


Epoch: 7 - valid Loss: 0.548073 - valid_acc : 0.887342
--------------------


 21%|██        | 200/950 [02:41<10:05,  1.24it/s]

Batch Loss:  0.3945192179083824 Accuracy:  tensor(0.9911, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:24,  1.24it/s]

Batch Loss:  0.3942207732051611 Accuracy:  tensor(0.9915, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.39374765535195666 Accuracy:  tensor(0.9917, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.3935217957571149 Accuracy:  tensor(0.9917, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 8 - valid Loss: 0.545423 - valid_acc : 0.889181
--------------------
model save, model val acc :  0.8891806722689075
best_models size :  8


 21%|██        | 200/950 [02:41<10:06,  1.24it/s]

Batch Loss:  0.39403068378567696 Accuracy:  tensor(0.9919, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:24,  1.24it/s]

Batch Loss:  0.3938963361084461 Accuracy:  tensor(0.9920, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.3936788447201252 Accuracy:  tensor(0.9922, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:01,  1.24it/s]

Batch Loss:  0.39335318088531496 Accuracy:  tensor(0.9922, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 9 - valid Loss: 0.545477 - valid_acc : 0.889049
--------------------


Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




 21%|██        | 200/950 [02:41<10:03,  1.24it/s]

Batch Loss:  0.919458811879158 Accuracy:  tensor(0.3330, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:25,  1.24it/s]

Batch Loss:  0.8520341339707375 Accuracy:  tensor(0.4762, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.7775904873013496 Accuracy:  tensor(0.5830, device='cuda:0')


 84%|████████▍ | 800/950 [10:45<02:00,  1.24it/s]

Batch Loss:  0.733822219632566 Accuracy:  tensor(0.6404, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 0 - valid Loss: 0.553412 - valid_acc : 0.856154
--------------------
model save, model val acc :  0.8561537114845938
best_models size :  9


 21%|██        | 200/950 [02:41<10:07,  1.23it/s]

Batch Loss:  0.5319315539300442 Accuracy:  tensor(0.8802, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:24,  1.24it/s]

Batch Loss:  0.5305883143097162 Accuracy:  tensor(0.8812, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.5262154061595599 Accuracy:  tensor(0.8851, device='cuda:0')


 84%|████████▍ | 800/950 [10:45<02:07,  1.17it/s]

Batch Loss:  0.5239548049494624 Accuracy:  tensor(0.8862, device='cuda:0')


100%|██████████| 950/950 [12:46<00:00,  1.24it/s]
100%|██████████| 238/238 [01:10<00:00,  3.40it/s]


Epoch: 1 - valid Loss: 0.532344 - valid_acc : 0.874589
--------------------
model save, model val acc :  0.8745885854341737
best_models size :  10


 21%|██        | 200/950 [02:41<10:07,  1.24it/s]

Batch Loss:  0.45931817442178724 Accuracy:  tensor(0.9425, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:22,  1.24it/s]

Batch Loss:  0.4637576839327812 Accuracy:  tensor(0.9385, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:41,  1.25it/s]

Batch Loss:  0.46414564952254295 Accuracy:  tensor(0.9381, device='cuda:0')


 84%|████████▍ | 800/950 [10:45<02:00,  1.24it/s]

Batch Loss:  0.4639366316795349 Accuracy:  tensor(0.9374, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 2 - valid Loss: 0.535444 - valid_acc : 0.882187
--------------------
model save, model val acc :  0.88218662464986
best_models size :  11


 21%|██        | 200/950 [02:41<10:03,  1.24it/s]

Batch Loss:  0.4361113879084587 Accuracy:  tensor(0.9594, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.43424203254282473 Accuracy:  tensor(0.9611, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:40,  1.25it/s]

Batch Loss:  0.43344556088248887 Accuracy:  tensor(0.9616, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.43244636099785566 Accuracy:  tensor(0.9620, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 3 - valid Loss: 0.534088 - valid_acc : 0.890196
--------------------
model save, model val acc :  0.8901960784313726
best_models size :  12


 21%|██        | 200/950 [02:41<10:03,  1.24it/s]

Batch Loss:  0.41812155693769454 Accuracy:  tensor(0.9733, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.4161714381724596 Accuracy:  tensor(0.9750, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:41,  1.24it/s]

Batch Loss:  0.41513257602850595 Accuracy:  tensor(0.9757, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.41610630184412 Accuracy:  tensor(0.9751, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.42it/s]


Epoch: 4 - valid Loss: 0.534684 - valid_acc : 0.890853
--------------------
model save, model val acc :  0.8908525910364146
best_models size :  13


 21%|██        | 200/950 [02:41<10:05,  1.24it/s]

Batch Loss:  0.40910679280757906 Accuracy:  tensor(0.9812, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.40827527552843096 Accuracy:  tensor(0.9818, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.4075771376490593 Accuracy:  tensor(0.9822, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.4072342150658369 Accuracy:  tensor(0.9823, device='cuda:0')


100%|██████████| 950/950 [12:44<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.42it/s]


Epoch: 5 - valid Loss: 0.536224 - valid_acc : 0.892034
--------------------
model save, model val acc :  0.8920343137254902
best_models size :  14


 21%|██        | 200/950 [02:41<10:05,  1.24it/s]

Batch Loss:  0.40295726627111433 Accuracy:  tensor(0.9858, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.40178956896066664 Accuracy:  tensor(0.9869, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.4015936707456907 Accuracy:  tensor(0.9868, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:01,  1.24it/s]

Batch Loss:  0.40077281583100555 Accuracy:  tensor(0.9873, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 6 - valid Loss: 0.533227 - valid_acc : 0.894135
--------------------
model save, model val acc :  0.8941351540616247
best_models size :  15


 21%|██        | 200/950 [02:41<10:05,  1.24it/s]

Batch Loss:  0.3947217157483101 Accuracy:  tensor(0.9912, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:23,  1.24it/s]

Batch Loss:  0.39517185240983965 Accuracy:  tensor(0.9908, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:42,  1.24it/s]

Batch Loss:  0.39478850523630776 Accuracy:  tensor(0.9911, device='cuda:0')


 84%|████████▍ | 800/950 [10:45<02:01,  1.24it/s]

Batch Loss:  0.39533060912042856 Accuracy:  tensor(0.9908, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.41it/s]


Epoch: 7 - valid Loss: 0.533485 - valid_acc : 0.895842
--------------------
model save, model val acc :  0.8958420868347339
best_models size :  16


 21%|██        | 200/950 [02:41<10:04,  1.24it/s]

Batch Loss:  0.3934261578321457 Accuracy:  tensor(0.9914, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:22,  1.24it/s]

Batch Loss:  0.39422429718077184 Accuracy:  tensor(0.9910, device='cuda:0')


 63%|██████▎   | 600/950 [08:04<04:41,  1.24it/s]

Batch Loss:  0.3948386699457963 Accuracy:  tensor(0.9908, device='cuda:0')


 84%|████████▍ | 800/950 [10:45<02:01,  1.24it/s]

Batch Loss:  0.3944942070543766 Accuracy:  tensor(0.9911, device='cuda:0')


100%|██████████| 950/950 [12:46<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.42it/s]


Epoch: 8 - valid Loss: 0.532626 - valid_acc : 0.896630
--------------------
model save, model val acc :  0.8966299019607843
best_models size :  17


 21%|██        | 200/950 [02:41<10:05,  1.24it/s]

Batch Loss:  0.393058717250824 Accuracy:  tensor(0.9919, device='cuda:0')


 42%|████▏     | 400/950 [05:22<07:25,  1.24it/s]

Batch Loss:  0.3925216380506754 Accuracy:  tensor(0.9924, device='cuda:0')


 63%|██████▎   | 600/950 [08:03<04:41,  1.24it/s]

Batch Loss:  0.39225667983293533 Accuracy:  tensor(0.9927, device='cuda:0')


 84%|████████▍ | 800/950 [10:44<02:00,  1.24it/s]

Batch Loss:  0.3926657149195671 Accuracy:  tensor(0.9923, device='cuda:0')


100%|██████████| 950/950 [12:45<00:00,  1.24it/s]
100%|██████████| 238/238 [01:09<00:00,  3.42it/s]


Epoch: 9 - valid Loss: 0.532861 - valid_acc : 0.897418
--------------------
model save, model val acc :  0.8974177170868348
best_models size :  18


In [None]:
 #### non-CV ###
best_models = []
model = electraClassifier(model_electra).to(device)
model=nn.DataParallel(model).to(device)
optimizer = AdamW(model.parameters(), lr=lr)

random_idx = random.sample(range(len(clean_train)), len(clean_train))
train_idx = random_idx[:34000]
val_idx = random_idx[34000:]
print(len(train_idx), len(val_idx))
print(train_idx[:5], val_idx[:5])
train_data = clean_train.iloc[train_idx]
val_data = clean_train.iloc[val_idx]

train_dataset = customDataset(train_data,'train')
valid_dataset = customDataset(val_data,'train')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
total_steps = len(train_loader) * num_epochs
warmup_step = int(total_steps * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=total_steps)


for epoch in range(num_epochs):
    batches = 0
    total_loss = 0.0
    correct = 0
    total =0
    model.train()
    
    for input_ids_batch, attention_masks_batch, token_type_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
        '''
        loss = F.cross_entropy(y_pred, y_batch)
        '''
        
        one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
        loss1 = F.cross_entropy(y_pred, one_hot)
        loss2 = F.mse_loss(y_pred, one_hot)
        loss3= F.hinge_embedding_loss(y_pred, one_hot)
        loss = 0.7*loss1+0.15*loss2+0.15*loss3
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)
        batches += 1
        if batches % log_interval == 0:
            print("Batch Loss: ", total_loss / batches, "Accuracy: ", correct.float() / total)
  
    val_loss = []
    val_acc = []
    valid_acc_max = 0.8
    valid_loss_min = 0.4
    
    for input_ids_batch, attention_masks_batch,  token_type_batch, y_batch in tqdm(valid_loader):
        
        model.eval()
        with torch.no_grad():
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
            '''
            valid_loss = F.cross_entropy(y_pred,y_batch)
            '''

            valid_loss1 = F.cross_entropy(y_pred, y_batch)
            valid_one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
            valid_loss2 = F.mse_loss(y_pred, valid_one_hot)
            valid_loss3= F.hinge_embedding_loss(y_pred, valid_one_hot)
            valid_loss = 0.7*valid_loss1+0.15*valid_loss2+0.15*valid_loss3
            
            valid_loss = valid_loss.cpu().detach().numpy()

            preds = torch.argmax(y_pred,1)
            preds = preds.cpu().detach().numpy()
            y_batch = y_batch.cpu().detach().numpy()
            batch_acc = (preds==y_batch).mean()
            val_loss.append(valid_loss)
            val_acc.append(batch_acc)
            
            
    val_loss = np.mean(val_loss)
    val_acc = np.mean(val_acc)
    
    print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
    print(optimizer.param_groups[0]["lr"])
    if valid_acc_max < val_acc:
        valid_acc_max = val_acc
        best_models.append(model)
        torch.save(model.state_dict(), f'/content/drive/Shareddrives/Dacon/saved_models/Junha/electraFC_newdropout_customloss_BTAug/electra-{len(best_models)}.pth') 
        print('model save, model val acc : ',val_acc)
        print('best_models size : ',len(best_models))

34000 3998
[14208, 18724, 10969, 28914, 6023] [10415, 15165, 10824, 13596, 13711]


 19%|█▉        | 200/1063 [02:41<11:34,  1.24it/s]

Batch Loss:  0.9222558909654617 Accuracy:  tensor(0.3384, device='cuda:0')


 38%|███▊      | 400/1063 [05:22<08:52,  1.25it/s]

Batch Loss:  0.8317950884997845 Accuracy:  tensor(0.5180, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:12,  1.24it/s]

Batch Loss:  0.7580851187805334 Accuracy:  tensor(0.6158, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:32,  1.24it/s]

Batch Loss:  0.7155147766321898 Accuracy:  tensor(0.6690, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:24<00:50,  1.24it/s]

Batch Loss:  0.6878987230360508 Accuracy:  tensor(0.7023, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.40it/s]


Epoch: 0 - valid Loss: 0.547462 - valid_acc : 0.856217
1.9910453637637563e-05
model save, model val acc :  0.8562166666666667
best_models size :  1


 19%|█▉        | 200/1063 [02:40<11:32,  1.25it/s]

Batch Loss:  0.5214511847496033 Accuracy:  tensor(0.8866, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:53,  1.24it/s]

Batch Loss:  0.5204580419510603 Accuracy:  tensor(0.8895, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:13,  1.24it/s]

Batch Loss:  0.5199909548461438 Accuracy:  tensor(0.8898, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:31,  1.24it/s]

Batch Loss:  0.5171849774196744 Accuracy:  tensor(0.8917, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:24<00:50,  1.24it/s]

Batch Loss:  0.5151413538753986 Accuracy:  tensor(0.8938, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.40it/s]


Epoch: 1 - valid Loss: 0.527299 - valid_acc : 0.882733
1.8924222787277088e-05
model save, model val acc :  0.8827333333333334
best_models size :  2


 19%|█▉        | 200/1063 [02:40<11:34,  1.24it/s]

Batch Loss:  0.45706945925951004 Accuracy:  tensor(0.9436, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:53,  1.24it/s]

Batch Loss:  0.45963285319507124 Accuracy:  tensor(0.9419, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:12,  1.24it/s]

Batch Loss:  0.45905622839927673 Accuracy:  tensor(0.9417, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:31,  1.24it/s]

Batch Loss:  0.458862695209682 Accuracy:  tensor(0.9419, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:23<00:50,  1.24it/s]

Batch Loss:  0.45895812621712684 Accuracy:  tensor(0.9417, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.41it/s]


Epoch: 2 - valid Loss: 0.536681 - valid_acc : 0.884950
1.6950577871284738e-05
model save, model val acc :  0.88495
best_models size :  3


 19%|█▉        | 200/1063 [02:40<11:34,  1.24it/s]

Batch Loss:  0.4240660049021244 Accuracy:  tensor(0.9688, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:53,  1.24it/s]

Batch Loss:  0.42801901906728745 Accuracy:  tensor(0.9660, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:12,  1.24it/s]

Batch Loss:  0.42869012013077734 Accuracy:  tensor(0.9660, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:31,  1.24it/s]

Batch Loss:  0.4291017931327224 Accuracy:  tensor(0.9656, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:24<00:50,  1.24it/s]

Batch Loss:  0.4290202032327652 Accuracy:  tensor(0.9656, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.39it/s]


Epoch: 3 - valid Loss: 0.526877 - valid_acc : 0.893250
1.4207891379575496e-05
model save, model val acc :  0.89325
best_models size :  4


 19%|█▉        | 200/1063 [02:40<11:34,  1.24it/s]

Batch Loss:  0.4144554953277111 Accuracy:  tensor(0.9758, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:55,  1.24it/s]

Batch Loss:  0.4141665160655975 Accuracy:  tensor(0.9771, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:12,  1.24it/s]

Batch Loss:  0.4145042253037294 Accuracy:  tensor(0.9766, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:31,  1.24it/s]

Batch Loss:  0.4135107536613941 Accuracy:  tensor(0.9774, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:24<00:50,  1.24it/s]

Batch Loss:  0.4141527355611324 Accuracy:  tensor(0.9767, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.41it/s]


Epoch: 4 - valid Loss: 0.534496 - valid_acc : 0.893000
1.0999625841501848e-05
model save, model val acc :  0.893
best_models size :  5


 19%|█▉        | 200/1063 [02:40<11:34,  1.24it/s]

Batch Loss:  0.405511916577816 Accuracy:  tensor(0.9848, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:52,  1.24it/s]

Batch Loss:  0.4053132059425116 Accuracy:  tensor(0.9846, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:12,  1.24it/s]

Batch Loss:  0.40535443857312203 Accuracy:  tensor(0.9845, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:32,  1.24it/s]

Batch Loss:  0.4054632551595569 Accuracy:  tensor(0.9844, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:24<00:50,  1.24it/s]

Batch Loss:  0.40586856561899187 Accuracy:  tensor(0.9839, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.40it/s]


Epoch: 5 - valid Loss: 0.529314 - valid_acc : 0.897983
7.680757437520913e-06
model save, model val acc :  0.8979833333333334
best_models size :  6


 19%|█▉        | 200/1063 [02:40<11:33,  1.25it/s]

Batch Loss:  0.399744575470686 Accuracy:  tensor(0.9866, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:52,  1.24it/s]

Batch Loss:  0.4012412401288748 Accuracy:  tensor(0.9862, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:12,  1.24it/s]

Batch Loss:  0.39873578970630963 Accuracy:  tensor(0.9879, device='cuda:0')


 75%|███████▌  | 800/1063 [10:42<03:32,  1.24it/s]

Batch Loss:  0.39947493609040974 Accuracy:  tensor(0.9876, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:23<00:50,  1.25it/s]

Batch Loss:  0.3998158380687237 Accuracy:  tensor(0.9873, device='cuda:0')


100%|██████████| 1063/1063 [14:13<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.40it/s]


Epoch: 6 - valid Loss: 0.525785 - valid_acc : 0.897733
4.618499920828534e-06
model save, model val acc :  0.8977333333333334
best_models size :  7


 19%|█▉        | 200/1063 [02:40<11:34,  1.24it/s]

Batch Loss:  0.39486975148320197 Accuracy:  tensor(0.9908, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:53,  1.24it/s]

Batch Loss:  0.3941223635524511 Accuracy:  tensor(0.9916, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:12,  1.24it/s]

Batch Loss:  0.3955620159208775 Accuracy:  tensor(0.9907, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:31,  1.25it/s]

Batch Loss:  0.39561078753322365 Accuracy:  tensor(0.9906, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:23<00:50,  1.25it/s]

Batch Loss:  0.39570266482234 Accuracy:  tensor(0.9905, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.40it/s]


Epoch: 7 - valid Loss: 0.523836 - valid_acc : 0.900467
2.1516745217597245e-06
model save, model val acc :  0.9004666666666667
best_models size :  8


 19%|█▉        | 200/1063 [02:40<11:32,  1.25it/s]

Batch Loss:  0.39344362005591393 Accuracy:  tensor(0.9919, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:53,  1.24it/s]

Batch Loss:  0.3934659627825022 Accuracy:  tensor(0.9919, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:11,  1.25it/s]

Batch Loss:  0.3937191455066204 Accuracy:  tensor(0.9918, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:31,  1.24it/s]

Batch Loss:  0.39413722325116396 Accuracy:  tensor(0.9914, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:23<00:50,  1.24it/s]

Batch Loss:  0.3940484247505665 Accuracy:  tensor(0.9915, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.40it/s]


Epoch: 8 - valid Loss: 0.524291 - valid_acc : 0.900483
5.532213219954508e-07
model save, model val acc :  0.9004833333333333
best_models size :  9


 19%|█▉        | 200/1063 [02:40<11:32,  1.25it/s]

Batch Loss:  0.39482695013284685 Accuracy:  tensor(0.9914, device='cuda:0')


 38%|███▊      | 400/1063 [05:21<08:54,  1.24it/s]

Batch Loss:  0.3931886663287878 Accuracy:  tensor(0.9921, device='cuda:0')


 56%|█████▋    | 600/1063 [08:02<06:10,  1.25it/s]

Batch Loss:  0.3930482572813829 Accuracy:  tensor(0.9922, device='cuda:0')


 75%|███████▌  | 800/1063 [10:43<03:31,  1.24it/s]

Batch Loss:  0.39354297187179327 Accuracy:  tensor(0.9918, device='cuda:0')


 94%|█████████▍| 1000/1063 [13:23<00:50,  1.24it/s]

Batch Loss:  0.39319370713829993 Accuracy:  tensor(0.9922, device='cuda:0')


100%|██████████| 1063/1063 [14:14<00:00,  1.24it/s]
100%|██████████| 125/125 [00:36<00:00,  3.40it/s]


Epoch: 9 - valid Loss: 0.523870 - valid_acc : 0.900483
0.0
model save, model val acc :  0.9004833333333333
best_models size :  10


In [None]:
!mkdir /content/drive/Shareddrives/Dacon/saved_models/Junha/electraFC_newdropout_customloss_BTAug

In [None]:
!pip install wandb

In [None]:
!mkdir /content/drive/Shareddrives/Dacon/Junha_custom/saved_models/electra_Benchmark_noCV_CNN_customloss

In [None]:
# koelectra-4k번 모델이 가장 성능이 좋은 것으로 가정
test_dataset = customDataset(clean_test,'test')
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

preds = dict()
for idx, m in enumerate(best_models):
    #if idx % num_epochs == 2 or idx % num_epochs == 3 or idx % num_epochs == 4:
    if idx == 6:
      print(f'{idx+1} 번째 모델 예측 진행중')
      bestm = m
      bestm.eval()
      answer = []
      with torch.no_grad():
          for input_ids_batch, attention_masks_batch, token_type_batch  in tqdm(test_loader):
              y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device)).detach().cpu().numpy()
              answer.extend(y_pred)
      preds[idx+1] = answer

7 번째 모델 예측 진행중


100%|██████████| 27/27 [00:16<00:00,  1.61it/s]


In [None]:
### SINGLE PREDICTION ###
temp = np.zeros((1666, 3))
for key in preds.keys():
  x = np.array(preds[key])
  max = np.max(x,axis=1,keepdims=True) #returns max of each row and keeps same dims
  e_x = np.exp(x - max) #subtracts each row with its max value
  sum = np.sum(e_x,axis=1,keepdims=True) #returns sum of each row and keeps same dims
  f_x = e_x / sum 
  temp += f_x
temp = temp/3
softvoted_prob = pd.DataFrame(temp)
softvoted_pred = pd.DataFrame(np.argmax(temp, axis=1))
decode_map = {0 : "entailment" , 1 :  "contradiction" , 2 : "neutral" }
sample_submission['label'] = softvoted_pred
sample_submission['label'] = sample_submission['label'].map(decode_map)
sample_submission.to_csv('submission_KoELECTRACNN_BTAUG.csv', index = False)