In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 7.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 53.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [3]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [4]:
# Random Seed Fix
import random
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

In [5]:
device = torch.device("cuda")
!nvidia-smi

Sat Feb 19 04:52:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
############# HYPERPARMS ##############
num_epochs = 7
batch_size =128
lr = 0.00001
pretrain = "monologg/koelectra-base-v3-discriminator"

In [7]:
def load_data(path):
  TRAIN = os.path.join(path, 'benchmark_train_data.csv')
  TEST = os.path.join(path, 'test_data.csv')
  SS = os.path.join(path, 'sample_submission.csv')
  label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
  train = pd.read_csv(TRAIN)
  test = pd.read_csv(TEST)
  sample_submission = pd.read_csv(SS)
  train['label'] = train['label'].map(label_dict)

  return train,test,sample_submission

def text_clean(df):
  df["premise_"] = "[CLS]" + df["premise"] + "[SEP]"
  df["hypothesis_"] = df["hypothesis"] + "[SEP]"
  df["text_sum"] = df.premise_ + " " + df.hypothesis_
  df = df[['text_sum','label']]
  return df 

#ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI'
ROOT = '/content/drive/Shareddrives/Dacon/data'
train,test,sample_submission = load_data(ROOT)
clean_train,clean_test  = text_clean(train),text_clean(test)

In [8]:
############# Dataset ##############
class CustomDataset(Dataset):
  
  def __init__(self,dataset,option):
    
    self.dataset = dataset 
    self.option = option
    self.tokenizer = AutoTokenizer.from_pretrained(pretrain)

  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 0:2].values
    text = row[0]
    #y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=70,
        pad_to_max_length=True,
        add_special_tokens=False
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]
    
    if self.option =='train':
        y =row[1]
        return input_ids,attention_mask,y

    return input_ids, attention_mask

In [None]:
############### CV ################
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 1,shuffle=True,random_state=42)
folds=[]
for trn_idx,val_idx in skf.split(clean_train['text_sum'],clean_train['label']):
    folds.append((trn_idx,val_idx))

ValueError: ignored

In [9]:
### NON - CV ###
best_models = []

model = ElectraForSequenceClassification.from_pretrained(pretrain,num_labels=3).to(device)
model=nn.DataParallel(model).to(device)
optimizer = AdamW(model.parameters(), lr=lr)

random_idx = random.sample(range(len(clean_train)), len(clean_train))
train_idx = random_idx[:24000]
val_idx = random_idx[24000:]
print(len(train_idx), len(val_idx))
print(train_idx[:5], val_idx[:5])
train_data = clean_train.iloc[train_idx]
val_data = clean_train.iloc[val_idx]

train_dataset = CustomDataset(train_data,'train')
valid_dataset = CustomDataset(val_data,'train')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

warmup_ratio = 0.1
total_steps = len(train_loader) * num_epochs
warmup_step = int(total_steps * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=total_steps)
valid_loss_min = 0.4
valid_acc_max = 0.8

for epoch in range(num_epochs):
    batches = 0
    total_loss = 0.0
    correct = 0
    total =0
    model.train()
    
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask = attention_masks_batch.to(device))[0]
        #print(y_pred)
        #loss1 = F.cross_entropy(y_pred, y_batch)
        #loss2= F.hinge_embedding_loss(y_pred, y_batch)
        #loss = 0.7*loss1+0.3*loss2
        loss1 = F.cross_entropy(y_pred, y_batch)
        one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
        loss2 = F.mse_loss(y_pred, one_hot)
        loss3= F.hinge_embedding_loss(y_pred, one_hot)
        loss = 0.7*loss1+0.15*loss2+0.15*loss3
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)
        batches += 1
        if batches % 100 == 0:
            print("Batch Loss: ", total_loss, "Accuracy: ", correct.float() / total)
  
    val_loss = []
    val_acc = []
    
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(valid_loader):
        
        model.eval()
        with torch.no_grad():
            
            y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
            valid_loss = F.cross_entropy(y_pred,y_batch.to(device)).cpu().detach().numpy()

            preds = torch.argmax(y_pred,1)
            preds = preds.cpu().detach().numpy()
            y_batch = y_batch.cpu().detach().numpy()
            batch_acc = (preds==y_batch).mean()
            val_loss.append(valid_loss)
            val_acc.append(batch_acc)
            
            
    val_loss = np.mean(val_loss)
    val_acc = np.mean(val_acc)
    scheduler.step()
    print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
    print(optimizer.param_groups[0]["lr"])
    if valid_acc_max < val_acc:
        valid_acc_max = val_acc
        best_models.append(model)
        torch.save(model, f'koelectra-{len(best_models)}.pth') 
        print('model save, model val acc : ',val_acc)
        print('best_models size : ',len(best_models))

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

24000 3998
[20952, 3648, 819, 24299, 9012] [20788, 5643, 1886, 3907, 5200]


Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

 53%|█████▎    | 100/188 [02:12<02:01,  1.38s/it]

Batch Loss:  91.74550002813339 Accuracy:  tensor(0.3509, device='cuda:0')


100%|██████████| 188/188 [04:15<00:00,  1.36s/it]
100%|██████████| 32/32 [00:17<00:00,  1.84it/s]


Epoch: 0 - valid Loss: 1.095412 - valid_acc : 0.388949
1e-05


 53%|█████▎    | 100/188 [02:24<02:07,  1.45s/it]

Batch Loss:  76.73995506763458 Accuracy:  tensor(0.6499, device='cuda:0')


100%|██████████| 188/188 [04:32<00:00,  1.45s/it]
100%|██████████| 32/32 [00:17<00:00,  1.79it/s]


Epoch: 1 - valid Loss: 0.475376 - valid_acc : 0.844824
9.999985731180576e-06
model save, model val acc :  0.84482421875
best_models size :  1


 53%|█████▎    | 100/188 [02:25<02:08,  1.46s/it]

Batch Loss:  53.01206216216087 Accuracy:  tensor(0.8727, device='cuda:0')


100%|██████████| 188/188 [04:33<00:00,  1.46s/it]
100%|██████████| 32/32 [00:17<00:00,  1.79it/s]


Epoch: 2 - valid Loss: 0.431983 - valid_acc : 0.860270
9.999942924803747e-06
model save, model val acc :  0.8602701822916667
best_models size :  2


 53%|█████▎    | 100/188 [02:26<02:08,  1.47s/it]

Batch Loss:  48.85580110549927 Accuracy:  tensor(0.9100, device='cuda:0')


100%|██████████| 188/188 [04:33<00:00,  1.46s/it]
100%|██████████| 32/32 [00:17<00:00,  1.80it/s]


Epoch: 3 - valid Loss: 0.431537 - valid_acc : 0.864909
9.99987158111383e-06
model save, model val acc :  0.8649088541666667
best_models size :  3


 53%|█████▎    | 100/188 [02:25<02:08,  1.46s/it]

Batch Loss:  45.86799883842468 Accuracy:  tensor(0.9345, device='cuda:0')


100%|██████████| 188/188 [04:33<00:00,  1.46s/it]
100%|██████████| 32/32 [00:17<00:00,  1.79it/s]


Epoch: 4 - valid Loss: 0.412842 - valid_acc : 0.872656
9.999771700518019e-06
model save, model val acc :  0.87265625
best_models size :  4


 53%|█████▎    | 100/188 [02:26<02:08,  1.46s/it]

Batch Loss:  44.54069623351097 Accuracy:  tensor(0.9480, device='cuda:0')


100%|██████████| 188/188 [04:33<00:00,  1.46s/it]
100%|██████████| 32/32 [00:17<00:00,  1.80it/s]


Epoch: 5 - valid Loss: 0.413134 - valid_acc : 0.878955
9.99964328358639e-06
model save, model val acc :  0.878955078125
best_models size :  5


 53%|█████▎    | 100/188 [02:25<02:08,  1.46s/it]

Batch Loss:  42.533296912908554 Accuracy:  tensor(0.9639, device='cuda:0')


100%|██████████| 188/188 [04:33<00:00,  1.46s/it]
100%|██████████| 32/32 [00:17<00:00,  1.80it/s]

Epoch: 6 - valid Loss: 0.416558 - valid_acc : 0.875651
9.99948633105188e-06





In [None]:
best_models = []

for i,fold in enumerate(range(5)):
    print('===============',i+1,'fold start===============')
    model = ElectraForSequenceClassification.from_pretrained(pretrain,num_labels=3).to(device)
    model=nn.DataParallel(model).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    
    train_idx = folds[fold][0]
    valid_idx = folds[fold][1]
    train_data = clean_train.loc[trn_idx]
    val_data = clean_train.loc[valid_idx]
    train_dataset = CustomDataset(train_data,'train')
    valid_dataset = CustomDataset(val_data,'train')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    warmup_ratio = 0.1
    total_steps = len(train_loader) * num_epochs
    warmup_step = int(total_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=total_steps)
    valid_loss_min = 0.4
    valid_acc_max = 0.8
    
    for epoch in range(num_epochs):
        batches = 0
        total_loss = 0.0
        correct = 0
        total =0
        model.train()
        
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_mask = attention_masks_batch.to(device))[0]
            loss = F.cross_entropy(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(y_pred, 1)
            correct += (predicted == y_batch).sum()
            total += len(y_batch)
            batches += 1
            if batches % 100 == 0:
                print("Batch Loss: ", total_loss, "Accuracy: ", correct.float() / total)
      
        val_loss = []
        val_acc = []
        
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(valid_loader):
            
            model.eval()
            with torch.no_grad():
                
                y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
                valid_loss = F.cross_entropy(y_pred,y_batch.to(device)).cpu().detach().numpy()

                preds = torch.argmax(y_pred,1)
                preds = preds.cpu().detach().numpy()
                y_batch = y_batch.cpu().detach().numpy()
                batch_acc = (preds==y_batch).mean()
                val_loss.append(valid_loss)
                val_acc.append(batch_acc)
                
                
        val_loss = np.mean(val_loss)
        val_acc = np.mean(val_acc)
        scheduler.step()
        print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
        print(optimizer.param_groups[0]["lr"])
        if valid_acc_max < val_acc:
            valid_acc_max = val_acc
            best_models.append(model)
            torch.save(model.state_dict(), f'/content/drive/Shareddrives/Dacon/saved models/Junha/Electra_Benchmark/koelectra-{len(best_models)}.pth') 
            print('model save, model val acc : ',val_acc)
            print('best_models size : ',len(best_models))



Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

Batch Loss:  109.94188272953033 Accuracy:  tensor(0.3420, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 0 - valid Loss: 1.097843 - valid_acc : 0.355517
1e-05


 68%|██████▊   | 100/146 [04:37<02:07,  2.78s/it]

Batch Loss:  85.54311275482178 Accuracy:  tensor(0.6455, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 1 - valid Loss: 0.398908 - valid_acc : 0.864107
9.999988408783906e-06
model save, model val acc :  0.8641067717480388
best_models size :  1


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  39.67155250906944 Accuracy:  tensor(0.8632, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 2 - valid Loss: 0.243975 - valid_acc : 0.922824
9.99995363518936e-06
model save, model val acc :  0.9228244643484369
best_models size :  2


 68%|██████▊   | 100/146 [04:37<02:07,  2.78s/it]

Batch Loss:  28.0028108805418 Accuracy:  tensor(0.9095, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 3 - valid Loss: 0.167476 - valid_acc : 0.949212
9.999895679377595e-06
model save, model val acc :  0.9492118897084649
best_models size :  3


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  20.511776469647884 Accuracy:  tensor(0.9352, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 4 - valid Loss: 0.114189 - valid_acc : 0.966987
9.99981454161732e-06
model save, model val acc :  0.9669873624282871
best_models size :  4


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  16.243974678218365 Accuracy:  tensor(0.9502, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 5 - valid Loss: 0.073217 - valid_acc : 0.981124
9.999710222284731e-06
model save, model val acc :  0.9811241365179721
best_models size :  5


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  12.350498288869858 Accuracy:  tensor(0.9627, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 6 - valid Loss: 0.056304 - valid_acc : 0.986067
9.9995827218635e-06
model save, model val acc :  0.9860672052452873
best_models size :  6


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  9.551588298752904 Accuracy:  tensor(0.9730, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 7 - valid Loss: 0.042618 - valid_acc : 0.990027
9.999432040944784e-06
model save, model val acc :  0.99002696551926
best_models size :  7


 68%|██████▊   | 100/146 [04:36<02:06,  2.76s/it]

Batch Loss:  7.734367474913597 Accuracy:  tensor(0.9787, device='cuda:0')


100%|██████████| 146/146 [06:43<00:00,  2.76s/it]
100%|██████████| 73/73 [01:19<00:00,  1.09s/it]


Epoch: 8 - valid Loss: 0.031714 - valid_acc : 0.992830
9.99925818022721e-06
model save, model val acc :  0.9928296232876712
best_models size :  8


 68%|██████▊   | 100/146 [04:36<02:06,  2.76s/it]

Batch Loss:  6.173172029666603 Accuracy:  tensor(0.9827, device='cuda:0')


100%|██████████| 146/146 [06:43<00:00,  2.76s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 9 - valid Loss: 0.021523 - valid_acc : 0.995398
9.999061140516881e-06
model save, model val acc :  0.9953981164383562
best_models size :  9


Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

Batch Loss:  109.7501266002655 Accuracy:  tensor(0.3462, device='cuda:0')


100%|██████████| 146/146 [06:43<00:00,  2.76s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 0 - valid Loss: 1.096958 - valid_acc : 0.339617
1e-05


 68%|██████▊   | 100/146 [04:36<02:07,  2.77s/it]

Batch Loss:  84.9484578371048 Accuracy:  tensor(0.6453, device='cuda:0')


100%|██████████| 146/146 [06:43<00:00,  2.76s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 1 - valid Loss: 0.378699 - valid_acc : 0.874090
9.999988408783906e-06
model save, model val acc :  0.8740898679896968
best_models size :  10


 68%|██████▊   | 100/146 [04:36<02:07,  2.77s/it]

Batch Loss:  38.88322842121124 Accuracy:  tensor(0.8678, device='cuda:0')


100%|██████████| 146/146 [06:43<00:00,  2.76s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 2 - valid Loss: 0.240335 - valid_acc : 0.926570
9.99995363518936e-06
model save, model val acc :  0.9265701835265191
best_models size :  11


 68%|██████▊   | 100/146 [04:36<02:07,  2.77s/it]

Batch Loss:  27.534405544400215 Accuracy:  tensor(0.9092, device='cuda:0')


100%|██████████| 146/146 [06:42<00:00,  2.76s/it]
100%|██████████| 73/73 [01:19<00:00,  1.09s/it]


Epoch: 3 - valid Loss: 0.165220 - valid_acc : 0.953065
9.999895679377595e-06
model save, model val acc :  0.9530646294344923
best_models size :  12


 68%|██████▊   | 100/146 [04:36<02:07,  2.78s/it]

Batch Loss:  21.00300856679678 Accuracy:  tensor(0.9317, device='cuda:0')


100%|██████████| 146/146 [06:43<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 4 - valid Loss: 0.109755 - valid_acc : 0.970947
9.99981454161732e-06
model save, model val acc :  0.9709471227022597
best_models size :  13


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  15.694932006299496 Accuracy:  tensor(0.9520, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 5 - valid Loss: 0.076159 - valid_acc : 0.980793
9.999710222284731e-06
model save, model val acc :  0.9807930131132185
best_models size :  14


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  12.018045980483294 Accuracy:  tensor(0.9640, device='cuda:0')


100%|██████████| 146/146 [06:45<00:00,  2.78s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 6 - valid Loss: 0.051780 - valid_acc : 0.989161
9.9995827218635e-06
model save, model val acc :  0.9891607393747806
best_models size :  15


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  9.095211554318666 Accuracy:  tensor(0.9751, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 7 - valid Loss: 0.037599 - valid_acc : 0.991963
9.999432040944784e-06
model save, model val acc :  0.9919633971431916
best_models size :  16


 68%|██████▊   | 100/146 [04:37<02:07,  2.78s/it]

Batch Loss:  6.89077331032604 Accuracy:  tensor(0.9811, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 8 - valid Loss: 0.034679 - valid_acc : 0.991963
9.99925818022721e-06


 68%|██████▊   | 100/146 [04:37<02:07,  2.77s/it]

Batch Loss:  6.517204710282385 Accuracy:  tensor(0.9812, device='cuda:0')


100%|██████████| 146/146 [06:44<00:00,  2.77s/it]
100%|██████████| 73/73 [01:20<00:00,  1.10s/it]


Epoch: 9 - valid Loss: 0.023446 - valid_acc : 0.995174
9.999061140516881e-06
model save, model val acc :  0.9951740135815478
best_models size :  17


Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

RuntimeError: ignored

In [17]:
# koelectra-4k번 모델이 가장 성능이 좋은 것으로 가정
test_dataset = CustomDataset(clean_test,'test')
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

preds = dict()
for idx, m in enumerate(best_models):
    #if idx % num_epochs == 2 or idx % num_epochs == 3 or idx % num_epochs == 4:
    if idx == 4:
      print(f'{idx+1} 번째 모델 예측 진행중')
      bestm = m
      bestm.eval()
      answer = []
      with torch.no_grad():
          for input_ids_batch, attention_masks_batch in tqdm(test_loader):
              y_pred = bestm(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0].detach().cpu().numpy()
              answer.extend(y_pred)
      preds[idx+1] = answer

5 번째 모델 예측 진행중


100%|██████████| 27/27 [00:06<00:00,  4.01it/s]


In [12]:
df = pd.DataFrame()
for key in preds.keys():
  df = pd.concat([df, pd.DataFrame(np.array(preds[key]))], axis =1 )
df.columns = [i for i in range(3*3)]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.692385,1.794275,-0.602543,-0.692385,1.794275,-0.602543,-0.692385,1.794275,-0.602543
1,-0.048374,-0.775319,1.330322,-0.048374,-0.775319,1.330322,-0.048374,-0.775319,1.330322
2,0.597040,-0.809023,0.740466,0.597040,-0.809023,0.740466,0.597040,-0.809023,0.740466
3,-0.687000,1.872369,-0.674526,-0.687000,1.872369,-0.674526,-0.687000,1.872369,-0.674526
4,-0.447792,1.420751,-0.553400,-0.447792,1.420751,-0.553400,-0.447792,1.420751,-0.553400
...,...,...,...,...,...,...,...,...,...
1661,-0.691068,-0.608387,1.780281,-0.691068,-0.608387,1.780281,-0.691068,-0.608387,1.780281
1662,0.281797,-0.801149,1.092021,0.281797,-0.801149,1.092021,0.281797,-0.801149,1.092021
1663,0.192983,-0.854506,1.201216,0.192983,-0.854506,1.201216,0.192983,-0.854506,1.201216
1664,-0.707620,-0.604393,1.758602,-0.707620,-0.604393,1.758602,-0.707620,-0.604393,1.758602


In [15]:
### SINGLE PREDICTION FOR CONCORDANCE OBSERVATION ###
single_preds = dict()
concat_probs = pd.DataFrame()
temp = np.zeros((1666, 3))
for key in preds.keys():
  x = np.array(preds[key])
  max = np.max(x,axis=1,keepdims=True) #returns max of each row and keeps same dims
  e_x = np.exp(x - max) #subtracts each row with its max value
  sum = np.sum(e_x,axis=1,keepdims=True) #returns sum of each row and keeps same dims
  f_x = e_x / sum
  #single_preds['prob_'+str(key)] = pd.DataFrame(f_x)
  single_preds['pred_'+str(key)] = pd.DataFrame(np.argmax(f_x, axis=1))
  concat_probs = pd.concat([concat_probs, pd.DataFrame(f_x)], axis =1)
columns = []

for j in range(3,6):
  for i in range(3):
    column = f"{j}_{i}"
    columns.append(column)
concat_probs.columns = columns
display(concat_probs)
concat_probs.to_csv('submission_KoELECTRA_soft.csv', index=False)

Unnamed: 0,3_0,3_1,3_2,4_0,4_1,4_2,5_0,5_1,5_2
0,0.070846,0.851648,0.077506,0.070846,0.851648,0.077506,0.070846,0.851648,0.077506
1,0.183382,0.088644,0.727975,0.183382,0.088644,0.727975,0.183382,0.088644,0.727975
2,0.416783,0.102156,0.481061,0.416783,0.102156,0.481061,0.416783,0.102156,0.481061
3,0.066933,0.865293,0.067774,0.066933,0.865293,0.067774,0.066933,0.865293,0.067774
4,0.119351,0.773259,0.107390,0.119351,0.773259,0.107390,0.119351,0.773259,0.107390
...,...,...,...,...,...,...,...,...,...
1661,0.071815,0.078005,0.850179,0.071815,0.078005,0.850179,0.071815,0.078005,0.850179
1662,0.278784,0.094395,0.626821,0.278784,0.094395,0.626821,0.278784,0.094395,0.626821
1663,0.244405,0.085741,0.669854,0.244405,0.085741,0.669854,0.244405,0.085741,0.669854
1664,0.072012,0.079843,0.848146,0.072012,0.079843,0.848146,0.072012,0.079843,0.848146


In [18]:
### ENSEMBLED PREDICTION ###
temp = np.zeros((1666, 3))
for key in preds.keys():
  x = np.array(preds[key])
  max = np.max(x,axis=1,keepdims=True) #returns max of each row and keeps same dims
  e_x = np.exp(x - max) #subtracts each row with its max value
  sum = np.sum(e_x,axis=1,keepdims=True) #returns sum of each row and keeps same dims
  f_x = e_x / sum 
  temp += f_x
temp = temp/3
softvoted_prob = pd.DataFrame(temp)
softvoted_pred = pd.DataFrame(np.argmax(temp, axis=1))
decode_map = {0 : "entailment" , 1 :  "contradiction" , 2 : "neutral" }
sample_submission['label'] = softvoted_pred
sample_submission['label'] = sample_submission['label'].map(decode_map)
sample_submission.to_csv('submission_KoELECTRA_best1.csv', index = False)

In [21]:
!mkdir /content/drive/Shareddrives/Dacon/saved_models/Junha/Electra_Benchmark_customloss
!mv *.pth /content/drive/Shareddrives/Dacon/saved_models/Junha/Electra_Benchmark_customloss/