In [None]:
# Colab only: mount Google Drive at /content/drive; the data and checkpoint
# paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 35.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [None]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [None]:
# Random Seed Fix
import random
def seed_everything(seed: int = 42):
    """Seed every RNG this notebook touches and force deterministic cuDNN.

    Args:
        seed: Value applied to Python's ``random``, NumPy, ``PYTHONHASHSEED``
            and PyTorch (CPU plus every CUDA device).
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # also covers multi-GPU runs
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the seeding above, so it must stay off.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [None]:
# Prefer the GPU, but fall back to CPU so the notebook still runs on a
# runtime without CUDA instead of failing at the first .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
############# HYPERPARAMS ##############
num_epochs = 5  # NOTE(review): not referenced by the cells visible here; presumably a training-time setting
batch_size =128  # NOTE(review): the inference DataLoader below passes its own batch_size=64
lr = 0.00001  # NOTE(review): not referenced by the cells visible here; presumably the training learning rate
pretrain = "monologg/koelectra-base-v3-discriminator"  # checkpoint id handed to from_pretrained for both tokenizer and model

In [None]:
def load_data(path):
  """Read the train, test and sample-submission CSVs stored under ``path``.

  The string ``label`` column of the train frame is replaced by integer class
  ids (entailment=0, contradiction=1, neutral=2); the other two frames are
  returned exactly as read.

  Returns:
    Tuple ``(train, test, sample_submission)`` of DataFrames.
  """
  class_ids = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
  file_names = ('benchmark_train_data.csv', 'test_data.csv', 'sample_submission.csv')
  # Files are read lazily in the listed order: train, test, sample submission.
  train, test, sample_submission = (
      pd.read_csv(os.path.join(path, name)) for name in file_names
  )
  train['label'] = train['label'].map(class_ids)
  return train,test,sample_submission

def text_clean(df):
  """Build the single-string model input for each premise/hypothesis pair.

  Each row becomes ``"[CLS]" + premise + "[SEP]" + " " + hypothesis + "[SEP]"``
  in a ``text_sum`` column. The input frame is left untouched (no helper
  columns are added to it).

  Args:
    df: DataFrame with ``premise`` and ``hypothesis`` columns and, optionally,
      a ``label`` column.

  Returns:
    A new DataFrame with ``text_sum`` and, when present in ``df``, ``label``.
  """
  premise_part = "[CLS]" + df["premise"] + "[SEP]"
  hypothesis_part = df["hypothesis"] + "[SEP]"
  cleaned = (premise_part + " " + hypothesis_part).to_frame("text_sum")
  # Unlabeled frames (e.g. a test split without labels) simply get no label column.
  if "label" in df.columns:
    cleaned["label"] = df["label"]
  return cleaned

# Dataset folder on the mounted Drive
# (alternative shared-drive location: '/content/drive/Shareddrives/Dacon/data').
ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI'
train, test, sample_submission = load_data(ROOT)
# Build the combined premise/hypothesis text for each split.
clean_train = text_clean(train)
clean_test = text_clean(test)

In [None]:
############# Dataset ##############
class CustomDataset(Dataset):
  """Torch dataset that tokenizes one row of text per item.

  Args:
    dataset: DataFrame whose first column holds the text to tokenize and,
      when ``option == 'train'``, whose second column holds the label.
    option: ``'train'`` makes ``__getitem__`` also return the label; any
      other value yields model inputs only.
    max_length: Fixed token length each sample is truncated/padded to.
  """

  def __init__(self,dataset,option,max_length=70):
    self.dataset = dataset
    self.option = option
    self.max_length = max_length
    self.tokenizer = AutoTokenizer.from_pretrained(pretrain)

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask[, label])`` for row ``idx``."""
    row = self.dataset.iloc[idx, 0:2].values
    text = row[0]

    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        # Current spelling of the deprecated pad_to_max_length=True.
        padding='max_length',
        # The text already carries literal [CLS]/[SEP] markers, so the
        # tokenizer must not add another set.
        add_special_tokens=False
        )

    # return_tensors='pt' yields a leading batch dimension of 1; drop it.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    if self.option =='train':
        y =row[1]
        return input_ids,attention_mask,y

    return input_ids, attention_mask

In [None]:
# Assumes the koelectra-4k checkpoint is the best-performing one.
model = ElectraForSequenceClassification.from_pretrained(pretrain,num_labels=3).to(device)
#model=nn.DataParallel(model).to(device)
test_dataset = CustomDataset(clean_test,'test')
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

model_ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI/models/koelectra(benchmarked)'
model_PATHs = [os.path.join(model_ROOT, 'koelectra-4.pth'), os.path.join(model_ROOT, 'koelectra-8.pth'), os.path.join(model_ROOT, 'koelectra-12.pth')]

# Raw logits for the test set, one list per checkpoint, keyed by the numeric
# tag in the checkpoint file name ('4', '8', '12').
preds = dict()
for pth in model_PATHs:
    ckpt_name = os.path.basename(pth)
    print(ckpt_name)
    # Use the whole tag after the last '-'; indexing pth[-5] kept only the
    # final digit, so 'koelectra-12.pth' ended up stored under '2'.
    ckpt_key = os.path.splitext(ckpt_name)[0].rsplit('-', 1)[-1]
    # The same model object is reused; only its weights are swapped per checkpoint.
    currentm = model
    # NOTE: torch.load unpickles an entire saved model object (its weights are
    # reached through .module, presumably a DataParallel wrapper), so only
    # load checkpoints you trust. map_location keeps the load working when the
    # file was saved from a different device than the current one.
    checkpoint = torch.load(pth, map_location=device)
    currentm.load_state_dict(checkpoint.module.state_dict())
    currentm.eval()
    answer = []
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch in tqdm(test_loader):
            y_pred = currentm(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0].detach().cpu().numpy()
            answer.extend(y_pred)
    preds[ckpt_key] = answer

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

4.pth


100%|██████████| 27/27 [00:14<00:00,  1.86it/s]


8.pth


100%|██████████| 27/27 [00:14<00:00,  1.85it/s]


2.pth


100%|██████████| 27/27 [00:14<00:00,  1.84it/s]


In [None]:
### SINGLE PREDICTION FOR CONCORDANCE OBSERVATION ###
# Per-checkpoint class probabilities (side by side in concat_probs) and
# per-checkpoint argmax predictions (single_preds).
single_preds = dict()
prob_frames = []
for key in preds.keys():
  logits = np.array(preds[key])
  # Numerically stable row-wise softmax: subtract each row's max before exp.
  # Local names avoid rebinding the builtins `max` and `sum` in the kernel.
  shifted_exp = np.exp(logits - np.max(logits, axis=1, keepdims=True))
  f_x = shifted_exp / np.sum(shifted_exp, axis=1, keepdims=True)
  single_preds['pred_'+str(key)] = pd.DataFrame(np.argmax(f_x, axis=1))
  prob_frames.append(pd.DataFrame(f_x))
# Concatenate once instead of growing a frame inside the loop.
concat_probs = pd.concat(prob_frames, axis=1)

# Column names are "<checkpoint>_<class>"; this naming assumes exactly three
# checkpoints tagged 4, 8, 12 (in that order) with three classes each.
columns = [f"{4*j}_{i}" for j in range(1,4) for i in range(3)]
concat_probs.columns = columns
display(concat_probs)
concat_probs.to_csv('submission_KoELECTRA_soft.csv', index=False)

Unnamed: 0,4_0,4_1,4_2,8_0,8_1,8_2,12_0,12_1,12_2
0,0.004339,0.989957,0.005704,0.003115,0.993447,0.003437,0.003267,0.992312,0.004421
1,0.053540,0.005561,0.940898,0.405675,0.014389,0.579936,0.102595,0.006492,0.890913
2,0.613809,0.010409,0.375782,0.705776,0.013020,0.281204,0.584750,0.015179,0.400070
3,0.003056,0.906573,0.090371,0.002320,0.989104,0.008576,0.003008,0.987799,0.009193
4,0.011893,0.978237,0.009870,0.031735,0.959900,0.008365,0.010683,0.979891,0.009426
...,...,...,...,...,...,...,...,...,...
1661,0.005877,0.004433,0.989691,0.005544,0.004134,0.990321,0.004064,0.004816,0.991120
1662,0.492611,0.008447,0.498942,0.225914,0.008708,0.765378,0.100834,0.005882,0.893284
1663,0.038510,0.003807,0.957682,0.072953,0.006287,0.920760,0.069169,0.016159,0.914672
1664,0.006247,0.004053,0.989700,0.006821,0.003116,0.990064,0.004323,0.003225,0.992452


In [None]:
### ENSEMBLED PREDICTION ###
# Soft voting: average the per-checkpoint softmax probabilities, then argmax.
if not preds:
  raise ValueError("preds is empty; run the inference cell first")
member_probs = []
for key in preds.keys():
  logits = np.array(preds[key])
  # Numerically stable row-wise softmax (local names keep the builtins
  # `max` and `sum` intact).
  shifted_exp = np.exp(logits - np.max(logits, axis=1, keepdims=True))
  member_probs.append(shifted_exp / np.sum(shifted_exp, axis=1, keepdims=True))
# Average over the number of checkpoints actually scored. The previous
# hard-coded division by 5 (with 3 checkpoints) left rows that did not sum
# to 1; the array shape now follows the data instead of a fixed 1666 rows.
temp = np.mean(member_probs, axis=0, dtype=np.float64)
softvoted_prob = pd.DataFrame(temp)
softvoted_pred = pd.DataFrame(np.argmax(temp, axis=1))
decode_map = {0 : "entailment" , 1 :  "contradiction" , 2 : "neutral" }
# Column 0 holds the predicted class id; assignment aligns on the row index.
sample_submission['label'] = softvoted_pred[0].map(decode_map)
sample_submission.to_csv('submission_KoELECTRA.csv', index = False)