### Setting.py

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The data and checkpoint paths used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install Hugging Face transformers (provides ElectraModel, ElectraTokenizer, AdamW and the LR schedulers imported below).
# NOTE(review): the version is not pinned; the recorded output of this cell shows it resolved to transformers 4.16.2.
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 33.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [3]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import ElectraModel, ElectraTokenizer, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [4]:
# Random Seed Fix
import random
def seed_everything(seed: int = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    sets PYTHONHASHSEED, and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects subprocesses started afterwards; the hash seed of the
    # running interpreter was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (the training cells
    # wrap the model in nn.DataParallel). Safe to call without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [5]:
def load_data(path):
  """Read the competition CSV files found in `path`.

  Loads 'benchmark_train_data.csv', 'test_data.csv' and
  'sample_submission.csv' (in that order) and converts the string labels of
  the training frame to integer ids (entailment=0, contradiction=1,
  neutral=2). Labels outside that mapping become NaN.

  Args:
    path: Directory containing the three CSV files.

  Returns:
    Tuple (train, test, sample_submission) of DataFrames.
  """
  def _read(filename):
    # Every file sits directly inside `path`.
    return pd.read_csv(os.path.join(path, filename))

  label_to_id = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}

  train = _read('benchmark_train_data.csv')
  test = _read('test_data.csv')
  sample_submission = _read('sample_submission.csv')

  # Only the training labels are encoded; the test frame is returned as read.
  train['label'] = train['label'].map(label_to_id)
  return train, test, sample_submission

def text_clean(df):
  """Build the single-text model input from a premise/hypothesis frame.

  The premise and hypothesis are cast to str and joined with one space into
  a `text_sum` column. The input frame is left untouched (the previous
  version added helper columns to the caller's DataFrame as a side effect).

  Args:
    df: DataFrame with 'premise' and 'hypothesis' columns and, optionally,
      a 'label' column.

  Returns:
    A new DataFrame sharing `df`'s index, with columns ['text_sum', 'label'],
    or just ['text_sum'] when `df` has no 'label' column.
  """
  premise = df["premise"].astype(str)
  hypothesis = df["hypothesis"].astype(str)
  cleaned = pd.DataFrame({"text_sum": premise + " " + hypothesis}, index=df.index)
  # Unlabelled frames (e.g. a test set without a label column) are accepted
  # instead of raising KeyError.
  if "label" in df.columns:
    cleaned["label"] = df["label"]
  return cleaned

def random_deletion(sentence, p=0.2):
    """Randomly drop whitespace-separated words from a sentence.

    Each word is kept only when an independent uniform(0, 1) draw is greater
    than `p`. If every word would be dropped, one randomly chosen word is
    returned instead.

    Args:
        sentence: Input text; words are obtained with `str.split()`.
        p: Threshold applied to each word's uniform draw.

    Returns:
        The augmented sentence as a single space-joined string. Inputs with
        zero or one word are returned (whitespace-normalised) without any
        deletion.
    """
    words = sentence.split()
    # Nothing to delete for 0/1 words. Always return a str: the previous
    # version returned the word *list* here, and crashed in random.choice on
    # empty input.
    if len(words) <= 1:
        return ' '.join(words)
    remaining = [word for word in words if random.uniform(0, 1) > p]
    if not remaining:
        # Everything was deleted: fall back to a single random word.
        return random.choice(words)
    return ' '.join(remaining)

def random_swap(sentence, n=2):
    """Swap random pairs of words inside a sentence.

    The sentence is split on whitespace. When it has more than two words,
    `n` swaps are performed, each exchanging two distinct randomly sampled
    positions. Sentences with two words or fewer are returned unchanged
    apart from whitespace normalisation.

    Args:
        sentence: Input text.
        n: Number of swap operations to perform.

    Returns:
        The words joined by single spaces.
    """
    tokens = sentence.split()
    positions = range(len(tokens))
    if len(tokens) > 2:
        for _ in range(n):
            first, second = random.sample(positions, 2)
            tokens[first], tokens[second] = tokens[second], tokens[first]
    return ' '.join(tokens)

def eda_aug(df):
    """Augment a premise/hypothesis/label frame with random deletion and swap.

    Every original row is kept. For each row an integer is drawn with
    random.randrange(10); when it is below 2, two extra rows carrying the
    same label are appended right after the original: one built with
    random_deletion and one built with random_swap.

    Args:
        df: DataFrame with 'premise', 'hypothesis' and 'label' columns.

    Returns:
        A new DataFrame with columns 'premise', 'hypothesis', 'label'.
    """
    premises, hypotheses, labels = [], [], []
    for idx in tqdm(range(len(df))):
        row = df.iloc[idx]
        premise, hypothesis, label = row['premise'], row['hypothesis'], row['label']

        premises.append(premise)
        hypotheses.append(hypothesis)
        labels.append(label)

        if random.randrange(10) < 2:
            # Evaluation order matters for RNG reproducibility:
            # deletion (premise, hypothesis) first, then swap (premise, hypothesis).
            deleted_premise = random_deletion(premise)
            deleted_hypothesis = random_deletion(hypothesis)
            swapped_premise = random_swap(premise)
            swapped_hypothesis = random_swap(hypothesis)

            premises += [deleted_premise, swapped_premise]
            hypotheses += [deleted_hypothesis, swapped_hypothesis]
            labels += [label, label]

    return pd.DataFrame({'premise': premises, 'hypothesis': hypotheses, 'label': labels})

### Dataset.py

In [6]:
# Directory on the mounted Google Drive that holds the competition CSV files.
ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI'
#ROOT = '/content/drive/Shareddrives/Dacon/data'
# load_data returns the raw frames; train labels are already mapped to integer ids.
train,test,sample_submission = load_data(ROOT)
###### AUGMENTATION ######
# Optional augmentation (currently disabled); when enabled it must run before text_clean,
# because eda_aug reads the separate 'premise' / 'hypothesis' columns.
#train = eda_aug(train)
###### AUGMENTATION ######

# Collapse premise + hypothesis into the single 'text_sum' column the dataset class reads.
# NOTE(review): text_clean selects a 'label' column, so `test` must contain one as well — confirm against test_data.csv.
clean_train,clean_test  = text_clean(train),text_clean(test)
display(clean_train)

Unnamed: 0,text_sum,label
0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",1
1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",1
2,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다. 예측적 범죄예방 시스템 구...,0
3,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,2
4,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",2
...,...,...
27993,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,1
27994,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,2
27995,흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재개되었다가...,0
27996,흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다. 비흡연자는 발코니 ...,2


In [7]:
# Load the pretrained KoELECTRA v3 discriminator encoder and its matching tokenizer from the Hugging Face hub.
# `model_electra` is passed to electraClassifier as the backbone; `tokenizer_electra` is read by electra_transform.
model_electra = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tokenizer_electra =  ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

In [8]:
def electra_transform(text):
  """Tokenize `text` with the module-level `tokenizer_electra`.

  The text is encoded with special tokens, truncated and padded to exactly
  90 tokens, and returned as PyTorch tensors.

  Args:
    text: Text passed straight to the tokenizer.

  Returns:
    The tokenizer's output object (indexed by the dataset class with
    'input_ids', 'attention_mask' and 'token_type_ids').
  """
  # 90 must stay in sync with the pooling window hard-coded in the classifier head.
  return tokenizer_electra(
      text,
      add_special_tokens=True,
      padding='max_length',
      truncation=True,
      max_length=90,
      return_tensors='pt',
  )

In [9]:
class customDataset(Dataset):
  """Torch dataset that tokenizes one DataFrame row per item.

  The text is read from the first column of the frame and, unless
  mode == 'test', the label from the second column.

  Args:
    dataset: DataFrame whose first column is the text and second the label.
    mode: 'test' yields features only; any other value also yields the label.
    transform: Callable mapping a text to a mapping with 'input_ids',
      'attention_mask' and 'token_type_ids', each indexable with [0].
  """

  def __init__(self, dataset, mode='train', transform=electra_transform):
    super().__init__()
    self.dataset = dataset
    self.mode = mode
    self.transform = transform

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return (token_ids, attn_masks, token_type_ids[, label]) for row `idx`."""
    text_and_label = self.dataset.iloc[idx, 0:2].values
    encoded = self.transform(text_and_label[0])

    # The transform returns batched tensors; [0] takes the only entry.
    features = (
        encoded['input_ids'][0],       # token ids
        encoded['attention_mask'][0],  # 1 for real tokens, 0 for padding
        encoded['token_type_ids'][0],  # segment ids
    )

    if self.mode == 'test':
      return features
    return features + (text_and_label[1],)

### Model.py

In [10]:
class electraClassifier(nn.Module):
    """Encoder backbone followed by a multi-kernel 1-D CNN classification head.

    The backbone's per-token outputs are passed through three parallel
    Conv1d branches (kernel sizes 2, 3 and 4, 10 filters each), max-pooled,
    batch-normalised (one BatchNorm1d shared by all branches), activated
    with GELU, concatenated into 30 features and projected to `num_classes`
    logits.

    Args:
        bert: Backbone module. It is called as
            bert(input_ids, attn_masks, token_type_ids, return_dict=False)
            and the first element of its result is used.
        hidden_size: Channel size of the backbone output fed to the convolutions.
        num_classes: Number of output logits (adjust to the number of classes).
        params: Unused; kept for interface compatibility.
        freeze_bert: When True, the backbone parameters get requires_grad=False.

    Note:
        The pooling window is fixed at 90, so the head yields one value per
        filter only for inputs padded to 90 tokens (as electra_transform does).
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3,
                 params=None,
                 freeze_bert=False):
        super(electraClassifier, self).__init__()
        self.bert = bert
        self.freeze_bert=freeze_bert

        if self.freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        # Attribute names and creation order are kept so existing checkpoints
        # (state_dict keys) and seeded initialisation stay compatible.
        self.conv1 = nn.Conv1d(hidden_size, 10, 2, padding = 1)
        self.conv2 = nn.Conv1d(hidden_size, 10, 3, padding =2)
        self.conv3 = nn.Conv1d(hidden_size, 10, 4, padding = 3)
        self.batchnorm = nn.BatchNorm1d(10)
        self.activation = nn.GELU()
        self.maxpool = nn.MaxPool1d(90)

        self.dropout = nn.Dropout(p=0.5)
        # 3 branches x 10 filters = 30 input features.
        self.fc_layer = nn.Linear(30,num_classes)

    def forward(self, input_ids, attn_masks, token_type_ids):
        """Return logits of shape (batch, num_classes)."""
        hidden = self.bert(input_ids, attn_masks, token_type_ids, return_dict=False)[0]
        # Conv1d expects channels at dim 1, so swap the last two dimensions.
        hidden = torch.transpose(hidden, 1, 2)

        # Squeeze only the pooled length dimension. A bare torch.squeeze()
        # also removed the batch dimension for single-sample batches (e.g.
        # a trailing batch of size 1), which broke BatchNorm1d and the concat.
        pooled = [
            self.maxpool(conv(self.dropout(hidden))).squeeze(-1)
            for conv in (self.conv1, self.conv2, self.conv3)
        ]
        # NOTE: in training mode BatchNorm1d still needs more than one sample
        # per batch; use drop_last=True if the final batch can have size 1.
        branches = [self.activation(self.batchnorm(feat)) for feat in pooled]
        output = torch.cat(branches, dim=1)

        output = self.fc_layer(output)
        return output

### Train.py

In [11]:
# Use the GPU when one is available; fall back to CPU so the notebook still runs on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Print the GPU model / memory of the current runtime (informational only).
!nvidia-smi

Tue Feb 22 04:44:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
###### HYPERPARAMS ######
# Learning rate passed to AdamW in the training cells.
lr = 5e-6
# Longest premise measured with len() (characters if the values are str — TODO confirm).
# NOTE(review): not referenced by the visible cells; the tokenizer length is hard-coded to 90 in electra_transform.
max_len = max([len(i) for i in train['premise']])
# Batch size for both the training and validation DataLoaders.
batch_size=32
# Fraction of the total optimisation steps used for LR warm-up.
warmup_ratio = 0.06
num_epochs = 10
# Max gradient norm used by clip_grad_norm_.
max_grad_norm = 1
# Print running loss/accuracy every `log_interval` training batches.
log_interval = 200

In [42]:
############### CV ################
import copy
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified split on the label. Each entry of `folds` is a
# (train positions, validation positions) pair of positional index arrays.
skf = StratifiedKFold(n_splits = 5,shuffle=True,random_state=42)
folds = list(skf.split(clean_train['text_sum'],clean_train['label']))

############### CV Training ###############
# Checkpoint directory; created up front so torch.save cannot fail on a missing folder.
save_dir = '/content/drive/MyDrive/DACON_MONTHLYNLI/models/electra_Benchmark_5CV_aug_customloss'
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): entries are live references to the model being trained, not
# frozen snapshots; the .pth files written below are the reliable copies.
best_models = []

for fold, (train_idx, valid_idx) in enumerate(folds):
    print('===============',fold+1,'fold start===============')
    # Every fold starts from a fresh copy of the pretrained backbone. Passing
    # `model_electra` itself would keep fine-tuning one shared encoder across
    # folds and leak earlier folds' training data into later validations.
    model = electraClassifier(copy.deepcopy(model_electra)).to(device)
    model=nn.DataParallel(model).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    # Use this fold's own indices. The previous code indexed with the stale
    # `trn_idx` left over from the split loop, so every fold trained on the
    # last fold's training split. The indices are positional, hence .iloc.
    train_data = clean_train.iloc[train_idx]
    val_data = clean_train.iloc[valid_idx]
    train_dataset = customDataset(train_data,'train')
    # mode 'train' so the validation items also carry their labels.
    valid_dataset = customDataset(val_data,'train')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    total_steps = len(train_loader) * num_epochs
    warmup_step = int(total_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=total_steps)
    # A checkpoint is only written once validation accuracy exceeds this
    # value, and afterwards only when it improves within the fold.
    valid_acc_max = 0.8

    for epoch in range(num_epochs):
        batches = 0
        total_loss = 0.0
        correct = 0
        total =0
        model.train()

        for input_ids_batch, attention_masks_batch, token_type_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
            # Weighted sum of cross-entropy, MSE-to-one-hot and hinge-embedding terms.
            loss1 = F.cross_entropy(y_pred, y_batch)
            one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
            loss2 = F.mse_loss(y_pred, one_hot)
            # NOTE(review): hinge_embedding_loss is documented for targets in
            # {1, -1}; with 0/1 one-hot targets only the true-class logits
            # contribute, and minimising the term pushes them down. Left
            # unchanged to keep the experiment's loss definition; please confirm.
            loss3= F.hinge_embedding_loss(y_pred, one_hot)
            loss = 0.7*loss1+0.15*loss2+0.15*loss3
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            _, predicted = torch.max(y_pred, 1)
            correct += (predicted == y_batch).sum().item()
            total += len(y_batch)
            batches += 1
            if batches % log_interval == 0:
                print("Batch Loss: ", total_loss / batches, "Accuracy: ", correct / total)

        # Validation: accumulate per-sample totals so a smaller final batch
        # does not get the same weight as a full one.
        val_loss_sum = 0.0
        val_correct = 0
        val_total = 0
        model.eval()
        with torch.no_grad():
            for input_ids_batch, attention_masks_batch, token_type_batch, y_batch in tqdm(valid_loader):
                y_batch = y_batch.to(device)
                y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
                valid_loss1 = F.cross_entropy(y_pred, y_batch)
                valid_one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
                valid_loss2 = F.mse_loss(y_pred, valid_one_hot)
                valid_loss3= F.hinge_embedding_loss(y_pred, valid_one_hot)
                valid_loss = 0.7*valid_loss1+0.15*valid_loss2+0.15*valid_loss3

                n_samples = len(y_batch)
                val_loss_sum += valid_loss.item() * n_samples
                val_correct += (torch.argmax(y_pred,1) == y_batch).sum().item()
                val_total += n_samples

        val_loss = val_loss_sum / val_total
        val_acc = val_correct / val_total

        print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
        print(optimizer.param_groups[0]["lr"])
        if valid_acc_max < val_acc:
            valid_acc_max = val_acc
            best_models.append(model)
            torch.save(model.state_dict(), os.path.join(save_dir, f'electra-{len(best_models)}.pth'))
            print('model save, model val acc : ',val_acc)
            print('best_models size : ',len(best_models))



  0%|          | 0/1965 [00:00<?, ?it/s]

torch.Size([16, 30])
tensor([[-0.9117,  0.4015, -0.4459],
        [-0.8912,  0.6407,  0.1351],
        [-0.7794,  0.7084,  0.3529],
        [-0.0268,  0.4653, -0.1363],
        [ 0.2611,  0.1435,  0.7758],
        [ 0.1354,  0.5449, -0.7000],
        [-0.5763,  0.5529,  0.1376],
        [-0.2092,  0.2729, -0.6904],
        [-0.0164,  0.2766, -0.5194],
        [ 0.2535, -0.2849, -0.9005],
        [-0.4227, -0.0317,  0.1936],
        [ 0.0693,  0.2727, -0.4180],
        [ 0.2130,  0.5082,  0.2004],
        [-0.1790,  0.4313,  0.2222],
        [ 0.0304,  0.7550,  0.2318],
        [ 0.4612,  0.2547,  0.8604]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


  0%|          | 1/1965 [00:00<31:38,  1.03it/s]


RuntimeError: ignored

In [14]:
#### non-CV ###
import copy

# Checkpoint directory; created here so torch.save does not depend on a later cell having run.
save_dir = '/content/drive/MyDrive/DACON_MONTHLYNLI/models/electra_Benchmark_noCV_CNN_customloss'
os.makedirs(save_dir, exist_ok=True)

# `best_models` is otherwise only created by the CV cell; make this cell
# runnable on its own without discarding models collected there.
try:
    best_models
except NameError:
    best_models = []

# Train on a fresh copy of the pretrained backbone so re-running this cell
# (or running it after the CV cell) does not continue from already
# fine-tuned encoder weights.
model = electraClassifier(copy.deepcopy(model_electra)).to(device)
model=nn.DataParallel(model).to(device)
optimizer = AdamW(model.parameters(), lr=lr)

# Random hold-out split: first 24000 shuffled positions train, the rest validate.
random_idx = random.sample(range(len(clean_train)), len(clean_train))
train_idx = random_idx[:24000]
val_idx = random_idx[24000:]
print(len(train_idx), len(val_idx))
print(train_idx[:5], val_idx[:5])
train_data = clean_train.iloc[train_idx]
val_data = clean_train.iloc[val_idx]

train_dataset = customDataset(train_data,'train')
# mode 'train' so the validation items also carry their labels.
valid_dataset = customDataset(val_data,'train')
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
total_steps = len(train_loader) * num_epochs
warmup_step = int(total_steps * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=total_steps)

# Best validation accuracy so far. It must live outside the epoch loop:
# resetting it every epoch (as before) saved any epoch above 0.8 instead of
# only improvements.
valid_acc_max = 0.8

for epoch in range(num_epochs):
    batches = 0
    total_loss = 0.0
    correct = 0
    total =0
    model.train()

    for input_ids_batch, attention_masks_batch, token_type_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
        # Weighted sum of cross-entropy, MSE-to-one-hot and hinge-embedding terms.
        loss1 = F.cross_entropy(y_pred, y_batch)
        one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
        loss2 = F.mse_loss(y_pred, one_hot)
        # NOTE(review): hinge_embedding_loss is documented for targets in
        # {1, -1}; with 0/1 one-hot targets only the true-class logits
        # contribute, and minimising the term pushes them down. Left
        # unchanged to keep the experiment's loss definition; please confirm.
        loss3= F.hinge_embedding_loss(y_pred, one_hot)
        loss = 0.7*loss1+0.15*loss2+0.15*loss3
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum().item()
        total += len(y_batch)
        batches += 1
        if batches % log_interval == 0:
            print("Batch Loss: ", total_loss / batches, "Accuracy: ", correct / total)

    # Validation: accumulate per-sample totals so a smaller final batch does
    # not get the same weight as a full one.
    val_loss_sum = 0.0
    val_correct = 0
    val_total = 0
    model.eval()
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch,  token_type_batch, y_batch in tqdm(valid_loader):
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_masks_batch.to(device), token_type_batch.to(device))
            valid_loss1 = F.cross_entropy(y_pred, y_batch)
            valid_one_hot = torch.nn.functional.one_hot(y_batch, 3).to(torch.float32)
            valid_loss2 = F.mse_loss(y_pred, valid_one_hot)
            valid_loss3= F.hinge_embedding_loss(y_pred, valid_one_hot)
            valid_loss = 0.7*valid_loss1+0.15*valid_loss2+0.15*valid_loss3

            n_samples = len(y_batch)
            val_loss_sum += valid_loss.item() * n_samples
            val_correct += (torch.argmax(y_pred,1) == y_batch).sum().item()
            val_total += n_samples

    val_loss = val_loss_sum / val_total
    val_acc = val_correct / val_total

    print(f'Epoch: {epoch} - valid Loss: {val_loss:.6f} - valid_acc : {val_acc:.6f}')
    print(optimizer.param_groups[0]["lr"])
    if valid_acc_max < val_acc:
        valid_acc_max = val_acc
        # NOTE(review): this stores a live reference to the model being
        # trained; the .pth file is the reliable snapshot.
        best_models.append(model)
        torch.save(model.state_dict(), os.path.join(save_dir, f'electra-{len(best_models)}.pth'))
        print('model save, model val acc : ',val_acc)
        print('best_models size : ',len(best_models))

24000 3998
[3562, 22900, 19779, 7175, 24188] [25896, 17827, 14117, 24593, 3465]


 27%|██▋       | 200/750 [03:16<08:56,  1.03it/s]

Batch Loss:  0.9567082887887954 Accuracy:  tensor(0.3469, device='cuda:0')


 53%|█████▎    | 400/750 [06:31<05:41,  1.02it/s]

Batch Loss:  0.8547860503196716 Accuracy:  tensor(0.5281, device='cuda:0')


 80%|████████  | 600/750 [09:45<02:26,  1.02it/s]

Batch Loss:  0.7892951456705729 Accuracy:  tensor(0.6168, device='cuda:0')


100%|██████████| 750/750 [12:11<00:00,  1.03it/s]
  0%|          | 0/125 [00:00<?, ?it/s]


RuntimeError: ignored

In [15]:
!mkdir /content/drive/MyDrive/DACON_MONTHLYNLI/models/electra_Benchmark_noCV_CNN_customloss