In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 7.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 66.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 29.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [3]:
import pandas as pd 
import numpy as np 
import os
import torch
import torch.nn as nn

import warnings 
warnings.filterwarnings("ignore")
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel,AutoTokenizer,RobertaTokenizer, ElectraForSequenceClassification, AdamW, ElectraModel,ElectraTokenizer
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [4]:
# Random Seed Fix
import random
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

In [5]:
device = torch.device("cuda")

In [6]:
############# HYPERPARMS ##############
num_epochs = 5
batch_size =128
lr = 0.00001
pretrain = "roberta-large"
#pretrain = "roberta-base"

In [7]:
def load_data(path):
  TRAIN = os.path.join(path, 'benchmark_train_data.csv')
  TEST = os.path.join(path, 'test_data.csv')
  SS = os.path.join(path, 'sample_submission.csv')
  label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}
  train = pd.read_csv(TRAIN)
  test = pd.read_csv(TEST)
  sample_submission = pd.read_csv(SS)
  train['label'] = train['label'].map(label_dict)

  return train,test,sample_submission

def text_clean(df):
  #=df["premise_"] = "<s>" + df["premise"].astype(str) + "[SEP]"
  df["premise_"] = "[CLS]"+df["premise"].astype(str)
  #df["hypothesis_"] = df["hypothesis"].astype(str) + "[SEP]"
  df["hypothesis_"] = df["hypothesis"].astype(str) + "[SEP]"
  df["text_sum"] = df.premise_ + "[SEP]" + df.hypothesis_
  df = df[['text_sum','label']]
  return df 

def random_deletion(sentence, p=0.2):
    words = sentence.split ()
    n = len (words)
    if n == 1: # return if single word
        return words
    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))
    #print (remaining) 
    if len(remaining) == 0: # if not left, sample a random word
        return ' '.join ([random.choice(words)])
    else:
        return ' '.join (remaining)

def random_swap(sentence, n=2):
    sentence = sentence.split () 
    length = range(len(sentence))
    swapped = []
    if len(sentence) >2:
      for _ in range(n):
          idx1, idx2 = random.sample(length, 2)
          swapped.append ([sentence[idx1], sentence[idx2]])
          sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return ' '.join (sentence)

def eda_aug(df):

    cache = {'premise':[], 'hypothesis':[], 'label':[]}
    for idx in tqdm(range(len(df))):
        premise = df.iloc[idx]['premise']
        hypothesis = df.iloc[idx]['hypothesis']
        label = df.iloc[idx]['label']
        cache['premise'].append(premise)
        cache['hypothesis'].append(hypothesis)
        cache['label'].append(label)
        flag = random.randrange(10)
        if flag < 2:
          cache['premise'].append(random_deletion(premise))
          cache['hypothesis'].append(random_deletion(hypothesis))
          cache['label'].append(label)
          cache['premise'].append(random_swap(premise))
          cache['hypothesis'].append(random_swap(hypothesis))
          cache['label'].append(label)
    
    return pd.DataFrame(cache)

In [8]:
#ROOT = '/content/drive/MyDrive/DACON_MONTHLYNLI'
DATA = '/content/drive/Shareddrives/Dacon/hongsun/data'
train,test,sample_submission = load_data(DATA)
###### AUGMENTATION ######
#train = eda_aug(train)
###### AUGMENTATION ######

clean_train,clean_test  = text_clean(train),text_clean(test)
display(clean_train)
display(clean_test)

Unnamed: 0,text_sum,label
0,"[CLS]씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나...",1
1,[CLS]삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 ...,1
2,[CLS]이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.[SEP]예측적 범...,0
3,[CLS]광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민...,2
4,"[CLS]진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 ...",2
...,...,...
27993,[CLS]흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재...,1
27994,[CLS]흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재...,2
27995,[CLS]흔히 비자림로라고 불리는 지방도 제1112호선을 넓히는 공사가 1년만에 재...,0
27996,[CLS]흡연자분들은 발코니가 있는 방이면 발코니에서 흡연이 가능합니다.[SEP]비...,2


Unnamed: 0,text_sum,label
0,[CLS]다만 조금 좁아서 케리어를 펼치기 불편합니다.[SEP]케리어를 펼치기에 공...,answer
1,[CLS]그리고 위치가 시먼역보다는 샤오난먼역에 가까워요[SEP]시먼역보다는 샤오난...,answer
2,[CLS]구구절절 설명하고 이해시키려는 노력이 큰 의미없이 다가온다.[SEP]무엇인...,answer
3,[CLS]몇 번을 다시봐도 볼 때마다 가슴이 저민다.[SEP]다시 봤을때는 무덤덤했...,answer
4,"[CLS]8월 중에 입주신청을 하면 청년은 9월, 신혼부부는 10월부터 입주가 가능...",answer
...,...,...
1661,"[CLS]또 작업자의 숙련도와 경험 향상, 전문성을 요구하는 난이도 높은 데이터 가...",answer
1662,[CLS]결말을 보니 아무래도 이 영화는 2부가 계획된 듯 합니다.[SEP]결말을 ...,answer
1663,[CLS]사회적 거리 두기 상황에서 총리도 카페를 갔다가 자리가 없어서 퇴짜 맞은 ...,answer
1664,[CLS]로마에서 3박4일간 이곳에서 머물렀습니다.[SEP]이곳에서 머무르며 로마의...,answer


In [9]:
model_roberta = AutoModel.from_pretrained("klue/roberta-large")
#model_roberta = AutoModel.from_pretrained("klue/roberta-base")
tokenizer_roberta = AutoTokenizer.from_pretrained("klue/roberta-large")
#tokenizer_roberta = AutoTokenizer.from_pretrained("klue/roberta-base")
model_electra = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tokenizer_electra =  ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

In [10]:
def roberta_transform(text):
  transform = tokenizer_roberta(text,
                                pad_to_max_length=True,
                               truncation=True,
                               max_length=256,
                               return_tensors='pt',
                                add_special_tokens=False)
  return transform

In [11]:
class customDataset(Dataset):
  def __init__(self,dataset,mode='train',transform=roberta_transform):
    super(customDataset, self).__init__()
    self.mode = mode
    self.dataset = dataset.reset_index(drop=True)
    self.transform = transform

  def __getitem__(self, idx):
    
    text = self.dataset['text_sum'][idx]
    tokens = self.transform(text)
    token_ids = tokens['input_ids'][0]  # tensor of token ids
    attn_masks = tokens['attention_mask'][0]  # binary tensor with "0" for padded values and "1" for the other values
    token_type_ids = tokens['token_type_ids'][0]  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

    if self.mode == 'test':
      return token_ids,attn_masks,token_type_ids
    else: 
      labels = self.dataset['label'][idx]
      return token_ids,attn_masks,token_type_ids, labels
  
  def __len__(self):
    return(len(self.dataset))

In [12]:
test_dataset = customDataset(clean_test,'test')
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

In [13]:
class ROBERTaClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 1024, # 768
                 num_classes=3,   ##클래스 수 조정##
                 params=None,
                 freeze_bert=False):
        super(ROBERTaClassifier, self).__init__()
        self.bert = bert
        self.freeze_bert=freeze_bert

        if self.freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

                 
        self.classifier = nn.Linear(hidden_size , 256)
        self.dropout = nn.Dropout(p=0.5)
        self.fc_layer1 = nn.Linear(256,128)
        self.fc_layer2 = nn.Linear(128,num_classes)
    

    def forward(self, input_ids, attn_masks):
        
        _,pooler = self.bert(input_ids, attn_masks, return_dict=False)
        output1 = self.classifier(pooler)
        output2 = self.fc_layer1(output1)
        output3 = self.fc_layer2(self.dropout(output2))
        return (output3)

In [15]:
# roberta_large 모델이 가장 성능이 좋은 것으로 가정

model = ROBERTaClassifier(model_roberta).to(device)
model=nn.DataParallel(model).to(device)



model_ROOT1 = '/content/drive/Shareddrives/Dacon/hongsun/'

model_PATHs = [
               os.path.join(model_ROOT1, 'ROBERTa_large_fold_0_4.pth'),
               #os.path.join(model_ROOT1, 'ROBERTa_large_fold_1_9.pth')
]

preds = dict()
for pth in model_PATHs:
    currentm = model
    currentm.load_state_dict(torch.load(pth))
    currentm.eval()
    answer = []
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, token_type_ids in tqdm(test_loader):
            y_pred = currentm(input_ids_batch.to(device), attention_masks_batch.to(device)).detach().cpu().numpy()
            answer.extend(y_pred)
    preds['roberta_large'+pth[-5]] = np.array(answer )/ len(model_PATHs)

100%|██████████| 27/27 [00:42<00:00,  1.58s/it]


In [16]:
class ROBERTaClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768, # 768
                 num_classes=3,   ##클래스 수 조정##
                 params=None,
                 freeze_bert=False):
        super(ROBERTaClassifier, self).__init__()
        self.bert = bert
        self.freeze_bert=freeze_bert

        if self.freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

                 
        self.classifier = nn.Linear(hidden_size , 256)
        self.dropout = nn.Dropout(p=0.5)
        self.fc_layer1 = nn.Linear(256,128)
        self.fc_layer2 = nn.Linear(128,num_classes)
    

    def forward(self, input_ids, attn_masks):
        
        _,pooler = self.bert(input_ids, attn_masks, return_dict=False)
        output1 = self.classifier(pooler)
        output2 = self.fc_layer1(output1)
        output3 = self.fc_layer2(self.dropout(output2))
        return (output3)

In [17]:
# roberta_base 모델이 가장 성능이 좋은 것으로 가정
model_roberta = AutoModel.from_pretrained("klue/roberta-base")

model = ROBERTaClassifier(model_roberta).to(device)
model=nn.DataParallel(model).to(device)

test_dataset = customDataset(clean_test,'test')
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

model_ROOT1 = '/content/drive/Shareddrives/Dacon/hongsun/'

model_PATHs = [
               os.path.join(model_ROOT1, 'ROBERTa_base_fold_0.pth'),
               os.path.join(model_ROOT1, 'ROBERTa_base_fold_1.pth'),
               os.path.join(model_ROOT1, 'ROBERTa_base_fold_2.pth'),
               os.path.join(model_ROOT1, 'ROBERTa_base_fold_3.pth'),
               os.path.join(model_ROOT1, 'ROBERTa_base_fold_4.pth'),
]


for pth in model_PATHs:
    currentm = model
    currentm.load_state_dict(torch.load(pth))
    currentm.eval()
    answer = []
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, token_type_ids in tqdm(test_loader):
            y_pred = currentm(input_ids_batch.to(device), attention_masks_batch.to(device)).detach().cpu().numpy()
            answer.extend(y_pred)
    preds['roberta_base'+pth[-5]] = np.array(answer) / len(model_PATHs)

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

In [18]:
# koelectra 모델
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels=3).to(device)

model=nn.DataParallel(model).to(device)


model_ROOT1 = '/content/drive/Shareddrives/Dacon/saved_models/Junha/Electra_Benchmark_5CV_aug_customloss/'

model_PATHs = [os.path.join(model_ROOT1, 'koelectra-5.pth'),
               os.path.join(model_ROOT1, 'koelectra-11.pth'),
               os.path.join(model_ROOT1, 'koelectra-14.pth'),
               os.path.join(model_ROOT1, 'koelectra-18.pth'),
               #os.path.join(model_ROOT1, 'koelectra-fold5-new-7.pth')
               ]


for pth in model_PATHs:
    currentm = model
    currentm.load_state_dict(torch.load(pth))
    currentm.eval()
    answer = []
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, token_type_ids in tqdm(test_loader):
            y_pred = currentm(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0].detach().cpu().numpy()
            answer.extend(y_pred)
    preds['electra'+pth[-5]] = np.array(answer)/len(model_PATHs)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [19]:

preds.keys()

dict_keys(['roberta_large4', 'roberta_base0', 'roberta_base1', 'roberta_base2', 'roberta_base3', 'roberta_base4', 'electra5', 'electra1', 'electra4', 'electra8'])

In [20]:
temp = np.zeros((1666, 3))
for key in preds.keys():
  x = preds[key]
  temp += x
temp = temp
print(temp)
softvoted_prob = pd.DataFrame(temp)
softvoted_pred = pd.DataFrame(np.argmax(temp, axis=1))
decode_map = {0 : "entailment" , 1 :  "contradiction" , 2 : "neutral" }
sample_submission['label'] = softvoted_pred
sample_submission['label'] = sample_submission['label'].map(decode_map)
sample_submission.to_csv('./submission_maybe_final_final_final.csv', index = False)

[[-5.9587965  12.88269381 -7.14229291]
 [ 0.29489882 -7.62197534  8.60364624]
 [ 5.12554299 -7.07582764  3.64992286]
 ...
 [-5.57763316 -5.67288375 12.2671699 ]
 [-5.06217606 -5.90070839 11.96622442]
 [-4.03583274 -3.95913462  8.89686278]]
