# 0 설정, 설치 임포트

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir('/content/drive/MyDrive/015GithubRepos/Dacon_sentence_classification')

In [None]:
pip install transformers

In [None]:
import pandas as pd 
import numpy as np 
import re
from tqdm import tqdm
import time

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW, AutoModel, AutoModelForSequenceClassification
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings("ignore")

## 경로지정

In [None]:
# local = 'C:/Users/posick/Desktop/Dacon/open/'
# local2 = 'C:/Users/201/Desktop/Dacon/'
suv = 'data/'
# colab = '/content/drive/MyDrive/Dacon/'

# 1 하이퍼 파라미터 설정

### epoch

In [None]:
# epoch 5만 해도될듯
num_epochs = 10

### batch_size

In [None]:
batch_size = 32

### learning rate

In [None]:
lr = 0.00001

# 2 Data 처리

## 텍스트 전처리

In [None]:
# dev 데이터 더함

def load_data(path):
    train = pd.read_csv(path+'train_data.csv')
    test = pd.read_csv(path+'test_data.csv')
    train_dev = pd.read_csv(path+'train_dev.csv')
    sample_submission = pd.read_csv(path+'sample_submission.csv')

    train = pd.concat([train,train_dev], ignore_index=True)

    label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}

    train['label'] = train['label'].map(label_dict)
    train['premise']=train['premise'].map(lambda x: re.sub('[-=+.,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', x))
    train['hypothesis']=train['hypothesis'].map(lambda x: re.sub('[-=+.,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', x))
    test['premise']=test['premise'].map(lambda x: re.sub('[-=+.,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', x))
    test['hypothesis']=test['hypothesis'].map(lambda x: re.sub('[-=+.,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', x))

    return train,test,sample_submission 

def text_clean(df):
    df["premise_"] = "[CLS]" + df["premise"] + "[SEP]"
    df["hypothesis_"] = df["hypothesis"] + "[SEP]"
    df["text_sum"] = df.premise_ + " " + df.hypothesis_
    df = df[['text_sum','label']]

    return df 

train, test, sample_submission = load_data(suv)
clean_train, clean_test = text_clean(train), text_clean(test)


## CustomDataset 클래스 선언

In [None]:
#%% data loader 

class CustomDataset(Dataset):
  
  def __init__(self, dataset, option, modelname):
    self.dataset = dataset 
    self.option = option
    self.tokenizer = AutoTokenizer.from_pretrained(modelname)
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 0:2].values  # numpy array
    text = row[0]
    # y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=70,
        pad_to_max_length=True,
        add_special_tokens=False
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]
    
    if self.option =='train':
        y = row[1]
        return input_ids, attention_mask, y

    return input_ids, attention_mask


## CrossValidation 인덱스 생성

In [None]:
#%% Cross validation 

n_splits = 5

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = n_splits, shuffle=True, random_state=42)

folds=[]
for trn_idx, val_idx in skf.split(clean_train['text_sum'], clean_train['label']):
    folds.append((trn_idx,val_idx))

# 학습

## device 설정

In [None]:
device = torch.device("cuda")
device

device(type='cuda')

## 학습

### modelname

In [93]:
modelname = 'klue/roberta-large'
# modelname = 'monologg/koelectra-base-v3-discriminator'
modelname

'klue/roberta-large'

In [None]:
best_models = []
model_num = 0
for fold in range(5):
    start_time = time.time()
    print(f'=============================={fold+1}fold start==============================')

    # CV용 model 새로 선언
    model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels=3)
    model = nn.DataParallel(model).to(device)
    
    optimizer = AdamW(model.parameters(), lr=lr)
    
    train_idx = folds[fold][0]
    valid_idx = folds[fold][1]

    train_data = clean_train.loc[train_idx]
    val_data = clean_train.loc[valid_idx]

    train_dataset = CustomDataset(train_data, 'train', modelname)
    valid_dataset = CustomDataset(val_data, 'train', modelname)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    
    warmup_ratio = 0.1
    total_steps = len(train_loader) * num_epochs
    warmup_step = int(total_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=total_steps)
    valid_loss_min = 0.4
    valid_acc_max = 0.8
    
    
    for epoch in range(num_epochs):  # epoch = 10
        print(f'epoch : {epoch}-----------------------------------------------------------------')
        batches = 0
        total_loss = 0.0
        correct = 0
        total =0
        model.train()  # train 모드로 변경
        
        # 학습데이터로 학습 #############################################################
        print(f'train 학습..........')
        for input_ids_batch, attention_masks_batch, y_batch in train_loader:
            optimizer.zero_grad()  # 그래디언트 초기화
            y_batch = y_batch.to(device)  # y_batch를 gpu 올림
            y_pred = model(input_ids_batch.to(device), attention_mask = attention_masks_batch.to(device))[0]  # 순전파
            loss = F.cross_entropy(y_pred, y_batch)  # loss 계산
            loss.backward()  # 역전파
            optimizer.step()  # 가중치 업데이트
            total_loss += loss.item()
            _, predicted = torch.max(y_pred, 1)
            correct += (predicted == y_batch).sum()
            total += len(y_batch)
            batches += 1
            if batches % 100 == 0:
                acc = correct.float() / total
                print(f'iteration 누적 : {batches}, Train Loss: {total_loss:.4f}, Train Accuracy : {acc.item():.4f}')
        acc = correct.float() / total
        print(f'iteration 누적 : {batches}, Train Loss: {total_loss:.4f}, Train Accuracy : {acc.item():.4f}')
        # 학습데이터로 학습 #############################################################


        # 검증데이터로 검증 #############################################################
        val_loss = []
        val_acc = []
        print(f'validation 검증..........')
        for input_ids_batch, attention_masks_batch, y_batch in valid_loader:
            
            model.eval()
            with torch.no_grad():
                y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
                valid_loss = F.cross_entropy(y_pred,y_batch.to(device)).cpu().detach().numpy()

                preds = torch.argmax(y_pred,1)
                preds = preds.cpu().detach().numpy()
                y_batch = y_batch.cpu().detach().numpy()
                batch_acc = (preds==y_batch).mean()
                val_loss.append(valid_loss)
                val_acc.append(batch_acc)

        val_loss = np.mean(val_loss)
        val_acc = np.mean(val_acc)
        scheduler.step()
        print(f'Valid Loss: {val_loss:.4f}, Valid Accuracy : {val_acc:.4f}')
        print(f'Learning rate : {optimizer.param_groups[0]["lr"]:.6f}')
        # 검증데이터로 검증 #############################################################

        if valid_acc_max < val_acc:  # 이전보다 좋으면 best_models에 모델을 추가
            valid_acc_max = val_acc
            best_models.append(model)
            torch.save(model, f'koelectra-adddata{model_num}.pth')  # 모델 저장
            model_num += 1
            print(f'model \'koelectra-adddata{model_num}.pth\' save. the number of best_models: {len(best_models)}, model val acc : {val_acc:.6f}******************')
        print(f'--------------------------------------------------------------------------------')

    print(f'{fold+1}fold elapsed time : {time.time() - start_time}')
    

In [None]:
test_dataset = CustomDataset(clean_test,'test')
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

preds = []
for idx, m in enumerate(best_models): 
    print(f'{idx+1}/{len(best_models)}번째 모델 예측 진행중')
    bestm = m
    bestm.eval()
    answer = []
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch in tqdm(test_loader):
            y_pred = bestm(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0].detach().cpu().numpy()
            answer.extend(y_pred.argmax(axis=1))
            
    preds.append(answer)

1/6번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.12it/s]


2/6번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.17it/s]


3/6번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.17it/s]


4/6번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.17it/s]


5/6번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.17it/s]


6/6번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.17it/s]


In [None]:
from collections import Counter

np_pred = np.array(preds).T

pred = []
for i in range(1666):
    cnt = Counter(np_pred[i])
    pred.append(cnt.most_common()[0][0])
    

In [None]:
label_dict1 = {0:"entailment" , 1: "contradiction" , 2:"neutral"}

sample_submission['label'] = [label_dict1[_] for _ in pred]

In [None]:
sample_submission

Unnamed: 0,index,label
0,0,contradiction
1,1,neutral
2,2,entailment
3,3,contradiction
4,4,contradiction
...,...,...
1661,1661,neutral
1662,1662,entailment
1663,1663,neutral
1664,1664,neutral


In [None]:
sample_submission.to_csv(suv + 'aaaaaaaaa.csv', index=False)

In [None]:
suv + 'kc_roberta-large_3fold.csv'

'data/kc_roberta-large_3fold.csv'

In [None]:
os.getcwd()

'/content/drive/MyDrive/015GithubRepos/Dacon_sentence_classification'

In [None]:
for _ in np_pred[:50]:
    print(f'{_}\t{Counter(_)}')

[1 1 1 1 1 1]	Counter({1: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[0 0 0 0 0 0]	Counter({0: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[1 1 1 2 2 2]	Counter({1: 3, 2: 3})
[2 2 2 0 0 0]	Counter({2: 3, 0: 3})
[0 0 0 0 0 0]	Counter({0: 6})
[2 2 2 1 1 1]	Counter({2: 3, 1: 3})
[1 1 1 1 1 1]	Counter({1: 6})
[0 0 0 0 0 0]	Counter({0: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[0 0 0 0 0 0]	Counter({0: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[0 0 0 2 2 2]	Counter({0: 3, 2: 3})
[0 0 0 0 0 0]	Counter({0: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[0 0 0 2 2 2]	Counter({0: 3, 2: 3})
[0 0 0 0 0 0]	Counter({0: 6})
[0 0 0 0 0 0]	Counter({0: 6})
[2 2 2 2 2 2]	Counter({2: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[1 1 1 1 1 1]	Counter({1: 6})
[1 1 1 1 1

In [None]:
for _ in range(11, 11+len(best_models)):
    torch.save(best_models[_-11], f'roberta-large{_}.pth')

In [None]:
torch.load()

# 불러와서 예측

In [None]:
# import time

# test_dataset = CustomDataset(clean_test,'test')
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)
# preds = []
# for _ in range(13):
#     start = time.time()
#     print(f'{_+1}/13 번째 모델 예측 진행중')
#     m = torch.load(f'roberta-large-adddata{_}.pth')
#     m.eval()
#     answer = []
#     with torch.no_grad():
#         for input_ids_batch, attention_masks_batch in tqdm(test_loader):
#             y_pred = m(
#                 input_ids_batch.to(device),
#                 attention_mask=attention_masks_batch.to(device)
#                 )[0].detach().cpu().numpy()
#             answer.extend(y_pred.argmax(axis=1))
            
#     preds.append(answer)
#     print(f'elapsed time : {time.time() - start}')

In [91]:
import time

test_dataset = CustomDataset(clean_test,'test', modelname)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)
preds = []
for _ in range(23):
    start = time.time()
    print(f'{_+1}/23 번째 모델 예측 진행중')
    m = torch.load(f'koelectra-adddata{_}.pth')
    m.eval()
    answer = []
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch in tqdm(test_loader):
            y_pred = m(
                input_ids_batch.to(device),
                attention_mask=attention_masks_batch.to(device)
                )[0].detach().cpu().numpy()
            answer.extend(y_pred.argmax(axis=1))
            
    preds.append(answer)
    print(f'elapsed time : {time.time() - start}')

TypeError: ignored

In [None]:
from collections import Counter

np_pred = np.array(preds).T

pred = []
for i in range(1666):
    cnt = Counter(np_pred[i])
    pred.append(cnt.most_common()[0][0])

In [None]:
label_dict1 = {0:"entailment" , 1: "contradiction" , 2:"neutral"}

sample_submission['label'] = [label_dict1[_] for _ in pred]

In [None]:
for _ in np_pred[:50]:
    print(f'{_}\t{Counter(_).most_common()[0][0]}\t{Counter(_)}')

In [None]:
sample_submission.to_csv(suv + 'roberta-large-adddata.csv', index=False)

# 불러와서 앙상블

In [94]:
def soft_max(x):
    
    max = np.max(x,axis=1,keepdims=True) #returns max of each row and keeps same dims
    e_x = np.exp(x - max) #subtracts each row with its max value
    sum = np.sum(e_x,axis=1,keepdims=True) #returns sum of each row and keeps same dims
    f_x = e_x / sum 
    return f_x

## roBERTa-large

In [95]:
import time

modelname = 'klue/roberta-large'
print(f'불러오는 model name : {modelname}')

test_dataset = CustomDataset(clean_test,'test', modelname)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)
sum_probs = np.zeros((test_dataset.dataset.shape[0], 3), dtype = 'f')

for _ in range(13):
    # 모델 1개로 예측 ----------------------------------------------------------
    start = time.time()
    print(f'{_+1}/13 번째 모델 예측 진행중')
    m = torch.load(f'roberta-large-adddata{_}.pth')
    m.eval()
    probs = np.empty((0,3), float)

    with torch.no_grad():
        for input_ids_batch, attention_masks_batch in tqdm(test_loader):
            y_pred = m(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0].detach().cpu().numpy()
            probs = np.vstack((probs, soft_max(np.array(y_pred))))  # 3가지 클래스에 대한 확률값. 행렬 shape : 사이즈 X 3
            
    sum_probs += probs  # 예측 끝나면 더해줌 ********************************
    print(f'elapsed time : {time.time() - start}')
    # 모델 1개로 예측 ----------------------------------------------------------

불러오는 model name : klue/roberta-large
1/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.13it/s]


elapsed time : 20.71603536605835
2/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.88239598274231
3/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.401285648345947
4/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.439566612243652
5/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.343756914138794
6/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.37973976135254
7/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.416035413742065
8/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 21.22062063217163
9/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.573599338531494
10/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 19.985865592956543
11/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 20.902229070663452
12/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]


elapsed time : 21.126691102981567
13/13 번째 모델 예측 진행중


100%|██████████| 27/27 [00:12<00:00,  2.14it/s]

elapsed time : 20.680630207061768





## KoELECTRA

In [96]:
modelname = 'monologg/koelectra-base-v3-discriminator'
print(f'불러오는 model name : {modelname}')

test_dataset = CustomDataset(clean_test,'test', modelname)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

for _ in range(23):
    # 모델 1개로 예측 ----------------------------------------------------------
    start = time.time()
    print(f'{_+1}/23 번째 모델 예측 진행중')
    m = torch.load(f'koelectra-adddata{_}.pth')
    m.eval()
    probs = np.empty((0,3), float)

    with torch.no_grad():
        for input_ids_batch, attention_masks_batch in tqdm(test_loader):
            y_pred = m(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0].detach().cpu().numpy()
            probs = np.vstack((probs, soft_max(np.array(y_pred))))  # 3가지 클래스에 대한 확률값 배치만큼 쌓기. 행렬 shape : 사이즈 X 3
            
    sum_probs += probs  # 모델 예측 끝나면 더해줌 ********************************
    print(f'elapsed time : {time.time() - start}')
    # 모델 1개로 예측 ----------------------------------------------------------

불러오는 model name : monologg/koelectra-base-v3-discriminator


Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

1/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.94it/s]


elapsed time : 6.899779796600342
2/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.93it/s]


elapsed time : 7.347826719284058
3/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.95it/s]


elapsed time : 7.369963884353638
4/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.94it/s]


elapsed time : 7.72137975692749
5/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.94it/s]


elapsed time : 7.321476936340332
6/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.94it/s]


elapsed time : 7.6625401973724365
7/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.92it/s]


elapsed time : 7.623301267623901
8/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.89it/s]


elapsed time : 7.724207639694214
9/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.94it/s]


elapsed time : 7.468636512756348
10/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.93it/s]


elapsed time : 7.431588411331177
11/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.93it/s]


elapsed time : 7.614890098571777
12/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.94it/s]


elapsed time : 7.521316766738892
13/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.93it/s]


elapsed time : 7.645878314971924
14/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.91it/s]


elapsed time : 7.5557451248168945
15/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  6.02it/s]


elapsed time : 7.79499363899231
16/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  6.00it/s]


elapsed time : 7.845914840698242
17/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.99it/s]


elapsed time : 7.3764402866363525
18/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.97it/s]


elapsed time : 7.684270858764648
19/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.98it/s]


elapsed time : 7.654652118682861
20/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.93it/s]


elapsed time : 7.762315988540649
21/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.99it/s]


elapsed time : 7.611638784408569
22/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.99it/s]


elapsed time : 7.765585422515869
23/23 번째 모델 예측 진행중


100%|██████████| 27/27 [00:04<00:00,  5.99it/s]

elapsed time : 7.627519845962524





In [97]:
pred = sum_probs.argmax(axis=1)

In [98]:
label_dict1 = {0:"entailment" , 1: "contradiction" , 2:"neutral"}

sample_submission['label'] = [label_dict1[_] for _ in pred]

In [99]:
sample_submission.head(5)

Unnamed: 0,index,label
0,0,contradiction
1,1,neutral
2,2,entailment
3,3,contradiction
4,4,contradiction


In [101]:
sample_submission.to_csv(suv + 'robertaNkoelectra_ensem.csv', index=False)