In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


https://dacon.io/competitions/official/235747/codeshare/3054?page=1&dtype=recent

In [3]:
import pandas as pd
import os 
import torch

In [4]:
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

print(device)

cuda:0


# Preprocessing

In [5]:
# PATH = './dataset/' # for local
PATH = '/content/drive/MyDrive/Colab_Notebooks/jw_dacon_news_classification/dataset/'    # for colab

In [6]:
train_data = pd.read_csv(os.path.join(PATH, 'train_data.csv'), encoding='utf-8')
test_data = pd.read_csv(os.path.join(PATH, 'test_data.csv'), encoding='utf-8')

topic_dict = pd.read_csv(os.path.join(PATH, 'topic_dict.csv'), encoding='utf-8')

In [7]:
train_data

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4
...,...,...,...
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,45652,답변하는 배기동 국립중앙박물관장,2


In [8]:
train_data = train_data.drop('index', axis=1)
test_data = test_data.drop('index', axis=1)

# Make Dataset, DataLoader

In [9]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 67.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [10]:
pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.2-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 9.1 MB/s 
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 71.9 MB/s 
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.9.5-py2.py3-none-any.whl (157 kB)
[K     |████████████████████████████████| 157 kB 69.0 MB/s 
[?25hCollecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-p

In [11]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [12]:
# from eunjeon import Mecab
# from kobert_tokenizer import KoBERTTokenizer
from transformers import AdamWeightDecay
from transformers import AutoTokenizer
from tqdm import tqdm
import numpy as np

import torch

In [13]:
# Dataset class when using SKF
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, data_df, tokenizer, max_seq_len, pad_idx, mode):
        self.data = data_df
        self.tokenizer = tokenizer
        self.max_len = max_seq_len
        self.pad_idx = pad_idx
        if mode=='test':
            self.mode = mode
        else:
            self.mode = 'train'
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context = self.tokenizer.encode(self.data['title'][idx])[1:-1]
        con_len = len(context)
        pad_li = [self.pad_idx]*(self.max_len - con_len)
        context += pad_li
        context = torch.LongTensor(context)
        
        if self.mode == 'train':
            label_idx = int(self.data['topic_idx'][idx])
            label = torch.zeros(7)
            label[label_idx] += 1
            return context, label
        return context,

## Stratified K-Fold

- 데이터셋의 레이블 분포에 차이가 있기에 이를 고려한 Startified K-Fold 를 해준다.
- Startified-5-Fold 를 이용하여 각 80%의 train_data로 학습한 모델 5개를 test데이터에 대하여 앙상블한 것을 최종 결과로 채택한다.
- 

In [14]:
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold     # GT를 자동인식해서 맞게 분배한다.

In [15]:
# Cross validation, StratifiedKfold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]

# label에 비례하는 idx를 만든다.
for train_idx, valid_idx in skf.split(train_data, train_data['topic_idx']):
    train_idx = np.array(train_idx)
    valid_idx = np.array(valid_idx)
    folds.append((train_idx, valid_idx))

In [16]:
# fold 나누기
train_data0 = train_data.iloc[folds[0][0]].reset_index(drop=True)
train_data1 = train_data.iloc[folds[1][0]].reset_index(drop=True)
train_data2 = train_data.iloc[folds[2][0]].reset_index(drop=True)
train_data3 = train_data.iloc[folds[3][0]].reset_index(drop=True)
train_data4 = train_data.iloc[folds[4][0]].reset_index(drop=True)

val_data0 = train_data.iloc[folds[0][1]].reset_index(drop=True)
val_data1 = train_data.iloc[folds[1][1]].reset_index(drop=True)
val_data2 = train_data.iloc[folds[2][1]].reset_index(drop=True)
val_data3 = train_data.iloc[folds[3][1]].reset_index(drop=True)
val_data4 = train_data.iloc[folds[4][1]].reset_index(drop=True)

In [17]:
BATCH_SIZE = 64
MAX_LEN = 30

tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

# make dataset
train_dataset=[]
train_dataset.append(MyDataset(data_df=train_data0, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='train'))
train_dataset.append(MyDataset(data_df=train_data1, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='train'))
train_dataset.append(MyDataset(data_df=train_data2, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='train'))
train_dataset.append(MyDataset(data_df=train_data3, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='train'))
train_dataset.append(MyDataset(data_df=train_data4, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='train'))

val_dataset=[]
val_dataset.append(MyDataset(data_df=val_data0, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='eval'))
val_dataset.append(MyDataset(data_df=val_data1, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='eval'))
val_dataset.append(MyDataset(data_df=val_data2, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='eval'))
val_dataset.append(MyDataset(data_df=val_data3, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='eval'))
val_dataset.append(MyDataset(data_df=val_data4, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='eval'))

Downloading tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

## Normal Training set



In [None]:
# BATCH_SIZE = 64
# MAX_LEN = 30
# tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

# train_dataset = MyDataset(
#     data_df=train_data,
#     tokenizer=tokenizer,
#     max_seq_len=MAX_LEN,
#     pad_idx=1,
#     mode='train'
# )
# test_dataset = MyDataset(
#     data_df=test_data,
#     tokenizer=tokenizer,
#     max_seq_len=MAX_LEN,
#     pad_idx=1,
#     mode='test'
# )

# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Build Model

transformer 의 encdoer n개를 통과시켜, 마지막 output중 첫번째 token이 Affine Layer을 지나는 model이다.

- d_model = 512
- d_word_vec = 512
- n_layers = 6
- n_head = 8
- d_inner = 2048
- dropout = 0.1
- n_position = 50
- pad_idx = 1

In [18]:
tokenizer.vocab['[PAD]']

1

In [19]:
pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


In [25]:
cd /content/drive/MyDrive/Colab_Notebooks/jw_dacon_news_classification/

/content/drive/MyDrive/Colab_Notebooks/jw_dacon_news_classification


In [26]:
pwd

'/content/drive/MyDrive/Colab_Notebooks/jw_dacon_news_classification'

In [27]:
from torch import nn
from model.Models import Encoder
from einops import rearrange

def get_pad_mask(seq, pad_idx):
    return (seq != pad_idx).unsqueeze(-2)

class TransformerClassifier(nn.Module):
    
    def __init__(self, n_vocab, pad_idx, d_word_vec=512, d_model=512, d_inner=2048,
                 n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=50, num_labels=7,max_seq_len=30):
        
        super().__init__()
        self.encoder = Encoder(n_src_vocab=n_vocab, d_word_vec=d_word_vec, n_layers=n_layers, 
                               d_inner=d_inner, n_head=n_head, d_k=d_k, d_v=d_k, d_model=d_model, 
                               pad_idx=pad_idx, dropout=dropout, n_position=n_position, scale_emb=False)
        self.linear1 = nn.Linear(max_seq_len*d_model, d_model, bias=True)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_model, num_labels, bias=True)
        self.pad_idx = pad_idx
    
    def forward(self, context):
        
        mask = get_pad_mask(context, self.pad_idx)
        enc_output, *_ = self.encoder(context, mask)
        lin_output = self.relu(self.linear1(rearrange(enc_output,'b s d -> b (s d)')))
        seq_logit = self.relu(self.linear2(lin_output))
        
        return seq_logit

## ERROR logs when building model

### ERROR 1 : nn.embedding() 은 input Tensor가 int 형이어야한다.
c:\Users\jaeng\Desktop\VSC\DACON\text_classification\main.ipynb 셀 23 in TransformerClassifier.forward(self, context)
...
   2197     # remove once script supports set_grad_enabled
   2198     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

------------------------

학습 해보려 하는데,
- 모델 내부에서 int로 받아야하는 'indices'라는 놈에 floatTensor 가 들어가고있어 말썽인 상황이다. 내일 해결해봐야지.
- torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse) 의 첫번째 argument weight가 Int이어야하는데 Float로 들어온건가?

### ERROR 2: 
-> 3014 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

IndexError: Target -9223372036854775808 is out of bounds.

model 어딘가에서 normalization 일어나고있지 않나?
linear 결과에 norm 추가해야하나? 다른 모델 봐야겠다.
- Y 찍어보니 그 안에 엄청 큰 절댓값이 존재한다.. 뭐지?

-1.9357e+16 이 값이, -19356592969351168 이 된다. 왜 저 값이 존재하지?
Y label 만드는 과정에서 오류가 있을거다.

### ERROR 3: model 과 Tensor 의 .to(device) 함수 차이
https://stackoverflow.com/questions/59560043/what-is-the-difference-between-model-todevice-and-model-model-todevice

---> 36 train(classifier_model, train_loader, optimizer, device, criterion)
AttributeError: 'TransformerClassifier' object has no attribute 'device'

Model can be placed in GPU with code,
```
a = my_model()  # a is in cpu
a.to(device)    # a is moved to gpu
```

But Tensor cannot be moved to GPU with the same code.
```
a = torch.Tensor([1,2,3])
a.to(device)    # a is in cpu
a = a.to(devivce)   # a is now in gpu
```

### ERROR 4: 
RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

https://ndb796.tistory.com/744  을 보고 해결했다.
- 현재 커널의 python 버전과, CUDA 버전이 호환되지 않아서 발생하는 문제다.
- python 버전을 3.6 -> 3.8 업그레이드함으로 해결했다.(사실은 conda 환경을 새로 팜)
    - 내가 가진 CUDA버전이 3.6이랑 호환이 안된다.

### ERROR 5: 모델 반환모양이 ( b seq class ) 이다. ( b c ) 이어야하는데..?

in model forward... context.shape torch.Size([64, 30])
in model forward... mask.shape torch.Size([64, 1, 30])
in model forward... enc_output.shape torch.Size([64, 30, 512])      # linear 을 잘못 수행하였다.
in model forward... after_linear.shape torch.Size([64, 30, 7])
-> linear 추가해서, 한번 더 거치게 하였다.

# Train !


In [28]:
CFG = {
    'EPOCHS':12,
    'LEARNING_RATE':1e-5,
    "BATCH_SIZE":128,
    'SEED':42
}

In [38]:
classifier_model = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                        d_word_vec=512, d_model=512, d_inner=2048,
                                        n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)

optimizer = torch.optim.Adam(params = classifier_model.parameters(), lr = CFG['LEARNING_RATE'])
criterion = nn.CrossEntropyLoss()

# MODEL_PATH = './ckpt/'    # for locab

MODEL_PATH = './ckpt/' # for colab

In [30]:
def evaluate(model, eval_loader, loss_fn):
    """
        input: model, data_loader, loss_fn
        output: loss, acc
    """
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    with torch.no_grad():
        for batch in eval_loader:    # ERROR1: 
            # print('================INSIDE evaluate()==================\n\n')
            # print('what eval_loader gives: ',batch)
            X = batch[0]
            gt = batch[1].to(device)
            Y = torch.argmax(batch[1], dim=1)
            # print('Y exists in the form of: ', Y)
            X = X.to(device)
            Y = Y.to(device)
            output = model(X)
            epoch_loss += loss_fn(output, Y).item()
            epoch_acc += binary_accuracy(output, gt).item()
            # print('================================================\n\n')
    
    # print('================INSIDE evaluate()==================\n\n')
    # print('epoch_loss: ', epoch_loss)
    # print('epoch_ascc: ', epoch_acc)
    # print('epoch_loss_compute', epoch_loss/len(eval_loader))
    # print('epoch_acc_compute', epoch_acc/len(eval_loader))
    # print('epoch_loss_type', type(epoch_loss/len(eval_loader)))
    # print('epoch_acc_type', type(epoch_acc/len(eval_loader)))
    # print('================================================\n\n')
    return epoch_loss/len(eval_loader), epoch_acc/len(eval_loader)


def binary_accuracy(pred, gt):
    # pred (B score)
    # gt (B binary_value)
    pred_class = torch.argmax(pred, dim=1)  # (B, )
    gt_class = torch.argmax(gt, dim=1)      # (B, )
    result = pred_class==gt_class
    
    return result.sum()/len(result)

def show_me_what_you_got(pred,gt):
    """
    Just to validate the accuracy by my eyes.
    """
    print(
        '\n********************** SHOW ME WHAT YOU GOT, MODEL. **********************'
    )
    pred_class = torch.argmax(pred, dim=1)  # (B,)
    gt_class = torch.argmax(gt, dim=1)
    print('pred_class: ',pred_class)
    print('gt_class: ', gt_class)
    result = pred_class==gt_class
    print('Acc: ',result.sum()/len(result))
    print(
        '********************************* THANKS *********************************\n'
    )
    

In [80]:
import os

def train(model, train_loader, valid_loader, optimizer, scheduler, device, criterion, ckpt_path):
    model.to(device)
    criterion = criterion.to(device)   # loss
    best_val_loss = 1e5

    for epoch in range(1,CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_acc = []

        for X,Y in tqdm(train_loader, desc='Training ...'):
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            
            # Forward
            output = model(X)
            loss = criterion(output, Y)
            acc = binary_accuracy(output, Y)    # acc for each batch -- is this right..?
            
            # Backward
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
            train_acc.append(acc.item())
        # scheduler.step()

        print(f"Epoch {epoch}  |  Train Loss : [{np.mean(train_loss):.5f}]  |  Train Acc: [{np.mean(train_acc):5f}] ")
        
        # Check if metric works well (per epoch)
        # show_me_what_you_got(output, Y)
        # If Validation 
        if valid_loader!=None:
            
            val_loss, val_acc = evaluate(model, valid_loader, criterion)
            
            print(f"Epoch {epoch}  |  Valid Loss : [{np.mean(val_loss):.5f}]  |  Valid Acc: [{np.mean(val_acc):5f}] ")
            # Best Model
            if val_loss < best_val_loss: 
                best_val_loss = val_loss
                torch.save(model.state_dict(), os.path.join(ckpt_path, 'best_model.pth'))
                print('============= Best model saved! =============')
        
        # Model Saving
        torch.save(model.state_dict(), os.path.join(ckpt_path, f'model_epoch{epoch}.pth'))
        print('================ model saved ================')
        
        
        

# simple trianing for whole training set

In [None]:
classifier_model = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                        d_word_vec=512, d_model=512, d_inner=2048,
                                        n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)
optimizer = torch.optim.Adam(params = classifier_model.parameters(), lr = CFG['LEARNING_RATE'])
criterion = nn.CrossEntropyLoss()
ckpt_path = os.path.join(MODEL_PATH, 'simple_train')

train(
    model=classifier_model,
    train_loader=train_loader,
    valid_loader=None,
    optimizer=optimizer,
    device=device,
    criterion=criterion,
    ckpt_path=ckpt_path
    )

NameError: ignored

#### FOUND ERROR

accuracy 평가방식을 완전 잘못 구현했었다. 그래서 틀린게 많을수록 이상한 acc가 높게 나왔던 것이라 생각된다.<br/>
-> 결과에 argmax를 안하고 비교를 해서 엉망진창이었다. 해결!

## Simaple Train Data - Test Data Inference

In [None]:
test_dataset[0]

In [None]:
# Inference
import pandas as pd
import numpy as np

def predict(model, test_loader, device):
    model.to(device)
    model.eval()
    preds=[]
    with torch.no_grad():
        for X, in tqdm(test_loader, desc='Inferencing ...'):
            X = X.to(device)
            
            pred = model(X)
            preds += pred.cpu().tolist()
    
    # pd.DataFrame(preds).to_csv('test_result.csv')
    result2classidx(preds)
    print('======= Saved as \'test_result.csv\' =======')

def result2classidx(preds):
    """
    preds (N class_num=7)
    """
    result = []
    for i, data_idx in enumerate(range(45654, 54785)):
        topic_index = np.argmax(preds[i])
        # interrupt : test결과보고, train에서 truth랑 비교하는부분 내가 argmax 신경 잘썼는지 체크해봐야겠다 생각.
t = [1,2,3,4,5]
result2classidx(t)

In [None]:
predict(
    model = classifier_model,
    test_loader=test_loader,
    device=device
)

# Stratified K-fold Data - training 

In [82]:
# import torch.optim as optim
import os

for fold in range(5):
    print(f'Fold {fold} Processing ...')
    
    classifier_model = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                        d_word_vec=512, d_model=512, d_inner=2048,
                                        n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)
    optimizer = torch.optim.Adam(params = classifier_model.parameters(), lr = CFG['LEARNING_RATE'])
    # scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-2, steps_per_epoch=571, epochs=12, anneal_strategy='linear')
    criterion = nn.CrossEntropyLoss()
    train_loader = DataLoader(train_dataset[fold], batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset[fold], batch_size=BATCH_SIZE, shuffle=False)
    ckpt_path = os.path.join(MODEL_PATH, f'fold{fold}')
    
    train(model=classifier_model, train_loader=train_loader,valid_loader=val_loader, 
          optimizer=optimizer, scheduler=scheduler, device=device, 
          criterion=criterion, ckpt_path=ckpt_path)

Fold 0 Processing ...


Training ...: 100%|██████████| 571/571 [00:37<00:00, 15.28it/s]


Epoch 1  |  Train Loss : [1.62688]  |  Train Acc: [0.376607] 
Epoch 1  |  Valid Loss : [1.17452]  |  Valid Acc: [0.572143] 


Training ...:  12%|█▏        | 70/571 [00:04<00:33, 15.15it/s]


KeyboardInterrupt: ignored

### ERRORS

##### ERROR 1 : ValueError: not enough values to unpack (expected 2, got 1)
- it was 코드 오타


#### ERROR 1 : Loss[1.94591], Acc[1.105499],[1.105548] for "fold1,4"
- Train Loss, Valid Loss 값이 똑같게 나온다.
- Accuracy 는 다르지만, 거의 비슷한 수준이다.

1. 아예 같은 Loss 가 나온다는 것 은, 같은 데이터가 들어갔다는 것 이고,
2. 다른 데이터셋인데 Train, Valid Loss가 같다는것은, --> 
    - weight 들이 .eval() 모드에서 잘 loaded 됐나?
3. 한 데이터셋에서 (Valid)Acc가 같다는 것은, --> epoch마다 train data를 통해 학습이 전혀 이뤄지지 않았다는 것 이다.

### ISSUES

##### Issues 1
1. 왜 epoch10 이후로 성능이 확 감소한거지? train에 overfitting 된것도 아니고, lr 문제인가?
    - acc 산식이 완전 틀렸었다.
2. 왜 inital training 시, accuracy 가 train, valid 둘 다 50%에 달하는거지?  binary classification도 아니고, 
    - acc 산식이 완전 틀렸었다.

# Ensemble

http://www.dinnopartners.com/__trashed-4/ 참조함.

1. 각 모델 load
2. 각 모델 predict
    - 1,4 모델 reliability check 선행
3. 각 모델 Hard Voting -> generalized prediction.

## 1. Load

In [40]:
# 1. 각 모델 loading
import os


model_0 = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                d_word_vec=512, d_model=512, d_inner=2048,
                                n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)
model_0.load_state_dict(torch.load('./ckpt/fold0/best_model.pth'))
model_0.eval()

model_1 = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                d_word_vec=512, d_model=512, d_inner=2048,
                                n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)
model_1.load_state_dict(torch.load('./ckpt/fold1/best_model.pth'))
model_1.eval()

model_2 = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                d_word_vec=512, d_model=512, d_inner=2048,
                                n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)
model_2.load_state_dict(torch.load('./ckpt/fold2/best_model.pth'))
model_2.eval()

model_3 = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                d_word_vec=512, d_model=512, d_inner=2048,
                                n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)
model_3.load_state_dict(torch.load('./ckpt/fold3/best_model.pth'))
model_3.eval()

model_4 = TransformerClassifier(n_vocab=32000, pad_idx=tokenizer.vocab['[PAD]'],
                                d_word_vec=512, d_model=512, d_inner=2048,
                                n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=MAX_LEN)
model_4.load_state_dict(torch.load('./ckpt/fold4/best_model.pth'))
model_4.eval()



TransformerClassifier(
  (encoder): Encoder(
    (src_word_emb): Embedding(32000, 512, padding_idx=1)
    (position_enc): PostiionalEncoding()
    (dropout): Dropout(p=0.1, inplace=False)
    (layer_stack): ModuleList(
      (0): EncoderLayer(
        (slf_attn): MultiHeadAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (fc): Linear(in_features=512, out_features=512, bias=False)
          (attention): ScaledDotProductAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        )
        (pos_ffn): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias

## 2. Predict

In [41]:
def predict(model, test_loader, device):
    model.to(device)
    model.eval()
    preds=[]
    with torch.no_grad():
        for X, in tqdm(test_loader, desc='Inferencing ...'):
            X = X.to(device)
            
            pred = model(X)
            pred = torch.argmax(pred.cpu(), dim=1)
            pred = np.array(pred).tolist()
            preds += pred
    
    return preds

## Test Data

In [42]:
test_dataset = MyDataset(data_df=test_data, tokenizer=tokenizer, max_seq_len=MAX_LEN, pad_idx = 1, mode='test')

# make test loader
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Model Load

In [43]:
# preds_0 = predict(model_0, test_loader, device)
preds_1 = predict(model_1, test_loader, device)
preds_2 = predict(model_2, test_loader, device)
preds_3 = predict(model_3, test_loader, device)
preds_4 = predict(model_4, test_loader, device)

indices = np.arange(45654,54784+1).tolist()

col_name = ['index', 'topic_idx']

Inferencing ...: 100%|██████████| 143/143 [00:03<00:00, 42.73it/s]
Inferencing ...: 100%|██████████| 143/143 [00:03<00:00, 43.40it/s]
Inferencing ...: 100%|██████████| 143/143 [00:03<00:00, 43.69it/s]
Inferencing ...: 100%|██████████| 143/143 [00:03<00:00, 42.80it/s]
Inferencing ...: 100%|██████████| 143/143 [00:03<00:00, 43.39it/s]


Voating

In [67]:
# Model들의 결과를 hard voting 해줬다.
import numpy as np

preds = []
for i in range(len(preds_1)):
    logit = []
    # logit += [preds_1[i]]     홀수개수 맞춰준다.
    logit += [preds_2[i]]
    logit += [preds_3[i]]
    logit += [preds_4[i]]
    counts = np.array([0,0,0,0,0,0,0])

    for i in range(4):
        counts[logit[0]] += 1
    
    preds.append(np.argmax(counts))


9131

In [68]:
test_inference = pd.DataFrame({
    'index':indices,
    'topic_idx':preds
})

test_inference.to_csv('./prediction/pred_ensembled.csv',index=False)