In [1]:
import os
import torch
import pandas as pd
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification, AdamW
from tqdm import tqdm

### Dataset 및 DataLoader 생성

In [2]:
# do_lower_case must be True here:
# bert-base-uncased was trained on English text that had been converted to lowercase.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [3]:
class CoLADataset(Dataset):
    """CoLA sentences, tokenized once when the dataset is constructed.

    The split that is read depends on the two flags:

    * ``is_train=True`` reads ``raw/in_domain_train.tsv`` (``is_inference`` is ignored);
    * ``is_train=False, is_inference=False`` reads ``raw/in_domain_dev.tsv``;
    * ``is_train=False, is_inference=True`` reads ``raw/out_of_domain_dev.tsv``.

    Args:
        path: Directory that contains the ``raw/`` folder with the ``.tsv`` files.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=True)``; its result
            must provide ``'input_ids'``, ``'token_type_ids'`` and
            ``'attention_mask'``.
        is_train: Select the training split.
        is_inference: When ``is_train`` is False, select the out-of-domain dev
            split instead of the in-domain dev split.
    """

    def __init__(self, path, tokenizer, is_train=True, is_inference=False):
        if is_train:
            split_file = 'raw/in_domain_train.tsv'
        elif is_inference:
            split_file = 'raw/out_of_domain_dev.tsv'
        else:
            split_file = 'raw/in_domain_dev.tsv'
        filename = os.path.join(path, split_file)

        # names= supplies the column names, so the first row of the file is read as data.
        df = pd.read_csv(filename, sep='\t', names=['source', 'label', 'judgement', 'text'])

        self.input_ids = []
        self.token_type_ids = []
        self.attention_mask = []
        for t in df.text:
            # truncation=True asks the tokenizer to cut an over-long sentence down
            # to its maximum supported length instead of emitting a sequence the
            # model cannot embed.
            inp = tokenizer(t, return_tensors='pt', truncation=True)
            self.input_ids.append(inp['input_ids'])
            self.token_type_ids.append(inp['token_type_ids'])
            self.attention_mask.append(inp['attention_mask'])
        self.label = df.label

    def __len__(self):
        """Return the number of tokenized sentences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``[input_ids, token_type_ids, attention_mask, label]`` for one example.

        The three encodings are returned exactly as the tokenizer produced them.
        The label is looked up by position (``.iloc``) so that it follows the same
        indexing rules as the token lists, including negative indices.
        """
        return [
            self.input_ids[idx],
            self.token_type_ids[idx],
            self.attention_mask[idx],
            self.label.iloc[idx],
        ]

In [4]:
# Both splits are read from the same CoLA directory; only the is_train flag differs.
cola_root = '../../data/cola_classification'
train_dataset = CoLADataset(cola_root, tokenizer, is_train=True)
eval_dataset = CoLADataset(cola_root, tokenizer, is_train=False)

In [5]:
# Number of examples in the training split and in the in-domain dev split.
len(train_dataset), len(eval_dataset)

(8551, 527)

In [6]:
def collate_fn(batch):
    """Merge a list of dataset items into one padded batch.

    Args:
        batch: Sequence of items shaped like
            ``[input_ids, token_type_ids, attention_mask, label]`` where each
            encoding is indexable so that ``encoding[0]`` is the 1-D token sequence.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask, label)``; the three
        encodings are padded to the longest sequence in the batch (batch first),
        and ``label`` is a 1-D tensor with one entry per item.
    """
    def _padded(position):
        # Row 0 of each stored encoding is the actual sequence to be padded.
        sequences = [sample[position][0] for sample in batch]
        return pad_sequence(sequences, batch_first=True)

    label = torch.tensor([sample[3] for sample in batch])
    return _padded(0), _padded(1), _padded(2), label

In [7]:
# collate_fn is the place for work that has to be done per batch.
# For example, the model needs inputs of a uniform size, so the sequences must be
# brought to the same length with something like pad_sequence.
# Doing that inside the Dataset would use more memory than necessary, so
# pad_sequence is applied through collate_fn each time a batch is assembled.
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=32, shuffle=True)

In [8]:
# Sanity check: print the tensor shapes of the first 11 batches. The sequence
# dimension can differ between batches because padding is applied per batch.
for i, d in enumerate(train_dataloader):
    if i >= 11:
        break
    input_ids, token_type_ids, attention_mask, labels = d
    print(*(tensor.shape for tensor in (input_ids, token_type_ids, attention_mask, labels)))

torch.Size([32, 18]) torch.Size([32, 18]) torch.Size([32, 18]) torch.Size([32])
torch.Size([32, 18]) torch.Size([32, 18]) torch.Size([32, 18]) torch.Size([32])
torch.Size([32, 21]) torch.Size([32, 21]) torch.Size([32, 21]) torch.Size([32])
torch.Size([32, 35]) torch.Size([32, 35]) torch.Size([32, 35]) torch.Size([32])
torch.Size([32, 19]) torch.Size([32, 19]) torch.Size([32, 19]) torch.Size([32])
torch.Size([32, 24]) torch.Size([32, 24]) torch.Size([32, 24]) torch.Size([32])
torch.Size([32, 18]) torch.Size([32, 18]) torch.Size([32, 18]) torch.Size([32])
torch.Size([32, 28]) torch.Size([32, 28]) torch.Size([32, 28]) torch.Size([32])
torch.Size([32, 22]) torch.Size([32, 22]) torch.Size([32, 22]) torch.Size([32])
torch.Size([32, 25]) torch.Size([32, 25]) torch.Size([32, 25]) torch.Size([32])
torch.Size([32, 25]) torch.Size([32, 25]) torch.Size([32, 25]) torch.Size([32])


### Train

모델을 학습하는 코드

In [9]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of calling .cuda() unconditionally (which raises on a machine without CUDA).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2).to(device)
model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
# The optimizer and the loss are deliberately the most common choices.
# The purpose of this notebook is to show that a well-performing model can be
# built easily with BERT; if the optimizer and loss were tuned as well, it would
# be hard to explain where the good performance comes from.
learning_rate = 2e-5
adam_epsilon = 1e-8
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
# NOTE: train() in this notebook reads the loss from the model output (labels are
# passed to the model), so this criterion object is not handed to it.
loss = nn.CrossEntropyLoss()

In [11]:
# Number of full passes over the training data made by the training loop.
n_epoch = 20

In [12]:
def train(model, dataloader, optimizer):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose first
            output is the loss to minimise.
        dataloader: Iterable of
            ``(input_ids, token_type_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        float: Mean loss over the batches seen, or ``0.0`` when ``dataloader``
        yields no batches.
    """
    # Follow the model's own device rather than hard-coding .cuda(), so the same
    # loop also works for a model that lives on the CPU.
    device = next(model.parameters()).device
    # Guarantee training behaviour even if the caller switched to eval() earlier.
    model.train()

    tbar = tqdm(dataloader, desc='Training', leave=True)

    total_loss = 0.0
    n_batches = 0
    for n_batches, d in enumerate(tbar, start=1):
        optimizer.zero_grad()
        # token_type_ids is part of every batch but is not passed to the model.
        input_ids, token_type_ids, attention_mask, labels = d

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Passing labels makes the model return the loss as its first output.
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out[0]

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        tbar.set_description("Average Loss = {:.4f}".format(total_loss / n_batches))

    return total_loss / n_batches if n_batches else 0.0
        

In [13]:
# Fine-tune the model: one call to train() per epoch, n_epoch epochs in total.
for i in range(n_epoch):
    train(model, train_dataloader, optimizer)

Average Loss = 0.4773): 100%|██████████| 268/268 [00:29<00:00,  9.18it/s]
Average Loss = 0.2876): 100%|██████████| 268/268 [00:29<00:00,  9.14it/s]
Average Loss = 0.1710): 100%|██████████| 268/268 [00:30<00:00,  8.87it/s]
Average Loss = 0.1203): 100%|██████████| 268/268 [00:30<00:00,  8.90it/s]
Average Loss = 0.0922): 100%|██████████| 268/268 [00:30<00:00,  8.89it/s]
Average Loss = 0.0647): 100%|██████████| 268/268 [00:29<00:00,  9.02it/s]
Average Loss = 0.0525): 100%|██████████| 268/268 [00:29<00:00,  8.94it/s]
Average Loss = 0.0449): 100%|██████████| 268/268 [00:29<00:00,  8.99it/s]
Average Loss = 0.0411): 100%|██████████| 268/268 [00:29<00:00,  8.96it/s]
Average Loss = 0.0343): 100%|██████████| 268/268 [00:29<00:00,  8.94it/s]
Average Loss = 0.0291): 100%|██████████| 268/268 [00:29<00:00,  8.97it/s]
Average Loss = 0.0318): 100%|██████████| 268/268 [00:30<00:00,  8.92it/s]
Average Loss = 0.0206): 100%|██████████| 268/268 [00:29<00:00,  8.99it/s]
Average Loss = 0.0227): 100%|█████████

In [14]:
# Save only the model's parameters (its state_dict) to cola_model.bin.
torch.save(model.state_dict(), 'cola_model.bin')

In [15]:
# Shell escape: list the .bin checkpoint files in the working directory with their sizes.
! ls -alh *.bin

-rw-r--r-- 1 jkfirst deep-learners 418M  6월 18 05:28 cola_model.bin
-rw-r--r-- 1 jkfirst deep-learners 418M  6월 18 05:06 cola_model_no_pretrained.bin


### Inference

학습한 모델을 로딩해서 Inference하는 코드

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

In [10]:
# Load the CoLA split used for testing and build its DataLoader.
# is_train=False together with is_inference=True selects raw/out_of_domain_dev.tsv;
# shuffle=False keeps the examples in file order.
test_dataset = CoLADataset('../../data/cola_classification', tokenizer, is_train=False, is_inference=True)
test_dataloader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=32, shuffle=False)

In [11]:
# Define the model: same architecture as in training (two output labels).
# It is not moved to a GPU here, so it stays on the default device.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
# Load the trained model: read the saved parameters (mapped onto the CPU) and
# copy them into the freshly constructed model.
trained_state = torch.load('cola_model.bin', map_location='cpu')
model.load_state_dict(trained_state)
# Alternative checkpoint, kept for comparison:
#model.load_state_dict(torch.load('cola_model_no_pretrained.bin', map_location='cpu'))
# Switch to evaluation mode before running inference.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
def inference(model, dataloader):
    """Predict a class for every example in ``dataloader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            first output holds one row of class scores per example.
        dataloader: Iterable of
            ``(input_ids, token_type_ids, attention_mask, labels)`` batches.

    Returns:
        Tuple ``(labels, preds)`` of NumPy arrays: the labels taken from the
        batches and the arg-max class predicted for each example, in iteration
        order.
    """
    # Run wherever the model's parameters live instead of assuming a device.
    device = next(model.parameters()).device
    tbar = tqdm(dataloader, desc='Inference', leave=True)

    label_list = []
    pred_list = []

    # Predict in eval mode, then put the model back into whatever mode the
    # caller had it in.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for prediction; disabling them avoids building
        # an autograd graph for every batch.
        with torch.no_grad():
            for d in tbar:
                # token_type_ids is part of every batch but is not passed to the model.
                input_ids, token_type_ids, attention_mask, labels = d

                scores = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                )[0]
                pred = scores.argmax(dim=1)

                label_list.extend(labels.cpu().numpy())
                pred_list.extend(pred.cpu().numpy())
    finally:
        model.train(was_training)

    labels = np.array(label_list)
    preds = np.array(pred_list)

    return labels, preds

In [16]:
# Run the loaded model over the test DataLoader and collect labels and predictions.
labels, preds = inference(model, test_dataloader)

Inference: 100%|██████████| 17/17 [00:06<00:00,  2.45it/s]


In [17]:
# Accuracy: fraction of examples whose prediction equals the label.
(labels==preds).mean()

0.8236434108527132

In [18]:
# Confusion matrix of labels against predictions
# (scikit-learn puts the true classes on the rows and the predicted classes on the columns).
confusion_matrix(labels, preds)

array([[ 97,  65],
       [ 26, 328]])

In [19]:
# Matthews correlation coefficient between the labels and the predictions.
matthews_corrcoef(labels, preds)

0.5721810924354415