In [8]:
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')
print(bert.config.to_dict()['hidden_size'])
 # bert 모델의 output은 token 별로 지정된 hidden layer size의 vector로 나온다.
 # FC layer 1개를 더 추가하기위해 hidden layer size를 확인한다.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768


In [9]:
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

from transformers import BertTokenizer, BertModel

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

sentence = "My dog is cute. He likes playing. I bought a  pet food for him"
sentence2 = '나는 책상 위에 사과를 먹었다. 알고 보니 그 사과는 Jason 것이었다. 그래서 Jason에게 사과를 했다'
print(tokenizer.tokenize(sentence))
print(tokenizer.tokenize(sentence2))

['my', 'dog', 'is', 'cute', '.', 'he', 'likes', 'playing', '.', 'i', 'bought', 'a', 'pet', 'food', 'for', 'him']
['ᄂ', '##ᅡ', '##ᄂ', '##ᅳ', '##ᆫ', 'ᄎ', '##ᅢ', '##ᆨ', '##ᄉ', '##ᅡ', '##ᆼ', '[UNK]', 'ᄉ', '##ᅡ', '##ᄀ', '##ᅪ', '##ᄅ', '##ᅳ', '##ᆯ', '[UNK]', '.', 'ᄋ', '##ᅡ', '##ᆯ', '##ᄀ', '##ᅩ', 'ᄇ', '##ᅩ', '##ᄂ', '##ᅵ', 'ᄀ', '##ᅳ', 'ᄉ', '##ᅡ', '##ᄀ', '##ᅪ', '##ᄂ', '##ᅳ', '##ᆫ', 'jason', '[UNK]', '.', 'ᄀ', '##ᅳ', '##ᄅ', '##ᅢ', '##ᄉ', '##ᅥ', 'jason', '##ᄋ', '##ᅦ', '##ᄀ', '##ᅦ', 'ᄉ', '##ᅡ', '##ᄀ', '##ᅪ', '##ᄅ', '##ᅳ', '##ᆯ', '[UNK]']


In [11]:
len(tokenizer.vocab)

30522

In [12]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

def new_tokenizer(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    return tokens
# IMDb 데이터를 bert가 학습됐던 input과 동일하게 맞춰주어야 한다.
# Bert의 input 맨 앞에는 [CLS] 맨 뒤에는 [SEP] token을 추가하기 때문에
# IMDBb Data도 2 글자를 제거하고 위의 두 token을 넣얼 수 있도록 전처리 한다.

512


In [13]:

def PreProcessingText(input_sentence):
    input_sentence = input_sentence.lower() # 소문자화
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence) # "<br />" 처리
    input_sentence = re.sub('[!"$%&\()*+,-./:;<=>?@[\\]^_`{|}~]', repl= ' ', string = input_sentence) # 특수문자 처리 ("'" 제외)
    input_sentence = re.sub('\s+', repl= ' ', string = input_sentence) # 연속된 띄어쓰기 처리
    if input_sentence:
        return input_sentence

def PreProc(list_sentence):
    return [tokenizer.convert_tokens_to_ids(PreProcessingText(x)) for x in list_sentence]
# tokenizer.convert_tokens_to_ids
    # 기본적인 전처리와 token을 index로 바꿔주는 과정
    # bert 가 이미 가지고 있는 vocab을 사용해야 하기 때문에 전처리 과정에서 아예 벡터로 변환까지 해줌

In [14]:

TEXT = data.Field(batch_first = True,
                  use_vocab = False, # tokenizer.convert_tokens_to_ids 을 이용하기 때문에 false
                  tokenize = new_tokenizer, # 위에서 정의한 new_tokenizer 함수, input을 510 길이로
                  preprocessing = PreProc, # 위에서 정의한 함수 사용
                  init_token = tokenizer.cls_token_id, # 시작 토큰 지정 [CLS]
                  eos_token = tokenizer.sep_token_id, # 끝 토큰 지정 [SEP]
                  pad_token = tokenizer.pad_token_id, # padding 토큰 지정 [PAD]
                  unk_token = tokenizer.unk_token_id) # unkown 토큰 지정 [UNK]

LABEL = data.LabelField(dtype = torch.float)

In [15]:
# train, test set 분리
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:10<00:00, 8.39MB/s]


In [16]:
LABEL.build_vocab(train_data)

In [17]:
# train, valid set 분리
train_data, valid_data = train_data.split(random_state = random.seed(0), split_ratio=0.8)

In [19]:
print(f'Train Data Length : {len(train_data.examples)}')
print(f'Valid Data Length : {len(valid_data.examples)}')
print(f'Test Data Length : {len(test_data.examples)}')

Train Data Length : 20000
Valid Data Length : 5000
Test Data Length : 25000


In [20]:
print(vars(train_data.examples[2])['text'])
print('---- Data Sample ----')
print('Input : ')
print(tokenizer.convert_ids_to_tokens(vars(train_data.examples[2])['text']))

[2023, 2003, 1037, 10973, 2125, 1997, 1996, 2214, 22478, 18458, 100, 2019, 10251, 7155, 2003, 2730, 1999, 1037, 11576, 4509, 4926, 1998, 2010, 2269, 100, 1037, 3264, 4167, 9431, 13169, 1996, 4167, 100, 2009, 2003, 2059, 2404, 2046, 1037, 20478, 2303, 1997, 2010, 2219, 2640, 100, 2010, 2567, 100, 2019, 5992, 11067, 100, 3957, 1996, 1005, 8902, 15094, 2271, 1005, 27961, 2000, 4651, 4301, 2046, 4367, 100, 1996, 2878, 2622, 3632, 2919, 100, 2043, 1996, 4325, 3632, 2022, 22573, 8024, 100, 100, 7987, 100, 100, 100, 7987, 100, 100, 2569, 3896, 2024, 29341, 100, 1996, 5896, 2003, 11158, 100, 1998, 2009, 2003, 14742, 2008, 2023, 2003, 2025, 12459, 1037, 2978, 100, 2092, 100, 2200, 2235, 4268, 2097, 2228, 2023, 2003, 2204, 100, 2065, 2017, 2024, 2898, 8300, 2012, 2093, 1999, 1996, 2851, 1998, 2023, 3310, 2006, 100, 100, 100, 100, 2305, 100, 2305, 100, 100, 7987, 100, 100, 100, 7987, 100, 100, 1996, 2599, 3494, 2024, 2209, 2011, 8064, 27823, 100, 2198, 3347, 8490, 15202, 1998, 5811, 3235, 100, 39

In [21]:
model_config = {}

In [22]:
model_config['batch_size'] = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=model_config['batch_size'],
    device=device)

In [23]:
sample_for_check = next(iter(train_iterator))
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 8]
	[.text]:[torch.LongTensor of size 8x512]
	[.label]:[torch.FloatTensor of size 8]
tensor([[  101, 22190,  4402,  ...,     0,     0,     0],
        [  101,  2023,  3185,  ...,     0,     0,     0],
        [  101,  1037,  2879,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2001,  ...,     0,     0,     0],
        [  101, 11648,  5297,  ...,     0,     0,     0],
        [  101,  2006,  4465,  ...,  2106,  1996,   102]])
tensor([1., 1., 1., 0., 0., 0., 0., 1.])


In [25]:
bert = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

In [26]:
model_config['emb_dim'] = bert.config.to_dict()['hidden_size']
print(model_config['emb_dim'])

768


In [27]:
class SentenceClassification(nn.Module):
    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()
        self.bert = bert
        self.fc = nn.Linear(model_config['emb_dim'],
                            model_config['output_dim'])

    def forward(self, x):
        pooled_cls_output = self.bert(x)[1] # 그냥 [CLS]가 아닌 [CLS] Pooled output을 이용하기 위해
        return self.fc(pooled_cls_output)

In [33]:
def binary_accuracy(preds, y):
    # rounded_preds = torch.argmax(preds, axis=1)
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [29]:

def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_params):

    epoch_loss = 0
    epoch_acc = 0

    model.train()
    batch_size = model_params['batch_size']

    for idx, batch in enumerate(iterator):

        # Initializing
        optimizer.zero_grad()

        # Forward
        predictions = model(batch.text).squeeze()
        loss = loss_fn(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        sys.stdout.write(
                    "\r" + f"[Train] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}"\
                    )

        # Backward
        loss.backward()
        optimizer.step()

        # Update Epoch Performance
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss/len(iterator) , epoch_acc/len(iterator)

In [30]:

def evaluate(model, iterator, loss_fn, idx_epoch, **model_params):

    epoch_loss = 0
    epoch_acc = 0

    batch_size = model_params['batch_size']

    # evaluation mode
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(iterator):
            predictions = model(batch.text).squeeze()
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            sys.stdout.write(
                    "\r" + f"[Eval] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}"\
                    )

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [32]:
model_config.update(dict(output_dim = 1))
model = SentenceClassification(**model_config)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5) # fine-tunning 시에는 learning rate를 낮춰야 한다. 이미 잘 학습된 bert의 parameter가 변경되면 안되니까
loss_fn = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

In [34]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)
# Bert 부분을 freeze 하고 fc 만 학습 시킬 경우 성능이 좋지 않다.
# 따라서 bert parameter도 학습 시켜야 하기 때문에 parameter 수가 많다.

109483009

In [35]:
# Train

N_EPOCH = 2

best_valid_loss = float('inf')
model_name = "BERT"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    print('')
    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn, epoch, **model_config)
    print('')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')
    # print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Model is saved at {epoch}-epoch')





---------------------------------
Model name : BERT
---------------------------------

KeyboardInterrupt: 

In [None]:
# Test

# Test set
# model.load_state_dict(torch.load(f'./{model_name}.pt'))
epoch = 0
test_loss, test_acc = evaluate(model, test_iterator, loss_fn, epoch, **model_config)
print('')
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')
