## BERT with HuggingFace

- Date: 2021.08.25
- Made by Cheonsol Lee

### Reference
- http://yonghee.io/bert_binary_classification_naver/
- https://colab.research.google.com/drive/1tIf0Ugdqg4qT7gcxia3tL7und64Rv1dP#scrollTo=I3vlyUJuVRo5  

In [1]:
import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup

import pandas as pd
import numpy as np
import random
import time
import datetime

In [2]:
train = pd.read_csv("./../../data/naver/ratings_train.txt", sep='\t')
test = pd.read_csv("./../../data/naver/ratings_test.txt", sep='\t')

print(train.shape)
print(test.shape)

(150000, 3)
(50000, 3)


In [2]:
tokenizer

NameError: name 'tokenizer' is not defined

## Part 1. Pre-processing

### 1-1. Training set

In [None]:
sentences = ["[CLS] " + str(document) + " [SEP]" for document in train.document]
labels = train['label'].values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

train_inputs = torch.tensor(train_inputs) # 시퀀스를 토큰 ID로 표현
train_labels = torch.tensor(train_labels) # 긍/부정
train_masks = torch.tensor(train_masks) # 패딩 마스크 (attention mask)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)    
    
BATCH_SIZE = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)    

### 1-2. Testing set

In [20]:
sentences = ["[CLS] " + str(document) + " [SEP]" for document in test.document]
labels = test['label'].values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

BATCH_SIZE = 32

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

---
## Part 2. Model Training

In [21]:
n_devices = torch.cuda.device_count()

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    for i in range(n_devices):
        print("Device",i,":", torch.cuda.get_device_name(i))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device 0 : NVIDIA GeForce RTX 3080


NVIDIA GeForce RTX 3080 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 compute_37.
If you want to use the NVIDIA GeForce RTX 3080 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



### 2-1. Pre-training

In [22]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2) #label개수에 따라 변경
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

### 2-2. Hyperparameter setting

In [23]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 2

# 총 훈련 스텝 = 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# lr 조금씩 감소시키는 스케줄러
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [24]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 시간 표시 함수
def format_time(elapsed):
    # 반올림
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

### 2-3. Run Training

In [25]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

In [68]:
# 에폭 수만큼 반복
for epoch in range(epochs):
    
    # ========================================
    #               1. Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')
    
    # 시작 시간 설정
    start_time = time.time()
    
    # 로스 초기화
    total_loss = 0
    
    # 훈련모드로 변경
    model.train()
    
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시 (step 500번마다 출력)
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - start_time)
            print('Batch {:>5,}  of  {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        
        # 배치를 GPU에 올림
        batch = tuple(b.to(device) for b in batch)
        
        # 배치에서 데이터 추출 (input, mask, label 순으로 넣었었음)
        b_input_ids, b_input_mask, b_labels = batch
        
        # forward 수행
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                       token_type_ids=None,
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs.loss # outputs[0]
        
        # 총 로스 계산
        total_loss += loss.item()
        
        # Backward 수행으로 그래디언트 계산 (Back-propagation)
        loss.backward()
        
        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0) # 예제 코드에서는 1.0이었음
        
        # 그래디언트를 이용해 가중치 파라미터를 lr만큼 업데이트
        optimizer.step()
        
        # 스케줄러로 학습률 감소
        scheduler.step()
        
        # 그래디언트 초기화
        ## (호출시 경사값을 0으로 설정. 이유 : 반복 때마다 기울기를 새로 계산하기 때문)
        model.zero_grad()
    
    # 1 에폭이 끝나면 평균 train 로스 계산 (전체 loss / 배치 수)
    avg_train_loss = total_loss / len(train_dataloader)
    
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - start_time)))
    
    # ========================================
    #               2. Validation
    # ========================================
    
    # 1 에폭이 끝나면 validation 시행
    
    print("")
    print("Running Validation...")
    
    # 시작 시간 설정
    start_time = time.time()
    
    # 평가 모드로 변경
    model.eval()
    
    # 변수 초기화
    total_valid_accuracy = 0
    nb_valid_steps = 0
    
    # valid 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        
        # 배치를 GPU에 넣음
        batch = tuple(b.to(device) for b in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함!
        with torch.no_grad():
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함 (train할 때는 loss, validation할 때는 logits)
        ## logits은 softmax를 거치기 전의 classification score를 반환합니다. shape: (batch_size, config.num_labels)
        logits = outputs.logits

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        valid_accuracy = flat_accuracy(logits, label_ids)
        total_valid_accuracy += valid_accuracy

    print("  Accuracy: {0:.2f}".format(total_valid_accuracy/len(validation_dataloader)))
    print("  Validation took: {:}".format(format_time(time.time() - start_time)))

print("")
print("Total took: {:}".format(format_time(time.time() - start_time)))
print("Training complete!")


Training...
Batch   500  of  4,219. Elapsed: 0:08:01.
Batch 1,000  of  4,219. Elapsed: 0:16:03.
Batch 1,500  of  4,219. Elapsed: 0:24:04.
Batch 2,000  of  4,219. Elapsed: 0:32:04.
Batch 2,500  of  4,219. Elapsed: 0:40:05.
Batch 3,000  of  4,219. Elapsed: 0:48:05.
Batch 3,500  of  4,219. Elapsed: 0:56:05.
Batch 4,000  of  4,219. Elapsed: 1:04:05.

  Average training loss: 0.28
  Training epcoh took: 1:07:35

Running Validation...
  Accuracy: 0.86
  Validation took: 0:02:25

Training...
Batch   500  of  4,219. Elapsed: 0:08:00.
Batch 1,000  of  4,219. Elapsed: 0:16:00.
Batch 1,500  of  4,219. Elapsed: 0:24:00.
Batch 2,000  of  4,219. Elapsed: 0:32:34.
Batch 2,500  of  4,219. Elapsed: 0:41:35.
Batch 3,000  of  4,219. Elapsed: 0:50:47.
Batch 3,500  of  4,219. Elapsed: 0:59:50.
Batch 4,000  of  4,219. Elapsed: 1:08:30.

  Average training loss: 0.26
  Training epcoh took: 1:12:04

Running Validation...
  Accuracy: 0.87
  Validation took: 0:02:28

Training complete!


### 2-4. Save the model

In [None]:
torch.save(model.state_dict(), "model.pth")

---
## Part 3. Model Testing

### 3-1. Load the model

In [26]:
model.load_state_dict(torch.load("./../../data/model.pth"))
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

### 3-2. Run Testing

In [27]:
#시작 시간 설정
start_time = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - start_time)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(b.to(device) for b in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/len(test_dataloader)))
print("Test took: {:}".format(format_time(time.time() - start_time)))

  Batch   100  of  1,563.    Elapsed: 0:01:58.
  Batch   200  of  1,563.    Elapsed: 0:02:28.
  Batch   300  of  1,563.    Elapsed: 0:02:59.
  Batch   400  of  1,563.    Elapsed: 0:03:30.
  Batch   500  of  1,563.    Elapsed: 0:04:01.
  Batch   600  of  1,563.    Elapsed: 0:04:32.
  Batch   700  of  1,563.    Elapsed: 0:05:03.
  Batch   800  of  1,563.    Elapsed: 0:05:34.
  Batch   900  of  1,563.    Elapsed: 0:06:05.
  Batch 1,000  of  1,563.    Elapsed: 0:06:36.
  Batch 1,100  of  1,563.    Elapsed: 0:07:07.
  Batch 1,200  of  1,563.    Elapsed: 0:07:38.
  Batch 1,300  of  1,563.    Elapsed: 0:08:09.
  Batch 1,400  of  1,563.    Elapsed: 0:08:40.
  Batch 1,500  of  1,563.    Elapsed: 0:09:11.

Accuracy: 0.87
Test took: 0:09:30


---
## Part 4. Test new sentences

In [28]:
# 입력 데이터 변환
def convert_input_data(sentences):

    # BERT의 토크나이저로 문장을 토큰으로 분리
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # 입력 토큰의 최대 시퀀스 길이
    MAX_LEN = 128

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [29]:
# 문장 테스트
def test_sentences(sentences):

    # 평가모드로 변경
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [30]:
logits = test_sentences(['연기는 별로지만 재미 하나는 끝내줌!'])

print(logits)
print(np.argmax(logits))

[[-1.2956648  1.3821511]]
1


In [31]:
logits = test_sentences(['주연배우가 아깝다. 총체적 난국...'])

print(logits)
print(np.argmax(logits))

[[ 2.9581914 -3.235781 ]]
0
