In [1]:
# Attention Mask : 실제토큰 1 / 0 패딩
# Token Type IDS(Segment IDs): 두개의 문장(A/B) 구성될 때 각 토큰이 어느 문장에 속하는지 알려주는 임베딩
# CLS Token Pooling : [CLS] + token + [SEP]

In [15]:
#  1. Bert Tokenizer : 단어를 의미있는 조각(subword)로 나눕니다. unbelievable "un" "believ" "able"
from transformers import BertTokenizer
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentences = [
    'Hello World',
    'unbelievable performance!',
    'COVID-19 pendamic'
]
for sentence in sentences:
    # 토큰화
    tokens = tokenizer.tokenize(sentence)
    print(f'원문 : {sentence}')
    print(f'토큰 : {tokens}')

    # ID 변환
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f'ID : {ids}')

    # 역변환
    decoded_string = tokenizer.decode(ids)
    print(f'역변환 : {decoded_string}/n')

원문 : Hello World
토큰 : ['hello', 'world']
ID : [7592, 2088]
역변환 : hello world/n
원문 : unbelievable performance!
토큰 : ['unbelievable', 'performance', '!']
ID : [23653, 2836, 999]
역변환 : unbelievable performance!/n
원문 : COVID-19 pendamic
토큰 : ['co', '##vid', '-', '19', 'pen', '##dam', '##ic']
ID : [2522, 17258, 1011, 2539, 7279, 17130, 2594]
역변환 : covid - 19 pendamic/n


In [18]:
# 2. Attention Mask : 실제단어 1 
from transformers import BertTokenizer
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentences = [
    'short sentence',
    'This is a much longer sentence with more words'
]
# 여러문장을 한꺼번에 토크나이징하고 가장 긴 문장길이에 맞춰 자동 패딩 수
encoded = tokenizer(
    sentences,
    padding=True,
    return_tensor='pt'
)
encoded


Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.


{'input_ids': [[101, 2460, 6251, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2023, 2003, 1037, 2172, 2936, 6251, 2007, 2062, 2616, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
# token_type_ids : 두 문장을 입력할 때 첫번째, 두번째 구문
from transformers import BertTokenizer
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentence_A = "The weather is nice"
sentence_B = "Let's go for a walk"
# 두 문장을 하나의 입력으로 인코딩
encoded = tokenizer(
    sentence_A,
    sentence_B,
    padding=True,
    return_tensors='pt'
)
print(encoded['token_type_ids'])
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])

for token, token_id, type_id in zip(tokens, encoded['input_ids'][0], encoded['token_type_ids'][0]):
    segment = "문장 A" if type_id == 0 else "문장 B"
    if token == "[SEP]":
        segment = "구분자"
    elif token == "[CLS]":
        segment = "시작"
    print(f'{token:20s} {token_id.item():6d} {type_id.item():6d} ({segment})')

tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
[CLS]                   101      0 (문장 A)
the                    1996      0 (문장 A)
weather                4633      0 (문장 A)
is                     2003      0 (문장 A)
nice                   3835      0 (문장 A)
[SEP]                   102      0 (구분자)
let                    2292      1 (문장 B)
'                      1005      1 (문장 B)
s                      1055      1 (문장 B)
go                     2175      1 (문장 B)
for                    2005      1 (문장 B)
a                      1037      1 (문장 B)
walk                   3328      1 (문장 B)
[SEP]                   102      1 (구분자)


In [None]:
# [CLS] Token Pooling : BERT 첫번째 토큰 [CLS] 문서 전체의 요약 => 분류 작업을 할때
# 이 토큰의 출력만 가져와서 분류기(classifier)에 연결
from transformers import BertTokenizer, BertModel
import torch
# 토크나이져 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
sentence = "BERT is amazing for NLP tasks!"
# 인코딩
inputs = tokenizer(sentence,return_tensors='pt')
# BERT 통과
with torch.no_grad():
    outputs = model(**inputs)
# 출력 형태 확인
last_hidden_state = outputs.last_hidden_state
print(f'입력문장 : {sentence}')
print(f'last_hidden_state 형태 : {last_hidden_state.shape}')
print(f'batch_size = 1 sequence_length = {last_hidden_state.shape[1]} hidden_size = {last_hidden_state.shape[2]}')
# [CLS] 토큰 추출
cls_embedding = last_hidden_state[:,0,:]
print(f'cls_embedding 형태 : {cls_embedding.shape}')
# 분류기 (2-class)
classifier = torch.nn.Linear(768, 2)
logits = classifier(cls_embedding)
probs = torch.softmax(logits, dim=-1)
print(f'logits : {logits}')
print(f'probs : {probs}')
print(f'predicted class : {torch.argmax(probs).item()}')

입력문장 : BERT is amazing for NLP tasks!
last_hidden_state 형태 : torch.Size([1, 10, 768])
batch_size = 1 sequence_length = 10 hidden_size = 768
cls_embedding 형태 : torch.Size([1, 768])


In [77]:
# 미세 조정 학습 Fine-turning
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
texts = [
    "This movie is fantastic!",
    "Terrible film, waste of time.",
    "Amazing plot and great acting.",
    "Boring and predictable."
]
labels = [1, 0, 1, 0]  # 1=positive, 0=negative

# 토크나이져 모델
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# 모델
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
# 데이터셋
class SimpleDataset(Dataset):
  def __init__(self, texts, labels):
    self.encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    self.labels = labels
  def __getitem__(self, idx):
    item = {key: val[idx] for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item
  def __len__(self):
    return len(self.labels)
dataset = SimpleDataset(texts,labels)
loader = DataLoader(dataset, batch_size=2)
# 학습설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
# 미세조정
model.train()
for epoch in range(20):
  total_loss = 0
  for batch in loader:
    optimizer.zero_grad()
    inputs = { k:v.to(device) for k,v in batch.items() if k != 'labels'}
    labels = batch['labels'].to(device)
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f'epoch : {epoch+1}, loss : {total_loss/len(loader)}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


epoch : 1, loss : 0.6073481440544128
epoch : 2, loss : 0.661867082118988
epoch : 3, loss : 0.6461204886436462
epoch : 4, loss : 0.5061250329017639
epoch : 5, loss : 0.3955150842666626
epoch : 6, loss : 0.4086294621229172
epoch : 7, loss : 0.35578736662864685
epoch : 8, loss : 0.3439542055130005
epoch : 9, loss : 0.3139413297176361
epoch : 10, loss : 0.33423812687397003
epoch : 11, loss : 0.28059518337249756
epoch : 12, loss : 0.23552437126636505
epoch : 13, loss : 0.2799035906791687
epoch : 14, loss : 0.2156563103199005
epoch : 15, loss : 0.2041623443365097
epoch : 16, loss : 0.19555847346782684
epoch : 17, loss : 0.22804558277130127
epoch : 18, loss : 0.22621099650859833
epoch : 19, loss : 0.1387973204255104
epoch : 20, loss : 0.1997784674167633


In [92]:
# 추론
model.eval()    #평가모드
sample_sentences = [
"I am really disappointed with the result.",
"The service was terrible and not worth the money.",
"I don't like this product at all."
]
# 토큰화
inputs = tokenizer(
    sample_sentences,
    truncation=True,
    padding=True,
    return_tensors='pt'
)
inputs
# gpu/cpu 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
inputs = {k:v.to(device) for k,v in inputs.items()}
# 추론
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits #batch, class number (1,2)
    probs = torch.softmax(logits, dim=-1)
    pred = torch.argmax(probs, dim=-1).detach().numpy()
    # probs
#     probs = torch.softmax(logits, dim=-1) # (1,2)
#     pred = torch.argmax(probs, dim=-1).item()
    print(probs, pred)  #1: positive 0: negative

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.