In [None]:
# BERT's subword tokenizer: WordPiece
# The subword tokenizer used by BERT adds frequently occurring words to the vocabulary as whole
# words, while rarely occurring words are split into smaller subword units that are added to the
# vocabulary instead -> tokenization is then performed against the resulting vocabulary.

In [None]:
# Single Text Classification
# When classifying a single document, a [CLS] token is added at the start of the document.
# An FC (fully connected) layer is added on top of the output at the [CLS] position to predict the class.

# Masked Language Model

In [None]:
import pandas as pd
from transformers import BertTokenizer
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint
# (its files are fetched on first use, as the download progress output of this cell shows).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Split a sentence into WordPiece tokens. In the printed result the text is lowercased and
# "embeddings" is broken into 'em', '##bed', '##ding', '##s': a piece prefixed with '##'
# continues the preceding piece instead of starting a new word.
result = tokenizer.tokenize('Here is the sentence I want embeddings for.')
print(result)

['here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.']


In [None]:
# Look up the integer id that the tokenizer vocabulary assigns to the whole-word token 'here'.
print(tokenizer.vocab['here'])

2182


In [None]:
# Subword pieces (note the '##' prefix) have their own entries and ids in the vocabulary as well.
print(tokenizer.vocab['##ding'])

4667


In [None]:
from transformers import BertForMaskedLM
from transformers import AutoTokenizer

In [None]:
# Load the masked-language-model head and its tokenizer from a single checkpoint name.
# This rebinds `tokenizer`, which until now held the tokenizer loaded for "bert-base-uncased".
# NOTE(review): the saved output of this cell mentions TFBertForMaskedLM, which is not the class
# loaded here; the output looks stale and should be refreshed by re-running the cell.
mlm_checkpoint = "bert-large-uncased"
model = BertForMaskedLM.from_pretrained(mlm_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(mlm_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Encode a sentence containing one [MASK] placeholder; return_tensors='pt' asks for PyTorch tensors.
inputs = tokenizer('Soccer is a really fun [MASK].', return_tensors='pt')

In [None]:
# Integer encoding: the vocabulary id of each token in the encoded input.
print(inputs['input_ids'])

tensor([[ 101, 4715, 2003, 1037, 2428, 4569,  103, 1012,  102]])


In [None]:
# token_type_ids (segment ids): mark which input sentence each token position belongs to.
# Only one sentence was encoded here, so a 0 is printed for every position in the sequence.

print(inputs['token_type_ids'])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [None]:
# attention_mask: distinguishes real tokens from padding tokens.
# 1 = a real token, so it is not masked out; 0 = a padding token, so it is masked out.

print(inputs['attention_mask'])

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
from transformers import FillMaskPipeline
# Wrap the masked-LM model and its tokenizer in a fill-mask pipeline.
# NOTE(review): the name `pip` is easy to confuse with the package installer; it is kept
# because the following cells call the pipeline through this name.
pip = FillMaskPipeline(model=model, tokenizer=tokenizer)

In [None]:
# Show the top 5 candidate words that could fill the [MASK] position, with their scores.

pip('Soccer is a really fun [MASK].')

[{'score': 0.7621120810508728,
  'token': 4368,
  'token_str': 'sport',
  'sequence': 'soccer is a really fun sport.'},
 {'score': 0.20341980457305908,
  'token': 2208,
  'token_str': 'game',
  'sequence': 'soccer is a really fun game.'},
 {'score': 0.012208483181893826,
  'token': 2518,
  'token_str': 'thing',
  'sequence': 'soccer is a really fun thing.'},
 {'score': 0.0018630196573212743,
  'token': 4023,
  'token_str': 'activity',
  'sequence': 'soccer is a really fun activity.'},
 {'score': 0.0013354852562770247,
  'token': 2492,
  'token_str': 'field',
  'sequence': 'soccer is a really fun field.'}]

In [None]:
# Same fill-mask query on another sentence, this time with [MASK] in the middle of the sentence.
pip('I went to [MASK] this morning.')

[{'score': 0.3573068678379059,
  'token': 2147,
  'token_str': 'work',
  'sequence': 'i went to work this morning.'},
 {'score': 0.2330448031425476,
  'token': 2793,
  'token_str': 'bed',
  'sequence': 'i went to bed this morning.'},
 {'score': 0.1284506469964981,
  'token': 2082,
  'token_str': 'school',
  'sequence': 'i went to school this morning.'},
 {'score': 0.06230588257312775,
  'token': 3637,
  'token_str': 'sleep',
  'sequence': 'i went to sleep this morning.'},
 {'score': 0.046952612698078156,
  'token': 2465,
  'token_str': 'class',
  'sequence': 'i went to class this morning.'}]

# Next Sentence Prediction

In [None]:
from transformers import BertForNextSentencePrediction
from transformers import AutoTokenizer

In [None]:
# Switch to the next-sentence-prediction head. Both `model` and `tokenizer` are rebound here,
# replacing the masked-LM model and tokenizer used by the earlier cells.
nsp_checkpoint = 'bert-base-uncased'
model = BertForNextSentencePrediction.from_pretrained(nsp_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(nsp_checkpoint)

In [None]:
# Sentence pair for next-sentence prediction: the second text continues the first.
# The literals are wrapped with implicit string concatenation purely for readability.
prompt = (
    "In Italy, pizza served in formal settings, "
    "such as at a restaurant, is presented unsliced."
)
next_sentence = (
    "pizza is eaten with the use of a knife and fork. "
    "In casual settings, however, it is cut into wedges "
    "to be eaten while held in the hand."
)

In [None]:
# Encode the two texts together as one sentence-pair input, returned as PyTorch tensors.
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

In [None]:
# Token ids of the encoded pair; id 101 ([CLS]) opens the sequence and id 102 ([SEP]) closes each text.
print(encoding['input_ids'] )

tensor([[  101,  1999,  3304,  1010, 10733,  2366,  1999,  5337, 10906,  1010,
          2107,  2004,  2012,  1037,  4825,  1010,  2003,  3591,  4895, 14540,
          6610,  2094,  1012,   102, 10733,  2003,  8828,  2007,  1996,  2224,
          1997,  1037,  5442,  1998,  9292,  1012,  1999, 10017, 10906,  1010,
          2174,  1010,  2009,  2003,  3013,  2046, 17632,  2015,  2000,  2022,
          8828,  2096,  2218,  1999,  1996,  2192,  1012,   102]])


In [None]:
# Show the classification and separator special tokens next to their vocabulary ids.
print(f"{tokenizer.cls_token} : {tokenizer.cls_token_id}")
print(f"{tokenizer.sep_token} : {tokenizer.sep_token_id}")

[CLS] : 101
[SEP] : 102


In [None]:
# Decode the ids back to text to see where [CLS] and [SEP] were inserted around the two texts.
print(tokenizer.decode(encoding['input_ids'][0]))

[CLS] in italy, pizza served in formal settings, such as at a restaurant, is presented unsliced. [SEP] pizza is eaten with the use of a knife and fork. in casual settings, however, it is cut into wedges to be eaten while held in the hand. [SEP]


In [None]:
# When two sentences are given as input, a [CLS] token is added at the very front and a [SEP]
# token is added after the first sentence and again after the second sentence.
# token_type_ids is 0 for the positions of the first segment and 1 for those of the second.

print(encoding['token_type_ids'])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
# Pass the model's output logits through softmax over the last (class) dimension to get the
# probability of each next-sentence class.

import torch
import torch.nn.functional as F

# Inference only: no_grad() skips building the autograd graph, so no gradient bookkeeping is
# attached to the outputs (the original run printed grad_fn=<SoftmaxBackward0>).
with torch.no_grad():
    logits = model(**encoding).logits
probs = F.softmax(logits, dim=-1)
print(probs)

tensor([[1.0000e+00, 2.8382e-06]], grad_fn=<SoftmaxBackward0>)


In [None]:
# Return the index of the class with the larger probability - 0 is printed because the second
# sentence continues the first.

# dim=-1 takes the argmax across the class dimension of each row; without it torch.argmax
# indexes into the flattened tensor, which only equals the class index for a single-row batch.
max_prob = torch.argmax(probs, dim=-1)
print("Index with max probability:", max_prob.item())

Index with max probability: 0


In [None]:
# Second sentence pair: same prompt as before, followed by an unrelated sentence.
# The prompt literal is wrapped with implicit string concatenation purely for readability.
prompt = (
    "In Italy, pizza served in formal settings, "
    "such as at a restaurant, is presented unsliced."
)
next_sentence = "The sky is blue due to the shorter wavelength of blue light."

In [None]:
# Re-encode with the new second sentence; this overwrites the previous `encoding`.
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

In [None]:
# Class probabilities for the new sentence pair: softmax over the last (class) dimension of the
# model's output logits.
# Inference only: no_grad() skips building the autograd graph, so no gradient bookkeeping is
# attached to the outputs (the original run printed grad_fn=<SoftmaxBackward0>).
with torch.no_grad():
    logits = model(**encoding).logits
probs = F.softmax(logits, dim=-1)
print(probs)

tensor([[1.2606e-04, 9.9987e-01]], grad_fn=<SoftmaxBackward0>)


In [None]:
# The two sentences do not follow one another, so 1 is printed.

# dim=-1 takes the argmax across the class dimension of each row; without it torch.argmax
# indexes into the flattened tensor, which only equals the class index for a single-row batch.
max_prob = torch.argmax(probs, dim=-1)
print("Index with max probability:", max_prob.item())

Index with max probability: 1
