In [1]:
!pip install transformers



You should consider upgrading via the 'C:\Users\suljeewoo\anaconda3\envs\zeze\python.exe -m pip install --upgrade pip' command.




## Huggingface Tutorial

* 설명 : https://github.com/huggingface/transformers/blob/master/README_ko.md
* 튜토리얼 : https://www.ohsuz.dev/22f4e8e7-64a3-4789-9dd2-171913883733

In [3]:
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

# Checkpoint identifier; search the model site for the name of the model you want.
MODEL_NAME = 'bert-base-multilingual-cased'

# Both objects are loaded from the same checkpoint name so they stay compatible.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Downloading: 100%|██████████| 625/625 [00:00<00:00, 104kB/s]
Downloading: 100%|██████████| 681M/681M [00:59<00:00, 12.1MB/s] 
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|

### 1. Tokenizer
* input_ids : 각 토큰을 vocabulary의 정수 id로 변환한 값 (모델의 입력)
* token_type_ids : 문장(segment)을 구분짓는 id (현재 한 문장밖에 입력으로 주지 않았으므로 다 0)
* attention_mask : 실제 토큰(1)과 padding(0)을 구분짓음 (현재 padding이 없으므로 다 1)

In [5]:
text = "이순신은 조선 중기의 무신이다."

# return_tensors="pt" asks the tokenizer to return PyTorch tensors.
tokenized_input_text = tokenizer(text, return_tensors="pt")
# Show every field the tokenizer produced together with its tensor.
for field_name, field_tensor in tokenized_input_text.items():
    print(f"key {field_name}:\n\t value {field_tensor}")

key input_ids:
	 value tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    119,    102]])
key token_type_ids:
	 value tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
key attention_mask:
	 value tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [7]:
def show_segments(label, encoding):
    """Print the tokens, the token ids and the segment ids of one tokenizer result.

    label    -- prefix used in the printed lines (e.g. "Single" or "Multi").
    encoding -- tokenizer output providing 'input_ids' and 'token_type_ids'.
    """
    ids = encoding['input_ids']
    print(f"{label} segment token (str): {tokenizer.convert_ids_to_tokens(ids)}")
    print(f"{label} segment token (int): {ids}")
    print(f"{label} segment type       : {encoding['token_type_ids']}")


# A single sentence passed on its own.
single_seg_input = tokenizer("이순신은 조선 중기의 무신이다")
show_segments("Single", single_seg_input)
print()

# Two sentences passed together as a pair.
multi_seg_input = tokenizer("이순신은 조선 중기의 무신이다", "그는 임진왜란을 승리로 이끌었다")
show_segments("Multi", multi_seg_input)

Single segment token (str): ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '[SEP]']
Single segment token (int): [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 102]
Single segment type       : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Multi segment token (str): ['[CLS]', '이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '[SEP]', '그는', '임', '##진', '##왜', '##란', '##을', '승', '##리로', '이', '##끌', '##었다', '[SEP]']
Multi segment token (int): [101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 102, 17889, 9644, 18623, 119164, 49919, 10622, 9484, 100434, 9638, 118705, 17706, 102]
Multi segment type       : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
# Three views of the same sentence: sub-word tokens, vocabulary ids, and the
# text obtained by decoding those ids again.
tokenize_text = tokenizer.tokenize(text)
input_ids = tokenizer.encode(text)
decoded_ids = tokenizer.decode(input_ids)

# One view per line, in the order they were produced.
print(tokenize_text, input_ids, decoded_ids, sep="\n")

['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.']
[101, 9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 102]
[CLS] 이순신은 조선 중기의 무신이다. [SEP]


In [9]:
# Leave out the special tokens ([SEP], [CLS]) when encoding.
tokenize_text = tokenizer.tokenize(text)
print(tokenize_text)
input_ids = tokenizer.encode(text, add_special_tokens=False)
print(input_ids)
# `add_special_tokens` is an encode-side option and has no effect on decode()
# (decode's own switch is `skip_special_tokens`). The ids above already contain
# no special tokens, so a plain decode is all that is needed.
decoded_ids = tokenizer.decode(input_ids)
print(decoded_ids)

['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.']
[9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119]
이순신은 조선 중기의 무신이다.


In [12]:
# Keep only the first 5 tokens and cut off everything after them.
truncate_to_five = dict(add_special_tokens=False, max_length=5, truncation=True)
tokenized_text = tokenizer.tokenize(text, **truncate_to_five)
print(tokenized_text)

['이', '##순', '##신', '##은', '조선']


In [15]:
# Same truncation as for tokenize(), but returning vocabulary ids: at most
# 5 ids are kept and no special tokens are added.
input_ids = tokenizer.encode(text, truncation=True, max_length=5, add_special_tokens=False)
print(input_ids)

[9638, 119064, 25387, 10892, 59906]


In [16]:
# Decode the ids currently held in `input_ids` (set by the preceding encode cell)
# back into text; as the last expression, the result is shown as the cell output.
tokenizer.decode(input_ids)

'이순신은 조선'

In [17]:
# The padding token of this tokenizer and the vocabulary id it maps to.
print(f"pad token : {tokenizer.pad_token}")
print(f"pad token id : {tokenizer.pad_token_id}")

pad token : [PAD]
pad token id : 0


In [18]:
# Options shared by both calls: no special tokens, and pad up to a length of 20.
pad_to_twenty = {"add_special_tokens": False, "max_length": 20, "padding": "max_length"}

tokenized_text = tokenizer.tokenize(text, **pad_to_twenty)
print(tokenized_text)

input_ids = tokenizer.encode(text, **pad_to_twenty)
print(input_ids)

['이', '##순', '##신', '##은', '조선', '중', '##기의', '무', '##신', '##이다', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[9638, 119064, 25387, 10892, 59906, 9694, 46874, 9294, 25387, 11925, 119, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [19]:
# Decode the padded ids from the preceding cell; the recorded output shows the
# padding ids coming back as literal [PAD] tokens in the decoded text.
tokenizer.decode(input_ids)

'이순신은 조선 중기의 무신이다. [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [20]:
# A sentence with unusual spellings; in the recorded output several of its
# words come back as [UNK].
text = "깟뻬뜨랑 리뿔이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼섀료다혀뚜여"

no_specials = {"add_special_tokens": False}
tokenized_text = tokenizer.tokenize(text, **no_specials)
input_ids = tokenizer.encode(text, **no_specials)
decoded_ids = tokenizer.decode(input_ids)

# Print tokens, ids and decoded text, one per line.
for stage in (tokenized_text, input_ids, decoded_ids):
    print(stage)

['[UNK]', '리', '##뿔', '##이', '뜨', '##럽', '##거', '므', '##리', '##커', '##럭', '##이', '[UNK]', '냐', '##왜', '##쇼', '[UNK]', '[UNK]']
[100, 9238, 119021, 10739, 9151, 118867, 41521, 9308, 12692, 106826, 118864, 10739, 100, 9002, 119164, 119060, 100, 100]
[UNK] 리뿔이 뜨럽거 므리커럭이 [UNK] 냐왜쇼 [UNK] [UNK]


### 2. BERT 모델 테스트

* [MASK] token을 예측해보자

In [21]:
# Build the masked sentence from the tokenizer's own mask token rather than a
# hard-coded "[MASK]", so the cell keeps working if MODEL_NAME is switched to a
# checkpoint whose mask token is spelled differently.
text = f"이순신은 {tokenizer.mask_token} 중기의 무신이다."
tokenized_text = tokenizer.tokenize(text)

print(tokenized_text)

['이', '##순', '##신', '##은', '[MASK]', '중', '##기의', '무', '##신', '##이다', '.']


In [22]:
from transformers import pipeline

# Returns the candidates that fill in the mask token.
nlp_fill = pipeline('fill-mask', model=MODEL_NAME)
# Use the pipeline tokenizer's own mask token instead of a hard-coded "[MASK]":
# the fill-mask pipeline needs the model's mask token in its input, and its
# spelling depends on the checkpoint chosen via MODEL_NAME.
nlp_fill(f"이순신은 {nlp_fill.tokenizer.mask_token} 중기의 무신이다.")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.8747127652168274,
  'token': 59906,
  'token_str': '조선',
  'sequence': '이순신은 조선 중기의 무신이다.'},
 {'score': 0.06436415016651154,
  'token': 9751,
  'token_str': '청',
  'sequence': '이순신은 청 중기의 무신이다.'},
 {'score': 0.010954867117106915,
  'token': 9665,
  'token_str': '전',
  'sequence': '이순신은 전 중기의 무신이다.'},
 {'score': 0.004647163674235344,
  'token': 22200,
  'token_str': '##종',
  'sequence': '이순신은종 중기의 무신이다.'},
 {'score': 0.0036106579937040806,
  'token': 12310,
  'token_str': '##기',
  'sequence': '이순신은기 중기의 무신이다.'}]

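* last hidden state : 아래 예시처럼 두 문장을 함께 입력하면 총 23개의 토큰으로 구성됨 -> 각 토큰마다 768차원의 임베딩이 출력됨 (shape: [1, 23, 768])
* pooler output : [CLS] 토큰의 마지막 hidden state를 pooler(Linear + Tanh)에 통과시킨 문장 단위 벡터값 (shape: [1, 768])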

In [24]:
# The tokenizer's output format is matched to the model's input format, so the
# encoding can be unpacked straight into the model call.
tokens_pt = tokenizer("이순신은 조선 중기의 무신이다.", "그는 임진왜란에서 승리함", return_tensors="pt")
for field_name, field_tensor in tokens_pt.items():
    print(f"{field_name}:\n\t{field_tensor}")

outputs = model(**tokens_pt)
last_hidden_state, pooler_output = outputs.last_hidden_state, outputs.pooler_output

# Report the shapes of the per-token output and of the pooled output.
print(f"\nToken wise output: {last_hidden_state.shape}, Pooled output: {pooler_output.shape}")

input_ids:
	tensor([[   101,   9638, 119064,  25387,  10892,  59906,   9694,  46874,   9294,
          25387,  11925,    119,    102,  17889,   9644,  18623, 119164,  49919,
          11489,   9484,  12692,  48533,    102]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

Token wise output: torch.Size([1, 23, 768]), Pooled output: torch.Size([1, 768])


In [26]:
# Four sentences whose pooled embeddings are compared further down.
sentences = (
    "오늘 하루 어떻게 보냈나요?",
    "오늘은 어떤 하루를 보내셨나요?",
    "이순신은 조선 중기의 무신이다.",
    "깟뻬뜨랑 리뿔이 뜨럽거 므리커럭이 케쇽 냐왜쇼 우뤼갸 쳥쇼섀료다혀뚜여",
)
# Encode each sentence separately as PyTorch tensors, in the order listed above.
sent1, sent2, sent3, sent4 = (tokenizer(sentence, return_tensors="pt") for sentence in sentences)

In [27]:
def pooled_embedding(encoded):
    """Return the model's `pooler_output` for one tokenized input.

    encoded -- tokenizer output (created with return_tensors="pt") that is
               unpacked into the model call.

    The forward pass runs under torch.no_grad(): these embeddings are only
    used for inference, so no autograd graph needs to be recorded.
    """
    with torch.no_grad():
        return model(**encoded).pooler_output


# One pooled vector per sentence, replacing four copy-pasted forward passes.
sent_1_pooler_output = pooled_embedding(sent1)
sent_2_pooler_output = pooled_embedding(sent2)
sent_3_pooler_output = pooled_embedding(sent3)
sent_4_pooler_output = pooled_embedding(sent4)

In [28]:
from torch import nn

# Cosine similarity along dim=1 of the pooled outputs.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

# Sentence pairs to compare, in the order their similarities are printed.
embedding_pairs = [
    (sent_1_pooler_output, sent_2_pooler_output),
    (sent_2_pooler_output, sent_3_pooler_output),
    (sent_3_pooler_output, sent_4_pooler_output),
    (sent_1_pooler_output, sent_4_pooler_output),
]
for left, right in embedding_pairs:
    print(cos(left, right))

tensor([0.9757], grad_fn=<DivBackward0>)
tensor([0.6075], grad_fn=<DivBackward0>)
tensor([0.5997], grad_fn=<DivBackward0>)
tensor([0.9258], grad_fn=<DivBackward0>)
