In [4]:
!pip install transformers==4.5.0 fugashi==1.1.0 ipadic==1.0.0

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 3.1 MB/s 
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486 kB)
[K     |████████████████████████████████| 486 kB 23.8 MB/s 
[?25hCollecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 6.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 15.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 27.7 MB/s 
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.w

In [14]:
import torch
from transformers import BertJapaneseTokenizer, BertModel

In [15]:
model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

In [16]:
tokenizer.tokenize('明日は自然言語処理の勉強をしよう。')

['明日', 'は', '自然', '言語', '処理', 'の', '勉強', 'を', 'しよ', 'う', '。']

In [17]:
tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[UNK]', 1),
             ('[CLS]', 2),
             ('[SEP]', 3),
             ('[MASK]', 4),
             ('の', 5),
             ('、', 6),
             ('に', 7),
             ('。', 8),
             ('は', 9),
             ('た', 10),
             ('を', 11),
             ('で', 12),
             ('と', 13),
             ('が', 14),
             ('し', 15),
             ('て', 16),
             ('1', 17),
             ('な', 18),
             ('年', 19),
             ('れ', 20),
             ('い', 21),
             ('あ', 22),
             ('(', 23),
             (')', 24),
             ('2', 25),
             ('さ', 26),
             ('こ', 27),
             ('も', 28),
             ('か', 29),
             ('##する', 30),
             ('ある', 31),
             ('日', 32),
             ('いる', 33),
             ('する', 34),
             ('・', 35),
             ('「', 36),
             ('月', 37),
             ('」', 38),
             ('19', 39),
             ('から', 40

In [18]:
input_text = '京都大学は京都の外れにあります'
tokenizer.tokenize(input_text)

['京都大', '学', 'は', '京都', 'の', '外れ', 'に', 'あり', 'ます']

In [19]:
input_ids = tokenizer.encode(input_text)
input_ids

[2, 9712, 112, 9, 1316, 5, 8786, 7, 130, 2610, 3]

In [20]:
ids = tokenizer.encode(input_text)
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', '京都大', '学', 'は', '京都', 'の', '外れ', 'に', 'あり', 'ます', '[SEP]']

In [21]:
encoding = tokenizer(
    input_text, max_length=12, padding='max_length', truncation=True
)
print('# Encoding:')
print(encoding)

print('## input_ids:')
print(encoding['input_ids'])

print('## token_type_ids:')
print(encoding['token_type_ids'])

print('## attention_mask:')
print(encoding['attention_mask'])

tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
print('# tokens:')
print(tokens)

# Encoding:
{'input_ids': [2, 9712, 112, 9, 1316, 5, 8786, 7, 130, 2610, 3, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]}
## input_ids:
[2, 9712, 112, 9, 1316, 5, 8786, 7, 130, 2610, 3, 0]
## token_type_ids:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
## attention_mask:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
# tokens:
['[CLS]', '京都大', '学', 'は', '京都', 'の', '外れ', 'に', 'あり', 'ます', '[SEP]', '[PAD]']


In [22]:
encoding = tokenizer(
    input_text, max_length=6, padding='max_length', truncation=True
)
print('# Encoding:')
print(encoding)

print('## input_ids:')
print(encoding['input_ids'])

print('## token_type_ids:')
print(encoding['token_type_ids'])

print('## attention_mask:')
print(encoding['attention_mask'])

tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
print('# tokens:')
print(tokens)

# Encoding:
{'input_ids': [2, 9712, 112, 9, 1316, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}
## input_ids:
[2, 9712, 112, 9, 1316, 3]
## token_type_ids:
[0, 0, 0, 0, 0, 0]
## attention_mask:
[1, 1, 1, 1, 1, 1]
# tokens:
['[CLS]', '京都大', '学', 'は', '京都', '[SEP]']


In [23]:
input_list = [
    input_text,
    'もっと京都駅近くに作ってよ〜',
]
encoding = tokenizer(
    input_list, max_length=12, padding='max_length', truncation=True
)
print('# Encoding:')
print(encoding)

print('## input_ids:')
for ids in encoding['input_ids']:
    print(ids)

print('## token_type_ids:')
print(encoding['token_type_ids'])

print('## attention_mask:')
print(encoding['attention_mask'])

print('# tokens:')
for ids in encoding['input_ids']:
    tokens = tokenizer.convert_ids_to_tokens(ids)
    print(tokens)

# Encoding:
{'input_ids': [[2, 9712, 112, 9, 1316, 5, 8786, 7, 130, 2610, 3, 0], [2, 8065, 1316, 235, 1919, 7, 3379, 16, 54, 1143, 3, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]}
## input_ids:
[2, 9712, 112, 9, 1316, 5, 8786, 7, 130, 2610, 3, 0]
[2, 8065, 1316, 235, 1919, 7, 3379, 16, 54, 1143, 3, 0]
## token_type_ids:
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
## attention_mask:
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]
# tokens:
['[CLS]', '京都大', '学', 'は', '京都', 'の', '外れ', 'に', 'あり', 'ます', '[SEP]', '[PAD]']
['[CLS]', 'もっと', '京都', '駅', '近く', 'に', '作っ', 'て', 'よ', '〜', '[SEP]', '[PAD]']


In [24]:
tokenizer(
    input_list,
    max_length=10,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

{'input_ids': tensor([[   2, 9712,  112,    9, 1316,    5, 8786,    7,  130,    3],
        [   2, 8065, 1316,  235, 1919,    7, 3379,   16,   54,    3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## BERT

In [25]:
import torch
from transformers import BertJapaneseTokenizer, BertModel

In [26]:
# LOAD JAPANESE MODEL
model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
bert = BertModel.from_pretrained(model_name)

# set to GPU
bert = bert.cuda() 

In [27]:
print(bert.config)

BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "transformers_version": "4.5.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}



In [28]:
# can ?
bert_test = BertModel.from_pretrained(model_name)

# set to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
bert_test.to(device)

cuda


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [34]:
text_list = [
    '明日は自然言語処理の勉強をしよう。',
    '明日はマシーンラーニングの勉強をしよう。'
]

# 文章の符号化
encoding = tokenizer(
    text_list,
    max_length=32,
    padding='max_length',
    truncation=True,
    return_tensors='pt' # これが必要だー
)

# データをGPUに載せる
encoding = { k: v.cuda() for k, v in encoding.items() } 

# BERTでの処理
output = bert(**encoding) # それぞれの入力は2次元のtorch.Tensor
last_hidden_state = output.last_hidden_state # 最終層の出力


In [32]:
print(encoding)

{'input_ids': tensor([[    2, 11475,     9,  1757,  1882,  2762,     5,  8192,    11,  2132,
           205,     8,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [    2, 11475,     9,    96, 13866,   422,  1581,     5,  8192,    11,
          2132,   205,     8,     3,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 