In [52]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# References

- [SKTBrain/KoBERT](https://github.com/SKTBrain/KoBERT)
- [eagle705/pytorch-bert-crf-ner](https://github.com/eagle705/pytorch-bert-crf-ner/blob/master/Visualization_BERT_NER.ipynb)
- [BERT to the rescue!](https://towardsdatascience.com/bert-to-the-rescue-17671379687f)

# Setup: Imports and Plot Fonts

In [53]:
import io

import pandas as pd
import numpy as np

import tqdm
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
# Register the Nanum fonts with Matplotlib and make NanumGothic the default
# family so that Hangul text renders in figures.
font_dirs = ['/usr/share/fonts/truetype/nanum']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
if hasattr(font_manager.fontManager, 'addfont'):
    # `createFontList` was deprecated in Matplotlib 3.2 and later removed;
    # `addfont` is the supported way to register a font file.
    for font_file in font_files:
        font_manager.fontManager.addfont(font_file)
else:
    # Older Matplotlib without `addfont`: keep the original registration path.
    font_list = font_manager.createFontList(font_files)
    font_manager.fontManager.ttflist.extend(font_list)
plt.rcParams['font.family'] = 'NanumGothic'

# Using SKT BERT

In [54]:
import torch
from kobert.pytorch_kobert import get_pytorch_kobert_model
from gluonnlp.data import SentencepieceTokenizer
from kobert.utils import get_tokenizer
import sentencepiece as spm

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fetch the KoBERT model together with its vocabulary object.
model, vocab = get_pytorch_kobert_model()

# Move the model onto the chosen device; as the cell's last expression this
# also displays the module structure.
model.to(device)

using cached model
using cached model


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Line

In [101]:
# Resolve the tokenizer model path and load it into a SentencePiece processor.
tok_path = get_tokenizer()
print(tok_path)
sp = spm.SentencePieceProcessor()
print(sp)
sp.Load(tok_path)

# Sample sentence, given as plain text without explicit [CLS] / [SEP] markers.
text = "누가 기침소리를 내었는가 ? 누구인가 ?"

# text -> subword pieces -> vocabulary indices -> back to pieces (round trip).
tokenized_text = sp.EncodeAsPieces(text)
indexed_tokens = vocab.to_indices(tokenized_text)
reconstructed = vocab.to_tokens(indexed_tokens)

# Show every stage of the round trip, one per line.
print(text, tokenized_text, indexed_tokens, reconstructed, sep='\n')

using cached model
/root/kobert/tokenizer_78b3253a26.model
<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f7dc26a7c90> >
누가 기침소리를 내었는가 ? 누구인가 ?
['▁누가', '▁기', '침', '소리', '를', '▁내', '었', '는', '가', '▁?', '▁누구', '인', '가', '▁?']
[1527, 1258, 7491, 6609, 6116, 1434, 6885, 5760, 5330, 633, 1528, 7119, 5330, 633]
['▁누가', '▁기', '침', '소리', '를', '▁내', '었', '는', '가', '▁?', '▁누구', '인', '가', '▁?']


In [56]:
# Encode the same text with SentencePiece's own id numbering, for comparison.
# The ids recorded for this cell differ from the `vocab.to_indices(...)` result
# printed in the tokenizer cell, so they are kept under a separate name instead
# of overwriting `indexed_tokens`; otherwise a top-to-bottom run would leave
# `indexed_tokens` holding ids from a different numbering.
sp_piece_ids = sp.EncodeAsIds(text)
print(sp_piece_ids)

[5272, 248, 1175, 4526, 12, 121, 1528, 10, 11, 4859, 2861, 24, 11, 4859]


In [104]:
# github example: run a tiny hand-written batch through the model.

# Two sequences of three token ids each.
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]).to(device)
# Mask tensor passed to the model as `attention_mask`.
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]).to(device)
# Segment ids passed to the model as `token_type_ids`.
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]).to(device)

# Pass the mask and segment ids by keyword so the call does not depend on the
# positional order of the model's forward() signature. Gradients are not needed
# to inspect the outputs, so skip building the autograd graph.
with torch.no_grad():
    all_encoder_layers, pooled_output = model(
        input_ids,
        token_type_ids=token_type_ids,
        attention_mask=input_mask,
    )

# Last encoder layer's output for the first sequence in the batch.
print(all_encoder_layers[-1][0])
print(pooled_output.shape)
print(pooled_output)

tensor([[-0.2461,  0.2428,  0.2590,  ..., -0.4861, -0.0731,  0.0756],
        [-0.2478,  0.2420,  0.2552,  ..., -0.4877, -0.0727,  0.0754],
        [-0.2472,  0.2420,  0.2561,  ..., -0.4874, -0.0733,  0.0765]],
       device='cuda:0', grad_fn=<SelectBackward>)
torch.Size([2, 768])
tensor([[-0.0903, -0.0444,  0.1579,  ...,  0.1010, -0.0819,  0.0529],
        [ 0.0742, -0.0116, -0.6845,  ...,  0.0024, -0.0447,  0.0122]],
       device='cuda:0', grad_fn=<TanhBackward>)
