# 문장 토큰 단위 분류 모델 학습

### 1. CPU 및 GPU 환경설정

In [1]:
import torch
import random
import numpy as np
import torch.backends.cudnn as cudnn

torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
np.random.seed(42)
cudnn.benchmark = False
cudnn.deterministic = True
random.seed(42)

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
torch.cuda.get_device_name(0)

'CUDA GPU'

### 2. 데이터셋

In [4]:
!wget https://raw.githubusercontent.com/KLUE-benchmark/KLUE/main/klue_benchmark/klue-ner-v1.1/klue-ner-v1.1_train.tsv
!wget https://raw.githubusercontent.com/KLUE-benchmark/KLUE/main/klue_benchmark/klue-ner-v1.1/klue-ner-v1.1_dev.tsv

--2022-07-05 07:16:43--  https://raw.githubusercontent.com/KLUE-benchmark/KLUE/main/klue_benchmark/klue-ner-v1.1/klue-ner-v1.1_train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10570302 (10M) [text/plain]
Saving to: ‘klue-ner-v1.1_train.tsv’


2022-07-05 07:16:45 (32.4 MB/s) - ‘klue-ner-v1.1_train.tsv’ saved [10570302/10570302]

--2022-07-05 07:16:45--  https://raw.githubusercontent.com/KLUE-benchmark/KLUE/main/klue_benchmark/klue-ner-v1.1/klue-ner-v1.1_dev.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2635045 (2.5M)

### 3. 허깅페이스 트랜스포머 설치

In [4]:
!pip install transformers==4.5.1

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


### 4. 데이터셋 샘플

In [5]:
import pandas as pd

In [6]:
train = pd.read_csv("klue-ner-v1.1_train.tsv", names=['src', 'tar'], sep="\t").iloc[5:, :]
train['tag'] = train['src'].apply(lambda x : 1 if '##' in x else 0)
train['tag'] = train['tag'].diff()
train = train.query('tag in [-1, 0]')
train = train.reset_index(drop=True).reset_index(drop=True)
train['tag'][0] = 0.0
train = train.replace(' ', '_')
train

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['tag'][0] = 0.0


Unnamed: 0,src,tar,tag
0,특,O,0.0
1,히,O,0.0
2,_,O,0.0
3,영,B-LC,0.0
4,동,I-LC,0.0
...,...,...,...
429113,탄,O,0.0
429114,탄,O,0.0
429115,해,O,0.0
429116,요,O,0.0


In [7]:
dev = pd.read_csv("klue-ner-v1.1_dev.tsv", names=['src', 'tar'], sep="\t", engine='python', quotechar='"', error_bad_lines=False).iloc[5:, :]
dev['tag'] = dev['src'].apply(lambda x : 1 if '##' in x else 0)
dev['tag'] = dev['tag'].diff()
dev = dev.query('tag in [-1, 0]')
dev = dev.reset_index(drop=True).reset_index(drop=True)
dev['tag'][0] = 0.0
dev = dev.replace(' ', '_')
dev

Skipping line 12471: '	' expected after '"'
Skipping line 21437: '	' expected after '"'
Skipping line 21473: '	' expected after '"'
Skipping line 21505: '	' expected after '"'
Skipping line 21515: '	' expected after '"'
Skipping line 21522: '	' expected after '"'
Skipping line 21536: '	' expected after '"'
Skipping line 21811: '	' expected after '"'
Skipping line 21821: '	' expected after '"'
Skipping line 27962: '	' expected after '"'
Skipping line 75003: '	' expected after '"'
Skipping line 79608: '	' expected after '"'
Skipping line 103812: '	' expected after '"'
Skipping line 103818: '	' expected after '"'
Skipping line 103835: '	' expected after '"'
Skipping line 112774: '	' expected after '"'
Skipping line 122300: '	' expected after '"'
Skipping line 122305: '	' expected after '"'
Skipping line 122309: '	' expected after '"'
Skipping line 128274: '	' expected after '"'
Skipping line 128289: '	' expected after '"'
Skipping line 128296: '	' expected after '"'
Skipping line 128319: 

Unnamed: 0,src,tar,tag
0,경,B-OG,0.0
1,찰,I-OG,0.0
2,은,O,0.0
3,_,O,0.0
4,또,O,0.0
...,...,...,...
128302,는,O,0.0
128303,_,O,0.0
128304,이,O,0.0
128305,어,O,0.0


### 5. 데이터셋 전처리

In [8]:
def read_file(train):
    token_docs = []
    tag_docs = []

    tokens = []
    tags = []

    for i in range(train.shape[0]):

        if (train['tag'][i] != -1):
            tokens.append(train['src'][i])
            tags.append(train['tar'][i])

        else:
            token_docs.append(tokens)
            tag_docs.append(tags)
            tokens = []
            tags = []
            tokens.append(train['src'][i])
            tags.append(train['tar'][i])
        
    return token_docs, tag_docs

In [9]:
texts, tags = read_file(train)
dev_texts, dev_tags = read_file(dev)

In [10]:
print(len(texts))
print(len(tags))

7901
7901


In [11]:
print(texts[1], end='\n\n') # 음절 단위로 잘 잘렸네요!
print(tags[1])

['한', '군', '데', '서', '_', '필', '름', '을', '_', '너', '무', '_', '낭', '비', '한', '_', '작', '품', '입', '니', '다', '.']

['B-QT', 'I-QT', 'I-QT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [12]:
unique_tags = set(train['tar'])
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [13]:
for i, tag in enumerate(unique_tags):
    print(tag)  # 학습을 위한 label list를 확인합니다.

I-TI
B-PS
I-LC
I-DT
B-DT
I-QT
I-OG
I-PS
O
B-TI
B-QT
B-OG
B-LC


### 6. EDA

In [14]:
import numpy as np
import matplotlib.pyplot as plt

**6.2 문장의 길이의 히스토그램**

In [15]:
for tag in list(tag2id.keys()):
    globals()[tag] = 0

In [16]:
for tag in train['tar']:
    globals()[tag] += 1

In [17]:
for tag in list(tag2id.keys()): 
    print('{:>6} : {:>7,}'. format(tag, globals()[tag]))

  I-TI :   2,237
  B-PS :   5,449
  I-LC :   6,972
  I-DT :   7,973
  B-DT :   2,954
  I-QT :   8,559
  I-OG :  10,093
  I-PS :  10,599
     O : 363,440
  B-TI :     738
  B-QT :   4,362
  B-OG :   3,221
  B-LC :   2,521


### 7. Train Test Split

In [18]:
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_tags, test_tags = train_test_split(texts, tags, test_size=.1, random_state=42)

In [19]:
print('Train 문장 : {:>6,}' .format(len(train_texts)))
print('Train 태그 : {:>6,}' .format(len(train_tags)))
print('Test  문장 : {:>6,}' .format(len(test_texts)))
print('Test  태그 : {:>6,}' .format(len(test_tags)))

Train 문장 :  7,110
Train 태그 :  7,110
Test  문장 :    791
Test  태그 :    791


### 8. BERT 토크나이저

In [20]:
from transformers import AutoModel, AutoTokenizer, BertTokenizer
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

In [21]:
pad_token_id = tokenizer.pad_token_id # 0
cls_token_id = tokenizer.cls_token_id # 101
sep_token_id = tokenizer.sep_token_id # 102
pad_token_label_id = tag2id['O']    # tag2id['O']
cls_token_label_id = tag2id['O']
sep_token_label_id = tag2id['O']

In [22]:
# 기존 토크나이저는 wordPiece tokenizer로 tokenizing 결과를 반환합니다.
# 데이터 단위를 음절 단위로 변경했기 때문에, tokenizer도 음절 tokenizer로 바꿀게요! :-)

# berttokenizer를 쓸건데 여기에 들어있는 8000개 밖에 안되는 한국어가 들어있지만
# 많은 수가 음절이다. 그래서 음절로 쪼개면 UNK가 별로 없을듯.
def ner_tokenizer(sent, max_seq_length):
    pre_syllable = "_"
    input_ids = [pad_token_id] * (max_seq_length - 1)
    attention_mask = [0] * (max_seq_length - 1)
    token_type_ids = [0] * max_seq_length
    sent = sent[:max_seq_length-2]

    for i, syllable in enumerate(sent):
        if syllable == '_':
            pre_syllable = syllable
        if pre_syllable != "_":
            syllable = '##' + syllable  # 중간 음절에는 모두 prefix를 붙입니다.
            # 우리가 구성한 학습 데이터도 이렇게 구성되었기 때문이라고 함.
            # 이순신은 조선 -> [이, ##순, ##신, ##은, 조, ##선]
        pre_syllable = syllable

        input_ids[i] = (tokenizer.convert_tokens_to_ids(syllable))
        attention_mask[i] = 1
    
    input_ids = [cls_token_id] + input_ids
    input_ids[len(sent)+1] = sep_token_id
    attention_mask = [1] + attention_mask
    attention_mask[len(sent)+1] = 1
    return {"input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids}

In [23]:
print(ner_tokenizer(train_texts[0], 5))
# 'token_type_ids': [0, 0, 0, 0, 0] 은 segmentA로 0000으로 되어있는거.

{'input_ids': [2, 2252, 4070, 67, 3], 'attention_mask': [1, 1, 1, 1, 1], 'token_type_ids': [0, 0, 0, 0, 0]}


In [24]:
tokenized_train_sentences = []
tokenized_test_sentences = []
tokenized_dev_sentences = []

for text in train_texts:    # 전체 데이터를 tokenizing 합니다.
    tokenized_train_sentences.append(ner_tokenizer(text, 128))
for text in test_texts:
    tokenized_test_sentences.append(ner_tokenizer(text, 128))
for text in dev_texts:
    tokenized_dev_sentences.append(ner_tokenizer(text, 128))

In [25]:
# 우리의 label도 truncation과 tokenizing이 필요하다.
def encode_tags(tags, max_seq_length):
    # label 역시 입력 token과 개수를 맞춰줍니다 :-)
    tags = tags[:max_seq_length-2]
    labels = [tag2id[tag] for tag in tags]
    labels = [tag2id['O']] + labels

    padding_length = max_seq_length - len(labels)
    labels = labels + ([pad_token_label_id] * padding_length)

    return labels

In [26]:
tag2id

{'I-TI': 0,
 'B-PS': 1,
 'I-LC': 2,
 'I-DT': 3,
 'B-DT': 4,
 'I-QT': 5,
 'I-OG': 6,
 'I-PS': 7,
 'O': 8,
 'B-TI': 9,
 'B-QT': 10,
 'B-OG': 11,
 'B-LC': 12}

In [27]:
encode_tags(train_tags[0], 5)

[8, 8, 8, 8, 8]

In [28]:
train_labels = []
test_labels = []
dev_labels = []

for tag in train_tags:
    train_labels.append(encode_tags(tag, 128))

for tag in test_tags:
    test_labels.append(encode_tags(tag, 128))
    
for tag in dev_tags:
    dev_labels.append(encode_tags(tag, 128))

In [29]:
len(train_labels), len(test_labels), len(dev_labels)

(7110, 791, 2219)

### 9. Token 데이터셋

In [30]:
import torch

# 여기 부터는 이제 지겨워지죠? :-)
class TokenDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.encodings[idx].items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TokenDataset(tokenized_train_sentences, train_labels)
test_dataset = TokenDataset(tokenized_test_sentences, test_labels)
dev_dataset = TokenDataset(tokenized_dev_sentences, dev_labels)

In [31]:
# BertForSencenceClassification이 아니다! token이 목적이야
from transformers import BertForTokenClassification, Trainer, TrainingArguments, AutoModelForTokenClassification, BigBirdForTokenClassification
import sys
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    learning_rate=3e-5,
    save_total_limit=5
)

### 10. BertForTokenClassification

In [32]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(unique_tags))

model.to(device)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
)

Downloading:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForTokenClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier

In [33]:
trainer.train()

Step,Training Loss
100,0.7132
200,0.3311
300,0.2281
400,0.2131
500,0.168
600,0.1461
700,0.144
800,0.1334
900,0.1228
1000,0.0935


TrainOutput(global_step=4445, training_loss=0.08841807466792309, metrics={'train_runtime': 482.462, 'train_samples_per_second': 9.213, 'total_flos': 3067172065497600.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 4231168, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 27475968, 'train_mem_gpu_alloc_delta': 1379048448, 'train_mem_cpu_peaked_delta': 215003136, 'train_mem_gpu_peaked_delta': 900408320})

In [34]:
predictions = trainer.predict(dev_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

(2219, 128, 13) (2219, 128)


In [35]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [36]:
index_to_ner = {i:j for j, i in tag2id.items()}
f_label = [i for i, j in tag2id.items()]
val_tags_l = [index_to_ner[x] for x in np.ravel(predictions.label_ids).astype(int).tolist()]
y_predicted_l = [index_to_ner[x] for x in np.ravel(preds).astype(int).tolist()]

In [37]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [38]:
print(classification_report(val_tags_l, y_predicted_l, labels=f_label))

              precision    recall  f1-score   support

        I-TI       0.92      0.98      0.95       833
        B-PS       0.91      0.91      0.91      2008
        I-LC       0.80      0.82      0.81      2640
        I-DT       0.92      0.92      0.92      2945
        B-DT       0.88      0.92      0.90      1039
        I-QT       0.92      0.95      0.93      3419
        I-OG       0.79      0.81      0.80      3405
        I-PS       0.91      0.91      0.91      4576
           O       0.99      0.99      0.99    259763
        B-TI       0.94      0.96      0.95       236
        B-QT       0.94      0.95      0.95      1443
        B-OG       0.75      0.82      0.78       932
        B-LC       0.80      0.85      0.83       793

    accuracy                           0.98    284032
   macro avg       0.88      0.91      0.89    284032
weighted avg       0.98      0.98      0.98    284032



### 11. New Data Inference

In [39]:
# 우리가 전에 사용했던건 word piece tokenizer
# 지금 사용한건 음절단위 tokenizer
# 반드시 음절 tokenizer를 거친 후에 model에 들어가야 한다.

def ner_inference(text) : 
  
    model.eval()
    text = text.replace(' ', '_')

    predictions , true_labels = [], []
    
    tokenized_sent = ner_tokenizer(text, len(text)+2)
    input_ids = torch.tensor(tokenized_sent['input_ids']).unsqueeze(0).to(device)
    attention_mask = torch.tensor(tokenized_sent['attention_mask']).unsqueeze(0).to(device)
    token_type_ids = torch.tensor(tokenized_sent['token_type_ids']).unsqueeze(0).to(device)    
    
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)
        
    logits = outputs['logits']
    logits = logits.detach().cpu().numpy()
    label_ids = token_type_ids.cpu().numpy()

    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.append(label_ids)

    pred_tags = [list(tag2id.keys())[p_i] for p in predictions for p_i in p]

    print('{}\t{}'.format("TOKEN", "TAG"))
    print("===========")
    # for token, tag in zip(tokenizer.decode(tokenized_sent['input_ids']), pred_tags):
    #   print("{:^5}\t{:^5}".format(token, tag))
    for i, tag in enumerate(pred_tags):
        print("{:^5}\t{:^5}".format(tokenizer.convert_ids_to_tokens(tokenized_sent['input_ids'][i]), tag))

In [40]:
text = '이순신은 조선 중기의 무신이다.'

In [41]:
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  이  	B-PS 
 ##순 	I-PS 
 ##신 	I-PS 
 ##은 	  O  
  _  	  O  
  조  	B-DT 
 ##선 	I-DT 
  _  	I-DT 
  중  	I-DT 
 ##기 	I-DT 
 ##의 	  O  
  _  	  O  
  무  	  O  
 ##신 	  O  
 ##이 	  O  
 ##다 	  O  
[UNK]	  O  
[SEP]	  O  


In [42]:
text = '로스트아크는 스마일게이트 RPG가 개발한 쿼터뷰 액션 MMORPG 게임이다.'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  로  	B-OG 
 ##스 	I-OG 
 ##트 	I-PS 
 ##아 	I-OG 
 ##크 	I-OG 
 ##는 	  O  
  _  	  O  
  스  	  O  
 ##마 	  O  
 ##일 	  O  
 ##게 	  O  
 ##이 	  O  
 ##트 	  O  
  _  	  O  
  R  	B-OG 
 ##P 	I-OG 
 ##G 	I-OG 
 ##가 	  O  
  _  	  O  
  개  	  O  
 ##발 	  O  
 ##한 	  O  
  _  	  O  
  쿼  	  O  
 ##터 	  O  
 ##뷰 	  O  
  _  	  O  
  액  	  O  
 ##션 	  O  
  _  	  O  
  M  	  O  
 ##M 	  O  
 ##O 	  O  
 ##R 	  O  
 ##P 	  O  
 ##G 	  O  
  _  	  O  
  게  	  O  
 ##임 	  O  
 ##이 	  O  
 ##다 	  O  
[UNK]	  O  
[SEP]	  O  


In [43]:
text = '2014년 11월 12일 최초 공개했으며 2018년 11월 7일부터 오픈 베타 테스트를 진행하다 2019년 12월 4일 정식 오픈했다.'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  2  	B-DT 
 ##0 	I-DT 
 ##1 	I-DT 
 ##4 	I-DT 
 ##년 	I-DT 
  _  	I-DT 
  1  	I-DT 
 ##1 	I-DT 
 ##월 	I-DT 
  _  	I-DT 
  1  	I-DT 
 ##2 	I-DT 
 ##일 	I-DT 
  _  	  O  
  최  	  O  
 ##초 	  O  
  _  	  O  
  공  	  O  
 ##개 	  O  
 ##했 	  O  
 ##으 	  O  
 ##며 	  O  
  _  	  O  
  2  	B-DT 
 ##0 	I-DT 
 ##1 	I-DT 
 ##8 	I-DT 
 ##년 	I-DT 
  _  	I-DT 
  1  	I-DT 
 ##1 	I-DT 
 ##월 	I-DT 
  _  	I-DT 
  7  	I-DT 
 ##일 	I-DT 
 ##부 	  O  
 ##터 	  O  
  _  	  O  
  오  	  O  
 ##픈 	  O  
  _  	  O  
  베  	  O  
 ##타 	  O  
  _  	  O  
  테  	  O  
 ##스 	  O  
 ##트 	  O  
 ##를 	  O  
  _  	  O  
  진  	  O  
 ##행 	  O  
 ##하 	  O  
 ##다 	  O  
  _  	  O  
  2  	B-DT 
 ##0 	I-DT 
 ##1 	I-DT 
 ##9 	I-DT 
 ##년 	I-DT 
  _  	I-DT 
  1  	I-DT 
 ##2 	I-DT 
 ##월 	I-DT 
  _  	I-DT 
  4  	I-DT 
 ##일 	I-DT 
  _  	  O  
  정  	  O  
 ##식 	  O  
  _  	  O  
  오  	  O  
 ##픈 	  O  
 ##했 	  O  
 ##다 	  O  
[UNK]	  O  
[SEP]	  O  


In [44]:
text = '짜장면 7,000원'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  짜  	  O  
 ##장 	  O  
 ##면 	  O  
  _  	  O  
  7  	B-QT 
[UNK]	I-QT 
 ##0 	I-QT 
 ##0 	I-QT 
 ##0 	I-QT 
 ##원 	I-QT 
[SEP]	  O  


In [45]:
text = '안녕하세요 저는 이지평이라고 합니다. 국민대학교에 다니고 있으며, 현재는 4학년 1학기를 마쳤습니다.'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  안  	  O  
 ##녕 	  O  
 ##하 	  O  
 ##세 	  O  
 ##요 	  O  
  _  	  O  
  저  	  O  
 ##는 	  O  
  _  	  O  
  이  	B-PS 
 ##지 	I-PS 
 ##평 	I-PS 
 ##이 	  O  
 ##라 	  O  
 ##고 	  O  
  _  	  O  
  합  	  O  
 ##니 	  O  
 ##다 	  O  
[UNK]	  O  
  _  	  O  
  국  	B-OG 
 ##민 	I-OG 
 ##대 	I-OG 
 ##학 	I-OG 
 ##교 	I-OG 
 ##에 	  O  
  _  	  O  
  다  	  O  
 ##니 	  O  
 ##고 	  O  
  _  	  O  
  있  	  O  
 ##으 	  O  
 ##며 	  O  
[UNK]	  O  
  _  	  O  
  현  	  O  
 ##재 	  O  
 ##는 	  O  
  _  	  O  
  4  	B-QT 
 ##학 	I-QT 
 ##년 	I-QT 
  _  	I-QT 
  1  	I-QT 
 ##학 	I-QT 
 ##기 	I-QT 
 ##를 	  O  
  _  	  O  
  마  	  O  
 ##쳤 	  O  
 ##습 	  O  
 ##니 	  O  
 ##다 	  O  
[UNK]	  O  
[SEP]	  O  


In [46]:
text = '안녕하세요 저는 이지평이라고 합니다. 국민대학교에 다니고 있으며, 현재는 4학년 1학기를 마쳤습니다.'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  안  	  O  
 ##녕 	  O  
 ##하 	  O  
 ##세 	  O  
 ##요 	  O  
  _  	  O  
  저  	  O  
 ##는 	  O  
  _  	  O  
  이  	B-PS 
 ##지 	I-PS 
 ##평 	I-PS 
 ##이 	  O  
 ##라 	  O  
 ##고 	  O  
  _  	  O  
  합  	  O  
 ##니 	  O  
 ##다 	  O  
[UNK]	  O  
  _  	  O  
  국  	B-OG 
 ##민 	I-OG 
 ##대 	I-OG 
 ##학 	I-OG 
 ##교 	I-OG 
 ##에 	  O  
  _  	  O  
  다  	  O  
 ##니 	  O  
 ##고 	  O  
  _  	  O  
  있  	  O  
 ##으 	  O  
 ##며 	  O  
[UNK]	  O  
  _  	  O  
  현  	  O  
 ##재 	  O  
 ##는 	  O  
  _  	  O  
  4  	B-QT 
 ##학 	I-QT 
 ##년 	I-QT 
  _  	I-QT 
  1  	I-QT 
 ##학 	I-QT 
 ##기 	I-QT 
 ##를 	  O  
  _  	  O  
  마  	  O  
 ##쳤 	  O  
 ##습 	  O  
 ##니 	  O  
 ##다 	  O  
[UNK]	  O  
[SEP]	  O  


In [47]:
text = '이지평'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  이  	B-PS 
 ##지 	I-PS 
 ##평 	I-PS 
[SEP]	  O  


In [48]:
text = '마민정'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  마  	B-PS 
 ##민 	I-PS 
 ##정 	I-PS 
[SEP]	  O  


In [49]:
text = '유광열'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  유  	B-PS 
 ##광 	I-PS 
 ##열 	I-PS 
[SEP]	  O  


In [50]:
text = '최현상 멘토님'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  최  	B-PS 
 ##현 	I-PS 
 ##상 	I-PS 
  _  	  O  
  멘  	  O  
 ##토 	  O  
 ##님 	  O  
[SEP]	  O  


In [51]:
text = '강다니엘'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  강  	B-PS 
 ##다 	I-PS 
 ##니 	I-PS 
 ##엘 	I-PS 
[SEP]	  O  


In [52]:
text = '최다니엘'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  최  	B-PS 
 ##다 	I-PS 
 ##니 	I-PS 
 ##엘 	I-PS 
[SEP]	  O  


In [53]:
text = '백지헌'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  백  	B-PS 
 ##지 	I-PS 
 ##헌 	I-PS 
[SEP]	  O  


In [54]:
text = '장규리'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  장  	B-PS 
 ##규 	I-PS 
 ##리 	I-PS 
[SEP]	  O  


In [55]:
text = '오슬기나래'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  오  	B-PS 
 ##슬 	I-PS 
 ##기 	I-PS 
 ##나 	  O  
 ##래 	  O  
[SEP]	  O  


In [56]:
text = '크리스티나'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  크  	B-DT 
 ##리 	I-DT 
 ##스 	I-DT 
 ##티 	I-DT 
 ##나 	I-DT 
[SEP]	  O  


In [57]:
text = '크리스토퍼'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  크  	B-PS 
 ##리 	I-PS 
 ##스 	I-PS 
 ##토 	I-PS 
 ##퍼 	I-PS 
[SEP]	  O  


In [58]:
text = '레오나르도'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  레  	  O  
 ##오 	I-PS 
 ##나 	I-PS 
 ##르 	  O  
 ##도 	  O  
[SEP]	  O  


In [59]:
text = '엘리자베스'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  엘  	B-PS 
 ##리 	I-PS 
 ##자 	I-PS 
 ##베 	I-PS 
 ##스 	I-PS 
[SEP]	  O  


In [60]:
text = '크리스티안'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  크  	B-PS 
 ##리 	I-PS 
 ##스 	I-PS 
 ##티 	I-PS 
 ##안 	I-PS 
[SEP]	  O  


In [61]:
text = '세바스티안'
ner_inference(text)

TOKEN	TAG
[CLS]	  O  
  세  	B-OG 
 ##바 	I-OG 
 ##스 	I-OG 
 ##티 	I-PS 
 ##안 	I-PS 
[SEP]	  O  
