# 필요한 라이브러리 Import

In [3]:
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
from KoBERT.kobert.pytorch_kobert import get_pytorch_kobert_model
import random
import numpy as np
from tqdm import tqdm, trange, tqdm_notebook
import pandas as pd
from ast import literal_eval
from torch import nn
from transformers import BertModel, BertConfig
from torchcrf import CRF
from pytorch_transformers import AdamW, WarmupLinearSchedule
from gluonnlp.data import BERTSPTokenizer
import sentencepiece as spm
from seqeval.metrics import f1_score
from pathlib import Path
import json

In [4]:
class Config:
    def __init__(self, json_path):
        with open(json_path, mode='r') as io:
            params = json.loads(io.read())
        self.__dict__.update(params)

    def save(self, json_path):
        with open(json_path, mode='w') as io:
            json.dump(self.__dict__, io, indent=4)

    def update(self, json_path):
        with open(json_path, mode='r') as io:
            params = json.loads(io.read())
        self.__dict__.update(params)

    @property
    def dict(self):
        return self.__dict__

# KoBERT + CRF

In [5]:
#클래스 참고: KoBERT와 CRF로 만든 한국어 개체명 인식 https://github.com/eagle705/pytorch-bert-crf-ner

bert_config = {'attention_probs_dropout_prob': 0.1,
                 'hidden_act': 'gelu',
                 'hidden_dropout_prob': 0.1,
                 'hidden_size': 768,
                 'initializer_range': 0.02,
                 'intermediate_size': 3072,
                 'max_position_embeddings': 512,
                 'num_attention_heads': 12,
                 'num_hidden_layers': 12,
                 'type_vocab_size': 2,
                 'vocab_size': 11575,
                 'padding': True,
                 'pair': False
                 }
                 
class KobertCRF(nn.Module):
    """ KoBERT with CRF """
    def __init__(self, config, num_classes, vocab=None) -> None:
        super(KobertCRF, self).__init__()

        if vocab is None:
            self.bert, self.vocab = get_pytorch_kobert_model()
        else:
            self.bert = BertModel(config=BertConfig.from_dict(bert_config))
            self.vocab = vocab

        self.dropout = nn.Dropout(config.dropout)
        self.position_wise_ff = nn.Linear(config.hidden_size, num_classes)
        self.crf = CRF(num_tags=num_classes, batch_first=True)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()
        
    def forward(self, input_ids, valid_length, token_type_ids=None, tags=None):
        attention_mask = self.gen_attention_mask(input_ids, valid_length)

        # outputs: (last_encoder_layer, pooled_output, attention_weight)
        outputs = self.bert(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask.float().to(input_ids.device))
        last_encoder_layer = outputs[0]
        last_encoder_layer = self.dropout(last_encoder_layer)
        emissions = self.position_wise_ff(last_encoder_layer)

        if tags is not None:
            log_likelihood, sequence_of_tags = self.crf(emissions, tags), self.crf.decode(emissions)
            return log_likelihood, sequence_of_tags
        else:
            sequence_of_tags = self.crf.decode(emissions)
            return sequence_of_tags

# Tokenizer & Vocab 생성


In [None]:
#Tokenizer model 생성

corpus = "vocab.txt" # KoCharELECTRA (https://github.com/monologg/KoCharELECTRA) vocab.txt 파일 사용 (음절 vocab 파일)
prefix = "vocab"
vocab_size = 11568 # vocab.txt 크기

spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" + 
    " --model_type=bpe" +
    " --max_sentence_length=10" + # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" + # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" + # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" + # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]") # 사용자 정의 토큰

In [6]:
bertmodel, _ = get_pytorch_kobert_model() # KoBERT 모델 불러오기
vocab = nlp.vocab.BERTVocab.from_sentencepiece('vocab.model', padding_token='[PAD]')
tokenizer = "vocab.model" #생성한 vocab.model 이용
tok = BERTSPTokenizer(tokenizer, vocab, lower=False) # BERTSPTokenizer 생성

using cached model
using cached model


## 데이터 셋 불러오기

In [33]:
import re
p = re.compile('[一-龥]') #한자를 포함한 문장 제거하기 위한 정규식

train_list_csv = pd.read_csv('Data/human_train.csv', converters={"tag": literal_eval})
test_list_csv = pd.read_csv('Data/human_test.csv', converters={"tag": literal_eval})

tr_tag = train_list_csv['tag']
tr_sent = train_list_csv['sent']
ts_tag = test_list_csv['tag']
ts_sent = test_list_csv['sent']

for i in range(len(train_list_csv['tag'])):
    if p.search(tr_sent[i]): #한자 문장 제거
        del tr_sent[i]
        del tr_tag[i]
    elif len(tr_tag[i]) != len(tr_sent[i])+3: #데이터 셋 준비 과정 중 길이 다르면 해당 인댁스 출력 +3 해주는 이유는 버트에 넣기 전 붙을 태그 등 데이터 떄문
        print(i)
    elif len(tr_tag[i]) > 512 or len(tr_sent[i])+3 > 512: # 문장의 길이가 512가 넘는 문장 제거
        del tr_sent[i]
        del tr_tag[i]

for i in range(len(test_list_csv['tag'])):
    if p.search(ts_sent[i]): #한자 문장 제거
        del ts_sent[i]
        del ts_tag[i]
    elif 'Ⅳ' in ts_sent[i]: #로마자 제거
        del ts_sent[i]
        del ts_tag[i]
    elif len(ts_tag[i]) != len(ts_sent[i])+3: #데이터 셋 준비 과정 중 길이 다르면 해당 인댁스 출력 +3 해주는 이유는 버트에 넣기 전 붙을 태그 등 데이터 떄문
        print(i)
    elif len(ts_tag[i]) > 512 or len(ts_sent[i])+3 > 512: # 문장의 길이가 512가 넘는 문장 제거
        del ts_sent[i]
        del ts_tag[i]
    

목척수신경C2:턱하부,머리뒤쪽C3:목상부,머리뒤쪽C4:목하부,어깨상부C5:쇄골부위,어깨상부C6:어깨,팔바깥쪽,엄지C7:등상부,팔뒤쪽,검지,중지C8:등상부,팔안쪽,약지,새끼손가락DermatomesoftheUpperPartsoftheBody,displayingsignificantoverlapping(Modified,fromFender,afterFoerster)가슴척수신경T1:가슴상부와등,겨드랑이,팔앞쪽T2:가슴상부와등T3:가슴상부와등T4:가슴상부(유두부분)그리고등T5:가슴중부와등T6:가슴중부와등T7:가슴중부와등T8:상복부와등중부T9:상복부와등중부T10:복부(배꼽주위)그리고등중부T11:복부와등중부T12:하복부와등중부LumbarspinalnervesL1:요추,고관절,서혜부L2:요추,허벅지앞쪽과안쪽L3:요추,허벅지앞쪽과안쪽L4:요추,허벅지앞쪽과종아리,무릎부분,발목안쪽L5:요추,종아리앞쪽과바깥쪽,발등과발바닥,발가락1-4번SacralspinalnervesS1:요추,허벅지뒤쪽,종아리뒤쪽과안쪽,새끼발가락S2:엉덩이,음부,허벅지뒤쪽과종아리S3:엉덩이,음부S4:엉덩이S5:엉덩이Coccygealspinalnerves엉덩이,꼬리뼈부분[출처][한님쌤필라테스]통증을관리하려면꼭알아야하는,더마토미dermatome|작성자한님쌤필라테스네그렇습니다. 3238


## 버트 데이터 셋 클래스 생성

In [8]:
# 코드 참고: 버트를 이용한 네이버 댓글 분류기 https://github.com/SKTBrain/KoBERT/blob/master/scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb

class BERTDataset(Dataset):
    def __init__(self, sent, tag, bert_tokenizer, max_len,
                pad, pair):
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_len, pad=pad, pair=pair)
        self.sentences = [transform([i]) for i in sent] #문장
        self.labels = [np.pad(np.int32(i), (0,max_len-len(i)), 'constant', constant_values=0) for i in tag] #질병 태그 및 질병 태그 max_len 패딩

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

## 사용자 입력 문장을 테스하기 위한 데이터 셋 클래스 생성

In [None]:
class BERTTESTsent(Dataset):
    def __init__(self, sent, bert_tokenizer, max_len,
                pad, pair):
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_len, pad=pad, pair=pair)
        self.sentences = [transform([i]) for i in sent] #문장

    def __getitem__(self, i):
        return (self.sentences[i])

    def __len__(self):
        return (len(self.sentences))

## 데이터 셋 나누기

In [24]:
data_train = BERTDataset(tr_sent,tr_tag, tok, 512, True, False)
data_test = BERTDataset(ts_sent,ts_tag, tok, 512, True, False)

In [25]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=32, num_workers = 5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=32, num_workers = 5, shuffle=True)

In [11]:
data_train[0]

(array([    5,   248,   918,  3493,  8030,  6888,   638, 11110,  3914,
         2738,  6210,  9738,   414,  2115,  5839,  7462,  6994,  7218,
          280,     4,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

## 데이터 확인

In [None]:
def count_len(list):
    count = 0
    for i in range(len(list)):
        if list[i] == 0:
            break
        else:
            count += 1
    return count

In [None]:
data_tr_count = 0
data_ts_count = 0
for i in range(len(data_train)):
    
    if count_len(data_train[i][0]) != count_len(data_train[i][3]):
        data_tr_count += 1
        print(i)

for i in tqdm_notebook(range(len(data_test))):

    if count_len(data_test[i][0]) != count_len(data_test[i][3]):
        data_ts_count += 1
        print(i)
print(data_tr_count, data_ts_count)

# 모델 선언 (초기 학습을 위한)

In [36]:
device = torch.device('cuda')
model_config = Config('config.json')
model = KobertCRF(config=model_config, num_classes=7)
model.to(device)
model.train()

using cached model
using cached model


KobertCRF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

# 학습된 모델 불러오기 (파인 튜닝을 위한)

In [None]:
model = torch.load('KoBERT_DIS_NER/Human_DIS(epoch3).pth')
model_config = Config('config.json')

# 모델 환경 설정

In [29]:
def set_seed(seed=100):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    n_gpu = torch.cuda.device_count()
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

In [30]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(
        nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

t_total = len(
    train_dataloader) // model_config.gradient_accumulation_steps * model_config.epochs
optimizer = AdamW(optimizer_grouped_parameters,
                    lr=model_config.learning_rate, eps=model_config.adam_epsilon)
scheduler = WarmupLinearSchedule(
    optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)

model_dir = 'DIS_NER/'

global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
set_seed()

# 모델 학습

In [31]:
import os
from torch.utils.tensorboard import SummaryWriter

In [32]:
train_iterator = trange(int(model_config.epochs), desc="Epoch")
device = torch.device('cuda')
writer = SummaryWriter('scalar/')

for _epoch, _ in enumerate(train_iterator):
    epoch = _epoch
    model.train()
    for step, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
 
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
 
        log_likelihood, sequence_of_tags = model(token_ids,valid_length, segment_ids, label)

        # loss: negative log-likelihood
        loss = -1 * log_likelihood
        writer.add_scalar("Loss/train", loss, epoch)
            
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), model_config.max_grad_norm)
        tr_loss += loss.item()

        if (step + 1) % model_config.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

            with torch.no_grad():
                sequence_of_tags = torch.tensor(
                    sequence_of_tags).to(device)
                mb_acc = (sequence_of_tags == label).float()[label != 0].mean()
                writer.add_scalar("Acc/train", mb_acc, epoch)

            tr_acc = mb_acc.item()
            tr_loss_avg = tr_loss / global_step
            tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}
            state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
            if step % 100 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(
                        epoch + 1, global_step, tr_summary['loss'], tr_summary['acc']))
                
    torch.save(model, 'KoBERT_DIS_NER/Human_DIS(epoch'+str(epoch+1)+').pth')
    print("save")

writer.close()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=9475.0), HTML(value='')))

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]







IndexError: index out of range in self

# 모델 저장

In [None]:
torch.save(model, 'KoBERT_DIS_NER/Human_PET_DIS.pth')

# 모델 평가

In [None]:
def index_to_ner(sequences, val_len):

    idx_to_ner = {
        1 : "[CLS]", 
        2 : "[SEP]", 
        0 : "O", #[PAD] to 'O'
        3 : "[MASK]",
        4 : "O",
        5 : "B-DIS",
        6 : "I-DIS"
    }
    result = []
    for i in range(val_len):
        result.append(idx_to_ner[sequences[i]])
    return result

In [None]:
device = torch.device('cuda')
model_config = Config('config.json')
model = torch.load('KoBERT_DIS_NER/Human_PET_DIS.pth')
model.to(device)
model.eval()
eval_step = 0
sum_f1 = 0
predic_list = []
true_list = []
predic_list2= []
true_list2 = []
for step, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
    
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    label = label.long().to(device)

    log_likelihood, sequence_of_tags = model(token_ids,valid_length, segment_ids, label)
    # loss: negative log-likelihood
    eval_loss = -1 * log_likelihood

    if (eval_step + 1) % model_config.gradient_accumulation_steps == 0:
        eval_step += 1
        with torch.no_grad():
            sequence_of_tags = torch.tensor(
                sequence_of_tags).to(device)
            eval_acc = (sequence_of_tags == label).float()[label != 0].mean()
        eval_loss_avg = eval_loss / eval_step
        eval_summary = {'loss': eval_loss_avg, 'acc': eval_acc}

        predic_list.append(index_to_ner(sequence_of_tags[0].cpu().numpy(), valid_length[0].cpu().numpy()))
        true_list.append(index_to_ner(label[0].cpu().numpy(), valid_length[0].cpu().numpy()))
        
        if eval_step % 10 == 0:
            print('eval_step : {}, evl_loss: {:.3f}, evl_acc: {:.2f}'.format(
                eval_step, eval_summary['loss'], eval_summary['acc']))

## F1 스코어 평가

In [None]:
f1_score(true_list, predic_list)

# 사용자 입력 문장 모델 평가

In [None]:
bertmodel, _ = get_pytorch_kobert_model() # KoBERT 모델 불러오기
vocab = nlp.vocab.BERTVocab.from_sentencepiece('vocab.model', padding_token='[PAD]')
tokenizer = "vocab.model" #생성한 vocab.model 이용
tok = BERTSPTokenizer(tokenizer, vocab, lower=False) # BERTSPTokenizer 생성

###### model_config = Config('config.json')

# model
model = torch.load('KoBERT_DIS_NER/Human_PET_DIS.pth')
model.eval()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

input_text = input('input> ')
print(input_text)
text_sent = []
text_sent.append(input_text)
sent_test = BERTTESTsent(text_sent, tok, 512, True, False)
sent_dataloader = torch.utils.data.DataLoader(sent_test, batch_size=1, num_workers = 5)

for _, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(sent_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids =  segment_ids.long().to(device)
    valid_length =  valid_length
    sequence_of_tags = model(token_ids, valid_length, segment_ids)
    print(sequence_of_tags)
