In [1]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 24.4MB/s eta 0:00:01[K     |▉                               | 20kB 1.6MB/s eta 0:00:01[K     |█▎                              | 30kB 2.2MB/s eta 0:00:01[K     |█▊                              | 40kB 2.5MB/s eta 0:00:01[K     |██▏                             | 51kB 1.9MB/s eta 0:00:01[K     |██▋                             | 61kB 2.2MB/s eta 0:00:01[K     |███                             | 71kB 2.4MB/s eta 0:00:01[K     |███▍                            | 81kB 2.7MB/s eta 0:00:01[K     |███▉                            | 92kB 2.8MB/s eta 0:00:01[K     |████▎                           | 102kB 2.7MB/s eta 0:00:01[K     |████▊                           | 112kB 2.7MB/s eta 0:00:01[K     |█████▏                          | 122kB 2.7M

In [6]:
from transformers import *
import numpy as np

import torch
from torch import nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader
from urllib.request import urlretrieve

import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from torchcrf import CRF
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [14]:
import logging
import os
import unicodedata
from shutil import copyfile

from transformers import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer_78b3253a26.model",
                     "vocab_txt": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model"
    },
    "vocab_txt": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt"
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "monologg/kobert": 512,
    "monologg/kobert-lm": 512,
    "monologg/distilkobert": 512
}

PRETRAINED_INIT_CONFIGURATION = {
    "monologg/kobert": {"do_lower_case": False},
    "monologg/kobert-lm": {"do_lower_case": False},
    "monologg/distilkobert": {"do_lower_case": False}
}

SPIECE_UNDERLINE = u'▁'


class KoBertTokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:
            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
            self,
            vocab_file,
            vocab_txt,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs):
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

        # Build vocab
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, 'r', encoding='utf-8') as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.idx2token)

    def get_vocab(self):
        return dict(self.token2idx, **self.added_tokens_encoder)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string. """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A KoBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A KoBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return out_vocab_model, out_vocab_txt

In [59]:
def set_seed(seed = 2020):
    torch.manual_seed(seed)
    if torch.cuda.is_available:
        torch.cuda.manual_seed_all(seed)

set_seed(2020)

# ● train/test data set 소개
* **NER 대회용 data set**과 **직접 스크래핑한 데이터**를 섞어 train/test

## 1. NLP Challenge

* 네이버와 창원대에서 개최한 NLP challenge의 train set
* parsing이 잘못되는 부분이 있어 load후 약간의 전처리를 거친다.

**아래와 같은 총 14개의 개체명 태그가 존재한다.**

1. PERSON	PER	실존, 가상 등 인물명에 해당 하는 것
2. FIELD	FLD	학문 분야 및 이론, 법칙, 기술 등
3. ARTIFACTS_WORKS	AFW	인공물로 사람에 의해 창조된 대상물
4. ORGANIZATION	ORG	기관 및 단체와 회의/회담을 모두 포함
5. LOCATION	LOC	지역명칭과 행정구역 명칭 등
6. CIVILIZATION	CVL	문명 및 문화에 관련된 용어
7. DATE	DAT	날짜
8. TIME	TIM	시간
9. NUMBER	NUM	숫자
10. EVENT	EVT	특정 사건 및 사고 명칭과 행사 등
11. ANIMAL	ANM	동물
12. PLANT	PLT	식물
13. MATERIAL	MAT	금속, 암석, 화학물질 등
14. TERM	TRM	의학 용어, IT곤련 용어 등 일반 용어를 총칭



NOTE

- index는 새로운 문장이 시작될 때마다 1로 초기화된다.
- tag의 앞부분은 개체명의 의미를, 뒷부분은 BIO tagging을 뜻한다.
- B는 개체명의 시작 어절, I는 끝 어절, -는 개체명이 아닌 어절을 뜻한다.
- 두 개체명이 조합된 경우, 앞에 등장하는 개체명을 따라 태그를 부여한다. 
ex ) 포항공과대학교(LOC_B) 컴퓨터공학과(ORG_B) => LOC로 부여


In [8]:
# load #
url = "https://github.com/naver/nlp-challenge/raw/master/missions/ner/data/train/train_data"
urlretrieve(url, "./train_data")

# preprocessing #
train_raw = pd.read_csv("train_data",sep="\n", header=None)
train_raw = train_raw.applymap(lambda x:x.split("\t"))
train = pd.DataFrame(train_raw[0].tolist(), columns = ["index","word","tag"])
train['index'] = train['index'].map(int)
train['word'] = train['word'].str.replace("．", ".", regex=False)
train['word'] = train['word'].str.replace(r'[^ㄱ-ㅣ가-힣0-9a-zA-Z.]+', "", regex=True)

In [9]:
train.head(20)

Unnamed: 0,index,word,tag
0,1,비토리오,PER_B
1,2,양일,DAT_B
2,3,만에,-
3,4,영사관,ORG_B
4,5,감호,CVL_B
5,6,용퇴,-
6,7,항룡,-
7,8,압력설,-
8,9,의심만,-
9,10,가율,-


In [10]:
labels = train.tag.unique()
tag_to_label = {tag:i for i,tag in enumerate(labels)}
tag_to_label["[PAD]"] = len(labels)
label_to_tag =  {label:tag for tag,label in tag_to_label.items()}

data = train_raw[0].tolist()
sentences, targets = [], []
temp_sts, temp_targets = ['[CLS]'], [tag_to_label["-"]]

for sentence_index, word, tag in data:
    # 새로운 문장이 등장했을 경우 SEP token을 추가하고 초기화시킨다.
    if sentence_index == "1":
        temp_sts.append("[SEP]")
        temp_targets.append(tag_to_label["-"])

        sentences.append(temp_sts)
        targets.append(temp_targets)

        temp_sts, temp_targets = ['[CLS]'], [tag_to_label["-"]] # 초기화

    temp_sts.append(word)
    temp_targets.append(tag_to_label[tag])

del [[sentences[0], targets[0]]] # 초깃값  ['[CLS]'], [tag_to_label["-"]] 제거

In [11]:
sentences[0] # model input

['[CLS]',
 '비토리오',
 '양일',
 '만에',
 '영사관',
 '감호',
 '용퇴,',
 '항룡',
 '압력설',
 '의심만',
 '가율',
 '[SEP]']

In [12]:
targets[0] # model target

[2, 0, 1, 2, 3, 4, 2, 2, 2, 2, 2, 2]

In [15]:
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=371391.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=77779.0, style=ProgressStyle(descriptio…




In [16]:
def tokenize_and_preserve_labels(sentence, text_labels): # 
  tokenized_sentence = []
  labels = []

  for word, label in zip(sentence, text_labels):

    tokenized_word = tokenizer.tokenize(word)
    n_subwords = len(tokenized_word)

    tokenized_sentence.extend(tokenized_word)
    labels.extend([label] * n_subwords)

  return tokenized_sentence, labels

In [17]:
#####################
# token화 된 data로 #
#####################

tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs)
                              for sent, labs in zip(sentences, targets)]

tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

# params #
max_len = int(np.quantile(np.array([len(x) for x in tokenized_texts]), 0.975))
batch_size = 32

# input data #
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype = "int",
                          value=tokenizer.convert_tokens_to_ids("[PAD]"),
                          truncating="post", padding="post")
tags = pad_sequences([lab for lab in labels], maxlen=max_len,
                     value=tag_to_label["[PAD]"], padding='post',
                     dtype='int', truncating='post')

## 2. 한국민족문화대백과사전

* 인물, 지명, 문화재, 유물, 단체 등의 카테고리를 이용하여 true tag를 생성

  1. 각 카테고리에 접근한다.

  2. 단체 카테고리에 속하는 단어들은 모두 `ORG `를 true tag로 지정한다.

  3. NER 학습을 위해서는 문장이 필요하다. 해당 단어가 포함된 설명을 스크래핑한다.

     > `교민` : `-` , `중국` : `-` , `관헌도` : `-` ,  `간민회` : `ORG_B`

  4. true tag가 달리지 않은 `교민`, `중국`, `관헌도 `등은 **기존의 model(acc 97%)를 이용하여 약한 정답**을 생성한다.

     > **기존 모델에 의한 정답**  `교민` : `PER` , `중국` : `LOC` , `관헌도` : `LOC`  ,`간민회` : `-`
     >
     > **스크래핑으로 생성한 정답**  `교민` : `-` , `중국` : `-` , `관헌도` : `-` ,  `간민회` : `ORG_B`
     >
     > **=> 최종 모델에 대한 정답 ** `교민` : `PER` , `중국` : `LOC` , `관헌도` : `LOC`  ,`간민회` : `ORG_B`



#### NOTE

* 전체 카테고리 중 "유물","유적","작품","제도","지명","문헌","단체","문화재" 를 이용

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [22]:
targets_new = eval([line.rstrip('\n') for line in open("/content/drive/My Drive/projects/targets_new.txt")][0]) # targets_scraping.txt
sentences_new = eval([line.rstrip('\n') for line in open("/content/drive/My Drive/projects/sentences_new.txt", encoding = 'utf-8')][0]) # sentences_scraping.tt

In [23]:
#####################
# token화 된 data로 #
#####################

tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs)
                              for sent, labs in zip(sentences_new, targets_new)]

tokenized_texts = [['[CLS]'] + token_label_pair[0]  + ['[SEP]'] for token_label_pair in tokenized_texts_and_labels]
labels = [[2] + token_label_pair[1] + [2] for token_label_pair in tokenized_texts_and_labels]

input_ids_new = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype = "int",
                          value=tokenizer.convert_tokens_to_ids("[PAD]"),
                          truncating="post", padding="post")
tags_new = pad_sequences([lab for lab in labels], maxlen=max_len,
                     value=tag_to_label["[PAD]"], padding='post',
                     dtype='int', truncating='post')

In [24]:
len(input_ids_new) + len(input_ids)

95194

In [25]:
 len(input_ids)

89999

In [26]:
len(input_ids_new) + 1063571

1068766

# ● class DistilKoBertCRF

In [27]:
class DistilKobertCRF(nn.Module):
    def __init__(self, num_classes):
        super(DistilKobertCRF, self).__init__()

        self.hidden_size = 768
        self.num_classes = num_classes
        self.pad_id = 1

        self.bert = DistilBertModel.from_pretrained("monologg/distilkobert")
        self.FC = torch.nn.Linear(self.hidden_size,self.num_classes)
        self.crf = CRF(num_tags = num_classes, batch_first = True)

    def forward(self, input_ids, real_tags = None):
        attention_mask = input_ids.ne(self.pad_id).float()
        last_hidden_state = self.bert.forward(input_ids, attention_mask)
        dense = self.FC(last_hidden_state[0])
        
        if real_tags is not None:
            log_likelihood = self.crf(dense,real_tags)
            pred_tags = self.crf.decode(dense)
            return log_likelihood, pred_tags
        
        else:
            pred_tags =  self.crf.decode(dense)
            return pred_tags

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


# ● Train
* NLP challenge data와 한국민족문화대백과사전 데이터를 각각 80%씩 추출하여 합친 후 train data로 사용한다.

In [None]:
# torch.save(model.state_dict(),"/content/drive/My Drive/projects/ner_model_weight")
# torch.save(model, "/content/drive/My Drive/projects/ner_model_structure")

# model = DistilKobertCRF(num_classes = 30)
# model.load_state_dict(torch.load("/content/drive/My Drive/projects/ner_model_weight_concat_epoch_18"))
# model.to(device)

In [30]:
inputs_concat_train = np.concatenate((input_ids[:train_size],input_ids_new[:train_size_2]),axis=0)
tags_concat_train = np.concatenate((tags[:train_size],tags_new[:train_size_2]),axis=0)

In [31]:
inputs_concat_test = np.concatenate((input_ids[train_size:],input_ids_new[train_size_2:]),axis=0)
tags_concat_test = np.concatenate((tags[train_size:],tags_new[train_size_2:]),axis=0)

In [33]:
train_ids_loader = torch.utils.data.DataLoader(inputs_concat_train, batch_size=batch_size,drop_last=True,shuffle=False)
train_tags_loader = torch.utils.data.DataLoader(tags_concat_train, batch_size=batch_size,drop_last=True, shuffle=False)

In [34]:
test_ids_loader = torch.utils.data.DataLoader(inputs_concat_test, batch_size=batch_size,drop_last=True,shuffle=False)
test_tags_loader = torch.utils.data.DataLoader(tags_concat_test, batch_size=batch_size,drop_last=True, shuffle=False)

In [None]:
batch_size = 32
train_size = int(len(input_ids)*0.8)
train_size_2 = int(len(targets_new)*0.8)

In [47]:
model = DistilKobertCRF(num_classes = 30)
model.to(device)
model.train()
optimizer = optim.Adam(model.parameters(), lr=0.00002)

In [48]:
model

DistilKobertCRF(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [49]:
epochs = 18

In [50]:
train_accuracy = []
loss_ = []

for epoch in range(epochs):
    print("Epoch:",epoch ,"================================================================")
    for iters, [input_id, true_tag] in enumerate(zip(train_ids_loader, train_tags_loader)):
        optimizer.zero_grad()

        input_id = input_id.long().to(device)
        true_tag = true_tag.long().to(device)
        
        log_likelihood , pred_tag = model.forward(input_id, true_tag)
        loss = -1*log_likelihood
        loss = loss.mean()
        loss.backward()
        optimizer.step()

        if iters % 100 == 0:
            true_tag,pred_tag = true_tag.cpu(),torch.tensor(pred_tag).cpu()

            correct_iters = (pred_tag == true_tag).float()[true_tag != tag_to_label['[PAD]']].sum()
            total_iters = len(true_tag[true_tag != tag_to_label['[PAD]'] ])

            temp_acc = correct_iters/total_iters
            train_accuracy.append(temp_acc.cpu().detach().numpy())

            print(f"accuracy({iters}) :",temp_acc)

    print("Loss :", loss)
    loss_.append(loss.cpu().detach().numpy())  

accuracy(0) : tensor(0.0366)
accuracy(100) : tensor(0.6354)
accuracy(200) : tensor(0.5923)
accuracy(300) : tensor(0.6087)
accuracy(400) : tensor(0.7132)
accuracy(500) : tensor(0.6984)
accuracy(600) : tensor(0.6844)
accuracy(700) : tensor(0.8034)
accuracy(800) : tensor(0.7384)
accuracy(900) : tensor(0.7978)
accuracy(1000) : tensor(0.7968)
accuracy(1100) : tensor(0.7571)
accuracy(1200) : tensor(0.8026)
accuracy(1300) : tensor(0.8030)
accuracy(1400) : tensor(0.8018)
accuracy(1500) : tensor(0.8424)
accuracy(1600) : tensor(0.7835)
accuracy(1700) : tensor(0.8377)
accuracy(1800) : tensor(0.8170)
accuracy(1900) : tensor(0.8616)
accuracy(2000) : tensor(0.8195)
accuracy(2100) : tensor(0.8553)
accuracy(2200) : tensor(0.8067)
accuracy(2300) : tensor(0.8450)
Loss : tensor(909.0316, device='cuda:0', grad_fn=<MeanBackward0>)
accuracy(0) : tensor(0.7925)
accuracy(100) : tensor(0.8579)
accuracy(200) : tensor(0.8373)
accuracy(300) : tensor(0.8005)
accuracy(400) : tensor(0.8626)
accuracy(500) : tensor(0.

In [None]:
# torch.save(model.state_dict(),"/content/drive/My Drive/projects/ner_model_weight_concat_epoch_28")

## evaluate

In [None]:
# model = DistilKobertCRF(num_classes = 30)
# model.load_state_dict(torch.load("/content/drive/My Drive/projects/ner_model_weight_concat_epoch_18"))
# model.to(device)

In [51]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for iters, [input_id, true_tag] in enumerate(zip(test_ids_loader, test_tags_loader)):
        
        input_id = input_id.long().to(device)
        true_tag = true_tag.long()
        
        pred_tag = model.forward(input_id)
        pred_tag = torch.tensor(pred_tag)
        
        correct_iters = (pred_tag == true_tag).float()[true_tag != tag_to_label['[PAD]']].sum()
        total_iters = len(true_tag[true_tag != tag_to_label['[PAD]'] ])
        
        correct += correct_iters
        total += total_iters
        
        if not iters % 50:
            print(f"{iters} - accuracy : {correct_iters/total_iters}")

    print("Accuracy of the model: {}".format(correct/total))

0 - accuracy : 0.9451277256011963
50 - accuracy : 0.9257143139839172
100 - accuracy : 0.917475700378418
150 - accuracy : 0.9093511700630188
200 - accuracy : 0.9065108299255371
250 - accuracy : 0.8427543640136719
300 - accuracy : 0.9092559218406677
350 - accuracy : 0.9175340533256531
400 - accuracy : 0.9255319237709045
450 - accuracy : 0.8990195989608765
500 - accuracy : 0.90444016456604
550 - accuracy : 0.885660707950592
Accuracy of the model: 0.9060052633285522


## f1, confusion

In [52]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
# print(classification_report(val_tags_l, y_predicted_l, labels=f_label))

In [None]:
f1_score(val_tags_l, y_predicted_l,average=None,labels=f_label)

  average, "true nor predicted", 'F-score is', len(true_sum)


array([0.88120297, 0.92668941, 0.95711281, 0.84819191, 0.82424621,
       0.94676573, 0.77282801, 0.8027807 , 0.81881559, 0.57605496,
       0.66418492, 0.81123181, 0.4847865 , 0.77518523, 0.87061184,
       0.89298044, 0.7002728 , 0.90532349, 0.47955903, 0.73890533,
       0.48627451, 0.        , 0.56848433, 0.17040359, 0.1957905 ,
       0.15189873, 0.        , 0.        , 0.        ])

In [None]:
f1_score(val_tags_l, y_predicted_l,average=None,labels=f_label)

  average, "true nor predicted", 'F-score is', len(true_sum)


array([0.88120297, 0.92668941, 0.95711281, 0.84819191, 0.82424621,
       0.94676573, 0.77282801, 0.8027807 , 0.81881559, 0.57605496,
       0.66418492, 0.81123181, 0.4847865 , 0.77518523, 0.87061184,
       0.89298044, 0.7002728 , 0.90532349, 0.47955903, 0.73890533,
       0.48627451, 0.        , 0.56848433, 0.17040359, 0.1957905 ,
       0.15189873, 0.        , 0.        , 0.        ])

In [None]:
# F1 score
df = pd.DataFrame(f1_score(val_tags_l, y_predicted_l,average=None,labels=f_label), index = f_label).round(2)
df.sort_values(by=0,ascending=False).transpose()

  average, "true nor predicted", 'F-score is', len(true_sum)


Unnamed: 0,-,NUM_B,DAT_B,DAT_I,TIM_I,PER_B,TIM_B,ORG_B,CVL_B,TRM_B,PER_I,EVT_B,NUM_I,LOC_B,ORG_I,ANM_B,EVT_I,TRM_I,AFW_B,MAT_B,FLD_B,CVL_I,AFW_I,LOC_I,PLT_B,MAT_I,FLD_I,ANM_I,PLT_I
0,0.96,0.95,0.93,0.91,0.89,0.88,0.87,0.85,0.82,0.82,0.81,0.8,0.78,0.77,0.74,0.7,0.66,0.58,0.57,0.49,0.48,0.48,0.2,0.17,0.15,0.0,0.0,0.0,0.0


In [None]:
# Recall
df = pd.DataFrame(recall_score(val_tags_l, y_predicted_l,average=None,labels=f_label), index = f_label).round(2)
df.sort_values(by=0,ascending=False).transpose()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,-,NUM_B,DAT_I,DAT_B,LOC_B,PER_B,TIM_I,ORG_B,TIM_B,NUM_I,CVL_B,PER_I,EVT_B,TRM_B,ORG_I,ANM_B,EVT_I,TRM_I,AFW_B,FLD_B,CVL_I,MAT_B,AFW_I,PLT_B,MAT_I,LOC_I,FLD_I,ANM_I,PLT_I
0,0.96,0.96,0.95,0.93,0.91,0.9,0.9,0.84,0.81,0.8,0.79,0.79,0.79,0.74,0.69,0.67,0.58,0.44,0.4,0.3,0.3,0.27,0.1,0.03,0.0,0.0,0.0,0.0,0.0


In [None]:
# Precision
df = pd.DataFrame(precision_score(val_tags_l, y_predicted_l,average=None,labels=f_label), index = f_label).round(2)
df.sort_values(by=0,ascending=False).transpose()

# ● inference

In [54]:
context = pd.read_table("/content/drive/My Drive/projects/04강 고려의 발전과 변화.txt", header=None)
context

Unnamed: 0,0
0,"안녕하세요, 여러분. 역사는 최태성! 빵! 지금 수능특강을 열심히 열심히 달리고 있..."
1,우리가 드디어 중세 와우! 70만 년 역사 끝냈고 천 년의 고대 끝냈고 이제 중세 ...
2,고려 오백 년이고요. 그다음이 조선 오백 년. 이렇게 합쳐서 천 년이 있어요.
3,"그중에서 중세 고려 오백여 년의, 그 오백 년 조금 안 되겠구나. 역사를 한번 살펴..."
4,앞에서 한번 설명드렸지만 고려는 호족의 시대입니다. 호족이 나라를 세웠죠. 대표적인...
...,...
249,"호족의 시대, 문벌귀족의 시대, 무신의 시대, 권문세족의 시대, 신진사대부의 시대...."
250,"외침에 맞서 싸웠던 거란, 여진, 몽골, 홍건적과 왜구에 맞서 싸웠던 그런 역사. ..."
251,"비록 충 자가 들어가고 부마국이 되었지만, 그래도 우리의 자주성을 잃지는 않았다는 사실."
252,이걸 여러분들이 고려인들을 통해서 배우면 어떨까 하는 생각이 듭니다. 그런 DNA를...


In [None]:
model.to('cpu')

In [63]:
def ner_inference(test_sentence):
    tokenized_sentence = torch.tensor([tokenizer.encode(test_sentence,truncation=True, max_length=max_len, pad_to_max_length=True)])
    ans = model.forward(tokenized_sentence,real_tags=None)
    tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence[0])

    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, ans[0]):

        if token.startswith("▁"):
            new_labels.append(label_to_tag[label_idx])
            new_tokens.append(token[1:])
        elif token not in ['[CLS]', '[SEP]', '[PAD]']:
            new_tokens[-1] = new_tokens[-1] + token

    for token, label in zip(new_tokens, new_labels):
        print("{}\t{}".format(label, token))

In [64]:
ner_inference(context.iloc[13,0])

-	그
-	새로운
-	세력들이
-	누구냐면,
LOC_B	향리
-	출신의
-	성리학을
-	수용한
ORG_B	신진사대부들.
-	그
-	신진사대부들이
CVL_B	권문세족을
-	공격하면서


In [65]:
ner_inference(context.iloc[48,0])

-	어떤
-	걸
-	했냐면
-	먼저
TRM_B	노비안검법이라는
-	걸
-	시행합니다.
-	왜냐면
CVL_B	호족들이
-	노비를
-	많이
-	갖고
-	있었거든요.
-	노비는
-	뭐예요?


In [66]:
ner_inference(context.iloc[163,0])

PER_B	충선왕,
PER_B	충렬왕,
PER_B	충목왕,
PER_B	충숙왕.
-	이런
-	식으로
-	충
-	자가
-	들어가요.
-	특히
LOC_B	몽골의
CVL_B	공주와
-	결혼을
-	해야
-	돼요.
-	그래서
LOC_B	부마국.


In [67]:
ner_inference(context.iloc[170,0])

ORG_B	쌍성총관부를
-	가져갑니다.
LOC_B	철령
LOC_B	이북
-	지역이거든요.
-	이
-	지역을
-	가져가는
-	모습들
-	보이고
-	있고요.


In [68]:
ner_inference(context.iloc[179,0])

PER_B	공민왕의
-	어떤
-	반원
-	자주
-	정책.
-	이걸
-	다
-	부정합니다.
ORG_B	쌍성총관부
-	없애고요.
LOC_B	정동행성
LOC_B	이문소
-	없애고요.
-	정방
-	없애고요.


In [69]:
ner_inference(context.iloc[245,0])

FLD_B	성리학을
-	장착했던
PER_B	신진사대부가
-	딱
ANM_B	손을
-	잡고
PER_B	이성계는
-	당시
LOC_B	왜구,
PER_B	홍건적을
-	물리치면서
CVL_B	슈퍼스타가
-	되는
-	거야.
