<a href="https://colab.research.google.com/github/jiwoniee98/graduation-project/blob/master/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [None]:
 cd Mecab-ko-for-Google-Colab

In [None]:
! bash install_mecab-ko_on_colab190912.sh

In [None]:
import numpy as np
import copy
import os
import argparse
import pickle

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from google.colab import files
uploaded = files.upload()


In [None]:
from data_utils import Vocabulary
from data_utils import load_data_interactive

from data_loader import prepare_sequence, prepare_char_sequence, prepare_lex_sequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
#from Seq2Seq_test import Seq2Seq
from data_loader import get_loader
from sklearn.metrics import f1_score


In [None]:

# --- Artifact file names (relative paths, resolved against the working directory) ---
vocab_path = 'vocab_ko_NER.pkl'
char_vocab_path = 'char_vocab_ko_NER.pkl'
pos_vocab_path = 'pos_vocab_ko_NER.pkl'
lex_dict_path = 'lex_dict.pkl'
model_load_path = 'cnn_bilstm_tagger-179-400_f1_0.8739_maxf1_0.8739_100_200_2.pkl'

# --- Model hyper-parameters ---
# NOTE(review): the checkpoint name ends in "_100_200_2", which presumably
# encodes embed_size / hidden_size / num_layers -- confirm before changing these.
num_layers = 2
embed_size = 100
hidden_size = 200
gpu_index = 0  # CUDA device index used when a GPU is available

# Class index -> tag label, used to decode the tagger's argmax predictions.
predict_NER_dict = dict(enumerate((
    '<PAD>',
    '<START>',
    '<STOP>',
    'B_LC',
    'B_DT',
    'B_OG',
    'B_TI',
    'B_PS',
    'I',
    'O',
)))

# Lexicon tag -> column index of the per-token lexicon feature vector.
NER_idx_dic = {tag: column for column, tag in enumerate(('<unk>', 'LC', 'DT', 'OG', 'TI', 'PS'))}


In [None]:

def to_np(x):
    """Return the values of tensor ``x`` as a NumPy array.

    The tensor's ``.data`` is moved to the CPU first, so the call also works
    for tensors that live on a GPU.
    """
    host_tensor = x.data.cpu()
    return host_tensor.numpy()

def to_var(x, volatile=False):
    """Wrap tensor ``x`` in a ``Variable``, moving it to the GPU when one exists.

    Args:
        x: tensor to wrap.
        volatile: forwarded unchanged to ``Variable``.
            NOTE(review): on PyTorch >= 0.4 ``volatile`` is deprecated and
            reportedly has no effect (use ``torch.no_grad()``) -- confirm.

    Returns:
        ``Variable(x)`` on CUDA device ``gpu_index`` if CUDA is available,
        otherwise on the tensor's current device.
    """
    placed = x.cuda(gpu_index) if torch.cuda.is_available() else x
    return Variable(placed, volatile=volatile)


In [None]:
# Load the pretrained word2vec model and expose its embedding matrix.
# The upload dialog lets the user provide the model file in Colab.
from google.colab import files
uploaded = files.upload()

from gensim.models import word2vec
# The file name is derived from the configured embedding size.
pretrained_word2vec_file = 'ko_word2vec_{}.model'.format(str(embed_size))
wv_model_ko = word2vec.Word2Vec.load(pretrained_word2vec_file)
# Raw embedding matrix, later passed to the Encoder as `word2vec`.
word2vec_matrix = wv_model_ko.wv.vectors


In [None]:
from google.colab import files
uploaded = files.upload()


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# build vocab
vocab = _load_pickle(vocab_path)
print("len(vocab): ", len(vocab))
print("word2vec_matrix: ", np.shape(word2vec_matrix))
char_vocab = _load_pickle(char_vocab_path)
pos_vocab = _load_pickle(pos_vocab_path)
lex_dict = _load_pickle(lex_dict_path)


In [None]:

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
# PackedSequence
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class Encoder(nn.Module):
    """Builds the per-token feature vectors that are fed to the BiLSTM.

    For every token the following pieces are concatenated along the feature
    axis (dim 2), in this order:

    1. char-CNN features: max-pooled outputs of four convolutions
       (kernel heights 2, 3, 4, 5; 128 channels each) over the token's
       character embeddings,
    2. the frozen word embedding (``embed``),
    3. the trainable word embedding (``trainable_embed``) -- only when
       pretrained ``word2vec`` weights were supplied,
    4. the POS embedding,
    5. the caller-supplied lexicon features (appended after dropout).

    Args:
        vocab_size, char_vocab_size, pos_vocab_size: sizes of the three
            embedding tables.
        embed_size: width of every embedding table.
        word2vec: pretrained word-embedding matrix, or ``None`` to use a
            single randomly initialised word embedding.
        lex_ner_size, hidden_size, num_layers, num_classes: accepted for
            interface compatibility; not used by the encoder itself.
    """

    def __init__(self, vocab_size, char_vocab_size, pos_vocab_size, lex_ner_size, hidden_size, num_layers, embed_size,
                 word2vec, num_classes):
        # Always initialise torch.nn.Module first.
        super(Encoder, self).__init__()

        channel_input_word = 1  # the char embedding map is a 1-channel "image"
        kernel_num = 128
        kernel_sizes = [2, 3, 4, 5]
        channel_output = kernel_num

        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        if word2vec is not None:
            # Frozen copy of the pretrained vectors ...
            self.embed.weight = torch.nn.parameter.Parameter(torch.Tensor(word2vec))
            self.embed.weight.requires_grad = False

            # ... plus a second copy that stays trainable.
            self.trainable_embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
            self.trainable_embed.weight = torch.nn.parameter.Parameter(torch.Tensor(word2vec))
        else:
            # Without pretrained vectors there is only one word embedding.
            # The attribute must still exist so forward()/sample() can test it
            # instead of raising AttributeError.
            self.trainable_embed = None

        self.char_embed = nn.Embedding(char_vocab_size, embed_size, padding_idx=0)
        self.pos_embed = nn.Embedding(pos_vocab_size, embed_size, padding_idx=0)
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(channel_input_word, channel_output, (kernel_size, embed_size)) for kernel_size in kernel_sizes])
        self.dropout = nn.Dropout(0.5)

    def _char_cnn_features(self, x_char):
        """Run the char-CNN over every word; returns (batch, words, 128 * 4)."""
        per_word = []
        for word_idx in range(x_char.size(1)):
            # (batch, channel_input, chars, embed_size)
            char_embedding = self.char_embed(x_char[:, word_idx]).unsqueeze(1)

            conv_maps = [F.relu(conv(char_embedding)).squeeze(3) for conv in self.convs1]
            # Max over the character axis: [(batch, channel_out), ...] * len(kernel_sizes)
            pooled = [F.max_pool1d(fmap, fmap.size(2)).squeeze(2) for fmap in conv_maps]
            pooled = self.dropout(torch.cat(pooled, 1))
            per_word.append(pooled.unsqueeze(1))  # keep a word axis for the concat below

        return torch.cat(per_word, 1)  # stack along the word axis

    def _enhanced_embedding(self, x, x_char, x_pos, x_lex_embedding):
        """Concatenate all per-token features; shared by forward() and sample()."""
        parts = [self._char_cnn_features(x_char), self.embed(x)]
        if self.trainable_embed is not None:
            parts.append(self.trainable_embed(x))
        parts.append(self.pos_embed(x_pos))

        enhanced_embedding = torch.cat(parts, 2)  # join along the feature axis
        enhanced_embedding = self.dropout(enhanced_embedding)
        # Lexicon features are appended after dropout, so they are never dropped.
        return torch.cat((enhanced_embedding, x_lex_embedding), 2)

    def forward(self, x, x_char, x_pos, x_lex_embedding, lengths):
        """Return the token features as a ``PackedSequence`` (training path).

        Args:
            x: word ids, indexed as (batch, words).
            x_char: character ids, indexed as (batch, words, chars).
            x_pos: POS ids, indexed as (batch, words).
            x_lex_embedding: float lexicon features, (batch, words, lex_dim).
            lengths: per-sentence token counts handed to
                ``pack_padded_sequence``.
                NOTE(review): that function expects lengths sorted in
                decreasing order unless ``enforce_sorted=False`` -- confirm
                the callers comply.
        """
        enhanced_embedding = self._enhanced_embedding(x, x_char, x_pos, x_lex_embedding)
        return pack_padded_sequence(enhanced_embedding, lengths, batch_first=True)

    def sample(self, x, x_char, x_pos, x_lex_embedding, lengths):
        """Return the padded (un-packed) token features (inference path).

        Takes the same arguments as ``forward``; ``lengths`` is accepted for
        signature parity but not used.
        """
        return self._enhanced_embedding(x, x_char, x_pos, x_lex_embedding)



class Decoder(nn.Module):
    """Bidirectional LSTM over the encoder's token features.

    Also owns ``fc1``, the linear layer that maps the concatenated forward and
    backward hidden states (``2 * hidden_size``) to ``num_classes`` scores; the
    layer is created here but applied elsewhere.

    Args:
        encoder: stored as ``self.encoder``; not called by this class.
        lex_ner_size: width of the lexicon feature vector per token.
        hidden_size, num_layers: LSTM dimensions.
        embed_size: width of each embedding produced by the encoder.
        word2vec: only checked against ``None`` to decide how many word
            embeddings contribute to the LSTM input width.
        num_classes: output width of ``fc1``.
    """

    def __init__(self, encoder, lex_ner_size, hidden_size, num_layers, embed_size, word2vec, num_classes):
        super().__init__()
        self.encoder = encoder

        kernel_sizes = (2, 3, 4, 5)
        char_feature_dim = 128 * len(kernel_sizes)
        # Two word embeddings when word2vec is given, one otherwise.
        word_embedding_count = 1 if word2vec is None else 2
        # char-CNN + word embedding(s) + POS embedding + lexicon features
        lstm_input_dim = char_feature_dim + word_embedding_count * embed_size + embed_size + lex_ner_size

        # BiLSTM
        self.lstm = nn.LSTM(lstm_input_dim, hidden_size, num_layers,
                            dropout=0.6, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, packed):
        """Run the LSTM on ``packed`` and return its output (final state dropped)."""
        lstm_output, _final_state = self.lstm(packed)
        return lstm_output

    def sample(self, enhanced_embedding):
        """Run the LSTM on ``enhanced_embedding`` and return its output."""
        return self.lstm(enhanced_embedding)[0]



class Seq2Seq(nn.Module):
    """Applies the decoder's ``fc1`` layer to LSTM outputs to get tag scores."""

    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder

    def forward(self, output_word):
        """Score element 0 of ``output_word``.

        NOTE(review): the callers presumably pass a ``PackedSequence`` here,
        whose element 0 is its flat data tensor -- confirm.
        """
        classifier = self.decoder.fc1
        lstm_data = output_word[0]
        return classifier(lstm_data)

    def sample(self, output_word):
        """Score ``output_word`` directly (no indexing)."""
        classifier = self.decoder.fc1
        return classifier(output_word)


In [None]:


# build models
# The number of output classes equals the number of entries in the tag table.
seq2seq_encoder = Encoder(vocab_size=len(vocab),
                          char_vocab_size=len(char_vocab),
                          pos_vocab_size=len(pos_vocab),
                          lex_ner_size=len(NER_idx_dic),
                          embed_size=embed_size,
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          word2vec=word2vec_matrix,
                          num_classes=len(predict_NER_dict))
seq2seq_decoder = Decoder(seq2seq_encoder,
                          lex_ner_size=len(NER_idx_dic),
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          embed_size=embed_size,
                          word2vec=word2vec_matrix,
                          num_classes=len(predict_NER_dict))
seq2seq_model = Seq2Seq(decoder=seq2seq_decoder)


# Load the trained weights for inference.
# Do NOT torch.save() the model here: at this point it only holds randomly
# initialised weights, so saving to `model_load_path` would overwrite the
# trained checkpoint and every prediction would come from an untrained network.
# `map_location` keeps the tensors on the CPU so that a checkpoint written on a
# GPU machine also loads on a CPU-only runtime.
# NOTE(review): if this raises a key/shape mismatch, the checkpoint was
# produced by a different model definition -- fix the definition rather than
# re-saving random weights.
seq2seq_model.load_state_dict(torch.load(model_load_path, map_location=lambda storage, loc: storage))

if torch.cuda.is_available():
    seq2seq_model.cuda(gpu_index)


# Switch dropout (and any normalisation layers) to evaluation mode before inference.
seq2seq_model.eval()

In [None]:
def preprocessing(x_text_batch, x_pos_batch, x_split_batch, verbose=False):
    """Turn one tokenized sentence into the padded tensors the tagger expects.

    Only the first element of ``x_text_batch`` / ``x_pos_batch`` is used, i.e.
    the function always produces a batch of size 1.

    Args:
        x_text_batch: batch whose element 0 is the sequence of token strings.
        x_pos_batch: batch whose element 0 is the sequence of POS tags.
        x_split_batch: passed through unchanged (only printed when verbose).
        verbose: when True, print the intermediate index sequences
            (labelled "4" .. "8") for debugging.

    Returns:
        A 7-tuple ``(x_text_batch, x_split_batch, word_ids, char_ids, pos_ids,
        lex_features, lengths)`` where

        * ``word_ids``  -- LongTensor (1, n_words)
        * ``char_ids``  -- LongTensor (1, n_words, max_char_len), with
          ``max_char_len >= 5``
        * ``pos_ids``   -- LongTensor (1, n_words)
        * ``lex_features`` -- FloatTensor (1, n_words, len(NER_idx_dic)),
          multi-hot over the lexicon tags of each token
        * ``lengths``   -- NumPy array holding the token count per sentence

    Raises:
        KeyError: if the lexicon yields a tag that is not in ``NER_idx_dic``.
    """
    min_char_len = 5  # size of the largest char-CNN filter; shorter inputs cannot be convolved

    words = x_text_batch[0]
    x_text_char_item = [[x_char for x_char in x_word] for x_word in words]

    x_idx_item = prepare_sequence(words, vocab.word2idx)
    x_idx_char_item = prepare_char_sequence(x_text_char_item, char_vocab.word2idx)
    x_pos_item = prepare_sequence(x_pos_batch[0], pos_vocab.word2idx)
    x_lex_item = prepare_lex_sequence(words, lex_dict)
    if verbose:
        print("4", x_split_batch)
        print("5",  x_idx_item)
        print("6",  x_idx_char_item)
        print("7",  x_pos_item)
        print("8",  x_lex_item)

    x_idx_batch = [x_idx_item]
    x_idx_char_batch = [x_idx_char_item]
    x_pos_idx_batch = [x_pos_item]
    x_lex_batch = [x_lex_item]

    batch_size = len(x_idx_batch)
    batch_words_len = np.array([len(word_tokens) for word_tokens in x_idx_batch])
    max_word_len = int(batch_words_len.max())
    # The floor of `min_char_len` also covers the case of no tokens at all.
    max_char_len = max(
        [len(char_tokens) for word_tokens in x_idx_char_batch for char_tokens in word_tokens] + [min_char_len])

    # All matrices start as zeros, so positions that are not filled stay padding.
    padded_word_tokens_matrix = np.zeros((batch_size, max_word_len), dtype=np.int64)
    padded_char_tokens_matrix = np.zeros((batch_size, max_word_len, max_char_len), dtype=np.int64)
    padded_pos_tokens_matrix = np.zeros((batch_size, max_word_len), dtype=np.int64)
    padded_lex_tokens_matrix = np.zeros((batch_size, max_word_len, len(NER_idx_dic)))

    for i in range(batch_size):
        # Padding procedure (word)
        for j in range(min(max_word_len, len(x_idx_batch[i]))):
            padded_word_tokens_matrix[i, j] = x_idx_batch[i][j]

        # Padding procedure (char): iterate over the CHARACTER axis of each
        # word, not the word axis, so long words are copied completely.
        for j in range(min(max_word_len, len(x_idx_char_batch[i]))):
            char_tokens = x_idx_char_batch[i][j]
            for k in range(min(max_char_len, len(char_tokens))):
                padded_char_tokens_matrix[i, j, k] = char_tokens[k]

        # Padding procedure (pos)
        for j in range(min(max_word_len, len(x_pos_idx_batch[i]))):
            padded_pos_tokens_matrix[i, j] = x_pos_idx_batch[i][j]

        # Padding procedure (lex): multi-hot encode every lexicon tag of a token.
        for j in range(min(max_word_len, len(x_lex_batch[i]))):
            for x_lex in x_lex_batch[i][j]:
                padded_lex_tokens_matrix[i, j, NER_idx_dic[x_lex]] = 1

    padded_word_tokens_matrix = torch.from_numpy(padded_word_tokens_matrix)
    padded_char_tokens_matrix = torch.from_numpy(padded_char_tokens_matrix)
    padded_pos_tokens_matrix = torch.from_numpy(padded_pos_tokens_matrix)
    padded_lex_tokens_matrix = torch.from_numpy(padded_lex_tokens_matrix).float()
    lengths = batch_words_len

    return x_text_batch, x_split_batch, padded_word_tokens_matrix, padded_char_tokens_matrix, padded_pos_tokens_matrix, padded_lex_tokens_matrix, lengths

def parsing_seq2NER(argmax_predictions, x_text_batch, tag_dict=None):
    """Decode predicted class indices into tags and bracket the entity spans.

    Every entity is rendered as ``<first ... last:TY>`` where ``TY`` is the
    last two characters of the ``B_*`` tag that opened it.

    Span rules:
        * a ``B_*`` tag closes any open entity on the previous token and
          opens a new one on the current token,
        * ``I`` extends the open entity (and is ignored when none is open),
        * any other tag (``O`` and the special ``<PAD>``/``<START>``/``<STOP>``
          labels) closes the open entity on the previous token,
        * an entity still open at the end is closed on the last token.

    Args:
        argmax_predictions: iterable of sequences; each element holds one
            class index (a one-element tensor, a 0-dim tensor or a plain int).
        x_text_batch: batch whose element 0 is the list of token strings.
            It is not modified. Only element 0 is used, so all sequences in
            ``argmax_predictions`` annotate the same token list (batch size 1).
        tag_dict: optional mapping class index -> tag label; defaults to the
            module-level ``predict_NER_dict``.

    Returns:
        ``(predict_NER_list, tokens)``: the decoded tag list per sequence and
        the copy of the token list with entity markers added.
    """
    if tag_dict is None:
        tag_dict = predict_NER_dict

    predict_NER_list = []
    predict_text_NER_result_batch = list(x_text_batch[0])  # work on a copy (tokens are strings)

    for argmax_prediction_seq in argmax_predictions:
        predict_NER = []
        open_type = None  # two-letter type of the entity currently open, or None
        last_index = -1

        for i, argmax_prediction in enumerate(argmax_prediction_seq):
            # Works for shape-(1,) tensors (keepdim=True), 0-dim tensors and ints.
            class_index = int(torch.as_tensor(argmax_prediction).reshape(-1)[0])
            now_NER_token = tag_dict[class_index]
            predict_NER.append(now_NER_token)
            last_index = i

            if now_NER_token.startswith('B_'):
                if open_type is not None:  # e.g. O B_LC B_DT: close the previous entity first
                    predict_text_NER_result_batch[i - 1] += ':' + open_type + '>'
                predict_text_NER_result_batch[i] = '<' + predict_text_NER_result_batch[i]
                # Track the NEW entity's type so a following 'I' closes with it.
                open_type = now_NER_token[-2:]
            elif now_NER_token != 'I' and open_type is not None:  # e.g. O B_LC I O
                predict_text_NER_result_batch[i - 1] += ':' + open_type + '>'
                open_type = None

        # Entity runs up to the end of the sentence: close it on the last token.
        if open_type is not None:
            predict_text_NER_result_batch[last_index] += ':' + open_type + '>'

        predict_NER_list.append(predict_NER)
    return predict_NER_list, predict_text_NER_result_batch

def generate_text_result(text_NER_result_batch, x_split_batch):
    """Join tokens back into one string using their split ids.

    ``x_split_batch[0]`` holds one id per token. A token is preceded by a
    single space whenever its id differs from the previous token's id (the id
    "before" the first token counts as 0); otherwise it is appended directly.

    Returns:
        The reassembled string.
    """
    pieces = []
    last_split = 0
    for position, split_id in enumerate(x_split_batch[0]):
        separator = '' if split_id == last_split else ' '
        pieces.append(separator + text_NER_result_batch[position])
        last_split = split_id
    return ''.join(pieces)


def NER_print(input_str):
    """Tag ``input_str`` with named entities and print the annotated sentence.

    The sentence is tokenized with ``load_data_interactive``, converted to
    tensors by ``preprocessing``, run through the encoder, decoder and output
    layer, and the result is printed as ``output>  <annotated text>`` followed
    by an empty line. Nothing is returned.

    Args:
        input_str: raw sentence typed by the user.
    """
    # str is immutable, so the cleaned-up value has to be re-assigned.
    # Runs of whitespace are collapsed to single spaces and the ends trimmed.
    input_str = " ".join(input_str.split())
    if not input_str:
        # Nothing to tag; avoid feeding an empty sequence to the model.
        print("output> ", "")
        print("")
        return

    x_text_batch, x_pos_batch, x_split_batch = load_data_interactive(input_str)
    (x_text_batch, x_split_batch, padded_word_tokens_matrix, padded_char_tokens_matrix,
     padded_pos_tokens_matrix, padded_lex_tokens_matrix, lengths) = preprocessing(
        x_text_batch, x_pos_batch, x_split_batch)

    # The whole forward pass runs without autograd bookkeeping: this is pure
    # inference, so no graph needs to be recorded.
    with torch.no_grad():
        padded_word_tokens_matrix = to_var(padded_word_tokens_matrix)
        padded_char_tokens_matrix = to_var(padded_char_tokens_matrix)
        padded_pos_tokens_matrix = to_var(padded_pos_tokens_matrix)
        padded_lex_tokens_matrix = to_var(padded_lex_tokens_matrix)

        predictions_en = seq2seq_encoder.sample(padded_word_tokens_matrix, padded_char_tokens_matrix,
                                                padded_pos_tokens_matrix, padded_lex_tokens_matrix, lengths)
        predictions_de = seq2seq_decoder.sample(predictions_en)
        predictions = seq2seq_model.sample(predictions_de)

    # keepdim=True keeps a trailing axis of size 1, so every per-token
    # prediction is a one-element tensor, as parsing_seq2NER expects.
    _max_predictions, argmax_predictions = predictions.max(2, keepdim=True)

    _predict_NER_list, predict_text_NER_result_batch = parsing_seq2NER(argmax_predictions, x_text_batch)

    predict_NER_text_string = generate_text_result(predict_text_NER_result_batch, x_split_batch)

    print("output> ",predict_NER_text_string)
    print("")

In [None]:
# Model performance: F1 87.39
# Interactive loop: tag every sentence the user enters until they type 'exit'.
while True:
    input_str = input('input> ')
    if input_str == 'exit':
        break
    NER_print(input_str)