## 개체명 인식 (Named Entity Recognition)

#### 예: 철수 - 이름 / 밥 - 사물

### 1. NLTK

In [1]:
import nltk

# Fetch the NLTK resources that the tokenizing, POS-tagging and NE-chunking
# cells below rely on. Each call is a no-op when the package is already present.
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package words to /Users/jody/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jody/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jody/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/jody/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

### 토큰화 및 품사 태깅

In [2]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [3]:
# Split the example sentence into tokens, then attach a part-of-speech tag to each one.
sentence = 'Jame is working at Diseny in London'
tokens = word_tokenize(sentence)
sentence = pos_tag(tokens)
print(sentence)

[('Jame', 'NNP'), ('is', 'VBZ'), ('working', 'VBG'), ('at', 'IN'), ('Diseny', 'NNP'), ('in', 'IN'), ('London', 'NNP')]


### 개체명 인식

In [4]:
# Run NLTK's named-entity chunker on the POS-tagged tokens from the previous cell.
# NOTE(review): `sentence` is rebound to the chunker's result, so re-running this cell
# on its own passes that result back into ne_chunk; re-run the tagging cell first.
sentence = ne_chunk(sentence)

print(sentence)

(S
  (GPE Jame/NNP)
  is/VBZ
  working/VBG
  at/IN
  (ORGANIZATION Diseny/NNP)
  in/IN
  (GPE London/NNP))


### 2. 개체명 인식 (LSTM)

In [6]:
import numpy as np
import urllib.request

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [9]:
# Download the CoNLL-2003 English training file and collect it as a list of
# sentences, where each sentence is a list of [lower-cased word, NER tag] pairs.
tagged_sentences = []
sentence = []

with urllib.request.urlopen('https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt') as f:
    for line in f:
        # Strip once up front so blank-line detection also works for '\r\n'
        # endings and whitespace-only lines (which would otherwise produce an
        # empty word/tag pair).
        line = line.decode('utf-8').strip()
        if not line or line.startswith('-DOCSTART'):
            # A blank line or a document marker closes the sentence being built.
            if sentence:
                tagged_sentences.append(sentence)
                sentence = []
            continue
        splits = line.split(' ')
        # First column is the word, last column is the NER tag.
        sentence.append([splits[0].lower(), splits[-1]])

# Keep the final sentence when the file does not end with a blank line.
if sentence:
    tagged_sentences.append(sentence)
    sentence = []

print(len(tagged_sentences))
print(tagged_sentences[0])

14041
[['eu', 'B-ORG'], ['rejects', 'O'], ['german', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['british', 'B-MISC'], ['lamb', 'O'], ['.', 'O']]


#### Data preprocessing 
##### - 단어와 개체명 태그를 분리해서 데이터를 구성

In [11]:
# Build two parallel lists from the tagged corpus: the words of every sentence
# and the NER tags of every sentence (same order, same lengths).
sentences = [[pair[0] for pair in tagged] for tagged in tagged_sentences]
ner_tags = [[pair[1] for pair in tagged] for tagged in tagged_sentences]

In [12]:
# Word tokenizer: vocabulary is capped via num_words and an explicit 'OOV' token is
# registered for words outside that cap.
max_words = 4000
src_tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')
src_tokenizer.fit_on_texts(sentences)

# Tag tokenizer: no cap, so every NER tag seen in the data gets its own index.
tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(ner_tags)

In [13]:
# The embedding's input size follows the tokenizer cap; the tag count gets one
# extra slot because index 0 is not assigned to any tag (it is used for padding).
vocab_size = max_words
tag_size = 1 + len(tar_tokenizer.word_index)

print(vocab_size, tag_size, sep='\n')

4000
10


In [14]:
# Convert words and tags into integer index sequences using the fitted tokenizers.
X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(ner_tags)

In [15]:
# Bring every word/tag sequence to the same fixed length, padding at the end.
max_len = 70
pad_options = {'padding': 'post', 'maxlen': max_len}
X_train = pad_sequences(X_train, **pad_options)
y_train = pad_sequences(y_train, **pad_options)

#### one-hot encoding

In [16]:
# Hold out 20% of the padded sentences, with a fixed seed so the split is repeatable.
# Note: X_train / y_train are rebound here, so this cell is meant to run once per
# fresh run of the padding cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=111
)

# One-hot encode the tag indices of both splits over the tag_size classes.
y_train, y_test = (
    to_categorical(labels, num_classes=tag_size) for labels in (y_train, y_test)
)

In [18]:
# Sanity-check the array shapes after splitting and one-hot encoding.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(11232, 70)
(11232, 70, 10)
(2809, 70)
(2809, 70, 10)


In [20]:
# Import every Keras symbol from `tensorflow.keras`, matching the preprocessing and
# utils imports used earlier in this notebook. Mixing the standalone `keras` package
# with `tensorflow.keras` can combine layers and optimizers from different installs.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.optimizers import Adam

In [21]:
# Sequence tagger: embedding -> bidirectional LSTM -> softmax over the tags at
# every time step. mask_zero=True tells downstream layers to skip index 0.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(tag_size, activation='softmax')),
])

2022-01-26 13:05:18.614519: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 128)           512000    
                                                                 
 bidirectional (Bidirectiona  (None, 70, 512)          788480    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 70, 10)           5130      
 ibuted)                                                         
                                                                 
Total params: 1,305,610
Trainable params: 1,305,610
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Train against the one-hot tag targets with categorical cross-entropy and Adam.
# NOTE(review): the held-out split is passed as validation data here and is also the
# split used by model.evaluate, so it is not an independent test set.
model.compile(loss='categorical_crossentropy',
             optimizer = Adam(0.001),
             metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=128, epochs=3, validation_data=(X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe03b1c5760>

In [25]:
# Evaluate on the held-out split; the result is [loss, accuracy] as configured in compile().
model.evaluate(X_test,y_test)



[0.05507287010550499, 0.9198400974273682]

### 학습한 모델을 통한 예측
#### - 예측을 확인하기 위해서 인덱스를 단어로 변환해줄 사전이 필요
#### - 사전은 토큰화 툴의 사전을 이용

In [26]:
# Lookup tables for turning indices back into words and NER tags.
idx2word = src_tokenizer.index_word
# Copy the tag mapping before adding the padding entry, so the tokenizer's own
# index_word dict is not modified as a side effect.
idx2ner = dict(tar_tokenizer.index_word)
idx2ner[0] = 'PAD'

### 예측 시각화

In [34]:
# Pick one held-out sentence and print its gold tags next to the predicted tags.
i = 50
probs = model.predict(np.array([X_test[i]]))
y_predicted = np.argmax(probs, axis=-1)
true = np.argmax(y_test[i], -1)

print("{:15}|{:5}|{}".format('단어', '실제값', '예측값'))
print('-' * 30)

row_fmt = "{:17}: {:7} {}"
for word_id, gold_id, pred_id in zip(X_test[i], true, y_predicted[0]):
    if word_id == 0:
        continue  # index 0 is padding, nothing to show
    print(row_fmt.format(idx2word[word_id], idx2ner[gold_id].upper(), idx2ner[pred_id].upper()))

단어             |실제값  |예측값
------------------------------
dhaka            : B-LOC   B-LOC
1996-08-22       : O       O
