In [1]:
import tensorflow as tf
from tensorflow.keras import preprocessing
from seqeval.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
sys.path.append('../../utils')
from preprocess import Preprocess

In [2]:
def read_file(file_name):
    sents = []
    # 학습 데이터 형태 참고
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx+1][0] == '$':
                this_sent = []
            elif l[0] == '$' and lines[idx-1][0] == ';':
                continue
            elif l[0] == '\n':
                sents.append(this_sent)
            else :
                this_sent.append(tuple(l.split()))
    return sents

In [3]:
# 전처리 객체
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

# 학습용 말뭉치 데이터
corpus = read_file('ner_train.txt')

# 말뭉치 데이터에서 단어와 BIO 태그만 불러와 학습용 데이터셋 생성
sentences, tags = [], []
for t in corpus :
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1],w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])
    sentences.append(sentence)
    tags.append(bio_tag)

print("sample size : ", len(sentences))
print("0번째 샘플 단어 시퀀스 : ", sentences[0])
print("0번째 샘플 bio tag : ", tags[0])
print("sample sequence maxlen : ", max(len(l) for l in sentences))
print("sample sequence average len : ", (sum(map(len, sentences))/len(sentences)))

sample size :  61999
0번째 샘플 단어 시퀀스 :  ['가락지빵', '주문', '하', '고', '싶', '어요']
0번째 샘플 bio tag :  ['B_FOOD', 'O', 'O', 'O', 'O', 'O']
sample sequence maxlen :  168
sample sequence average len :  8.796238649010467


In [4]:
# 토크나이저 정의
tag_tokenizer = preprocessing.text.Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)

# 단어 사전 및 태그 사전 크기
vocab_size = len(p.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1
print("BIO 태그 사전 크기 : ", tag_size) # BIO(Beginning, Inside, Outside Tag), KoreanNERCorpus 참고
print("단어 사전 크기 : ", vocab_size)

'''
B_FOOD : 음식
B_DT, B_TI : 날짜, 시간 ~ 학습 데이터에서 날짜와 시간 혼용 사용
B_PS : 사람
B_OG : 조직, 회사
B_LC : 지역
'''

BIO 태그 사전 크기 :  10
단어 사전 크기 :  17751


'\nB_FOOD : 음식\nB_DT, B_TI : 날짜, 시간 ~ 학습 데이터에서 날짜와 시간 혼용 사용\nB_PS : 사람\nB_OG : 조직, 회사\nB_LC : 지역\n'

In [5]:
zip_test = list(zip(sentences[0],tags[0]))
zip_test

[('가락지빵', 'B_FOOD'),
 ('주문', 'O'),
 ('하', 'O'),
 ('고', 'O'),
 ('싶', 'O'),
 ('어요', 'O')]

In [6]:
zip_test[0]

('가락지빵', 'B_FOOD')

In [7]:
BIO_data_list = []
for i in range(len(sentences)):
    for j in range(len(sentences[i])):
        zip_list = list(zip(sentences[i],tags[i]))
        for word_tag_set in zip_list:
            BIO_data_list.append(word_tag_set)
            
ner_data = pd.DataFrame(BIO_data_list,
               columns =['word', 'tag'])
ner_data.head(20)

Unnamed: 0,word,tag
0,가락지빵,B_FOOD
1,주문,O
2,하,O
3,고,O
4,싶,O
5,어요,O
6,가락지빵,B_FOOD
7,주문,O
8,하,O
9,고,O


In [8]:
ner_unique_data = ner_data.drop_duplicates(['word'], keep='first')
ner_unique_data.head(20)

Unnamed: 0,word,tag
0,가락지빵,B_FOOD
1,주문,O
2,하,O
3,고,O
4,싶,O
5,어요,O
37,먹,O
87,원,O
103,어떻,O
104,게,O


In [16]:
ner_unique_data.info(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14174 entries, 0 to 7188948
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   word    14174 non-null  object
 1   tag     14174 non-null  object
dtypes: object(2)
memory usage: 332.2+ KB


In [9]:
ner_unique_data['tag'].value_counts()

O         9568
B_PS      1487
B_OG      1144
I          792
B_LC       698
B_FOOD     239
B_DT       229
B_TI        17
Name: tag, dtype: int64

In [15]:
ner_unique_data[ner_unique_data['tag']=='B_OG']

Unnamed: 0,word,tag
1645358,E,B_OG
1646909,AP,B_OG
1646942,디트로이트,B_OG
1650284,메이저리그,B_OG
1651797,프리미어,B_OG
...,...,...
7156372,락앤락,B_OG
7161531,보광,B_OG
7164920,삼성물산,B_OG
7165994,선인완궈,B_OG


In [14]:
ner_unique_data[ner_unique_data['tag']=='B_PS']

Unnamed: 0,word,tag
1645763,박명환,B_PS
1646919,올라주원,B_PS
1646921,유잉,B_PS
1646926,애드리언,B_PS
1646929,팻,B_PS
...,...,...
7082432,박연경,B_PS
7083600,김진영,B_PS
7083605,쇼팽,B_PS
7083607,드뷔시,B_PS
