In [18]:
# Prerequisite (run once if konlpy is not installed): %pip install konlpy

In [11]:
from konlpy.tag import Komoran

In [12]:
class Preprocess:
    """Wrapper around the Komoran morphological analyzer with POS-based filtering.

    ``pos`` tags a sentence and ``get_keywords`` drops every token whose POS
    tag is listed in ``exclusion_tags``.
    """

    # Default POS tags removed by get_keywords(). Tag names follow the Komoran
    # (Sejong-style) tag set; the group descriptions are a reviewer's reading
    # of that tag table.
    EXCLUSION_TAGS = (
        # particles: case particles, auxiliary particle, conjunctive particle
        'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ',
        'JX', 'JC',  # was "'JX', 'JX'" -- duplicate entry, 'JC' was missing
        # symbols / punctuation
        'SF', 'SP', 'SS', 'SE', 'SO',
        # verbal endings
        # (a missing comma after 'ETM' used to fuse it with 'XSN' into the
        #  single bogus tag 'ETMXSN', so neither tag was actually excluded)
        'EP', 'EF', 'EC', 'ETN', 'ETM',
        # derivational suffixes
        'XSN', 'XSV', 'XSA',
    )

    def __init__(self, userdic=None):
        """Create the analyzer.

        Args:
            userdic: Optional user dictionary path, forwarded unchanged to
                ``Komoran(userdic=...)``.
        """
        self.komoran = Komoran(userdic=userdic)

        # Per-instance copy so a caller can adjust the list without
        # affecting other instances or the class default.
        self.exclusion_tags = list(self.EXCLUSION_TAGS)

    def pos(self, sentence):
        """Return the result of ``Komoran.pos`` for ``sentence``."""
        return self.komoran.pos(sentence)

    def get_keywords(self, pos, without_tag=False):
        """Filter POS-tagged tokens, dropping those with an excluded tag.

        Args:
            pos: Iterable of ``(word, tag)`` pairs, e.g. the output of ``pos()``.
            without_tag: When exactly ``False`` (the default) the kept
                ``(word, tag)`` pairs are returned; for any other value only
                the words are returned.

        Returns:
            A list of the kept pairs, or of the kept words.
        """
        kept = [p for p in pos if p[1] not in self.exclusion_tags]
        # Identity check preserved from the original so that existing callers
        # passing non-bool values keep getting the same result shape.
        if without_tag is False:
            return kept
        return [p[0] for p in kept]

In [13]:
# Sample Korean sentence used to exercise Preprocess in the cells below.
sent = "내일 오전 10시에 탕수육 주문하고 싶어"

In [14]:
# Build the preprocessor with a user dictionary; the path is relative to the
# notebook's working directory.
p = Preprocess(userdic='../utils/user_dic.tsv')

In [15]:
# POS-tag the sample sentence (delegates to Komoran via Preprocess.pos).
pos = p.pos(sent)

In [16]:
# Keep the (word, tag) pairs whose tag is not in p.exclusion_tags.
ret = p.get_keywords(pos, without_tag=False)
print(ret)

[('내일', 'NNG'), ('오전', 'NNP'), ('10', 'SN'), ('시', 'NNB'), ('탕수육', 'NNP'), ('주문', 'NNG'), ('싶', 'VX')]


In [17]:
# Same filtering as above, but return only the words (tags stripped).
ret = p.get_keywords(pos, without_tag=True)
print(ret)

['내일', '오전', '10', '시', '탕수육', '주문', '싶']


In [18]:
# Prerequisite (run once if tensorflow is not installed): %pip install --user --no-warn-script-location tensorflow

In [19]:
from tensorflow.keras import preprocessing

In [27]:
import pickle

In [20]:
def read_corpus_data(filename):
    """Load a tab-separated corpus file, skipping its first line.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A list with one entry per line after the first; each entry is the
        list of fields obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding='UTF8') as corpus_file:
        lines = corpus_file.read().splitlines()
    # The first line is discarded (treated as a header row).
    return [row.split('\t') for row in lines[1:]]

In [21]:
# Load the corpus rows (header line already dropped by read_corpus_data).
corpus_data = read_corpus_data('corpus.txt')

In [23]:
# Rebind `p` to a preprocessor created without a user dictionary; it is used
# below to tokenize the corpus.
p = Preprocess()

In [24]:
# Collect the surface form of every morpheme found in the second
# tab-separated field of each corpus row.
# A comprehension keeps the loop variables local, so the notebook-level `pos`
# assigned in an earlier cell is no longer overwritten as a side effect.
# NOTE(review): the name `dict` shadows the builtin. It is kept because the
# tokenizer cell reads it; avoid calling dict(...) later in this notebook.
dict = [morph[0] for row in corpus_data for morph in p.pos(row[1])]

In [25]:
# Build the vocabulary with the Keras Tokenizer, using 'OOV' as the
# out-of-vocabulary token.
tokenizer = preprocessing.text.Tokenizer(oov_token='OOV')
# `dict` is the list of morpheme strings built in the previous cell (the name
# shadows the builtin).
tokenizer.fit_on_texts(dict)
# Mapping looked up by word in the cells below (word -> integer index).
word_index = tokenizer.word_index

In [28]:
# Persist the vocabulary to disk.
# The context manager closes the file even if pickling fails, and a failure is
# raised instead of merely printed, so later cells never go on to read a
# partially written chatbot_dict.bin.
with open("chatbot_dict.bin", "wb") as f:
    pickle.dump(word_index, f)

In [30]:
# Reload the vocabulary from chatbot_dict.bin; `with` guarantees the file is
# closed even when unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open("chatbot_dict.bin", "rb") as f:
    word_index = pickle.load(f)

In [31]:
# The earlier sample sentence with an extra trailing token ("ㅋㅋ") appended.
sent = "내일 오전 10시에 탕수육 주문하고 싶어 ㅋㅋ"

In [33]:
# Rebind `p` to a preprocessor that uses the user dictionary again.
p = Preprocess(userdic='../utils/user_dic.tsv')

In [34]:
# POS-tag the new sample sentence.
pos = p.pos(sent)

In [35]:
# Words only (no tags), with tokens carrying an excluded POS tag removed.
keywords = p.get_keywords(pos, without_tag=True)

In [37]:
# Print every keyword next to its vocabulary index; a word missing from
# word_index is shown with the index stored under the 'OOV' key instead.
for word in keywords:
    print(word, word_index[word if word in word_index else 'OOV'])

내일 14
오전 269
10 105
시 4
탕수육 431
주문 3
싶 11
ㅋㅋ 10728
