In [1]:
!pip install konlpy



In [2]:
from konlpy.tag import Kkma

import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK's 'punkt' package; presumably this is the model data that
# `sent_tokenize` relies on (confirm against the NLTK docs). The call is the
# last expression of the cell, so its return value is displayed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
class Tokenizer:
  """Morpheme tokenizer with a simple bag-of-words vectorizer.

  Documents are split into sentences with `sent_tokenize`, each sentence is
  POS-tagged with Kkma, and every (surface, tag) pair becomes one token of the
  form ``'surface/TAG'`` (e.g. ``'안녕/NNG'``).

  Attributes:
    kkma: the Kkma tagger instance used for POS tagging.
    verbose: when true, `tokenize` prints the token list it produces.
    vocab: dict mapping token -> integer index. Index 0 is always reserved
      for the unknown token ``'<unk>'``.
  """

  # Placeholder token that absorbs every token missing from the vocabulary.
  UNK_TOKEN = '<unk>'

  def __init__(self, verbose=True):
    """Create the tagger and an empty (unknown-token-only) vocabulary.

    Args:
      verbose: if true (the default, matching the previous behaviour),
        `tokenize` prints each token list. Pass False to silence it.
    """
    self.kkma = Kkma()
    self.verbose = verbose
    # Initialised here so `bag_of_words` is usable (everything maps to
    # '<unk>') even before `make_vocab` has been called.
    self.vocab = {self.UNK_TOKEN: 0}

  def make_vocab(self, documents):
    """Build `self.vocab` from an iterable of document strings.

    Tokens receive consecutive indices in order of first appearance, starting
    at 1 (0 is the unknown token). Any previously built vocabulary is
    replaced, not extended.

    Args:
      documents: iterable of strings to tokenize.
    """
    word2index = {self.UNK_TOKEN: 0}
    for document in documents:
      for voca in self.tokenize(document):
        if voca not in word2index:
          word2index[voca] = len(word2index)
    self.vocab = word2index

  def tokenize(self, document):
    """Split a document into ``'surface/TAG'`` morpheme tokens.

    Args:
      document: the text to tokenize.

    Returns:
      A list of token strings, in sentence order. Empty if `sent_tokenize`
      yields no sentences.
    """
    morphs = []
    for sentence in sent_tokenize(document):
      morphs.extend(
          surface + '/' + tag for surface, tag in self.kkma.pos(sentence))

    if self.verbose:
      print(morphs)
    return morphs

  def bag_of_words(self, sentence):
    """Turn a sentence into a count vector over the current vocabulary.

    Args:
      sentence: the text to vectorize.

    Returns:
      A list of ints of length ``len(self.vocab)``; position ``i`` holds the
      number of occurrences of the token whose vocabulary index is ``i``.
      Tokens not in the vocabulary are counted at the ``'<unk>'`` index.
    """
    morphs = self.tokenize(sentence)
    vector = [0] * len(self.vocab)
    for morph in morphs:
      index = self.vocab.get(morph)
      if index is None:
        # Out-of-vocabulary token: count it under the unknown token.
        index = self.vocab[self.UNK_TOKEN]
      vector[index] += 1

    return vector

In [18]:
# Build a vocabulary from four short sample sentences.
tokenizer = Tokenizer()
texts = ['안녕하세요', '안녕하십니까', '오늘은 날씨가 좋네요', '기분이 좋아요']
tokenizer.make_vocab(texts)

# Show the resulting token -> index mapping.
print(tokenizer.vocab)

# Last expression of the cell, so the returned count vector is displayed.
# Tokens missing from the vocabulary are counted at index 0 ('<unk>').
tokenizer.bag_of_words('오늘은 날씨가 어떨 것 같으세요') # -> this becomes the sentence's vector

['안녕/NNG', '하/XSV', '세요/EFN']
['안녕/NNG', '하/XSV', '시/EPH', 'ㅂ니까/EFQ']
['오늘/NNG', '은/JX', '날씨/NNG', '가/JKS', '좋/VA', '네요/EFN']
['기분/NNG', '이/JKS', '좋/VA', '아요/EFN']
{'<unk>': 0, '안녕/NNG': 1, '하/XSV': 2, '세요/EFN': 3, '시/EPH': 4, 'ㅂ니까/EFQ': 5, '오늘/NNG': 6, '은/JX': 7, '날씨/NNG': 8, '가/JKS': 9, '좋/VA': 10, '네요/EFN': 11, '기분/NNG': 12, '이/JKS': 13, '아요/EFN': 14}
['오늘/NNG', '은/JX', '날씨/NNG', '가/JKS', '어떻/VA', 'ㄹ/ETD', '것/NNB', '같/VA', '으세요/EFA']


[5, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0]