In [1]:
from konlpy.tag import Okt
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Fetch the 'punkt' tokenizer data for NLTK; when it is already installed the call
# only reports that the package is up to date.
# NOTE(review): presumably required by word_tokenize in the later cells - confirm
# against the installed NLTK version.
nltk.download('punkt')
# Shared Okt tagger instance; pos_tokenize() reads it as a global.
okt = Okt()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def pos_tokenize(raw_sent):
    """Tag a raw sentence with Okt and return space-separated "word/TAG" tokens.

    The sentence is passed to the global ``okt`` tagger with ``norm=True`` and
    ``stem=True``. Each resulting (word, tag) pair is rendered as ``word/tag``
    and the pieces are joined with single spaces.

    Example: '사과가 좋아' is tagged as
    [('사과', 'Noun'), ('가', 'Josa'), ('좋다', 'Adjective')]
    and returned as '사과/Noun 가/Josa 좋다/Adjective'.
    """
    tagged = okt.pos(raw_sent, norm=True, stem=True)
    return ' '.join(word + '/' + tag for word, tag in tagged)

In [4]:
# Input data (Korean): one (sentence, sentiment label) pair per training sample.
train = [
    ('사과가 좋아', 'pos'),
    ('밤에 먹는 사과는 비추야', 'neg'),
    ('사과가 잘 익어서 맛있겠다', 'pos'),
]

In [5]:
# Build the vocabulary: the set of every distinct "word/TAG" token that occurs
# in the training sentences. A set is used so duplicates collapse automatically.
all_words = set()

for raw_text, _label in train:
    # POS-tag the sentence, then split the tagged string back into tokens.
    all_words.update(word_tokenize(pos_tokenize(raw_text)))

print(all_words)

{'좋다/Adjective', '가/Josa', '는/Josa', '사과/Noun', '자다/Verb', '익다/Verb', '먹다/Verb', '에/Josa', '비추다/Verb', '맛있다/Adjective', '밤/Noun'}


In [6]:
# Encode every training sentence as a {vocabulary token: present?} mapping,
# paired with the sentence's label.
train_features = []

for raw_text, label in train:
    words = word_tokenize(pos_tokenize(raw_text))

    presence = {}
    for vocab_word in all_words:
        presence[vocab_word] = vocab_word in words

    train_features.append((presence, label))

# Word order inside a sentence is discarded; only presence/absence is recorded.
# Key order follows the iteration order of all_words and can change between runs.
for idx in range(3):
    print(train_features[idx])

({'좋다/Adjective': True, '가/Josa': True, '는/Josa': False, '사과/Noun': True, '자다/Verb': False, '익다/Verb': False, '먹다/Verb': False, '에/Josa': False, '비추다/Verb': False, '맛있다/Adjective': False, '밤/Noun': False}, 'pos')
({'좋다/Adjective': False, '가/Josa': False, '는/Josa': True, '사과/Noun': True, '자다/Verb': False, '익다/Verb': False, '먹다/Verb': True, '에/Josa': True, '비추다/Verb': True, '맛있다/Adjective': False, '밤/Noun': True}, 'neg')
({'좋다/Adjective': False, '가/Josa': True, '는/Josa': False, '사과/Noun': True, '자다/Verb': True, '익다/Verb': True, '먹다/Verb': False, '에/Josa': False, '비추다/Verb': False, '맛있다/Adjective': True, '밤/Noun': False}, 'pos')


In [7]:
# Compute per-word probabilities: train NLTK's Naive Bayes classifier on the
# (feature dict, label) pairs built above.
classifier = nltk.NaiveBayesClassifier.train(train_features)
# Which words are most influential when deciding between positive and negative?
classifier.show_most_informative_features()

Most Informative Features
           맛있다/Adjective = False             neg : pos    =      1.5 : 1.0
                 익다/Verb = False             neg : pos    =      1.5 : 1.0
                 자다/Verb = False             neg : pos    =      1.5 : 1.0
            좋다/Adjective = False             neg : pos    =      1.5 : 1.0


In [16]:
# Test: classify a sentence that is not part of the training data.
test_sent = pos_tokenize("사과는 비추다")
words = word_tokenize(test_sent)

# Same presence encoding as used for training, over the training vocabulary.
test_feature = {}
for vocab_word in all_words:
    test_feature[vocab_word] = vocab_word in words

print(test_feature)
classifier.classify(test_feature)

{'좋다/Adjective': False, '가/Josa': False, '는/Josa': True, '사과/Noun': True, '자다/Verb': False, '익다/Verb': False, '먹다/Verb': False, '에/Josa': False, '비추다/Verb': True, '맛있다/Adjective': False, '밤/Noun': False}


'neg'