In [14]:
from konlpy.tag import Okt
from nltk.tokenize import word_tokenize
import nltk
from urllib.request import urlretrieve

In [15]:
okt = Okt()

In [16]:
def load_data(file_path):
    # file_path에 있는 데이터를 읽어 옴

    train = []
    count = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            if count == 500: break

            line = line.strip()
            id, doc, label = line.split('\t')

            if label == '1': label = 'pos'
            elif label == '0': label = 'neg'
            train.append((doc, label))

            count += 1

    return train

In [17]:
def pos_tokenize(raw_sent):
    pos_sent = []
    sent = okt.pos(raw_sent, norm=True, stem=True)

    for tup in sent:
        word, tag = tup[0], tup[1]
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)

    return ' '.join(pos_sent)

In [18]:
# test_sent = urlretrieve("https://github.com/e9t/nsmc/raw/master/ratings_test.txt", filename="ratings_test.txt")

test_sent = load_data('ratings_test.txt')
print(test_sent[:5])

[('document', 'label'), ('굳 ㅋ', 'pos'), ('GDNTOPCLASSINTHECLUB', 'neg'), ('뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', 'neg'), ('지루하지는 않은데 완전 막장임... 돈주고 보기에는....', 'neg')]


In [19]:
all_words = set()

for tup in test_sent:
    sent, label = tup[0], tup[1]
    sent = pos_tokenize(sent)
    words = word_tokenize(sent)

    for word in words:
        all_words.add(word)

print(all_words)

{'이별/Noun', '긁다/Verb', '가다/Verb', '김희선/Noun', '글쎄/Noun', '전/Noun', '나가다/Verb', '일본/Noun', '이쁘다/Adjective', '다미앙/Noun', '스릴러/Noun', '패권/Noun', '순수/Noun', '정점/Noun', '이만/Noun', '또는/Adverb', '순위/Noun', '8/Number', '들보/Noun', '아직도/Adverb', '부활/Noun', '참고/Noun', '하얗다/Adjective', '웃기다/Verb', '본적/Noun', '영화배우/Noun', '소품/Noun', '소통/Noun', '80~90년/Number', '장난/Noun', '연/Modifier', '속편/Noun', '뭐병/Noun', '대로/Josa', '슈래기/Noun', '-_-', '스탈/Noun', '로망스/Noun', '쯤/Suffix', '계/Suffix', '만/Noun', '장가/Noun', '폭발/Noun', '청춘/Noun', 'C/Alpha', '흘러나오다/Verb', '읽다/Verb', '아름답다/Adjective', '조카/Noun', '로/Noun', '옆/Noun', '감싸다/Verb', '비/Noun', '몇/Noun', '조잡하다/Adjective', '건빵/Noun', '예술/Noun', '땐/Noun', '사이코패스/Noun', '보신/Noun', '이란/Josa', '과소/Noun', '욕/Noun', '씩/Suffix', 'ㅉㅉ/KoreanParticle', '깨뜨리다/Verb', '자극/Noun', '헷/Noun', '신선하다/Adjective', '소/Modifier', '껄껄/Noun', '말다/Verb', '오래되다/Adjective', '차원/Noun', '쓰다/Verb', '떠올리다/Verb', '소녀/Noun', '화판/Noun', '한번/Noun', '목숨/Noun', 'EBS/Alpha', '질/Noun', '물론/Adverb', 'ㅐ/Ko

In [20]:
train_features = []

for tup in test_sent:
    sent, label = tup[0], tup[1]
    sent = pos_tokenize(sent)
    words = word_tokenize(sent)

    tmp = {set_word: (set_word in words) for set_word in all_words}

    sent_tup = (tmp, label)
    train_features.append(sent_tup)

In [21]:
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features()

Most Informative Features
           재밌다/Adjective = True              pos : neg    =      7.8 : 1.0
                쓰레기/Noun = True              neg : pos    =      6.9 : 1.0
                  뭐/Noun = True              neg : pos    =      5.9 : 1.0
                해주다/Verb = True              pos : neg    =      5.2 : 1.0
            안/VerbPrefix = True              neg : pos    =      4.8 : 1.0
                 최고/Noun = True              pos : neg    =      4.6 : 1.0
                  요/Josa = True              pos : neg    =      4.2 : 1.0
            내/Determiner = True              neg : pos    =      4.0 : 1.0
                  냐/Josa = True              neg : pos    =      4.0 : 1.0
                  못/Noun = True              neg : pos    =      4.0 : 1.0


In [22]:
test_sent = test_sent[1][0]
test_sent = pos_tokenize(test_sent)
test_words = word_tokenize(test_sent)

test_feature = {set_word: (set_word in test_words) for set_word in all_words}

In [23]:
print(test_words)
classifier.classify(test_feature)

['굳다/Adjective', 'ㅋ/KoreanParticle']


'pos'