In [1]:
import urllib
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x20905f84100>)

In [47]:
from nltk.tokenize import word_tokenize
import nltk
from konlpy.tag import Okt
okt = Okt()

In [58]:
def load_data(filename):
    train = []
    with open(filename, 'r', encoding='utf-8') as f:
        count = 0
        for line in f.readlines():
            if count == 1001: break
            count += 1

            line = line.strip()
            id, document, label = line.split('\t')
            if label == '1': label = 'pos'
            else: label = 'neg'    
            train_tup = (document, label)
            train.append(train_tup)
    return train

In [59]:
def pos_tokenize(raw_sent):
    pos_sent = []
    
    sent = okt.pos(raw_sent, norm=True, stem=True)
    for tup in sent:
        word, tag = tup
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)
       
    return ' '.join(pos_sent) 

In [60]:
def make_word_dict(train, use_morph=False):
    all_words = set()
    for tup in train:
        sent, label = tup
        if use_morph: sent = pos_tokenize(sent)
        word_list = word_tokenize(sent)
        for word in word_list:
            all_words.add(word)
    return all_words

In [61]:
def make_train_feats(train, all_words, use_morph=False):
    train_features = []
    for tup in train:
        sent, label = tup # sent:'I like you' label: 'pos'
        if use_morph: sent = pos_tokenize(sent)
        word_list = word_tokenize(sent) # ['I', like, you]
        tmp = {set_word: set_word in word_list for set_word in all_words}
        train_feature = (tmp, label)
        train_features.append(train_feature)
    return train_features

In [52]:
train = load_data('ratings_train.txt')[1:]

In [53]:
all_words = make_word_dict(train, use_morph=True)

In [54]:
train_features = make_train_feats(train, all_words, use_morph=True)

In [55]:
classifier = nltk.NaiveBayesClassifier.train(train_features)

In [56]:
classifier.show_most_informative_features(n=5)

Most Informative Features
                       ; = True              neg : pos    =      8.2 : 1.0
          재미없다/Adjective = True              neg : pos    =      8.2 : 1.0
                주인공/Noun = True              neg : pos    =      7.5 : 1.0
                 최고/Noun = True              pos : neg    =      6.9 : 1.0
                  뭐/Noun = True              neg : pos    =      6.8 : 1.0


In [57]:
test = load_data('ratings_test.txt')[1:]
test_features = make_train_feats(test, all_words, use_morph=True) 
nltk.classify.accuracy(classifier, test_features)

0.712

In [62]:
test = load_data('ratings_test.txt')[1:]
test_features = make_train_feats(test, all_words, use_morph=True) 
nltk.classify.accuracy(classifier, test_features)

0.711

# 13-2

In [32]:
import urllib
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x1dfe9b5fcd0>)

In [16]:
import math

class MyNaiveBayesClassifier:
    def __init__(self, k=0.5, use_morph=False):
        self.k = k
        self.word_probs = []
        self.use_morph = use_morph
        
        if self.use_morph:
            from konlpy.tag import Okt
            self.okt = Okt()
            
    def load_data(self, file_path):
        docs = []
        labels = []
        
        count = 0
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if count == 500:
                    break
                line = line.strip()
                id, doc, label = line.split('\t')
                docs.append(doc)
                if label == '1': label = 'pos'
                elif label == '0': label = 'neg'
                labels.append(label)
                count += 1
            
        return docs[1:], labels[1:]
    
    def tokenize(self, sentence):
        if self.use_morph:
            pos_sent = []
            
            sent = self.okt.pos(sentence, norm=True, stem=True)
            
            for tup in sent:
                word, tag = tup
                word_tag = word + '/' + tag
                pos_sent.append(word_tag)
                
            sentence = ' '.join(pos_sent)
            
        return sentence.split()
    
    def count_words(self, docs, labels):
        count_dict = dict()
        for doc, label in zip(docs, labels):
            for word in self.tokenize(doc):
                if word not in count_dict:
                    count_dict[word] = {'pos': 0, 'neg':0}
                count_dict[word][label] += 1
        
        print('num of words...', len(count_dict))
        return count_dict
    
    def word_prob(self, count_dict, pos_class_num, neg_class_num, k):
        word_prob_list = []
        
        for word in count_dict:
            pos_word_num = count_dict[word]['pos']
            neg_word_num = count_dict[word]['neg']
            
            pos_class_prob = (pos_word_num + k) / (pos_class_num + 2*k)
            neg_class_prob = (neg_word_num + k) / (neg_class_num + 2*k)
            
            tup = (word, pos_class_prob, neg_class_prob)
            word_prob_list.append(tup)
        
        self.word_prob_list = word_prob_list
        return word_prob_list
    
    def class_prob(self, word_prob_list, test_sentence, use_unseen=False):
        test_words = self.tokenize(test_sentence)
        
        sent_log_pos_class_prob, sent_log_neg_class_prob = 0.0, 0.0
        
        for word, word_pos_class_prob, word_neg_class_prob in word_prob_list:
            if word in test_words:
                sent_log_pos_class_prob += math.log(word_pos_class_prob) 
                sent_log_neg_class_prob += math.log(word_neg_class_prob)
            else:
                if use_unseen:
                    sent_log_pos_class_prob += math.log(1-word_pos_class_prob) 
                    sent_log_neg_class_prob += math.log(1-word_neg_class_prob)
            
        sent_pos_class_prob = math.exp(sent_log_pos_class_prob)
        sent_neg_class_prob = math.exp(sent_log_neg_class_prob)
        
        pos_class_prob = sent_pos_class_prob/(sent_pos_class_prob+sent_neg_class_prob)
        neg_class_prob = sent_neg_class_prob/(sent_pos_class_prob+sent_neg_class_prob)
        
        return pos_class_prob, neg_class_prob
    
    def train(self, train_file_path):
        train_docs, train_labels = self.load_data(train_file_path)
        
        word_count_dict = self.count_words(train_docs, train_labels)
        
        pos_class_num = len([label for label in train_labels if label == 'pos'])
        neg_class_num = len([label for label in train_labels if label == 'neg'])
        
        self.word_probs = self.word_prob(word_count_dict, pos_class_num, neg_class_num, self.k)

    def classify(self, test_sentence, use_unseen=False):
        pos_class_prob, neg_class_prob = self.class_prob(self.word_prob_list, test_sentence, use_unseen)
        
        if pos_class_prob > neg_class_prob:
            print('pos', pos_class_prob)
        else:
            print('neg', neg_class_prob)

In [20]:
classifier = MyNaiveBayesClassifier(use_morph=False)

In [21]:
classifier.train('ratings_train.txt')

num of words... 3055


In [22]:
classifier.classify('영화가 너무 재미있어요', use_unseen=False)

pos 0.8863323105771248


In [23]:
classifier.classify('영화가 너무 재미있어요', use_unseen=True)

pos 0.9421707038103775


In [24]:
classifier = MyNaiveBayesClassifier(use_morph=True)

In [25]:
classifier.train('ratings_train.txt')

num of words... 2344


In [26]:
classifier.classify('영화가 너무 재미있어요', use_unseen=False)

pos 0.9081932116346049


In [27]:
classifier.classify('영화가 너무 재미있어요', use_unseen=True)

pos 0.9883056598023678


In [28]:
classifier = MyNaiveBayesClassifier(use_morph=False)
classifier.train('ratings_train.txt')
classifier.classify('영화가 너무 재미있어요', use_unseen=True)

num of words... 3055
pos 0.9421707038103775


In [29]:
test_docs, test_labels = classifier.load_data('ratings_test.txt')

In [30]:
classifier.classify(test_docs[0], use_unseen=True)

pos 0.6541187739929468


In [31]:
test_docs[0]

'굳 ㅋ'