네이버 영화 리뷰를 담은 데이터를 이용해 나이브 베이즈 모델을 학습하고 사용자의 감정을 분석하는 코드

In [1]:
import io
import numpy
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import statsmodels.api
import numpy
import re
import math
import os


def main():
    """Classify one review typed by the user and chart the class probabilities.

    Reads the positive and negative training corpora from
    ``./txt_sentoken/pos/`` and ``./txt_sentoken/neg/``, prompts for a review
    on standard input, scores it with ``naive_bayes`` and hands the two
    resulting probabilities to ``visualize_boxplot``.
    """
    positive_corpus = read_text_data('./txt_sentoken/pos/')
    negative_corpus = read_text_data('./txt_sentoken/neg/')
    review = input("영화평을 입력하세요 > ")

    # Smoothing constant and equal class priors.
    smoothing, prior_positive, prior_negative = 0.1, 0.5, 0.5

    probabilities = naive_bayes(positive_corpus, negative_corpus, review,
                                smoothing, prior_positive, prior_negative)

    # Keep the chart title short: at most 50 characters plus an ellipsis.
    title = review if len(review) <= 50 else review[:50] + "..."

    chart_result = visualize_boxplot(title,
                                     list(probabilities),
                                     ['Positive', 'Negative'])
    # Whatever the plotting helper returns is echoed to standard output.
    print(chart_result)


def read_text_data(directory):
    """Concatenate the contents of every ``.txt`` file in ``directory``.

    The lines of each file are joined with a single space, and one more space
    is appended after each file so that the last word of one file does not
    fuse with the first word of the next.

    Args:
        directory: Path of the folder to scan. A trailing path separator is
            accepted but not required.

    Returns:
        One string holding the text of all matching files, or ``''`` when the
        directory contains no ``.txt`` file.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and platforms.
    names = sorted(name for name in os.listdir(directory)
                   if name.endswith('.txt'))

    pieces = []
    for name in names:
        # os.path.join works with or without a trailing separator, and the
        # context manager guarantees the handle is closed after reading.
        with open(os.path.join(directory, name)) as handle:
            pieces.append(' '.join(handle.readlines()) + ' ')

    return ''.join(pieces)


def naive_bayes(training1_sentence, training2_sentence, testing_sentence, alpha, prob1, prob2):
    """Return the normalized probabilities of the test text under two classes.

    Each of the three texts is turned into a bag-of-words model. The test
    model is scored against each training model with ``calculate_doc_prob``,
    the log of the matching prior is added, and the two log scores are turned
    into a pair of probabilities by ``normalize_log_prob``.

    Args:
        training1_sentence: Training text of the first class.
        training2_sentence: Training text of the second class.
        testing_sentence: Text to classify.
        alpha: Smoothing constant forwarded to ``calculate_doc_prob``.
        prob1: Prior probability of the first class.
        prob2: Prior probability of the second class.

    Returns:
        A ``(first_class_probability, second_class_probability)`` tuple.
    """
    first_model, second_model, query_model = (
        create_BOW(text)
        for text in (training1_sentence, training2_sentence, testing_sentence)
    )

    log_scores = [
        calculate_doc_prob(model, query_model, alpha) + math.log(prior)
        for model, prior in ((first_model, prob1), (second_model, prob2))
    ]

    return normalize_log_prob(*log_scores)


def normalize_log_prob(prob1, prob2):
    """Convert two log scores into a pair of probabilities.

    Args:
        prob1: Log score of the first class.
        prob2: Log score of the second class.

    Returns:
        A ``(p1, p2)`` tuple of the exponentiated scores, rescaled by their
        sum.
    """
    # Shifting by the larger score makes the largest exponent exactly 0, so
    # exp() cannot underflow for both values at once.
    shift = max(prob1, prob2)
    first, second = (math.exp(score - shift) for score in (prob1, prob2))

    scale = 1.0 / float(first + second)
    return (first * scale, second * scale)


def calculate_doc_prob(training_model, testing_model, alpha):
    """Return the smoothed log-likelihood of a test document under a model.

    Both models are ``(word_to_index, counts)`` pairs as produced by
    ``create_BOW``: the first item maps a word to a position, the second item
    holds the occurrence count stored at that position.

    Every occurrence of a test word contributes
    ``log(count_in_training + alpha) - log(total_tokens + vocabulary * alpha)``
    where ``total_tokens`` is the sum of the training counts and
    ``vocabulary`` is the number of distinct training words.

    Args:
        training_model: Bag-of-words pair of the training text.
        testing_model: Bag-of-words pair of the text being scored.
        alpha: Additive smoothing constant.

    Returns:
        The accumulated log-probability; ``0`` when the test model is empty.

    Raises:
        ValueError: If a logarithm argument is not positive, e.g. an unseen
            word with ``alpha == 0``.
    """
    train_index, train_counts = training_model[0], training_model[1]
    test_index, test_counts = testing_model[0], testing_model[1]

    logprob = 0
    if not test_index:
        return logprob

    # The denominator is the same for every word, so its logarithm is
    # computed once instead of once per token occurrence.
    log_denominator = math.log(sum(train_counts) + len(train_index) * alpha)

    for word, test_position in test_index.items():
        word_freq = test_counts[test_position]
        if word_freq <= 0:
            # Nothing to add for a word that does not actually occur.
            continue

        train_position = train_index.get(word)
        word_freq_in_training = (
            0 if train_position is None else train_counts[train_position]
        )

        # Each occurrence adds the same term, so scale it by the count
        # rather than looping word_freq times.
        logprob += word_freq * (
            math.log(word_freq_in_training + alpha) - log_denominator
        )

    return logprob


def create_BOW(sentence):
    """Build a bag-of-words model from ``sentence``.

    The text is lower-cased, passed through
    ``replace_non_alphabetic_chars_to_space`` and split on single spaces;
    empty pieces are ignored.

    Args:
        sentence: Text to tokenize.

    Returns:
        A ``(word_to_index, counts)`` tuple. ``word_to_index`` maps each
        distinct word to its position, assigned in order of first appearance,
        and ``counts[position]`` is how many times that word occurred.
    """
    cleaned = replace_non_alphabetic_chars_to_space(sentence.lower())

    position_of = {}
    counts = []
    for word in cleaned.split(' '):
        if not word:
            # Consecutive or leading/trailing spaces yield empty pieces.
            continue
        if word in position_of:
            counts[position_of[word]] += 1
        else:
            # First sighting: the next free slot becomes this word's index.
            position_of[word] = len(counts)
            counts.append(1)

    return position_of, counts


def replace_non_alphabetic_chars_to_space(sentence):
    """Replace every run of characters outside ``a``-``z`` with one space.

    Only lowercase ASCII letters are kept, so uppercase letters, digits,
    punctuation and whitespace are all replaced.

    Args:
        sentence: Text to clean.

    Returns:
        The cleaned text.
    """
    non_letter_run = re.compile(r'[^a-z]+')
    return non_letter_run.sub(' ', sentence)


def visualize_boxplot(title, values, labels):
    """Draw a labelled bar chart of ``values`` and save it as ``image.svg``.

    Despite its name, this function draws a bar chart (``ax.bar``), one bar
    per value, with the value printed above the bar as a percentage.

    Args:
        title: Text shown as the chart title.
        values: Bar heights; they are multiplied by 100 for the percentage
            labels.
        labels: Tick labels, one per bar.

    Returns:
        None. The chart is written to ``image.svg`` in the working directory
        and, when the ``elice_utils`` module can be imported, passed to
        ``elice_utils.send_image``.
    """
    width = .35

    fig, ax = plt.subplots()
    ind = numpy.arange(len(values))
    # Draw the bars exactly once; a second identical ax.bar call would only
    # stack duplicate rectangles on top of these.
    rects = ax.bar(ind, values, width)
    ax.set_title(title)

    # Take tick positions from the rectangles themselves so every label sits
    # under the centre of its bar whatever the bar alignment is.
    ax.set_xticks([rect.get_x() + rect.get_width() / 2. for rect in rects])
    ax.set_xticklabels(labels)

    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2., height + 0.01,
                '%.2lf%%' % (height * 100), ha='center', va='bottom')

    fig.savefig("image.svg", format="svg")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    # elice_utils is never imported at file level, so the bare name would
    # raise NameError. It is presumably only available on the Elice platform
    # (TODO confirm); elsewhere the saved SVG is the only output.
    try:
        import elice_utils
    except ImportError:
        return
    elice_utils.send_image("image.svg")


# Run the interactive classifier only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'statsmodels'