In [12]:
import math
from collections import defaultdict
import MeCab
import re
import random
import numpy as np

In [13]:
# Category labels. A label's position in this list is its numeric id:
# data files refer to categories by index and the evaluation matrix is
# indexed the same way.
category = [
    "world",
    "economy",
    "entertainment",
    "sports",
    "IT",
    "science",
]

In [15]:
class NaiveBayes:
    """Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

    A document is a sequence whose first element is the category label and
    whose remaining elements are the word tokens, e.g. ``["IT", "w1", "w2"]``.
    """

    def __init__(self):
        self.categories = set()     # set of category labels seen in training
        self.vocabularies = set()   # set of distinct words seen in training
        self.wordcount = {}         # wordcount[cat][word]: occurrences of word in cat
        self.catcount = {}          # catcount[cat]: number of training documents in cat
        self.denominator = {}       # denominator[cat]: denominator of P(word|cat)

    def train(self, data):
        """Fit the model on ``data``, an iterable of labelled documents.

        Training always starts from a clean state: counts were never
        accumulated across calls, and keeping categories from an earlier
        call would leave them with a document count of zero, which makes
        ``score`` fail on ``log(0)``.
        """
        self.categories = set()
        self.vocabularies = set()
        self.wordcount = {}
        self.catcount = {}
        self.denominator = {}
        # Count categories and words in a single pass over the documents.
        for d in data:
            cat, doc = d[0], d[1:]
            if cat not in self.catcount:
                self.categories.add(cat)
                self.wordcount[cat] = defaultdict(int)
                self.catcount[cat] = 0
            self.catcount[cat] += 1
            for word in doc:
                self.vocabularies.add(word)
                self.wordcount[cat][word] += 1
        # Precompute the denominator of P(word|cat) once per category (for speed).
        vocabulary_size = len(self.vocabularies)
        for cat in self.categories:
            self.denominator[cat] = sum(
                self.wordcount[cat].values()) + vocabulary_size

    def classify(self, doc):
        """Return the category with the highest log-posterior for ``doc``.

        ``doc`` is a sequence of word tokens (no label). Returns ``None``
        when the model has no categories (i.e. it has not been trained).
        """
        best = None
        # Start at -inf rather than a fixed large negative number: scores are
        # sums of log-probabilities and can become arbitrarily negative.
        best_score = float("-inf")
        for cat in self.catcount:
            p = self.score(doc, cat)
            if p > best_score:
                best_score = p
                best = cat
        return best

    def evaluate(self, test, categories=None):
        """Classify every labelled document in ``test`` and summarise the results.

        Args:
            test: sequence of labelled documents (label first, then words).
            categories: ordered label list defining the matrix axes.
                Defaults to the module-level ``category`` list.

        Returns:
            ``(result, accuracy)``. ``result[i][j]`` is the fraction of
            documents whose true label is ``categories[i]`` that were
            predicted as ``categories[j]``; rows with no documents are
            ``nan``. ``accuracy`` is the overall fraction of correct
            predictions (``nan`` for an empty ``test``).

        Raises:
            ValueError: if a true or predicted label is not in ``categories``.
        """
        if categories is None:
            categories = category
        labels = list(categories)
        size = len(labels)
        counts = np.zeros((size, size))
        for d in test:
            category_name, document = d[0], d[1:]
            output_category = self.classify(document)
            counts[labels.index(category_name)][labels.index(output_category)] += 1
        total = counts.sum()
        accuracy = float(np.trace(counts) / total) if total else float("nan")
        # keepdims keeps the row totals as a column vector so that each ROW is
        # divided by its own total; a flat vector would broadcast along columns.
        row_totals = counts.sum(axis=1, keepdims=True)
        result = np.divide(counts, row_totals,
                           out=np.full_like(counts, np.nan),
                           where=row_totals != 0)
        return result, accuracy

    def wordProb(self, word, cat):
        """Return the Laplace-smoothed estimate of P(word|cat)."""
        # .get() instead of indexing: wordcount[cat] is a defaultdict, and
        # indexing an unseen word would insert it into the trained model.
        # The denominator was precomputed at the end of train().
        return float(self.wordcount[cat].get(word, 0) + 1) / float(self.denominator[cat])

    def score(self, doc, cat):
        """Return log P(cat) + sum of log P(word|cat) over the words of ``doc``."""
        total = sum(self.catcount.values())  # total number of training documents
        score = math.log(float(self.catcount[cat]) / total)  # log P(cat)
        for word in doc:
            # Working in log space turns the product of probabilities into a sum.
            score += math.log(self.wordProb(word, cat))  # log P(word|cat)
        return score

    def __str__(self):
        total = sum(self.catcount.values())  # total number of training documents
        return "documents: %d, vocabularies: %d, categories: %d" % (total, len(self.vocabularies), len(self.categories))


In [16]:
def Wakati(text):
    """Tokenise ``text`` with MeCab and return the tokens as a list of strings.

    The tagger is created on the first call and cached on the function
    object, so the dictionary is not reloaded for every piece of text.
    """
    tagger = getattr(Wakati, "_tagger", None)
    if tagger is None:
        # NOTE(review): both -Ochasen and -Owakati are passed; the splitting
        # below assumes space-separated output -- confirm which one MeCab
        # applies.
        tagger = MeCab.Tagger("-Ochasen -d /usr/lib/mecab/dic/mecab-ipadic-neologd -Owakati")
        Wakati._tagger = tagger
    words = tagger.parse(text).split(" ")
    # Drop the bare newline token left at the end of the parsed output.
    if words[-1] == "\n":
        words = words[:-1]
    return words

In [17]:
def train_test_divide(data, test_rate, seed=None):
    """Randomly split ``data`` into a training list and a test list.

    Args:
        data: sequence of samples. It is copied, so the caller's sequence
            keeps its original order.
        test_rate: fraction of samples (0..1) placed in the test list; the
            test size is ``floor(len(data) * test_rate)``.
        seed: optional seed for a reproducible split. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        ``(train_data, test_data)`` as two lists.

    Raises:
        ValueError: if ``test_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= test_rate <= 1:
        raise ValueError(
            "test_rate must be between 0 and 1, got {!r}".format(test_rate))
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    split = len(shuffled) - math.floor(len(shuffled) * test_rate)
    return shuffled[:split], shuffled[split:]

In [18]:
def create_data(filename, encoding="utf-8"):
    """Build the labelled-document list from a title file.

    Each non-blank line is expected to be ``<title> <category index>``,
    where the index selects a label from the module-level ``category`` list.

    Args:
        filename: path of the text file to read.
        encoding: text encoding of the file. Defaults to UTF-8 rather than
            the platform default; pass another value if the file differs.

    Returns:
        A list of documents shaped like
        ``[["IT", word, word, ...], ["IT", word, word, ...], ...]``.
    """
    data = []
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            # Split on the LAST space only so titles that contain spaces
            # are kept intact.
            title, category_num = line.rsplit(" ", 1)
            _data = [category[int(category_num)]]
            _data.extend(Wakati(title))
            data.append(_data)
    return data

In [27]:
# Load both corpora. Only the Yahoo data is split: 10% of it is held out as
# test data; the Livedoor data is used as-is for evaluation further down.
yahoo_data=create_data("../data/yahoo_data.txt")
livedoor_data=create_data("../data/livedoor_data.txt")
train_data,test_data=train_test_divide(yahoo_data,0.1)

In [28]:
# Train the Naive Bayes classifier on the training split.
nb = NaiveBayes()
nb.train(train_data)

In [29]:
# Predict the categories of the held-out Yahoo split and of the Livedoor data,
# then report overall accuracy followed by the per-category diagonal of each
# evaluation matrix as a percentage.
result_yahoo,accuracy_yahoo=nb.evaluate(test_data)
result_livedoor,accuracy_livedoor=nb.evaluate(livedoor_data)
print("yahoo: "+str(accuracy_yahoo))
print("livedoor: "+str(accuracy_livedoor))
for header, matrix in (("---------yahoo---------", result_yahoo),
                       ("------livedoor---------", result_livedoor)):
    print(header)
    # Iterate over the label list itself instead of a hard-coded count.
    for i, name in enumerate(category):
        print("{}:{}".format(name, matrix[i][i]*100))
    

yahoo: 0.8631979997618764
livedoor: 0.6923076923076923
---------yahoo---------
world:85.59322033898306
economy:80.55954088952653
entertainment:87.18487394957984
sports:91.22681883024251
IT:85.4004252303331
science:87.98219584569733
------livedoor---------
world:58.71559633027523
economy:25.423728813559322
entertainment:78.90625
sports:65.23955147808358
IT:89.28571428571429
science:nan


