In [1]:
from sklearn import feature_extraction
import neologdn
import glob
import MeCab

t = MeCab.Tagger()
t.parse('')

def tokenizer(content):
    nouns = []
    for line in filter(bool, content.splitlines()[2:]):
        for token in t.parse(line).splitlines()[:-1]:
            if '名詞,' in token and '非自立' not in line and '接尾' not in line:
                nouns.append(token.split('\t')[0])
    return ' '.join(nouns)
    


tfv = feature_extraction.text.TfidfVectorizer(input='filename', preprocessor=neologdn.normalize,
                                              tokenizer=tokenizer, analyzer='word')
X = tfv.fit_transform(glob.glob('/tmp/text/*/*.txt'))

In [2]:
y = []
for directory in glob.glob('/tmp/text/*/'):
    label = directory.split('/')[-2]
    y += [label] * len(glob.glob(directory + '/*'))

In [3]:
from sklearn.cross_validation import train_test_split
import sklearn.metrics
import sklearn.naive_bayes

def benchmark(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    clf.fit(X_train, y_train)
    result = clf.predict(X_test)
    print(sklearn.metrics.classification_report(y_test, result))

In [4]:
benchmark(sklearn.naive_bayes.MultinomialNB(), X, y)

             precision    recall  f1-score   support

dokujo-tsushin       0.63      0.92      0.75       104
it-life-hack       0.00      0.00      0.00        25
kaden-channel       0.35      0.83      0.50        83
livedoor-homme       1.00      0.09      0.16        47
movie-enter       1.00      0.03      0.05        36
     peachy       0.00      0.00      0.00        38
       smax       0.59      1.00      0.74        72
sports-watch       1.00      0.06      0.12        47
 topic-news       0.00      0.00      0.00        27

avg / total       0.56      0.51      0.39       479



  'precision', 'predicted', average, warn_for)


In [5]:
import complement_nb
benchmark(complement_nb.ComplementNB(), X, y)

             precision    recall  f1-score   support

dokujo-tsushin       0.48      0.99      0.64       104
it-life-hack       0.00      0.00      0.00        25
kaden-channel       0.56      0.69      0.62        83
livedoor-homme       0.00      0.00      0.00        47
movie-enter       0.00      0.00      0.00        36
     peachy       0.00      0.00      0.00        38
       smax       0.44      1.00      0.62        72
sports-watch       0.00      0.00      0.00        47
 topic-news       0.00      0.00      0.00        27

avg / total       0.27      0.48      0.34       479



  'precision', 'predicted', average, warn_for)


In [6]:
import negation_nb
benchmark(negation_nb.NegationNB(), X, y)

             precision    recall  f1-score   support

dokujo-tsushin       0.62      0.99      0.77       104
it-life-hack       1.00      0.12      0.21        25
kaden-channel       0.74      0.71      0.72        83
livedoor-homme       0.89      0.17      0.29        47
movie-enter       0.89      0.67      0.76        36
     peachy       1.00      0.05      0.10        38
       smax       0.48      1.00      0.65        72
sports-watch       1.00      0.77      0.87        47
 topic-news       1.00      0.26      0.41        27

avg / total       0.78      0.66      0.60       479



In [7]:
import universalset_nb
benchmark(universalset_nb.UniversalSetNB(), X, y)

             precision    recall  f1-score   support

dokujo-tsushin       0.70      0.91      0.79       104
it-life-hack       0.00      0.00      0.00        25
kaden-channel       0.32      0.80      0.46        83
livedoor-homme       0.00      0.00      0.00        47
movie-enter       0.00      0.00      0.00        36
     peachy       0.00      0.00      0.00        38
       smax       0.53      1.00      0.69        72
sports-watch       0.00      0.00      0.00        47
 topic-news       0.00      0.00      0.00        27

avg / total       0.29      0.49      0.35       479



  'precision', 'predicted', average, warn_for)
