# データのダウンロード

In [1]:
%system curl -O 'https://www.rondhuit.com/download/ldcc-20140209.tar.gz'

['  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '  7 8647k    7  671k    0     0  2057k      0  0:00:04 --:--:--  0:00:04 2054k',
 ' 60 8647k   60 5191k    0     0  3918k      0  0:00:02  0:00:01  0:00:01 3918k',
 '100 8647k  100 8647k    0     0  4448k      0  0:00:01  0:00:01 --:--:-- 4448k']

In [2]:
import tarfile

with tarfile.open('ldcc-20140209.tar.gz') as tar:
    tar.extractall(path='livedoor')

In [3]:
import glob
import os

# 一部クラスのデータを削除してデータ量を偏らせる
y = []
for directory in glob.glob('livedoor/text/*/'):
    dir_name = directory.split('/')[-2]
    if dir_name not in ('kaden-channel', 'dokujo-tsushin', 'smax'):
        filelist = sorted(glob.glob(directory + '*'))
        for i in range(int(len(filelist) * 0.6)):
            os.remove(filelist[i])
    else:
        os.remove(directory + '/LICENSE.txt')

            
for directory in glob.glob('livedoor/text/*/'):
    dir_name = directory.split('/')[-2]
    label = dir_name
    y += [label] * len(glob.glob(directory + '*'))

In [4]:
from sklearn import feature_extraction
import neologdn
import glob
import MeCab

t = MeCab.Tagger()
t.parse('')

def tokenizer(content):
    nouns = []
    for line in filter(bool, content.splitlines()[2:]):
        for token in t.parse(line).splitlines()[:-1]:
            if '名詞,' in token and '非自立' not in line and '接尾' not in line:
                nouns.append(token.split('\t')[0])
    return ' '.join(nouns)
    


tfv = feature_extraction.text.TfidfVectorizer(input='filename', preprocessor=neologdn.normalize,
                                              tokenizer=tokenizer, analyzer='word')
X = tfv.fit_transform(glob.glob('livedoor/text/*/*.txt'))

In [5]:
from sklearn.model_selection import train_test_split
import sklearn.metrics


def benchmark(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    clf.fit(X_train, y_train)
    result = clf.predict(X_test)
    print(sklearn.metrics.classification_report(y_test, result))

In [6]:
import sklearn.naive_bayes
benchmark(sklearn.naive_bayes.MultinomialNB(), X, y)

                precision    recall  f1-score   support

dokujo-tsushin       0.50      0.84      0.63        80
  it-life-hack       1.00      0.03      0.05        36
 kaden-channel       0.43      0.93      0.59        86
livedoor-homme       0.00      0.00      0.00        14
   movie-enter       0.00      0.00      0.00        44
        peachy       0.00      0.00      0.00        31
          smax       0.72      1.00      0.83        93
  sports-watch       0.00      0.00      0.00        36
    topic-news       1.00      0.06      0.12        32

   avg / total       0.47      0.54      0.41       452



  'precision', 'predicted', average, warn_for)


In [7]:
import complement_nb
benchmark(complement_nb.ComplementNB(), X, y)

                precision    recall  f1-score   support

dokujo-tsushin       0.40      0.97      0.57        80
  it-life-hack       1.00      0.03      0.05        36
 kaden-channel       0.67      0.71      0.69        86
livedoor-homme       0.00      0.00      0.00        14
   movie-enter       0.00      0.00      0.00        44
        peachy       0.00      0.00      0.00        31
          smax       0.56      1.00      0.72        93
  sports-watch       0.00      0.00      0.00        36
    topic-news       0.00      0.00      0.00        32

   avg / total       0.39      0.52      0.38       452



  'precision', 'predicted', average, warn_for)


In [8]:
import negation_nb
benchmark(negation_nb.NegationNB(), X, y)

                precision    recall  f1-score   support

dokujo-tsushin       0.54      0.97      0.69        80
  it-life-hack       1.00      0.14      0.24        36
 kaden-channel       0.87      0.72      0.79        86
livedoor-homme       1.00      0.07      0.13        14
   movie-enter       1.00      0.45      0.62        44
        peachy       1.00      0.03      0.06        31
          smax       0.57      1.00      0.73        93
  sports-watch       0.89      0.69      0.78        36
    topic-news       0.94      0.53      0.68        32

   avg / total       0.79      0.67      0.62       452



In [9]:
import universalset_nb
benchmark(universalset_nb.UniversalSetNB(), X, y)

                precision    recall  f1-score   support

dokujo-tsushin       0.57      0.91      0.71        80
  it-life-hack       1.00      0.14      0.24        36
 kaden-channel       0.57      0.91      0.70        86
livedoor-homme       1.00      0.07      0.13        14
   movie-enter       1.00      0.30      0.46        44
        peachy       1.00      0.06      0.12        31
          smax       0.68      1.00      0.81        93
  sports-watch       0.94      0.47      0.63        36
    topic-news       1.00      0.34      0.51        32

   avg / total       0.77      0.65      0.59       452



In [10]:
import selective_nb
benchmark(selective_nb.SelectiveNB(), X, y)

                precision    recall  f1-score   support

dokujo-tsushin       0.61      0.96      0.74        80
  it-life-hack       1.00      0.14      0.24        36
 kaden-channel       0.92      0.70      0.79        86
livedoor-homme       1.00      0.14      0.25        14
   movie-enter       0.94      0.73      0.82        44
        peachy       0.50      0.06      0.11        31
          smax       0.60      1.00      0.75        93
  sports-watch       0.88      0.83      0.86        36
    topic-news       0.88      0.69      0.77        32

   avg / total       0.77      0.71      0.67       452

