# Verifications: scikit-learn

- Purpose: To classify news with using scikit-learn's NB classifier and perform hyper parameter tuning
- Keywords: Naive Bayes, Pipeline, Hyperparameter Tuning, Grid search

## ニュースコーパスの読み込みと表示

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# 20グループ中、４グループをサンプルとしてコーパスを読み込み
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42)
# ファイルはpkz拡張子という特殊な形式で収められているため、通常のFileOpenは適用できない（20news-bydate_py3.pkz）
# f = open(twenty_train.filenames[0], 'r') #=>FileNotFoundError

In [3]:
# 4グループの名称を取得・表示
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [4]:
# 訓練データの全データ数
print(len(twenty_train), ',', len(twenty_train.data), ',', len(twenty_train.filenames), ',', len(twenty_train.target))

6 , 2257 , 2257 , 2257


In [5]:
# 訓練データのファイル形式
type(twenty_train)

sklearn.utils.Bunch

In [6]:
# データ形式が特殊（sklearn.utils.Bunch）でも、dataプロパティはリストとなっているためデータの中身を確認可能
print(type(twenty_train.data))
print(type(twenty_train.data[0]))
twenty_train.data[0]

<class 'list'>
<class 'str'>


'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

In [7]:
# pretty フォーマットにて再度表示
from pprint import pprint
pprint(twenty_train.data[0])

('From: sd345@city.ac.uk (Michael Collier)\n'
 'Subject: Converting images to HP LaserJet III?\n'
 'Nntp-Posting-Host: hampton\n'
 'Organization: The City University\n'
 'Lines: 14\n'
 '\n'
 'Does anyone know of a good way (standard PC application/PD utility) to\n'
 'convert tif/img/tga files into LaserJet III format.  We would also like to\n'
 'do the same, converting to HPGL (HP plotter) files.\n'
 '\n'
 'Please email any response.\n'
 '\n'
 'Is this the correct group?\n'
 '\n'
 'Thanks in advance.  Michael.\n'
 '-- \n'
 'Michael Collier (Programmer)                 The Computer Unit,\n'
 'Email: M.P.Collier@uk.ac.city                The City University,\n'
 'Tel: 071 477-8000 x3769                      London,\n'
 'Fax: 071 477-8565                            EC1V 0HB.\n')


In [8]:
# ラベル（カテゴリ名称）はinteger形式で保存されており、targetプロパティで取得可能
print(twenty_train.target[:10])

[1 1 3 3 3 3 3 2 2 2]


In [9]:
# 以下の通りテキスト形式で表示させることも可能
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


## 特徴量抽出

In [10]:
# 全単語に対する頻度をカウントしベクトル化することで特徴量とする
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [11]:
# 全コーパス中でalgorithmが出現した回数
count_vect.vocabulary_.get(u'algorithm')

4690

In [12]:
# 各単語のID及び出現頻度が訓練データに収められた。サンプルを以下に表示
print(type(X_train_counts[0]))
print(X_train_counts[0])

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 230)	1
  (0, 12541)	1
  (0, 3166)	1
  (0, 14085)	1
  (0, 20459)	1
  (0, 35416)	1
  (0, 3062)	1
  (0, 2326)	2
  (0, 177)	2
  (0, 31915)	1
  (0, 33572)	1
  (0, 9338)	1
  (0, 26175)	1
  (0, 4378)	1
  (0, 17556)	1
  (0, 32135)	1
  (0, 15837)	1
  (0, 9932)	1
  (0, 32270)	1
  (0, 18474)	1
  (0, 27836)	1
  (0, 5195)	1
  (0, 12833)	2
  (0, 25337)	1
  (0, 25361)	1
  :	:
  (0, 5201)	1
  (0, 12051)	1
  (0, 587)	1
  (0, 20253)	1
  (0, 33597)	2
  (0, 32142)	5
  (0, 23915)	1
  (0, 16082)	1
  (0, 16881)	1
  (0, 25663)	1
  (0, 23122)	1
  (0, 17302)	2
  (0, 19780)	2
  (0, 16916)	2
  (0, 32493)	4
  (0, 17366)	1
  (0, 9805)	2
  (0, 31077)	1
  (0, 9031)	3
  (0, 21661)	3
  (0, 33256)	2
  (0, 4017)	2
  (0, 8696)	4
  (0, 29022)	1
  (0, 14887)	1


In [13]:
# 頻度ベクトルからTFベクトルへ変換
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [14]:
# 頻度ベクトルからTF-IDFベクトルへ変換
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [15]:
# TF-IDF形式に変換した訓練データのサンプルを表示
print(type(X_train_tfidf[0]))
print(X_train_tfidf[0])

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 14887)	0.016797806021219684
  (0, 29022)	0.1348710554299733
  (0, 8696)	0.314400065528974
  (0, 4017)	0.12491817585060791
  (0, 33256)	0.11819702490105698
  (0, 21661)	0.1962279892331408
  (0, 9031)	0.3841803935867984
  (0, 31077)	0.016797806021219684
  (0, 9805)	0.21567205914741705
  (0, 17366)	0.0744441018788533
  (0, 32493)	0.07283773941616518
  (0, 16916)	0.17358472047671197
  (0, 19780)	0.24645540709354397
  (0, 17302)	0.18626015109199115
  (0, 23122)	0.036374916362300114
  (0, 25663)	0.034290706362898604
  (0, 16881)	0.0360441471878483
  (0, 16082)	0.11382738609462074
  (0, 23915)	0.017762318563562172
  (0, 32142)	0.08865416253721688
  (0, 33597)	0.06567578043186388
  (0, 20253)	0.016864892977128034
  (0, 587)	0.05966162012870271
  (0, 12051)	0.037793189755988436
  (0, 5201)	0.04316199700711876
  :	:
  (0, 25361)	0.11947938145690981
  (0, 25337)	0.04935883383975408
  (0, 12833)	0.125601499991304
  (0, 5195)	0.0310951485922154
  (0, 2783

## 分類器による訓練

In [16]:
# NaiveBayes分類機のインスタンスを生成
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [17]:
# サンプル文の評価
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)  # 頻度計算
X_new_tfidf = tfidf_transformer.transform(X_new_counts)  # TF-IDF形式への変換
predicted = clf.predict(X_new_tfidf)  # 推論
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category])) # 推論結果の表示

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## パイプラインの構築

In [18]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),])

In [19]:
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

## テスト用コーパスの評価

In [20]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8348868175765646

## 確率的勾配降下法を用いた訓練・評価

scikit-learnではアルゴリズムの置き換えが容易だが、  
パイプラインを作っておけば、一連のプロセスも比較的容易に置き換え可能

In [21]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),])

_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.9127829560585885

classification_report()を使用することで、性能評価も容易に行える

In [22]:
from sklearn import metrics
report = metrics.classification_report(twenty_test.target, predicted,target_names=twenty_test.target_names)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



Jupyternotebook で上記表示は美しくないため、dataframeの表形式で表示してみる

In [23]:
# レポートをDataframeへ置き換える関数
import pandas as pd

def classifaction_report_to_dataframe(report):
    report_data = []
    labels = ['class','precision','recall','f1_score','support']
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split('      ')
        idx = 0
        for element in row_data:
            if len(element)>0:
                row[labels[idx]] = element.strip()
                idx += 1                
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)

    return dataframe

In [24]:
classifaction_report_to_dataframe(report)

Unnamed: 0,class,f1_score,precision,recall,support
0,alt.atheism,0.87,0.95,0.81,319
1,comp.graphics,0.92,0.88,0.97,389
2,sci.med,0.92,0.94,0.9,396
3,soc.religion.christian,0.93,0.9,0.95,398


In [25]:
# 混合行列の表示
cf_matrix = metrics.confusion_matrix(twenty_test.target, predicted)
cf_df = pd.DataFrame(cf_matrix, columns=categories)
cf_df

Unnamed: 0,alt.atheism,soc.religion.christian,comp.graphics,sci.med
0,258,11,15,35
1,4,379,3,3
2,5,33,355,3
3,5,10,4,379


## グリッドサーチによるハイパーパラメータチューニング

以下の組み合わせで最適なものを総当たり方式で求める
- N-gram:monogramとbigramで切り替え
- TF-IDF:TFとTF-IDFで切り替え
- 分類器のパラメータαを0.01と0.001で切り替え

In [26]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
          'tfidf__use_idf': (True, False),
          'clf__alpha': (1e-2, 1e-3),}

In [27]:
#　上で定義していたパイプラインtext_clfを使ってグリッドサーチ
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [28]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [29]:
twenty_train.target_names[gs_clf.predict(['Show a dot on display'])[0]]

'comp.graphics'

In [30]:
gs_clf.best_score_

0.9

In [31]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


Reference: http://scikit-learn.org/