In [1]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups (a subset of the 20 Newsgroups corpus) used as class labels.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]
# Parts of each post that the loader is asked to strip out.
remove = ('headers', 'footers', 'quotes')

# Keyword arguments shared by both fetches, so the two splits are always
# loaded with the same category list and the same stripping options.
fetch_options = dict(remove=remove, categories=categories)

# Training data
train_data = fetch_20newsgroups(subset='train', **fetch_options)
# Evaluation data (the 'test' split)
test_data = fetch_20newsgroups(subset='test', **fetch_options)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vectorizer is fitted on the training documents
# only, and the test documents are then encoded with that same fitted
# vectorizer (no refit on the test split).
count_vect = CountVectorizer()
train_docs, test_docs = train_data.data, test_data.data
X_train_bagwords = count_vect.fit_transform(train_docs)
X_test_bagwords = count_vect.transform(test_docs)

In [3]:
import numpy as np
from sklearn.svm import LinearSVC

# Linear support vector machine classifier.
# random_state pins the pseudo-random data shuffling used by the dual
# coordinate-descent solver (dual=True), so the accuracy below is
# reproducible after a kernel restart.
model = LinearSVC(max_iter=30000, dual=True, random_state=0)
model.fit(X_train_bagwords, train_data.target)
predicted = model.predict(X_test_bagwords)
# Accuracy: fraction of test documents whose predicted label equals the target.
np.mean(predicted == test_data.target)

0.652327221438646

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest classifier with 500 trees.
# random_state fixes the bootstrap / feature sampling so the accuracy below
# is reproducible; n_jobs=-1 fits the independent trees on all CPU cores
# (it changes run time only, not the fitted model).
rf_model = RandomForestClassifier(
    n_estimators=500, random_state=0, n_jobs=-1)
rf_model.fit(X_train_bagwords, train_data.target)
predicted = rf_model.predict(X_test_bagwords)
# Accuracy: fraction of test documents whose predicted label equals the target.
np.mean(predicted == test_data.target)

0.7101551480959097

In [5]:
from sklearn.ensemble import GradientBoostingClassifier

# Build a gradient-boosted decision tree classifier.
# Settings: maximum tree depth 3, 500 boosting stages, learning rate 0.1, and
# subsample=0.5, i.e. stochastic gradient boosting where each tree is fitted
# on a random half of the training samples.
# Because of that subsampling the fit is random, so random_state is fixed to
# make the accuracy below reproducible.
gb_model = GradientBoostingClassifier(
    max_depth=3, n_estimators=500, subsample=0.5, learning_rate=0.1,
    random_state=0)
gb_model.fit(X_train_bagwords, train_data.target)
predicted = gb_model.predict(X_test_bagwords)
# Accuracy: fraction of test documents whose predicted label equals the target.
np.mean(predicted == test_data.target)

0.7070992007522332