In [3]:
# 模型配置一般就是模型的超参数，比如K近邻的K，支持向量机中不同的核函数
# 一般选择都是无限的
from sklearn.datasets import fetch_20newsgroups
import numpy as np
news = fetch_20newsgroups(subset='all')
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

In [4]:
# 导入支持向量机和文本抽取器
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
 
# 导入Pipline
from sklearn.pipeline import Pipeline
# 使用pipline简化系统搭建流程，将文本抽取和分类器模型串联起来
clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])

In [5]:
# 这里需要试验的两个超参数的个数分别是4，3，
# svc_gamma的参数共有10的负二次方，10的负一次方等常见值
# 一共12种组合，12个不同参数下的模型
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
# SVC主要调节的是gamma和C
# gamma：‘rbf’,‘poly’ 和‘sigmoid’的核函数参数。默认是’auto’，则会选择1/n_features
# C：C越大，相当于惩罚松弛变量，希望松弛变量接近0，
# 即对误分类的惩罚增大，趋向于对训练集全分对的情况，这样对训练集测试时准确率很高，但泛化能力弱。
# C值小，对误分类的惩罚减小，允许容错，将他们当成噪声点，泛化能力较强。
# logspace是用来创建等比数列的，
# 开始点是10的-2次方，结束点是10的1次方，元素个数是4，每一次是乘10
# 如果改变base的话，就可以改变每项乘的值

In [6]:
#从sklearn.grid_search中导入网格搜索模块GridSearchCV
from sklearn.model_selection import GridSearchCV
#将12组参数组合以及初始化的Pipline包括3折交叉验证的要求全部告知GridSearchCV。
# 特别需要注意的是refit=True这样一个设定
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)

In [7]:
gs.fit(x_train, y_train)
 
gs.best_params_, gs.best_score_
 
print(gs.score(x_test, y_test))
# 以后推荐使用pipline来简化代码
# 这个其实就是你设置好了他自己一个个试着来找最好的

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] svc__C=0.1, svc__gamma=0.01 .....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...................... svc__C=0.1, svc__gamma=0.01, total=   5.5s
[CV] svc__C=0.1, svc__gamma=0.01 .....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.9s remaining:    0.0s


[CV] ...................... svc__C=0.1, svc__gamma=0.01, total=   5.3s
[CV] svc__C=0.1, svc__gamma=0.01 .....................................


[CV] ...................... svc__C=0.1, svc__gamma=0.01, total=   5.4s
[CV] svc__C=0.1, svc__gamma=0.1 ......................................


[CV] ....................... svc__C=0.1, svc__gamma=0.1, total=   5.1s
[CV] svc__C=0.1, svc__gamma=0.1 ......................................


[CV] ....................... svc__C=0.1, svc__gamma=0.1, total=   5.5s
[CV] svc__C=0.1, svc__gamma=0.1 ......................................


[CV] ....................... svc__C=0.1, svc__gamma=0.1, total=   5.5s
[CV] svc__C=0.1, svc__gamma=1.0 ......................................


[CV] ....................... svc__C=0.1, svc__gamma=1.0, total=   5.5s
[CV] svc__C=0.1, svc__gamma=1.0 ......................................


[CV] ....................... svc__C=0.1, svc__gamma=1.0, total=   5.5s
[CV] svc__C=0.1, svc__gamma=1.0 ......................................


[CV] ....................... svc__C=0.1, svc__gamma=1.0, total=   5.6s
[CV] svc__C=0.1, svc__gamma=10.0 .....................................


[CV] ...................... svc__C=0.1, svc__gamma=10.0, total=   5.2s
[CV] svc__C=0.1, svc__gamma=10.0 .....................................


[CV] ...................... svc__C=0.1, svc__gamma=10.0, total=   5.3s
[CV] svc__C=0.1, svc__gamma=10.0 .....................................


[CV] ...................... svc__C=0.1, svc__gamma=10.0, total=   5.7s
[CV] svc__C=1.0, svc__gamma=0.01 .....................................


[CV] ...................... svc__C=1.0, svc__gamma=0.01, total=   5.1s
[CV] svc__C=1.0, svc__gamma=0.01 .....................................


[CV] ...................... svc__C=1.0, svc__gamma=0.01, total=   5.2s
[CV] svc__C=1.0, svc__gamma=0.01 .....................................


[CV] ...................... svc__C=1.0, svc__gamma=0.01, total=   5.2s
[CV] svc__C=1.0, svc__gamma=0.1 ......................................


[CV] ....................... svc__C=1.0, svc__gamma=0.1, total=   5.2s
[CV] svc__C=1.0, svc__gamma=0.1 ......................................


[CV] ....................... svc__C=1.0, svc__gamma=0.1, total=   5.4s
[CV] svc__C=1.0, svc__gamma=0.1 ......................................


[CV] ....................... svc__C=1.0, svc__gamma=0.1, total=   5.5s
[CV] svc__C=1.0, svc__gamma=1.0 ......................................


[CV] ....................... svc__C=1.0, svc__gamma=1.0, total=   5.3s
[CV] svc__C=1.0, svc__gamma=1.0 ......................................


[CV] ....................... svc__C=1.0, svc__gamma=1.0, total=   5.6s
[CV] svc__C=1.0, svc__gamma=1.0 ......................................


[CV] ....................... svc__C=1.0, svc__gamma=1.0, total=   5.7s
[CV] svc__C=1.0, svc__gamma=10.0 .....................................


[CV] ...................... svc__C=1.0, svc__gamma=10.0, total=   5.6s
[CV] svc__C=1.0, svc__gamma=10.0 .....................................


[CV] ...................... svc__C=1.0, svc__gamma=10.0, total=   6.0s
[CV] svc__C=1.0, svc__gamma=10.0 .....................................


[CV] ...................... svc__C=1.0, svc__gamma=10.0, total=   5.4s
[CV] svc__C=10.0, svc__gamma=0.01 ....................................


[CV] ..................... svc__C=10.0, svc__gamma=0.01, total=   5.2s
[CV] svc__C=10.0, svc__gamma=0.01 ....................................


[CV] ..................... svc__C=10.0, svc__gamma=0.01, total=   5.2s
[CV] svc__C=10.0, svc__gamma=0.01 ....................................


[CV] ..................... svc__C=10.0, svc__gamma=0.01, total=   6.8s
[CV] svc__C=10.0, svc__gamma=0.1 .....................................


[CV] ...................... svc__C=10.0, svc__gamma=0.1, total=   5.2s
[CV] svc__C=10.0, svc__gamma=0.1 .....................................


[CV] ...................... svc__C=10.0, svc__gamma=0.1, total=   5.2s
[CV] svc__C=10.0, svc__gamma=0.1 .....................................


[CV] ...................... svc__C=10.0, svc__gamma=0.1, total=   5.3s
[CV] svc__C=10.0, svc__gamma=1.0 .....................................


[CV] ...................... svc__C=10.0, svc__gamma=1.0, total=   5.2s
[CV] svc__C=10.0, svc__gamma=1.0 .....................................


[CV] ...................... svc__C=10.0, svc__gamma=1.0, total=   5.3s
[CV] svc__C=10.0, svc__gamma=1.0 .....................................


[CV] ...................... svc__C=10.0, svc__gamma=1.0, total=   5.3s
[CV] svc__C=10.0, svc__gamma=10.0 ....................................


[CV] ..................... svc__C=10.0, svc__gamma=10.0, total=   5.2s
[CV] svc__C=10.0, svc__gamma=10.0 ....................................


[CV] ..................... svc__C=10.0, svc__gamma=10.0, total=   5.3s
[CV] svc__C=10.0, svc__gamma=10.0 ....................................


[CV] ..................... svc__C=10.0, svc__gamma=10.0, total=   5.4s


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  4.8min finished


0.8226666666666667


In [8]:
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3,n_jobs=-1)
# 这里n_job -1是使用计算机的全部CPU
# 也就是并行搜索

In [None]:
# 以后还是不要在自己电脑上并行搜索了，电脑会卡住的
# 虽然速度快了很多
# 还是放云平台去并行搜索吧

In [9]:
gs.fit(x_train, y_train)
 
gs.best_params_, gs.best_score_
 
print(gs.score(x_test, y_test))

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  1.8min finished


0.8226666666666667
