# load data

In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# Load the 20 Newsgroups text corpus. subset="all" requests the combined
# train + test portions (per the scikit-learn docs); the notebook does its
# own train/test split further down.
news = fetch_20newsgroups(
    subset="all",
)

# preprocess data

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Only the first 3000 documents are used (presumably to keep the SVM grid
# search runtime manageable). 25% is held out for testing; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news.data[:3000],
    news.target[:3000],
    test_size=0.25,
    random_state=33,
)

# define model and parameter grid

In [11]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Text classification pipeline: TF-IDF word features (English stop words
# removed) feeding a support vector classifier with default settings.
clf = Pipeline(
    steps=[
        ("vect", TfidfVectorizer(stop_words="english", analyzer="word")),
        ("svc", SVC()),
    ]
)

# Log-spaced search space over [0.1, 10]: 4 gamma values x 3 C values
# = 12 candidates. The "svc__" prefix routes each parameter to the
# pipeline step named "svc".
parameters = {
    "svc__gamma": np.logspace(-1, 1, 4),
    "svc__C": np.logspace(-1, 1, 3),
}

# 3-fold cross-validated grid search; refit=True retrains the best
# candidate on the full training data once the search is done.
gs = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    verbose=2,
    refit=True,
    cv=3,
)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] svc__C=0.1, svc__gamma=0.1 ......................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ....................... svc__C=0.1, svc__gamma=0.1, total=   7.0s
[CV] svc__C=0.1, svc__gamma=0.1 ......................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s remaining:    0.0s
[CV] ....................... svc__C=0.1, svc__gamma=0.1, total=   7.4s
[CV] svc__C=0.1, svc__gamma=0.1 ......................................
[CV] ....................... svc__C=0.1, svc__gamma=0.1, total=   7.1s
[CV] svc__C=0.1, svc__gamma=0.46415888336127786 ......................
[CV] ....... svc__C=0.1, svc__gamma=0.46415888336127786, total=   6.7s
[CV] svc__C=0.1, svc__gamma=0.46415888336127786 ......................
[CV] ....... svc__C=0.1, svc__gamma=0.46415888336127786, total=   6.5s
[CV] svc__C=0.1, svc__gamma=0.46415888336127786 ....................

# fit and evaluate

In [None]:
%time _ = gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
print(gs.score(X_test, y_test))

# improve performance (parallel grid search)

In [13]:
# 多线程
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
parameters = {'svc__gamma': np.logspace(-1, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)

%time _ = gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
print(gs.score(X_test, y_test))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  1.7min finished
Wall time: 1min 50s
0.8226666666666667
