In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
# 学習器
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# model_selection
from sklearn import model_selection
from sklearn import metrics

In [None]:
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Baseline: one hold-out evaluation on the digits dataset, as a point of
# comparison for the cross-validation helpers used in the following cells.
digits = load_digits()
X = digits.data
y = digits.target

# Keep 80% of the samples for training. The fixed random_state makes the
# split, and therefore the report printed below, identical on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, random_state=0
)

# Other classifiers imported above can be substituted here,
# e.g. GaussianNB() or LogisticRegression().
clf = RandomForestClassifier(random_state=0)
pred_y = clf.fit(X_train, y_train).predict(X_test)

# Text report comparing the held-out labels with the predictions.
a = metrics.classification_report(y_test, pred_y)
print(a)

## 最も簡単なCVの方法
`cross_val_score(clf, data, target, cv=5, scoring="accuracy")`

|変数|役割|
|---|---|
|clf|学習器|
|data|データ|
|target|ラベル|
|cv|クロスバリデーションの分割数(fold数)|
|scoring|評価する指標[(参照:metrics)](http://scikit-learn.org/stable/modules/model_evaluation.html)|

scoringに適用できるもの:
```python
['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
```

In [None]:
# Computing cross-validated metrics with cross_val_score.
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

digits = load_digits()

# Fixed seed so the forest, and therefore the scores below, are reproducible
# across kernel restarts.
clf = RandomForestClassifier(random_state=0)

# Name of the metric to evaluate; reused in the printed summary.
scoring = "f1_macro"
scores = cross_val_score(clf, digits.data, digits.target, cv=5, scoring=scoring)

# Summarise the returned scores as mean +/- standard deviation.
print("{}:{:.3f}+/-{:.3f}".format(scoring, scores.mean(), scores.std()))

## わき道 : pipeline
`sklearn.pipeline`を使うと、処理が簡潔に書ける。[(参考)](http://scikit-learn.org/stable/modules/pipeline.html#combining-estimators)

例 : 正規化した後の効果をcross_validationしたい

In [None]:
# Manual approach: fit the scaler on the training split only, then apply that
# same fitted transformation to both the training and the test data.
from sklearn import preprocessing
# Fixed seed so the hold-out split and the printed score are reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, random_state=0
)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
clf = SVC(C=1).fit(X_train_transformed, y_train)
X_test_transformed = scaler.transform(X_test)
print(clf.score(X_test_transformed, y_test))

# Pipeline approach: scaling and classification are bundled into a single
# estimator, which is then handed to cross_val_score as one object.
from sklearn.pipeline import make_pipeline
# List the steps in the order the data should flow through them.
scoring = "f1_macro"
# NOTE: this pipeline uses kernel="linear", while the manual example above
# leaves SVC's kernel at its default, so the two scores are not directly
# comparable (they also use different metrics and evaluation schemes).
clf = make_pipeline(preprocessing.StandardScaler(), SVC(kernel="linear", C=1))
scores = cross_val_score(clf, digits.data, digits.target, cv=5, scoring=scoring)
print("{}:{:.3f}+/-{:.3f}".format(scoring, scores.mean(), scores.std()))

## cross_validate 関数
さっきまで使っていたのは`cross_val_score`関数。名前が似ていてややこしいが、`cross_validate`関数は次の点が異なる。

- 評価に**複数の指標**を考慮できる
- テストスコアに加えて、**学習の時のスコア、学習時間、テストの時間**などを算出してくれる。

つまり、より強力なcross_validationを行える。

※scikit-learn 0.19.1では、Windowsでなぜか動かない。Ubuntu 14.04では動いた。

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.svm import SVC

# Evaluate several metrics in a single cross-validation run.
scoring = ["f1_macro", "recall_macro"]
clf = SVC(C=1)
scores = cross_validate(clf, digits.data, digits.target, cv=5, scoring=scoring)

# `scores` is dict-like: walk over its entries and summarise each array of
# values as mean +/- standard deviation.
for entry_name in scores:
    entry_values = scores[entry_name]
    summary = "{}:{:.2f}+/-{:.2f}".format(
        entry_name, entry_values.mean(), entry_values.std()
    )
    print(summary)

## 予測結果としてのcross validation
test, trainに分けることなく結果の検証ができる関数

In [None]:
from sklearn.model_selection import cross_val_predict

# Fixed seed so the forest, and therefore the report below, is reproducible.
clf = RandomForestClassifier(random_state=0)

# Obtain a prediction for every sample in X through 5-fold cross-validation,
# then compare those predictions against the true labels.
predicted = cross_val_predict(clf, X, y, cv=5)
print(metrics.classification_report(y, predicted))

## iterators
CrossValidationを行うときの、データセットの分け方
- i.i.d data
    - K-fold
    - Repeated K-Fold
    - Leave One Out (LOO)
    - Leave P out (LPO)
    - Shuffle & Split
- iterators with stratification based on class labels(サンプリングとか)
    - Stratified k-fold
    - Stratified Shuffle Split
- Grouped data　i.i.dデータと基本おなじ

In [None]:
import numpy as np
# Splitter ("iterator") used for this example.
from sklearn.model_selection import KFold
# Other splitters from sklearn.model_selection can be swapped in instead:
# RepeatedKFold, LeaveOneOut, LeavePOut, ShuffleSplit, StratifiedKFold,
# StratifiedShuffleSplit.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score

# Manual K-fold cross-validation (5 splits) driven by the splitter object.
iterater = KFold(n_splits=5)
results = []
for train_indexes, test_indexes in iterater.split(digits.data):
    # Fold-local names: the notebook-level X / y must keep pointing at the
    # full dataset, otherwise re-running earlier cells would silently train
    # on a single fold's training subset.
    X_fold_train = digits.data[train_indexes]
    y_fold_train = digits.target[train_indexes]
    # A fresh classifier per fold; the fixed seed keeps the scores reproducible.
    clf = RandomForestClassifier(random_state=0)
    clf.fit(X_fold_train, y_fold_train)
    pred_y = clf.predict(digits.data[test_indexes])
    ac = accuracy_score(digits.target[test_indexes], pred_y)
    results.append(ac)
# Convert to an array so mean / std can be computed over the fold scores.
results = np.array(results)
print("KFold accuracy: {:.2f}+/-{:.2f}".format(results.mean(),results.std()))

## パラメータを調整する
[Tuning the hyper-parameters of an estimator](http://scikit-learn.org/stable/modules/grid_search.html)

- グリッドサーチ
- ランダム選択
- モデルに合わせた賢いCV


多変数最適化の問題として、ある指標を評価関数にして解くというのもよさそう。ただし、評価関数は自分で作る必要がある。
- Gaussian Process Optimization
- PSO, CMA-ES など

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Estimator whose hyper-parameters are tuned. To list the parameter names
# that can appear in the grid, inspect svc.get_params().
svc = SVC()

# Metric used to rank the candidates; passed explicitly to GridSearchCV below
# so that this setting actually takes effect.
scoring = "accuracy"

# One dict per kernel, so each kernel is only combined with the parameters
# listed alongside it.
param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
    {'C': [1, 10, 100, 1000], 'kernel': ['poly'], 'degree': [2, 3, 4], 'gamma': [0.001, 0.0001]},
    {'C': [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [0.001, 0.0001]}
    ]
clf = GridSearchCV(svc, param_grid, scoring=scoring, cv=4)
clf.fit(digits.data, digits.target)

# Tabulate the search results and show the best-ranked candidates first.
df = pd.DataFrame(clf.cv_results_)
df_scored = df.sort_values(by=["rank_test_score"])[["params","mean_test_score","std_test_score","mean_fit_time"]]
df_scored.head()