# Cross Validation

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

In [2]:
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

((150, 4), (150,))

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

print( X_train.shape, y_train.shape )
print( X_test.shape, y_test.shape )

(90, 4) (90,)
(60, 4) (60,)


In [4]:
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test) # 직접 train_set과 test_set을 나누어 평가, holdout 방식

0.9666666666666667

In [5]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5) #cv를 추가하여 scoring!
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [6]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.98 accuracy with a standard deviation of 0.02


In [7]:
from sklearn.model_selection import ShuffleSplit
n_samples = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0) #split을 직접하도록 입력
scores = cross_val_score(clf, X, y, cv=cv)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.99 accuracy with a standard deviation of 0.03


In [8]:
n_samples = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.95, random_state=0) # test_size를 극단적으로 키워봄
scores = cross_val_score(clf, X, y, cv=cv)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.86 accuracy with a standard deviation of 0.11


In [9]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1)) #pipeline도 활용할 수 있음
scores = cross_val_score(clf, X, y, cv=cv)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.67 accuracy with a standard deviation of 0.05


In [10]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
scoring = ['precision_macro', 'recall_macro'] # 여러가지 scoring 방식을 사용
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, X, y, scoring=scoring) #scoring 방식을 입력. 기존에 입력을 안해주었는데, 그렇다면? estimator(SCV)의 default scoring 방식(Mean accuracy)을 사용
scores

{'fit_time': array([0.00270796, 0.00165582, 0.00415993, 0.001652  , 0.00182915]),
 'score_time': array([0.0108819 , 0.01048422, 0.00444603, 0.00427604, 0.00262499]),
 'test_precision_macro': array([0.96969697, 1.        , 0.96969697, 0.96969697, 1.        ]),
 'test_recall_macro': array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])}

In [11]:
scores['test_recall_macro']

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [12]:
import numpy as np
from sklearn.model_selection import KFold

rkf = KFold(n_splits=5, shuffle=True) #default는 shuffle을 안하는 것. 성능이 낮을 수 있음.
for train, test in rkf.split(X, y):
    clf = svm.SVC(kernel='linear', C=1).fit(X[train], y[train])
    score = clf.score(X[test], y[test])
    print(score)

1.0
1.0
1.0
0.9333333333333333
1.0


In [13]:
train

array([  0,   1,   2,   3,   6,   8,   9,  11,  12,  13,  14,  15,  16,
        17,  18,  19,  20,  21,  22,  23,  24,  26,  27,  28,  29,  30,
        32,  33,  35,  36,  38,  39,  40,  41,  42,  43,  47,  48,  49,
        50,  51,  52,  53,  55,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  90,  93,  94,  97,
        99, 100, 101, 103, 104, 106, 107, 108, 109, 110, 112, 113, 114,
       115, 116, 117, 118, 120, 122, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 135, 136, 137, 139, 140, 141, 142, 143, 144, 146,
       147, 148, 149])

In [14]:
test

array([  4,   5,   7,  10,  25,  31,  34,  37,  44,  45,  46,  54,  56,
        57,  88,  89,  91,  92,  95,  96,  98, 102, 105, 111, 119, 121,
       123, 134, 138, 145])

In [15]:
from sklearn.model_selection import StratifiedKFold, KFold

X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))

In [16]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1])

In [17]:
kf = KFold(n_splits=3)
for train, test in kf.split(X, y):
    print('train -  {}   |   test -  {}'.format(
        np.bincount(y[train]), np.bincount(y[test]))) # y가 0인 개수, 1인 개수 count. 클래스 1이 쏠려있는 것을 볼 수 있음.

train -  [28  5]   |   test -  [17]
train -  [28  5]   |   test -  [17]
train -  [34]   |   test -  [11  5]


In [18]:
y[train]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
y[test]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [20]:
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    print('train -  {}   |   test -  {}'.format(
        np.bincount(y[train]), np.bincount(y[test])))

train -  [30  3]   |   test -  [15  2]
train -  [30  3]   |   test -  [15  2]
train -  [30  4]   |   test -  [15  1]


In [21]:
y[train]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1])

In [22]:
y[test]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

# K를 결정하는 방법

In [None]:
#보통 K를 3이나 5로 설정하는데 이유가 없음. 

In [23]:
X[140:].shape, y[:140].shape

((0, 1), (50,))

In [24]:
from sklearn.model_selection import cross_val_score

X, y = datasets.load_iris(return_X_y=True)

for k in range(2, 6):
  kf = KFold(n_splits=k, shuffle=True)
  scores = []
  for train, test in kf.split(X, y):
    clf = svm.SVC(kernel='linear', C=1).fit(X[train], y[train])
    scores.append(clf.score(X[test], y[test]))
  print('when k is', k, ':', np.mean(scores), np.std(scores))

#print(clf.score(X[140:], y[140:]))

when k is 2 : 0.9666666666666667 0.020000000000000018
when k is 3 : 0.9866666666666667 0.009428090415820642
when k is 4 : 0.9736842105263157 0.026315789473684237
when k is 5 : 0.9733333333333334 0.02494438257849294


In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

X, y = datasets.load_iris(return_X_y=True)
X, y = shuffle(X, y, random_state=0) # 데이터가 잘 섞여있지 않아서 shuffle 추가

for k in range(2, 6):
  kf = KFold(n_splits=k, shuffle=True)
  scores = []
  for train, test in kf.split(X[:140], y[:140]): # 학습데이터와 시험셋 구분
    clf = svm.SVC(kernel='linear', C=1).fit(X[train], y[train])
    scores.append(clf.score(X[test], y[test]))
  print('when k is', k, ':', np.mean(scores), np.std(scores))
  print(' test score :', clf.score(X[140:], y[140:])) # 나머지 10개의 데이터로 시험셋에 활용

when k is 2 : 0.95 0.007142857142857173
 test score : 1.0
when k is 3 : 0.9639222941720629 0.03701048331679796
 test score : 0.9
when k is 4 : 0.9928571428571429 0.012371791482634842
 test score : 1.0
when k is 5 : 0.9714285714285715 0.014285714285714278
 test score : 0.9


In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

X, y = datasets.load_iris(return_X_y=True)
X, y = shuffle(X, y, random_state=0) 

for k in range(2, 6):
  kf = KFold(n_splits=k, shuffle=True)
  scores = []
  for train, test in kf.split(X[:140], y[:140]):
    clf = svm.SVC(kernel='linear', C=0.001).fit(X[train], y[train]) # 학습이 잘 안되도록 C를 변경
    scores.append(clf.score(X[test], y[test]))
  print('when k is', k, ':', np.mean(scores), np.std(scores))
  print(' test score :', clf.score(X[140:], y[140:]))

when k is 2 : 0.3 0.01428571428571429
 test score : 0.4
when k is 3 : 0.2858464384828862 0.02174732985816277
 test score : 0.4
when k is 4 : 0.3928571428571428 0.1670573651932357
 test score : 0.6
when k is 5 : 0.34285714285714286 0.18013600304169275
 test score : 0.4


# Optimizaton

## 1. GridSearch

In [27]:
search_grid = { 
    'C': [0.1, 1, 10, 100], 
    'gamma': [0.1, 0.01 ,0.001], 
    'kernel': ['rbf', 'poly', 'linear']
    }

In [28]:
from scipy.stats import loguniform

{
  'C': loguniform(1e-1, 1e3),
  'gamma': loguniform(1e-4, 1e-1),
  'kernel': ['rbf', 'linear']
}

{'C': <scipy.stats._distn_infrastructure.rv_frozen at 0x7fab8ed2e580>,
 'gamma': <scipy.stats._distn_infrastructure.rv_frozen at 0x7fab8d3a41c0>,
 'kernel': ['rbf', 'linear']}

In [29]:
%%time

from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
clf = svm.SVC()
clf.fit(iris.data, iris.target)

CPU times: user 6.96 ms, sys: 2.93 ms, total: 9.89 ms
Wall time: 7.98 ms


SVC()

In [30]:
%%time

from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
parameters = search_grid
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(iris.data, iris.target)
print(clf.best_params_)

{'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
CPU times: user 475 ms, sys: 9.98 ms, total: 485 ms
Wall time: 515 ms


In [31]:
clf.best_params_

{'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}

## 2. Random Search

In [32]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
iris = load_iris()
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
                               random_state=0)
distributions = dict(C=uniform(loc=0, scale=4),
                     penalty=['l2', 'l1'])
clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(iris.data, iris.target)
search.best_params_

{'C': 2.195254015709299, 'penalty': 'l1'}

## 3. Bayesian Optimization

In [33]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 5.3 MB/s ta 0:00:011
[?25hCollecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [34]:
%%time

from skopt import BayesSearchCV
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X, y = load_digits(n_class=10, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=.25, random_state=0)

# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'degree': (1, 8),  # integer valued parameter
        'kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
    },
    n_iter=32,
    cv=3
)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))

val. score: 0.985894580549369
test score: 0.9822222222222222
CPU times: user 1min 38s, sys: 11 s, total: 1min 49s
Wall time: 1min 17s
