<br>
<br>

# Cross-validation

In [1]:
from collections import Counter 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, model_selection, ensemble
from sklearn.metrics import accuracy_score, roc_curve, auc

In [2]:
# Load the breast-cancer dataset that ships with scikit-learn.
data = datasets.load_breast_cancer()

# Feature matrix and target vector reused by every section of the notebook.
x_data, y_data = data.data, data.target

In [25]:
def create_model():
    """Return a new, unfitted GradientBoostingClassifier with fixed hyperparameters.

    A fresh estimator is built on every call, so each split or fold can be
    trained from scratch. Because this is a tree-based model, standard or
    min-max feature scaling is not required.
    """
    hyperparams = dict(
        n_estimators=1000,    # number of boosted trees
        max_depth=4,          # depth limit of each individual tree
        min_samples_split=5,  # minimum samples a node needs before it may be split
        learning_rate=0.01,   # shrinkage applied to each tree's contribution
        random_state=42,      # fixed seed so repeated runs give the same model
    )
    return ensemble.GradientBoostingClassifier(**hyperparams)

<br>
<br>

## 1. Train-Test split → Model-training

In [4]:
from sklearn import model_selection

# Hold out 30% of the samples as a test set; random_state pins the shuffle.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_data, y_data, test_size=0.3, random_state=0
)

In [5]:
# Train a fresh model on the training split only.
model = create_model()
model.fit(x_train, y_train)

# predict_proba returns one column per class (ordered as model.classes_);
# column 1 is used as the score of label 1 for the ROC curve.
pred_test = model.predict_proba(x_test)
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=pred_test[:, 1])
roc_auc = auc(fpr, tpr)

print('AUC: ', roc_auc)
# accuracy_score is documented as (y_true, y_pred). Keyword arguments keep the
# roles explicit; the value is the same because accuracy is symmetric, but the
# swapped order would silently break asymmetric metrics such as precision/recall.
print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=model.predict(x_test)))

AUC:  0.9961787184009406
Accuracy:  0.9590643274853801


<br>
<br>

## 2. K-Fold CV

In [6]:
from sklearn import model_selection # also provides train_test_split, GridSearchCV, RandomizedSearchCV

# shuffle=False keeps the fold assignment deterministic so the scores can be
# compared with the cross_val_score results in section 5.
kf = model_selection.KFold(n_splits=10, shuffle=False)

In [7]:
# split() is lazy: it returns a generator, so nothing is computed until it is iterated.
kf.split(x_data, y_data) # Generate indices to split data into training and test set

<generator object _BaseKFold.split at 0x000001C6A33C8970>

In [8]:
# Each item produced by split() is a 2-tuple: (train indices, test indices).
for split_pair in kf.split(x_data, y_data):
    print(type(split_pair), len(split_pair))

<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2
<class 'tuple'> 2


In [18]:
# With 10 folds, roughly 90% of the rows end up in the training part of each split.
x_data.shape[0] * 0.9

512.1

In [21]:
# Train size, validation size and their sum (always the full dataset) for every fold.
for train_rows, valid_rows in kf.split(x_data, y_data):
    print(train_rows.shape, valid_rows.shape, len(train_rows) + len(valid_rows))


(512,) (57,) 569
(512,) (57,) 569
(512,) (57,) 569
(512,) (57,) 569
(512,) (57,) 569
(512,) (57,) 569
(512,) (57,) 569
(512,) (57,) 569
(512,) (57,) 569
(513,) (56,) 569


In [16]:
# i[0] # indices to split data into training and test set

In [20]:
# Toy 4x3 matrix used to demonstrate selecting rows with a list of integer positions.
example_matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# An empty index list, example_matrix[[]], selects no rows and gives an empty
# (0, 3) array. It is kept as a note rather than as the cell's last expression,
# which would hide the row selection below from the displayed output.
# example_matrix[ (1, 3) , : ]

# Rows 1 and 3, all columns. As the final expression this is what the cell displays.
example_matrix[[1, 3], :]

array([[ 4,  5,  6],
       [10, 11, 12]])

In [11]:
# The same list-of-positions indexing applied to the real feature matrix:
# picks rows 2, 5 and 10 with all of their columns.
train_index = [2, 5, 10]

x_data[train_index, :]

array([[1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e+01, 2.553e+01, 1.525e+02, 1.709e+03,
        1.444e-01, 4.245e-01, 4.504e-01, 2.430e-01, 3.613e-01, 8.758e-02],
       [1.245e+01, 1.570e+01, 8.257e+01, 4.771e+02, 1.278e-01, 1.700e-01,
        1.578e-01, 8.089e-02, 2.087e-01, 7.613e-02, 3.345e-01, 8.902e-01,
        2.217e+00, 2.719e+01, 7.510e-03, 3.345e-02, 3.672e-02, 1.137e-02,
        2.165e-02, 5.082e-03, 1.547e+01, 2.375e+01, 1.034e+02, 7.416e+02,
        1.791e-01, 5.249e-01, 5.355e-01, 1.741e-01, 3.985e-01, 1.244e-01],
       [1.602e+01, 2.324e+01, 1.027e+02, 7.978e+02, 8.206e-02, 6.669e-02,
        3.299e-02, 3.323e-02, 1.528e-01, 5.697e-02, 3.795e-01, 1.187e+00,
        2.466e+00, 4.051e+01, 4.029e-03, 9.269e-03, 1.101e-02, 7.591e-03,
        1.460e-02, 3.042e-03, 1.919e

In [22]:
# Show which labels land in each validation fold when plain K-Fold is used.
for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(x_data, y_data)):
    # Slice features and labels with the index arrays of this fold.
    train_data, valid_data = x_data[train_idx, :], x_data[valid_idx, :]
    train_label, valid_label = y_data[train_idx], y_data[valid_idx]

    label_counts = Counter(valid_label)
    print('[{} Fold] \n Selected validation data : \n {} \n {} \n'.format(fold_idx, valid_label, label_counts))

0
[0 Fold] 
 Selected validation data : 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0] 
 Counter({0: 46, 1: 11}) 

1
[1 Fold] 
 Selected validation data : 
 [0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1
 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1] 
 Counter({1: 35, 0: 22}) 

2
[2 Fold] 
 Selected validation data : 
 [1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1] 
 Counter({1: 36, 0: 21}) 

3
[3 Fold] 
 Selected validation data : 
 [0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0
 1 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 1] 
 Counter({1: 29, 0: 28}) 

4
[4 Fold] 
 Selected validation data : 
 [1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1] 
 Counter({1: 29, 0: 28}) 

5
[5 Fold] 
 Selected validation data : 
 [1 1 1 1 1 1 1 1 1 1 1 

<br>
<br>

## 3. Stratified K-Fold CV

In [23]:
from sklearn import model_selection 

# StratifiedKFold builds folds that keep the class proportions of y roughly
# equal across folds; shuffle=False keeps the assignment deterministic.
stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)

In [24]:
# Same inspection as for K-Fold, now with stratified folds: print the labels of
# each validation fold together with their per-class counts.
fold_splits = stratified_kf.split(x_data, y_data)
for fold_idx, (train_idx, valid_idx) in enumerate(fold_splits):
    train_data = x_data[train_idx, :]
    train_label = y_data[train_idx]
    valid_data = x_data[valid_idx, :]
    valid_label = y_data[valid_idx]

    print('[{} Fold] \n Selected validation data : \n {} \n {} \n'.format(fold_idx, valid_label, Counter(valid_label)))

[0 Fold] 
 Selected validation data : 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] 
 Counter({1: 35, 0: 22}) 

[1 Fold] 
 Selected validation data : 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] 
 Counter({1: 35, 0: 22}) 

[2 Fold] 
 Selected validation data : 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] 
 Counter({1: 36, 0: 21}) 

[3 Fold] 
 Selected validation data : 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] 
 Counter({1: 36, 0: 21}) 

[4 Fold] 
 Selected validation data : 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] 
 Counter({1: 36, 0: 21}) 

[5 Fold] 
 Selected validation data : 
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

<br>
<br>

## 4. Stratified K-Fold CV → Model-training

In [11]:
# Manual stratified 10-fold cross-validation: train a fresh model per fold and
# collect the validation accuracy (in percent) of each fold.
valid_scores = []

stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)

for fold_idx, (train_idx, valid_idx) in enumerate(stratified_kf.split(x_data, y_data)) :
    train_data, train_label = x_data[train_idx, :], y_data[train_idx]
    valid_data, valid_label = x_data[valid_idx, :], y_data[valid_idx]
    
    # A new estimator per fold, so nothing learned on one fold leaks into the next.
    model = create_model()
    model.fit(train_data, train_label)
    
    # accuracy_score is documented as (y_true, y_pred); keyword arguments keep
    # the roles explicit (the value itself is unchanged for accuracy).
    train_acc = accuracy_score(y_true=train_label, y_pred=model.predict(train_data)) * 100
    valid_acc = accuracy_score(y_true=valid_label, y_pred=model.predict(valid_data)) * 100
    print('[{} Fold] \n Accuracy-Training : {:.2f}% \n Accuracy-Validation : {:.2f}% \n'.format(fold_idx, 
                                                                                                train_acc, 
                                                                                                valid_acc))
    valid_scores.append(valid_acc)

# The cross-validation score is the mean of the per-fold validation accuracies.
print('Cross-Validation Score : {:.2f}%'.format(np.mean(valid_scores)))

[0 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 96.49% 

[1 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 92.98% 

[2 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 92.98% 

[3 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 92.98% 

[4 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 100.00% 

[5 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 96.49% 

[6 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 98.25% 

[7 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 98.25% 

[8 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 98.25% 

[9 Fold] 
 Accuracy-Training : 100.00% 
 Accuracy-Validation : 98.21% 

Cross-Validation Score : 96.49%


## 5-1 활용

<br>
<br>

## 5. cross_val_score → Model-training

### 1) 'cv' parameter <- int

In [16]:
from sklearn import model_selection

model = create_model()

# When cv is an integer, the estimator is a classifier and y_data is binary or
# multiclass, cross_val_score applies Stratified K-Fold automatically.
valid_scores = model_selection.cross_val_score(
    model, x_data, y_data,
    cv=10,
    verbose=1,
    n_jobs=-1,  # number of parallel jobs; -1 means use all processors
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   27.6s finished


In [17]:
# One score per fold, returned by cross_val_score as fractions (not percentages).
valid_scores

array([0.96491228, 0.92982456, 0.92982456, 0.92982456, 1.        ,
       0.96491228, 0.98245614, 0.98245614, 0.98245614, 0.98214286])

In [18]:
# Scale the fractional scores to percent before averaging for the report.
print('Cross-Validation Score : {:.2f}%'.format(np.mean(valid_scores * 100)))

Cross-Validation Score : 96.49%


### 2) 'cv' parameter <- KFold or StratifiedKFold

In [26]:
# Plain K-Fold: an explicit splitter object is handed to cross_val_score.
kf = model_selection.KFold(n_splits=10, shuffle=False)
model = create_model()

valid_scores_kf = model_selection.cross_val_score(
    model, x_data, y_data, cv=kf, verbose=1, n_jobs=-1
)

# Stratified K-Fold: same call, only the splitter differs.
stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=False)
model = create_model()

valid_scores_s_kf = model_selection.cross_val_score(
    model, x_data, y_data, cv=stratified_kf, verbose=1, n_jobs=-1
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    7.7s remaining:    5.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    5.2s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished


In [27]:
# Mean accuracy in percent for each splitting strategy, printed for comparison.
print('Cross-Validation Score (K-Fold): {:.2f}%'.format(np.mean(valid_scores_kf * 100)))
print('Cross-Validation Score (Stratified K-Fold): {:.2f}%'.format(np.mean(valid_scores_s_kf * 100)))

Cross-Validation Score (K-Fold): 96.32%
Cross-Validation Score (Stratified K-Fold): 96.49%
