# 모델 검증 및 평가 (Validation & Evaluation metrics)

In [340]:
import warnings
warnings.filterwarnings(action='ignore')
import re
from datetime import datetime, date, time, timedelta
from dateutil.relativedelta import relativedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
plt.rcParams['font.family']= 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
# plt.rcParams['figure.figsize'] = [6.4, 4.8]

from sklearn.datasets import load_iris, load_wine
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 샘플데이터 로드

In [101]:
data = load_iris()
df = pd.DataFrame(data.data, columns=['sl', 'sw', 'pl', 'pw'])
df['target'] = data.target
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sl      150 non-null    float64
 1   sw      150 non-null    float64
 2   pl      150 non-null    float64
 3   pw      150 non-null    float64
 4   target  150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [102]:
df['target'].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

In [103]:
X = df.drop(['target'], axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc_scr = accuracy_score(y_test, y_pred)
acc_scr

(120, 4) (30, 4) (120,) (30,)


1.0

## 교차검증
KFold, StratifiedKFold, TimeSeriesSplit
* KFold - 회귀모델: 회귀는 골고루 개념이 없음
* StratifiedKFold - 분류모델: 골고루 섞어야 잘학습됨
* TimeSeriesSplit - 금융데이터 or 과거데이터로 미래예측

Fold(n_splits=5, *, shuffle=False, random_state=None)

### KFold

Fold(n_splits=5, *, shuffle=False, random_state=None)

In [339]:
kf = KFold(n_splits=3)
# kf.get_n_splits(X)

print(kf)
print('----------')

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

KFold(n_splits=3, random_state=None, shuffle=False)
----------
Fold 0:
  Train: index=[ 50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67
  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103
 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
 140 141 142 143 144 145 146 147 148 149]
  Test:  index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49]
Fold 1:
  Train: index=[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49 100 101 102 103
 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
 122 123 124 125 126 12

### StratifiedKFold

In [111]:
skf = StratifiedKFold(n_splits=3)
# kf.get_n_splits(X)

print(skf)
print('----------')

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")
    # print(f"  Train: index={train_index}")
    print(y.loc[test_index].value_counts().sort_index())
    print('----------')

StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
----------
Fold 0:
0    17
1    17
2    16
Name: target, dtype: int64
----------
Fold 1:
0    17
1    16
2    17
Name: target, dtype: int64
----------
Fold 2:
0    16
1    17
2    17
Name: target, dtype: int64
----------


<table border=1 width=800>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_006.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_007.png">
    </td>
</tr>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_008.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_009.png">
    </td>
</tr>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_010.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_011.png">
    </td>
</tr>
<tr>
    <td>
    <img width='300' height='300' src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_012.png">
    <img width='300' height='300'  src="https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_013.png">
    </td>
</tr>    
</table>

In [196]:
X = df.drop(['target'], axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc_scr = accuracy_score(y_test, y_pred)
acc_scr

(120, 4) (30, 4) (120,) (30,)


1.0

In [246]:
X = df.drop(['target'], axis=1)
y = df['target']

kf = KFold(n_splits=5)
acc_scr_list=[]
for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train = X.loc[train_index]
    X_test = X.loc[test_index]
    y_train = y.loc[train_index]
    y_test = y.loc[test_index]
    
    # print(y_test.value_counts().sort_index())
    
    model = DecisionTreeClassifier(random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc_scr = accuracy_score(y_test, y_pred)
    acc_scr_list.append(acc_scr)
    print(acc_scr)
    print('--==---=-=')
    
print(acc_scr_list)
# np.mean(acc_scr_list)

1.0
--==---=-=
0.9666666666666667
--==---=-=
0.8333333333333334
--==---=-=
0.9333333333333333
--==---=-=
0.8
--==---=-=
[1.0, 0.9666666666666667, 0.8333333333333334, 0.9333333333333333, 0.8]


In [245]:
X = df.drop(['target'], axis=1)
y = df['target']

skf = StratifiedKFold(n_splits=5)
acc_scr_list=[]
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train = X.loc[train_index]
    X_test = X.loc[test_index]
    y_train = y.loc[train_index]
    y_test = y.loc[test_index]
    
    # print(y_test.value_counts().sort_index())
    
    model = DecisionTreeClassifier(random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc_scr = accuracy_score(y_test, y_pred)
    acc_scr_list.append(acc_scr)
    # print(acc_scr)
    # print('--==---=-=')
    
print(acc_scr_list)
np.mean(acc_scr_list)

[0.9666666666666667, 0.9666666666666667, 0.9, 0.9666666666666667, 1.0]


0.9600000000000002

### cross_val_score test점수만 리스트로 리턴
<pre>
<b>sklearn.model_selection.cross_val_score(estimator,
    X, y=None, 
    scoring=None, # 점수방식
    cv=None, # 몇등분? 폴드
    fit_params=None, # 제일좋은거적용해줘?
</b>
n_jobs=None, verbose=0, groups=None, , pre_dispatch='2*n_jobs', error_score=nan)

In [309]:
X = df.drop(['target'], axis=1)
y = df['target']

cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=10)

array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
       0.86666667, 0.93333333, 1.        , 1.        , 1.        ])

In [303]:
X = df.drop(['target'], axis=1)
y = df['target']

res_list = cross_val_score(DecisionTreeClassifier(random_state=0), X, y, scoring='accuracy', cv=5)
print(res_list)
print(np.mean(res_list))

[0.96666667 0.96666667 0.9        0.96666667 1.        ]
0.9600000000000002


### cross_validate 이것저것 딕트로 리턴

점수: train_score, test_score

시간: fit_time, score_time

In [302]:
X = df.drop(['target'], axis=1)
y = df['target']

res_dict = cross_validate(DecisionTreeClassifier(random_state=0), X, y, scoring='accuracy', cv=5, return_train_score=True)
print(res_dict)
pd.DataFrame(res_dict)

{'fit_time': array([0.00400329, 0.00302005, 0.00199628, 0.0019989 , 0.00299239]), 'score_time': array([0.000983  , 0.00099897, 0.00099587, 0.00196505, 0.0010016 ]), 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ]), 'train_score': array([1., 1., 1., 1., 1.])}


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.004003,0.000983,0.966667,1.0
1,0.00302,0.000999,0.966667,1.0
2,0.001996,0.000996,0.9,1.0
3,0.001999,0.001965,0.966667,1.0
4,0.002992,0.001002,1.0,1.0


### GridSearchCV 하이퍼파리미터 튜닝 모델로 리턴
<pre>
<b>GridSearchCV(
    estimator, # 모델
    scoring=None, # 점수방식
    cv=None, # 몇등분? 폴드
    param_grid, # 파라미터딕트로
    refit=True, # 제일좋은모델이 알아서 적용되게끔 그리드서치CV를 쓰는이유
    n_jobs=None,
</b>verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)

호출변수.cv_results_ # 결과보기
호출변수.best_estimator_ # 최고모델뭐임
호출변수.best_params_ # 최고파라미터뭐임
호출변수.best_score_ # 최고점수뭐임
호출변수.predict(X_test)

In [278]:
param_ = {
    'min_samples_split':[1,2,3],
    'min_samples_leaf':[1,2,3]
}

model = DecisionTreeClassifier(random_state=0)
models = GridSearchCV(model, scoring='accuracy', cv=5, param_grid=param_)
models.fit(X_train, y_train)
print(models.best_params_)
print(models.best_score_)
print(models.best_estimator_)
y_pred = models.predict(X_test)
y_pred

{'min_samples_leaf': 2, 'min_samples_split': 1}
0.95
DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=1, random_state=0)


array([2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

### K-Fold 외 교차검증(Cross Validation)
<pre>
K-Fold : K개의 폴드로 나누어 cv횟수만큼 반복 학습

홀드아웃 (Holdout)    : 비복원추출 
부트스트랩(Bootstrap) : 복원추출(63.2%) / 추출되지않은 데이터는 test로

LOOCV(Leave One Out Cross Validation)   : test 폴드(1개데이터)
LpOCV(Leave p     Out Cross Validation) : test 폴드(p개데이터)

In [348]:
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

y = df['target']
X = df.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
model.predict(X_test)

array([0, 2, 1, 0, 1, 1, 0, 2, 1, 1, 2, 2, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 0, 0, 0])