# 과적합, 분산 편향 트레이드오프, 교차 검증

In [1]:
#필요한 라이브러리 임포트

import numpy as np
import pandas as pd

# 교차검증

## 1. 사이킷런의 model_selection의 KFold()를 사용하는 경우(For loop 사용)

#### 폴드를 분리할 객체 생성

In [7]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=False)

#### 데이터를 준비하고 회귀 모형 객체를 생성

In [4]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

diab = load_diabetes()

X = diab.data
y = diab.target

In [5]:
X[:5], y[:5]

(array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
         -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
         -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, -0.00567042, -0.04559945,
         -0.03419447, -0.03235593, -0.00259226,  0.00286131, -0.02593034],
        [-0.08906294, -0.04464164, -0.01159501, -0.03665608,  0.01219057,
          0.02499059, -0.03603757,  0.03430886,  0.02268774, -0.00936191],
        [ 0.00538306, -0.04464164, -0.03638469,  0.02187239,  0.00393485,
          0.01559614,  0.00814208, -0.00259226, -0.03198764, -0.04664087]]),
 array([151.,  75., 141., 206., 135.]))

In [6]:
len(X)

442

#### split()함수를 호출하여 폴드별로 분리될 행 인덱스 세트를 구함

In [9]:
list(kf.split(X))

[(array([ 89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
         102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
         115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
         128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
         141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
         154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
         167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
         180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
         193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205,
         206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
         219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231,
         232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244,
         245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257,
         258, 259, 260, 261, 262, 263,

In [10]:
from sklearn.metrics import r2_score

r2_scores = []
lr = LinearRegression()

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X[train_idx], X[test_idx] # tuple unpacking
    y_train, y_test = y[train_idx], y[test_idx]

    reg = lr.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

In [11]:
r2_scores # 이 다섯개의 평균이 일반오차이다.

[0.4295561538258379,
 0.5225993866099365,
 0.4826805413452824,
 0.42649776111040205,
 0.5502483366517519]

In [16]:
import numpy as np

for i, r2 in enumerate(r2_scores):
    print(i+1, f'-> R2 = {r2:.3f}')

print(f'average R2 = {np.round(np.mean(r2_scores),3)}')

1 -> R2 = 0.430
2 -> R2 = 0.523
3 -> R2 = 0.483
4 -> R2 = 0.426
5 -> R2 = 0.550
average R2 = 0.482


## 2. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 without shuffling:
- for loop 필요 없음

In [17]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

diab = load_diabetes()
X = diab.data
y = diab.target

lr = LinearRegression()

cross_val_score(lr, X, y, cv=5)

array([0.42955615, 0.52259939, 0.48268054, 0.42649776, 0.55024834])

In [19]:
np.round(np.mean(cross_val_score(lr, X, y, cv=5)), 3)

0.482

In [20]:
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

iris_df = pd.DataFrame(X, columns=iris.feature_names)
iris_df['kind'] = iris.target
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),kind
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## 3. 사이킷런의 cross_val_score 함수를 사용하여 K폴드 교차 검증 수행 with shuffling

In [24]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

diab = load_diabetes()
X = diab.data
y = diab.target

kf = KFold(5, shuffle=True, random_state=29) # stratify=True로 하면 같은 비율로 구성되게 shuffle 한다.
lr = LinearRegression()

np.round(np.mean(cross_val_score(lr, X, y, cv=kf)),3)

0.489

# 과제 - 보스턴 집값 데이터를 활용하여 다음 두 가지 과제를 수행하고 각각의  score 를 확인하세요.( 폴드 수 : 5, shuffle=False)

1. KFold 객체를 활용하되  인덱스 집합을 리턴받아 학습을 수행하여 r2 값을 구하시오 ( 소숫점 이하 3자리)
2. cross_val_score() 를 활용하여 r2값을 구하시오(소숫점 이하 세자리)

R2: [ 0.639 0.714 0.587 0.079 -0.253] 
average R2: 0.353

In [1]:
from sklearn import datasets

dir(datasets)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_arff_parser',
 '_base',
 '_california_housing',
 '_covtype',
 '_kddcup99',
 '_lfw',
 '_olivetti_faces',
 '_openml',
 '_rcv1',
 '_samples_generator',
 '_species_distributions',
 '_svmlight_format_fast',
 '_svmlight_format_io',
 '_twenty_newsgroups',
 'clear_data_home',
 'dump_svmlight_file',
 'fetch_20newsgroups',
 'fetch_20newsgroups_vectorized',
 'fetch_california_housing',
 'fetch_covtype',
 'fetch_kddcup99',
 'fetch_lfw_pairs',
 'fetch_lfw_people',
 'fetch_olivetti_faces',
 'fetch_openml',
 'fetch_rcv1',
 'fetch_species_distributions',
 'get_data_home',
 'load_breast_cancer',
 'load_diabetes',
 'load_digits',
 'load_files',
 'load_iris',
 'load_linnerud',
 'load_sample_image',
 'load_sample_images',
 'load_svmlight_file',
 'load_svmlight_files',
 'load_wine',
 'make_biclusters',
 'make_blobs',
 'make_checkerboard',
 'make_circl

In [4]:
from sklearn.datasets import fetch_openml

boston = fetch_openml('boston')
boston.keys()

  warn(
  warn(


dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [5]:
boston.target

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64

In [11]:
boston.data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48


##### 1. 인덱스 집합을 리턴

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import numpy as np

X = boston.data.values
y = boston.target
lr = LinearRegression()
r2_scores = []

kfold_object = KFold(n_splits=5, shuffle=False)
for train_idx, test_idx in kfold_object.split(X):
    X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    
    reg = lr.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
print(f'r2 scores are {np.round(r2_scores, 3)} and average r2 score is {np.round(np.mean(r2_scores),3)}')

r2 scores are [ 0.639  0.714  0.587  0.079 -0.253] and average r2 score is 0.353


##### 2. cross_val_score() 를 활용

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kfold_object = KFold(n_splits=5, shuffle=True, random_state=42)

cross_val_score(lr, X, y, cv=kfold_object)

array([0.66875949, 0.7342547 , 0.70986601, 0.77595168, 0.68727731])

In [17]:
np.round(np.mean(cross_val_score(lr, X, y, cv=kfold_object)),3)

0.715