# 과적합, 편향-분산 트레이드오프, 교차 검증

In [1]:
# 관련 라이브러리 및 모듈 Import
# import
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import SGDRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

### 데이터 준비 및 분할

In [2]:
# Load the training data from CSV.
# NOTE(review): the path is relative, so it resolves against the notebook's
# working directory — confirm the expected launch location.
train = pd.read_csv('../data/train3.csv')

In [3]:
# Drop the first column (the "Unnamed: 0" index column written into the CSV).
# NOTE: this drop is positional, so re-running this cell removes one more
# column each time; reload the CSV before re-running.
train = train.iloc[:, 1:]
# Round the target to two decimal places.
# Bracket indexing is used instead of `train.match = ...`: attribute-style
# assignment would silently create a plain attribute (not a column) if the
# column were missing, and bracket access matches how 'match' is read in the
# X/y split cell.
train['match'] = train['match'].round(2)

In [4]:
# Separate the target from the predictors.
# y: dependent variable (the 'match' column).
y = train['match']
# X: independent variables (every column except 'match').
X = train.drop(columns=['match'])

In [5]:
# Convert both pandas objects to NumPy arrays; the cross-validation loop
# later selects rows with integer index arrays (X[train_idx], y[train_idx]).
X, y = np.array(X), np.array(y)
# Bare expression: displayed as the cell output.
X, y

(array([[  1.        ,   0.        ,   6.        , ...,  10.        ,
          10.        ,  10.        ],
        [  2.        ,   0.        ,   7.        , ...,   7.25      ,
           5.875     ,   7.        ],
        [  3.        ,   0.        ,   8.        , ...,   6.75      ,
           5.5       ,   5.5       ],
        ...,
        [549.        ,   1.        ,   7.        , ...,   6.5       ,
           5.66666667,   5.66666667],
        [550.        ,   1.        ,   7.        , ...,   7.125     ,
           5.125     ,   5.625     ],
        [552.        ,   1.        ,   8.        , ...,   6.71428571,
           6.14285714,   6.85714286]]),
 array([100.  ,  25.  ,   0.  ,  25.  ,  25.  ,   0.  ,   0.  ,  75.  ,
         50.  ,   0.  ,  16.67,  33.33,  66.67,  20.  ,   0.  ,  16.67,
         83.33,  16.67,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
         33.33,   0.  ,  10.  ,   0.  ,   0.  ,  50.  ,  30.  ,  10.  ,
         20.  , 100.  ,   0.  ,   0.  ,   0.  ,  

# 교차검증

## 1. 사이킷런의 model_selection의 KFold()를 사용하는 경우(For loop 사용)

#### 폴드를 분리할 객체 생성

In [6]:
from sklearn.model_selection import KFold

#### 데이터를 준비하고 회귀 모형 객체를 생성

In [7]:
# Linear regression estimator with default settings; this single instance is
# refit on each fold's training split in the cross-validation loop.
lr = LinearRegression()

In [8]:
# Peek at the first five feature rows and their corresponding targets.
X[:5], y[:5]

(array([[ 1.   ,  0.   ,  6.   ,  8.   ,  8.   ,  8.   ,  7.   , 15.   ,
         20.   , 20.   , 15.   , 15.   , 15.   , 10.   , 10.   , 10.   ,
         10.   , 10.   , 10.   , 10.   ],
        [ 2.   ,  0.   ,  7.   ,  5.   ,  8.   , 10.   ,  3.   , 45.   ,
          5.   , 25.   , 20.   ,  0.   ,  5.   ,  7.125,  6.375,  7.375,
          6.875,  7.25 ,  5.875,  7.   ],
        [ 3.   ,  0.   ,  8.   ,  9.   ,  9.   ,  8.   ,  8.   , 35.   ,
         10.   , 35.   , 10.   , 10.   ,  0.   ,  6.125,  6.375,  6.875,
          5.75 ,  6.75 ,  5.5  ,  5.5  ],
        [ 4.   ,  0.   ,  7.   ,  8.   ,  7.   ,  9.   ,  8.   , 20.   ,
         20.   , 20.   , 20.   , 10.   , 10.   ,  6.5  ,  6.625,  7.25 ,
          7.125,  7.375,  6.625,  6.75 ],
        [ 5.   ,  0.   ,  6.   ,  3.   , 10.   ,  6.   ,  8.   , 20.   ,
          5.   , 25.   , 25.   , 10.   , 15.   ,  4.75 ,  7.25 ,  7.125,
          6.875,  7.375,  5.875,  5.75 ]]),
 array([100.,  25.,   0.,  25.,  25.]))

In [9]:
# Number of samples (rows) in the feature matrix.
len(X)

474

#### split() 함수를 호출하여 폴드별로 분리될 행 인덱스 세트를 구함

In [10]:
# 5-fold splitter. Shuffling is left at its default (off), so each test fold
# is a consecutive run of rows in the original row order.
kfold = KFold(n_splits=5)
# Materialise every fold's (train_indices, test_indices) pair for inspection.
list(kfold.split(X))

[(array([ 95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
         108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
         121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
         134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
         147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
         160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
         173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
         186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198,
         199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211,
         212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
         225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
         238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250,
         251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263,
         264, 265, 266, 267, 268, 269,

In [11]:
from sklearn.metrics import r2_score

# Out-of-fold R^2 scores, one entry per fold, in fold order.
r2_scores = []

for train_idx, test_idx in kfold.split(X):
    # Slice features and targets with the same index arrays so rows stay aligned.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # fit() returns the fitted estimator, so `reg` refers to the refit `lr`.
    reg = lr.fit(X_train, y_train)

    # Negative predictions are replaced by 0 before scoring
    # (presumably because the target is a non-negative percentage — confirm).
    y_pred = reg.predict(X_test)
    y_pred = np.where(y_pred < 0, 0., y_pred)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

In [12]:
# Per-fold R^2 scores collected by the cross-validation loop.
r2_scores

[0.03651506262494997,
 0.17231858760292884,
 0.10998600726579688,
 0.07105936956761927,
 -0.007269455119743196]

In [13]:
import numpy as np

# One line per fold, numbered from 1, with R^2 shown to three decimals.
for fold_no, score in enumerate(r2_scores, start=1):
    print(fold_no, f'- R2 = {score:.3f}')

# Mean R^2 across all folds, rounded to three decimals.
mean_r2 = np.round(np.mean(r2_scores), 3)
print(f'average R2 = {mean_r2}')

1 - R2 = 0.037
2 - R2 = 0.172
3 - R2 = 0.110
4 - R2 = 0.071
5 - R2 = -0.007
average R2 = 0.077
