## 교차검증 연습

### 과정
1. 사전처리 + 분석
2. all_estimators()
3. 교차검증
4. 튜닝(hyper-parameter)

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()

In [5]:
# bunch 타입 -> 데이터 뽑아오기
data = iris['data']
target = iris['target']
featurename = iris['feature_names']
classname = iris['target_names']

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### 모델 생성

In [8]:
# 5개 모델에 대한 정확도(accuracy)
lg = LogisticRegression(max_iter=1000)
cross_val_score(lg, data, target)

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [16]:
result = cross_validate(lg, data, target, return_train_score=True, cv=7)
result

{'fit_time': array([0.02703285, 0.02682948, 0.02290511, 0.02953291, 0.02699709,
        0.0329113 , 0.02691913]),
 'score_time': array([0.00099778, 0.0009973 , 0.00099707, 0.        , 0.        ,
        0.00042248, 0.00099707]),
 'test_score': array([0.95454545, 1.        , 0.90909091, 0.95238095, 0.95238095,
        1.        , 1.        ]),
 'train_score': array([0.96875   , 0.96875   , 0.9765625 , 0.97674419, 0.99224806,
        0.96899225, 0.97674419])}

In [17]:
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.027033,0.000998,0.954545,0.96875
1,0.026829,0.000997,1.0,0.96875
2,0.022905,0.000997,0.909091,0.976562
3,0.029533,0.0,0.952381,0.976744
4,0.026997,0.0,0.952381,0.992248
5,0.032911,0.000422,1.0,0.968992
6,0.026919,0.000997,1.0,0.976744


### splitter 객체 생성 - KFold()
- 교차 검증 상세옵션 KFold, StratifiedKFold를 써준다.
- 만든 객체는 cv에 담아준다.

In [23]:
from sklearn.model_selection import KFold, StratifiedKFold
kSplitter = KFold(n_splits=7, shuffle=True)
sSplitter = StratifiedKFold(n_splits=7, shuffle=True)
result = cross_validate(lg, data, target, return_train_score=True, cv=kSplitter)
result

{'fit_time': array([0.0229404 , 0.01708078, 0.02022052, 0.0225122 , 0.01605773,
        0.01788092, 0.02033377]),
 'score_time': array([0.        , 0.        , 0.0009973 , 0.        , 0.0009973 ,
        0.00099659, 0.00099659]),
 'test_score': array([1.        , 0.95454545, 0.90909091, 0.95238095, 1.        ,
        0.95238095, 0.95238095]),
 'train_score': array([0.9765625 , 0.9765625 , 0.9765625 , 0.97674419, 0.96899225,
        0.97674419, 0.98449612])}

In [24]:
result2 = cross_validate(lg, data, target, return_train_score=True, cv=sSplitter)
result2

{'fit_time': array([0.03091669, 0.02096868, 0.02732682, 0.02994657, 0.02394581,
        0.02171326, 0.01853251]),
 'score_time': array([0.00097775, 0.00099707, 0.        , 0.0010159 , 0.00099802,
        0.00023508, 0.        ]),
 'test_score': array([0.95454545, 0.95454545, 0.95454545, 0.9047619 , 1.        ,
        0.95238095, 1.        ]),
 'train_score': array([0.984375  , 0.96875   , 0.984375  , 0.99224806, 0.96899225,
        0.96899225, 0.96124031])}

### 튜닝
- 모델 정확도(accuracy) 높이기 위한 과정들 진행
- 데이터 정제, 여러가지 모델 테스트
- 모델 세부 튜닝, 하이퍼파라미터 변경하며 모델 테스트 진행
- GridSearchCV, RandomizedSearchCV

#### 선형회귀에선 건드릴 수 있는 하이퍼파라미터가 없다.
- 대신 사용하는게 ridge, lasso를 사용
- 그리드서치를 사용해서, 교차검증 + 하이퍼파라미터를 __최적화__ 하자.

In [33]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [39]:
print(data.shape)
print(target.shape)

(506, 13)
(506,)


In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
lr = LinearRegression()
lr.fit(data, target)

In [43]:
cross_validate(lr, data, target, return_train_score=True)

{'fit_time': array([0.00199223, 0.00199533, 0.00199533, 0.00198269, 0.00199389]),
 'score_time': array([0.00099707, 0.        , 0.00042439, 0.0009973 , 0.0009973 ]),
 'test_score': array([ 0.63919994,  0.71386698,  0.58702344,  0.07923081, -0.25294154]),
 'train_score': array([0.74652533, 0.72763185, 0.69498059, 0.84181027, 0.73545537])}

### 선형회귀 하이퍼파라미터 알파 값 건드려보기

In [None]:
from sklearn.model_selection import train_test_split
train_data, test_data, train_target, test_target = train_test_split(data, target, random_state=42)

In [73]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
st.fit(train_data)
st.fit(test_data)
st.transform(train_data)
st.transform(test_data)

array([[-0.42099129, -0.44496424, -1.08540424, ..., -1.41458158,
         0.44419754, -0.56723075],
       [-0.42475396,  1.22324119, -0.72648881, ..., -0.8318639 ,
         0.45799486, -1.34274421],
       [-0.41950497, -0.44496424,  2.51743753, ...,  0.6249303 ,
         0.39107788,  0.70371057],
       ...,
       [-0.41569758, -0.44496424, -0.6504474 , ..., -0.6570486 ,
         0.32465365, -1.02184209],
       [-0.39337713, -0.44496424, -0.57896848, ...,  0.33357146,
         0.45799486, -0.75583112],
       [-0.42877215,  3.0999723 , -1.07019596, ..., -0.6570486 ,
         0.413942  , -0.94443149]])

In [79]:
from sklearn.linear_model import Ridge
s = []
for alpha in [0.001, 0.01, 0.1, 1, 10]:
    ridge = Ridge(alpha=alpha).fit(train_data, train_target)
    result = ridge.score(train_data, train_target)
    result2 = ridge.score(test_data, test_target)
    coef = ridge.coef_; intercept = ridge.intercept_
    s.append([alpha, result, result2, coef, intercept])
s = pd.DataFrame(s, columns=['alpha', 'train', 'test', 'coef', 'intercept'])

In [86]:
s

Unnamed: 0,alpha,train,test,coef,intercept
0,0.001,0.748087,0.684421,"[-0.12831408550662934, 0.02955503453367559, 0....",29.826976
1,0.01,0.748087,0.684366,"[-0.12823778615255113, 0.029584135007629977, 0...",29.742722
2,0.1,0.74803,0.683805,"[-0.12753601596785039, 0.029853781509412102, 0...",28.967772
3,1.0,0.746116,0.678975,"[-0.12383038776770779, 0.03139178197340617, 0....",24.87837
4,10.0,0.739824,0.672424,"[-0.12137452683817439, 0.03421897073039659, -0...",22.652201


In [92]:
from sklearn.linear_model import Ridge
s = []
for solver in ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']:
    ridge = Ridge(solver=solver).fit(train_data, train_target)
    result = ridge.score(train_data, train_target)
    result2 = ridge.score(test_data, test_target)
    coef = ridge.coef_; intercept = ridge.intercept_
    s.append([solver, result, result2, coef, intercept])
s = pd.DataFrame(s, columns=['solver', 'train', 'test', 'coef', 'intercept'])

In [93]:
s

Unnamed: 0,solver,train,test,coef,intercept
0,auto,0.746116,0.678975,"[-0.12383038776770779, 0.03139178197340617, 0....",24.87837
1,svd,0.746116,0.678975,"[-0.12383038776770516, 0.03139178197340703, 0....",24.87837
2,cholesky,0.746116,0.678975,"[-0.12383038776770779, 0.03139178197340617, 0....",24.87837
3,lsqr,0.718491,0.668884,"[-0.14015221595819927, 0.06576558766086438, -0...",35.899991
4,sparse_cg,0.737208,0.661844,"[-0.1223784324572163, 0.03357271884331846, -0....",18.414456
5,sag,0.707792,0.671634,"[-0.12847464771203548, 0.039905841513960205, -...",42.797193
6,saga,0.693134,0.661319,"[-0.12460186949066726, 0.03559960436516488, -0...",45.53791


### Support Vector Machine
- Large Margin이 목적 
- c : 클수록 학습오류 허용 x, 마진 좁아짐, 과대적합 발생
- 작을수록 학습오류 허용 o, 마진 넓어짐, 과소적합 발생
- kernel : 커널 함수 설정 ('linear', 'sigmoid', 'poly', 'rbf')
- gamma : 하나의 데이터의 영향력 정도 (굴곡 곡선 정도)
    - 커질수록 결정 경계, 곡률 커지며 과대적합
    - 작을수록 결정 경계, 곡률 작아지며 과소적합
- degree : poly 커널 시 곡선 경계 각도 설정 (커질수록 과대적합 가능성)
- coef0 : poly, sigmoid 커널에서 다항식 차수 조절

## 그리드 서치

In [None]:
from sklearn.svm import SVC

### 좋은 모델 찾기

In [63]:
def get_all_estimators(train_data, test_data, train_target, test_target, type_filter='classifier'):
    from sklearn.utils import all_estimators
    import warnings
    warnings.filterwarnings('ignore')
    models = all_estimators(type_filter=type_filter)
    scores = []
    for name, model in models:
        try: 
            md =model()
            # 학습
            md.fit(data, target)
            # 평가
            result = np.round(md.score(train_data, train_target),4)
            result2 = np.round(md.score(test_data, test_target),4)
            scores.append([name, result, result2])
        except:
            pass
    scores = pd.DataFrame(scores, columns = ['name', 'train', 'test'])
    return scores

In [64]:
scores = get_all_estimators(train_data, test_data, train_target, test_target, type_filter='regressor')
scores

Unnamed: 0,name,train,test
0,ARDRegression,0.7341,0.7266
1,AdaBoostRegressor,0.9026,0.8836
2,BaggingRegressor,0.9773,0.9663
3,BayesianRidge,0.7358,0.7075
4,DecisionTreeRegressor,1.0,1.0
5,DummyRegressor,-0.0016,-0.0179
6,ElasticNet,0.6836,0.688
7,ElasticNetCV,0.6682,0.6749
8,ExtraTreeRegressor,1.0,1.0
9,ExtraTreesRegressor,1.0,1.0


### 최적의 모델 소팅하기

In [65]:
scores.sort_values('train', ascending=False).iloc[:10,:]

Unnamed: 0,name,train,test
8,ExtraTreeRegressor,1.0,1.0
4,DecisionTreeRegressor,1.0,1.0
11,GaussianProcessRegressor,1.0,1.0
9,ExtraTreesRegressor,1.0,1.0
35,RadiusNeighborsRegressor,0.9994,0.9978
36,RandomForestRegressor,0.9832,0.9842
13,HistGradientBoostingRegressor,0.9816,0.9794
12,GradientBoostingRegressor,0.9776,0.9702
2,BaggingRegressor,0.9773,0.9663
1,AdaBoostRegressor,0.9026,0.8836


(506, 13)