# Chapter12 모델 선택

## 12.0 소개

## 12.1 완전 탐색을 사용해 최선의 모델 선택하기

In [1]:
# 사이킷런의 GridSearchCV 활용
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

# 데이터 로드
iris = datasets.load_iris()
features = iris.data
target = iris.target

# 로지스틱 회귀 모델 제작
logistic = linear_model.LogisticRegression()

# 패널티(penalty) 하이퍼파라미터 값의 후보를 만든다.
penalty = ['l1', 'l2']

# 규제 하이퍼파라미터 값의 후보 범위 제작
C = np.logspace(0, 4, 10)

# 하이퍼파라미터 후보 딕셔너리 값 제작
hyperparameters = dict(C=C, penalty=penalty)

# 그리드 서치 객체 제작
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# 그리드 서치 수행
best_model = gridsearch.fit(features, target)

In [2]:
np.logspace(0, 4, 10)

array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04])

In [3]:
# 최선의 하이퍼파라미터 확인
print('가장 좋은 패널티:', best_model.best_estimator_.get_params()['penalty'])
print('가장 좋은 C값:', best_model.best_estimator_.get_params()['C'])


가장 좋은 패널티: l1
가장 좋은 C값: 7.742636826811269


* GridSearchCV : 최선의 하이퍼파라미터 확인한 후(교차검증에서 폴드 하나 제거하는 대신), 전체 데이터셋에서 최선의 하이퍼파라미터를 사용하여 모델 다시 훈련한다.

In [5]:
# 타깃 벡터 예측
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## 12.2 랜덤 탐색을 사용해 최선의 모델 선택하기
* 사이킷런 RandomizedSearchCV

In [2]:
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

# 데이터 로드
iris = datasets.load_iris()
features = iris.data
target = iris.target

# 로지스틱
logistic = linear_model.LogisticRegression()

# Camdidate Regularization penalty hyperparameter values
penalty = ['l1','l2']

# Create distn of candidate regularization
C = uniform(loc=0, scale=4)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create randomized search
randomizedsearch = RandomizedSearchCV(
    logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0,
    n_jobs=-1
)

# Fit RS
best_model = randomizedsearch.fit(features, target)


In [3]:
# Define a uniform distn btw 0 and 4, sample 10 values
uniform(loc=0, scale=4).rvs(10)

array([0.51527429, 2.17259286, 2.43949273, 1.65416853, 0.3004436 ,
       2.92883275, 2.39942691, 0.32694419, 3.5724987 , 2.48569169])

In [4]:
# best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Best Penalty: l2
Best C: 3.730229437354635


In [5]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## 12.3 Selecting Best Models from Multiple Learning Algorithms

* Dictionary of candidate learning algorithms and their hyperparameters

In [7]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# set random seed
np.random.seed(0)

# load the data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])

# create dictionary with candidate learning algorithms and their hyperparameters
search_space = [{"classifier": [LogisticRegression()],
"classifier__penalty": ['l1', 'l2'],
"classifier__C": np.logspace(0,4,10)},
{"classifier":[RandomForestClassifier()],
"classifier__n_estimators": [10,100,1000],
"classifier__max_features":[1,2,3]}]

# Create grid search
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0)

# Fit grid search
best_model = gridsearch.fit(features, target)

In [8]:
# To view the best model
best_model.best_estimator_.get_params()['classifier']

LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Predict target vector
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## 12.4 Selecting Best Models when preprocessing
* Preprocessing step and parameters : make a pipeline including!

In [15]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Set random seed
np.random.seed(0)

# Load Data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# create a preprocessing object that includes StandardScaler features and PCA
preprocess = FeatureUnion([("std", StandardScaler()), ("pca", PCA())])

# Create a pipeline
pipe = Pipeline([("preprocess",preprocess), ("classifier", LogisticRegression())])

# Create space of candidate values
search_space = [{"preprocess__pca__n_components": [1,2,3],
"classifier__penalty": ["l1", "l2"],
"classifier__C": np.logspace(0,4,10)}]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(features, target)

In [19]:
best_model.best_estimator_.get_params()["classifier"]

LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
best_model.best_estimator_.get_params()["preprocess"]

FeatureUnion(n_jobs=None,
             transformer_list=[('std',
                                StandardScaler(copy=True, with_mean=True,
                                               with_std=True)),
                               ('pca',
                                PCA(copy=True, iterated_power='auto',
                                    n_components=2, random_state=None,
                                    svd_solver='auto', tol=0.0,
                                    whiten=False))],
             transformer_weights=None, verbose=False)

In [21]:
# See the best n_components
best_model.best_estimator_.get_params()["preprocess__pca__n_components"]

2

## 12.5 Speeding up model selection with parallelization
* n_jobs = -1

In [4]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

# Load Data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# create logistic regression
logistic = linear_model.LogisticRegression()

# range of candidate regularization penalty hyperparameter values
penalty = ["l1", "l2"]

# Create range of candidate values for C
C = np.logspace(0, 4, 1000)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# grid search
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=1)

# fit grid search
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 7330 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:   15.8s finished


## 12.6 Speeding Up Model Selection Using Algorithm-Specific Methods

In [5]:
# 사이킷런 활용하라!
from sklearn import linear_model, datasets

iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create cross-validated logistic regression
logit = linear_model.LogisticRegressionCV(Cs=100) # 사이킷런 자체 내에 있는 model-specific cross-validation hyperparameter tuning 활용

# Train model
logit.fit(features, target)

LogisticRegressionCV(Cs=100, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

* 단점 : LogisticRegressionCV는 오직 C 파라미터만 조정 가능하다.
    * penalty는 조정할 수 없다.

## 12. Evaluating Performance After Model Selection

* model selection을 통해 모델 성능을 평가하고 싶다.
* biased evaluation을 피하고자 nested cross-validation을 활용한다.

In [8]:
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV, cross_val_score

iris = datasets.load_iris()
features = iris.data
target = iris.target

logistic = linear_model.LogisticRegression()

# range of 20 candidate for C
C = np.logspace(0, 4, 20)

hyperparameters = dict(C=C)

# grid search
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=0)

# nested cross-validation and output the average score
cross_val_score(gridsearch, features, target).mean()



0.9534313725490197

In [9]:
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1)

In [10]:
# run the gridsearch and fitting and it is our inner cross-validation to find the best model
best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


* inner cross-validation : GridSearchCV object
* wrap in an outer cross-validation : cross_val_score
    * trained 20 candidate models * 5 times = 100 models
    * nest clf inside a new cross-validation(3 folds, default)

In [11]:
scores = cross_val_score(gridsearch, features, target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


* inner cross-validation trained 20 models * 5 times to find the best models
* The model evaluated by an outer 3-fold cv with total of 300 models trained