# Model Selection 모듈

- 훈련/테스트 데이터로 분리하지 않고 머신러닝 수행

In [3]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [4]:
iris = load_iris()
dtc = DecisionTreeClassifier(random_state=2021)
dtc.fit(iris.data, iris.target)
dtc.score(iris.data, iris.target)

1.0

- cross-validate() : 교차 검증,cv=5가 디폴트

In [5]:
from sklearn.model_selection import cross_validate
dtc = DecisionTreeClassifier()
res = cross_validate(dtc, iris.data, iris.target)
res

{'fit_time': array([0.00097799, 0.00099993, 0.00099969, 0.        , 0.        ]),
 'score_time': array([0.        , 0.        , 0.        , 0.00099993, 0.        ]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])}

In [6]:
res['test_score']

array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])

- cross_val_score()

In [7]:
from sklearn.model_selection import cross_val_score
cross_val_score(dtc, iris.data, iris.target, scoring='accuracy', cv=5)


array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])

- GridSearchCV클래스 : 하이퍼 파라미터 튜닝 + 교차검증

In [8]:
# Train/Test dataset 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2021
)

In [9]:
# 분류기와 그에 해당하는 하이퍼 파라미터
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2021)
dtc.get_params()


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [10]:
params = {
    'max_depth' : [2,3,4,5,6],
    'min_samples_split' : [2,3,4]

}

In [11]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(
    dtc, param_grid=params, scoring='accuracy',cv=5
)

In [12]:
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [13]:
# 최적의 파라메터
grid_dt.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [14]:
# 최적의 파라메터로 학습한 분류기
best_clf = grid_dt.best_estimator_

In [15]:
# 최적의 파라메터로 학습한 분류기로 예측 및 평가
best_clf.score(X_test, y_test)

0.9

Support Vector Machine

In [16]:
from sklearn.svm import SVC
svc = SVC(random_state=2021)
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2021,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [17]:
params = {'C': [0.01, 0.1, 1, 10, 100]}

In [18]:
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=2021),
             param_grid={'C': [0.01, 0.1, 1, 10, 100]}, scoring='accuracy')

In [19]:
grid_sv.best_params_

{'C': 10}

In [20]:
params = {'C': [3, 6, 10, 15, 20]}
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 3}

In [21]:
params = {'C': [2, 3, 4, 5]}
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 4}

In [22]:
best_svc = grid_sv.best_estimator_
best_svc.score(X_test, y_test)

1.0