In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import tqdm
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings 
warnings.filterwarnings(action="ignore")
from sklearn.model_selection import GridSearchCV # 그리드서치

In [5]:
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.3,
                                                    random_state=99
                                                                  )

In [6]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(105, 4)
(45, 4)
(105,)
(45,)


### KNN 모델에 그리드 서치 적용

In [11]:
param_knn = {'n_neighbors':range(1,10)}
grid_search_knn = GridSearchCV(KNeighborsClassifier(),
                  param_knn,
                  cv= 10
                          )
grid_search_knn.fit(X_train, y_train)
# GridSearchCV에서 fit 명령은 최적의 하이퍼파라미터를 찾는 것 뿐만 아니라 
# 교차검증까지 같이 수행하여 성능이 가장 좋은 하이퍼 파라미터로 새로운 모델을 만들어줌

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 10)})

In [12]:
print("최적 파라미터 값:", grid_search_knn.best_params_)
print("최고 교차 검증 점수:", grid_search_knn.best_score_)
print("최고 성능 모델:", grid_search_knn.best_estimator_)
# 최고성능모델은 어차피 grid_search_knn에 저장이 되어 있기 때문에 굳이 사용하지 않아도 됨

최적 파라미터 값: {'n_neighbors': 7}
최고 교차 검증 점수: 0.9809090909090908
최고 성능 모델: KNeighborsClassifier(n_neighbors=7)


In [14]:
grid_search_knn.predict(X_test)

array([2, 0, 1, 2, 0, 1, 1, 2, 0, 1, 2, 1, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 2, 0, 0, 2, 1, 2, 0, 2, 0, 2, 1, 0, 0, 2, 2, 1, 1, 1, 2, 2,
       1])

In [16]:
print("test 세트 점수:" ,grid_search_knn.score(X_test,y_test))

test 세트 점수: 0.9555555555555556


- 실습: 디시젼트리 모델의 최적 파라미터 찾기(max_depth, max_leaf_nodes, min_samples_leaf)

In [32]:
param_tree={"max_depth": range(1,10),
           "max_leaf_nodes": range(1,50,1),
            "min_samples_leaf": range(1,100,1)
           }

In [33]:
grid_search_tree = GridSearchCV(DecisionTreeClassifier(),
                                param_tree,
                                cv=10
                               )
grid_search_tree.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 10),
                         'max_leaf_nodes': range(1, 50),
                         'min_samples_leaf': range(1, 100)})

In [34]:
print("최적 파라미터 값:", grid_search_tree.best_params_)
print("최고 교차 검증 점수:", grid_search_tree.best_score_)
print("최고 성능 모델:", grid_search_tree.best_estimator_)

최적 파라미터 값: {'max_depth': 3, 'max_leaf_nodes': 4, 'min_samples_leaf': 5}
최고 교차 검증 점수: 0.9618181818181817
최고 성능 모델: DecisionTreeClassifier(max_depth=3, max_leaf_nodes=4, min_samples_leaf=5)


In [35]:
print("test 세트 점수:" ,grid_search_tree.score(X_test,y_test))

test 세트 점수: 0.9555555555555556
