# iris dataset 분류
- model: KNN 사용
- GridSearchCV를 이용해서 최적의 K값(`n_neighbors`)과 거리 지표(`p`)를 찾는다.

##### 데이터셋 로드 및 분리

In [67]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline

# Load the iris data as a (features, labels) pair of arrays rather than a
# Bunch object; the split shapes shown below imply 150 samples x 4 features.
X, y = load_iris(return_X_y=True)

In [68]:
# Hold out 20% of the samples for the final evaluation. Stratifying on y
# keeps the class proportions equal in both parts, and the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=930,
    stratify=y,
)

# Display the shapes of the four resulting arrays as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

##### Pipeline 생성

In [70]:
# Steps run in order: standardize the features, then classify with KNN.
# Keeping the scaler inside the pipeline means it is fitted together with
# the classifier every time the pipeline itself is fitted.
order = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps=order, verbose=True)

# Alternative: make_pipeline builds the same chain but names the steps after
# the lowercased class names, so grid keys would have to use the
# 'kneighborsclassifier__' prefix instead of 'knn__'.
# pipeline2 = make_pipeline(StandardScaler(), KNeighborsClassifier())


##### GridSearchCV 생성 및 학습

In [83]:
# Search space, addressed as '<step name>__<parameter>':
#   n_neighbors: K from 1 to 19
#   p: Minkowski power (1 = Manhattan distance, 2 = Euclidean distance)
# 19 x 2 = 38 candidates, each scored with 4-fold cross-validation.
param = {
    "knn__n_neighbors": range(1, 20),
    "knn__p": [1, 2],
}

gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,  # run the fits in parallel on all available cores
)

# Only the training split is used for the search; the test split stays unseen.
gs.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing knn, total=   0.0s


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('knn', KNeighborsClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'knn__n_neighbors': range(1, 20), 'knn__p': [1, 2]},
             scoring='accuracy')

##### 결과 확인

In [84]:
# One row per parameter combination, with per-fold and aggregated scores.
result_df = pd.DataFrame(gs.cv_results_)

# Show the five best-ranked combinations (rank 1 = highest mean CV accuracy).
result_df.sort_values(by='rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,param_knn__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
10,0.0,0.0,0.0,0.0,6,1,"{'knn__n_neighbors': 6, 'knn__p': 1}",0.966667,1.0,0.966667,0.966667,0.975,0.014434,1
15,0.0,0.0,0.0,0.0,8,2,"{'knn__n_neighbors': 8, 'knn__p': 2}",0.966667,0.966667,0.966667,1.0,0.975,0.014434,2
13,0.0,0.0,0.0,0.0,7,2,"{'knn__n_neighbors': 7, 'knn__p': 2}",0.966667,0.966667,0.933333,1.0,0.966667,0.02357,3
12,0.0,0.0,0.0,0.0,7,1,"{'knn__n_neighbors': 7, 'knn__p': 1}",0.966667,0.966667,0.933333,1.0,0.966667,0.02357,3
11,0.0,0.0,0.0,0.0,6,2,"{'knn__n_neighbors': 6, 'knn__p': 2}",0.966667,0.966667,0.966667,0.966667,0.966667,0.0,3


In [85]:
# The pipeline with the best-scoring parameter combination. `refit` is left
# at its default (True), so GridSearchCV has already refitted this pipeline
# on the whole of X_train.
best_model = gs.best_estimator_
best_model

Pipeline(steps=[('scaler', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=6, p=1))],
         verbose=True)

In [86]:
# Predict labels for the held-out test set; the pipeline applies its fitted
# scaler to X_test before the KNN step makes the prediction.
pred_test = best_model.predict(X_test)

In [87]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred_test)

0.9333333333333333

In [88]:
# Re-check the training matrix dimensions (samples, features).
X_train.shape

(120, 4)