<a href="https://colab.research.google.com/github/jiin124/4th_kaggle_study/blob/main/4%EC%A3%BC%EC%B0%A8/13_%ED%8C%8C%EC%9D%B4%ED%94%84%EB%9D%BC%EC%9D%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 파이프라인

In [1]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset and unpack its data / target arrays in one step.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [3]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain the three steps into a single estimator:
# standard scaling -> PCA with 4 components -> logistic regression.
pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=4),
    LogisticRegression(),
)

In [4]:
# Fit the whole pipeline on the training split, then predict the held-out split.
# (fit returns the pipeline itself, so the two calls can be chained.)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [5]:
# Model evaluation: accuracy of the pipeline's predictions on the test split.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

0.9736842105263158

# 2. 교차검증을 통한 과적합 분석

In [6]:
from sklearn.datasets import load_breast_cancer

# Load the dataset again so this section does not rely on the earlier cells.
cancer = load_breast_cancer()

In [7]:
# Unpack the data array and the target array from the loaded dataset.
X, y = cancer.data, cancer.target

In [8]:
from sklearn.model_selection import train_test_split

# Same 80/20 stratified split (seed 1) as in the first section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [11]:
# Build the model as a single pipeline object.

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Steps, in order: standard scaling, PCA down to 4 components, logistic regression.
pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=4),
    LogisticRegression(),
)


In [12]:
from sklearn.model_selection import cross_validate

# Cross-validate the pipeline with cv=10 on the training split, also recording
# the score on each training fold (return_train_score=True).
scores = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=10,
    return_train_score=True,
)

In [13]:
# List which result arrays cross_validate returned (iterating a dict yields its keys).
sorted(scores)

['fit_time', 'score_time', 'test_score', 'train_score']

In [14]:
import numpy as np

# scores['train_score'] holds the per-fold scores on the *training* folds
# (requested with return_train_score=True), so label them as training accuracy.
# The validation-fold scores live in scores['test_score'] and are printed separately.
print('CV Train Accuracy scores: ', scores['train_score'])
print(
    'CV Train Accuracy : %.3f +/- %.3f'
    % (np.mean(scores['train_score']), np.std(scores['train_score']))
)

CV Validation Accuracy scores:  [0.96577017 0.96577017 0.96577017 0.96577017 0.96821516 0.96585366
 0.97073171 0.96829268 0.97560976 0.96585366]
CV Validation Accuracy : 0.968 +/- 0.003


In [15]:
import numpy as np

# Per-fold validation scores, followed by their mean and standard deviation.
print('CV Validation Accuracy scores: ', scores['test_score'])
print(
    'CV Validation Accuracy: %.3f +/- %.3f'
    % (np.mean(scores['test_score']), np.std(scores['test_score']))
)

CV Validation Accuracy scores:  [0.97826087 0.97826087 0.95652174 1.         0.95652174 0.97777778
 0.93333333 0.95555556 0.91111111 1.        ]
CV Validation Accuracy: 0.965 +/- 0.027


## 교차검증 결과

- 학습 폴드 평균 정확도(0.968)와 검증 폴드 평균 정확도(0.965)가 모두 높고 차이가 거의 없으므로, 과대적합도 과소적합도 발생하지 않는다.

In [16]:
from sklearn.model_selection import GridSearchCV

# An empty grid means there is nothing to tune: the search evaluates a single
# candidate, i.e. the pipeline with its current settings.
parameters = dict()

gs = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
gs.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('pca', PCA(n_components=4)),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={}, scoring='accuracy')

In [17]:
# Keep a handle on the estimator selected by the grid search.
best=gs.best_estimator_

In [18]:
# Display the per-split and aggregated cross-validation results of the search.
gs.cv_results_

{'mean_fit_time': array([0.01834941]),
 'mean_score_time': array([0.00200155]),
 'mean_test_score': array([0.9647343]),
 'params': [{}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([0.97826087]),
 'split1_test_score': array([0.97826087]),
 'split2_test_score': array([0.95652174]),
 'split3_test_score': array([1.]),
 'split4_test_score': array([0.95652174]),
 'split5_test_score': array([0.97777778]),
 'split6_test_score': array([0.93333333]),
 'split7_test_score': array([0.95555556]),
 'split8_test_score': array([0.91111111]),
 'split9_test_score': array([1.]),
 'std_fit_time': array([0.00651192]),
 'std_score_time': array([0.00201972]),
 'std_test_score': array([0.02665336])}

In [19]:
from sklearn.metrics import accuracy_score

# Accuracy of the selected estimator on the data it was trained on.
y_train_pred = best.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.967032967032967

In [20]:
from sklearn.metrics import accuracy_score

# Accuracy of the selected estimator on the held-out test split.
y_test_pred = best.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9736842105263158

학습 정확도(0.967)와 테스트 정확도(0.974)가 비슷하므로 과적합이 일어나지 않았군요!

# 3. 최적 모델 선정하기

In [22]:
from sklearn.datasets import load_breast_cancer

# Load the dataset once more for the model-selection section.
cancer = load_breast_cancer()

In [25]:
# Unpack the data and target arrays (this section uses an upper-case Y for the labels).
X, Y = cancer.data, cancer.target

In [26]:

# Split into training and test sets: 20% held out, stratified on Y, seed 1.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=1,
)

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Named pipeline steps; the step names ('normalization', 'clf') are the
# prefixes used by the '<step>__<param>' keys of the parameter grid.
estimators = [
    ('normalization', StandardScaler()),
    ('clf', SVC()),
]
pipe = Pipeline(steps=estimators)

In [29]:

from sklearn.model_selection import ParameterGrid

# Every candidate value is wrapped in its own one-element list, so each dict
# produced by ParameterGrid maps a parameter name to a single-value list
# (e.g. {'clf__C': [0.001], 'clf__kernel': ['linear']}).
grid = [
    {
        'clf__kernel': [['linear']],
        'clf__C': [[c] for c in (0.001, 0.01, 0.1, 1, 10, 100, 1000)],
    },
    {
        'clf__kernel': [['rbf']],
        'clf__gamma': [[g] for g in (0.001, 0.01, 0.1, 1, 10, 100, 1000)],
        'clf__C': [[c] for c in (0.001, 0.01, 0.1, 1, 10, 100, 1000)],
    },
]

grid_param = ParameterGrid(grid)
list(grid_param)

[{'clf__C': [0.001], 'clf__kernel': ['linear']},
 {'clf__C': [0.01], 'clf__kernel': ['linear']},
 {'clf__C': [0.1], 'clf__kernel': ['linear']},
 {'clf__C': [1], 'clf__kernel': ['linear']},
 {'clf__C': [10], 'clf__kernel': ['linear']},
 {'clf__C': [100], 'clf__kernel': ['linear']},
 {'clf__C': [1000], 'clf__kernel': ['linear']},
 {'clf__C': [0.001], 'clf__gamma': [0.001], 'clf__kernel': ['rbf']},
 {'clf__C': [0.001], 'clf__gamma': [0.01], 'clf__kernel': ['rbf']},
 {'clf__C': [0.001], 'clf__gamma': [0.1], 'clf__kernel': ['rbf']},
 {'clf__C': [0.001], 'clf__gamma': [1], 'clf__kernel': ['rbf']},
 {'clf__C': [0.001], 'clf__gamma': [10], 'clf__kernel': ['rbf']},
 {'clf__C': [0.001], 'clf__gamma': [100], 'clf__kernel': ['rbf']},
 {'clf__C': [0.001], 'clf__gamma': [1000], 'clf__kernel': ['rbf']},
 {'clf__C': [0.01], 'clf__gamma': [0.001], 'clf__kernel': ['rbf']},
 {'clf__C': [0.01], 'clf__gamma': [0.01], 'clf__kernel': ['rbf']},
 {'clf__C': [0.01], 'clf__gamma': [0.1], 'clf__kernel': ['rbf']},

In [30]:
# GridSearchCV documents param_grid as a dict or a list of dicts. Passing the
# ParameterGrid object itself only works on scikit-learn versions that do not
# validate constructor arguments, so expand it into a plain list of its dicts.
# Each dict maps a parameter name to a one-element list of candidates, so the
# search covers exactly the same combinations in the same order.
gs = GridSearchCV(pipe, list(grid_param), scoring='accuracy', cv=10, n_jobs=1)

In [31]:
# Run the grid search on the training split.
gs.fit(X=X_train, y=Y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('normalization', StandardScaler()),
                                       ('clf', SVC())]),
             n_jobs=1,
             param_grid=<sklearn.model_selection._search.ParameterGrid object at 0x7f1288e90d90>,
             scoring='accuracy')

In [32]:
# Best score found by the search (the search was configured with scoring='accuracy').
print(gs.best_score_)

0.9758454106280192


In [33]:
# Parameter combination that achieved the best score.
print(gs.best_params_)

{'clf__C': 10, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}


In [34]:
# Take the estimator selected by the search and predict the held-out test split.
best_model = gs.best_estimator_
Y_test_pred = best_model.predict(X=X_test)

In [35]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the selected model.
Y_test_Pred = best_model.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_test_Pred)

0.9824561403508771