### 그리드서치-스케일링
X_train , X_test  둘다 스케일링 하는 거 잊지 말아햐 함

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler ,StandardScaler , PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA


In [2]:
cancer= load_breast_cancer()
X_train, X_test, y_train, y_test= train_test_split(cancer.data, cancer.target, random_state=42)

In [3]:
scaler = MinMaxScaler().fit(X_train)
X_train_scaled =  scaler.transform(X_train)


In [4]:
svm=SVC()
svm.fit(X_train_scaled,  y_train)

In [5]:
X_test_scaled= scaler.transform(X_test)


In [6]:
print(svm.score(X_test_scaled,y_test))

0.9790209790209791


### 최적화

In [10]:

param_grid = {"C" :[0.001,0.01,0.1,1,10,100], "gamma":[0.001,0.0,0.1,1,10,100]}
grid=GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)
print(grid.best_score_)
grid.score(X_test_scaled, y_test)
grid.best_params_

0.9788508891928865


{'C': 1, 'gamma': 1}

### 파이프라인 구축
####  파이프를 작게 만들어서 여러 개의 파이프를 연결하는 것

In [11]:
pipe= Pipeline([("scaler", MinMaxScaler()),("svm", SVC())])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.9790209790209791

In [12]:
param_grid = {"svm__C" :[0.001,0.01,0.1,1,10,100], "svm__gamma":[0.001,0.0,0.1,1,10,100]}
grid=GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)
print(grid.best_score_,grid.best_params_)

0.9741450068399453 {'svm__C': 10, 'svm__gamma': 0.1}


### make pipeline 파이프라인 인터페이스

In [14]:
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))
pipe_short

In [15]:
#pipe_short은 단계의 이름을 자동으로 만든다.
pipe_short.steps

[('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]

In [17]:
#더 의미있는 이름을 붙이려면 pipeline으로 객체를 직접 만드는 것이 좋다
pipe_short= make_pipeline(StandardScaler(), PCA(n_components=2),StandardScaler() )
pipe_short

In [18]:
pipe_short.fit(cancer.data)

In [19]:
components= pipe_short.named_steps["pca"].components_
print(components)

[[ 0.21890244  0.10372458  0.22753729  0.22099499  0.14258969  0.23928535
   0.25840048  0.26085376  0.13816696  0.06436335  0.20597878  0.01742803
   0.21132592  0.20286964  0.01453145  0.17039345  0.15358979  0.1834174
   0.04249842  0.10256832  0.22799663  0.10446933  0.23663968  0.22487053
   0.12795256  0.21009588  0.22876753  0.25088597  0.12290456  0.13178394]
 [-0.23385713 -0.05970609 -0.21518136 -0.23107671  0.18611302  0.15189161
   0.06016536 -0.0347675   0.19034877  0.36657547 -0.10555215  0.08997968
  -0.08945723 -0.15229263  0.20443045  0.2327159   0.19720728  0.13032156
   0.183848    0.28009203 -0.21986638 -0.0454673  -0.19987843 -0.21935186
   0.17230435  0.14359317  0.09796411 -0.00825724  0.14188335  0.27533947]]


### 예제(430쪽) 
전처리와 모델의 매개변수를 위한 그리드 서치 <br />
<br />
가능하면 전처리를 엑셀이나, 구글 스프레드쉬트 등을 활용하세요<br />
vscode에서 텍스트 전처리를 진행하세요

In [26]:
data_url="http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url,sep=r"\s+" ,skiprows=22 , header=None)
raw_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [29]:
data= np.hstack([raw_df.values[::2,:],raw_df.values[1::2,:2]])
target= raw_df.values[1::2, 2]
X_train, X_test, y_train, y_test= train_test_split(data, target, random_state=42)

In [48]:
# 보스턴 주택 데이터를 평균으로 StandardScaler() 한다.  정규분포 그래프 (종 모양 그래프)
# 평균으로 할 시에 평균특성이 사그라드므로 특성을 키우고자 PolynomialFeatures 한다. 이차곡선그래프(U)
# 선형으로 선을 그어 회귀한다.Ridge()  선형그래프(/) =>보스턴 집값이 매우큰쪽과 매우작은쪽은 회귀의 정확도가 떨어질 것이다.
pipe =make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())
pipe

In [49]:
param_grid={"polynomialfeatures__degree" : [1,2,3] ,
            "ridge__alpha":[0.001,0.01,0.1,1,10,100]}


In [50]:
grid=GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train,y_train)

In [51]:
grid.best_params_

{'polynomialfeatures__degree': 2, 'ridge__alpha': 10}

In [52]:
grid.score(X_test, y_test)

0.8054402042295686

### 모델 선택을 위한 그리드 서치
GridSearchCV와 Pipeline을 연결하는 것에서 더 나아가, 파이프라인을 구성하는 단계도 탐색 대상으로 삼을 수 있다. <br/>
스케일링을 StandardScaler()와 MinMaxScaler 중 어떤 것을 사용할지 모르겠으면 일단 그리드 서치해서 test데이터에 대한 스코어를 보고  더 나은 쪽의 스케일링방식을 선택한다.

In [53]:
# cache_folder는 전처리가 끝난 데이터일 경우 써야한다.
# cache_folder을 비유하면 빠르게 화면을 전환할 경우, 탭이 그래도 있는 것과 유사하다.

pipe =Pipeline([('preprocessing', StandardScaler()),('classifier',SVC())], memory="cache_folder")


In [54]:
param_grid=[{"classifier" : [SVC()] ,
            "preprocessing":[StandardScaler()],
            "classifier__C" : [0.001,0.01,0.1,1,10,100]},
            {"classifier" :[RandomForestClassifier(n_estimators=100)],
             "preprocessing":[None] ,
              "classifier__max_features" : [1,2,3]}
            ]

In [55]:
cancer= load_breast_cancer()
X_train, X_test, y_train, y_test= train_test_split(cancer.data, cancer.target, random_state=42)

In [56]:
grid=GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train,y_train)

In [57]:
grid.best_params_

{'classifier': SVC(), 'classifier__C': 1, 'preprocessing': StandardScaler()}

In [58]:
# svc score이 잘 나온 이유-알고리즘이 잘 나온다.
grid.best_score_

0.9717920656634748

In [59]:
grid.score(X_test, y_test)

0.972027972027972