# Bagging(3)

1. singleML 함수를 사용해 가장 높은 스코어를 보여주는 알고리즘 확인
    - KNeighborsClassifier
2. KNeighborsClassifier에 대하여 GridSearchCV를 적용해 최적의 파라미터 찾기
3. BaggingClassifier에 대해 GridSearchCV를 적용
    - base_estimator=KNeighborsClassifier()에는 2번 단계에서 도출한 파라미터 적용

## #01. 패키지

In [1]:
import warnings
warnings.filterwarnings('ignore')

from pandas import read_excel,DataFrame
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier

## #02. 분류 문제

### 1. 데이터

In [3]:
# Load the breast-cancer dataset from the remote Excel workbook.
origin = read_excel("https://data.hossam.kr/G02/breast_cancer.xlsx")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
origin.info()
# Preview the first rows of the frame.
origin.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


### 2. 데이터 전처리

#### 독립/종속변수 분리

In [4]:
# Show the column labels of the loaded frame.
origin.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [5]:
# Features: every column except the label.
x = origin.drop(columns="target")
# Label: the `target` column.
y = origin["target"]
# Sanity-check the shapes of both parts.
x.shape, y.shape

((569, 30), (569,))

#### 데이터 표준화

In [6]:
# Standardize the features with StandardScaler, fitted on all of `x`.
scaler = StandardScaler()
std_x = scaler.fit_transform(x)
# NOTE(review): the scaler is fitted before the train/test split that follows,
# so test-set statistics influence the scaling -- consider fitting on the
# training split only.
std_x[:1]  # peek at the first standardized row

array([[ 1.09706398, -2.07333501,  1.26993369,  0.9843749 ,  1.56846633,
         3.28351467,  2.65287398,  2.53247522,  2.21751501,  2.25574689,
         2.48973393, -0.56526506,  2.83303087,  2.48757756, -0.21400165,
         1.31686157,  0.72402616,  0.66081994,  1.14875667,  0.90708308,
         1.88668963, -1.35929347,  2.30360062,  2.00123749,  1.30768627,
         2.61666502,  2.10952635,  2.29607613,  2.75062224,  1.93701461]])

#### 훈련/검증 데이터 분할

In [7]:
# Hold out 30% of the standardized rows for testing; the seed is fixed so the
# split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    std_x,
    y,
    test_size=0.3,
    random_state=777,
)
# Shapes of the four resulting splits.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((398, 30), (171, 30), (398,), (171,))

### 3. 분류 모델 구현

#### Bagging 모델 구현

`cv` : 교차검증(cross-validation)에서 데이터를 나누는 폴드(fold)의 수

`n_jobs` : 병렬로 실행할 작업의 수. CPU 코어 수만큼 설정 가능하며, -1은 모든 프로세서를 사용함을 의미.

In [13]:
# Bagging ensemble of k-nearest-neighbour classifiers.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works with both.
# NOTE(review): the outline at the top of this notebook says the KNN
# hyper-parameters found in step 2 should be applied here, but the default
# KNeighborsClassifier() is used -- confirm which is intended.
clf = BaggingClassifier(
    KNeighborsClassifier(),
    random_state=777,
    n_jobs=-1,
)

# Hyper-parameter grid to search (2 x 2 x 2 = 8 candidate combinations).
params = {
    "bootstrap_features": [True, False],  # draw features with replacement or not
    "bootstrap": [True, False],  # draw samples with replacement or not
    "n_estimators": [30, 50],  # number of base estimators in the ensemble
}

In [10]:
# 학습
# Exhaustive search over `params` with 5-fold cross-validation;
# fit() returns the fitted search object itself.
grid = GridSearchCV(estimator=clf, param_grid=params, cv=5).fit(x_train, y_train)
# Report the best-scoring parameter combination.
print(grid.best_params_)

{'bootstrap': True, 'bootstrap_features': True, 'n_estimators': 50}


In [14]:
# 학습 결과 시각화
# Tabulate the search results: one row per parameter combination,
# together with its mean cross-validated test score.
result_df = DataFrame(grid.cv_results_["params"]).assign(
    mean_test_score=grid.cv_results_["mean_test_score"]
)
# Best-scoring combinations first.
result_df.sort_values("mean_test_score", ascending=False)

Unnamed: 0,bootstrap,bootstrap_features,n_estimators,mean_test_score
1,True,True,50,0.964778
5,False,True,50,0.964778
3,True,False,50,0.964747
6,False,False,30,0.964747
7,False,False,50,0.964747
0,True,True,30,0.962278
4,False,True,30,0.962278
2,True,False,30,0.959715


### 최적의 파라미터에 대한 교차검증 평균 정확도

In [15]:
# Mean cross-validated score of the best parameter combination
# (not a score measured on the training set itself).
grid.best_score_

0.9647784810126583

### 최적의 파라미터를 갖는 객체

In [16]:
# Estimator configured with the best parameters found by the search.
best = grid.best_estimator_
best

### 최적의 객체로 검증 데이터 예측

In [17]:
# Predict labels for the held-out test set with the best estimator.
y_pred = best.predict(x_test)
y_pred[:5]  # first five predictions

array([1, 1, 0, 1, 0], dtype=int64)

### 결과에 대한 정확도

In [18]:
# Accuracy of the tuned model's predictions on the held-out test set.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'GridSearchCV 분류기 정확도: {score:.4f}')

GridSearchCV 분류기 정확도: 0.9766
