# Voting(1)

## #01. 패키지

In [1]:
from pandas import DataFrame, read_excel
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## #02. 데이터

In [2]:
origin = read_excel("https://data.hossam.kr/G02/breast_cancer.xlsx")
print(origin.info())
origin.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## #03. 데이터 전처리

### 독립/종속변수 분리

In [3]:
origin.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [4]:
x = origin.drop("target", axis=1)
y = origin['target']
x.shape, y.shape

((569, 30), (569,))

### 데이터 표준화

In [5]:
scaler = StandardScaler()
std_x = scaler.fit_transform(x)
std_x[:5]

array([[ 1.09706398e+00, -2.07333501e+00,  1.26993369e+00,
         9.84374905e-01,  1.56846633e+00,  3.28351467e+00,
         2.65287398e+00,  2.53247522e+00,  2.21751501e+00,
         2.25574689e+00,  2.48973393e+00, -5.65265059e-01,
         2.83303087e+00,  2.48757756e+00, -2.14001647e-01,
         1.31686157e+00,  7.24026158e-01,  6.60819941e-01,
         1.14875667e+00,  9.07083081e-01,  1.88668963e+00,
        -1.35929347e+00,  2.30360062e+00,  2.00123749e+00,
         1.30768627e+00,  2.61666502e+00,  2.10952635e+00,
         2.29607613e+00,  2.75062224e+00,  1.93701461e+00],
       [ 1.82982061e+00, -3.53632408e-01,  1.68595471e+00,
         1.90870825e+00, -8.26962447e-01, -4.87071673e-01,
        -2.38458552e-02,  5.48144156e-01,  1.39236330e-03,
        -8.68652457e-01,  4.99254601e-01, -8.76243603e-01,
         2.63326966e-01,  7.42401948e-01, -6.05350847e-01,
        -6.92926270e-01, -4.40780058e-01,  2.60162067e-01,
        -8.05450380e-01, -9.94437403e-02,  1.80592744e+

> std_x, x로 표준화 유무를 통한 성능 평가를 할 수 있음

대체로 표준화를 한 경우 성능이 개선

### 훈련/검증 데이터 분할

In [6]:
x_train, x_test, y_train, y_test = train_test_split(std_x, y, test_size=0.3, random_state=777)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((398, 30), (171, 30), (398,), (171,))

## #04. Simple Model

### 개발 모델 정의

In [7]:
lr = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

### 개별 모델을 앙상을 모델로 결합

`estimators` : 결합할 알고리즘 정의 (이름, 알고리즘객체) - 이름은 개발자 임의로 지정

`voting` : soft/hard 중 선택 - hard : 다수결, soft : 확률의 평균

In [8]:
# 학습 객체 생성
vo = VotingClassifier(estimators=[('LR',lr), ('KNN', knn), ('DTREE', dt)], 
                      voting='soft')

# 학습
vo.fit(x_train, y_train)

# 예측
y_pred = vo.predict(x_test)

# 검증 데이터에 대한 정확도
score = accuracy_score(y_test, y_pred)
print(f"Voting 분류기 정확도 : {score:.4f}")

Voting 분류기 정확도 : 0.9708


### 개별 모델과의 결과 비교

In [9]:
classifiers = [lr, knn, dt]

for c in classifiers:
    c.fit(x_train, y_train)
    y_pred = c.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    print(f"{c.__class__.__name__} 정확도 : {score:.4f}")

LogisticRegression 정확도 : 0.9766
KNeighborsClassifier 정확도 : 0.9708
DecisionTreeClassifier 정확도 : 0.9123


> 대부분의 예제에서는 Voting의 성능이 조금 더 높게 나오지만 항상 그렇다고 확신할 수는 없다. 여러 번 실행할 경우 Voting 분류기의 성능이 더 높을 때도, 더 낮을 때도 발생하는 만큼 최적의 결과를 도출하기 위해서는 여러 번 다양한 테스트를 수행해야 한다. 즉, 다양한 실험을 진행.

## #05. 최적 파라미터 찾기

`cv` : 쪼개는 단위

In [11]:
# 학습 객체 생성
vo = VotingClassifier(
    estimators=[('LR',LogisticRegression()), 
                ('KNN', KNeighborsClassifier()), 
                ('DTREE', DecisionTreeClassifier())],
    voting='soft'
    )

# 테스트할 파라미터
# params = {"voting" : ['hard', 'soft']}
params = {"voting" : ['hard', 'soft'],
          "KNN__n_neighbors":[1,3,5,7,],
          "KNN__metric":["euclidean", "manhattan"],
        #   "KNN__weights":["uniform", "distance"],
          "DTREE__max_depth":[3,5,7],
        #   "DTREE__min_samples_split":[2,3,5],
        #   "DTREE__min_samples_leaf":[1,3,5],
          "DTREE__max_features":[2,3,5],}

grid = GridSearchCV(vo, param_grid=params, cv=5)
# 학습
grid.fit(x_train, y_train)

# 최적의 파라미터 추출
print(grid.best_params_)

# 26-SVM(1) 참고
# 보고서 용으로 각 회차마다 검증을 수행한 과정을 보여준다
result_df = DataFrame(grid.cv_results_['params'])
# result_df
result_df['mean_test_score'] = grid.cv_results_['mean_test_score']
# result_df
result_df.sort_values(by='mean_test_score', ascending=False)

{'DTREE__max_depth': 3, 'DTREE__max_features': 5, 'KNN__metric': 'euclidean', 'KNN__n_neighbors': 1, 'voting': 'hard'}


Unnamed: 0,DTREE__max_depth,DTREE__max_features,KNN__metric,KNN__n_neighbors,voting,mean_test_score
32,3,5,euclidean,1,hard,0.987405
73,5,3,manhattan,1,soft,0.987405
120,7,3,manhattan,1,hard,0.987405
113,7,3,euclidean,1,soft,0.987405
56,5,2,manhattan,1,hard,0.984905
...,...,...,...,...,...,...
143,7,5,manhattan,7,soft,0.962247
141,7,5,manhattan,5,soft,0.962215
103,7,2,euclidean,7,soft,0.959715
61,5,2,manhattan,5,soft,0.957215


> 알고리즘(파라미터)를 많이 설정할 수록 시간이 오래 걸리는 만큼 일정 수만 설정할 수 있도록 유도

### 최적의 파라미터에 대한 학습 정확도

In [14]:
grid.best_score_

0.9874050632911393

### 최적의 파라미터를 갖는 객체

In [15]:
best = grid.best_estimator_
best

### 최적의 객체로 검증 데이터 예측

In [16]:
y_pred = best.predict(x_test)
y_pred[:5]

array([1, 1, 0, 1, 0], dtype=int64)

### 결과 정확도

In [17]:
score = accuracy_score(y_test, y_pred)
print(f'GridSearchCV 분류기 정확도: {score:.4f}')

GridSearchCV 분류기 정확도: 0.9708
