# 앙상블 학습 p.210

In [1]:
import pandas as pd

from sklearn.ensemble import VotingClassifier # 보팅 앙상블 클래스
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀
from sklearn.neighbors import KNeighborsClassifier # K-최근접 이웃(KNN): 가장 가까운 k개 학습 샘플의 레이블로 분류한다.
from sklearn.datasets import load_breast_cancer # 위스콘신 유방암 데이터셋
from sklearn.model_selection import train_test_split # 학습,결과 나누기
from sklearn.metrics import accuracy_score # 정확도 평가

In [5]:
# Load the breast cancer dataset and wrap its feature matrix in a DataFrame
# labelled with the dataset's feature names, then preview the first two rows.
cancer = load_breast_cancer()
data_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
data_df.head(n=2)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [10]:
data_df.info()
# The 30 feature columns come in three groups of 10: mean, error and worst values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [11]:
# Check how balanced the target classes are by counting samples per label.
pd.Series(cancer.target).value_counts()

1    357
0    212
dtype: int64

In [14]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=156
)

# Base learners: logistic regression and k-nearest neighbours.
# NOTE: to collect new data and predict on it later, you need to know the
# data setup that was used for training.
lr_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier(n_neighbors=8)

# Soft-voting ensemble built from the two base learners; `estimators` takes a
# list of (name, model) tuples.
vo_clf = VotingClassifier(
    estimators=[('LR', lr_clf), ('KNN', knn_clf)],
    voting='soft',
)

# Fit the ensemble, predict on the held-out split, and show its accuracy.
pred = vo_clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, pred)

0.956140350877193

In [16]:
# Train, predict with, and score each base model on its own so the results
# can be compared with the voting ensemble.
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    pred = classifier.fit(X_train, y_train).predict(X_test)
    class_name = type(classifier).__name__
    print(f'{class_name} 정확도: {accuracy_score(y_test, pred)}')

LogisticRegression 정확도: 0.9473684210526315
KNeighborsClassifier 정확도: 0.9385964912280702


In [21]:
# The loop variable still refers to the last model iterated; show its class name.
type(classifier).__name__

'KNeighborsClassifier'

In [None]:
# 일반적으로 앙상블 학습은 개별 모델보다 정확도가 높아지는 경향이 있지만, 항상 그런 것은 아니다.

# 랜덤 포레스트 p.216

In [None]:
# 랜덤 포레스트는 배깅 방식의 앙상블: 부트스트랩 샘플로 학습한 여러 DecisionTree의 예측을 보팅으로 결합한다.