### 앙상블(Ensemble)학습

In [4]:
from sklearn.datasets import load_breast_cancer
cancer =load_breast_cancer()

In [5]:
from sklearn.preprocessing import StandardScaler
cancer_std =StandardScaler().fit_transform(cancer.data)

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(
    cancer_std,cancer.target,stratify=cancer.target,test_size=0.2,random_state=2023
)

#### 1. Voting 방식
#### 1.1 Hard voting
- 로지스틱 회귀
- 서포트 벡터 머신
- K최근접 이웃

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [8]:
lrc =LogisticRegression(random_state=2023)
svc =SVC(random_state=2023)
knn =KNeighborsClassifier()

In [10]:
# 하드 보팅을 위한 아앙블 분류기
from sklearn.ensemble import VotingClassifier
voc =VotingClassifier(
    estimators=[('LRC',lrc),('SVC',svc),('KNN',knn)],  # 이옵션은 이 로지스틱에 있는거구나
    voting='hard'
)

In [11]:
voc.fit(X_train,y_train)
voc.score(X_test,y_test)

0.9298245614035088

In [12]:
# 개별 분류기의 성능
lrc.fit(X_train,y_train)
svc.fit(X_train,y_train)
knn.fit(X_train,y_train)
lrc.score(X_test,y_test),svc.score(X_test,y_test),knn.score(X_test,y_test)


(0.9473684210526315, 0.9298245614035088, 0.9122807017543859)

#### 1.2 Soft voting

- predict_proba() 메소드를 지원하는 분류기인 경우에  사용 가능

In [14]:
#dir(sklearn)은 scikit-learn 모듈에서 사용 가능한 모든 속성과 메서드의 목록을 반환
dir(lrc)

['C',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 '_validate_params',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_features_in_',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_prob

In [15]:
lrc.predict_proba(X_test[:3])

array([[9.98781249e-01, 1.21875135e-03],
       [9.76075906e-04, 9.99023924e-01],
       [6.40363671e-02, 9.35963633e-01]])

In [16]:
knn.predict_proba(X_test[:3])

array([[1., 0.],
       [0., 1.],
       [0., 1.]])

In [19]:
# SVC 는 probability=False(default)인 경우에 predict_proba()메소를 지원하지 않음
svc.predict_proba(X_test[:3])

AttributeError: predict_proba is not available when  probability=False

In [20]:
svc2 =SVC(probability=True,random_state=2023)
svc2.fit(X_train,y_train)
svc2.predict_proba(X_test[:3])

array([[9.99574375e-01, 4.25625266e-04],
       [5.14249474e-08, 9.99999949e-01],
       [1.65822655e-02, 9.83417734e-01]])

- Soft voting

In [21]:
voc2 = VotingClassifier(
    estimators=[('LRC',lrc),('SVC',svc2),('KNN',knn)],
    voting='soft'
)

In [22]:
voc2.fit(X_train,y_train)
voc2.score(X_test,y_test)

0.9298245614035088

In [23]:
voc2.predict_proba(X_test[:3])

array([[9.99451874e-01, 5.48125537e-04],
       [3.25375777e-04, 9.99674624e-01],
       [2.68728775e-02, 9.73127122e-01]])

- GridsearchCV

In [24]:
lrc.C,svc2.C

(1.0, 1.0)

In [26]:
params ={
    'LRC__C':[0.1,1,10],
    'SVC__C':[0.1,1,10]
}

In [27]:
from sklearn.model_selection import GridSearchCV
grid_voc2 =GridSearchCV(voc2,params,scoring='accuracy',cv=5)
grid_voc2.fit(X_train,y_train)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 1}

In [29]:
params ={
    'LRC__C':[5,10,20],
    'SVC__C':[0.5,1,3]
}
grid_voc2 =GridSearchCV(voc2,params,scoring='accuracy',cv=5)
grid_voc2.fit(X_train,y_train)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 1}

In [30]:
grid_voc2.best_estimator_.score(X_test,y_test)

0.9473684210526315

### 2. Bagging 방식 -Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
rfc =RandomForestClassifier(random_state=2023)
rfc.fit(X_train,y_train)
rfc.score(X_test,y_test)

0.9210526315789473

In [32]:
rfc.predict_proba(X_test[:3])

array([[0.99, 0.01],
       [0.  , 1.  ],
       [0.18, 0.82]])