## 보팅방식 모델 구현 <hr>
- 데이터 : sklearn.datasets의 breast_cancer
- 유형 : 지도학습 + 분류
- 방법 : Voting 방식으로 진행 => LogisticRegression, DecisionTreeClassifier, SVC
- 학습데이터셋 : 동일한 데이터셋으로 3개의 모델로 학습 진행

### [1] 모듈 로딩 및 데이터 준비 <hr>

In [9]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(as_frame=True, return_X_y=True)
X.shape, y.shape
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=5)

### [2] 학습 진행 <hr>

#### [2-1] 앙상블 보팅 학습에 사용할 모델 인스턴스 생성

In [10]:
# LogisticRegression 인스턴스

from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train,y_train)

In [11]:
# DecisionTreeClassifier 인스턴스 생성
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

In [43]:
# SVC 인스턴스 생성

from sklearn.svm import SVC

sv_model = SVC(probability=True)
sv_model.fit(X_train, y_train)

#### [2-2] Ensemble 알고리즘 기반 분류

In [44]:
from sklearn.ensemble import VotingClassifier

# 동일 데이터셋으로 병렬 학습 진행할 모델 리스트 선정 및 결과 결정 방법 설정
vt_models = VotingClassifier(estimators=[('lr_model', lr_model), 
                                        ('dt_model', dt_model), 
                                        ('sv_model', sv_model)],
                            verbose=True,
                            )

In [45]:
# 동일 데이터 셋을 전달해서 3개의 모델 동시에 학습 진행
vt_models.fit(X_train,y_train)

[Voting] ................. (1 of 3) Processing lr_model, total=   0.0s
[Voting] ................. (2 of 3) Processing dt_model, total=   0.0s
[Voting] ................. (3 of 3) Processing sv_model, total=   0.0s


In [46]:
# 예측하기
new_data = pd.DataFrame([X_test.iloc[0]], columns=X_test.columns)
new_data
# vt_models.predict([X_test.iloc[0]])

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
525,8.571,13.1,54.53,221.3,0.1036,0.07632,0.02565,0.0151,0.1678,0.07126,...,9.473,18.45,63.3,275.6,0.1641,0.2235,0.1754,0.08512,0.2983,0.1049


In [47]:
# 보팅 인스턴스 내의 학습기들 => 접근 방법 (1) 리스트 형식
vt_models.estimators_

[LogisticRegression(solver='liblinear'),
 DecisionTreeClassifier(),
 SVC(probability=True)]

In [48]:
# 보팅 인스턴스 내의 학습기들 => 접근 방법 (2) 딕셔너리 형식
vt_models.named_estimators_

{'lr_model': LogisticRegression(solver='liblinear'),
 'dt_model': DecisionTreeClassifier(),
 'sv_model': SVC(probability=True)}

In [49]:
vt_models.named_estimators_.get('lr_model').predict(new_data), vt_models.named_estimators_.get('dt_model').predict(new_data), vt_models.named_estimators_.get('sv_model').predict(new_data)

(array([1]), array([1]), array([1]))

In [50]:
for k, v in vt_models.named_estimators_.items():
    print(k, v.predict(new_data))

lr_model [1]
dt_model [1]
sv_model [1]


In [51]:
X_test.iloc[0].to_frame().T

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
525,8.571,13.1,54.53,221.3,0.1036,0.07632,0.02565,0.0151,0.1678,0.07126,...,9.473,18.45,63.3,275.6,0.1641,0.2235,0.1754,0.08512,0.2983,0.1049


- 3개 알고리즘 모델에 대한 성능 평가

In [22]:
print(f'[LogisticRegression]      Train : {lr_model.score(X_train,y_train):5.3f}, Test : {lr_model.score(X_test,y_test):5.3f}')
print(f'[DecisionTreeClassifier]  Train : {dt_model.score(X_train,y_train):5.3f}, Test : {dt_model.score(X_test,y_test):5.3f}')
print(f'[SupportVectorClassifier] Train : {sv_model.score(X_train,y_train):5.3f}, Test : {sv_model.score(X_test,y_test):5.3f}')

[LogisticRegression]      Train : 0.958, Test : 0.965
[DecisionTreeClassifier]  Train : 1.000, Test : 0.912
[SupportVectorClassifier] Train : 0.916, Test : 0.877
