# Voting
- 서로 다른 종류의 알고리즘들을 결합한다.
- 유형
    - hard voting
        - 다수의 추정기가 결정한 예측값들 중 많은 것을 선택하는 방식
    - soft voting
        - 다수의 추정기에서 각 레이블별 예측한 확률들의 평균을 내서 높은 레이블값을 결과값으로 선택하는 방식
        - 일반적으로 soft voting의 성능이 더 좋다.
        - Voting은 성향이 다르면서 비슷한 성능을 가진 모델들을 묶었을때 가장 좋은 성능을 낸다.
    

# 01. VotingClassifier
- 매개변수
    - estimators : 양상블한 모델을 설정.("추정기이름",추정기)의 튜플을 리스트로 묶어서 전달
    - voting : voting방식
        - hard(default)
        - soft

In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [7]:
wine_red = pd.read_csv('./data/winequality-red.csv',sep=';')
wine_white = pd.read_csv('./data/winequality-white.csv',sep=';')
wine_red['color'] = 1
wine_white['color'] = 0
wine = pd.concat([wine_red,wine_white])

In [8]:
from sklearn.preprocessing import LabelEncoder

wine[['quality']] = wine[['quality']].apply(LabelEncoder().fit_transform)
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2,1


In [9]:
y = wine['color']
X = wine.drop(columns=['color'])

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,stratify=y)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
svm = SVC(C=0.1, gamma='auto', probability=True)
lg = LogisticRegression()

estimators = [('knn',knn),('dt',dt),('svm',svm),('lg',lg)]

from sklearn.metrics import accuracy_score
for name, model in estimators:
    
    model.fit(X_train,y_train)
    
    pred_test = model.predict(X_test)
    
    print(name,'정확도:',accuracy_score(y_test,pred_test))

knn 정확도: 0.9425641025641026
dt 정확도: 0.9856410256410256
svm 정확도: 0.8897435897435897
lg 정확도: 0.9784615384615385


In [13]:
from sklearn.ensemble import VotingClassifier

v_clf = VotingClassifier(estimators=estimators)

v_clf.fit(X_train,y_train)

pred_test_hard = v_clf.predict(X_test)

accuracy_score(y_test,pred_test_hard)

0.9553846153846154

In [14]:
from sklearn.ensemble import VotingClassifier

v_clf = VotingClassifier(estimators=estimators, voting='soft')

v_clf.fit(X_train,y_train)

pred_test_soft = v_clf.predict(X_test)

accuracy_score(y_test,pred_test_soft)

0.9738461538461538

In [16]:
from sklearn.datasets import load_boston
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns = boston.feature_names)
boston_df['PRICE'] = boston.target
boston_df.shape

(506, 14)

In [17]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [25]:
y_target = boston_df['PRICE']
X_data = boston_df.drop(['PRICE'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_data, y_target, test_size=0.3)


In [26]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor

ridge = Ridge()
lasso = Lasso()
dtr = DecisionTreeRegressor()

estimators = [('ridge',ridge),('lasso',lasso),('dtr',dtr)]

from sklearn.metrics import mean_squared_error
for name, model in estimators:
    
    model.fit(X_train,y_train)
    
    pred_test = model.predict(X_test)
    
    print(name,'RMSE:',np.sqrt(mean_squared_error(y_test,pred_test)))

ridge RMSE: 4.646668060168905
lasso RMSE: 4.988342934954998
dtr RMSE: 4.0999518610781145


In [27]:
from sklearn.ensemble import VotingRegressor

v_clf = VotingRegressor(estimators=estimators)

v_clf.fit(X_train,y_train)

pred_test_hard = v_clf.predict(X_test)

np.sqrt(mean_squared_error(y_test,pred_test))

4.0999518610781145

In [30]:
from sklearn.ensemble import VotingRegressor

v_clf = VotingRegressor(estimators=estimators, n_jobs=-1)

v_clf.fit(X_train,y_train)

pred_test_all = v_clf.predict(X_test)

np.sqrt(mean_squared_error(y_test,pred_test_all))

3.7437649240237656