# 集成学习（python实现）

#### 介绍
集成多个模型，普遍化和多样化，降低方差

#### 基础基础方法
* 最大投票法（Max Voting）
* 平均（Averaging）法
* 加权平均（Weighted Average）法

#### 高级集成方法
* 堆叠（Stacking）
* 混合（Blending）
* Bagging
* 提升（Boosting）

#### 基于Bagging和Boosting的算法

* Bagging meta-estimator
* 随机森林
* AdaBoost
* GBM
* XGB
* Light GBM
* CatBoost

In [21]:
from sklearn import datasets
import pandas as pd
import numpy as np
from statistics import mode

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [64]:
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#### 最大投票法

In [74]:
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)

pred1=model1.predict(X_test)
pred2=model2.predict(X_test)
pred3=model3.predict(X_test)

final_pred = np.array([])

for i in range(0,len(X_test)):
    final_pred =np.append(final_pred, mode([pred1[i], pred2[i], pred3[i]]))
accuracy_score(final_pred, y_test)

1.0

In [107]:
from sklearn.ensemble import VotingClassifier

model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = DecisionTreeClassifier()

model = VotingClassifier(estimators=[('lr', model1), ('knn', model2),('dt', model3)], voting='hard')
model.fit(X_train,y_train)
final_pred = model.predict(X_test)
# model.score(X_test,y_test)
accuracy_score(final_pred, y_test)

  if diff:


1.0

In [109]:
final_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0])

#### 平均法

In [105]:
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)

pred1=model1.predict_proba(X_test)
pred2=model2.predict_proba(X_test)
pred3=model3.predict_proba(X_test)

finalpred=(pred1+pred2+pred3)/3
accuracy_score(np.argmax(finalpred, axis=1), y_test)

1.0

In [106]:
np.argmax(finalpred, axis=1)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0])

#### 加权平均法

In [103]:
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)

pred1=model1.predict_proba(X_test)
pred2=model2.predict_proba(X_test)
pred3=model3.predict_proba(X_test)

finalpred=(pred1*0.3+pred2*0.3+pred3*0.4)
accuracy_score(np.argmax(finalpred, axis=1), y_test)

1.0

In [104]:
np.argmax(finalpred, axis=1)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0])

### 高级集成方法
#### 堆叠/Stacking

In [97]:
def Stacking(model, train, y, test, n_fold):
    folds = StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred = np.empty((test.shape[0],1),float)
    train_pred = np.empty((0,1),float)

    for train_indices, val_indices in folds.split(train,y):
        X_train,X_val=train[train_indices],train[val_indices]
        y_train,y_val=y[train_indices],y[val_indices]

        model.fit(X=X_train,y=y_train)
        train_pred=np.append(train_pred,model.predict(X_val))
        test_pred=np.append(test_pred,model.predict(test))
        
    return test_pred.reshape(-1,1),train_pred

In [98]:
# X_train = pd.DataFrame(X_train)
# y_train = pd.DataFrame(y_train)
model1 = DecisionTreeClassifier(random_state=1)
test_pred1, train_pred1=Stacking(model=model1,n_fold=10, train=X_train,test=X_test,y=y_train)
train_pred1=pd.DataFrame(train_pred1)
test_pred1=pd.DataFrame(test_pred1)

model2 = KNeighborsClassifier()
test_pred2, train_pred2=Stacking(model=model2,n_fold=10,train=X_train,test=X_test,y=y_train)

train_pred2=pd.DataFrame(train_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [100]:
df = pd.concat([train_pred1, train_pred2], axis=1)
df_test = pd.concat([test_pred1, test_pred2], axis=1)

model = LogisticRegression(random_state=1)
model.fit(df,y_train)
model.predict(df_test)
# model.score(df_test, y_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 1, 0,
       1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2,
       0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0,
       0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0,
       0, 2, 1, 0, 1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1,
       1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 1, 0,
       2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2,
       2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 1, 0, 2, 1, 1, 0, 1, 2,
       1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0,
       0, 0, 1, 0, 0, 2, 1, 0, 1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0,
       0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2,
       1, 0, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2,
       0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0,

参考来源：
* http://url.cn/5LWIsw8
* https://www.analyticsvidhya.com/blog/2017/09/common-machine-learning-algorithms/