In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

## Adaboost
Adaboost为一种集成学习方法，集成学习利用多个模型以同个输入的预测结果进行投票，通常可以提升2~3%的精准度，集成学习大致上分为两种方法，一种为bagging，另一种为boost
1. bagging算法里的多个模型都是同个算法，例如都是决策树，但是利用不同的训练样本来制造模型的差异性，训练样本用随机抽取再放回的方式，直到训练集的大小与原来的一样，故会有重复的样本，比如随机森林。
2. boost算法需要按顺序依次训练多个模型，并对数据设置权重：一开始所有样本的权重相同（例如1/N），每轮训练后针对分类错误的样本提升权重，对的样本降低权重，直到错误率降低至一个阈值，比如adaboost。

接下来讲Adaboost具体的算法
1. 初始化所有训练数据的权重为1/N。
2. 训练分类器，如果某个样本正确分类，则降低权重，错误分类则提升权重，再基于这些权重进行训练，反复迭代。我们有加权错误率$\epsilon$以及$\alpha =\frac{1}{2}\ln\left(\frac{1-\epsilon }{\epsilon }\right)$，我们有个权重向量$V$，对于分类正确的样本，其权重乘$\frac{e^{-\alpha}}{sum(V)}$，对于分类错误的样本，其权重乘$\frac{e^{\alpha}}{sum(V)}$，其中$sum(V)$为乘上指数因子之后所有权重的总和，用来归一化，使权重之和保持为1。
3. 最后一步则是将所有序列化训练的分类器组成集成学习。但是对分类误差低的分类器增加权重，再进行投票。

### 优缺点

优点：精准度高、复现容易、不需要调试超参数  
缺点：对离群值敏感

我们先用最简单的决策树分类器来演示adaboost

In [9]:
def loadSimpData():
    """Return the five-sample, two-feature toy dataset used to demo AdaBoost.

    Returns
    -------
    tuple
        ``(datMat, classLabels)`` where ``datMat`` is a 5x2 ``np.matrix`` of
        feature values and ``classLabels`` is a list of five labels, each
        either ``1.0`` or ``-1.0``.
    """
    samples = [
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0],
    ]
    targets = [1.0, 1.0, -1.0, -1.0, 1.0]
    return np.matrix(samples), targets

def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row with a single-feature threshold test (a decision stump).

    Parameters
    ----------
    dataMatrix : 2-D matrix/array
        One sample per row; it is indexed as ``dataMatrix[:, dimen]``.
    dimen : int
        Column index of the feature to threshold.
    threshVal : float
        Threshold the feature is compared against.
    threshIneq : str
        ``'lt'`` labels rows whose feature is ``<= threshVal`` as ``-1.0``;
        any other value labels rows whose feature is ``> threshVal`` as ``-1.0``.

    Returns
    -------
    numpy.ndarray
        Column of shape ``(rows, 1)`` holding ``1.0`` by default and ``-1.0``
        for the rows selected by the threshold test.
    """
    feature = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negative = feature <= threshVal
    else:
        negative = feature > threshVal
    # Start from an all-positive column and flip only the selected rows.
    votes = np.ones((np.shape(dataMatrix)[0], 1))
    votes[negative] = -1.0
    return votes
    

def buildStump(dataArr,classLabels,D):
    """Search for the decision stump with the lowest weighted error.

    Every feature column is scanned with ``numSteps + 2`` evenly spaced
    thresholds (one step below the column minimum up to the column maximum),
    each tried with both the ``'lt'`` and ``'gt'`` inequality.

    Parameters
    ----------
    dataArr : array-like
        Samples, one per row; converted with ``np.asmatrix``.
    classLabels : sequence
        One label per sample; compared for equality with the stump output,
        which is ``1.0`` / ``-1.0``.
    D : matrix
        Column of per-sample weights, shape ``(m, 1)``; the weighted error is
        computed as ``D.T * errArr``.

    Returns
    -------
    tuple
        ``(bestStump, minError, bestClasEst)``: a dict with keys ``'dim'``,
        ``'thresh'`` and ``'ineq'``; the lowest weighted error (a 1x1 matrix,
        or ``np.inf`` if no candidate was evaluated); and the predictions of
        the best stump.
    """
    # np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent call.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = np.inf  # any real weighted error will be smaller
    for i in range(n):  # loop over all feature columns
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j == -1 puts the first threshold just below the column minimum.
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = np.asmatrix(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr  # sum of weights of the mistakes
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
# Fit a single stump on the toy data using uniform sample weights.
dataMat, labels = loadSimpData()
numSamples = len(labels)
# np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent call.
D = np.asmatrix(np.ones((numSamples, 1)) / numSamples)
buildStump(dataMat, labels, D)

({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, matrix([[0.2]]), array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

In [16]:
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    """Train an AdaBoost ensemble whose weak learners are decision stumps.

    Parameters
    ----------
    dataArr : array-like
        Training samples, one per row; forwarded to ``buildStump``.
    classLabels : sequence
        One label per sample. The weight update multiplies labels by stump
        predictions, so labels are expected to be ``1.0`` / ``-1.0``.
    numIt : int, optional
        Maximum number of boosting rounds (default 40). Training stops early
        once the ensemble's training error reaches 0.

    Returns
    -------
    tuple
        ``(weakClassArr, aggClassEst)``: the list of stump dicts (each with
        ``'dim'``, ``'thresh'``, ``'ineq'`` and ``'alpha'``) and the ``(m, 1)``
        matrix of alpha-weighted vote sums on the training data.

    Notes
    -----
    Prints the ensemble's training error rate after every round.
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    # np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent call.
    labelMat = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((m, 1)) / m)  # start with equal sample weights
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump returns the error as a 1x1 matrix; pull out the scalar
        # explicitly because float() on a non-0-d array is deprecated.
        err = float(np.asarray(error).item())
        # max(err, 1e-16) avoids a division by zero when the stump is perfect.
        alpha = float(0.5 * np.log((1.0 - err) / max(err, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # label * prediction is +1 for a hit and -1 for a miss, so hits are
        # scaled by exp(-alpha) and misses by exp(+alpha).
        expon = np.multiply(-1 * alpha * labelMat, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()  # renormalise so the weights sum to 1
        # Accumulate the weighted vote and measure the ensemble's training error.
        aggClassEst += alpha * classEst
        aggErrors = np.multiply(np.sign(aggClassEst) != labelMat, np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ",errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
# Boost for at most 9 rounds on the toy dataset.
classifiers, aggClassEst = adaBoostTrainDS(dataMat, labels, numIt=9)

total error:  0.2
total error:  0.2
total error:  0.0


In [43]:
def adaClassify(datToClass,classifierArr):
    """Classify samples with a trained ensemble of decision stumps.

    Parameters
    ----------
    datToClass : array-like
        Samples to classify, one per row; converted with ``np.asmatrix``.
    classifierArr : list of dict
        Stumps with keys ``'dim'``, ``'thresh'``, ``'ineq'`` and ``'alpha'``,
        as produced by ``adaBoostTrainDS``.

    Returns
    -------
    matrix
        ``(m, 1)`` column holding the sign of the alpha-weighted vote sum for
        each sample.
    """
    # np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent call.
    dataMatrix = np.asmatrix(datToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, int(stump['dim']),
                                 stump['thresh'], stump['ineq'])
        # Each stump's vote counts in proportion to its alpha.
        aggClassEst += stump['alpha'] * classEst
    return np.sign(aggClassEst)

接下来我们使用之前判断马生死的数据进行示例

In [20]:
def loadDataSet(fileName):
    """Parse a tab-delimited file of floats into feature rows and labels.

    The number of fields is taken from the first line of the file. For every
    non-blank line the first ``fields - 1`` values become the feature row and
    the last value becomes the label.

    Parameters
    ----------
    fileName : str
        Path of the tab-delimited text file.

    Returns
    -------
    tuple
        ``(dataMat, labelMat)``: a list of feature lists and a list of labels,
        all converted with ``float``.

    Raises
    ------
    ValueError
        If a field cannot be converted to ``float``.
    """
    dataMat = []
    labelMat = []
    # One context-managed handle: the file is read once and always closed.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t'))  # field count from the first line
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue  # skip blank lines, e.g. a trailing newline at EOF
        curLine = stripped.split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
# Load the horse-colic training set; this rebinds dataMat/labels from the toy data.
dataMat, labels = loadDataSet(fileName='datasets/horseColicTraining2.txt')

In [38]:
# Boost for at most 20 rounds on the horse-colic training data.
classifiers, aggClassEst = adaBoostTrainDS(dataMat, labels, numIt=20)

total error:  0.2842809364548495
total error:  0.2842809364548495
total error:  0.24749163879598662
total error:  0.24749163879598662
total error:  0.25418060200668896
total error:  0.2408026755852843
total error:  0.2408026755852843
total error:  0.22073578595317725
total error:  0.24749163879598662
total error:  0.23076923076923078
total error:  0.2408026755852843
total error:  0.2140468227424749
total error:  0.22742474916387959
total error:  0.21739130434782608
total error:  0.22073578595317725
total error:  0.21739130434782608
total error:  0.22408026755852842
total error:  0.22408026755852842
total error:  0.23076923076923078
total error:  0.22408026755852842


In [45]:
# Evaluate the trained ensemble on the held-out horse-colic test set.
testMat, test_labels = loadDataSet('datasets/horseColicTest2.txt')
predicts = adaClassify(testMat, classifiers)
# One indicator per test sample; summing the entries picked by the mismatch
# mask gives the number of misclassified samples.
errArr = np.asmatrix(np.ones((len(test_labels), 1)))
errArr[predicts != np.asmatrix(test_labels).T].sum()

15.0

67个测试样本中分类错误15个，错误率约为0.22（15/67），之前使用逻辑回归预测马是死是活的错误率是0.34左右