In [1]:
from numpy import *

#Load a tab-delimited data file
def loadDataSet(fileName):
    """Read a tab-separated numeric file into feature rows and labels.

    The number of fields is taken from the first line of the file. For every
    line, the first ``numFeat - 1`` fields are parsed as float features and
    the last field is parsed as the float label.

    Args:
        fileName: path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list with one list
        of float features per line; ``labelMat`` is the list of float labels.
        Both are empty when the file is empty.

    Raises:
        ValueError: a field cannot be converted with ``float``.
        IndexError: a line has fewer fields than the first line.
    """
    dataMat = []; labelMat = []
    # Read the file once through a context manager so the handle is always
    # closed (previously it was opened twice and never closed).
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat,labelMat
    numFeat = len(lines[0].split('\t')) #get number of fields
    for line in lines:
        curLine = line.strip().split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat-1)])
        labelMat.append(float(curLine[-1]))
    return dataMat,labelMat
    
#Decision stump (weak learner)
#It simply splits the samples around a threshold on a single feature
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    """Label every sample +1 or -1 by thresholding one feature column.

    Args:
        dataMatrix: 2-D sample container indexed as ``dataMatrix[:, dimen]``;
            the callers in this file pass a numpy matrix.
        dimen: index of the feature column to test.
        threshVal: threshold the feature values are compared against.
        threshIneq: ``'lt'`` marks samples with value <= threshVal as -1;
            any other value marks samples with value > threshVal as -1.

    Returns:
        An ``(m, 1)`` float array holding 1.0 or -1.0 for each of the m samples.
    """
    featureColumn = dataMatrix[:,dimen]
    if threshIneq == 'lt':
        negativeMask = (featureColumn <= threshVal)
    else:
        negativeMask = (featureColumn > threshVal)
    sampleCount = shape(dataMatrix)[0]
    # Start with every sample in the +1 class, then flip the selected ones.
    retArray = ones((sampleCount,1))
    retArray[negativeMask] = -1.0
    return retArray
    
#Loop over the weighted data set and find the decision stump with the lowest error
def buildStump(dataArr,classLabels,D):
    """Find the decision stump with the smallest weighted error.

    For every feature, candidate thresholds are laid out from one step below
    the feature's minimum up to its maximum (12 thresholds, step size
    ``(max - min) / 10``). Each threshold is tried with both the ``'lt'`` and
    the ``'gt'`` rule of ``stumpClassify``. A candidate replaces the current
    best only when its weighted error is strictly smaller.

    Args:
        dataArr: samples, one row per sample (anything ``asmatrix`` accepts).
        classLabels: one label per sample; compared for equality with the
            +1/-1 predictions of ``stumpClassify``.
        D: ``(m, 1)`` numpy matrix of sample weights (``D.T*errArr`` relies on
            matrix multiplication).

    Returns:
        ``(bestStump, minError, bestClasEst)``:
        ``bestStump`` is a dict with keys ``'dim'``, ``'thresh'`` and
        ``'ineq'``; ``minError`` is the weighted error as produced by
        ``D.T*errArr`` (a 1x1 matrix); ``bestClasEst`` is the ``(m, 1)``
        prediction array of the chosen stump. If no candidate beats the
        initial ``inf`` error, the dict is empty, ``minError`` is ``inf`` and
        ``bestClasEst`` is all zeros.
    """
    # asmatrix is used instead of its alias `mat`, which newer NumPy releases
    # no longer provide.
    dataMatrix = asmatrix(dataArr); labelMat = asmatrix(classLabels).T
    m,n = shape(dataMatrix)
    numSteps = 10.0; bestStump = {}; bestClasEst = asmatrix(zeros((m,1)))
    #Start with an infinite minimum error
    minError = inf
    #Loop over every feature dimension
    for i in range(n):
        #Min and max of this feature determine the step size
        rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max()
        stepSize = (rangeMax-rangeMin)/numSteps
        #Walk the candidate thresholds of this feature
        for j in range(-1,int(numSteps)+1):
            # The threshold does not depend on the inequality, so compute it once.
            threshVal = (rangeMin + float(j) * stepSize)
            #Try both 'less than' and 'greater than'
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix,i,threshVal,inequal)
                #errArr is 1 where the prediction is wrong and 0 where it is right
                errArr = asmatrix(ones((m,1)))
                errArr[predictedVals == labelMat] = 0
                #Weighted error: misclassified samples count with their weight in D
                weightedError = D.T*errArr
                #Keep this stump if it is strictly better than the best so far
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump,minError,bestClasEst

#AdaBoost training loop
def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round asks ``buildStump`` for the best stump under the current
    sample weights, gives it the weight
    ``alpha = 0.5 * log((1 - error) / max(error, 1e-16))``, re-weights the
    samples and adds the stump's weighted vote to the running ensemble
    score. The running training error is printed every round and training
    stops early once it reaches zero.

    Args:
        dataArr: training samples, one row per sample.
        classLabels: one label per sample; the sign of the ensemble score is
            compared against these values.
        numIt: maximum number of boosting rounds.

    Returns:
        A list with one dict per round. Each dict is the stump returned by
        ``buildStump`` with an added ``'alpha'`` entry (a float).
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # asmatrix is used instead of its alias `mat`, which newer NumPy releases
    # no longer provide. The label column is the same every round, so it is
    # built once.
    labelMat = asmatrix(classLabels).T
    #Every sample starts with weight 1/m, m being the number of samples
    D = asmatrix(ones((m,1))/m)
    aggClassEst = asmatrix(zeros((m,1)))
    for i in range(numIt):
        #Best stump for the current weights
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)
        # buildStump reports the error as a 1x1 matrix; reduce it to a scalar.
        errorValue = asarray(error).item()
        # Explicit clamp instead of max(error,1e-16): with this file's
        # `from numpy import *` the name `max` may be bound to numpy.max,
        # whose second positional argument is `axis`. The clamp also guards
        # against division by zero when the stump makes no mistakes.
        clampedError = 1e-16 if 1e-16 > errorValue else errorValue
        alpha = float(0.5*log((1.0-errorValue)/clampedError))
        bestStump['alpha'] = alpha
        #Keep this round's stump (feature, threshold, inequality and alpha)
        weakClassArr.append(bestStump)
        #New sample weights: exponent is -alpha where the stump was right, +alpha where it was wrong
        expon = multiply(-1*alpha*labelMat,classEst)
        D = multiply(D,exp(expon))
        D = D/D.sum()
        #Add this stump's weighted vote to the running ensemble score
        aggClassEst += alpha*classEst
        aggErrors = multiply(sign(aggClassEst) != labelMat,ones((m,1)))
        #Stop as soon as the ensemble classifies every training sample correctly
        errorRate = aggErrors.sum()/m
        print ("total error: ",errorRate)
        if errorRate == 0.0: break
    return weakClassArr

#Classification function
def adaClassify(datToClass,classifierArr,verbose=True):
    """Classify samples with a trained list of weighted decision stumps.

    Every stump votes through ``stumpClassify``; the votes are scaled by the
    stump's ``'alpha'`` and summed, and the sign of the sum is the prediction.

    Args:
        datToClass: samples to classify, one row per sample.
        classifierArr: iterable of dicts with keys ``'dim'``, ``'thresh'``,
            ``'ineq'`` and ``'alpha'`` (the format returned by
            ``adaBoostTrainDS``).
        verbose: when true (the default, matching the previous behaviour) the
            running score matrix is printed after every stump; pass ``False``
            to silence that output.

    Returns:
        An ``(m, 1)`` matrix with the sign of the accumulated score for each
        sample (0 where the score is exactly zero, e.g. with no classifiers).
    """
    # asmatrix is used instead of its alias `mat`, which newer NumPy releases
    # no longer provide.
    dataMatrix = asmatrix(datToClass)
    m = shape(dataMatrix)[0]
    #Running weighted score of the ensemble
    aggClassEst = asmatrix(zeros((m,1)))
    for classifier in classifierArr:
        classEst = stumpClassify(dataMatrix, classifier['dim'],
                                 classifier['thresh'],
                                 classifier['ineq'])
        aggClassEst += classifier['alpha']*classEst
        if verbose:
            print (aggClassEst)
    #sign() turns the real-valued score into a class label
    return sign(aggClassEst)



训练adaboost模型

In [4]:
#Train with at most 10 boosting rounds, i.e. up to 10 weak classifiers
datArr, labelArr = loadDataSet(fileName='horseColicTraining2.txt')
classifierArray = adaBoostTrainDS(dataArr=datArr, classLabels=labelArr, numIt=10)

total error:  0.2842809364548495
total error:  0.2842809364548495
total error:  0.24749163879598662
total error:  0.24749163879598662
total error:  0.25418060200668896
total error:  0.2408026755852843
total error:  0.2408026755852843
total error:  0.22073578595317725
total error:  0.24749163879598662
total error:  0.23076923076923078


使用上述得到的adaboost模型（加权弱分类器集合）来预测测试数据

In [12]:
testArr, testLabel = loadDataSet('horseColicTest2.txt')
prediction10 = adaClassify(testArr, classifierArray)
#Compute the error rate; the sample count comes from the loaded labels
#instead of a hard-coded 67, so other test files work too
numTest = len(testLabel)
errArr = asmatrix(ones((numTest,1)))
#Sum the ones at the positions where prediction and label disagree
errCount = errArr[prediction10 != asmatrix(testLabel).T].sum()
print('test error rate is(%d test examples in total): ' % numTest,(errCount/numTest))

[[ 0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [-0.46166238]
 [-0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [-0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166238]
 [ 0.46166