In [1]:
'''
Created on Feb 4, 2011
Tree-Based Regression Methods
@author: Peter Harrington
'''
from numpy import *


def loadDataSet(fileName):  #general function to parse tab -delimited floats
    dataMat = []  #assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        for i in range(len(curLine)):
            curLine[i] = float(curLine[i])
        dataMat.append(curLine)
    return dataMat


def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :][0]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :][0]
    return mat0, mat1


def regLeaf(dataSet):  #returns the value used for each leaf
    return mean(dataSet[:, -1])


def regErr(dataSet):
    return var(dataSet[:, -1]) * shape(dataSet)[0]


def linearSolve(dataSet):  #helper function used in two places
    m, n = shape(dataSet)
    X = mat(ones((m, n)))
    Y = mat(ones((m, 1)))  #create a copy of data with 1 in 0th postion
    X[:, 1:n] = dataSet[:, 0:n - 1]
    Y = dataSet[:, -1]  #and strip out Y
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


def modelLeaf(dataSet):  #create linear model and return coeficients
    ws, X, Y = linearSolve(dataSet)
    return ws


def modelErr(dataSet):
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat, 2))


def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    tolS = ops[0]
    tolN = ops[1]
    #if all the target variables are the same value: quit and return value
    if len(set(dataSet[:, -1].T.A.tolist()[0])) == 1:  #exit cond 1
        print('1111111')
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    #the choice of the best feature is driven by Reduction in RSS error from mean
    S = errType(dataSet)
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):
        for splitVal in unique(dataSet[:, featIndex]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    #if the decrease (S-bestS) is less than a threshold don't do the split
    if (S - bestS) < tolS:
        print('tolS=', tolS)
        print('2222222')
        return None, leafType(dataSet)  #exit cond 2
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  #exit cond 3
        print('3333333')
        return None, leafType(dataSet)
    return bestIndex, bestValue  #returns the best feature to split on
    #and the value used for that split


def createTree(
    dataSet,
    leafType=regLeaf,
    errType=regErr,
    ops=(0, 4)):  #assume dataSet is NumPy Mat so we can array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType,
                                ops)  #choose the best split

    print('feat, val=', feat, val)
    if feat == None:
        return val  #if the splitting hit a stop condition return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree


def isTree(obj):
    return (type(obj).__name__ == 'dict')


def getMean(tree):
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0


def prune(tree, testData):
    if shape(testData)[0] == 0:
        return getMean(tree)  #if we have no test data collapse the tree
    if (isTree(tree['right']) or isTree(
            tree['left'])):  #if the branches are not trees try to prune them
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    #if they are now both leafs, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'],2)) +\
            sum(power(rSet[:,-1] - tree['right'],2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print("merging")
            return treeMean
        else:
            return tree
    else:
        return tree


def regTreeEval(model, inDat):
    return float(model)


def modelTreeEval(model, inDat):
    n = shape(inDat)[1]
    X = mat(ones((1, n + 1)))
    X[:, 1:n + 1] = inDat
    return float(X * model)


def treeForeCast(tree, inData, modelEval=regTreeEval):
    if not isTree(tree): return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
        if isTree(tree['left']):
            return treeForeCast(tree['left'], inData, modelEval)
        else:
            return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']):
            return treeForeCast(tree['right'], inData, modelEval)
        else:
            return modelEval(tree['right'], inData)


def createForeCast(tree, testData, modelEval=regTreeEval):
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat

In [2]:
testMat = mat(eye(4))
testMat

matrix([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

In [3]:
mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
mat0, mat1

(matrix([[0., 1., 0., 0.]]), matrix([[1., 0., 0., 0.]]))

In [4]:
myDat = loadDataSet('ex00.txt')
myMat = mat(myDat)
myMat

matrix([[ 3.609800e-02,  1.550960e-01],
        [ 9.933490e-01,  1.077553e+00],
        [ 5.308970e-01,  8.934620e-01],
        [ 7.123860e-01,  5.648580e-01],
        [ 3.435540e-01, -3.717000e-01],
        [ 9.801600e-02, -3.327600e-01],
        [ 6.911150e-01,  8.343910e-01],
        [ 9.135800e-02,  9.993500e-02],
        [ 7.270980e-01,  1.000567e+00],
        [ 9.519490e-01,  9.452550e-01],
        [ 7.685960e-01,  7.602190e-01],
        [ 5.413140e-01,  8.937480e-01],
        [ 1.463660e-01,  3.428300e-02],
        [ 6.731950e-01,  9.150770e-01],
        [ 1.835100e-01,  1.848430e-01],
        [ 3.395630e-01,  2.067830e-01],
        [ 5.179210e-01,  1.493586e+00],
        [ 7.037550e-01,  1.101678e+00],
        [ 8.307000e-03,  6.997600e-02],
        [ 2.439090e-01, -2.946700e-02],
        [ 3.069640e-01, -1.773210e-01],
        [ 3.649200e-02,  4.081550e-01],
        [ 2.955110e-01,  2.882000e-03],
        [ 8.375220e-01,  1.229373e+00],
        [ 2.020540e-01, -8.774400e-02],


In [5]:
regTrees = createTree(myMat)
regTrees

tolS= 0
2222222
feat, val= None 0.571743005


0.571743005

In [6]:
def linearSolve(dataSet):  #helper function used in two places
    m, n = shape(dataSet)
    X = mat(ones((m, n)))
    Y = mat(ones((m, 1)))  #create a copy of data with 1 in 0th postion
    X[:, 1:n] = dataSet[:, 0:n - 1]
    Y = dataSet[:, -1]  #and strip out Y
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


def modelLeaf(dataSet):  #create linear model and return coeficients
    ws, X, Y = linearSolve(dataSet)
    return ws


def modelErr(dataSet):
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat, 2))

