In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)


# Tree-based Regression (回归树)
树的结构有助于解决非线性问题。之前分类问题使用的决策树基于熵来划分（ID3算法），这次我们介绍另一个算法——CART。CART（Classification And Regression Trees）是分类和回归树的综合体，可以同时处理离散以及连续问题：离散特征的部分，CART采用基尼指数进行划分；连续特征的部分，采用均方误差乘以数据个数（即总平方误差）来决定切分点。

### 优缺点

优点：可解决复杂、非线性的数据   
缺点：较难解释模型

### 整理

##### ID3算法
- 每次选取最佳特征分割数据，按照该特征的所有可能值来切分。如果一个特征有4个取值，数据将被切为4份
- 某特征切分后，该特征在之后算法将不会再起作用，这种切分被认为过于迅速
- ID3不能处理连续型特征。只能事先把连续型转换成离散型才能使用。
- 使用香农熵来度量集合的无组织程度

##### Cart算法
- 二元切分法，每次把数据集切成2份
- 如果某个特征值大于切分要求的值，那数据进入左子树，反之（小于等于）进入右子树
- 易于对树构建过程进行调整以处理连续型数据
- 采用平方误差的总值（总方差）计算连续型数值的混乱度

接下来我们先定义数据结构，大概是如下形式

In [2]:
class treeNode():
    """Illustrative node of a binary tree built by splitting on one feature.

    The functions later in this notebook build trees from plain dicts; this
    class only sketches the same four pieces of information.

    Attributes:
        featureToSplitOn: the `feat` argument (feature used for the split).
        valueOfSplit: the `val` argument (value the feature is split at).
        rightBranch: the `right` argument (right child).
        leftBranch: the `left` argument (left child).
    """

    def __init__(self, feat, val, right, left):
        # Bind to the instance; bare assignments would only create locals
        # that vanish when __init__ returns.
        self.featureToSplitOn = feat
        self.valueOfSplit = val
        self.rightBranch = right
        self.leftBranch = left

In [37]:
def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Args:
        fileName: path of the file to read.

    Returns:
        A list containing one list of floats per non-blank line. The rest of
        this notebook treats the last column as the target value.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no data.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

def regLeaf(dataSet):
    """Leaf value of a regression tree: the mean of the last column."""
    targets = dataSet[:, -1]
    return np.mean(targets)

def regErr(dataSet):
    """Total squared error of the last column around its mean.

    Computed as the variance of the last column times the number of rows.
    """
    rowCount = np.shape(dataSet)[0]
    targetVariance = np.var(dataSet[:, -1])
    return targetVariance * rowCount

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Pick the (feature, value) binary split with the lowest total error.

    Args:
        dataSet: NumPy matrix; every column but the last is a feature and the
            last column is the target.
        leafType: callable that turns a data set into a leaf value.
        errType: callable that scores a data set (lower is better).
        ops: pair (tolS, tolN) - minimum error reduction required to accept a
            split, and minimum number of rows allowed on each side of it.

    Returns:
        (featureIndex, splitValue) for the chosen split, or
        (None, leafType(dataSet)) when a stop condition says to make a leaf.
    """
    minErrorDrop = ops[0]
    minSamples = ops[1]

    # Stop condition 1: every target is the same value, nothing to split.
    targets = dataSet[:,-1].T.tolist()[0]
    if len(set(targets)) == 1:
        return None, leafType(dataSet)

    _, nCols = np.shape(dataSet)
    # Error of the unsplit data; a split must beat this by at least tolS.
    baseError = errType(dataSet)
    lowestError = np.inf
    chosenFeat = 0
    chosenVal = 0
    for col in range(nCols - 1):
        # Every distinct value observed in this column is a candidate threshold.
        candidates = set(dataSet[:,col].flatten().A[0])
        for threshold in candidates:
            upper, lower = binSplitDataSet(dataSet, col, threshold)
            tooSmall = (np.shape(upper)[0] < minSamples) or (np.shape(lower)[0] < minSamples)
            if tooSmall:
                continue
            splitError = errType(upper) + errType(lower)
            if splitError < lowestError:
                chosenFeat = col
                chosenVal = threshold
                lowestError = splitError

    # Stop condition 2: the best split does not reduce the error enough.
    if (baseError - lowestError) < minErrorDrop:
        return None, leafType(dataSet)

    # Stop condition 3: the chosen split leaves too few rows on one side.
    upper, lower = binSplitDataSet(dataSet, chosenFeat, chosenVal)
    if (np.shape(upper)[0] < minSamples) or (np.shape(lower)[0] < minSamples):
        return None, leafType(dataSet)

    return chosenFeat, chosenVal

def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of dataSet on one feature column.

    Returns:
        (mat0, mat1): rows whose `feature` column is strictly greater than
        `value`, and rows whose `feature` column is less than or equal to it.
    """
    column = dataSet[:, feature]
    greaterRows = np.nonzero(column > value)[0]
    otherRows = np.nonzero(column <= value)[0]
    return dataSet[greaterRows, :], dataSet[otherRows, :]

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a binary tree by repeatedly choosing the best split.

    Args:
        dataSet: NumPy matrix (array filtering is used to split it); the last
            column is the target.
        leafType: callable that turns a data set into a leaf value.
        errType: callable that scores a data set.
        ops: (tolS, tolN) stop thresholds forwarded to chooseBestSplit.

    Returns:
        Either a leaf value (whatever leafType returned) or a dict with keys
        'spInd' (feature index), 'spVal' (split value), 'left' (subtree for
        rows with feature > spVal) and 'right' (subtree for the other rows).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # Identity check: None is a singleton, and `==` could be overridden by
    # array-like operands.
    if feat is None:
        return val  # a stop condition was hit, so `val` is the leaf value
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

# Sanity check: split a 4x4 identity matrix on column 1 at threshold 0.5.
# np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
testMat=np.asmatrix(np.eye(4))
mat0, mat1=binSplitDataSet(testMat,1,0.5)
# Last expression of the cell: show the input and both partitions.
testMat,mat0, mat1

(matrix([[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]),
 matrix([[0., 1., 0., 0.]]),
 matrix([[1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]))

In [38]:
# Build a regression tree on ex00.txt with the default stop thresholds.
# np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
myData=loadDataSet('./datasets/ex00.txt')
myMat=np.asmatrix(myData)
createTree(myMat)

{'left': 1.0180967672413792,
 'right': -0.04465028571428572,
 'spInd': 0,
 'spVal': 0.48813}

In [39]:
# Build a regression tree on ex0.txt with the default stop thresholds.
# np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
myData=loadDataSet('./datasets/ex0.txt')
myMat=np.asmatrix(myData)
createTree(myMat)

{'left': {'left': {'left': 3.9871632,
   'right': 2.9836209534883724,
   'spInd': 1,
   'spVal': 0.797583},
  'right': 1.980035071428571,
  'spInd': 1,
  'spVal': 0.582002},
 'right': {'left': 1.0289583666666666,
  'right': -0.023838155555555553,
  'spInd': 1,
  'spVal': 0.197834},
 'spInd': 1,
 'spVal': 0.39435}

由于树有太多叶子的话容易过拟合，所以我们需要剪掉一些叶子来避免过拟合，剪枝有分预剪枝和后剪枝

##### 预剪枝
- 设定阈值，在建树过程中提前停止切分：若切分后某个子集的样本数小于阈值，或切分带来的误差（如基尼指数、平方误差）下降小于阈值，则不再切分，直接生成叶节点；否则继续切分
- 代码的部分可以透过修改ops来进行调参

##### 后剪枝
- 先用训练集训练好一棵树，再利用测试集进行剪枝
- 自底向上，如果合并会降低误差的话，则合并

In [50]:
def isTree(obj):
    """Return True when obj is an internal tree node (a dict), else False.

    Leaves are stored as plain values, so anything that is not a dict is
    treated as a leaf. isinstance also accepts dict subclasses and, unlike a
    comparison of the type's name, cannot be fooled by an unrelated class
    that merely happens to be called 'dict'.
    """
    return isinstance(obj, dict)

def getMean(tree):
    """Collapse a tree into a single value by averaging its two branches.

    Any branch that is itself a tree is first collapsed recursively, and the
    collapsed value is written back into `tree` (the input is mutated).
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left']+tree['right'])/2.0
    
def prune(tree, testData):
    """Post-prune a tree bottom-up using held-out test data.

    Child subtrees are pruned first (and written back into `tree`). When both
    children end up as leaves, they are merged into their average if that
    gives a lower squared error on testData than keeping them separate.

    Args:
        tree: dict node with keys 'spInd', 'spVal', 'left', 'right'.
        testData: matrix of test rows reaching this node; last column is the
            target.

    Returns:
        The (possibly modified) tree, or a single merged leaf value.
    """
    # No test rows reach this node: collapse the whole subtree to its mean.
    if np.shape(testData)[0] == 0:
        return getMean(tree)

    # At least one child is a subtree: route the test rows down and prune it.
    if isTree(tree['right']) or isTree(tree['left']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)

    # A child is still a subtree after pruning, so nothing can be merged here.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    # Both children are leaves: compare the test error with and without merging.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    leftResidual = lSet[:,-1] - tree['left']
    rightResidual = rSet[:,-1] - tree['right']
    errorNoMerge = sum(np.power(leftResidual,2)) + sum(np.power(rightResidual,2))
    treeMean = (tree['left']+tree['right'])/2.0
    errorMerge = sum(np.power(testData[:,-1] - treeMean,2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree

In [45]:
# Grow a deliberately large tree on ex2.txt: ops=(0,1) sets tolS=0 and tolN=1,
# i.e. any error reduction and any subset size is accepted.
# np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
myData=loadDataSet('./datasets/ex2.txt')
myMat=np.asmatrix(myData)
myTree=createTree(myMat,ops=(0,1))

In [46]:
# Load the held-out data used for post-pruning.
# np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
myDataTest=loadDataSet('./datasets/ex2test.txt')
myMatTest=np.asmatrix(myDataTest)

In [51]:
# Post-prune the tree with the held-out data. prune() writes pruned children
# back into myTree and prints "merging" once for every pair of leaves it merges.
prune(myTree,myMatTest)

merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging


{'left': {'left': {'left': {'left': 92.5239915,
    'right': {'left': {'left': {'left': 112.386764,
       'right': 123.559747,
       'spInd': 0,
       'spVal': 0.960398},
      'right': 135.837013,
      'spInd': 0,
      'spVal': 0.958512},
     'right': 111.2013225,
     'spInd': 0,
     'spVal': 0.956951},
    'spInd': 0,
    'spVal': 0.965969},
   'right': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': 96.41885225,
              'right': 69.318649,
              'spInd': 0,
              'spVal': 0.948822},
             'right': {'left': {'left': 110.03503850000001,
               'right': {'left': 65.548418,
                'right': {'left': 115.753994,
                 'right': {'left': {'left': 94.3961145,
                   'right': 85.005351,
                   'spInd': 0,
                   'spVal': 0.912161},
                  'right': {'left': {'left': 106.814667,
                    'right': 118.513475,
               

# Tree-based Model (模型树)
有时候，我们会遇到分段线性（piecewise linear）的特征，分段线性如下图所示
![](./imgs/piecewise_linear.png)
数据在某几段之间是线性的，我们可以找到线性斜率变换的点进行切分

In [52]:
def linearSolve(dataSet):
    """Fit an ordinary least-squares linear model to dataSet.

    Args:
        dataSet: 2-D data (NumPy matrix, array or nested list); every column
            but the last is a feature and the last column is the target.

    Returns:
        (ws, X, Y): the weight column vector (intercept first), the design
        matrix with a leading column of ones, and the target column.

    Raises:
        NameError: if the determinant of X^T X is exactly zero, so it cannot
            be inverted. (The exception type is kept for existing callers.)
    """
    # asmatrix makes `*` mean matrix multiplication and keeps Y a column even
    # when a plain ndarray is passed; it does not copy an existing matrix.
    data = np.asmatrix(dataSet)
    m, n = np.shape(data)
    # Column 0 stays at 1 (intercept term); columns 1..n-1 hold the features.
    X = np.asmatrix(np.ones((m, n)))
    X[:, 1:n] = data[:, 0:n-1]
    Y = data[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

def modelLeaf(dataSet):
    """Leaf value of a model tree: the linear-model coefficients for dataSet."""
    coefficients, _, _ = linearSolve(dataSet)
    return coefficients

def modelErr(dataSet):
    """Error of a model-tree leaf: summed squared residuals of the linear fit."""
    weights, features, targets = linearSolve(dataSet)
    residuals = targets - features * weights
    return sum(np.power(residuals, 2))

In [53]:
# Build a model tree on exp2.txt: leaves hold linear-model coefficients
# (modelLeaf) and splits are scored by the linear fit's squared error (modelErr).
# np.asmatrix replaces np.mat, which is no longer available in NumPy 2.x.
myData=loadDataSet('./datasets/exp2.txt')
myMat=np.asmatrix(myData)
createTree(myMat,modelLeaf,modelErr)

{'left': matrix([[1.69855694e-03],
         [1.19647739e+01]]), 'right': matrix([[3.46877936],
         [1.18521743]]), 'spInd': 0, 'spVal': 0.285477}