In [1]:
import numpy as np


# Load a tab-separated data set and min-max scale its feature columns
def load_data(file_name):
    """Read 21 feature columns plus one label column from a tab-separated file.

    Every non-blank line must hold at least 22 tab-separated fields: fields
    0-20 are stored as floats (features) and field 21 as an int (label).
    Each feature column is then min-max scaled to [0, 1] using that column's
    own minimum and maximum within this file.

    Args:
        file_name: path of the text file to read.

    Returns:
        A tuple (x, y): x is a float array of shape (rows, 21) and y is an
        int array of shape (rows,).
    """
    with open(file_name) as fr:
        # Skip blank lines (for example a trailing empty line); they cannot
        # be parsed as a 21-feature row.
        lines = [line for line in fr if line.strip()]

    x = np.empty((len(lines), 21), dtype=float)
    y = np.empty(len(lines), dtype=int)
    for i, line in enumerate(lines):
        fields = line.strip().split('\t')
        x[i] = fields[:21]
        y[i] = fields[21]

    # An empty data set has nothing to scale (min()/max() would raise).
    if len(lines) == 0:
        return x, y

    # Rescale column by column
    for i in range(x.shape[1]):
        # A view on column i: the in-place operations below update x directly
        column = x[:, i]

        # Shift so the minimum becomes 0
        column -= column.min()

        # Stretch so the maximum becomes 1. A constant column has a range of
        # 0; it is left at all zeros instead of dividing 0 by 0 into NaN.
        column_range = column.max()
        if column_range > 0:
            column /= column_range

    return x, y


# NOTE(review): w and b are not referenced by any other cell visible here —
# presumably leftovers from an earlier linear-model draft; confirm before removing.
w = np.ones(21)
b = 0

# Load the training split; load_data min-max scales every feature column.
x, y = load_data('病马数据_训练.txt')

# Preview the first five rows and labels together with the array shapes.
x[:5], y[:5], x.shape, y.shape

(array([[1.        , 0.        , 0.94362745, 0.35869565, 0.29166667,
         0.75      , 0.75      , 0.        , 0.66666667, 1.        ,
         1.        , 1.        , 0.        , 0.        , 0.        ,
         0.75      , 1.        , 0.6       , 0.09438202, 0.        ,
         0.        ],
        [0.        , 0.        , 0.96078431, 0.47826087, 0.20833333,
         0.        , 0.        , 0.66666667, 0.33333333, 0.6       ,
         1.        , 0.5       , 0.        , 0.        , 0.        ,
         1.        , 0.4       , 0.66666667, 0.95505618, 0.66666667,
         0.1980198 ],
        [1.        , 0.        , 0.93872549, 0.2173913 , 0.25      ,
         0.25      , 0.25      , 0.5       , 0.33333333, 0.6       ,
         0.75      , 0.25      , 0.        , 0.        , 0.        ,
         0.25      , 0.2       , 0.44      , 0.0752809 , 0.        ,
         0.        ],
        [0.        , 1.        , 0.95833333, 0.89130435, 0.875     ,
         1.        , 0.25      , 1.  

In [2]:
N = len(x)

# Per-sample data weights: at initialisation every sample counts equally
D = np.full(N, 1 / N)
D[:5]

array([0.00334448, 0.00334448, 0.00334448, 0.00334448, 0.00334448])

In [3]:
class Tree():
    """Decision stump: thresholds a single feature and votes +1 or -1.

    With eq '<' the stump votes +1 when the feature is strictly below the
    threshold; with eq '>' it votes +1 when the feature is greater than or
    equal to the threshold (the exact complement of '<' for ordinary numbers).
    """

    def __init__(self, col, value, eq):
        """Create a stump.

        Args:
            col: index of the feature to test.
            value: threshold the feature is compared against.
            eq: comparison direction, either '<' or '>'.

        Raises:
            ValueError: if eq is not '<' or '>'.
        """
        # Reject unknown directions up front: otherwise __call__ would fall
        # through every branch and silently return None.
        if eq not in ('<', '>'):
            raise ValueError("eq must be '<' or '>', got %r" % (eq,))
        self.col = col
        self.value = value
        self.eq = eq
        # Vote weight of this stump; starts at 1
        self.weight = 1

    # Prediction: split on a single feature value
    def __call__(self, xi):
        """Return 1 or -1 for the sample xi (an indexable of feature values)."""
        feature = xi[self.col]

        if self.eq == '<':
            return 1 if feature < self.value else -1

        if self.eq == '>':
            return 1 if feature >= self.value else -1

        # Only reachable if eq was reassigned after construction.
        raise ValueError("eq must be '<' or '>', got %r" % (self.eq,))

    def __str__(self):
        return 'Tree{col=%d,value=%.2f,eq=%s,weight=%.2f}' % (
            self.col, self.value, self.eq, self.weight)


# Smoke test: a stump that votes +1 when feature 0 is below 1, applied to the first sample.
tree = Tree(0, 1, '<')
print(tree)
tree(x[0])

Tree{col=0,value=1.00,eq=<,weight=1.00}


-1

In [4]:
# Weighted error of a tree: every misclassified sample contributes its data
# weight, so mistakes on heavily weighted samples are penalised more.
def get_loss(tree):
    """Return the sum of the weights in D over samples that `tree` gets wrong.

    Reads the module-level x (samples), y (labels) and D (data weights).
    """
    return sum(di for xi, yi, di in zip(x, y, D) if tree(xi) != yi)


# Weighted error of the current `tree` under the current data weights D.
get_loss(tree)

0.5618729096989964

In [5]:
# Train one stump: exhaustive search for the split with the lowest weighted loss
def get_tree():
    """Return the Tree with the smallest get_loss over all candidate splits.

    Candidates are every column of the module-level x, both directions
    ('<' and '>'), and thresholds stepping by 0.1 from (column min - 0.1)
    up to, but excluding, (column max + 0.1). Ties keep the first candidate
    found.

    Raises:
        ValueError: if no candidate has a loss below infinity (for example
            when x has no columns or every loss is NaN).
    """
    min_loss = np.inf
    min_tree = None

    # Try every feature column
    for col in range(x.shape[1]):

        # Threshold range for this column; it does not depend on the
        # direction, so compute it once per column.
        col_min = x[:, col].min() - 0.1
        col_max = x[:, col].max() + 0.1

        # Try both comparison directions
        for eq in ['<', '>']:

            # Thresholds are produced by repeated addition (not start + k * step)
            # so the candidate values stay exactly as they always were.
            value = col_min
            while value < col_max:
                tree = Tree(col, value, eq)
                loss = get_loss(tree)

                if loss < min_loss:
                    min_loss = loss
                    min_tree = tree

                value += 0.1

    if min_tree is None:
        raise ValueError('get_tree found no candidate split with a finite loss')

    return min_tree


# Fit the best single stump for the current data weights and show its weighted loss.
tree = get_tree()
print(tree)
get_loss(tree)

Tree{col=9,value=0.70,eq=<,weight=1.00}


0.28428093645484936

In [6]:
# Compute the vote weight of a tree from its weighted loss
def get_tree_weight(tree):
    """Return 0.5 * log((1 - loss) / loss) for `tree` under the current D.

    The lower the loss, the larger the weight. The weight is positive when
    loss < 0.5, zero at loss == 0.5 and negative when loss > 0.5.

    Both the numerator and the denominator are floored at 1e-5, so loss == 0
    and loss == 1 give large finite weights of opposite sign instead of a
    division by zero or log(0) == -inf.
    """
    # Weighted loss under the current data weights
    loss = get_loss(tree)

    # Floor used to keep the ratio strictly positive and finite
    eps = 1e-5
    ratio = max(1 - loss, eps) / max(loss, eps)

    # Work in log space so votes are added rather than multiplied
    return np.log(ratio) / 2


# Store the computed vote weight on the stump, then display it.
tree.weight = get_tree_weight(tree)
tree.weight

0.46166237926576786

In [7]:
# Recompute the per-sample data weights after a tree has been fitted
def get_D(tree):
    """Return a new, normalised weight vector; the global D is not modified.

    Sample i is scaled by exp(tree.weight * -y[i] * tree(x[i])): with a
    positive tree weight the factor is above 1 for a misclassified sample
    and below 1 for a correctly classified one. The factor is always
    positive because it is an exponential.
    """

    def scale(i):
        # Exponent sign is decided by whether label and prediction agree,
        # its magnitude by the tree's own vote weight.
        return np.exp(tree.weight * -y[i] * tree(x[i]))

    # Stretch the current weights by the per-sample factors
    stretched = np.array([D[i] * scale(i) for i in range(N)], dtype=float)

    # Normalise so the weights sum to 1
    return stretched / stretched.sum()


# Preview the re-weighted samples; get_D returns a new array and leaves the global D unchanged.
get_D(tree)[:5]

array([0.00233645, 0.00588235, 0.00233645, 0.00588235, 0.00588235])

In [8]:
def train(n_trees=10):
    """Fit a sequence of weighted stumps (boosting rounds).

    Each round fits the best stump for the current data weights, stores its
    vote weight on the stump, and then replaces the module-level D with the
    re-weighted distribution returned by get_D.

    Args:
        n_trees: number of boosting rounds; defaults to 10.

    Returns:
        The list of fitted trees, in training order.

    Note:
        D is rebound globally and is NOT reset here, so reset it to uniform
        before calling when a fresh training run is wanted.
    """
    global D

    trees = []
    for _ in range(n_trees):
        tree = get_tree()
        tree.weight = get_tree_weight(tree)
        trees.append(tree)

        # Re-weight the samples for the next round
        D = get_D(tree)

    return trees


# Reset the data weights to uniform before training
D.fill(1 / N)
# NOTE: this bare expression is not the last statement of the cell, so it displays nothing.
D

# Train the ensemble and print every fitted stump.
trees = train()
for i in trees:
    print(i)

Tree{col=9,value=0.70,eq=<,weight=0.46}
Tree{col=17,value=0.70,eq=<,weight=0.31}
Tree{col=3,value=0.30,eq=<,weight=0.29}
Tree{col=18,value=0.70,eq=>,weight=0.23}
Tree{col=10,value=0.10,eq=>,weight=0.20}
Tree{col=5,value=0.60,eq=<,weight=0.19}
Tree{col=12,value=0.40,eq=>,weight=0.15}
Tree{col=7,value=0.20,eq=<,weight=0.16}
Tree{col=5,value=0.10,eq=>,weight=0.14}
Tree{col=4,value=0.30,eq=>,weight=0.13}


In [9]:
# Ensemble prediction
def prediction(trees, xi):
    """Return the sign of the weighted vote of `trees` for the sample xi.

    Each tree contributes tree.weight * tree(xi). The weights are already in
    log space, so the votes are added rather than multiplied. The result is
    np.sign of the total: 1, -1, or 0 when the votes cancel out exactly (or
    when `trees` is empty).
    """
    score = sum(tree.weight * tree(xi) for tree in trees)

    # Only the sign of the total vote matters
    return np.sign(score)


# Ensemble prediction for the first sample of x.
prediction(trees, x[0])

-1.0

In [10]:
# Evaluate: count the samples in x whose ensemble prediction equals the label
correct = 0
for xi, yi in zip(x, y):
    pred = prediction(trees, xi)
    if pred == yi:
        correct += 1

# Accuracy; N was computed as len(x), so this is only valid while x is still that same array.
correct / N

0.7692307692307693

In [11]:
# Load the held-out split under its own names. The training arrays x and y
# are read as globals by get_loss, get_tree and get_D, and N and D were sized
# from them, so overwriting x / y here would break any re-run of those cells.
# NOTE(review): load_data rescales each column with this file's own min and
# max, not the training split's statistics — confirm that is intended.
x_test, y_test = load_data('病马数据_测试.txt')

# Evaluate: count the test samples whose ensemble prediction equals the label
correct = 0
for xi, yi in zip(x_test, y_test):
    pred = prediction(trees, xi)
    if pred == yi:
        correct += 1

correct / len(x_test)

0.7611940298507462