### 决策树

#### 1.信息熵

In [2]:
def get_dataset():
    """
    获得数据集
    """
    data = [
        [0, 0, 0, 0, 'no'],  #数据集
        [0, 0, 0, 1, 'no'],
        [0, 1, 0, 1, 'yes'],
        [0, 1, 1, 0, 'yes'],
        [0, 0, 0, 0, 'no'],
        [1, 0, 0, 0, 'no'],
        [1, 0, 0, 1, 'no'],
        [1, 1, 1, 1, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [1, 0, 1, 2, 'yes'],
        [2, 0, 1, 2, 'yes'],
        [2, 0, 1, 1, 'yes'],
        [2, 1, 0, 1, 'yes'],
        [2, 1, 0, 2, 'yes'],
        [2, 0, 0, 0, 'no']
    ]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况']  #分类属性
    return data, labels

In [3]:
dataset, labels = get_dataset()
labels

['年龄', '有工作', '有自己的房子', '信贷情况']

In [3]:
import math


def get_entropy(dataset):
    """
    根据给定的数据集计算经验熵
    """
    nums_dataset = len(dataset)
    labels = {}
    for data in dataset:
        label = data[-1]
        if label not in labels.keys():
            labels[label] = 0
        labels[label] += 1
    # print(labels)
    # 计算经验熵
    entropy = 0.0
    for key in labels:
        # 概率
        prob = labels[key] / nums_dataset
        entropy -= prob * math.log(prob, 2)
    return entropy

In [53]:
get_entropy(dataset)

{'no': 6, 'yes': 9}


0.9709505944546686

In [4]:
def split_dataset(dataset, axis, value):
    ret_dataset = []
    for feature in dataset:
        if feature[axis] == value:
            reduced_feature = feature[:axis]
            reduced_feature.extend(feature[axis + 1:])
            ret_dataset.append(reduced_feature)
    return ret_dataset

In [8]:
import numpy as np
import collections

dataset = np.array(dataset)
collections.Counter(dataset[:, -1])

Counter({'no': 6, 'yes': 9})

In [13]:
len(dataset[0])

5

In [5]:
def get_info_gain(dataset):
    """
    计算信息增益
    """
    num_dataset=len(dataset)
    num_features = len(dataset[0]) - 1
    entropy = get_entropy(dataset)
    info_gain = {}
    for fe in range(num_features):
        features = [feature[fe] for feature in dataset]  # 取出特征列
        unique_feature = set(features)
        condaition_entropy = 0.0
        for value in unique_feature:
            sub_dataset = split_dataset(dataset, fe, value)
            prob = len(sub_dataset) / float(num_dataset)
            condaition_entropy += prob * get_entropy(sub_dataset)
        info_gain[fe] = entropy - condaition_entropy
        # print(info_gain)
    return info_gain

In [6]:
def get_best_feature(dataset):
    """
    获得最好的特征，即信息增益最大值，以ID3为例
    """
    info_gain=get_info_gain(dataset)
    return sorted(info_gain.items(),key=lambda v:v[1],reverse=True)[0][0]

In [7]:
# %%time
credit_dataset, labels = get_dataset()
get_info_gain(credit_dataset)

{0: 0.08300749985576883,
 1: 0.32365019815155627,
 2: 0.4199730940219749,
 3: 0.36298956253708536}

In [None]:
info_gain=get_info_gain(credit_dataset)
sorted(info_gain.items(),key=lambda v:v[1],reverse=True)[0][0]

In [45]:
def get_major_class(class_list):
    """
        统计类别的数量，并返回数量最多的类别
    """
    class_count = {}
    for vote in class_list:
        if vote not in class_count.keys():
            class_count[vote] = 0
        class_count[vote] += 1
    return sorted(class_count.items(), key=lambda x: x[1], reverse=True)[0][0]

In [73]:
def get_decision_tree(dataset, labels):
    """
        决策树生成
    """
    class_list = [example[-1] for example in dataset]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]  # 终止条件1：样本都是同一类型
    if len(dataset[0]) == 1:
        return get_major_class(class_list)  # 属性用完或分不开情形，使用后验分布

    best_feature = get_best_feature(dataset)
    print(labels, best_feature)
    best_fea_label = labels[best_feature]
    d_tree = {best_fea_label: {}}
    # 删除已经用于分裂的特征
    del (labels[best_feature])
    # 取出特征值
    feature_values = [fe[best_feature] for fe in dataset]
    unique_feature = set(feature_values)
    for v in unique_feature:
        sub_labels = labels[:]
        sub_dataset = split_dataset(dataset, best_feature, v)
        d_tree[best_fea_label][v] = get_decision_tree(sub_dataset, sub_labels)
    return d_tree

In [76]:
credit_dataset, credit_labels = get_dataset()
get_decision_tree(credit_dataset,credit_labels)

['年龄', '有工作', '有自己的房子', '信贷情况'] 2
['年龄', '有工作', '信贷情况'] 1


{'有自己的房子': {0: {'有工作': {0: 'no', 1: 'yes'}}, 1: 'yes'}}

In [9]:
### 使用库函数优化决策树的算法
import numpy as np
import math
import collections

In [61]:
np_dataset = np.array(credit_dataset)
X, y = np_dataset[:, :-1].astype('int64'), np_dataset[:, -1]
X,y

(array([[0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 1],
        [0, 1, 1, 0],
        [0, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 1],
        [1, 1, 1, 1],
        [1, 0, 1, 2],
        [1, 0, 1, 2],
        [2, 0, 1, 2],
        [2, 0, 1, 1],
        [2, 1, 0, 1],
        [2, 1, 0, 2],
        [2, 0, 0, 0]]),
 array(['no', 'no', 'yes', 'yes', 'no', 'no', 'no', 'yes', 'yes', 'yes',
        'yes', 'yes', 'yes', 'yes', 'no'], dtype='<U21'))

In [21]:
entropy = 0.0
feature_dict = collections.Counter(y).values()
nums = sum(feature_dict)
for v in feature_dict:
    prob = v / nums
    entropy -= prob * math.log(prob, 2)
entropy

0.9709505944546686

In [18]:
sum(collections.Counter(y).values())

15

In [None]:
collections.Counter(y)

In [60]:
np_dataset[:, :-1].astype('int64')


array([[0, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 1],
       [0, 1, 1, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 1],
       [1, 1, 1, 1],
       [1, 0, 1, 2],
       [1, 0, 1, 2],
       [2, 0, 1, 2],
       [2, 0, 1, 1],
       [2, 1, 0, 1],
       [2, 1, 0, 2],
       [2, 0, 0, 0]])

In [62]:
x0=X[:,0]
x0

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

In [65]:
np.bincount(X[:,1])

array([10,  5])

In [69]:
y_map={
    "no":0,
    "yes":1
}
np.apply_along_axis(y_map,0,y)

TypeError: 'dict' object is not callable

### numpy 实现


In [1]:
import numpy as np


class Node:
    def __init__(self, left, right, rule):
        self.left = left
        self.right = right
        self.feature = rule[0]
        self.threshold = rule[1]


class Leaf:
    def __init__(self, value):
        """
        `value` is an array of class probabilities if classifier is True, else
        the mean of the region
        """
        self.value = value


class DecisionTree:
    def __init__(
            self,
            classifier=True,
            max_depth=None,
            n_feats=None,
            criterion="entropy",
            seed=None,
    ):
        """
        A decision tree model for regression or classification problems.

        Parameters
        ----------
        classifier : bool (default: True)
            Whether to treat target values as categorical (True) or
            continuous (False)
        max_depth: int (default: None)
            The depth at which to stop growing the tree. If None, grow the tree
            until all leaves are pure.
        n_feats : int (default: None)
            Specifies the number of features to sample on each split. If None,
            use all features on each split.
        criterion : str (default: 'entropy')
            The error criterion to use when calculating splits. When
            `classifier` is False, valid entries are {'mse'}. When `classifier`
            is True, valid entries are {'entropy', 'gini'}.
        seed : int (default: None)
            Seed for the random number generator
        """
        if seed:
            np.random.seed(seed)

        self.depth = 0
        self.root = None

        self.n_feats = n_feats
        self.criterion = criterion
        self.classifier = classifier
        self.max_depth = max_depth if max_depth else np.inf

        if not classifier and criterion in ["gini", "entropy"]:
            raise ValueError(
                "{} is a valid criterion only when classifier = True.".format(
                    criterion))
        if classifier and criterion == "mse":
            raise ValueError(
                "`mse` is a valid criterion only when classifier = False.")

    def fit(self, X, Y):
        """
        Trains a binary decision tree classifier.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            The training data of N examples, each with M features
        Y : numpy array of shape (N,)
            An array of integer labels ranging between [0, n_classes-1] for
            each example in X if `self.classifier`=True else the set of target
            values for each example in X.
        """
        self.n_classes = max(Y) + 1 if self.classifier else None
        self.n_feats = X.shape[1] if not self.n_feats else min(
            self.n_feats, X.shape[1])
        self.root = self._grow(X, Y)

    def predict(self, X):
        """
        Use the trained decision tree to classify or predict the examples in X.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            The training data of N examples, each with M features

        Returns
        -------
        preds : numpy array of shape (N,)
            The integer class labels predicted for each example in X if
            classifier = True, otherwise the predicted target values.
        """
        return np.array([self._traverse(x, self.root) for x in X])

    def predict_class_probs(self, X):
        """
        Use the trained decision tree to return the class probabilities for the
        examples in X.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            The training data of N examples, each with M features

        Returns
        -------
        preds : numpy array of shape (N, n_classes)
            The class probabilities predicted for each example in X
        """
        assert self.classifier, "`predict_class_probs` undefined for classifier = False"
        return np.array([self._traverse(x, self.root, prob=True) for x in X])

    def _grow(self, X, Y):
        # if all labels are the same, return a leaf
        if len(set(Y)) == 1:
            if self.classifier:
                prob = np.zeros(self.n_classes)
                prob[Y[0]] = 1.0
            return Leaf(prob) if self.classifier else Leaf(Y[0])

        # if we have reached max_depth, return a leaf
        if self.depth >= self.max_depth:
            v = np.mean(Y, axis=0)
            if self.classifier:
                v = np.bincount(Y, minlength=self.n_classes) / len(Y)
            return Leaf(v)

        N, M = X.shape
        self.depth += 1
        feat_idxs = np.random.choice(M, self.n_feats, replace=False)

        # greedily select the best split according to `criterion`
        feat, thresh = self._segment(X, Y, feat_idxs)
        l = np.argwhere(X[:, feat] <= thresh).flatten()
        r = np.argwhere(X[:, feat] > thresh).flatten()

        # grow the children that result from the split
        left = self._grow(X[l, :], Y[l])
        right = self._grow(X[r, :], Y[r])
        return Node(left, right, (feat, thresh))

    def _segment(self, X, Y, feat_idxs):
        """
        Find the optimal split rule (feature index and split threshold) for the
        data according to `self.criterion`.
        """
        best_gain = -np.inf
        split_idx, split_thresh = None, None
        for i in feat_idxs:
            vals = X[:, i]
            levels = np.unique(vals)
            thresholds = (levels[:-1] + levels[1:]) / 2
            gains = np.array(
                [self._impurity_gain(Y, t, vals) for t in thresholds])

            if gains.max() > best_gain:
                split_idx = i
                best_gain = gains.max()
                split_thresh = thresholds[gains.argmax()]

        return split_idx, split_thresh

    def _impurity_gain(self, Y, split_thresh, feat_values):
        """
        Compute the impurity gain associated with a given split.

        IG(split) = loss(parent) - weighted_avg[loss(left_child), loss(right_child)]
        """
        global loss
        if self.criterion == "entropy":
            loss = entropy
        elif self.criterion == "gini":
            loss = gini
        elif self.criterion == "mse":
            loss = mse

        parent_loss = loss(Y)

        # generate split
        left = np.argwhere(feat_values <= split_thresh).flatten()
        right = np.argwhere(feat_values > split_thresh).flatten()

        if len(left) == 0 or len(right) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(Y)
        n_l, n_r = len(left), len(right)
        e_l, e_r = loss(Y[left]), loss(Y[right])
        child_loss = (n_l / n) * e_l + (n_r / n) * e_r

        # impurity gain is difference in loss before vs. after split
        ig = parent_loss - child_loss
        return ig

    def _traverse(self, X, node, prob=False):
        if isinstance(node, Leaf):
            if self.classifier:
                return node.value if prob else node.value.argmax()
            return node.value
        if X[node.feature] <= node.threshold:
            return self._traverse(X, node.left, prob)
        return self._traverse(X, node.right, prob)


def mse(y):
    """
    Mean squared error for decision tree (ie., mean) predictions
    """
    return np.mean((y - np.mean(y))**2)


def entropy(y):
    """
    Entropy of a label sequence
    """
    hist = np.bincount(y)
    ps = hist / np.sum(hist)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])


def gini(y):
    """
    Gini impurity (local entropy) of a label sequence
    """
    hist = np.bincount(y)
    N = np.sum(hist)
    return 1 - sum([(i / N)**2 for i in hist])

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.datasets.samples_generator import make_blobs


def test_DecisionTree():
    i = 1
    np.random.seed(12345)
    while True:
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex,
                              centers=n_classes,
                              n_features=n_feats,
                              random_state=i)
            X, X_test, Y, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=i)

            # initialize model
            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            criterion = np.random.choice(["entropy", "gini"])
            mine = DecisionTree(classifier=classifier,
                                max_depth=max_depth,
                                criterion=criterion)
            gold = DecisionTreeClassifier(
                criterion=criterion,
                max_depth=max_depth,
                splitter="best",
                random_state=i,
            )
        else:
            # create regeression problem
            X, Y = make_regression(n_samples=n_ex,
                                   n_features=n_feats,
                                   random_state=i)
            X, X_test, Y, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=i)

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = DecisionTree(criterion=criterion,
                                max_depth=max_depth,
                                classifier=classifier)
            gold = DecisionTreeRegressor(criterion=criterion,
                                         max_depth=max_depth,
                                         splitter="best")

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(
            max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit 'em
        mine.fit(X, Y)
        gold.fit(X, Y)

        # get preds on training set
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)

        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)

        try:
            np.testing.assert_almost_equal(loss_mine, loss_gold)
            print("\tLoss on training: {}".format(loss_mine))
        except AssertionError as e:
            print("\tTraining losses not equal:\n{}".format(e))

        try:
            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            print("\tLoss on test: {}".format(loss_mine_test))
        except AssertionError as e:
            print("\tTest losses not equal:\n{}".format(e))
        i += 1

In [76]:
test_DecisionTree()

Trial 1
	Classifier=False, criterion=mse
	max_depth=1, n_feats=3, n_ex=31
	Loss on training: 8709.329410399534
	Loss on test: 4103.533167451686
Trial 2
	Classifier=True, criterion=gini
	max_depth=2, n_feats=31, n_ex=36
	n_classes: 3
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.0
 DESIRED: 0.09090909090909094
Trial 3
	Classifier=False, criterion=mse
	max_depth=2, n_feats=25, n_ex=60
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 15854.344961985342
 DESIRED: 11995.845959423437
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 22374.86521889598
 DESIRED: 13456.376936406066
Trial 4
	Classifier=False, criterion=mse
	max_depth=2, n_feats=88, n_ex=98
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 18341.679068928388
 DESIRED: 13730.891900511522
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 30131.600387285824
 DESIRED: 

	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.033333333333333326
 DESIRED: 0.0
Trial 38
	Classifier=False, criterion=mse
	max_depth=3, n_feats=38, n_ex=51
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 16876.866489142303
 DESIRED: 4231.498791725337
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 30498.659249724013
 DESIRED: 58750.975472303806
Trial 39
	Classifier=True, criterion=entropy
	max_depth=2, n_feats=31, n_ex=63
	n_classes: 5
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.38636363636363635
 DESIRED: 0.15909090909090906
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.42105263157894735
 DESIRED: 0.26315789473684215
Trial 40
	Classifier=True, criterion=gini
	max_depth=2, n_feats=33, n_ex=75
	n_classes: 2
	Loss on training: 0.0
	Loss on test: 0.0
Trial 41
	Classifier=True, criterion=entropy
	max_depth=3,

	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.0
 DESIRED: 0.052631578947368474
Trial 75
	Classifier=True, criterion=entropy
	max_depth=3, n_feats=56, n_ex=11
	n_classes: 7
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.2857142857142857
 DESIRED: 0.0
	Loss on test: 1.0
Trial 76
	Classifier=True, criterion=gini
	max_depth=1, n_feats=22, n_ex=87
	n_classes: 8
	Loss on training: 0.6833333333333333
	Loss on test: 0.8888888888888888
Trial 77
	Classifier=True, criterion=entropy
	max_depth=1, n_feats=92, n_ex=6
	n_classes: 9
	Loss on training: 0.5
	Loss on test: 1.0
Trial 78
	Classifier=False, criterion=mse
	max_depth=3, n_feats=60, n_ex=90
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 23742.79736723788
 DESIRED: 8997.297581431018
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 39365.75226846165
 DESIRED: 38608.026929013635
Trial 79
	Classifier

	Loss on training: 34048.86951780208
	Loss on test: 47762.84929494157
Trial 115
	Classifier=True, criterion=gini
	max_depth=3, n_feats=33, n_ex=8
	n_classes: 4
	Loss on training: 0.0
	Loss on test: 0.0
Trial 116
	Classifier=True, criterion=entropy
	max_depth=4, n_feats=22, n_ex=37
	n_classes: 7
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.19999999999999996
 DESIRED: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.41666666666666663
 DESIRED: 0.33333333333333337
Trial 117
	Classifier=True, criterion=gini
	max_depth=3, n_feats=22, n_ex=61
	n_classes: 5
	Loss on training: 0.16666666666666663
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.26315789473684215
 DESIRED: 0.3157894736842105
Trial 118
	Classifier=True, criterion=gini
	max_depth=4, n_feats=64, n_ex=48
	n_classes: 3
	Loss on training: 0.0
	Loss on test: 0.0
Trial 119
	Classifier=False, criterion=mse
	max_depth=4, n_feats=32, n_ex=93

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 10763.579708824256
 DESIRED: 1029.9630251130761
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 25167.933281413687
 DESIRED: 17284.92593144478
Trial 156
	Classifier=False, criterion=mse
	max_depth=1, n_feats=33, n_ex=31
	Loss on training: 4801.97617295728
	Loss on test: 10886.422282767331
Trial 157
	Classifier=False, criterion=mse
	max_depth=2, n_feats=11, n_ex=59
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 13416.25814834548
 DESIRED: 10186.505198421093
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 21516.446395051866
 DESIRED: 36256.11070091842
Trial 158
	Classifier=True, criterion=gini
	max_depth=4, n_feats=69, n_ex=8
	n_classes: 3
	Loss on training: 0.0
	Loss on test: 0.0
Trial 159
	Classifier=False, criterion=mse
	max_depth=4, n_feats=2, n_ex=31
	Training losses not equal:

Arrays are not almost equal to 7 decim

	Loss on training: 17138.281368018208
	Loss on test: 19731.1900083892
Trial 194
	Classifier=True, criterion=gini
	max_depth=1, n_feats=86, n_ex=2
	n_classes: 8
	Loss on training: 0.0
	Loss on test: 1.0
Trial 195
	Classifier=True, criterion=entropy
	max_depth=3, n_feats=89, n_ex=20
	n_classes: 8
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.3571428571428571
 DESIRED: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.8333333333333334
 DESIRED: 0.16666666666666663
Trial 196
	Classifier=False, criterion=mse
	max_depth=1, n_feats=6, n_ex=40
	Loss on training: 4873.666297814666
	Loss on test: 10944.727772812083
Trial 197
	Classifier=True, criterion=entropy
	max_depth=1, n_feats=95, n_ex=12
	n_classes: 2
	Loss on training: 0.0
	Loss on test: 0.0
Trial 198
	Classifier=True, criterion=entropy
	max_depth=3, n_feats=62, n_ex=95
	n_classes: 3
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals

	Loss on training: 0.5333333333333333
	Loss on test: 0.8461538461538461
Trial 233
	Classifier=True, criterion=gini
	max_depth=1, n_feats=78, n_ex=76
	n_classes: 3
	Loss on training: 0.28301886792452835
	Loss on test: 0.4782608695652174
Trial 234
	Classifier=True, criterion=gini
	max_depth=1, n_feats=54, n_ex=99
	n_classes: 9
	Loss on training: 0.7246376811594203
	Loss on test: 0.9
Trial 235
	Classifier=False, criterion=mse
	max_depth=3, n_feats=91, n_ex=73
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 22151.498912832554
 DESIRED: 6511.871654905961
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 42538.28397077676
 DESIRED: 73981.97575878142
Trial 236
	Classifier=False, criterion=mse
	max_depth=4, n_feats=66, n_ex=72
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 8884.105451779598
 DESIRED: 1180.6453197013084
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 26021.7764

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 15122.78611548675
 DESIRED: 1887.0745772010928
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 36448.0642513086
 DESIRED: 48955.51290937236
Trial 267
	Classifier=True, criterion=gini
	max_depth=2, n_feats=69, n_ex=55
	n_classes: 2
	Loss on training: 0.0
	Loss on test: 0.0
Trial 268
	Classifier=False, criterion=mse
	max_depth=2, n_feats=87, n_ex=63
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 17907.881873813476
 DESIRED: 10720.83729258959
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 39474.04315654316
 DESIRED: 46807.43759841955
Trial 269
	Classifier=True, criterion=gini
	max_depth=3, n_feats=15, n_ex=52
	n_classes: 4
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.0
 DESIRED: 0.0625
Trial 270
	Classifier=False, criterion=mse
	max_depth=3, n_feats=20, n_ex=23
	Trai

	Loss on training: 0.6862745098039216
	Loss on test: 0.8636363636363636
Trial 303
	Classifier=False, criterion=mse
	max_depth=1, n_feats=44, n_ex=43
	Loss on training: 24775.05305275647
	Loss on test: 16712.48530565897
Trial 304
	Classifier=False, criterion=mse
	max_depth=1, n_feats=75, n_ex=95
	Loss on training: 28354.9727930121
	Loss on test: 39772.60191026967
Trial 305
	Classifier=False, criterion=mse
	max_depth=4, n_feats=46, n_ex=94
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 9650.909165820547
 DESIRED: 2399.9944927835963
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 16445.526679593342
 DESIRED: 22513.326376732763
Trial 306
	Classifier=True, criterion=entropy
	max_depth=2, n_feats=84, n_ex=22
	n_classes: 6
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.4
 DESIRED: 0.19999999999999996
	Loss on test: 0.7142857142857143
Trial 307
	Classifier=True, criterion=entropy
	max_depth=3, n_fea

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 21258.341210289687
 DESIRED: 2806.7062136982304
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 101658.85778815136
 DESIRED: 66407.15129755513
Trial 339
	Classifier=False, criterion=mse
	max_depth=3, n_feats=9, n_ex=68
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 7248.3952144609375
 DESIRED: 3513.4575448852747
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 14888.251526613052
 DESIRED: 11548.89791422524
Trial 340
	Classifier=True, criterion=entropy
	max_depth=2, n_feats=60, n_ex=70
	n_classes: 5
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.34693877551020413
 DESIRED: 0.16326530612244894
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.5238095238095238
 DESIRED: 0.2857142857142857
Trial 341
	Classifier=False, criterion=mse
	max_depth=2, n_feats=56, n_ex=

	Loss on training: 0.52
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.8636363636363636
 DESIRED: 0.9545454545454546
Trial 387
	Classifier=True, criterion=gini
	max_depth=3, n_feats=14, n_ex=61
	n_classes: 2
	Loss on training: 0.0
	Loss on test: 0.0
Trial 388
	Classifier=False, criterion=mse
	max_depth=3, n_feats=85, n_ex=78
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 15398.76894904887
 DESIRED: 7478.822490858262
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 36098.27439378268
 DESIRED: 45499.38248104561
Trial 389
	Classifier=True, criterion=entropy
	max_depth=4, n_feats=29, n_ex=24
	n_classes: 5
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.125
 DESIRED: 0.5
Trial 390
	Classifier=False, criterion=mse
	max_depth=4, n_feats=25, n_ex=22
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 5433.852656745013
 DESIR

	Loss on training: 0.0
	Loss on test: 0.0
Trial 421
	Classifier=False, criterion=mse
	max_depth=2, n_feats=74, n_ex=51
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 32085.455630692333
 DESIRED: 28537.325944753786
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 99204.99599498784
 DESIRED: 126204.04651996262
Trial 422
	Classifier=False, criterion=mse
	max_depth=1, n_feats=81, n_ex=43
	Loss on training: 12963.8025192825
	Loss on test: 14288.12729277804
Trial 423
	Classifier=False, criterion=mse
	max_depth=2, n_feats=5, n_ex=46
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 12652.36872955038
 DESIRED: 9004.884141538805
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 27267.099832754426
 DESIRED: 25589.60836222683
Trial 424
	Classifier=True, criterion=entropy
	max_depth=4, n_feats=71, n_ex=52
	n_classes: 8
	Training losses not equal:

Arrays are not almost equal to 7 dec

	Loss on training: 16207.312670330055
	Loss on test: 46892.85184541416
Trial 460
	Classifier=False, criterion=mse
	max_depth=2, n_feats=32, n_ex=42
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 19879.103794060735
 DESIRED: 18186.23785471192
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 25563.738233136126
 DESIRED: 25699.416297129974
Trial 461
	Classifier=True, criterion=gini
	max_depth=4, n_feats=43, n_ex=58
	n_classes: 4
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.0
 DESIRED: 0.11111111111111116
Trial 462
	Classifier=False, criterion=mse
	max_depth=4, n_feats=27, n_ex=10
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 808.0963554256134
 DESIRED: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 116626.96080202155
 DESIRED: 130220.10669066745
Trial 463
	Classifier=False, criterion=mse
	max_depth=2, n_feats

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 31998.648510525985
 DESIRED: 21751.20429764971
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 28464.89738582984
 DESIRED: 33176.57067200875
Trial 498
	Classifier=False, criterion=mse
	max_depth=3, n_feats=58, n_ex=9
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 53.15254326902596
 DESIRED: 0.07514283288478309
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 43026.38618843128
 DESIRED: 33233.577187034876
Trial 499
	Classifier=True, criterion=entropy
	max_depth=2, n_feats=71, n_ex=82
	n_classes: 8
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.5263157894736843
 DESIRED: 0.4035087719298246
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.8
 DESIRED: 0.6799999999999999
Trial 500
	Classifier=False, criterion=mse
	max_depth=2, n_feats=59, n_ex=58
	Training losses

	Loss on training: 0.21052631578947367
	Loss on test: 0.3529411764705882
Trial 536
	Classifier=False, criterion=mse
	max_depth=2, n_feats=4, n_ex=92
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 996.78820868816
 DESIRED: 825.9686348237773
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 2076.0315527183293
 DESIRED: 2078.8588051903143
Trial 537
	Classifier=True, criterion=gini
	max_depth=1, n_feats=42, n_ex=13
	n_classes: 5
	Loss on training: 0.4444444444444444
	Loss on test: 0.75
Trial 538
	Classifier=True, criterion=entropy
	max_depth=4, n_feats=97, n_ex=29
	n_classes: 6
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.4444444444444444
 DESIRED: 0.6666666666666667
Trial 539
	Classifier=True, criterion=entropy
	max_depth=1, n_feats=59, n_ex=99
	n_classes: 8
	Loss on training: 0.6811594202898551
	Loss on test: 0.9
Trial 540
	Classifier=True, criterion=entropy
	max_depth=4, n_

	Loss on training: 0.0
	Loss on test: 0.0
Trial 577
	Classifier=True, criterion=gini
	max_depth=4, n_feats=78, n_ex=90
	n_classes: 4
	Loss on training: 0.0
	Loss on test: 0.0
Trial 578
	Classifier=False, criterion=mse
	max_depth=3, n_feats=86, n_ex=90
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 12156.051953950591
 DESIRED: 4456.23829487647
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 32401.59811022847
 DESIRED: 37403.2759419366
Trial 579
	Classifier=False, criterion=mse
	max_depth=4, n_feats=67, n_ex=47
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 2247.413510645711
 DESIRED: 246.84783078293862
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 22021.144569112275
 DESIRED: 25475.294395526627
Trial 580
	Classifier=False, criterion=mse
	max_depth=2, n_feats=92, n_ex=33
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 7154.95200124706

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 15228.817512287233
 DESIRED: 10758.575233797406
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 32173.353315497952
 DESIRED: 31613.449391033395
Trial 622
	Classifier=False, criterion=mse
	max_depth=1, n_feats=65, n_ex=15
	Loss on training: 5581.514429923171
	Loss on test: 45328.564481122405
Trial 623
	Classifier=False, criterion=mse
	max_depth=4, n_feats=83, n_ex=92
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 14564.908630077018
 DESIRED: 2867.238159861555
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 44001.62674153375
 DESIRED: 47755.4275523784
Trial 624
	Classifier=True, criterion=entropy
	max_depth=3, n_feats=80, n_ex=95
	n_classes: 2
	Loss on training: 0.0
	Loss on test: 0.0
Trial 625
	Classifier=True, criterion=entropy
	max_depth=2, n_feats=18, n_ex=65
	n_classes: 3
	Loss on training: 0.0
	Loss on test: 0.0
Tri

	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.05882352941176472
 DESIRED: 0.0
Trial 662
	Classifier=True, criterion=gini
	max_depth=1, n_feats=33, n_ex=80
	n_classes: 5
	Loss on training: 0.5357142857142857
	Loss on test: 0.75
Trial 663
	Classifier=False, criterion=mse
	max_depth=1, n_feats=5, n_ex=38
	Loss on training: 5135.541521877391
	Loss on test: 14438.290850886237
Trial 664
	Classifier=True, criterion=gini
	max_depth=2, n_feats=20, n_ex=23
	n_classes: 6
	Loss on training: 0.375
	Loss on test: 0.7142857142857143
Trial 665
	Classifier=True, criterion=entropy
	max_depth=1, n_feats=74, n_ex=94
	n_classes: 6
	Loss on training: 0.6
	Loss on test: 0.7931034482758621
Trial 666
	Classifier=True, criterion=gini
	max_depth=2, n_feats=51, n_ex=95
	n_classes: 6
	Loss on training: 0.43939393939393945
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.6896551724137931
 DESIRED: 0.6551724137931034
Trial 667
	Class

	Loss on training: 0.0
	Loss on test: 0.0
Trial 701
	Classifier=False, criterion=mse
	max_depth=2, n_feats=12, n_ex=39
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 12568.094555442874
 DESIRED: 7313.503322977128
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 14103.887920459789
 DESIRED: 20987.24000489201
Trial 702
	Classifier=False, criterion=mse
	max_depth=2, n_feats=85, n_ex=35
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 9232.581539329023
 DESIRED: 5003.429631114534
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 36836.66385397758
 DESIRED: 38539.69267919327
Trial 703
	Classifier=False, criterion=mse
	max_depth=1, n_feats=14, n_ex=35
	Loss on training: 9858.218958775205
	Loss on test: 18995.028294861495
Trial 704
	Classifier=True, criterion=gini
	max_depth=2, n_feats=7, n_ex=39
	n_classes: 2
	Loss on training: 0.0
	Loss on test: 0.0
Trial 705
	Classifier=Fals

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.3666666666666667
 DESIRED: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.5384615384615384
 DESIRED: 0.0
Trial 746
	Classifier=True, criterion=entropy
	max_depth=1, n_feats=98, n_ex=82
	n_classes: 7
	Loss on training: 0.7017543859649122
	Loss on test: 0.72
Trial 747
	Classifier=False, criterion=mse
	max_depth=3, n_feats=21, n_ex=29
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 6459.228608876934
 DESIRED: 1360.987096923323
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 14123.52761470641
 DESIRED: 25246.194855585938
Trial 748
	Classifier=True, criterion=entropy
	max_depth=4, n_feats=25, n_ex=45
	n_classes: 7
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.25806451612903225
 DESIRED: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.35714285714285

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 5351.308952161601
 DESIRED: 188.0717220154644
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 64751.36292749515
 DESIRED: 58843.93734846978
Trial 782
	Classifier=True, criterion=entropy
	max_depth=3, n_feats=47, n_ex=6
	n_classes: 3
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.5
 DESIRED: 0.0
Trial 783
	Classifier=False, criterion=mse
	max_depth=4, n_feats=37, n_ex=39
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 7140.2039159819
 DESIRED: 1093.2611754716756
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 53919.15476205198
 DESIRED: 71851.7658522686
Trial 784
	Classifier=False, criterion=mse
	max_depth=1, n_feats=86, n_ex=70
	Loss on training: 28725.006868488952
	Loss on test: 57895.878983277085
Trial 785
	Classifier=True, criterion=entropy
	max_depth=2, n_feats=9,

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 5169.972595745179
 DESIRED: 1855.5630810598527
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 34692.22008706483
 DESIRED: 32699.073760706233
Trial 819
	Classifier=True, criterion=gini
	max_depth=1, n_feats=89, n_ex=70
	n_classes: 7
	Loss on training: 0.6734693877551021
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.8571428571428572
 DESIRED: 0.8095238095238095
Trial 820
	Classifier=True, criterion=entropy
	max_depth=2, n_feats=92, n_ex=27
	n_classes: 2
	Loss on training: 0.0
	Loss on test: 0.0
Trial 821
	Classifier=False, criterion=mse
	max_depth=1, n_feats=9, n_ex=73
	Loss on training: 17534.01432920888
	Loss on test: 28272.701600879875
Trial 822
	Classifier=True, criterion=gini
	max_depth=2, n_feats=4, n_ex=48
	n_classes: 8
	Loss on training: 0.5151515151515151
	Loss on test: 0.8666666666666667
Trial 823
	Classifier=True, criterion=gini
	max_dept

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.4411764705882353
 DESIRED: 0.2647058823529411
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.6
 DESIRED: 0.4666666666666667
Trial 858
	Classifier=False, criterion=mse
	max_depth=2, n_feats=10, n_ex=34
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 7527.337120704782
 DESIRED: 5537.166841074517
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 44732.886583863634
 DESIRED: 39066.55589774941
Trial 859
	Classifier=True, criterion=gini
	max_depth=3, n_feats=45, n_ex=45
	n_classes: 4
	Loss on training: 0.0
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.0
 DESIRED: 0.0714285714285714
Trial 860
	Classifier=False, criterion=mse
	max_depth=1, n_feats=30, n_ex=58
	Loss on training: 27907.27257249875
	Loss on test: 23671.163964737127
Trial 861
	Classifier=False, criterion=mse
	max_depth=3, n_feats=1

	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.5217391304347826
 DESIRED: 0.08695652173913049
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 0.6
 DESIRED: 0.15000000000000002
Trial 895
	Classifier=False, criterion=mse
	max_depth=4, n_feats=22, n_ex=24
	Training losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 8225.79974373306
 DESIRED: 20.35180054212683
	Test losses not equal:

Arrays are not almost equal to 7 decimals
 ACTUAL: 212408.34286740964
 DESIRED: 222210.4806730745
Trial 896
	Classifier=True, criterion=gini
	max_depth=2, n_feats=55, n_ex=66
	n_classes: 6
	Loss on training: 0.4782608695652174
	Loss on test: 0.55
Trial 897
	Classifier=True, criterion=entropy
	max_depth=1, n_feats=27, n_ex=78
	n_classes: 9
	Loss on training: 0.7407407407407407
	Loss on test: 0.8333333333333334
Trial 898
	Classifier=False, criterion=mse
	max_depth=3, n_feats=13, n_ex=94
	Training losses not equal:

Arrays are not 

KeyboardInterrupt: 

In [77]:
def my_entropy(y):
    """
    Entropy of a label sequence
    """
    hist = np.bincount(y)
    ps = hist / np.sum(hist)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

In [7]:
n_ex = np.random.randint(2, 100)
n_feats = np.random.randint(2, 100)
max_depth = np.random.randint(1, 5)
classifier = np.random.choice([True, False])
n_classes = np.random.randint(2, 10)

In [9]:
  X, y = make_blobs(n_samples=n_ex,
                              centers=n_classes,
                              n_features=n_feats,
                              random_state=1)

In [10]:
X,y

(array([[ -6.14453188, -11.18715795,   3.72292349, ...,  -3.93485061,
           7.70237032,   1.12245889],
        [ -7.02851559, -10.24460323,   2.53221005, ...,  -4.88520648,
           9.27238209,   1.38084099],
        [ -5.63642576,  -8.62115465,   3.44539663, ...,  -5.00734845,
           7.19787177,   1.89098151],
        ...,
        [-10.02637826,   0.95394909,  -3.89551406, ...,  -8.83740332,
          -8.70051817,  -3.90699393],
        [ -1.1719657 ,   6.1455629 ,  -9.9287427 , ...,   9.00247473,
          -4.01449899,  -5.48723894],
        [-10.98813991,   2.4779139 ,  -4.90068742, ...,  -8.15951149,
          -7.02521209,  -5.10894308]]),
 array([1, 1, 1, 2, 2, 2, 2, 1, 0, 2, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        2, 2, 2, 1, 1, 2, 0, 0, 1, 2, 0, 2]))

In [12]:
X.shape

(34, 49)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)


In [24]:
# initialize model
def loss(yp, y):
    return 1 - accuracy_score(yp, y)


criterion = np.random.choice(["entropy", "gini"])
mine = DecisionTree(classifier=True, max_depth=max_depth, criterion="entropy")

In [25]:
mine.fit(X_train,y_train)
y_pred_mine = mine.predict(X_train)
loss(y_pred_mine, y_train)

TypeError: entropy() takes 1 positional argument but 2 were given

In [None]:
gold = DecisionTreeClassifier(
    criterion=criterion,
    max_depth=max_depth,
    splitter="best",
    random_state=1,
)

In [26]:
### 基尼系数的理解


In [1]:
import pandas as pd
play_data=pd.read_csv("datasets/play")
play_data.head()

Unnamed: 0,Day,Outlook,Temp.,Humidity,Wind,Decision
0,1,Sunny,Hot,High,Weak,No
1,2,Sunny,Hot,High,Strong,No
2,3,Overcast,Hot,High,Weak,Yes
3,4,Rain,Mild,High,Weak,Yes
4,5,Rain,Cool,Normal,Weak,Yes


In [2]:
number,_ = play_data.shape
number

14

In [4]:
import numpy as np
play_data=np.array(play_data)
play_data

array([[1, 'Sunny', 'Hot', 'High', 'Weak', 'No'],
       [2, 'Sunny', 'Hot', 'High', 'Strong', 'No'],
       [3, 'Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       [4, 'Rain', 'Mild', 'High', 'Weak', 'Yes'],
       [5, 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
       [6, 'Rain', 'Cool', 'Normal', 'Strong', 'No'],
       [7, 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
       [8, 'Sunny', 'Mild', 'High', 'Weak', 'No'],
       [9, 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
       [10, 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
       [11, 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
       [12, 'Overcast', 'Mild', 'High', 'Strong', 'Yes'],
       [13, 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
       [14, 'Rain', 'Mild', 'High', 'Strong', 'No']], dtype=object)

In [5]:
X_train, y = play_data[:, 1:-1], play_data[:, -1]
X_train, y

(array([['Sunny', 'Hot', 'High', 'Weak'],
        ['Sunny', 'Hot', 'High', 'Strong'],
        ['Overcast', 'Hot', 'High', 'Weak'],
        ['Rain', 'Mild', 'High', 'Weak'],
        ['Rain', 'Cool', 'Normal', 'Weak'],
        ['Rain', 'Cool', 'Normal', 'Strong'],
        ['Overcast', 'Cool', 'Normal', 'Strong'],
        ['Sunny', 'Mild', 'High', 'Weak'],
        ['Sunny', 'Cool', 'Normal', 'Weak'],
        ['Rain', 'Mild', 'Normal', 'Weak'],
        ['Sunny', 'Mild', 'Normal', 'Strong'],
        ['Overcast', 'Mild', 'High', 'Strong'],
        ['Overcast', 'Hot', 'Normal', 'Weak'],
        ['Rain', 'Mild', 'High', 'Strong']], dtype=object),
 array(['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes',
        'Yes', 'Yes', 'Yes', 'No'], dtype=object))

### 决策树可视化
1. 使用graphviz https://graphviz.gitlab.io/_pages/Download/Download_source.html