In [34]:
import pandas as pd
import numpy as np

## Собственная реализация решающего дерева для регрессии

In [35]:
from sklearn.base import BaseEstimator, RegressorMixin
from numba import njit
class Node:
    """Single node of the decision tree.

    An internal node carries ``feature``/``threshold`` and both children;
    a leaf carries only ``value``.
    """

    __slots__ = ('feature', 'threshold', 'left', 'right', 'value')

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description (internal nodes): feature index and split threshold.
        self.feature, self.threshold = feature, threshold
        # Child subtrees (Node instances) for the left and right branch.
        self.left, self.right = left, right
        # Value stored in the node; the tree code sets it only on leaves.
        self.value = value
        
class MyDecisionTreeRegressor(BaseEstimator, RegressorMixin):
    """Binary regression tree grown greedily by minimising the weighted child impurity.

    Parameters
    ----------
    max_depth : int or None, default=5
        Maximum depth of the tree; ``None`` disables the depth limit.
    min_samples_split : int, default=2
        A node holding fewer samples than this becomes a leaf.
    min_samples_leaf : int, default=1
        Minimum number of samples each child of a split must receive.
    criteria : {'mse'} or callable, default='mse'
        Impurity measure. A callable must accept ``(volume, sum, square_sum)``
        and return a score where lower is better. It is invoked from inside the
        ``@njit``-compiled split search, so it presumably has to be
        numba-compiled as well -- verify before passing a custom one.

    Attributes
    ----------
    root : Node or None
        Root of the fitted tree; ``None`` until ``fit`` has been called.
    """

    def __init__(self, max_depth = 5, min_samples_split = 2, min_samples_leaf = 1, criteria = 'mse'):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        # Keep the constructor argument untouched: BaseEstimator.get_params/clone
        # read it back by name and expect exactly the object that was passed in.
        self.criteria = criteria

    def _resolve_criteria(self):
        """Return the impurity function selected by ``self.criteria``.

        Raises
        ------
        ValueError
            If ``self.criteria`` is neither ``'mse'`` nor a callable.
        """
        if callable(self.criteria):
            return self.criteria
        if self.criteria == 'mse':
            return self.mse_criteria
        raise ValueError(f"Unknown criteria {self.criteria!r}; expected 'mse' or a callable")

    @staticmethod
    @njit
    def mse_criteria(side_volume, side_sum, side_square_sum):
        """Variance of a group of targets computed from its running totals.

        Uses the identity ``Var[y] = E[y^2] - (E[y])^2`` with
        ``E[y] = side_sum / side_volume`` and
        ``E[y^2] = side_square_sum / side_volume``.
        ``side_volume`` must be non-zero.
        """
        return side_square_sum / side_volume - (side_sum / side_volume)**2

    @staticmethod
    @njit
    def best_split_for_feature(sorted_features, sorted_labels, parent_score, min_samples_leaf, criteria):
        """Find the best threshold for one feature whose values are already sorted.

        The scan starts with every sample on the right side and none on the
        left. It then sweeps left to right; after each run of equal feature
        values has been moved to the left side, the split between that run and
        the next value is scored as the size-weighted sum of the two sides'
        ``criteria`` values. The best (lowest) score seen is remembered.

        Parameters
        ----------
        sorted_features : 1-D array
            Feature values in ascending order. Must not contain NaN: a NaN
            never compares equal to itself, so the scan could not advance.
        sorted_labels : 1-D array
            Targets reordered with the same permutation as ``sorted_features``.
        parent_score : float
            Impurity of the unsplit node; a split must score strictly below it.
        min_samples_leaf : int
            Minimum number of samples required on each side.
        criteria : callable
            Impurity function ``(volume, sum, square_sum) -> score``.

        Returns
        -------
        (best_threshold, best_split)
            ``best_threshold`` is ``None`` when no admissible split improves on
            ``parent_score``; ``best_split`` is then equal to ``parent_score``.
        """
        n_samples = len(sorted_features)
        best_threshold = None
        best_split = parent_score
        left_border = 0

        right_sum = np.sum(sorted_labels)
        right_square_sum = np.sum(np.square(sorted_labels))
        right_volume = n_samples
        left_sum = 0.0
        left_square_sum = 0.0
        left_volume = 0
        while left_border < n_samples - 1:
            start_value = sorted_features[left_border]
            j = left_border
            # Move the whole run of values equal to start_value to the left side.
            while j < n_samples and sorted_features[j] == start_value:
                right_sum -= sorted_labels[j]
                right_square_sum -= sorted_labels[j]**2
                left_sum += sorted_labels[j]
                left_square_sum += sorted_labels[j]**2
                right_volume -= 1
                left_volume += 1
                j+=1

            if j >= n_samples:
                # Nothing is left on the right: this is not a split, and scoring
                # it would divide by zero inside the criterion.
                break

            if left_volume >= min_samples_leaf and right_volume >= min_samples_leaf:
                criteria_left = criteria(left_volume, left_sum, left_square_sum)
                criteria_right = criteria(right_volume, right_sum, right_square_sum)
                current_score = left_volume / n_samples * criteria_left + right_volume / n_samples * criteria_right

                if current_score < best_split:
                    best_split = current_score
                    midpoint = (sorted_features[j] + start_value) / 2.0
                    if midpoint >= sorted_features[j]:
                        # Rounding (or overflow) pushed the midpoint onto the
                        # right-hand value; fall back to the left value so that
                        # `x <= threshold` reproduces exactly the scored partition.
                        midpoint = start_value
                    best_threshold = midpoint

            left_border = j
        return best_threshold, best_split

    def find_best_split(self, X, y, indices, parent_score):
        """Search every feature for the split that lowers the impurity the most.

        For each feature the values of the samples listed in ``indices`` are
        sorted, the targets are reordered the same way, and
        ``best_split_for_feature`` scans the sorted column.

        Parameters
        ----------
        X : 2-D array of features.
        y : 1-D array of targets.
        indices : 1-D integer array
            Rows of ``X``/``y`` that belong to the current node.
        parent_score : float
            Impurity of the current node before splitting.

        Returns
        -------
        (best_feature, best_threshold)
            Both are ``None`` when no split scores strictly below ``parent_score``.
        """
        best_feature, best_threshold = None, None
        best_split = parent_score

        n_samples = len(indices)
        # Each child needs min_samples_leaf samples, so fewer than twice that
        # many cannot be split at all.
        if n_samples < 2 * self.min_samples_leaf:
            return best_feature, best_threshold

        labels = y[indices]
        criterion = self._resolve_criteria()

        for feature in range(X.shape[1]):
            feature_values = X[indices, feature]

            order = np.argsort(feature_values)
            current_threshold, current_split = self.best_split_for_feature(
                feature_values[order], labels[order], parent_score, self.min_samples_leaf, criterion
            )
            if current_split < best_split:
                best_feature = feature
                best_threshold = current_threshold
                best_split = current_split
        return best_feature, best_threshold

    def build_tree(self, X, y, indices, depth):
        """Recursively grow the subtree for the samples listed in ``indices``.

        Returns a leaf holding the mean target when the depth limit is reached,
        when the node has fewer than ``min_samples_split`` samples, or when no
        split improves the impurity; otherwise returns an internal ``Node``
        whose children are built from the two sides of the best split.
        """
        labels = y[indices]
        n_samples = len(indices)

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split:
            return Node(value=np.mean(labels))

        criterion = self._resolve_criteria()
        parent_score = criterion(n_samples, np.sum(labels), np.sum(labels ** 2))

        best_feature, best_threshold = self.find_best_split(X, y, indices, parent_score)

        if best_feature is None:  # no split improves on the parent
            return Node(value=np.mean(labels))

        goes_left = X[indices, best_feature] <= best_threshold
        left_subtree = self.build_tree(X, y, indices[goes_left], depth+1)
        right_subtree = self.build_tree(X, y, indices[~goes_left], depth+1)

        return Node(feature = best_feature, threshold = best_threshold, left = left_subtree, right = right_subtree)

    def fit(self, X, y):
        """Build the tree from training data and return ``self``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Raises
        ------
        ValueError
            If the shapes are inconsistent, the data set is empty, the inputs
            contain NaN/infinity, or ``criteria`` is not recognised.
        """
        # float64 everywhere: avoids integer overflow when targets are squared
        # and keeps the numba kernels on a single type specialisation.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got array with shape {X.shape}")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(f"y with shape {y.shape} does not match X with shape {X.shape}")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty data set")
        # NaN would stall the sorted scan (NaN != NaN) and infinities can produce
        # NaN thresholds, so reject both up front.
        if not (np.isfinite(X).all() and np.isfinite(y).all()):
            raise ValueError("Input contains NaN or infinity")
        self._resolve_criteria()  # fail early on an unknown criterion

        self.root = self.build_tree(X, y, indices = np.arange(X.shape[0]), depth = 0)
        return self

    def predict(self, X):
        """Return an array with one prediction per row of ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if self.root is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This MyDecisionTreeRegressor instance is not fitted yet; call 'fit' first."
            )
        X = np.asarray(X)
        return np.array([self.predict_one(x, self.root) for x in X])

    def predict_one(self, obj, node):
        """Route a single sample ``obj`` from ``node`` down to a leaf and return its value."""
        # Iterative descent: same routing rule as training (`<=` goes left).
        while node.value is None:
            node = node.left if obj[node.feature] <= node.threshold else node.right
        return node.value

## Замерим качество работы

In [36]:
from sklearn.datasets import load_diabetes
# Load the diabetes dataset and unpack the feature matrix and the target vector.
data = load_diabetes()
X, y = data.data, data.target
print(data.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


### Моя реализация

In [37]:
%%time
from sklearn.model_selection import cross_val_score
my_tree_regressor = MyDecisionTreeRegressor()
# Scores are negated MSE values, one per fold; the reported figure is the
# square root of their mean. Computed outside the f-string so the cell also
# parses on Python < 3.12 (no quote reuse inside the f-string).
my_cv_scores = cross_val_score(my_tree_regressor, X, y, cv=7, scoring="neg_mean_squared_error")
my_rmse = np.sqrt(-np.mean(my_cv_scores))
print(f'RMSE on cross_validation, (number of folds is 7): {my_rmse}')

RMSE on cross_validation, (number of folds is 7): 65.16165741889287
CPU times: total: 1.02 s
Wall time: 1.05 s


### Готовая реализация в sklearn

In [38]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor(max_depth = 5)
# Scores are negated MSE values, one per fold; the reported figure is the
# square root of their mean. Computed outside the f-string so the cell also
# parses on Python < 3.12 (no quote reuse inside the f-string).
sk_cv_scores = cross_val_score(tree_regressor, X, y, cv=7, scoring="neg_mean_squared_error")
sk_rmse = np.sqrt(-np.mean(sk_cv_scores))
print(f'RMSE on cross_validation, (number of folds is 7): {sk_rmse}')

RMSE on cross_validation, (number of folds is 7): 64.44169422602118
CPU times: total: 46.9 ms
Wall time: 48.8 ms
