In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import california_housing

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold

import matplotlib.pyplot as plt

    Задача: построить алгоритма градиентного бустинга с квадратичной функцией потерь, в качестве базового алгоритма использовать алгоритм CART.
   

In [259]:
class MyDecisionTreeRegr:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
                 max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        
        self.tree = dict()  # Номер узла: информация об узле.
        
        self.Leaf = True
        self.notLeaf = False
        
    def _calculate_impurity(self, y):
        return np.var(y)
        
#     def _search_split(self, X, y):
#         best_criterion = float('-inf')
#         best_feature = None
#         best_threshold = None
        
#         node_impurity = self._calculate_impurity(y)
        
#         for col in range(X.shape[1]):
#             feature_level = np.unique(X[:, col])
# #             thresholds = X[:, col]
#             # TODO: не очень понимаю зач здесь так сделано, надо разобраться.
#             thresholds = (feature_level[1:] + feature_level[:-1]) / 2.0
            
#             for threshold in thresholds:
#                 y_left = y[X[:, col] <= threshold]
#                 left_impurity = self._calculate_impurity(y_left)
#                 N_left = y_left.shape[0] / y.shape[0]
                
#                 y_right = y[X[:, col] > threshold]
#                 right_impurity = self._calculate_impurity(y_right)
#                 N_right = y_right.shape[0] / y.shape[0]
                
#                 impurity_criterion = node_impurity - N_left * left_impurity \
#                                     - N_right * right_impurity
                    
#                 if impurity_criterion > best_criterion:
#                     best_criterion = impurity_criterion
#                     best_feature = col
#                     best_threshold = threshold
        
#         if best_criterion == float('-inf'):
#             return best_feature, best_threshold, False
                    
#         return best_feature, best_threshold, True 

    def _search_split(self, X, y):
        sorted_idx = np.argsort(X, axis=0)
        sortedX = np.sort(X, axis=0)
        sortedY = y[sorted_idx]
        
        # Дисперсии для левого поддерева и правого
        # объединяются вместе.
        sumY = np.cumsum(sortedY, axis=0)
        sumY_2 = np.cumsum(sortedY**2, axis=0)
        revsumY = np.cumsum(sortedY[::-1], axis=0)[::-1]
        revsumY_2 = np.cumsum(sortedY[::-1]**2, axis=0)[::-1]
        
        length = np.array(range(1, X.shape[0]+1)).reshape(-1, 1)
        left_crit = sumY_2 - 2 * sumY**2 / length + sumY**2 / length**2
        right_crit = revsumY_2 - 2 * revsumY**2 / length[::-1] + revsumY**2 / length[::-1]**2
        
        crit = left_crit
        # Т.к. разделение в правую ветку идет со знаком <.
        crit[:-1] += right_crit[1:]
        
        # Удалить неинформативные фичи.
        bad_idx = np.where(np.max(sortedX, axis=0) - np.min(sortedX, axis=0) <= 0.0001)[0]
        if bad_idx.shape[0] > 0:
            crit[:, bad_idx] = np.nan
        
#         close_feats = np.isclose(sortedX, np.roll(sortedX, shift=-1, axis=0))
#         crit[close_feats] = np.nan
#         bad_idx = np.hstack([bad_idx, close_feats])
        
        # Если все фичи неинформативные, то надо строить лист.
        if bad_idx.shape[0] == X.shape[1]:
            return None, None, False
        
        feat_flat_idx = np.nanargmin(crit)
        thrl_id, feat_id = np.unravel_index(feat_flat_idx, crit.shape)
        
        return feat_id, sortedX[thrl_id, feat_id], True
        
    def _fit_node(self, X, y, node_id, depth):
        # Если выполнен критерий остановки, создать листовую вершину.
        if (X.shape[0] < self.min_samples_split or depth == self.max_depth):
            self.tree[node_id] = [self.Leaf, np.mean(y)]
            return
        
        feature_id, threshold, isOkay = self._search_split(X, y)
        
        # Значит скорее всего все фичи уже одинаковые в этом поддереве, двигаться некуда.
        if isOkay is False:
            self.tree[node_id] = [self.Leaf, np.mean(y)]
            return
        
        X_left, y_left = X[X[:, feature_id] >= threshold], y[X[:, feature_id] >= threshold]
        X_right, y_right = X[X[:, feature_id] < threshold], y[X[:, feature_id] < threshold]
        
        if (X_left.shape[0] < self.min_samples_leaf or
            X_right.shape[0] < self.min_samples_leaf):

            self.tree[node_id] = [self.Leaf, np.mean(y)]            
        else:
            self.tree[node_id] = [self.notLeaf, feature_id, threshold]
            self._fit_node(X_left, y_left, 2*node_id+1, depth+1)
            self._fit_node(X_right, y_right, 2*node_id+2, depth+1)
        
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
        self._fit_node(X, y, 0, 0)
        
    def _predict(self, X, node_id):
        node_info = self.tree[node_id]
        answer = np.zeros(X.shape[0])

        if node_info[0] is self.notLeaf:
            feature_id, threshold = node_info[1], node_info[2]

            ids_left = np.where(X[:, feature_id] >= threshold)
            answer[ids_left] = self._predict(X[ids_left], 2*node_id+1)

            ids_right = np.where(X[:, feature_id] < threshold)
            answer[ids_right] = self._predict(X[ids_right], 2*node_id+2)
        else:
            answer = np.array([node_info[1]] * X.shape[0])
            
        return answer
            
        
    def predict(self, X):
        X = np.array(X)
        return self._predict(X, 0)

In [460]:
class MyDecisionTreeRegrNew:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
                 max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        
        self.tree = dict()  # Номер узла: информация об узле.
        
        self.Leaf = True
        self.notLeaf = False
        
    def _calculate_impurity(self, y):
        return np.var(y)

    def _search_split(self, x, y):
        sorted_idx = x.argsort(axis=0)
        sortX, sortY = np.sort(x, axis=0), y[sorted_idx]

        cumsumY2 = np.cumsum(sortY ** 2, axis=0)
        cumsumRevY2 = np.cumsum(sortY[::-1] ** 2, axis=0)[::-1]

        cumsumY = np.cumsum(sortY, axis=0)
        cumsumRevY = np.cumsum(sortY[::-1], axis=0)[::-1]

        lenArray = np.array(range(1, cumsumY.shape[0] + 1)).reshape(-1, 1)
#         left = cumsumY2 - cumsumY ** 2 / lenArray
#         right = cumsumRevY2 - cumsumRevY ** 2 / lenArray[::-1]
        left = cumsumY2 - 2 * cumsumY ** 2 / lenArray + cumsumY ** 2 / lenArray**2
        right = cumsumRevY2 - 2 * cumsumRevY ** 2 / lenArray[::-1] + cumsumRevY ** 2 / lenArray[::-1]**2

        mse = left
        mse[:-1] += right[1:]
        
        # удаляем бесполезные фичи
        bad_features = np.where((np.max(x, axis=0) - np.min(x, axis=0)) < 1e-3)[0]
        mse[:, bad_features] = np.nan
        # если две фичи рядом очень похожи между собой, то надо выкинуть фичу -> пропускаем очень близкие значения
        # np.isclose - возвращает логический массив, где два массива поэлементно равны в пределах допуска
        mse[np.isclose(sortX, np.roll(sortX, shift=-1, axis=0))] = np.nan
        # если ошибка - значит топ сплита нет - и мы переобучимся - делаем резкий 
        
        try:
            argmin = np.nanargmin(mse)
        except ValueError:
            return None, None, False
        
        # получаем номер объекта и фичу лучшего сплита
        idx = argmin // mse.shape[1]
        feature_id = argmin - idx*mse.shape[1]
        
        threshold = sortX[idx, feature_id]
        
        return feature_id, threshold, True
        
    def _fit_node(self, X, y, node_id, depth):
        # Если выполнен критерий остановки, создать листовую вершину.
        if (X.shape[0] < self.min_samples_split or depth == self.max_depth):
            self.tree[node_id] = [self.Leaf, np.mean(y)]
            return
        
        feature_id, threshold, isOkay = self._search_split(X, y)
        
        # Значит скорее всего все фичи уже одинаковые в этом поддереве, двигаться некуда.
        if isOkay is False:
            self.tree[node_id] = [self.Leaf, np.mean(y)]
            return
        
        X_left, y_left = X[X[:, feature_id] >= threshold], y[X[:, feature_id] >= threshold]
        X_right, y_right = X[X[:, feature_id] < threshold], y[X[:, feature_id] < threshold]
        
        if (X_left.shape[0] < self.min_samples_leaf or
            X_right.shape[0] < self.min_samples_leaf):

            self.tree[node_id] = [self.Leaf, np.mean(y)]            
        else:
            self.tree[node_id] = [self.notLeaf, feature_id, threshold]
            self._fit_node(X_left, y_left, 2*node_id+1, depth+1)
            self._fit_node(X_right, y_right, 2*node_id+2, depth+1)
        
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
        self._fit_node(X, y, 0, 0)
        
    def _predict(self, X, node_id):
        node_info = self.tree[node_id]
        answer = np.zeros(X.shape[0])

        if node_info[0] is self.notLeaf:
            feature_id, threshold = node_info[1], node_info[2]

            ids_left = np.where(X[:, feature_id] >= threshold)
            answer[ids_left] = self._predict(X[ids_left], 2*node_id+1)

            ids_right = np.where(X[:, feature_id] < threshold)
            answer[ids_right] = self._predict(X[ids_right], 2*node_id+2)
        else:
            answer = np.array([node_info[1]] * X.shape[0])
            
        return answer
            
        
    def predict(self, X):
        X = np.array(X)
        return self._predict(X, 0)

In [491]:
class DecisionTree:
    """
        author: Michael Pritugin
    """
    NON_LEAF_TYPE = 0
    LEAF_TYPE = 1

    def __init__(self, max_depth=None, min_samples_split=2, max_features=None, min_samples_leaf=1):
        self.tree = dict()
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
    
    @staticmethod
    def _sort_samples(x, y):
        sorted_idx = x.argsort(axis=0)
        return np.sort(x, axis=0), y[sorted_idx]
    
    @staticmethod
    def _div_samples(x, y, feature_id, threshold):
        left_mask = x[:, feature_id] <= threshold
        right_mask = ~left_mask
        return x[left_mask], x[right_mask], y[left_mask], y[right_mask]

    def __fit_node(self, x, y, node_id, depth):
        if (x.shape[0] < self.min_samples_split) or (depth == self.max_depth):
            self.tree[node_id] = self.LEAF_TYPE, np.mean(y)
            return

        sortX, sortY = DecisionTree._sort_samples(x, y)

        cumsumY2 = np.cumsum(sortY ** 2, axis=0)
        cumsumRevY2 = np.cumsum(sortY[::-1] ** 2, axis=0)[::-1]

        cumsumY = np.cumsum(sortY, axis=0)
        cumsumRevY = np.cumsum(sortY[::-1], axis=0)[::-1]

        lenArray = np.array(range(1, cumsumY.shape[0] + 1)).reshape(-1, 1)
#         left = cumsumY2 - cumsumY ** 2 / lenArray
#         right = cumsumRevY2 - cumsumRevY ** 2 / lenArray[::-1]
        left = cumsumY2 - 2 * cumsumY ** 2 / lenArray + cumsumY ** 2 / lenArray**2
        right = cumsumRevY2 - 2 * cumsumRevY ** 2 / lenArray[::-1] + cumsumRevY ** 2 / lenArray[::-1]**2

        mse = left
        mse[:-1] += right[1:]
        
        # удаляем бесполезные фичи
        bad_features = np.where((np.max(x, axis=0) - np.min(x, axis=0)) < 1e-3)[0]
        mse[:, bad_features] = np.nan
        # если две фичи рядом очень похожи между собой, то надо выкинуть фичу -> пропускаем очень близкие значения
        # np.isclose - возвращает логический массив, где два массива поэлементно равны в пределах допуска
        mse[np.isclose(sortX, np.roll(sortX, shift=-1, axis=0))] = np.nan
        # если ошибка - значит топ сплита нет - и мы переобучимся - делаем резкий 
        
        try:
            argmin = np.nanargmin(mse)
        except ValueError:
            self.tree[node_id] = self.LEAF_TYPE, np.mean(y)
            return
        
        # получаем номер объекта и фичу лучшего сплита
        idx = argmin // mse.shape[1]
        feature_id = argmin - idx*mse.shape[1]
        
        threshold = sortX[idx, feature_id]
        # логика построения дерева
        Xleft, Xright, Yleft, Yright = self._div_samples(x, y, feature_id, threshold)

        if Xleft.shape[0] < self.min_samples_leaf or Xright.shape[0] < self.min_samples_leaf:
            self.tree[node_id] = self.LEAF_TYPE, np.mean(y)
        else:
            self.tree[node_id] = self.NON_LEAF_TYPE, feature_id, threshold
            
            self.__fit_node(Xleft, Yleft, 2 * node_id + 1, depth + 1)
            self.__fit_node(Xright, Yright, 2 * node_id + 2, depth + 1)

    def fit(self, X, y):
        self.__fit_node(X, y, 0, 0)
        return self

    def __predict(self, x, node_id):
        node = self.tree[node_id]
        answer = np.zeros(x.shape[0])
        if node[0] == self.__class__.NON_LEAF_TYPE:
            _, feature_id, threshold = node
            left_idx = np.where(x[:, feature_id] <= threshold)
            right_idx = np.where(x[:, feature_id] > threshold)

            answer[left_idx] = self.__predict(x[left_idx], 2*node_id + 1)
            answer[right_idx] = self.__predict(x[right_idx], 2*node_id + 2)
            return answer
        
        
        return np.array([node[1]]*x.shape[0])
            
    def predict(self, X):
        x = X
        answer = self.__predict(X, 0)
        return answer

In [258]:
a = np.array([1, 2, 3])
b = np.array([4])
np.hstack([a, b])

array([1, 2, 3, 4])

In [473]:
%%time
results = []
for train_id, test_id in kf.split(X, y):
    X_train, y_train = X[train_id], y[train_id]
    X_test, y_test = X[test_id], y[test_id]

    mytree_ex = MyDecisionTreeRegrNew()
    mytree_ex.fit(X_train, y_train)

    y_res = mytree_ex.predict(X_test)
    results.append(mean_squared_error(y_test, y_res))

print(results)
print(np.mean(results))

[23.036375000000003, 13.619499999999999, 17.154999999999998, 21.00814729574223, 18.285696202531646]
18.620943699654777
CPU times: user 336 ms, sys: 41 µs, total: 336 ms
Wall time: 335 ms


In [501]:
%%time
results = []
for train_id, test_id in kf.split(X, y):
    X_train, y_train = X[train_id], y[train_id]
    X_test, y_test = X[test_id], y[test_id]

    mytree_ex = DecisionTree()
    mytree_ex.fit(X_train, y_train)

    y_res = mytree_ex.predict(X_test)
    results.append(mean_squared_error(y_test, y_res))
    
print(results)
print(np.mean(results))

[19.83508491403495, 12.525776175240726, 11.880222248888362, 12.486017729397638, 13.619292409841883]
14.06927869548071
CPU times: user 22.2 ms, sys: 0 ns, total: 22.2 ms
Wall time: 21.5 ms


In [375]:
names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']
data = pd.read_csv('auto-mpg.data', delim_whitespace=True, names=names)

data['horsepower'].replace('?', -999, inplace=True)
data[['cylinders', 'year', 'origin', 'horsepower']] = data[['cylinders', 'year', 'origin', 'horsepower']
                                                          ].astype(np.float32)
data.drop(['origin', 'name'], axis=1, inplace=True)

X, y = data.iloc[:, 1:], data.iloc[:, 0]
X, y = np.array(X), np.array(y)

data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0


In [294]:
kf = KFold(n_splits=5, shuffle=True)

In [295]:
%%time
results = []
for train_id, test_id in kf.split(X, y):
    X_train, y_train = X[train_id], y[train_id]
    X_test, y_test = X[test_id], y[test_id]

    mytree_ex = MyDecisionTreeRegr()
    mytree_ex.fit(X_train, y_train)

    y_res = mytree_ex.predict(X_test)
    results.append(mean_squared_error(y_test, y_res))

print(results)
print(np.mean(results))

[10.414813947135647, 14.695552884449706, 13.400787812853675, 15.50393538086141, 12.129899869737478]
13.228997979007584
CPU times: user 86.3 ms, sys: 4.12 ms, total: 90.4 ms
Wall time: 92.7 ms


In [296]:
%%time
results = []
for train_id, test_id in kf.split(X, y):
    X_train, y_train = X[train_id], y[train_id]
    X_test, y_test = X[test_id], y[test_id]

    tree_ex = DecisionTreeRegressor()
    tree_ex.fit(X_train, y_train)

    y_res_ex = tree_ex.predict(X_test)
    results.append(mean_squared_error(y_test, y_res_ex))

print(results)
print(np.mean(results))

[20.422500000000003, 11.460250000000002, 19.699, 9.181265822784813, 11.206835443037976]
14.393970253164559
CPU times: user 6.84 ms, sys: 0 ns, total: 6.84 ms
Wall time: 6.24 ms


Дерево вроде работает нормально, но долго, скорее всего из-за np.unique и все такое.

Дальше забабахаем градиентный бустинг на этих деревьях.

In [297]:
class constanta:
    def fit(self, X, y):
        self.result = np.mean(y)

    def predict(self, X):
        return self.result

In [503]:
class MyGradientBoostingRegr:
    def __init__(self, learning_rate=0.1, n_estimators=100, subsample=1.0,
                 min_samples_split=2, min_samples_leaf=1,
                 max_depth=3, alpha=0.9):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators + 1  # +1 для константы.
        self.subsample = subsample
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_depth = max_depth
        self.alpha = alpha
        
        self.estimators = []
        self.weights = []
        
    def _calculate_weight(self, G_ij, X, h_next):
        y_pred = h_next.predict(X)
        weights = G_ij / y_pred
        
        return np.mean(weights)
        
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
#         h_curr = constanta()
#         h_curr = MyDecisionTreeRegrNew(max_depth=3)
        h_curr = DecisionTree(max_depth=3)
        h_curr.fit(X, y)
        w_curr = 1.0 * 1  #self.learning_rate
        self.estimators.append(h_curr)
        self.weights.append(w_curr)
        
        G_ij = y
        
        for i in range(1, self.n_estimators):
            if i % 30 == 0:
                self.max_depth += 1
            
            y_pred = h_curr.predict(X) * w_curr
            G_ij = G_ij - y_pred
            
#             h_next = MyDecisionTreeRegrNew(max_depth=self.max_depth)
            h_next = DecisionTree(max_depth=self.max_depth)
            h_next.fit(X, G_ij)
            
            w_next = self._calculate_weight(G_ij, X, h_next) * self.learning_rate
            
            self.estimators.append(h_next)
            self.weights.append(w_next)
            
            h_curr = h_next
            w_curr = w_next

    def predict(self, X):
        answer = np.zeros(X.shape[0])
        for i in range(self.n_estimators):
            answer += self.weights[i] * self.estimators[i].predict(X)
        return answer

Контрольная проверка, сравнение бустингов на 5 случайно пошафленных вариантов данных Auto-mpg, с валидацией по 5 фолдов.

In [272]:
random_seed = 123
results_test = []

for i in range(5):
    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

    results = []
    for train_id, test_id in kf.split(X, y):
        X_train, y_train = X[train_id], y[train_id]
        X_test, y_test = X[test_id], y[test_id]

        my_gbr = MyGradientBoostingRegr()
        my_gbr.fit(X_train, y_train)

        y_res_ex = my_gbr.predict(X_test)
        results.append(mean_squared_error(y_test, y_res_ex))

    results_test.append(np.mean(results))

In [273]:
results_valid = []

for i in range(5):
    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

    results = []
    for train_id, test_id in kf.split(X, y):
        X_train, y_train = X[train_id], y[train_id]
        X_test, y_test = X[test_id], y[test_id]

        gbr = GradientBoostingRegressor()
        gbr.fit(X_train, y_train)

        y_res_ex = gbr.predict(X_test)
        results.append(mean_squared_error(y_test, y_res_ex))

    results_valid.append(np.mean(results))

In [274]:
print("Результаты тестового градиентного бстинга:")
print(results_test)
print("\nРезультаты валидационного градиентного бстинга:")
print(results_valid)

Результаты тестового градиентного бстинга:
[12.708354276158309, 12.708354276158309, 12.708354276158309, 12.708354276158309, 12.708354276158309]

Результаты валидационного градиентного бстинга:
[8.419915555798287, 8.401905560039898, 8.478585763826114, 8.52594547597791, 8.492341034999306]


In [275]:
names = ['vendor', 'model', 'myct', 'mmin', 'mmax', 'cach', 'chmin', 'chmax', 'prp', 'erp']
data = pd.read_csv('machine.data', names=names)

data[['myct', 'mmin', 'mmax', 'cach', 'chmin', 'chmax', 'prp']] = data[['myct', 'mmin', 'mmax', 'cach',
                                                                        'chmin', 'chmax', 'prp']].astype(np.float32)
y_erp = data.iloc[:, -1]
data.drop(['vendor', 'model', 'erp'], axis=1, inplace=True)

X, y = data.iloc[:, :-1], data.iloc[:, -1]
X, y = np.array(X), np.array(y)

data.head()

Unnamed: 0,myct,mmin,mmax,cach,chmin,chmax,prp
0,125.0,256.0,6000.0,256.0,16.0,128.0,198.0
1,29.0,8000.0,32000.0,32.0,8.0,32.0,269.0
2,29.0,8000.0,32000.0,32.0,8.0,32.0,220.0
3,29.0,8000.0,32000.0,32.0,8.0,32.0,172.0
4,29.0,8000.0,16000.0,32.0,8.0,16.0,132.0


In [276]:
random_seed = 123
results_test = []

for i in range(5):
    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

    results = []
    for train_id, test_id in kf.split(X, y):
        X_train, y_train = X[train_id], y[train_id]
        X_test, y_test = X[test_id], y[test_id]

        my_gbr = MyGradientBoostingRegr()
        my_gbr.fit(X_train, y_train)

        y_res_ex = my_gbr.predict(X_test)
        results.append(mean_squared_error(y_test, y_res_ex))

    results_test.append(np.mean(results))

In [277]:
results_valid = []

for i in range(5):
    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

    results = []
    for train_id, test_id in kf.split(X, y):
        X_train, y_train = X[train_id], y[train_id]
        X_test, y_test = X[test_id], y[test_id]

        gbr = GradientBoostingRegressor()
        gbr.fit(X_train, y_train)

        y_res_ex = gbr.predict(X_test)
        results.append(mean_squared_error(y_test, y_res_ex))

    results_valid.append(np.mean(results))

In [278]:
print("Результаты тестового градиентного бстинга:")
print(results_test)
print("\nРезультаты валидационного градиентного бстинга:")
print(results_valid)

Результаты тестового градиентного бстинга:
[8735.651821829833, 8735.651821829833, 8735.651821829833, 8735.651821829833, 8735.651821829833]

Результаты валидационного градиентного бстинга:
[2230.7257861662715, 2111.4784932409807, 2009.703142471611, 2012.8923023430211, 2012.892446337455]


In [63]:
# Для сравнения ошабка для предсказания от людей, выложивших датасет.
mean_squared_error(y, np.array(y_erp))

1737.3349282296651

In [370]:
%%time
X_train = []
y_train = []
with open('Regression dataset/reg.train.txt', 'r') as file:

    for line in file:
        try:
            line = line.rstrip()
            buf = line.split()
            y = buf[0]
            y_train.append(float(y))

            x = np.array([-9.0]*245)
            for elem in buf[1:-1]:
                id, feat = elem.split(':')
                x[int(id)-1] = float(feat)
            X_train.append(x)
        except:
            print(line)
    
    line = file.readline()
        
    
    
X_train = np.array(X_train)
y_train = np.array(y_train)

CPU times: user 661 ms, sys: 90 µs, total: 661 ms
Wall time: 662 ms


In [371]:
%%time
X_test = []
y_test = []
with open('Regression dataset/reg.test.txt', 'r') as file:

    for line in file:
        try:
            line = line.rstrip()
            buf = line.split()
            y = buf[0]
            y_test.append(float(y))

            x = np.array([-9.0]*245)
            for elem in buf[1:-1]:
                id, feat = elem.split(':')
                x[int(id)-1] = float(feat)
            X_test.append(x)
        except:
            print(line)
    
    line = file.readline()
        
    
    
X_test = np.array(X_test)
y_test = np.array(y_test)

CPU times: user 912 ms, sys: 0 ns, total: 912 ms
Wall time: 910 ms


In [504]:
%%time
my_gbr = MyGradientBoostingRegr(n_estimators=300)
my_gbr.fit(X_train, y_train)

y_res_ex = my_gbr.predict(X_test)
print(mean_squared_error(y_test, y_res_ex))

9.6365417352784
CPU times: user 5.01 s, sys: 12.1 ms, total: 5.03 s
Wall time: 5.01 s


In [347]:
%%time
gbr = GradientBoostingRegressor(n_estimators=300, criterion='mse')
gbr.fit(X_train, y_train)

y_res_ex = gbr.predict(X_test)
print(mean_squared_error(y_test, y_res_ex))

0.7948611471657767
CPU times: user 15.2 s, sys: 4.01 ms, total: 15.2 s
Wall time: 15.2 s
