In [1]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [2]:
# Реализуем класс узла

class Node:
    
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index  # индекс признака, по которому ведется сравнение с порогом в этом узле
        self.t = t  # значение порога
        self.true_branch = true_branch  # поддерево, удовлетворяющее условию в узле
        self.false_branch = false_branch  # поддерево, не удовлетворяющее условию в узле

# И класс терминального узла (листа)

class Leaf:
    
    def __init__(self, data, labels, weights):
        self.data = data
        self.labels = labels
        self.weights = weights
        self.prediction = self.predict()
        
    def predict(self):
        # подсчет количества объектов разных классов
        classes = {}  # сформируем словарь "класс: количество объектов"
        for label in self.labels:
            if label not in classes:
                classes[label] = 0
            classes[label] += 1
            
        # найдем класс, количество объектов которого будет максимальным в этом листе и вернем его    
        prediction = max(classes, key=classes.get)
        return prediction
    

    
def weighted_gini(labels, weights):
    #  подсчет количества объектов разных классов
    classes = {}
    for i in range(len(labels)):
        if labels[i] not in classes:
            classes[labels[i]] = 0
        classes[labels[i]] += weights[i]
    
    #  расчет критерия
    impurity = 1
    for label in classes:
        p = classes[label] / sum(classes.values())
        impurity -= p ** 2
        
    return impurity


# Расчет прироста

def gain(left_labels, right_labels, left_weights, right_weights, root_gini):

    # доля выборки, ушедшая в левое поддерево
    p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
    
    return root_gini - p * weighted_gini(left_labels, left_weights) - (1 - p) * weighted_gini(right_labels, right_weights)

# Разбиение датасета в узле

def split(data, labels, weights, column_index, t):
    
    left = np.where(data[:, column_index] <= t)
    right = np.where(data[:, column_index] > t)
        
    true_data = data[left]
    false_data = data[right]
    
    true_labels = labels[left]
    false_labels = labels[right]
    
    true_weights = weights[left]
    false_weights = weights[right]
        
    return true_data, false_data, true_labels, false_labels, true_weights, false_weights


# Нахождение наилучшего разбиения

def find_best_split(data, labels, weights, min_samples_leaf):
    
    root_gini = weighted_gini(labels, weights)

    best_gain = 0
    best_t = None
    best_index = None
    
    n_features = data.shape[1]
    
    for index in range(n_features):
        # будем проверять только уникальные значения признака, исключая повторения
        t_values = np.unique(data[:, index])
        
        for t in t_values:
            true_data, false_data, true_labels, false_labels, true_weights, false_weights = split(data, labels, weights, index, t)
            #  пропускаем разбиения, в которых в узле остается менее заданного количества объектов
            if len(true_data) < min_samples_leaf or len(false_data) < min_samples_leaf:
                continue
            
            current_gain = gain(true_labels, false_labels, true_weights, false_weights, root_gini)
            
            #  выбираем порог, на котором получается максимальный прирост качества
            if current_gain > best_gain:
                best_gain, best_t, best_index = current_gain, t, index

    return best_gain, best_t, best_index

# Построение дерева с помощью рекурсивной функции

def build_tree(data, labels, weights, max_depth=1, min_samples_leaf=1):

    gain, t, index = find_best_split(data, labels, weights, min_samples_leaf)

    #  Базовый случай - прекращаем рекурсию, когда нет прироста в качестве, или же мы достигли максимальной глубины
    if (gain == 0) or (max_depth == 0):
        return Leaf(data, labels, weights)

    true_data, false_data, true_labels, false_labels, true_weights, false_weights = split(data, labels, weights, index, t)

    # Рекурсивно строим два поддерева, уменьшаем параметр max_depth на 1
    true_branch = build_tree(true_data, true_labels, true_weights, max_depth-1, min_samples_leaf)
    false_branch = build_tree(false_data, false_labels, false_weights, max_depth-1, min_samples_leaf)
    
    
    # Возвращаем класс узла со всеми поддеревьями
    return Node(index, t, true_branch, false_branch)


def classify_object(obj, node):

    #  Останавливаем рекурсию, если достигли листа
    if isinstance(node, Leaf):
        return node.prediction

    if obj[node.index] <= node.t:
        return classify_object(obj, node.true_branch)
    else:
        return classify_object(obj, node.false_branch)
    
def predict(data, tree):
    
    classes = []
    for obj in data:
        prediction = classify_object(obj, tree)
        classes.append(prediction)
    return classes

In [3]:
def amount_of_say(x):
    return 1/2 * np.log((1 - x) / x)

In [4]:
def ab_predict(X, trees, alphas):
    
    predictions = []
    for tree in trees:
        predictions.append(predict(X, tree))

    # сформируем список с предсказаниями для каждого объекта
    predictions_per_object = list(zip(*predictions))
    #     print(predictions_per_object)

    # выберем в качестве итогового предсказания вероятности для каждого объекта отношение количества объектов 1 класса ко всем объектам
    final_predictions = []
    for obj in predictions_per_object:
        classes = {}
        for i in range(len(obj)):
            if obj[i] not in classes:
                    classes[obj[i]] = 0
            classes[obj[i]] += alphas[i]
        final_predictions.append(max(classes, key=classes.get))
    

    return final_predictions

In [5]:
def ab_fit(n_trees, X_train, y_train, positive_class=1, min_samples_leaf=1):
    
    # Деревья будем записывать в список
    trees = []
    alphas = []
    
    weights = np.array([1/X_train.shape[0]] * X_train.shape[0])
    
    
    for i in range(n_trees):
        
        tree = build_tree(X_train, y_train, weights, max_depth=1, min_samples_leaf=min_samples_leaf)       
        predictions = predict(X_train, tree)
        wrong_predictions = predictions != y_train
        error = np.sum(weights[wrong_predictions])
        alpha = amount_of_say(error)
        weights[wrong_predictions] *= np.exp(alpha)
        weights[~wrong_predictions] *= np.exp(-alpha)
        weights /= sum(weights)
        
        trees.append(tree)
        alphas.append(alpha)

        
        
    return trees, alphas

In [93]:
X, y = make_classification(n_samples=10000, n_features=4, n_informative=4, n_classes=2, n_redundant=0, 
                           n_clusters_per_class=1, random_state=42)


In [94]:
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)
print('Train - ', accuracy_score(y_train, y_pred_train))
print('Test - ', accuracy_score(y_test, y_pred))

Train -  0.8945
Test -  0.8835


In [98]:
trees, alphas = ab_fit(n_trees=50, X_train=X_train, y_train=y_train)


In [100]:
y_pred_train_ab = ab_predict(X_train, trees, alphas)
y_pred_ab = ab_predict(X_test, trees, alphas)
print('Train - ', accuracy_score(y_train, y_pred_train_ab))
print('Test - ', accuracy_score(y_test, y_pred_ab))

Train -  0.8485
Test -  0.846


In [101]:
def ab_predict_proba(X, trees, alphas, positive_class=1):
    predictions = []
    for tree in trees:
        predictions.append(predict(X, tree))

    # сформируем список с предсказаниями для каждого объекта
    predictions_per_object = list(zip(*predictions))
    #     print(predictions_per_object)

    # выберем в качестве итогового предсказания вероятности для каждого объекта отношение количества объектов 1 класса ко всем объектам
    final_predictions = []
    for obj in predictions_per_object:
        classes = {}
        for i in range(len(obj)):
            if obj[i] not in classes:
                classes[obj[i]] = 0
            classes[obj[i]] += alphas[i]
        final_predictions.append(classes.get(positive_class, 0)/sum(classes.values()))

    return final_predictions

In [103]:
y_pred_train_proba = model.predict_proba(X_train)
y_pred_proba = model.predict_proba(X_test)
print('Train - ', roc_auc_score(y_train, y_pred_train_proba[:, 1]))
print('Test - ', roc_auc_score(y_test, y_pred_proba[:, 1]))

Train -  0.9645425407195114
Test -  0.9564990933871296


In [104]:
y_pred_train_ab_proba = ab_predict_proba(X_train, trees, alphas)
y_pred_ab_proba = ab_predict_proba(X_test, trees, alphas)
print('Train - ', roc_auc_score(y_train, y_pred_train_ab_proba))
print('Test - ', roc_auc_score(y_test, y_pred_ab_proba))

Train -  0.8817303613395886
Test -  0.8746857875924126


In [115]:
u_train = y_train
u_train[u_train == 0] = -1
u_train

array([-1,  1,  1, ..., -1,  1,  1])

In [116]:
class DecisionStump():
    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None

    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_idx]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1

        return predictions


class Adaboost():

    def __init__(self, n_clf=50):
        self.n_clf = n_clf

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Initialize weights to 1/N
        w = np.full(n_samples, (1 / n_samples))

        self.clfs = []
        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()

            min_error = float('inf')
            # greedy search to find best threshold and feature
            for feature_i in range(n_features):
                X_column = X[:, feature_i]
                thresholds = np.unique(X_column)

                for threshold in thresholds:
                    # predict with polarity 1
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1

                    # Error = sum of weights of misclassified samples
                    misclassified = w[y != predictions]
                    error = sum(misclassified)

                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # store the best configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        min_error = error

            # calculate alpha
            EPS = 1e-10
            clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))

            # calculate predictions and update weights
            predictions = clf.predict(X)

            w *= np.exp(-clf.alpha * y * predictions)
            # Normalize to one
            w /= np.sum(w)

            # Save classifier
            self.clfs.append(clf)

    def predict(self, X):
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        y_pred = np.sum(clf_preds, axis=0)
        y_pred = np.sign(y_pred)

        return y_pred

In [118]:
%time
ab = Adaboost()
ab.fit(X_train, u_train)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs


In [121]:
y_pred_t = ab.predict(X_train)
y_pred_t[y_pred_t == -1] = 0
y_pred = ab.predict(X_test)
y_pred[y_pred == -1] = 0
roc_auc_score(y_train, y_pred_t)



0.8941726157398047

In [122]:
roc_auc_score(y_test, y_pred)

0.8841817068338197

In [123]:
model = AdaBoostClassifier(random_state=42, algorithm='SAMME')
model.fit(X_train, y_train)
y_pred_train_proba = model.predict_proba(X_train)
y_pred_proba = model.predict_proba(X_test)
print('Train - ', roc_auc_score(y_train, y_pred_train_proba[:, 1]))
print('Test - ', roc_auc_score(y_test, y_pred_proba[:, 1]))

Train -  0.9579093322439503
Test -  0.9521941832678891


In [129]:
from sklearn.tree import DecisionTreeClassifier
weights = np.array([1/X_train.shape[0]] * X_train.shape[0])

In [130]:
DecisionTreeClassifier._check_sample_weight(weights, X_train)

AttributeError: type object 'DecisionTreeClassifier' has no attribute '_check_sample_weight'

In [131]:
weighted_gini([1, 2, 1], [1, 2, 3])

0.4444444444444445

In [22]:
# Реализуем класс узла

class Node:
    
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index  # индекс признака, по которому ведется сравнение с порогом в этом узле
        self.t = t  # значение порога
        self.true_branch = true_branch  # поддерево, удовлетворяющее условию в узле
        self.false_branch = false_branch  # поддерево, не удовлетворяющее условию в узле

# И класс терминального узла (листа)

class Leaf:
    
    def __init__(self, data, labels, weights, positive_class=1):
        self.data = data
        self.labels = labels
        self.weights = weights
        self.positive_class = positive_class
        self.prediction = self.predict()
        self.pred_proba = self.predict_proba()
        
    def predict(self):
        # подсчет количества объектов разных классов
        classes = {}  # сформируем словарь "класс: количество объектов"
        for i in range(len(self.labels)):
            if self.labels[i] not in classes:
                classes[self.labels[i]] = 0
            classes[self.labels[i]] += self.weights[i]
            
        # найдем класс, количество объектов которого будет максимальным в этом листе и вернем его    
        prediction = max(classes, key=classes.get)
        return prediction
    
    def predict_proba(self):
        # подсчет количества объектов разных классов
        classes = {}  # сформируем словарь "класс: количество объектов"
        for i in range(len(self.labels)):
            if self.labels[i] not in classes:
                classes[self.labels[i]] = 0
            classes[self.labels[i]] += self.weights[i]
            
        return classes.get(self.positive_class, 0) / sum(classes.values())
            
        # найдем класс, количество объектов которого будет максимальным в этом листе и вернем его    
#         prediction = max(classes, key=classes.get)
#         return prediction
    

    
def weighted_gini(labels, weights):
    #  подсчет количества объектов разных классов
    classes = {}
    for i in range(len(labels)):
        if labels[i] not in classes:
            classes[labels[i]] = 0
        classes[labels[i]] += weights[i]
    
    #  расчет критерия
    impurity = 1
    for label in classes:
        p = classes[label] / sum(classes.values())
        impurity -= p ** 2
        
    return impurity


# Расчет прироста

def gain(left_labels, right_labels, left_weights, right_weights, root_gini):

    # доля выборки, ушедшая в левое поддерево
    p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
    
    return root_gini - p * weighted_gini(left_labels, left_weights) - (1 - p) * weighted_gini(right_labels, right_weights)

# Разбиение датасета в узле

def split(data, labels, weights, column_index, t):
    
    left = np.where(data[:, column_index] <= t)
    right = np.where(data[:, column_index] > t)
        
    true_data = data[left]
    false_data = data[right]
    
    true_labels = labels[left]
    false_labels = labels[right]
    
    true_weights = weights[left]
    false_weights = weights[right]
        
    return true_data, false_data, true_labels, false_labels, true_weights, false_weights


# Нахождение наилучшего разбиения

def find_best_split(data, labels, weights):
    
    root_gini = weighted_gini(labels, weights)

    best_gain = 0
    best_t = None
    best_index = None
    
    n_features = data.shape[1]
    
    for index in range(n_features):
        # будем проверять только уникальные значения признака, исключая повторения
        t_values = np.unique(data[:, index])
        
        for t in t_values:
            true_data, false_data, true_labels, false_labels, true_weights, false_weights = split(data, labels, weights, index, t)
            
            current_gain = gain(true_labels, false_labels, true_weights, false_weights, root_gini)
            
            #  выбираем порог, на котором получается максимальный прирост качества
            if current_gain > best_gain:
                best_gain, best_t, best_index = current_gain, t, index

    return best_gain, best_t, best_index

# Построение дерева с помощью рекурсивной функции

def build_tree(data, labels, weights, max_depth=1):

    gain, t, index = find_best_split(data, labels, weights)

    #  Базовый случай - прекращаем рекурсию, когда нет прироста в качестве, или же мы достигли максимальной глубины
    if (gain == 0) or (max_depth == 0):
        return Leaf(data, labels, weights)

    true_data, false_data, true_labels, false_labels, true_weights, false_weights = split(data, labels, weights, index, t)

    # Рекурсивно строим два поддерева, уменьшаем параметр max_depth на 1
    true_branch = build_tree(true_data, true_labels, true_weights, max_depth-1)
    false_branch = build_tree(false_data, false_labels, false_weights, max_depth-1)
    
    
    # Возвращаем класс узла со всеми поддеревьями
    return Node(index, t, true_branch, false_branch)


def classify_object(obj, node):

    #  Останавливаем рекурсию, если достигли листа
    if isinstance(node, Leaf):
        return node.prediction

    if obj[node.index] <= node.t:
        return classify_object(obj, node.true_branch)
    else:
        return classify_object(obj, node.false_branch)
    
    
def predict(data, tree):
    
    classes = []
    for obj in data:
        prediction = classify_object(obj, tree)
        classes.append(prediction)
    return classes

def get_proba_object(obj, node):

    #  Останавливаем рекурсию, если достигли листа
    if isinstance(node, Leaf):
        return node.pred_proba

    if obj[node.index] <= node.t:
        return get_proba_object(obj, node.true_branch)
    else:
        return get_proba_object(obj, node.false_branch)


def predict_proba(data, tree):
    
    probas = []
    for obj in data:
        prob = get_proba_object(obj, tree)
        probas.append(prob)
    return probas

def amount_of_say(x):
    return 1/2 * np.log((1 - x) / x)

def ab_predict(X, trees, alphas):
    
    predictions = []
    for tree in trees:
        predictions.append(predict(X, tree))

    # сформируем список с предсказаниями для каждого объекта
    predictions_per_object = list(zip(*predictions))
    #     print(predictions_per_object)

    # выберем в качестве итогового предсказания вероятности для каждого объекта отношение количества объектов 1 класса ко всем объектам
    final_predictions = []
    for obj in predictions_per_object:
        classes = {}
        for i in range(len(obj)):
            if obj[i] not in classes:
                    classes[obj[i]] = 0
            classes[obj[i]] += alphas[i]
        final_predictions.append(max(classes, key=classes.get))
    

    return final_predictions

def ab_fit(n_trees, X_train, y_train, positive_class=1):
    
    # Деревья будем записывать в список
    trees = []
    alphas = []
    
    weights = np.array([1/X_train.shape[0]] * X_train.shape[0])
    
    
    for i in range(n_trees):
        
        tree = build_tree(X_train, y_train, weights, max_depth=1)
#         tree=DecisionTreeClassifier(max_depth=1, random_state=42)
#         tree.fit(X_train, y_train, sample_weight=weights)
#         predictions = tree.predict(X_train)
        predictions = np.array(predict(X_train, tree))
        wrong_predictions = predictions != y_train
        error = sum(weights[wrong_predictions])
        alpha = amount_of_say(error)
        weights[wrong_predictions] *= np.exp(alpha)
        weights[~wrong_predictions] *= np.exp(-alpha)
        weights /= sum(weights)
        
        trees.append(tree)
        alphas.append(alpha)

        
        
    return trees, alphas

def ab_fit_ada(n_trees, X_train, y_train, positive_class=1):
    
    # Деревья будем записывать в список
    trees = []
    alphas = []
    
    weights = np.array([1/X_train.shape[0]] * X_train.shape[0])
    
    
    for i in range(n_trees):
        
#         tree = build_tree(X_train, y_train, weights, max_depth=1)
        tree=DecisionTreeClassifier(max_depth=1, random_state=42)
        tree.fit(X_train, y_train, sample_weight=weights)
        predictions = tree.predict(X_train)
#         predictions = np.array(predict(X_train, tree))
        wrong_predictions = predictions != y_train
        error = sum(weights[wrong_predictions])
        alpha = amount_of_say(error)
        weights[wrong_predictions] *= np.exp(alpha)
        weights[~wrong_predictions] *= np.exp(-alpha)
        weights /= sum(weights)
        
        trees.append(tree)
        alphas.append(alpha)

        
        
    return trees, alphas

def ab_predict_proba(X, trees, alphas, positive_class=1):
    predictions = []
    for tree in trees:
        predictions.append(predict(X, tree))

    # сформируем список с предсказаниями для каждого объекта
    predictions_per_object = list(zip(*predictions))

    # выберем в качестве итогового предсказания вероятности для каждого объекта отношение количества объектов 1 класса ко всем объектам
    final_predictions = []
    for obj in predictions_per_object:
        classes = {}
        for i in range(len(obj)):
            if obj[i] not in classes:
                classes[obj[i]] = 0
            classes[obj[i]] += alphas[i]
        final_predictions.append(classes.get(positive_class, 0)/sum(classes.values()))

    return final_predictions

In [23]:
%%time
ab_trees, alphas = ab_fit(n_trees=100, X_train=X_train, y_train=y_train)

CPU times: user 4min 16s, sys: 2.01 s, total: 4min 18s
Wall time: 4min 26s


In [24]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [21]:
ab_trees[0].false_branch

<__main__.Leaf at 0x10dc8b820>

In [25]:
pr = ab_predict(X_train, ab_trees, alphas)
accuracy_score(y_train, pr)

0.8915

In [26]:
pr = ab_predict(X_test, ab_trees, alphas)
accuracy_score(y_test, pr)

0.8795

In [30]:
train_pr = ab_predict_proba(X_train, ab_trees, alphas)
roc_auc_score(y_train, train_pr)

0.7444640854098229

In [31]:
test_pr = ab_predict_proba(X_test, ab_trees, alphas)
roc_auc_score(y_test, test_pr)

0.7490971152507908

In [74]:
df_test = pd.read_csv('test.csv')
df_test.head()
XX_test = df.drop(columns=['Id', 'history', 'english']).to_numpy()

In [75]:
submit_answers_proba = ab_predict_proba(XX_test, ab_trees, alphas)

In [76]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['choose'] = submit_answers_proba
sample_submission.to_csv('subm3.csv', index=False)

In [186]:
ada = AdaBoostClassifier(random_state=42, n_estimators=50)
ada.fit(X_train, y_train)
ada_y_tr = ada.predict_proba(X_train)
ada_y_test = ada.predict_proba(X_test)
print(roc_auc_score(y_train, ada_y_tr[:, 1]))
print(roc_auc_score(y_test, ada_y_test[:, 1]))
# print(accuracy_score(y_train, ada.predict(X_train)))
# print(accuracy_score(y_test, ada.predict(X_test)))

0.8614733911674778
0.8424003170416989


In [155]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
gb_y_tr = gb.predict_proba(X_train)
gb_y_test = gb.predict_proba(X_test)
print(roc_auc_score(y_train, gb_y_tr[:, 1]))
print(roc_auc_score(y_test, gb_y_test[:, 1]))

0.8750898139365384
0.8519056706587815


In [185]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=42, objective='binary:logistic', learning_rate=0.005, n_estimators=3500)
xgb.fit(X_train, y_train)
xgb_y_tr = xgb.predict_proba(X_train)
xgb_y_test = xgb.predict_proba(X_test)
print(roc_auc_score(y_train, xgb_y_tr[:, 1]))
print(roc_auc_score(y_test, xgb_y_test[:, 1]))



0.9424738505754554
0.833456391433269


In [190]:
proba = sum(estimator.predict_proba(X_train) * w for estimator, w in zip(ada.estimators_, ada.estimator_weights_))
proba /= ada.estimator_weights_.sum()
proba = np.exp((1. / (ada.estimators_[0].tree_.n_classes - 1)) * proba)
normalizer = proba.sum(axis=1)[:, np.newaxis]
# normalizer[normalizer == 0.0] = 1.0
proba /= normalizer
proba

array([[0.50527065, 0.49472935],
       [0.5009871 , 0.4990129 ],
       [0.49878879, 0.50121121],
       ...,
       [0.50196228, 0.49803772],
       [0.50446982, 0.49553018],
       [0.51477505, 0.48522495]])

In [129]:
normalizer

array([[3.2974586 ],
       [3.29769135],
       [3.29777257],
       ...,
       [3.29744401],
       [3.29784897],
       [3.2975267 ]])

In [130]:
roc_auc_score(y_test, proba[:,1])

0.8440468580082516

In [96]:
ada_y_tr = ada.predict_proba(X_train)
ada_y_tr

array([[0.50783137, 0.49216863],
       [0.50316761, 0.49683239],
       [0.50057549, 0.49942451],
       ...,
       [0.50230715, 0.49769285],
       [0.50695841, 0.49304159],
       [0.51913141, 0.48086859]])

In [148]:
def ab_predict_proba(X, trees, alphas, positive_class=1):
    predictions = []
    for tree in trees:
        predictions.append(predict_proba(X, tree))
        
    alphas = np.array(alphas)
    weights = np.ones(alphas.shape[0])

    # сформируем список с предсказаниями для каждого объекта
    predictions_per_object = list(zip(*predictions))
#     print(predictions_per_object)

    # выберем в качестве итогового предсказания вероятности для каждого объекта отношение количества объектов 1 класса ко всем объектам
    final_predictions = []
    for obj in predictions_per_object:
#         classes = {}
#         for i in range(len(obj)):
#             if obj[i] not in classes:
#                 classes[obj[i]] = 0
#             classes[obj[i]] += alphas[i]
#         final_predictions.append(classes.get(positive_class, 0)/sum(classes.values()))
        obj = np.array(obj)
        w_pr_pos = np.array(sum(obj * weights) / sum(weights))
        w_pr_not_pos = np.array(sum((1 - obj) * weights) / sum(weights))
        final_predictions.append(np.exp(w_pr_pos) / (np.exp(w_pr_pos) + np.exp(w_pr_not_pos)))
        
    final_predictions = np.array(final_predictions)

    return final_predictions

In [99]:
alphas = np.array(alphas)
weights = np.ones(alphas.shape[0])
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [4]:
import pandas as pd
df = pd.read_csv('train.csv')
y = df['choose']
X = df.drop(columns=['choose', 'Id', 'history', 'english'])

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

In [161]:
%%time
weights = np.array([1/X_train.shape[0]] * X_train.shape[0])
tree = build_tree(X_train, y_train, weights, max_depth=1)
#         tree=DecisionTreeClassifier(max_depth=1, random_state=42)
#         tree.fit(X_train, y_train, sample_weight=weights)
#         predictions = tree.predict(X_train)
predictions = np.array(predict(X_train, tree))
wrong_predictions = predictions != y_train
error = sum(weights[wrong_predictions])
alpha = amount_of_say(error)
weights[wrong_predictions] *= np.exp(alpha)
weights[~wrong_predictions] *= np.exp(-alpha)
weights /= sum(weights)

CPU times: user 2.66 s, sys: 26 ms, total: 2.69 s
Wall time: 2.8 s


In [146]:
tree = build_tree(X_train, y_train, weights, max_depth=1)
#         tree=DecisionTreeClassifier(max_depth=1, random_state=42)
#         tree.fit(X_train, y_train, sample_weight=weights)
#         predictions = tree.predict(X_train)
predictions = np.array(predict(X_train, tree))
wrong_predictions = predictions != y_train
error = sum(weights[wrong_predictions])
alpha = amount_of_say(error)
weights[wrong_predictions] *= np.exp(alpha)
weights[~wrong_predictions] *= np.exp(-alpha)
weights /= sum(weights)

In [150]:
np.unique(predictions)

array([0])

In [200]:
from sklearn.tree import DecisionTreeClassifier
sk_tree = DecisionTreeClassifier(max_depth=1, random_state=42)
sk_tree.fit(X_train, y_train, sample_weight=weights)


DecisionTreeClassifier(max_depth=1, random_state=42)

In [201]:
w_predictions = sk_tree.predict(X_train) != y_train
error = sum(weights[w_predictions])

In [202]:
error

0.3505392713053071

In [203]:
amount_of_say(error)

0.30833481325693723

In [222]:
prb = sk_tree.tree_.predict(X_train.astype(np.float32))
normalizer = prb.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
prb /= normalizer
sum(prb)

array([4280.17967719, 3719.82032281])

In [236]:
ew = np.array([[2, 3]])
pr = np.array([[0.33, 0.17], [0.21, 0.40]])
pr /= ew.sum()
pr

array([[0.066, 0.034],
       [0.042, 0.08 ]])

array([2])

In [167]:
tree = build_tree(X_train, y_train, weights, max_depth=1)
#         tree=DecisionTreeClassifier(max_depth=1, random_state=42)
#         tree.fit(X_train, y_train, sample_weight=weights)
#         predictions = tree.predict(X_train)
predictions = np.array(predict(X_train, tree))
# wrong_predictions = predictions != y_train
# error = sum(weights[wrong_predictions])
# alpha = amount_of_say(error)
# weights[wrong_predictions] *= np.exp(alpha)
# weights[~wrong_predictions] *= np.exp(-alpha)
# weights /= sum(weights)

In [168]:
weighted_gini(y_train, weights)

0.5

In [170]:
weighted_gini(tree.false_branch.labels, tree.false_branch.weights)

0.4978763171929792

In [173]:
tree.true_branch.labels.shape[0]

922

In [177]:
sk_tree.tree_.value

array([[[0.5       , 0.5       ]],

       [[0.33321649, 0.18375576]],

       [[0.16678351, 0.31624424]]])