In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
class Node:
    """Internal (decision) node of the regression tree.

    Stores the split rule -- feature column ``index`` and threshold ``t`` --
    plus the two subtrees. During prediction ``true_branch`` is followed when
    ``obj[index] <= t`` and ``false_branch`` otherwise.
    """

    def __init__(self, index, t, true_branch, false_branch):
        # Split rule: which feature column is tested and against what threshold.
        self.index, self.t = index, t
        # Subtrees for rows that satisfy / do not satisfy the rule.
        self.true_branch, self.false_branch = true_branch, false_branch

In [3]:
class Leaf:
    """Terminal node of the regression tree.

    Keeps the rows and labels that reached it and caches the mean label in
    ``prediction`` so it is computed only once, at construction time.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels
        # Cache the leaf value; tree traversal reads this attribute directly.
        self.prediction = self.predict()

    def predict(self):
        """Return the arithmetic mean of the labels stored in this leaf."""
        return np.mean(self.labels)

In [4]:
class MyTreeRegressor:
    """Regression tree grown greedily with single-feature threshold splits.

    Each split sends rows with ``data[:, index] <= t`` to the "true" branch and
    rows with ``data[:, index] > t`` to the "false" branch. Split quality is the
    reduction in label standard deviation (weighted by branch size).

    Parameters
    ----------
    max_depth : int
        Maximum number of splits on any root-to-leaf path.
    min_leaf : int
        Minimum number of rows each side of a split must contain.
    max_values : int
        If a feature has more unique values than this, the 99 quantiles
        0.01 ... 0.99 are used as candidate thresholds instead of every
        unique value.
    """

    def __init__(self, max_depth=3, min_leaf = 5, max_values = 150):
        self.max_depth = max_depth
        self.min_leaf = min_leaf
        self.max_values = max_values
        # Root of the fitted tree (Node or Leaf); None until fit() is called.
        self.tree = None

    def dispersion(self, labels):
        """Return the spread of ``labels`` measured as their standard deviation."""
        return np.std(labels)

    def quality(self, left_labels, right_labels, current_dispersion):
        """Return the dispersion reduction achieved by a split.

        ``current_dispersion`` is the dispersion before the split; each branch's
        dispersion is weighted by the share of rows that fell into it.
        """
        p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
        return current_dispersion - p * self.dispersion(left_labels) - (1 - p) * self.dispersion(right_labels)

    def split(self, data, labels, index, t):
        """Partition rows by ``data[:, index] <= t``.

        Returns ``(true_data, false_data, true_labels, false_labels)``.
        Both masks are computed explicitly (rather than negating one) so that
        rows whose feature value is NaN end up in neither branch.
        """
        column = data[:, index]
        goes_true = column <= t
        goes_false = column > t
        return data[goes_true], data[goes_false], labels[goes_true], labels[goes_false]

    def find_best_split(self, data, labels):
        """Search every feature/threshold pair for the best admissible split.

        Returns ``(best_quality, best_t, best_index)``. When no split leaves at
        least ``min_leaf`` rows on both sides with strictly positive quality,
        the result is ``(0, None, None)``. Ties keep the first split found.
        """
        current_dispersion = self.dispersion(labels)

        best_quality = 0
        best_t = None
        best_index = None

        for index in range(data.shape[1]):
            # Slice the column once instead of rebuilding it row by row.
            column = data[:, index]
            t_values = np.unique(column)

            # Too many distinct values: fall back to a fixed grid of quantiles.
            if t_values.shape[0] > self.max_values:
                t_values = np.quantile(column, np.linspace(0.01, 0.99, 99))

            for t in t_values:
                # Only the labels are needed to score a candidate, so the data
                # matrix is not copied here; branch sizes equal the label counts.
                true_labels = labels[column <= t]
                false_labels = labels[column > t]
                if len(true_labels) < self.min_leaf or len(false_labels) < self.min_leaf:
                    continue

                current_quality = self.quality(true_labels, false_labels, current_dispersion)

                if current_quality > best_quality:
                    best_quality, best_t, best_index = current_quality, t, index

        return best_quality, best_t, best_index

    # Build the tree with a recursive function.
    def build_tree(self, data, labels, tree_depth, max_depth):
        """Recursively grow the subtree for ``data``/``labels``.

        ``tree_depth`` is the depth of the node being built; a Leaf is returned
        once it reaches ``max_depth`` or when no useful split exists.
        """
        # Check the depth limit first: at the limit the result is a Leaf no
        # matter what, so the expensive split search would be wasted.
        if tree_depth >= max_depth:
            return Leaf(data, labels)

        quality, t, index = self.find_best_split(data, labels)

        # find_best_split leaves index as None exactly when quality stayed 0.
        if index is None:
            return Leaf(data, labels)

        true_data, false_data, true_labels, false_labels = self.split(data, labels, index, t)

        true_branch = self.build_tree(true_data, true_labels, tree_depth + 1, max_depth)
        false_branch = self.build_tree(false_data, false_labels, tree_depth + 1, max_depth)

        return Node(index, t, true_branch, false_branch)

    def predict_object(self, obj, node):
        """Return the prediction for one row ``obj`` starting at ``node``."""
        # Walk down until a Leaf is reached, following the stored split rules.
        while not isinstance(node, Leaf):
            if obj[node.index] <= node.t:
                node = node.true_branch
            else:
                node = node.false_branch
        return node.prediction

    def predict(self, data):
        """Return a list with one prediction per row of ``data``."""
        return [self.predict_object(obj, self.tree) for obj in data]

    def fit(self, data, labels):
        """Grow the tree on ``data``/``labels`` and return ``self``."""
        self.tree = self.build_tree(data, labels, 0, self.max_depth)
        return self

In [5]:
class MyGradienBoost:
    """Gradient boosting over ``MyTreeRegressor`` with validation-driven shrinkage.

    The ensemble prediction is ``sum(alpha * coef_i * tree_i(x))``. Trees are
    added one at a time by ``fit_val_base``; whenever a new tree makes the
    validation MSE worse, that tree's coefficient is shrunk and the coefficient
    used for subsequent trees is multiplied by ``lr_decrement``.

    Parameters
    ----------
    max_depth, min_leaf, max_values
        Passed through to every ``MyTreeRegressor``.
    alpha : float
        Global multiplier applied to every tree's contribution.
    l2 : bool
        If true, new trees are fitted to ``y - prediction``; otherwise to
        ``sign(y - prediction)``.
    max_trees : int
        Upper bound on the number of trees.
    lr_decrement : float
        Factor applied to the running coefficient after a validation
        error increase.
    """

    def __init__(self, max_depth = 3, alpha = 0.1, l2 = True, 
                 max_trees = 100, lr_decrement = 0.5, min_leaf = 5, max_values = 150):
        self.max_depth = max_depth
        self.min_leaf = min_leaf
        self.max_values = max_values
        self.alpha = alpha
        self.trees = []
        # Per-tree coefficients, parallel to self.trees.
        self.coefs = []
        self.loss = self.bias_l2 if l2 else self.bias_l1
        self.max_trees = max_trees
        # Validation MSE recorded after the most recently added tree.
        self.error = float('inf')
        self.lr_decrement = lr_decrement
        # Training stops once the running coefficient drops below this value.
        self.sigma = 0.1

    def predict(self, data):
        """Return the ensemble prediction for every row of ``data`` as an array."""
        total = np.zeros(len(data))
        # One batch call per tree, accumulated in tree order, instead of one
        # call per (row, tree) pair; the per-row arithmetic is unchanged.
        for tree, coef in zip(self.trees, self.coefs):
            total += self.alpha * coef * np.asarray(tree.predict(data), dtype=float)
        return total

    def bias_l1(self, y, z):
        """Target for the next tree under L1 loss: the sign of the residual."""
        return np.sign(y - z)

    def bias_l2(self, y, z):
        """Target for the next tree under L2 loss: the residual itself."""
        return (y - z)

    def mse(self, y_real, prediction):
        """Return the mean squared error between ``y_real`` and ``prediction``."""
        return (sum((y_real - prediction)**2)) / len(y_real)

    def fit_val_base(self, X, X_val, y, y_val, verbose=True):
        """Add trees until the coefficient decays below ``sigma`` or ``max_trees`` is hit.

        ``X``/``y`` are used to fit the trees; ``X_val``/``y_val`` are used only
        to monitor the MSE that drives coefficient shrinkage. Trees are appended
        to the existing ensemble, so calling this again continues training.

        When ``verbose`` is true (the default) a progress line is printed after
        every tree. That line calls ``r2_score``, which is looked up in the
        enclosing module/notebook namespace at call time.
        """
        coef = 1

        while coef >= self.sigma and len(self.trees) < self.max_trees:

            tree = MyTreeRegressor(max_depth=self.max_depth, min_leaf = self.min_leaf, max_values=self.max_values)

            # The first tree is fitted to the raw target; later trees are fitted
            # to the loss-specific residual of the current ensemble.
            if len(self.trees) == 0:
                fit_target = y
            else:
                fit_target = self.loss(y, self.predict(X))

            tree.fit(X, fit_target)
            self.trees.append(tree)
            self.coefs.append(coef)

            _pred = self.predict(X_val)
            error = self.mse(y_val, _pred)

            if self.error < error:
                # Validation got worse: damp the tree just added (fixed factor
                # 0.1, independent of lr_decrement) and lower the coefficient
                # used for the trees that follow.
                self.coefs[-1] = self.coefs[-1] * 0.1
                coef *= self.lr_decrement
            # Note: this is the error measured before any damping above.
            self.error = error

            if verbose:
                print(f'mse: {error}, trees: {len(self.trees)}, current coef: {coef}\nr2_score: {r2_score(y_val, _pred)}')

In [6]:
def r2_score(y_true, y_pred):
    """Return the coefficient of determination of ``y_pred`` against ``y_true``.

    Computed as one minus the ratio of the residual sum of squares to the
    total sum of squares around the mean of ``y_true``.
    """
    residual_ss = np.sum((y_true - y_pred) ** 2)
    total_ss = np.sum((y_true - y_true.mean()) ** 2)
    return 1 - residual_ss / total_ss

In [7]:
# Load the train and test tables from the local kaggle_data directory.
train_df = pd.read_csv('kaggle_data/train.csv')
test_df = pd.read_csv('kaggle_data/test.csv')

# Number of training rows; used later to cut the combined frame back into
# its train and test parts by position.
train_ind = train_df.shape[0]
# Stack train on top of test so that feature engineering is applied to both at once.
df = pd.concat([train_df, test_df], sort=False)
df.describe()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,10000.0
mean,9999.5,45.9254,1.9834,1698.1,1.71445,0.37815,0.1282,0.11485,0.05485,0.03175,0.0189,64.3408
std,5773.647028,8.000165,1.777718,524.562578,0.79287,0.484937,0.334321,0.318849,0.227693,0.175338,0.136175,13.536823
min,0.0,23.0,0.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0
25%,4999.75,41.0,0.0,1300.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0
50%,9999.5,46.0,2.0,1500.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0
75%,14999.25,51.0,3.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0
max,19999.0,68.0,10.0,4050.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0


In [8]:
# Derived features built from qualification, experience, price and age.
# The "+ 1" shift on experience and its square root are reused by several
# features, so they are named once here.
experience_plus_one = df['years_of_experience'] + 1
experience_plus_one_root = experience_plus_one ** 0.5

# Qualification scaled by (shifted) experience.
df['qual_mult_years'] = df['qualification'] * experience_plus_one
df['qual_mult_years_0.5'] = df['qualification'] * experience_plus_one_root

# Price on a log scale and price normalised by experience / qualification.
df['log_price'] = np.log(df['lesson_price'])
df['price_feature_1'] = df['lesson_price'] / df['qual_mult_years']
df['price_feature_2'] = df['lesson_price'] / experience_plus_one_root
df['price_feature_3'] = df['lesson_price'] / experience_plus_one

# Age with (the square root of) experience subtracted.
df['age_min_years_of_exp'] = df['age'] - df['years_of_experience']
df['age_min_years_of_exp_0.5'] = df['age'] - df['years_of_experience'] ** 0.5
df.describe()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,qual_mult_years,qual_mult_years_0.5,log_price,price_feature_1,price_feature_2,price_feature_3,age_min_years_of_exp,age_min_years_of_exp_0.5
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,10000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,9999.5,45.9254,1.9834,1698.1,1.71445,0.37815,0.1282,0.11485,0.05485,0.03175,0.0189,64.3408,5.39175,2.901318,7.391729,557.502496,1122.578368,826.855188,43.942,44.775745
std,5773.647028,8.000165,1.777718,524.562578,0.79287,0.484937,0.334321,0.318849,0.227693,0.175338,0.136175,13.536823,4.805012,1.849101,0.301641,404.299562,440.276724,555.596633,8.086582,8.01305
min,0.0,23.0,0.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,1.0,1.0,5.298317,18.181818,75.592895,28.571429,17.0,20.55051
25%,4999.75,41.0,0.0,1300.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,2.0,1.732051,7.17012,241.666667,779.422863,400.0,38.0,39.267949
50%,9999.5,46.0,2.0,1500.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,4.0,2.0,7.31322,400.0,1073.312629,625.0,44.0,45.0
75%,14999.25,51.0,3.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,6.0,4.0,7.673223,750.0,1400.0,1200.0,49.0,50.0
max,19999.0,68.0,10.0,4050.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,33.0,11.313708,8.306472,2200.0,2700.0,2700.0,68.0,68.0


In [9]:
# Cut the combined frame back into its parts by row position:
# the first train_ind rows came from train.csv, the rest from test.csv.
df_train = df.iloc[:train_ind]
df_test = df.iloc[train_ind:]

In [10]:
# Name of the column to predict.
target = 'mean_exam_points'
# Every other column except the row identifier is used as a model feature.
excluded_columns = ['Id', 'mean_exam_points']
columns = [name for name in df_train.columns if name not in excluded_columns]

In [11]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is reproducible.
features_labelled = df_train[columns]
target_labelled = df_train[target]
X, X_val, y, y_val = train_test_split(
    features_labelled,
    target_labelled,
    test_size=0.2,
    random_state=35,
)
X.shape

(8000, 18)

In [12]:
# Hyper-parameters for this run, collected in one place.
boost_params = dict(max_depth=5, alpha=0.2, max_trees=75, lr_decrement=0.7, min_leaf=5)
model = MyGradienBoost(**boost_params)

# The model works on plain numpy arrays, so the pandas containers are unwrapped.
model.fit_val_base(X.values, X_val.values, y.values, y_val.values)

mse: 2784.035530918676, trees: 1, current coef: 1
r2_score: -13.906790197324968
mse: 1797.2145097328648, trees: 2, current coef: 1
r2_score: -8.622973320076738
mse: 1164.8830772169647, trees: 3, current coef: 1
r2_score: -5.237229174570765
mse: 761.2551833781301, trees: 4, current coef: 1
r2_score: -3.0760511779457635
mse: 501.8769616728644, trees: 5, current coef: 1
r2_score: -1.6872410532991964
mse: 336.2745008447629, trees: 6, current coef: 1
r2_score: -0.8005421903322258
mse: 229.95070772015808, trees: 7, current coef: 1
r2_score: -0.23124396856374863
mse: 161.644399855732, trees: 8, current coef: 1
r2_score: 0.13449410811693974
mse: 118.05589974956546, trees: 9, current coef: 1
r2_score: 0.3678835957447397
mse: 90.18589964924658, trees: 10, current coef: 1
r2_score: 0.517110227258952
mse: 71.92832207565236, trees: 11, current coef: 1
r2_score: 0.6148682750203427
mse: 60.250201827909684, trees: 12, current coef: 1
r2_score: 0.6773973938117208
mse: 52.74662738070192, trees: 13, curr

In [13]:
# Score the test rows with the fitted ensemble.
test_features = df_test[columns].values
predictions = model.predict(test_features)

# Use the sample submission as the output template and overwrite its target column.
predictions_df = pd.read_csv('kaggle_data/submission_example.csv')
predictions_df['mean_exam_points'] = predictions
predictions_df.head()

Unnamed: 0,Id,mean_exam_points
0,10000,55.524054
1,10001,63.917991
2,10002,47.185339
3,10003,90.365954
4,10004,88.974492


In [14]:
# Write the submission file; index=False keeps the row index out of the CSV.
predictions_df.to_csv('kaggle_data/predictions_final.csv', index=False)