In [38]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import sys
from sklearn import tree
import matplotlib.pyplot as plt
import graphviz


In [43]:
df = pd.read_csv('train.csv')
df_train, df_test = train_test_split(df,test_size = 0.2,shuffle=False)
feature_names = ['OverallQual', 'GrLivArea', 'GarageCars']
X_train = df_train[feature_names]
y_train = df_train['SalePrice']
y_train.to_csv('y_train.csv')

X_test = df_test[feature_names]
y_test = df_test[['SalePrice']]

In [45]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [44]:
## build sklearn regression trree
tree_max_depth = 5 # controls overfitting 
min_samples_leaf = 5 # controls overfitting 

sk_dt_reg = tree.DecisionTreeRegressor(max_depth = tree_max_depth, min_samples_leaf = min_samples_leaf, criterion = 'squared_error')
sk_dt_reg.fit(X_train,y_train)
sk_dt_preds = sk_dt_reg.predict(X_train)
sk_dt_scores = r2_score(sk_dt_preds,y_train)
print('sk dt scores train:',sk_dt_scores)
sk_dt_preds = sk_dt_reg.predict(X_test)
sk_dt_scores = r2_score(sk_dt_preds,y_test)
print('sk dt scores test:',sk_dt_scores)

sk dt scores train: 0.7793135243935883
sk dt scores test: 0.6410367361839437


In [61]:
### plot the tree
dot_data = tree.export_graphviz(sk_dt_reg,out_file = None,feature_names = feature_names, filled = True, special_characters = True,max_depth=3)
graph = graphviz.Source(dot_data)
# graph

ValueError: Length of feature_names, 3 does not match number of features, 30

In [60]:
## build sklearn random forest
n_estimators = 100
criterion='squared_error'
max_depth = None
min_samples_split = 2
min_samples_leaf = 1
min_weight_fraction_leaf = 0
max_features = sqrt(len(feature_names) - 1)/len(feature_names) # according to GENIE3 suggestion
max_leaf_nodes = None
min_impurity_decrease = 0
bootstrap = True
oob_score = True  # to use out-of-bag samples to estimate the generalization score (available for bootstrap=True)
n_jobs = None # number of jobs in parallel. fit, predict, decision_path, and apply can be done in parallel over the trees
random_state=None # controls randomness in bootstrapping as well as drawing features
verbose=1 
warm_start=False # reuse the slution of the previous call to fit and add more ensembles to the estimator. look up on Glosery
ccp_alpha=0 # complexity parameter used for minima cost-complexity pruning. by default, no prunning
max_samples = None # if bootstrap is True, the number of samples to draw from the samples. if none, draw X.shape[0]

sk_dt_reg = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, 
                                  max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease,
                                  bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose,
                                  warm_start=warm_start, ccp_alpha=ccp_alpha, max_samples=max_samples)
sk_dt_reg.fit(X_train,y_train)
sk_dt_preds = sk_dt_reg.predict(X_train)
sk_dt_scores = r2_score(sk_dt_preds,y_train)
print('sk dt scores train:',sk_dt_scores)
sk_dt_preds = sk_dt_reg.predict(X_test)
sk_dt_scores = r2_score(sk_dt_preds,y_test)
print('sk dt scores test:',sk_dt_scores)

sk dt scores train: 0.9756206622686743
sk dt scores test: 0.8408906189026994


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [None]:
### build sklearn random forest
sk_dt_reg = RandomForestRegressor()
sk_dt_reg.fit(X_train,y_train)
sk_dt_preds = sk_dt_reg.predict(X_train)
sk_dt_scores = r2_score(sk_dt_preds,y_train)
print('sk dt scores train:',sk_dt_scores)
sk_dt_preds = sk_dt_reg.predict(X_test)
sk_dt_scores = r2_score(sk_dt_preds,y_test)
print('sk dt scores test:',sk_dt_scores)

In [None]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                      feature_names=iris.feature_names,  
                      class_names=iris.target_names,  
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = graphviz.Source(dot_data) 

In [None]:
class Node:
    def __init__(self, x, y, idxs, min_leaf=5):
        self.x = x 
        self.y = y
        self.idxs = idxs 
        self.min_leaf = min_leaf
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        self.val = np.mean(y[idxs]) 
        self.score = float('inf')
        self.find_varsplit()
    def find_varsplit(self): #find where to split
        for c in range(self.col_count): self.find_better_split(c) # after this, the row and column of split is determined by scoring
        if self.is_leaf: return
        x = self.split_col
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(self.x, self.y, self.idxs[lhs], self.min_leaf)
        self.rhs = Node(self.x, self.y, self.idxs[rhs], self.min_leaf)
    @property
    def split_col(self): return self.x.values[self.idxs,self.var_idx]

    @property
    def is_leaf(self): return self.score == float('inf') 
    def find_better_split(self, var_idx): # determines row and column of the split
        x = self.x.values[self.idxs, var_idx]
        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]
            if rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf: continue # prunning

            curr_score = self.find_score(lhs, rhs)
            if curr_score < self.score: 
                self.var_idx = var_idx # the chosen column
                self.score = curr_score
                self.split = x[r] # the row to split

    def find_score(self, lhs, rhs):
        y = self.y[self.idxs]
        lhs_std = y[lhs].std()
        rhs_std = y[rhs].std()
#         return lhs_std * lhs.sum() + rhs_std * rhs.sum() #score is calculated by: std_l * sum_x_l + std_r*sum_x_r => ??? why sum of ihs? 
        return lhs_std + rhs_std  #score is calculated by: std_l * sum_x_l + std_r*sum_x_r => ??? why sum of ihs? 

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf: return self.val
        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)
class DecisionTreeRegressor:
  
    def fit(self, X, y, min_leaf = 5):
        self.dtree = Node(X, y, idxs = np.array(np.arange(len(y))), min_leaf=min_leaf)
        return self

    def predict(self, X):
        return self.dtree.predict(X.values)
regressor = DecisionTreeRegressor().fit(X_train, y_train)

In [None]:
preds = regressor.predict(X)
r2_score(y, preds)

In [None]:
pred_test = regressor.predict(X_test)
r2_score(y_test, pred_test)