In [29]:
import copy
import numpy as np
import pandas as pd

from src.common.functions import get_feature_importance
%cd /home/heza7322/PycharmProjects/missing-value-handling-in-carts
from src.trinary_tree import TrinaryTree
from src.weighted_tree import WeightedTree
from src.common.functions import get_indices, calculate_loss, fit_response

/home/heza7322/PycharmProjects/missing-value-handling-in-carts


### Create data

In [132]:
# Synthetic one-feature data set with a step response and randomly removed
# feature values.
n = 1000
seed = 11
np.random.seed(seed)

# Feature 0 takes the values 0, 10, 20, ... with ten consecutive rows per value.
X_true = pd.DataFrame()
X_true[0] = np.floor(np.arange(n) / 10) * 10

# The response is 10 below the 10% quantile of the feature and 100 from there on.
t = X_true[0].quantile(0.10)
y = pd.Series(np.where(X_true[0] >= t, 100.0, 10.0), index=X_true.index)

# Blank out each feature value independently with probability `missing_prob`;
# X_true keeps the complete data, X is the version with NaNs.
missing_prob = 0.1
to_remove = np.random.binomial(1, missing_prob, n) == 1
X = X_true.copy()
X[0] = X_true[0].mask(to_remove)

In [133]:
# Hyperparameters shared by both trees: a single level of splitting, and
# leaves are allowed to hold as little as one observation.
max_depth, min_samples_leaf = 1, 1

In [134]:
# Weighted tree: fit on the data containing missing values, print the fitted
# structure, then report the in-sample mean squared error.
weight_tree = WeightedTree(min_samples_leaf=min_samples_leaf, max_depth=max_depth)
weight_tree.fit(X, y)
weight_tree.print()

print(f'mse: {((weight_tree.predict(X) - y) ** 2).mean()}')

Number of observations: 1000.0
Response estimate: 91.0
loss: 729.0
if 0 <  95.0:
---Number of observations: 104.03
---Response estimate: 18.91
---loss: 2008.73
if 0 >=  95.0:
---Number of observations: 895.97
---Response estimate: 99.37
---loss: 56.31
mse: 61.6470102


In [135]:
# Trinary tree: same data and hyperparameters as the weighted tree; print the
# fitted structure and the in-sample mean squared error for comparison.
trin_tree = TrinaryTree(min_samples_leaf=min_samples_leaf, max_depth=max_depth)
trin_tree.fit(X, y)
trin_tree.print()

print(f'mse: {((trin_tree.predict(X) - y) ** 2).mean()}')


Number of observations: 1000
Response estimate: 91.0
loss: 729.0
if 0 <  95.0:
---Number of observations: 93
---Response estimate: 10.0
---loss: 0.0
if 0 n/a:
---Number of observations: 106
---Response estimate: 91.0
---loss: 508.92
if 0 >=  95.0:
---Number of observations: 801
---Response estimate: 100.0
---loss: 0.0
mse: 53.946


In [136]:
# Inputs for evaluating one candidate split by hand: feature 0, split at the
# breakpoint t used to generate y, with unit weight on every observation.
feature, splitter = 0, t
w = pd.Series(1.0, index=y.index)


In [137]:
# Trinary tree: loss of splitting `feature` at `splitter`, shown next to the
# fitted tree's own `loss` attribute.
new_loss = trin_tree._calculate_split_loss(X, y, feature, splitter)

print(f'Previous loss: {trin_tree.loss}', f'New loss: {new_loss}', sep='\n')

Previous loss: 729.0
New loss: 407.6485471698113


In [138]:
# Weighted tree: loss of the same candidate split (unit weights `w`), shown
# next to the fitted tree's own `loss` attribute.
new_loss = weight_tree._calculate_split_loss(
    X=X,
    y=y,
    w=w,
    feature=feature,
    splitter=splitter,
)

print(f'Previous loss: {weight_tree.loss}', f'New loss: {new_loss}', sep='\n')


Previous loss: 729.0
New loss: 259.4115490273654


In [139]:
# Re-trace by hand how the weighted tree handles this split, to understand why
# its split loss differs from the trinary tree's.
# NOTE(review): the exact semantics of get_indices and
# _get_split_probabilities are inferred from how they are used here - confirm.
index_left, index_right = get_indices(X[feature], splitter)
p_left, p_right = weight_tree._get_split_probabilities(index_left, index_right)

# Rows that belong to neither child (presumably the rows whose feature is NaN).
index_na = ~(index_left | index_right)

# Each child gets its own copy of the weights; the rows in neither child are
# scaled by that child's split probability ...
w_left = w.copy()
w_left.loc[index_na] *= p_left
w_right = w.copy()
w_right.loc[index_na] *= p_right

# ... and are then added to both children's membership masks.
index_left |= index_na
index_right |= index_na

In [140]:
# Response fit for the left child, using the left mask and left weights built
# above (rows outside both children included at weight p_left).
fit_response(y = y.loc[index_left], w = w_left.loc[index_left])

(18.909999999999997, {}, None)

In [141]:
# Response fit for the right child, using the right mask and right weights
# built above (rows outside both children included at weight p_right).
fit_response(y = y.loc[index_right], w = w_right.loc[index_right])

(99.37, {}, None)

In [149]:
# Quantities used in the closed-form check of the leaf estimates.
a_l, a_r = 10, 100   # response value below / at-or-above the breakpoint t
p_r = 0.9            # share of observations at or above the breakpoint
p_m = missing_prob   # probability that a feature value was removed



In [150]:
# Weighted
# Closed-form values for the two child estimates. Algebraically, each one mixes
# the pure leaf value (weight 1 - p_m) with the overall mean
# p_r*a_r + (1 - p_r)*a_l (weight p_m).
# NOTE(review): presumably this models the missing rows' contribution to each
# child of the weighted tree - confirm against WeightedTree.
# The arithmetic is left exactly as written: float rounding depends on the
# evaluation order and the displayed values would change otherwise.
a_r_hat = (1-p_m)*a_r + p_m*p_r*a_r + p_m*(1-p_r)*a_l
a_l_hat = (1-p_m)*a_l + p_m*(1-p_r)*a_l + p_m * p_r * a_r
# Combine the two child values with the shares p_r and 1 - p_r.
a_m_hat = p_r * a_r_hat + (1-p_r) * a_l_hat
[a_l_hat,a_m_hat,a_r_hat]

[18.1, 91.0, 99.1]

In [151]:
# Sample mean of the response, for comparison with a_m_hat computed above.
y.mean()

91.0

In [54]:
# Responses and weights of the rows assigned to the left child.
# NOTE(review): this cell's execution count (In [54]) is lower than that of
# the cells above it, so it may have been run against an earlier kernel
# state - re-run after Restart & Run All before trusting the outputs below.
y_left, w_left_true = y.loc[index_left], w_left.loc[index_left]

In [60]:
# Weighted mean of the left-child responses.
# NOTE(review): the recorded result (66.0) disagrees with the 18.91 recorded
# for fit_response on the same slice earlier in the notebook; the
# non-sequential execution counts suggest it stems from stale kernel state.
(y_left*w_left_true).sum()/w_left_true.sum()

66.00000000000001

In [64]:
# Mean left-child response per distinct weight value, which separates the
# full-weight rows from the down-weighted ones.
y_w = pd.DataFrame({'y': y_left, 'w': w_left_true})

y_w.groupby('w')['y'].mean()

w
0.09434    89.795918
1.00000     0.000000
Name: y, dtype: float64

In [100]:
# Total weight assigned to the right child.
# NOTE(review): the recorded value (905.66) differs from the 895.97
# observations the weighted tree printed for its right child; the execution
# count (In [100]) suggests this output predates the current data - re-run.
w_right.loc[index_right].sum()

905.6603773584905

In [38]:
# Split loss recomputed by hand: each child's loss is scaled by that child's
# total weight, and the sum is divided by the tree's observation count.
# NOTE(review): the execution count (In [38]) shows this cell ran long before
# the cells above; its recorded output does not match the split loss printed
# by _calculate_split_loss earlier - re-run after Restart & Run All.
left_y, left_w = y.loc[index_left], w_left.loc[index_left]
right_y, right_w = y.loc[index_right], w_right.loc[index_right]

loss_left_weighted = calculate_loss(y=left_y, w=left_w) * left_w.sum()
loss_right_weighted = calculate_loss(y=right_y, w=right_w) * right_w.sum()
(loss_left_weighted + loss_right_weighted) / weight_tree.n


881.0139982439954