In [169]:
import pickle
# Load the pickled table from 'hist1D.p'. The context manager closes the file
# handle even if unpickling fails (the bare open() call never closed it).
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# 'hist1D.p' when it comes from a trusted source.
with open('hist1D.p', 'rb') as hist_file:
    P = pickle.load(hist_file)

In [335]:
from pandas.api.types import CategoricalDtype
class DTreeNode:
    """One node of a greedy decision tree built over a pandas feature frame.

    After ``train`` a node is either
      * a leaf, which has the attribute ``pred`` (the predicted column label), or
      * an internal node, which has ``best_col`` and ``children``:
          - ``best_col == (col,)``       categorical split; ``children`` is a dict
                                          keyed by category value;
          - ``best_col == (col, thr)``   numeric split; ``children[0]`` handles
                                          ``value < thr`` and ``children[1]``
                                          handles ``value >= thr``.

    ``y`` is a DataFrame with one column per candidate label. A row's label is
    the column holding the row-wise maximum (see ``leaf``).
    NOTE(review): when ``y`` holds errors/regrets, the row-wise maximum is the
    worst column, not the best one - confirm that argmax is the intended choice.
    """

    # Upper bound on the number of thresholds sampled per numeric column.
    MAX_THRESHOLD_CANDIDATES = 50

    def __init__(self, md, qf):
        """Store the remaining depth budget ``md`` and the quality function ``qf``.

        ``qf`` is called with a (subset of the) ``y`` frame and must return a
        number where smaller means better.
        """
        self.max_depth = md
        self.quality_func = qf

    def leaf(self, y):
        """Turn this node into a leaf predicting the most frequent row label of ``y``.

        Returns ``self`` so the call can be chained.
        """
        bests = y.columns[np.argmax(np.array(y), axis=1)]
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        freq_table = pd.Series(bests).value_counts()
        self.pred = freq_table.idxmax()
        return self

    def train(self, X, y):
        """Greedily pick the split that lowers the quality score most and recurse.

        Parameters
        ----------
        X : DataFrame of features. Columns with a categorical dtype are split by
            category, every other column by a threshold.
        y : DataFrame of per-label values, row-aligned with ``X``.

        Returns
        -------
        self

        Numeric thresholds are sampled with ``np.random.choice`` (global NumPy
        random state), so the tree can differ between runs unless that state is
        seeded beforehand.
        """
        if len(X.columns) == 0 or self.max_depth == 0:
            return self.leaf(y)

        cur_score = self.quality_func(y)
        best_col = None
        q_min = cur_score
        for col in X:
            x = X[col]
            if isinstance(x.dtype, CategoricalDtype):
                # Score only categories that actually occur in this subset: an
                # absent category has weight 0, and calling the quality function
                # on an empty frame may yield NaN and poison the weighted sum.
                present = [c for c in x.cat.categories if (x == c).any()]
                sizes = np.array([(x == c).sum() for c in present]) / len(x)
                scores = np.array([self.quality_func(y[x == c]) for c in present])
                qs = (sizes * scores).sum()
                if qs < q_min:
                    q_min = qs
                    best_col = (col, )
            else:
                n_candidates = min(len(x), self.MAX_THRESHOLD_CANDIDATES)
                for elem in np.random.choice(x, n_candidates, replace=False):
                    less = y[x < elem]
                    geq = y[x >= elem]
                    if len(less) == 0 or len(geq) == 0:
                        # Degenerate split: one side is empty, so nothing is
                        # separated. Skipping it also prevents floating-point
                        # rounding from making it look like an improvement.
                        continue
                    qs = (len(less) * self.quality_func(less)
                          + len(geq) * self.quality_func(geq)) / len(x)
                    if qs < q_min:
                        q_min = qs
                        best_col = (col, elem)

        # best_col is only set by a strict improvement, so "no split found" is
        # exactly "no improvement". This also covers a NaN cur_score, which the
        # comparison q_min >= cur_score would not catch.
        if best_col is None:
            return self.leaf(y)

        self.best_col = best_col
        x = X[best_col[0]]
        if len(best_col) == 1:
            children = {}
            for c in x.cat.categories:
                mask = (x == c)
                child = DTreeNode(self.max_depth - 1, self.quality_func)
                if mask.any():
                    child.train(X[mask], y[mask])
                else:
                    # Category declared in the dtype but absent from this subset:
                    # there is nothing to learn from, so predict this node's
                    # majority label instead of training on empty data.
                    child.leaf(y)
                children[c] = child
            self.children = children
        else:
            e = best_col[1]
            below = x < e
            above = x >= e
            self.children = [
                DTreeNode(self.max_depth - 1, self.quality_func).train(X[below], y[below]),
                DTreeNode(self.max_depth - 1, self.quality_func).train(X[above], y[above]),
            ]
        return self

    def get_pred(self, x):
        """Return the predicted label for one row ``x`` (indexable by column name)."""
        if hasattr(self, 'pred'):
            return self.pred
        col = self.best_col[0]
        if len(self.best_col) == 1:
            return self.children[x[col]].get_pred(x)
        split = self.best_col[1]
        if x[col] < split:
            return self.children[0].get_pred(x)
        return self.children[1].get_pred(x)
class DTree:
    """Thin wrapper that owns the root ``DTreeNode`` and exposes fit/predict/score."""

    def __init__(self, max_depth, qf):
        """Store the maximum tree depth and the split-quality function ``qf``."""
        self.max_depth = max_depth
        self.quality_func = qf

    def fit(self, X, y):
        """Train the tree on features ``X`` and target frame ``y``.

        The depth is capped at the number of feature columns. Returns ``None``.
        """
        md = min(self.max_depth, X.shape[1])
        self.dtree = DTreeNode(md, self.quality_func).train(X, y)
        return

    def predict(self, x):
        """Predict labels for one row or for a frame of rows.

        * a single row (``pd.Series`` or any other non-DataFrame object that can
          be indexed by column name) -> one label;
        * a DataFrame with exactly one row -> one label (kept from the original
          ``len(x) == 1`` shortcut);
        * any other DataFrame -> ``np.ndarray`` with one label per row.

        Dispatching on ``len(x) == 1`` alone sent a Series with several features
        down the batch path (where ``x.loc[i, :]`` fails) and handed a one-row
        DataFrame to ``get_pred`` as if it were a row.
        """
        if not isinstance(x, pd.DataFrame):
            return self.dtree.get_pred(x)
        if len(x) == 1:
            return self.dtree.get_pred(x.iloc[0])
        # Positional access yields exactly one row per step even when the index
        # contains duplicate labels.
        return np.array([self.dtree.get_pred(x.iloc[pos]) for pos in range(len(x))])

    def score(self, X, y):
        """Return the NUMBER of rows whose prediction equals ``y`` (a count, not a ratio)."""
        preds = np.array(self.predict(X))
        return (preds == y).sum()

In [350]:
def entropy(col):
    """Shannon entropy, in bits, of the value distribution in ``col``.

    ``col`` is any 1-D collection of labels accepted by ``pd.Series``. Missing
    values are ignored (``value_counts`` drops them by default). An empty input
    yields 0.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts;
    # normalize=True returns relative frequencies directly.
    probs = pd.Series(col).value_counts(normalize=True)
    # Only observed values appear in probs, so every probability is > 0 and
    # log2 is always defined.
    return -(probs * np.log2(probs)).sum()
def gini_cnts(cnts):
    """Gini impurity computed from a vector of group counts.

    ``cnts`` must support ``.sum()`` and element-wise division (NumPy array or
    pandas Series). The result is one minus the sum of squared group shares.
    """
    total = cnts.sum()
    shares = cnts / total
    squared_mass = (shares * shares).sum()
    return 1 - squared_mass
def gini(col):
    """Gini impurity of the value distribution in ``col``.

    Counts the occurrences of each distinct value and passes those counts to
    ``gini_cnts``.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    cnts = pd.Series(col).value_counts()
    return gini_cnts(cnts)

In [231]:
from sklearn import datasets
# Load scikit-learn's iris dataset and wrap its `data` array in a DataFrame.
# No column names are passed, so the columns get pandas' default integer labels.
# X_iris is not referenced by any other cell visible in this part of the notebook.
iris = datasets.load_iris()
X_iris = pd.DataFrame(iris.data)

In [278]:
# Feature columns of P that are later passed to the tree as X.
X = P[['domsize', 'scale', 'nnz', 'tvd', 'partcost', 'wkloadhalf']]
# The three "...err" columns of P (presumably one error value per algorithm,
# lower being better - TODO confirm).
ps =  P[['dawaerr', 'iderr', 'hberr']]
# Label every row with the name of the column holding the row-wise MAXIMUM.
# NOTE(review): if these columns are errors, argmax selects the worst column for
# each row; confirm that argmin (the best column) was not the intention.
y = ps.columns[np.argmax(np.array(ps), axis=1)]

In [263]:
# Row-wise minimum over the columns of ps.
m = np.min(np.array(ps), axis=1)
# Divide each row of ps by its own minimum, so the smallest entry of every row
# becomes 1.0 and the others are expressed as multiples of it.
# NOTE(review): a row whose minimum is 0 would produce inf/NaN here - confirm the
# values in ps are strictly positive.
regrets = ps.divide(m, axis='index')

In [343]:

def group_gini(regrets, theta=1.0):
    """Gini impurity of the rows of ``regrets`` after merging similar columns.

    Columns are sorted by their mean over the given rows. Walking that order, a
    column starts a new group when its mean exceeds the mean of the current
    group's first column by more than ``theta``; otherwise it joins the current
    group. Each row is labelled with the column holding its row-wise maximum,
    the rows are counted per group, and the counts are passed to ``gini_cnts``.

    Parameters
    ----------
    regrets : DataFrame with one column per candidate label.
    theta : float, maximum gap in mean value tolerated within one group.

    Notes
    -----
    Row labels are derived from ``regrets`` itself. Reading the notebook-global
    ``y`` instead counted the labels of the whole data set no matter which
    subset of rows was passed in, so every subset scored during tree training
    used whole-data-set counts.
    NOTE(review): labels use argmax so that they match how the global ``y`` is
    built from ``ps``; this assumes the row-wise argmax of ``regrets`` equals
    that of ``ps`` (true when every row minimum is positive) - confirm. If the
    values are errors, argmax picks the worst column; confirm that is intended.
    """
    mean_regrets = regrets.mean(axis='index').sort_values()
    labels = regrets.columns[np.argmax(np.array(regrets), axis=1)]
    last_idx = mean_regrets.index[0]
    num_in_group = 0
    cnts = []
    for i in mean_regrets.index:
        members = (labels == i).sum()
        if mean_regrets[i] - mean_regrets[last_idx] > theta:
            # Gap too large: close the current group and start a new one at i.
            last_idx = i
            cnts.append(num_in_group)
            num_in_group = members
        else:
            num_in_group += members
    if num_in_group > 0:
        cnts.append(num_in_group)
    return gini_cnts(np.array(cnts))

# Disabled experiment, kept as a bare string literal so it is never executed:
# it enumerates all 2**L bit patterns, each describing one way of cutting the
# mean-sorted columns into contiguous groups, and keeps the grouping with the
# smallest gini_cnts value. It reads the notebook-global `y` and `regrets`.
# The trailing `None` presumably keeps the notebook from echoing the string as
# the cell's output.
"""
mean_regrets = regrets.mean(axis='index')
mean_regrets.sort_values(inplace=True)
L = len(mean_regrets)-1
FMT = '{0:0>' + str(L) + 'b}'
min_part = []
min_gini = 2
for p in range(0, 2**L):
    pts = FMT.format(p)
    num_in_group = (mean_regrets.index[0] == y).sum()
    parts = [[mean_regrets.index[0]]]
    cnts = []
    for i in range(0, L):
        M = mean_regrets.index[i+1]
        if(pts[i] == '0'):
            cnts.append(num_in_group)
            num_in_group = (M == y).sum()
            parts.append([M])
        else:
            num_in_group += (M == y).sum()
            parts[-1].append(M)
    if(num_in_group > 0):
        cnts.append(num_in_group)
    cnts = np.array(cnts)
    G = gini_cnts(cnts)
    if(G < min_gini):
        min_gini = G
        min_part = parts
"""
None

In [348]:
# Grouped Gini value of the complete regret table, using the default theta=1.0.
group_gini(regrets)

0.647796910430839

In [349]:
# Build a tree with max_depth=3 that uses group_gini as its split-quality
# function, then train it on the features X with the regret table as target.
d = DTree(3, group_gini)
d.fit(X,regrets)

In [318]:
# Number of rows in the regret table.
len(regrets)

1680

In [333]:
# Rows of regrets whose 'scale' feature is strictly below the 'scale' value
# looked up with key 10 (presumably the row labelled 10 - verify the index of X).
regrets[X.scale < X.scale[10]]

Unnamed: 0,dawaerr,iderr,hberr
0,1.000000,5.210177,7.387663
1,4.715180,3.248413,1.000000
2,1.423533,1.000000,1.978203
3,1.000000,1.454090,1.278760
4,1.000000,5.040383,2.084326
5,1.000000,2.430788,108.481782
6,3.193924,1.000000,2.663389
7,1.000000,4.273642,5.563629
8,1.000000,1.542052,1.774583
9,1.000000,2.647664,47.198848


In [332]:
# Display the current value of the notebook-global y.
y

Index(['hberr', 'dawaerr', 'hberr', 'iderr', 'iderr', 'hberr', 'dawaerr',
       'hberr', 'hberr', 'hberr',
       ...
       'dawaerr', 'iderr', 'hberr', 'dawaerr', 'hberr', 'dawaerr', 'dawaerr',
       'dawaerr', 'dawaerr', 'iderr'],
      dtype='object', length=1680)