In [486]:
from scipy import special
import numpy as np
import matplotlib.pyplot as plt
from numba import jit, vectorize, float64, int64

In [487]:
## object class for node
class node():
    """the class for each node"""
    
    def __init__(self, left, right, data, cnum):
        self.l = left
        self.r = right
        if(left == None and right == None):
            self.n = 1
        else:
            self.n = left.n + right.n
        self.data = data
        self.cluster = cnum

In [489]:
@jit(float64(float64[:,:], int64[:]))
## function to calculate the posterior probability
def p_hyp1(dataset, a):
    # extract the number of features and the total number of data
    #print(dataset)
    if (len(dataset.shape) == 1):
        N = 1
        k = dataset.shape[0]
        #print(k)
        # part I
        p1 = 1
        comp = special.gamma(np.sum(dataset)+1) / np.prod(special.gamma(dataset+1))
        p1 = p1 * comp
        
        # part II
        # iterate to calculate the probability
        p2 = p1 * special.gamma(np.sum(a)) / special.gamma(np.sum(dataset) + np.sum(a))
        for j in range(k):
            #print(j)
            comp = special.gamma(a[j] + np.sum(dataset[j])) / special.gamma(a[j])
            p2 = p2 * comp
    else:
        N = dataset.shape[0]
        k = dataset.shape[1]
        #print(k)
    
        # part I
        p1 = 1
        for i in range(N):
            comp = special.gamma(np.sum(dataset[i, :])+1) / np.prod(special.gamma(dataset[i, :]+1))
            p1 = p1 * comp
        
        # part II
        # iterate to calculate the probability
        p2 = p1 * special.gamma(np.sum(a)) / special.gamma(np.sum(dataset) + np.sum(a))
        for j in range(k):
            #print(j)
            comp = special.gamma(a[j] + np.sum(dataset[:, j])) / special.gamma(a[j])
            p2 = p2 * comp

    return p2

In [490]:
## function to calculate the d
def get_d(node, a):
    if node.l == None and node.r == None:
        return a
    else:
        return a*special.gamma(node.n) + get_d(node.l, a)*get_d(node.r, a)

In [491]:
## function to calculate the weight or pi
def get_pi(node, a):
    dk = get_d(node, a)
    pi_k = a*special.gamma(node.n)/dk
    return pi_k

In [492]:
# get dk
def get_dk(node, a):
    post = p_hyp1(node.data, np.repeat(a, data.shape[1]))
    pi = get_pi(node, a)
    if node.l == None and node.r == None:
        return  pi * post
    else:
        return  pi * post + (1-pi) * get_dk(node.l, a) * get_dk(node.r, a)

In [493]:
## test dataset
sdata = np.random.randint(0,4, size=(5,5))
sdata

array([[1, 3, 0, 2, 3],
       [3, 3, 0, 1, 3],
       [2, 3, 1, 0, 2],
       [1, 2, 2, 3, 1],
       [2, 2, 0, 2, 3]])

In [494]:
with open("wine.csv") as f:
    next(f)
    text = f.read() 

data = []
lines  = text.split('\n')
for line in lines[:-1]:
    arr = line.split(';')
    fl = [int(np.round(float(x))) for x in arr]
    data.append(fl)

data = np.array(data)
data

array([[ 7,  1,  0, ...,  1,  9,  5],
       [ 8,  1,  0, ...,  1, 10,  5],
       [ 8,  1,  0, ...,  1, 10,  5],
       ..., 
       [ 6,  1,  0, ...,  1, 11,  6],
       [ 6,  1,  0, ...,  1, 10,  5],
       [ 6,  0,  0, ...,  1, 11,  6]])

In [497]:
tdata = data[:10,:5]

In [498]:
def bhc(data, a=1, r_thres=0.5):
    node_list = []
    node_list_copy = []
    for i in range(data.shape[0]):
        node_list.append(node(None, None, np.array([data[i,:]]), i))
        node_list_copy.append(node(None, None, np.array([data[i,:]]), i))
    #print(node_list)
    
    c = data.shape[0]
    
    while c > 1:
        flag = False
        #print([node.cluster for node in node_list_copy])
        for i in range(len(node_list)):
            for j in range(i+1, len(node_list)):
                #print("first node:", node_list[i].data)
                #print("second node:", node_list[j].data)
                #if len(node_list[i].data.shape) == 1 and len(node_list[j].data.shape) == 1:
                    #newdata = np.array([node_list[i].data, node_list[i].data])
                #else:
                newdata = np.concatenate((node_list[i].data, node_list[j].data), axis = 0)
                #print(np.concatenate((node_list[i].data, node_list[j].data), axis = 0))
                node_new = node(node_list[i], node_list[j], newdata, min(node_list[i].cluster,node_list[j].cluster))
                pi_k = get_pi(node_new, a)
                print(pi_k)
                p_hyp = p_hyp1(node_new.data, np.repeat(a, data.shape[1]))
                print(p_hyp)
                p_dk = get_dk(node_new, a)
                print(p_dk)
                rk = pi_k * p_hyp / p_dk
                #print(rk)
                if rk >= r_thres:
                    for k in range(len(node_list_copy)):
                        entry = node_list_copy[k].cluster
                        if entry == node_list[i].cluster or entry == node_list[j].cluster:
                            node_list_copy[k].cluster = min(node_list[i].cluster,node_list[j].cluster)
                    node_list =  node_list[:i] + node_list[(i+1):j] + node_list[(j+1):]
                    node_list = [node_new] + node_list
                    #print([node.cluster for node in node_list])
                    c = c - 1
                    flag = True
                    break
            if(flag == True):
                break
        
        if(flag == False):
            c = 1        
        #print(c)
    return node_list, node_list_copy
    

In [499]:
tdata

array([[ 7,  1,  0,  2,  0],
       [ 8,  1,  0,  3,  0],
       [ 8,  1,  0,  2,  0],
       [11,  0,  1,  2,  0],
       [ 7,  1,  0,  2,  0],
       [ 7,  1,  0,  2,  0],
       [ 8,  1,  0,  2,  0],
       [ 7,  1,  0,  1,  0],
       [ 8,  1,  0,  2,  0],
       [ 8,  0,  0,  6,  0]])

In [500]:
node_list, node_list_cluster = bhc(tdata, a=1, r_thres=0)

0.5
1.33128414189e-05
5.15232463476e-10
0.5
4.80976074773e-07
2.07134316484e-12
0.6
3.83213331073e-09
2.52372357587e-15
0.705882352941
2.1018439599e-10
5.05059038668e-17
0.779220779221
1.30712986589e-11
1.27229338988e-18
0.823798627002
8.35393751282e-13
3.29605358114e-20
0.852215082854
6.60479493437e-14
1.34267571441e-21
0.872085478219
4.82785695329e-15
4.66294650314e-23
0.886989934346
4.18770573233e-17
1.72582164001e-25


In [501]:
[node.cluster for node in node_list_cluster]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [502]:
[node.data for node in node_list]

[array([[ 7,  1,  0,  2,  0],
        [ 8,  1,  0,  3,  0],
        [ 8,  1,  0,  2,  0],
        [11,  0,  1,  2,  0],
        [ 7,  1,  0,  2,  0],
        [ 7,  1,  0,  2,  0],
        [ 8,  1,  0,  2,  0],
        [ 7,  1,  0,  1,  0],
        [ 8,  1,  0,  2,  0],
        [ 8,  0,  0,  6,  0]])]

In [503]:
print([node.cluster for node in node_list])

[0]


In [504]:
len(node_list)

1

In [505]:
len(np.unique([node.cluster for node in node_list]))

1

In [506]:
len(np.unique([node.cluster for node in node_list_cluster]))

1