In [4]:
from math import sqrt

In [5]:
def pearson(v1, v2):
    """Return a Pearson-correlation-based distance between two vectors.

    The result is ``1.0 - r`` where ``r`` is the Pearson correlation
    coefficient of ``v1`` and ``v2``: perfectly correlated vectors give 0.0
    and perfectly anti-correlated vectors give 2.0.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers; it is indexed over ``range(len(v1))``.

    Returns:
        ``1.0 - r``, or 0 when ``v1`` is empty or when the denominator of
        the correlation is zero (e.g. one vector is constant).
    """
    n = len(v1)
    if n == 0:
        # No data points: the correlation is undefined; avoid dividing by zero.
        return 0

    sum1 = sum(v1)
    sum2 = sum(v2)

    sum1Sq = sum(pow(v, 2) for v in v1)
    sum2Sq = sum(pow(v, 2) for v in v2)

    pSum = sum(v1[i] * v2[i] for i in range(n))

    num = pSum - (sum1 * sum2 / n)

    # These terms are mathematically non-negative, but floating-point
    # rounding can leave them a hair below zero, which would make sqrt()
    # raise "math domain error". Clamp them at zero.
    var1 = max(sum1Sq - pow(sum1, 2) / n, 0)
    var2 = max(sum2Sq - pow(sum2, 2) / n, 0)
    den = sqrt(var1 * var2)
    if den == 0:
        return 0

    return 1.0 - num / den

In [6]:
class bicluster:
    """A node in a binary cluster tree.

    Plain data holder: it stores the node's vector, its optional left and
    right child nodes, the distance value associated with the node, and an
    identifier.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        # Payload of the node and its identifier.
        self.vec, self.id = vec, id
        # Child nodes; both default to None.
        self.left, self.right = left, right
        # Distance value supplied by the caller (0.0 by default).
        self.distance = distance

In [19]:
def hcluster(rows, distance=pearson):
    """Build a binary cluster tree by repeatedly merging the closest pair.

    Every row starts as its own leaf ``bicluster`` whose id is the row
    index. On each round the two clusters with the smallest ``distance``
    are replaced by a new cluster whose vector is the element-wise average
    of the pair; merged clusters get negative ids (-1, -2, ...).

    Args:
        rows: Sequence of numeric vectors.
        distance: Callable ``distance(vec_a, vec_b)`` returning a comparable
            number; smaller means closer.

    Returns:
        The root ``bicluster`` of the resulting tree. For a single row this
        is that row's leaf.

    Raises:
        ValueError: If ``rows`` is empty.
    """
    if len(rows) == 0:
        raise ValueError("hcluster() requires at least one row")

    # Cache of pairwise distances keyed by (id_a, id_b); ids are never reused.
    distances = {}
    currentclustid = -1

    # Initial clusters: one leaf per row.
    clust = [bicluster(row, id=i) for i, row in enumerate(rows)]

    while len(clust) > 1:
        lowestpair = (0, 1)
        # Seeded from the first scanned pair (0, 1) below, so that pair's
        # distance goes through the cache instead of being recomputed on
        # every round.
        closest = None

        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)

                d = distances[key]

                # Strict '<' keeps the earliest pair on ties.
                if closest is None or d < closest:
                    closest = d
                    lowestpair = (i, j)

        left = clust[lowestpair[0]]
        right = clust[lowestpair[1]]

        # Element-wise average of the two merged clusters.
        mergevec = [
            (left.vec[k] + right.vec[k]) / 2.0
            for k in range(len(clust[0].vec))
        ]
        # Create the new cluster.
        newcluster = bicluster(mergevec, left=left, right=right,
                               distance=closest, id=currentclustid)

        # Clusters that were not in the original set get negative ids.
        currentclustid -= 1
        # Delete the higher index first so the lower one stays valid.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]

In [44]:
def printclust(clust, labels=None, n=0):
    """Print a cluster tree as an indented outline, one node per line.

    A node with a negative id is printed as ``-``. Any other node is printed
    as its id, or as ``labels[id]`` when ``labels`` is given. Children are
    printed one space deeper than their parent, left child first.

    Args:
        clust: Node exposing ``id``, ``left`` and ``right`` attributes.
        labels: Optional indexable mapping node ids to display values;
            values are converted with ``str()``.
        n: Number of leading spaces for this node.
    """
    indent = ' ' * n

    if clust.id < 0:
        print(indent + '-')
    elif labels is None:
        # Identity check: '== None' would invoke __eq__, which misbehaves
        # for array-like label containers.
        print(indent + str(clust.id))
    else:
        print(indent + str(labels[clust.id]))

    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)

In [45]:
# Small demo data set: four rows of four values each.
data = [
    [4, 5, 0, 1],
    [6, 3, 1, 0],
    [0, 1, 6, 8],
    [1, 0, 7, 7],
]

# Cluster the rows with the default distance, then print the tree using
# row indices as leaf names.
clust = hcluster(data)
printclust(clust)

-
 -
  2
  3
 -
  0
  1
