**COMPARING CLUSTERS**

In [2]:
import numpy as np
from math import comb
import itertools

# Cluster assignment and ground-truth class of each of the ten objects.
cluster_indices = np.array([1, 2, 1, 1, 1, 3, 1, 1, 1, 1])
true_classes = np.array([1, 2, 1, 1, 1, 2, 2, 2, 2, 1])

# Candidate labels: every integer between the smallest and largest observed value.
cluster_labels = range(cluster_indices.min(), cluster_indices.max() + 1)
class_labels = range(true_classes.min(), true_classes.max() + 1)

# True class labels of the members of each cluster, one array per cluster label.
clusters = [true_classes[cluster_indices == label] for label in cluster_labels]

# f11: pairs of objects that share both the cluster and the class.
# Within one cluster, n members of a class form comb(n, 2) such pairs.
f11 = sum(
    comb(int(np.count_nonzero(members == cls)), 2)
    for members in clusters
    for cls in class_labels
)

# f00: pairs of objects that differ in both cluster and class.
# For two distinct clusters, pair every member of class `cls` in the first
# with every member of any other class in the second.
f00 = sum(
    int(np.count_nonzero(first == cls)) * int(np.count_nonzero(second != cls))
    for first, second in itertools.combinations(clusters, 2)
    for cls in class_labels
)

# K: total number of unordered object pairs (kept as a float, as printed below).
n_objects = cluster_indices.size
K = n_objects * (n_objects - 1) / 2

# Rand index: fraction of pairs on which clustering and classes agree.
R = (f11 + f00) / K

print("clusters:", clusters)
print("f11, f00:", f11, f00)
print("K:", K)
print("R:", R)

clusters: [array([1, 1, 1, 1, 2, 2, 2, 1]), array([2]), array([2])]
f11, f00: 13 10
K: 45.0
R: 0.5111111111111111


In [4]:
import sklearn.metrics.cluster as cluster_metrics
import numpy as np

def _count_pairs(counts):
    '''Number of unordered pairs inside each group, summed over all groups.'''
    counts = np.asarray(counts, dtype=np.int64)
    return int((counts * (counts - 1) // 2).sum())


def clusterval(y, clusterid):
    '''
    CLUSTERVAL Estimate cluster validity using the Rand statistic, the
    Jaccard coefficient and the normalized mutual information (NMI).

    Usage:
      Rand, Jaccard, NMI = clusterval(y, clusterid)

    Input:
       y         vector of N class labels (array-like, flattened to 1-D)
       clusterid vector of N cluster indices (array-like, flattened to 1-D)

    Label values are arbitrary; they do not have to be 0-based or contiguous.

    Output:
      Rand       Rand index, (f00 + f11) / (f00 + f01 + f10 + f11).
      Jaccard    Jaccard coefficient, f11 / (f01 + f10 + f11).
      NMI        Result of normalized_mutual_info_score(y, clusterid).

    Here f11 / f00 count object pairs with same / different class AND
    same / different cluster, f10 counts same class but different cluster,
    and f01 counts different class but same cluster.

    Rand or Jaccard is NaN when its denominator is zero (fewer than two
    objects, or no pair sharing a class or a cluster). Entropy and purity
    are not computed and not returned.
    '''
    y = np.asarray(y).ravel()
    clusterid = np.asarray(clusterid).ravel()
    NMI = cluster_metrics.normalized_mutual_info_score(y, clusterid)

    N = y.size

    # Map the raw label values onto positions 0..C-1 and 0..K-1 so the
    # contingency table can be indexed regardless of the actual label values.
    classes, class_pos = np.unique(y, return_inverse=True)
    cluster_ids, cluster_pos = np.unique(clusterid, return_inverse=True)

    # contingency[k, c] = number of objects of class c inside cluster k.
    contingency = np.zeros((cluster_ids.size, classes.size), dtype=np.int64)
    np.add.at(contingency, (cluster_pos, class_pos), 1)

    # Pair counts follow from the table without enumerating all N*(N-1)/2 pairs.
    f11 = _count_pairs(contingency)                        # same class, same cluster
    f01 = _count_pairs(contingency.sum(axis=1)) - f11      # different class, same cluster
    f10 = _count_pairs(contingency.sum(axis=0)) - f11      # same class, different cluster
    f00 = N * (N - 1) // 2 - f11 - f01 - f10               # different class, different cluster

    n_pairs = f00 + f01 + f10 + f11
    rand = float(f00 + f11) / n_pairs if n_pairs else float('nan')

    # Undefined when no pair shares a class or a cluster (e.g. all singletons).
    n_linked = f01 + f10 + f11
    jaccard = float(f11) / n_linked if n_linked else float('nan')

    return rand, jaccard, NMI

import numpy as np

# Worked example: ten objects, their assigned cluster and their true class.
cluster_indices = np.array([1, 1, 1, 1, 1, 1, 2, 3, 3, 3])
true_classes = np.array([1, 2, 1, 2, 3, 3, 3, 2, 3, 3])

# clusterval takes the class labels first and the cluster indices second.
Rand, jaccard, NMI = clusterval(y=true_classes, clusterid=cluster_indices)

# Report each score on its own line.
for label, value in (("R(and):", Rand), ("Jaccard:", jaccard), ("NMI:", NMI)):
    print(label, value)

R(and): 0.4666666666666667
Jaccard: 0.14285714285714285
NMI: 0.1862746501647187


  p_ij[k,c] = m_ij.astype(float)/m_i[k]
