In [1]:
import faiss
from sklearn.datasets import load_iris
import numpy as np

In [10]:
def run_kmeans(x, nmb_clusters, verbose=False, use_gpu=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    if use_gpu:
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0
        index = faiss.GpuIndexFlatL2(res, d, flat_config)
    else:
        index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
#     return clus
    _, I = index.search(x, 1)
    
    # compute distance
    distance = []
    centroids = faiss.vector_to_array(clus.centroids)
    centroids = centroids.reshape(nmb_clusters, x.shape[1])
    images_lists = [[] for i in range(nmb_clusters)]
    for i in range(x.shape[0]):
        images_lists[I[i]].append(i)
    
    for clu in images_lists:
        for index in clu:
            distance.append(compute_dis(clu, index, centroids))
    
    clus_index2dis = [(clus_index, dis) for clus_index, dis in zip(I, distance)]
    
    # losses = faiss.vector_to_array(clus.obj)  # this option was replaced. The fix is:
    stats = clus.iteration_stats
    losses = np.array([stats.at(i).obj for i in range(stats.size())])
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return clus, clus_index2dis, losses[-1]

In [11]:
def preprocess_features(npdata, pca=256):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata =  npdata.astype('float32')

#     # Apply PCA-whitening with Faiss
#     mat = faiss.PCAMatrix (ndim, pca, eigen_power=-0.5)
#     mat.train(npdata)
#     assert mat.is_trained
#     npdata = mat.apply_py(npdata)

#     # L2 normalization
#     row_sums = np.linalg.norm(npdata, axis=1)
#     npdata = npdata / row_sums[:, np.newaxis]

    return npdata


In [12]:
from collections import Counter

def accuracy(clu2index, path2label):
    aux = [[] for i in range(len(clu2index))]
    for i, clu in enumerate(clu2index):
        for x_index in clu:
            for index, label in enumerate(path2label):
#             for index, (_, label) in enumerate(path2label):
                if x_index == index:
                    aux[i].append(label)

    # clu_index2label = {k: 0 for k in range(len(clu2index))}
    clu_index2label = np.empty(shape=(len(clu2index), len(clu2index)))
    axis2times = []
    for i, clu in enumerate(aux):
        c = Counter(clu)
        for k, v in c.items():
            clu_index2label[i, k] = v  # 第i个类簇k出现了v次
            axis2times.append(((i, k), v))

    axis2times.sort(key=lambda v: v[1], reverse=True)
    clu2label = {k: 0 for k in range(len(clu2index))}
    flag1 = {k: False for k in range(len(clu2index))}
    flag2 = {k: False for k in range(len(clu2index))}
    for t in axis2times:
        if t[1] > clu2label[t[0][0]]:
            if flag1[t[0][1]] or flag2[t[0][0]]:
                continue
            clu2label[t[0][0]] = t[0][1]
            flag1[t[0][1]] = True
            flag2[t[0][0]] = True

    correct = 0
    for i, clu in enumerate(clu2index):
        for index in clu:
            if path2label[index] == clu2label[i]:
                correct += 1

    acc = correct / len(path2label)
    print(f"acc: {acc}")

In [13]:
iris = load_iris()
X = iris["data"]
Y = iris["target"]

In [14]:
X.shape

(150, 4)

In [15]:
Y.shape

(150,)

In [16]:
X = preprocess_features(X)

In [17]:
kmeans, clus_index2dis, loss = run_kmeans(X, nmb_clusters=3, verbose=True)

TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
kmeans

In [None]:
centroids = faiss.vector_to_array(kmeans.centroids)

In [None]:
centroids.shape

In [None]:
centroids

In [None]:
centroids = centroids.reshape(3, 4)

In [None]:
centroids

In [None]:
images_lists = [[] for i in range(3)]
for i in range(X.shape[0]):
    images_lists[I[i]].append(i)

In [None]:
len(images_lists)

In [None]:
path2label = Y.tolist()

## validate acc

In [None]:
accuracy(images_lists, path2label)

## distance to centroids

In [None]:
# doubt
distance.shape

In [None]:
centroids

In [None]:
centroids.shape

In [None]:
X.shape

In [None]:
images_lists

In [None]:
X

In [None]:
X[0]

In [None]:
centroids

In [None]:
centroids[0]

In [None]:
def compute_dis(clu_index, path_index, centroids):
    feature = X[path_index]
    clu_centroid = centroids[clu_index]
    dis = np.linalg.norm(feature - clu_centroid)
    return dis

In [None]:
compute_dis(0, images_lists[0][0], centroids)