In [333]:
import random


import numpy as np
from numpy.linalg import eig, inv, eigh, norm

import scipy as sc
from scipy.linalg import sqrtm
from scipy.spatial.distance import cdist

from sklearn import metrics, datasets
from sklearn.cluster import KMeans, SpectralClustering

import matplotlib.pyplot as plt

## Trying to write a KMeans clustering that would work with complex vectors

In [245]:
class ComplexKMeans:
    def __init__(self, k_clusters=2, n_iter=500, random_seed=None):
        self.k_clusters = k_clusters
        self.n_iter = n_iter
        self.is_fit = False
        self.random_seed = random_seed
    
    distance_matrix = lambda self, x: np.array([[norm(vec - centroid) for centroid in self.centroids] for vec in x])
    
    def _assign_labels(self):
        is_changed = False
        for vec_num, vec_dists in enumerate(self.distances):
            new_label = vec_dists.argmin()
            if new_label != self.labels[vec_num]:
                is_changed = True
            self.labels[vec_num] = new_label
        return is_changed
    
    def fit(self, x):
        # initialize centroids and labels
        random.seed(self.random_seed)
        uniques = np.unique(x, axis=0)
        self.centroids = uniques[random.sample(range(len(uniques)), self.k_clusters)]
        #print(self.centroids)
        self.labels = np.array([0] * len(x))
        
        for it in range(self.n_iter):
            # calculate distances and assign centroids
            self.distances = self.distance_matrix(x)
            is_changed = self._assign_labels()
            
            if not is_changed:
                self.is_fit = True
                break
            """
            print('---')
            print('Step {}: '.format(it))
            print('Labels: {}'.format(self.labels))
            print('Distances: {}'.format(self.distances))
            """
            
            # recalculate centroids
            for cl_num in range(self.k_clusters):
                vecs = [vec for ind, vec in enumerate(x) if self.labels[ind] == cl_num]
                self.centroids[cl_num] = np.average(vecs, axis=0)
        
        return self
    
    # not sure what I should do with these methods yet
    def predict(self):
        return self.labels
    
    def fit_predict(self, x):
        self.fit(x)
        return self.predict()

In [329]:
def best_kmeans_choice(kmeans, data, n_iter=100):
    results = {}
    for it in range(n_iter):
        fitted_kmeans = kmeans.fit(data)
        metric = np.average([[norm(vec - centroid) for c_ind, centroid in enumerate(fitted_kmeans.centroids) if fitted_kmeans.labels[v_ind] == c_ind] for v_ind, vec in enumerate(data)])
        results[metric] = fitted_kmeans
    print(min(results))
    return results[min(results)]

### Testing it on the iris dataset

Currently one of the points (`iris.data[50]`) gets a cluster label different from the `sklearn.cluster.KMeans` results

In [3]:
iris = datasets.load_iris()

In [116]:
complexify_2d = lambda data: np.array([list(map(lambda x: complex(float(x), 0), data[i])) for i in range(len(data))])

In [298]:
iris_ckmeans = best_kmeans_choice(ComplexKMeans(k_clusters=3), iris.data)
ckmeans_labels = iris_ckmeans.labels

iris_kmeans = KMeans(n_clusters=3).fit(iris.data)
kmeans_labels = iris_kmeans.labels_

99


In [225]:
print('ComplexKMeans score: {}'.format(metrics.adjusted_rand_score(iris.target, ckmeans_labels)))
print('KMeans score: {}'.format(metrics.adjusted_rand_score(iris.target, kmeans_labels)))

ComplexKMeans score: 0.6907610401119466
KMeans score: 0.6907610401119466


In [228]:
permutation = {0: 2, 1: 1, 2: 0}
np.array([permutation[i] for i in ckmeans_labels])

array([0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2])

In [227]:
kmeans_labels

array([0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int32)

In [229]:
[i for i, j in enumerate([k == permutation[l] for k, l in zip(kmeans_labels, ckmeans_labels)]) if j == False]

[]

In [75]:
iris.data[50]

array([7. , 3.2, 4.7, 1.4])

In [207]:
iris_ckmeans.centroids

array([[5.90132731, 2.74826934, 4.39346548, 1.43387562],
       [6.85008133, 3.07376241, 5.74140944, 2.07061647],
       [5.01276596, 3.42978723, 1.47021277, 0.24680851]])

In [208]:
iris_kmeans.cluster_centers_

array([[6.85008133, 3.07376241, 5.74140944, 2.07061647],
       [5.01276596, 3.42978723, 1.47021277, 0.24680851],
       [5.90132731, 2.74826934, 4.39346548, 1.43387562]])

## Spectral clustering of the graph itself

In [328]:
def spectral_clustering(adjacency):

    adj_matrix = np.matrix(adjacency)

    diag_degrees = np.diag([np.sum(i) for i in adj_matrix])
    sqrt_diag_degrees = sqrtm(inv(diag_degrees))

    laplacian = np.identity(len(adj_matrix)) - np.dot(np.dot(sqrt_diag_degrees, adj_matrix), sqrt_diag_degrees)
    #laplacian = diag_degrees - adj_matrix
    #laplacian = adj_matrix
    
    vals, vecs = eig(laplacian)
    vecs = vecs.transpose()
    #print(vals)
    
    # dict of eigen_value: eigen_vector
    eigs = dict(zip(vals, vecs))
    # find max eigenvalues
    eig_vals = list(eigs.keys())
    max_eig_val = max(eig_vals, key=abs)
    eig_vals.remove(max_eig_val)
    max2_eig_val = max(eig_vals, key=abs)
    
    to_cluster = np.array(list(map(lambda vec: np.array(vec)[0], [eigs[max_eig_val], eigs[max2_eig_val]]))).transpose()
    #kmeans = KMeans(n_clusters=2, random_state=0).fit(to_cluster)
    #print("To cluster: ")
    #print(to_cluster)
    ckmeans = best_kmeans_choice(ComplexKMeans(k_clusters=2), to_cluster)
    print("Final centroids: ")
    print(ckmeans.centroids)
    return ckmeans.labels

In [320]:
l = 100
cluster_size = int(l / 2)
adjacency = np.array([[1] * cluster_size + [0] * cluster_size] * cluster_size +
                        [[0] * cluster_size + [1] * cluster_size] * cluster_size)

In [332]:
true_labels = [0] * cluster_size + [1] * cluster_size
pred_labels = spectral_clustering(adjacency)

print("Real matrix: {}".format(metrics.adjusted_rand_score(true_labels, pred_labels)))
print("Real labels: {}".format(pred_labels))


0.05533532753788884
Final centroids: 
[[ 0.00359287+0.j  0.02591154+0.j]
 [-0.0220705 +0.j -0.1591709 +0.j]]
Real matrix: 0.07389482573813623
Real labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0
 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0]


In [163]:
def rnd_test(a, seed):
    random.seed(seed)
    return random.sample(a, int(len(a) / 2))

In [166]:
print(rnd_test(range(10), 0))
print(rnd_test(list(map(lambda x: x + 1, range(10))), 0))

[6, 9, 0, 2, 4]
[7, 10, 1, 3, 5]


In [156]:
pred_labels

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0])

In [157]:
pred_labels_complex

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0])

In [336]:
sc = SpectralClustering(n_clusters=2, affinity='precomputed').fit(adjacency)



In [337]:
sc.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)