## Importing
This cell is only concerned with importing the libraries and methods needed for implementing spectral clustering.

In [216]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn import metrics

## Reading data
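To run the algorithm, we read the data into a multidimensional array of shape 19 × 8 × 12 × 125 × 45: the first three axes follow the `a`, `p` and `s` levels of the file paths, and each file contributes 125 rows of 45 comma-separated values.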

In [217]:
# Load every segment file into one nested array.
# Files are read from data/a<01..19>/p<1..8>/s<49..60>.txt; only the first
# 125 lines of each file are parsed, as comma-separated floats.
# Forward slashes are used so the same relative path opens on both Windows
# and POSIX systems.
aps_eval = []
for i in range(1, 20):
    p_eval = []
    for j in range(1, 9):
        s_eval = []
        for k in range(49, 61):
            # {i:02d} zero-pads single-digit folder numbers (a01 ... a09, a10 ...).
            path = f"data/a{i:02d}/p{j}/s{k}.txt"
            # The context manager closes the handle even if parsing fails, so
            # the 1824 files are never left open at the same time.
            with open(path, "r") as file:
                temp = [
                    np.array(file.readline().split(','), dtype=float)
                    for _ in range(125)
                ]
            s_eval.append(np.array(temp))
        p_eval.append(np.array(s_eval))
    aps_eval.append(np.array(p_eval))
aps_eval = np.array(aps_eval)

## Points using means (first method)

Here we loop over our array to create the points, using the mean of every column of a file as a feature. This gives a total of 1824 samples, each with 45 features.

In [218]:
# One point per file: the column-wise mean taken over the rows of that file.
# Collapsing the three leading axes walks the files in the same a -> p -> s
# order as three nested index loops over those axes would.
per_file = aps_eval.reshape(-1, *aps_eval.shape[-2:])
eval_points_means = np.array([np.mean(sample, axis=0) for sample in per_file])
print("Number of points of the means method = ", eval_points_means.shape[0])
print("Number of features of the means method = ", eval_points_means.shape[1])

Number of points of the means method =  1824
Number of features of the means method =  45


## Points using flattening (second method)

Here we loop over our array to create the points, using every element of a file as a feature. This gives a total of 1824 samples, each with 5625 features. We then apply PCA, keeping enough components to explain 85% of the variance, which reduces each sample to 136 features.

In [219]:
# One point per file: all values of the file laid out row after row.
# A row-major reshape produces the same values in the same order as appending
# them one at a time, without roughly ten million Python-level list appends
# and without hand-computed offsets into the sample list.
n_files = aps_eval.shape[0] * aps_eval.shape[1] * aps_eval.shape[2]
eval_points_raw = aps_eval.reshape(n_files, -1)
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (85%) of the variance.
eval_points_flattened = PCA(n_components=0.85).fit_transform(eval_points_raw)
print("Number of points of the flattening method = ", len(eval_points_flattened))
print("Number of features of the flattening method = ", len(eval_points_flattened[0]))

Number of points of the flattening method =  1824
Number of features of the flattening method =  136


## Getting the matrices B

To calculate the matrix $B$ for each dataset, we first compute its similarity matrix $W$ using the RBF kernel function with the gamma parameter set to 0.00001. We then subtract $W$ from the degree matrix $D$ to obtain the Laplacian $L = D - W$, and multiply it by the inverse degree matrix, giving $B = D^{-1} L$.

In [220]:
# Width parameter of the RBF kernel used to build both similarity graphs.
RBF_GAMMA = 0.00001


def random_walk_laplacian(points, gamma=RBF_GAMMA):
    """Return B = D^-1 (D - W) for the RBF similarity graph of ``points``.

    Parameters
    ----------
    points : array-like, one row per sample
        Samples whose pairwise RBF similarities form the matrix W.
    gamma : float
        ``gamma`` argument passed to ``rbf_kernel``.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row and one column per sample.
    """
    similarity = rbf_kernel(points, points, gamma=gamma)
    # Degree of a point = sum of its similarities to every point (itself included).
    degrees = similarity.sum(axis=1)
    # Laplacian L = D - W.
    laplacian = np.diag(degrees) - similarity
    # Left-multiplying by D^-1 divides row i by degree i. Every kernel value is
    # positive, so no degree can be zero.
    return laplacian / degrees[:, np.newaxis]


# The variable names are kept from the earlier version of this cell: despite
# the "sim_mat" prefix, both hold the matrix B, not the raw similarity matrix.
# B for the first method
sim_mat_means = random_walk_laplacian(eval_points_means)

# B for the second method
sim_mat_flattened = random_walk_laplacian(eval_points_flattened)



## Getting points for clustering

To get the points to be clustered, we compute the eigenvalues and eigenvectors of the matrix $B$, then take the $k$ smallest eigenvalues (here $k = 19$); the eigenvectors corresponding to them form the columns of our new data matrix.

In [221]:
# Number of eigenvectors kept as features for each point.
N_EIGENVECTORS = 19


def smallest_eigenvectors(matrix, k=N_EIGENVECTORS):
    """Return the eigenvectors of ``matrix`` belonging to its k smallest eigenvalues.

    Eigenvalues are ranked by their real part, and only the real part of the
    selected eigenvectors is kept.

    Parameters
    ----------
    matrix : numpy.ndarray
        Square matrix to decompose.
    k : int
        Number of eigenvectors to keep.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``matrix`` and ``k`` columns. The columns
        are ordered from the k-th smallest eigenvalue (first column) down to
        the smallest one (last column).
    """
    eigen_values, eigen_vectors = np.linalg.eig(matrix)
    # Indices of the k smallest eigenvalues, in ascending order.
    order = np.real(eigen_values).argsort()[:k]
    # np.linalg.eig stores eigenvectors as columns, so select columns directly.
    selected = np.real(eigen_vectors[:, order])
    # Reverse the columns so the smallest eigenvalue comes last, and return an
    # independent, writable array rather than a view into the decomposition.
    return np.ascontiguousarray(selected[:, ::-1])


# Spectral embedding of the means method
data_means = smallest_eigenvectors(sim_mat_means)

# Spectral embedding of the flattening method
data_flattened = smallest_eigenvectors(sim_mat_flattened)

## Normalizing the rows

Each row represents a point in our new space; the rows are scaled to unit Euclidean length before running k-means on them.

In [222]:
# Scale every row of both embeddings to unit Euclidean length, in place.
for embedding in (data_means, data_flattened):
    for row in embedding:
        # `row` is a view into `embedding`, so the division updates it directly.
        row /= np.linalg.norm(row)

## Performing kmeans

The built-in k-means implementation is used to cluster those points, and the labels are written to files so they can be used for evaluation if needed. We also compute the silhouette score of each clustering as a way to compare the two methods.

In [223]:
# Number of clusters requested from k-means.
N_CLUSTERS = 19
# Fixed seed so that re-running the notebook gives the same clustering.
KMEANS_SEED = 0
# Layout of the label files: 19 blocks of 8 lines with 12 labels per line.
LABEL_GRID = (19, 8, 12)


def cluster_and_save(data, labels_path, n_clusters=N_CLUSTERS, seed=KMEANS_SEED):
    """Cluster ``data`` with k-means, print its silhouette score, save the labels.

    The labels are written to ``labels_path`` in sample order: 12 labels per
    line, each followed by a space, with an empty line after every 8 lines.

    Parameters
    ----------
    data : numpy.ndarray
        One row per sample; must contain 19 * 8 * 12 rows.
    labels_path : str
        File that receives the labels (overwritten if it exists).
    n_clusters : int
        Number of clusters for k-means.
    seed : int
        ``random_state`` passed to ``KMeans``.

    Returns
    -------
    KMeans
        The fitted model.
    """
    model = KMeans(n_clusters=n_clusters, random_state=seed)
    model.fit(data)
    print(metrics.silhouette_score(data, model.labels_))
    label_grid = model.labels_.reshape(LABEL_GRID)
    # The context manager closes the file even if a write fails.
    with open(labels_path, "w") as labels_file:
        for block in label_grid:
            for line in block:
                labels_file.write("".join(str(label) + " " for label in line))
                labels_file.write('\n')
            labels_file.write('\n')
    return model


kmeans = cluster_and_save(data_means, "labels-means.txt")
kmeans = cluster_and_save(data_flattened, "labels-flattened.txt")

0.1724464491082588
0.2646674903835613
