# 0. Setup
We will begin by importing all required libraries and fixing the plotting setup.

In [None]:
#imports
import pandas as pd
import numpy as np
import pickle
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import matplotlib.colors as mcolors
import seaborn as sns
import random
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering
from sklearn.cluster import Birch
from sklearn.cluster import OPTICS
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn import metrics
#colab specific: ######################################################################
!pip install umap-learn[plot]
!pip install holoviews
!pip install -U ipykernel
#######################################################################################
import umap

In [2]:
#plot settings
%matplotlib inline
# seaborn theme: white background, notebook context, small 3x3 inch default figure size
sns.set(style='white', context='notebook', rc={'figure.figsize':(3,3)})
plt.ioff() #turns interactive mode off, so that no plots are shown within the notebook unless explicitly demanded by plt.show()

#color configuration
# Start from all named CSS4 colors, draw 100 of them at random (without replacement)
# and store them as a numpy array, so that the clustering cells can map integer
# cluster labels to colors via np.take(colors, labels).
# NOTE: no random seed is set in this cell, so the drawn palette differs between runs.
colors = list(mcolors.CSS4_COLORS.keys())
colors = random.sample(colors, 100)
colors = np.array(colors)

# 1. Setting the Parameters
To keep this document readable, all parameter settings are stored in this section.

In [3]:
#Path to workspace (all output folders and images are created below this directory)
w_path = '/content/gdrive/MyDrive/DIm_red_Clustering/'

#Pre-processing of the imported data - choose between...
# 'feature_stand': feature standardization leading to unit variance and zero mean of all features across the samples
# 'norm_vecs': rescales every embedding vector individually (intended to place all embedding vectors on a unit sphere)
# any other value falls back to feature standardization in the pre-processing cell
pre_processing = 'feature_stand'


#Dimensionality reduction of the imported data - choose between...
# 'PCA'
# 'UMAP'
# any value other than 'UMAP' is treated as PCA by the dimensionality reduction cell
dim_reduction = 'UMAP'


#Data-generation network - choose between...
# 'Mannheim': loads user_embeddings_32d.npy
# 'adapters': loads user_embeddings_adapters.npy
data_generation = 'Mannheim'

# 2. Loading the embeddings

Now, we will have to load the user embeddings from the npy file they are stored in.

We also want to check that the imported data has the desired dimensions, to make sure that nothing went wrong throughout the process of creating and storing the embeddings in the npy file, and importing them into this document.

In [4]:
#colab specific import setup: ######################################################################
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
####################################################################################################

#checking which data to import
#the file is resolved relative to the workspace (w_path), so the location only has to be
#configured once in the parameter section; any value other than 'adapters' loads the 32d file
if data_generation == 'adapters':
  path = os.path.join(w_path, 'user_embeddings_adapters.npy')
else:
  path = os.path.join(w_path, 'user_embeddings_32d.npy')

#loading the data
data = np.load(path)

#create folder to sort images into (exist_ok makes re-running this cell safe)
data_gen_path = w_path + data_generation
os.makedirs(data_gen_path, exist_ok=True)

#Check that embeddings have the correct shape
print(data.shape)

Mounted at /content/gdrive
(2062, 32)


# 3. Pre-processing the embeddings
Before passing our user embeddings on to the dimensionality reduction, we have to pre-process them, so that a few features do not dominate the dimensionality reduction merely because of differences in scale.

In [5]:
#checking which means of data pre-processing to use
if pre_processing == 'norm_vecs':
  #project every embedding vector onto the unit sphere by dividing it by its Euclidean (L2) norm;
  #dividing by the plain sum of the components would not give unit length and can divide by
  #zero or flip the sign of a vector whose components sum to <= 0
  scale_factors = np.linalg.norm(data, axis = 1, keepdims = True)
  scale_factors[scale_factors == 0] = 1 #all-zero vectors cannot be normalized and are left unchanged
  processed_data = data / scale_factors
else:
  #feature standardization: zero mean and unit variance per feature (also the fallback for unknown values)
  processed_data = StandardScaler().fit_transform(data)

#create folder to sort images into (exist_ok makes re-running this cell safe)
pre_processing_path = data_gen_path + '/' + pre_processing
os.makedirs(pre_processing_path, exist_ok=True)

# 4. Performing Dimensionality Reduction


In [None]:
#initializing dict to save reduced dimensionality embeddings in, and to retrieve those embeddings based on the parameters used to create them
#keys are the concatenated settings used in the dimensionality reduction cell:
#data_generation + pre_processing + dim_reduction + n_dims + n_neighbors + min_dist + metric
#(the PCA branch uses 'na' for the three UMAP-only parts)
embeddings = {}


In [None]:
#Create folders to later deposit images in
#(makedirs with exist_ok creates missing parents and does not fail when the cell is re-run)
path_dimred = pre_processing_path + '/' + dim_reduction
path2D = path_dimred + '/' + str(2)
path3D = path_dimred + '/' + str(3)
os.makedirs(path2D, exist_ok=True)
os.makedirs(path3D, exist_ok=True)

print(path2D)
print(path3D)

#folder the pickled reduced embeddings are written to (read again in section 5)
red_emb_path = data_gen_path + '/' + 'reduced_embeddings'
os.makedirs(red_emb_path, exist_ok=True)


def _save_embedding_plot(points, title, out_file):
  """Scatter a 2D or 3D embedding, save it to out_file and close the figure.

  points: array with one row per sample and 2 or 3 columns.
  title: text shown above the plot.
  out_file: path of the image file to write.
  """
  fig = plt.figure()
  if points.shape[1] == 2:
    plt.scatter(points[:, 0], points[:, 1], s=0.1)
  else:
    ax = plt.axes(projection='3d')
    ax.scatter3D(points[:, 0], points[:, 1], points[:, 2], s=0.1)
  plt.title(title, fontsize=8)
  plt.savefig(out_file, pad_inches = 15)
  plt.close(fig) #release the figure; otherwise every iteration leaves one more figure open


#Checking which means of dimensionality reduction to use
if dim_reduction == 'UMAP':

  #UMAP
  #iterating over plausible hyperparameter values: Dimension, number of neighbors, minimum distance and distance metric
  for n_dims in [2,3]:
    image_dir = path2D if n_dims == 2 else path3D
    for n_neighbors in range(2,11,1): #n_neighbors in steps of 1 from 2-10
      for min_dist in range(1,10,1): #min_dist/10 runs from 0.1 to 0.9 in steps of 0.1
        for measure in ['euclidean','manhattan','cosine']: #different distance measures

          reducer = umap.UMAP(n_components = n_dims, n_neighbors = n_neighbors, min_dist = min_dist/10, metric = measure) #initialize umap with desired hyperparams
          reduced_data = reducer.fit_transform(processed_data) #calculate umap and return reduced embeddings
          embeddings[data_generation+ pre_processing + dim_reduction + str(n_dims) + str(n_neighbors)+ str(min_dist/10) + measure] = reduced_data #saving embeddings for retrieval during clustering

          #plotting result and saving plot as image
          title = data_generation + '     ' + pre_processing + '     ' + dim_reduction + '\n' + 'n_neighbors:' + str(n_neighbors)+ '     min_dist:' + str(min_dist/10) + '     metric:' + measure
          _save_embedding_plot(reduced_data, title, image_dir + '/' + str(n_neighbors) + '_' + str(min_dist/10) + '_' + measure +'.pdf')

    #one pickle per dimension; note that the whole dict collected so far is dumped,
    #so the file for n_dims == 3 also contains the 2D embeddings
    with open(red_emb_path + '/' + 'red_embeddings_UMAP_' + pre_processing + str(n_dims) + '.pickle', 'wb') as file:
      pickle.dump(embeddings, file)


else:
  #PCA
  #iterating over hyperparameter: Dimension
  for n_dims in [2,3]:
    pca = PCA(n_components = n_dims)
    reduced_data = pca.fit_transform(processed_data)
    embeddings[data_generation+ pre_processing + dim_reduction + str(n_dims) + 'na'+ 'na' + 'na'] = reduced_data #saving embeddings for retrieval during clustering

    #plotting result and saving plot as image
    title = data_generation + '     ' + pre_processing + '     ' + dim_reduction
    _save_embedding_plot(reduced_data, title, (path2D if n_dims == 2 else path3D) + '/' + 'PCA' + '.pdf')

  #section 5 loads 'red_embeddings_PCA.pickle', so the PCA embeddings have to be stored as well
  with open(red_emb_path + '/' + 'red_embeddings_PCA.pickle', 'wb') as file:
    pickle.dump(embeddings, file)



# 5. Selecting suitable reduced embeddings
As a next step, we have to visually inspect the resulting plots to determine the most suitable reduced-dimensionality embeddings. We note down the hyperparameters used to create these embeddings.

The chosen embeddings can be retrieved later, when clustering, by concatenating the hyperparameters used to create them into the key of the "*embeddings*" dictionary.

In [29]:
#Merging the saved embeddings into one dictionary
#NOTE: pickle.load can execute arbitrary code - only load files that this notebook wrote itself.
red_emb_dir = w_path + 'adapters' + '/' + 'reduced_embeddings'+ '/'


def _load_reduced(file_name):
  """Return the dictionary pickled under file_name in the reduced_embeddings folder."""
  with open(red_emb_dir + file_name, 'rb') as file:
    return pickle.load(file)


umap_feature_stand2 = _load_reduced('red_embeddings_UMAP_feature_stand2.pickle')
umap_feature_stand3 = _load_reduced('red_embeddings_UMAP_feature_stand3.pickle')
umap_norm_vecs2 = _load_reduced('red_embeddings_UMAP_norm_vecs2.pickle')
umap_norm_vecs3 = _load_reduced('red_embeddings_UMAP_norm_vecs3.pickle')
pca = _load_reduced('red_embeddings_PCA.pickle')

#total number of loaded entries, summed over the files
#(a key stored in more than one file is counted once per file, so this can exceed len(embeddings))
length = sum(len(part) for part in (umap_feature_stand2, umap_feature_stand3, umap_norm_vecs2, umap_norm_vecs3, pca))

#for keys present in several files the right-most dictionary wins
embeddings = umap_feature_stand2 | umap_feature_stand3 | umap_norm_vecs2 | umap_norm_vecs3 | pca
print(embeddings)

{'adaptersfeature_standUMAP220.1euclidean': array([[12.82532  ,  1.9280121],
       [13.576631 , 14.858982 ],
       [15.574315 , 10.126566 ],
       ...,
       [11.80869  , -1.2324836],
       [-1.507207 , 11.75822  ],
       [ 2.4852324, 14.774012 ]], dtype=float32), 'adaptersfeature_standUMAP220.1manhattan': array([[-2.4922097,  2.4435098],
       [ 2.5095065, -6.0698004],
       [ 8.163734 , 10.883086 ],
       ...,
       [-1.8012158,  1.6931683],
       [13.7668915,  6.6616306],
       [ 7.9556355,  7.600185 ]], dtype=float32), 'adaptersfeature_standUMAP220.1cosine': array([[ 0.6771398 ,  1.7429458 ],
       [ 0.16681385, 15.417144  ],
       [ 4.8123307 , 13.840278  ],
       ...,
       [15.589223  , -2.9510105 ],
       [15.352176  ,  9.896031  ],
       [ 5.063031  , 13.38885   ]], dtype=float32), 'adaptersfeature_standUMAP220.2euclidean': array([[10.775826 ,  0.7216111],
       [17.656557 , -5.530971 ],
       [ 6.788407 ,  9.505705 ],
       ...,
       [11.244502 ,  3.083

In [31]:
#hyperparameter combinations picked from the previous dimensionality reduction;
#row i of iter_array describes one embedding as
#(data generation, pre-processing, dim. reduction, n_dims, n_neighbors, min_dist, metric), all as strings

#per-embedding values that differ from row to row
dist_array = ['0.4', '0.3', '0.6', '0.3', '0.7', '0.9', '0.3', '0.7', '0.5', '0.4', '0.4', '0.3', '0.6', '0.5', '0.6', '0.5', '0.6', '0.7','0.7','0.8','0.5','0.4','0.4','0.6','0.8','0.9','0.5','0.7','0.1', '0.2', '0.5', '0.6', '0.4','0.3']

metric_array = ['cosine','cosine','cosine','euclidean','euclidean','euclidean','euclidean','manhattan','euclidean','euclidean','cosine','cosine','cosine','cosine','euclidean','manhattan','manhattan','euclidean','manhattan','euclidean','euclidean','euclidean','manhattan','euclidean','euclidean','euclidean','manhattan','euclidean','euclidean','manhattan','manhattan','manhattan','manhattan','euclidean']

#values that are constant for all 34 rows
data_gen_array = np.repeat('adapters', 34)
dim_red_array = np.repeat('UMAP', 34)
neighbors_array = np.repeat('2', 34)

#21 rows with feature standardization followed by 13 rows with normalized vectors
pre_proc_array = np.repeat(['feature_stand', 'norm_vecs'], [21, 13])

#dimensions per pre-processing group: 10x 2D and 11x 3D, then 7x 2D and 6x 3D
n_dims_array = np.repeat(['2', '3', '2', '3'], [10, 11, 7, 6])

#one row per embedding, one column per setting
iter_array = np.stack(
  [data_gen_array, pre_proc_array, dim_red_array, n_dims_array, neighbors_array, dist_array, metric_array],
  axis=1,
)


# 6. Clustering
Finally, we will have to cluster the reduced dimensionality embeddings.

In this section, we will try out different clustering algorithms, each with different hyperparameters. We will furthermore use the Calinski-Harabasz (CH) score to support the qualitative assessment of the clustering, enabling us to choose our final set of hyperparameters.

Please note that this metric calculates the quality score of the clustering based on distances, i.e. factors such as cluster diameter, average distance between cluster points, distance between separate clusters, etc.
Yet, as UMAP focuses on preserving the local structure of the data, the distances between clusters as well as the sizes of the clusters themselves are not interpretable. Moreover, some of the clustering algorithms used do not cluster according to distance but according to density, distribution, or graph structure. Therefore, the metric does not perfectly evaluate the quality of the clustering and merely serves as an approximate aid for the evaluation.

## 6.1 K-Means

In [32]:
#performing k-means clustering by iterating over plausible hyperparameter values

#creating dict to track ch scores (key: embedding key + number of clusters)
ch_tracker = {}
print(iter_array)

#numbers of clusters to try: 2-9 in steps of 1, then 10-100 in steps of 5
cluster_counts = list(range(2,10,1)) + list(range(10,101,5))

#iterating over the different hyperparameter values and reduced dimensionality embeddings
#NOTE: the loop variables overwrite the notebook-level settings of the same name (data_generation, pre_processing, ...)
for data_generation, pre_processing, dim_reduction, n_dims, n_neighbors, min_dist, measure in iter_array: #iteration over different reduced dim embeddings
  emb_key = data_generation + pre_processing + dim_reduction + n_dims + n_neighbors+ min_dist + measure
  reduced_data = embeddings[emb_key]
  print(reduced_data)

  #folder for the images of this embedding; makedirs also creates missing parent folders
  clustering_dir = os.path.join(w_path, data_generation, pre_processing, dim_reduction, n_dims, 'k_means')
  os.makedirs(clustering_dir, exist_ok=True)
  title_head = data_generation + '     ' + pre_processing + '     ' + dim_reduction + '\n' + 'n_neighbors:' + n_neighbors+ '     min_dist:' + min_dist + '     metric:' + measure + '\n'

  for n_clusters in cluster_counts: #iterating over number of clusters

    kmeans = KMeans(init="random", n_clusters=n_clusters, n_init=10, max_iter=300, random_state=42)
    kmeans.fit(reduced_data)
    labels = kmeans.labels_ #extracting labels of each sample
    ch_score = metrics.calinski_harabasz_score(reduced_data, labels)
    ch_tracker[emb_key + str(n_clusters)] = ch_score

    #configuring plot settings, one color for each created label
    fig = plt.figure()
    point_colors = np.take(colors, labels)
    if n_dims == '2':
      plt.scatter(reduced_data[:,0], reduced_data[:,1], c = point_colors, s=0.1)
    else:
      ax = plt.axes(projection='3d')
      ax.scatter3D(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c = point_colors, s=0.1)
    plt.title(title_head + 'ch score:'+str(ch_score) + '    nr clusters:' + str(n_clusters) + '\n Kmeans', fontsize=8)
    plt.savefig(os.path.join(clustering_dir, n_neighbors + '_' + min_dist + '_' + measure + str(n_clusters) +'.pdf'), pad_inches = 15)
    plt.close(fig) #release the figure; otherwise hundreds of figures stay open in memory




[['adapters' 'feature_stand' 'UMAP' '2' '2' '0.4' 'cosine']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.3' 'cosine']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.6' 'cosine']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.3' 'euclidean']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.7' 'euclidean']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.9' 'euclidean']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.3' 'euclidean']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.7' 'manhattan']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.5' 'euclidean']
 ['adapters' 'feature_stand' 'UMAP' '2' '2' '0.4' 'euclidean']
 ['adapters' 'feature_stand' 'UMAP' '3' '2' '0.4' 'cosine']
 ['adapters' 'feature_stand' 'UMAP' '3' '2' '0.3' 'cosine']
 ['adapters' 'feature_stand' 'UMAP' '3' '2' '0.6' 'cosine']
 ['adapters' 'feature_stand' 'UMAP' '3' '2' '0.5' 'cosine']
 ['adapters' 'feature_stand' 'UMAP' '3' '2' '0.6' 'euclidean']
 ['adapters' 'feature_stand' 'UMAP' '3' '2' '0.5' 'manhattan']
 ['adapters' 

  plt.figure()


[[-1.4098315 -1.2458988]
 [-0.19835   15.155466 ]
 [ 5.0484977 15.255997 ]
 ...
 [17.869959   1.8534476]
 [16.638546  10.022413 ]
 [ 5.547081  14.886041 ]]
[[-1.9019655 -1.3828804]
 [ 0.7018281 13.06356  ]
 [ 4.6110716 16.978762 ]
 ...
 [15.668492  -2.1295192]
 [16.923805   9.526438 ]
 [ 5.3650784 16.781603 ]]
[[ 5.214946    0.12763412]
 [22.352123    6.529311  ]
 [19.70649     5.9890723 ]
 ...
 [ 6.7625623   1.7476624 ]
 [-1.931388   13.321524  ]
 [ 3.0041745  13.011845  ]]
[[ 9.590451    0.4823868 ]
 [16.041098   10.058055  ]
 [18.187893    8.43209   ]
 ...
 [ 8.312008    0.6020971 ]
 [-0.23025209 15.540795  ]
 [ 2.9037366  12.9291725 ]]
[[10.035096    0.6314846 ]
 [17.263569   12.44225   ]
 [16.062506   11.140568  ]
 ...
 [ 8.945465   -0.33891827]
 [ 0.51793516 15.834822  ]
 [ 3.3785067  17.647436  ]]
[[ 5.214946    0.12763412]
 [22.352123    6.529311  ]
 [19.70649     5.9890723 ]
 ...
 [ 6.7625623   1.7476624 ]
 [-1.931388   13.321524  ]
 [ 3.0041745  13.011845  ]]
[[ 1.9714965 10.

In [58]:
#Determining the five best clusterings according to ch score and showing them in a table
#(index: concatenated hyperparameters used as key in ch_tracker, column "values": the ch score)
largest_keys = sorted(ch_tracker, key=ch_tracker.get, reverse=True)[:5]
largest_vals = [ch_tracker[x] for x in largest_keys]
pd.DataFrame(largest_vals, index = largest_keys, columns=["values"])

Unnamed: 0,values


## 6.2 Agglomerative Clustering


In [10]:
#performing Agglomerative clustering by iterating over plausible hyperparameter values

#creating dict to track ch scores (key: embedding key + number of clusters)
ch_tracker = {}

#numbers of clusters to try: 2-9 in steps of 1, then 10-100 in steps of 5
cluster_counts = list(range(2,10,1)) + list(range(10,101,5))

#iterating over the different hyperparameter values and reduced dimensionality embeddings
#NOTE: the loop variables overwrite the notebook-level settings of the same name (data_generation, pre_processing, ...)
for data_generation, pre_processing, dim_reduction, n_dims, n_neighbors, min_dist, measure in iter_array: #iteration over different reduced dim embeddings
  emb_key = data_generation + pre_processing + dim_reduction + n_dims + n_neighbors+ min_dist + measure
  reduced_data = embeddings[emb_key]

  #folder for the images of this embedding; makedirs also creates missing parent folders
  clustering_dir = os.path.join(w_path, data_generation, pre_processing, dim_reduction, n_dims, 'agglo')
  os.makedirs(clustering_dir, exist_ok=True)
  title_head = data_generation + '     ' + pre_processing + '     ' + dim_reduction + '\n' + 'n_neighbors:' + n_neighbors+ '     min_dist:' + min_dist + '     metric:' + measure + '\n'

  for n_clusters in cluster_counts: #iterating over number of clusters
    agglo = AgglomerativeClustering(n_clusters = n_clusters)
    agglo.fit(reduced_data)
    labels =  agglo.labels_
    ch_score = metrics.calinski_harabasz_score(reduced_data, labels)
    ch_tracker[emb_key + str(n_clusters)] = ch_score

    #configuring plot settings, one color for each created label
    fig = plt.figure()
    point_colors = np.take(colors, labels)
    if n_dims == '2':
      plt.scatter(reduced_data[:,0], reduced_data[:,1], c = point_colors, s=0.1)
    else:
      ax = plt.axes(projection='3d')
      ax.scatter3D(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c = point_colors, s=0.1)
    plt.title(title_head + 'ch score:'+str(ch_score) + '    nr clusters:' + str(n_clusters) + '\n Agglo', fontsize=8)
    plt.savefig(os.path.join(clustering_dir, n_neighbors + '_' + min_dist + '_' + measure + str(n_clusters) +'.pdf'), pad_inches = 15)
    plt.close(fig) #release the figure; otherwise hundreds of figures stay open in memory

In [None]:
#Determining the five best clusterings according to ch score and showing them in a table
#(index: concatenated hyperparameters used as key in ch_tracker, column "values": the ch score)
largest_keys = sorted(ch_tracker, key=ch_tracker.get, reverse=True)[:5]
largest_vals = [ch_tracker[x] for x in largest_keys]
pd.DataFrame(largest_vals, index = largest_keys, columns=["values"])

Unnamed: 0,values
adaptersnorm_vecsUMAP220.1euclidean100,15946.706822
adaptersnorm_vecsUMAP220.1euclidean95,14775.090654
adaptersnorm_vecsUMAP220.1euclidean90,13806.104279
adaptersnorm_vecsUMAP220.1euclidean85,12956.346527
adaptersnorm_vecsUMAP220.2euclidean100,12609.818757


## 6.3 Spectral Clustering

In [None]:
#performing Spectral clustering by iterating over plausible hyperparameter values


#creating dict to track ch scores (key: embedding key + number of clusters)
ch_tracker = {}

#numbers of clusters to try: 2-9 in steps of 1, then 10-100 in steps of 5
cluster_counts = list(range(2,10,1)) + list(range(10,101,5))

#iterating over the different hyperparameter values and reduced dimensionality embeddings
#NOTE: the loop variables overwrite the notebook-level settings of the same name (data_generation, pre_processing, ...)
for data_generation, pre_processing, dim_reduction, n_dims, n_neighbors, min_dist, measure in iter_array: #iteration over different reduced dim embeddings
  emb_key = data_generation + pre_processing + dim_reduction + n_dims + n_neighbors+ min_dist + measure
  reduced_data = embeddings[emb_key]

  #folder for the images of this embedding; makedirs also creates missing parent folders
  clustering_dir = os.path.join(w_path, data_generation, pre_processing, dim_reduction, n_dims, 'spectral')
  os.makedirs(clustering_dir, exist_ok=True)
  title_head = data_generation + '     ' + pre_processing + '     ' + dim_reduction + '\n' + 'n_neighbors:' + n_neighbors+ '     min_dist:' + min_dist + '     metric:' + measure + '\n'

  for n_clusters in cluster_counts: #iterating over different number of clusters
    spectral = SpectralClustering(n_clusters = n_clusters)
    spectral.fit(reduced_data)
    labels =  spectral.labels_
    ch_score = metrics.calinski_harabasz_score(reduced_data, labels)
    ch_tracker[emb_key + str(n_clusters)] = ch_score

    #configuring plot settings, one color for each created label
    fig = plt.figure()
    point_colors = np.take(colors, labels)
    if n_dims == '2':
      plt.scatter(reduced_data[:,0], reduced_data[:,1], c = point_colors, s=0.1)
    else:
      ax = plt.axes(projection='3d')
      ax.scatter3D(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c = point_colors, s=0.1)
    plt.title(title_head + 'ch score:'+str(ch_score) + '    nr clusters:' + str(n_clusters)+ '\n Spectral', fontsize=8)
    plt.savefig(os.path.join(clustering_dir, n_neighbors + '_' + min_dist + '_' + measure + str(n_clusters) +'.pdf'), pad_inches = 15)
    plt.close(fig) #release the figure; otherwise hundreds of figures stay open in memory

[1.44919714e-15 1.61829967e-04 3.05845843e-05]
not reaching the requested tolerance 3.072619438171387e-05.
Use iteration 136 instead with accuracy 
3.992587140607047e-05.

  _, diffusion_map = lobpcg(
[1.36148293e-15 3.25051026e-05 8.72725116e-05]
not reaching the requested tolerance 3.072619438171387e-05.
  _, diffusion_map = lobpcg(


In [None]:
#Determining the five best clusterings according to ch score and showing them in a table
#(index: concatenated hyperparameters used as key in ch_tracker, column "values": the ch score)
largest_keys = sorted(ch_tracker, key=ch_tracker.get, reverse=True)[:5]
largest_vals = [ch_tracker[x] for x in largest_keys]
pd.DataFrame(largest_vals, index = largest_keys, columns=["values"])

## 6.4 BIRCH Clustering

In [None]:
#performing BIRCH clustering by iterating over plausible hyperparameter values

#creating dict to track ch scores (key: embedding key + number of clusters)
ch_tracker = {}

#numbers of clusters to try: 2-9 in steps of 1, then 10-100 in steps of 5
cluster_counts = list(range(2,10,1)) + list(range(10,101,5))

#iterating over the different hyperparameter values and reduced dimensionality embeddings
#NOTE: the loop variables overwrite the notebook-level settings of the same name (data_generation, pre_processing, ...)
for data_generation, pre_processing, dim_reduction, n_dims, n_neighbors, min_dist, measure in iter_array: #iteration over different reduced dim embeddings
  emb_key = data_generation + pre_processing + dim_reduction + n_dims + n_neighbors+ min_dist + measure
  reduced_data = embeddings[emb_key]

  #folder for the images of this embedding; makedirs also creates missing parent folders
  clustering_dir = os.path.join(w_path, data_generation, pre_processing, dim_reduction, n_dims, 'BIRCH')
  os.makedirs(clustering_dir, exist_ok=True)
  title_head = data_generation + '     ' + pre_processing + '     ' + dim_reduction + '\n' + 'n_neighbors:' + n_neighbors+ '     min_dist:' + min_dist + '     metric:' + measure + '\n'

  for n_clusters in cluster_counts: #iterating over number of clusters
    birch = Birch(n_clusters = n_clusters)
    birch.fit(reduced_data)
    labels =  birch.labels_
    ch_score = metrics.calinski_harabasz_score(reduced_data, labels)
    ch_tracker[emb_key + str(n_clusters)] = ch_score

    #configuring plot settings, one color for each created label
    fig = plt.figure()
    point_colors = np.take(colors, labels)
    if n_dims == '2':
      plt.scatter(reduced_data[:,0], reduced_data[:,1], c = point_colors, s=0.1)
    else:
      ax = plt.axes(projection='3d')
      ax.scatter3D(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c = point_colors, s=0.1)
    #same algorithm label for 2D and 3D plots (the 3D title used to read 'Birch')
    plt.title(title_head + 'ch score:'+str(ch_score) + '    nr clusters:' + str(n_clusters) + '\n BIRCH', fontsize=8)
    plt.savefig(os.path.join(clustering_dir, n_neighbors + '_' + min_dist + '_' + measure + str(n_clusters) +'.pdf'), pad_inches = 15)
    plt.close(fig) #release the figure; otherwise hundreds of figures stay open in memory

In [None]:
#Determining the five best clusterings according to ch score and showing them in a table
#(index: concatenated hyperparameters used as key in ch_tracker, column "values": the ch score)
largest_keys = sorted(ch_tracker, key=ch_tracker.get, reverse=True)[:5]
largest_vals = [ch_tracker[x] for x in largest_keys]
pd.DataFrame(largest_vals, index = largest_keys, columns=["values"])

## 6.5 Optics Clustering

In [None]:
#performing OPTICS clustering by iterating over plausible hyperparameter values

#creating dict to track ch scores (key: embedding key + min_samples)
ch_tracker = {}

#iterating over the different hyperparameter values and reduced dimensionality embeddings
#NOTE: the loop variables overwrite the notebook-level settings of the same name (data_generation, pre_processing, ...)
for data_generation, pre_processing, dim_reduction, n_dims, n_neighbors, min_dist, measure in iter_array: #iteration over different reduced dim embeddings
  emb_key = data_generation + pre_processing + dim_reduction + n_dims + n_neighbors+ min_dist + measure
  reduced_data = embeddings[emb_key]

  #folder for the images of this embedding; makedirs also creates missing parent folders
  clustering_dir = os.path.join(w_path, data_generation, pre_processing, dim_reduction, n_dims, 'Optics')
  os.makedirs(clustering_dir, exist_ok=True)
  title_head = data_generation + '     ' + pre_processing + '     ' + dim_reduction + '\n' + 'n_neighbors:' + n_neighbors+ '     min_dist:' + min_dist + '     metric:' + measure + '\n'

  for min_samples in range(20,100,5): #iterating over number of samples in a neighborhood for a point to be considered as a core point
    optics = OPTICS(min_samples = min_samples)
    optics.fit(reduced_data)
    labels = optics.labels_ #noise samples carry the label -1 and count as their own group in the ch score

    #the ch score needs at least two distinct labels; results without it are plotted but not ranked
    if len(np.unique(labels)) >= 2:
      ch_score = metrics.calinski_harabasz_score(reduced_data, labels)
      #the varied hyperparameter is min_samples, so it (not n_clusters) has to be part of the key
      ch_tracker[emb_key + str(min_samples)] = ch_score
      score_text = str(ch_score)
    else:
      score_text = 'n/a'

    #configuring plot settings, one color for each created label
    #(the modulo keeps -1 on the last color and avoids an index error for more than len(colors) clusters)
    fig = plt.figure()
    point_colors = np.take(colors, labels % len(colors))
    if n_dims == '2':
      plt.scatter(reduced_data[:,0], reduced_data[:,1], c = point_colors, s=0.1)
    else:
      ax = plt.axes(projection='3d')
      ax.scatter3D(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c = point_colors, s=0.1)
    plt.title(title_head + 'ch score:'+ score_text + '    min samples:' + str(min_samples) + '\n Optics', fontsize=8)
    plt.savefig(os.path.join(clustering_dir, n_neighbors + '_' + min_dist + '_' + measure + str(min_samples) +'.pdf'), pad_inches = 15)
    plt.close(fig) #release the figure; otherwise hundreds of figures stay open in memory

In [None]:
#Determining the five best clusterings according to ch score and showing them in a table
#(index: concatenated hyperparameters used as key in ch_tracker, column "values": the ch score)
largest_keys = sorted(ch_tracker, key=ch_tracker.get, reverse=True)[:5]
largest_vals = [ch_tracker[x] for x in largest_keys]
pd.DataFrame(largest_vals, index = largest_keys, columns=["values"])

## 6.6 DBScan Clustering

In [None]:
#performing DBScan clustering by iterating over plausible hyperparameter values

#creating dict to track ch scores (key: embedding key + min_samples + '_' + eps)
ch_tracker = {}

#iterating over the different hyperparameter values and reduced dimensionality embeddings
#NOTE: the loop variables overwrite the notebook-level settings of the same name (data_generation, pre_processing, ...)
for data_generation, pre_processing, dim_reduction, n_dims, n_neighbors, min_dist, measure in iter_array: #iteration over different reduced dim embeddings
  emb_key = data_generation + pre_processing + dim_reduction + n_dims + n_neighbors+ min_dist + measure
  reduced_data = embeddings[emb_key]

  #folder for the images of this embedding; makedirs also creates missing parent folders
  clustering_dir = os.path.join(w_path, data_generation, pre_processing, dim_reduction, n_dims, 'DBScan')
  os.makedirs(clustering_dir, exist_ok=True)
  title_head = data_generation + '     ' + pre_processing + '     ' + dim_reduction + '\n' + 'n_neighbors:' + n_neighbors+ '     min_dist:' + min_dist + '     metric:' + measure + '\n'

  for min_samples in range(20,100,10): #iterating over number of samples in a neighborhood for a point to be considered as a core point
    for eps in range(1, 20, 1): #eps/2 runs from 0.5 to 9.5 in steps of 0.5
      #scoring and plotting happen inside the eps loop, so that every (min_samples, eps)
      #combination is evaluated instead of only the last eps value
      dbscan = DBSCAN(eps= eps/2, min_samples = min_samples)
      dbscan.fit(reduced_data)
      labels = dbscan.labels_ #noise samples carry the label -1 and count as their own group in the ch score

      #the ch score needs at least two distinct labels; results without it are plotted but not ranked
      if len(np.unique(labels)) >= 2:
        ch_score = metrics.calinski_harabasz_score(reduced_data, labels)
        ch_tracker[emb_key + str(min_samples) + '_' + str(eps/2)] = ch_score
        score_text = str(ch_score)
      else:
        score_text = 'n/a'

      #configuring plot settings, one color for each created label
      #(the modulo keeps -1 on the last color and avoids an index error for more than len(colors) clusters)
      fig = plt.figure()
      point_colors = np.take(colors, labels % len(colors))
      if n_dims == '2':
        plt.scatter(reduced_data[:,0], reduced_data[:,1], c = point_colors, s=0.1)
      else:
        ax = plt.axes(projection='3d')
        ax.scatter3D(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2], c = point_colors, s=0.1)
      plt.title(title_head + 'ch score:'+ score_text + '    min samples:' + str(min_samples) + '    eps:' + str(eps/2) + '\n DBScan', fontsize=8)
      plt.savefig(os.path.join(clustering_dir, n_neighbors + '_' + min_dist + '_' + measure + str(min_samples) + '_' + str(eps/2) +'.pdf'), pad_inches = 15)
      plt.close(fig) #release the figure; otherwise thousands of figures stay open in memory

In [None]:
#Determining the five best clusterings according to ch score and showing them in a table
#(index: concatenated hyperparameters used as key in ch_tracker, column "values": the ch score)
largest_keys = sorted(ch_tracker, key=ch_tracker.get, reverse=True)[:5]
largest_vals = [ch_tracker[x] for x in largest_keys]
pd.DataFrame(largest_vals, index = largest_keys, columns=["values"])