In [1]:
from data_preparation.data_extraction import get_tracks
import itertools
import numpy as np
import pandas as pd

from sklearn import cluster
from sklearn import mixture
from sklearn_som.som import SOM
from minisom import MiniSom
from skfuzzy import cmeans
from sklearn.metrics import silhouette_score

import warnings

#warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

  df = pd.read_csv(filepath_or_buffer='data.csv', sep=',', index_col=0)


In [2]:
df = get_tracks('5Rh7ikX5dteMXfc8tmeBJy')

df_features = df.drop(columns=['id', 'name', 'artist', 'album', 'release_date'])

In [3]:
performance_comparison = pd.DataFrame(columns=['algorithm', 'parameters', 'cluster_count', 'silhouette_score'])

In [4]:
# K-Means
k_parameters = np.arange(2, 16, 1)

for k in k_parameters:
    model = cluster.KMeans(
        n_clusters=k,
        random_state=0,
    )
    model.fit(df_features)

    labels = model.labels_
    ss = silhouette_score(df_features, labels)

    performance_comparison.loc[len(performance_comparison)] = ['kmeans', [k], len(set(labels)), ss]

In [5]:
# Affinity propagation
damping_parameters = np.linspace(0.5, 0.9, 5)
preference_parameters = np.arange(-50, 0, 5)
parameter_combinations = list(itertools.product(damping_parameters, preference_parameters))

for parameters in parameter_combinations:
    damping = parameters[0]
    preference = parameters[1]
    
    model = cluster.AffinityPropagation(
        damping=damping,
        preference=preference,
        affinity='euclidean',
        max_iter=500,
        random_state=0,
    )
    model.fit(df_features)

    labels = model.labels_
    ss = silhouette_score(df_features, labels)

    performance_comparison.loc[len(performance_comparison)] = ['affinity_propagation', [damping, preference], len(set(labels)), ss]

In [6]:
# Agglomerative Clustering
n_parameters = np.arange(2, 16, 1)
linkage_parameters = ['ward', 'complete', 'average', 'single']
parameter_combinations = list(itertools.product(n_parameters, linkage_parameters))

for parameters in parameter_combinations:
    n = parameters[0]
    linkage = parameters[1]

    model = cluster.AgglomerativeClustering(
        n_clusters=n,
        linkage=linkage,
        metric='euclidean',
    )
    model.fit(df_features)

    labels = model.labels_
    ss = silhouette_score(df_features, labels)

    performance_comparison.loc[len(performance_comparison)] = ['agglomerative_clustering', [n, linkage], len(set(labels)), ss]

In [7]:
# DBSCAN
eps_parameters = np.linspace(0.1, 0.9, 9)
min_samples_parameters = np.arange(1, 11)
algorithm_parameters = ['auto', 'ball_tree', 'kd_tree', 'brute']
parameter_combinations = list(itertools.product(eps_parameters, min_samples_parameters, algorithm_parameters))

for parameters in parameter_combinations:
    eps = parameters[0]
    min_samples = parameters[1]
    algorithm = parameters[2]

    model = cluster.DBSCAN(
        eps=eps,
        min_samples=min_samples,
        algorithm=algorithm,
        metric='euclidean',
    )
    model.fit(df_features)

    labels = model.labels_
    
    if len(set(labels)) == len(labels) or len(set(labels)) == 1:
        continue
    else:
        ss = silhouette_score(df_features, labels)

    performance_comparison.loc[len(performance_comparison)] = ['dbscan', [eps, min_samples, algorithm], len(set(labels)), ss]

In [8]:
# Spectral Clustering
n_cluster_parameters = np.arange(2, 16)
eigen_solver_parameters = ['arpack', 'lobpcg', 'amg']
n_component_parameters = np.arange(1, len(df_features.columns)+1)
# only for RBF
rbf_affinity_parameters = ['rbf']
gamma_parameters = np.linspace(0.5, 2, 16)
# only for NON RBF
non_rbf_affinity_parameters = ['nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors']
n_neighbors_parameters = np.arange(5, 21)

rbf_parameter_combinations = list(itertools.product(n_cluster_parameters, eigen_solver_parameters, n_component_parameters, rbf_affinity_parameters, gamma_parameters))
non_rbf_parameter_combinations = list(itertools.product(n_cluster_parameters, eigen_solver_parameters, n_component_parameters, non_rbf_affinity_parameters, n_neighbors_parameters))

parameter_combinations = rbf_parameter_combinations + non_rbf_parameter_combinations

for parameters in parameter_combinations:
    n_clusters = parameters[0]
    eigen_solver = parameters[1]
    n_components = parameters[2]
    affinity = parameters[3]
    gamma = 1.0 # default
    n_neighbors = 10 # default

    if affinity == 'rbf':
        gamma = parameters[4]
    else:
        n_neighbors = parameters[4]

    model = cluster.SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver=eigen_solver,
        n_components=n_components,
        affinity=affinity,
        gamma=gamma,
        n_neighbors=n_neighbors,
        random_state=0,
    )
    model.fit(df_features)

    labels = model.labels_
    # model.fit_predict(data)
    
    ss = silhouette_score(df_features, labels)

    performance_comparison.loc[len(performance_comparison)] = ['spectral_clustering', [n_clusters, eigen_solver, n_components, affinity, gamma, n_neighbors], len(set(labels)), ss]

ValueError: 

In [9]:
# Gaussian Mixture
n_component_parameters = np.arange(2, 16, 1)
covariance_type_parameters = ['full', 'tied', 'diag', 'spherical']
parameter_combinations = list(itertools.product(n_component_parameters, covariance_type_parameters))

for parameters in parameter_combinations:
    n_components = parameters[0]
    covariance_type = parameters[1]

    model = mixture.GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=0,
    )
    model.fit(df_features)

    labels = model.predict(df_features)
    ss = silhouette_score(df_features, labels)

    performance_comparison.loc[len(performance_comparison)] = ['spectral_clustering', [n_components, covariance_type], len(set(labels)), ss]

In [None]:
# Self Organized Maps (doesn't work)
m_parameters = np.arange(1, 16)
n_parameters = np.arange(1, 16)
dim_parameters = np.arange(1, len(df_features.columns) + 1)
sigma_parameters = np.linspace(0.5, 1.5, 11)
parameter_combinations = list(itertools.product(m_parameters, n_parameters, dim_parameters, sigma_parameters))

for parameters in parameter_combinations:
    m = len(df_features.columns)
    n = len(df_features.columns)
    dim = len(df_features.columns)
    sigma = 1

    df_float = df_features.astype(float).reset_index()

    model = SOM(
        m=8,
        n=8,
        dim=8,
        sigma=1,
        lr=1,
        random_state=0,
    )
    model.fit(df_float, epochs=1, shuffle=True)

    labels = model.predict(df_float)
    ss = silhouette_score(df_float, labels)

    performance_comparison.loc[len(performance_comparison)] = ['som', [m, n, dim, sigma], len(set(labels)), ss]

In [10]:
# CMeans
c_parameters = np.arange(2, 16, 1)
m_parameters = np.linspace(0.5, 2, 16)
parameter_combinations = list(itertools.product(c_parameters, m_parameters))

for parameters in parameter_combinations:
    c = parameters[0]
    m = parameters[1]

    cntr, u, u0, d, jm, p, fpc = cmeans(
        data=df_features.transpose(), c=c, m=m, error=0.005, maxiter=1000, seed=0
    )
    labels = np.argmax(u, axis=0)
    ss = silhouette_score(df_features, labels)

    performance_comparison.loc[len(performance_comparison)] = ['cmeans', [c, m], len(set(labels)), ss]

In [11]:
len(performance_comparison)

560

In [15]:
filtered = performance_comparison[performance_comparison['cluster_count'] > 3]
filtered.sort_values(by='silhouette_score', ascending=False, inplace=True)

In [17]:
filtered

Unnamed: 0,algorithm,parameters,cluster_count,silhouette_score
2,kmeans,[4],4,0.543238
74,agglomerative_clustering,"[4, average]",4,0.543238
73,agglomerative_clustering,"[4, complete]",4,0.543238
72,agglomerative_clustering,"[4, ward]",4,0.543238
291,spectral_clustering,"[4, spherical]",4,0.543238
...,...,...,...,...
148,dbscan,"[0.9, 2, auto]",6,-0.535474
143,dbscan,"[0.8, 2, brute]",5,-0.542708
142,dbscan,"[0.8, 2, kd_tree]",5,-0.542708
141,dbscan,"[0.8, 2, ball_tree]",5,-0.542708
