In [1]:
from data_preparation.data_extraction import get_tracks
import itertools
import numpy as np
import pandas as pd

from sklearn import cluster
from sklearn import mixture
from sklearn_som.som import SOM
from skfuzzy import cmeans
from sklearn.metrics import silhouette_score
from data_preparation.evaluation import best_fit_matching_score as matching_score

import warnings

#warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

  df = pd.read_csv(filepath_or_buffer='data.csv', sep=',', index_col=0)


In [2]:
df = get_tracks('5Rh7ikX5dteMXfc8tmeBJy', test=True)

In [3]:
performance_comparison = pd.DataFrame(columns=['features', 'algorithm', 'parameters', 'cluster_count', 'silhouette_score', 'matching_score'])

In [4]:
# get unique feature combinations with 3-6 selected features

features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'loudness', 'speechiness', 'tempo', 'valence', 'release_year']

feature_combinations = set()

for r in [3, 4, 5, 6]:
    feature_combinations.update(itertools.combinations(features, r))

In [5]:
def execute_algorithm(alg_func, parameter_combinations):
    for features in feature_combinations:
        df_features = df[list(features)]

        for parameters in parameter_combinations:
            details, labels = alg_func(df_features, parameters)
            if details is None or labels is None:
                continue
            
            if len(set(labels)) == 1 or len(set(labels)) == len(df):
                ss = None
            else:
                ss = silhouette_score(df_features, labels)
            ms = matching_score(df['expected_label'], labels)

            comparison = [features] + details + [len(set(labels)), ss, ms]
    
            performance_comparison.loc[len(performance_comparison)] = comparison

In [6]:
# K-Means
k_parameters = np.arange(2, 16, 1)

def k_means(df, parameters):
    k = parameters

    model = cluster.KMeans(
        n_clusters=k,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['kmeans', [k]], labels

execute_algorithm(k_means, k_parameters)

In [7]:
# Affinity propagation
damping_parameters = np.linspace(0.5, 0.9, 5)
preference_parameters = np.arange(-50, 0, 5)
parameter_combinations = list(itertools.product(damping_parameters, preference_parameters))

def affinity_propagation(df, parameters):
    damping = parameters[0]
    preference = parameters[1]
    
    model = cluster.AffinityPropagation(
        damping=damping,
        preference=preference,
        affinity='euclidean',
        max_iter=500,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['affinity_propagation', [damping, preference]], labels
    
execute_algorithm(affinity_propagation, parameter_combinations)

In [8]:
# Agglomerative Clustering
n_parameters = np.arange(2, 16, 1)
linkage_parameters = ['ward', 'complete', 'average', 'single']
parameter_combinations = list(itertools.product(n_parameters, linkage_parameters))

def agglomerative_clustering(df, parameters):
    n = parameters[0]
    linkage = parameters[1]

    model = cluster.AgglomerativeClustering(
        n_clusters=n,
        linkage=linkage,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['agglomerative_clustering', [n, linkage]], labels

execute_algorithm(agglomerative_clustering, parameter_combinations)

In [9]:
# DBSCAN
eps_parameters = np.linspace(0.1, 0.9, 9)
min_samples_parameters = np.arange(1, 11)
algorithm_parameters = ['auto', 'ball_tree', 'kd_tree', 'brute']
parameter_combinations = list(itertools.product(eps_parameters, min_samples_parameters, algorithm_parameters))

def DBSCAN(df, parameters):
    eps = parameters[0]
    min_samples = parameters[1]
    algorithm = parameters[2]

    model = cluster.DBSCAN(
        eps=eps,
        min_samples=min_samples,
        algorithm=algorithm,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['dbscan', [eps, min_samples, algorithm]], labels

execute_algorithm(DBSCAN, parameter_combinations)

In [10]:
# Spectral Clustering
n_cluster_parameters = np.arange(2, 16)
eigen_solver_parameters = ['arpack', 'lobpcg', 'amg']
n_component_parameters = np.arange(1, len(df.columns)+1)
# only for RBF
rbf_affinity_parameters = ['rbf']
gamma_parameters = np.linspace(0.5, 2, 16)
# only for NON RBF
non_rbf_affinity_parameters = ['nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors']
n_neighbors_parameters = np.arange(5, 21)

rbf_parameter_combinations = list(itertools.product(n_cluster_parameters, eigen_solver_parameters, n_component_parameters, rbf_affinity_parameters, gamma_parameters))
non_rbf_parameter_combinations = list(itertools.product(n_cluster_parameters, eigen_solver_parameters, n_component_parameters, non_rbf_affinity_parameters, n_neighbors_parameters))

parameter_combinations = rbf_parameter_combinations + non_rbf_parameter_combinations

def spectral_clustering(df, parameters):
    n_clusters = parameters[0]
    eigen_solver = parameters[1]
    n_components = parameters[2]
    affinity = parameters[3]
    gamma = 1.0 # default
    n_neighbors = 10 # default

    if affinity == 'rbf':
        gamma = parameters[4]
    else:
        n_neighbors = parameters[4]

    model = cluster.SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver=eigen_solver,
        n_components=n_components,
        affinity=affinity,
        gamma=gamma,
        n_neighbors=n_neighbors,
        random_state=0,
    )
    try:
        model.fit(df)
    except ValueError:
        return None

    labels = model.labels_
    # model.fit_predict(data)

    return ['spectral_clustering', [n_clusters, eigen_solver, n_components, affinity, gamma, n_neighbors]], labels

execute_algorithm(spectral_clustering, parameter_combinations)

TypeError: cannot unpack non-iterable NoneType object

In [11]:
# Gaussian Mixture
n_component_parameters = np.arange(2, 16, 1)
covariance_type_parameters = ['full', 'tied', 'diag', 'spherical']
parameter_combinations = list(itertools.product(n_component_parameters, covariance_type_parameters))

def GMM(df, parameters):
    n_components = parameters[0]
    covariance_type = parameters[1]

    model = mixture.GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=0,
    )
    model.fit(df)

    labels = model.predict(df)

    return ['gmm', [n_components, covariance_type]], labels

execute_algorithm(GMM, parameter_combinations)

In [12]:
%%script false --no-raise-error

# Self Organized Maps (doesn't work properly)
m_parameters = np.arange(1, 16)
n_parameters = np.arange(1, 16)
dim_parameters = np.arange(1, len(df.columns) + 1)
sigma_parameters = np.linspace(0.5, 1.5, 11)
parameter_combinations = list(itertools.product(m_parameters, n_parameters, dim_parameters, sigma_parameters))

def self_organized_maps(df, parameters):
    m = len(df.columns)
    n = len(df.columns)
    dim = len(df.columns)
    sigma = 1

    df_float = df.astype(float).reset_index()

    model = SOM(
        m=8,
        n=8,
        dim=8,
        sigma=1,
        lr=1,
        random_state=0,
    )
    model.fit(df_float, epochs=1, shuffle=True)

    labels = model.predict(df_float)

    return ['som', [m, n, dim, sigma]], labels

execute_algorithm(self_organized_maps, parameter_combinations)

In [13]:
# CMeans
c_parameters = np.arange(2, 16, 1)
m_parameters = np.linspace(0.5, 2, 16)
parameter_combinations = list(itertools.product(c_parameters, m_parameters))

def c_means(df, parameters):
    c = parameters[0]
    m = parameters[1]

    cntr, u, u0, d, jm, p, fpc = cmeans(
        data=df.transpose(), c=c, m=m, error=0.005, maxiter=1000, seed=0
    )

    # TODO apply relationship to multiple clusters
    labels = np.argmax(u, axis=0)

    return ['cmeans', [c, m]], labels

execute_algorithm(c_means, parameter_combinations)

In [14]:
len(performance_comparison)

319424

In [15]:
filtered = performance_comparison[performance_comparison['cluster_count'] >= 2]
filtered = filtered[filtered['cluster_count'] < 12]
filtered = filtered[filtered['silhouette_score'] > 0.1]
filtered.sort_values(by='matching_score', ascending=False, inplace=True)

In [37]:
filtered

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,matching_score
279091,"(danceability, energy, speechiness)",cmeans,"[15, 0.8]",8,0.147900,0.326215
293112,"(loudness, speechiness, release_year)",cmeans,"[9, 1.3]",9,0.350959,0.319046
217756,"(acousticness, instrumentalness, loudness, spe...",spectral_clustering,"[9, full]",9,0.246471,0.315369
248405,"(acousticness, instrumentalness, speechiness, ...",cmeans,"[15, 1.0]",9,0.366303,0.313080
208293,"(acousticness, instrumentalness, speechiness, ...",spectral_clustering,"[9, tied]",9,0.290090,0.312947
...,...,...,...,...,...,...
132160,"(energy, instrumentalness, loudness, speechine...",dbscan,"[0.2, 1, auto]",10,0.111177,0.103682
35815,"(instrumentalness, loudness, tempo)",agglomerative_clustering,"[9, single]",9,0.171005,0.101866
39623,"(energy, instrumentalness, loudness, speechine...",agglomerative_clustering,"[9, single]",9,0.127234,0.098976
35823,"(instrumentalness, loudness, tempo)",agglomerative_clustering,"[11, single]",11,0.182830,0.092243


In [38]:
res, labels = agglomerative_clustering(df[list(filtered.iloc[11].features)], filtered.iloc[11].parameters)
labels

array([1, 0, 0, 1, 0, 3, 3, 0, 3, 1, 0, 3, 5, 3, 5, 1, 0, 4, 2, 2, 4, 4,
       6, 6, 8, 3, 6, 5, 0, 3, 0, 0, 8, 5, 3, 6, 0, 1, 2, 8, 1, 4, 4, 1,
       0, 4, 0, 4, 5, 0, 4, 1, 5, 4, 0, 0, 0, 1, 0, 3, 0, 1, 0, 4, 4, 6,
       0, 1, 0, 1, 8, 4, 0, 4, 6, 5, 8, 0, 1, 4, 6, 0, 3, 2, 1, 0, 7, 7,
       8, 0, 0, 3, 6, 1, 4, 0])

In [39]:
df['labels'] = labels

In [40]:
pd.set_option('display.max_rows', 500)
df[['name', 'artist', 'labels']].sort_values(by='labels')

Unnamed: 0,name,artist,labels
95,Ciao Ciao,La rappresentante di lista,0
55,Crazy Train,Ozzy Osbourne,0
56,Paper Planes,M.I.A.,0
30,In the End,Linkin Park,0
58,Alright,Kendrick Lamar,0
60,Papaoutai,Stromae,0
31,I Bet You Look Good On The Dancefloor,Arctic Monkeys,0
62,Lose Yourself,Eminem,0
66,Bye Bye Bye,*NSYNC,0
36,Chantaje (feat. Maluma),Shakira,0
