In [1]:
from data_preparation.data_extraction import get_tracks
import itertools
import numpy as np
import pandas as pd

from sklearn import cluster
from sklearn import mixture
from sklearn_som.som import SOM
from skfuzzy import cmeans
from sklearn.metrics import silhouette_score
from data_preparation.evaluation import best_fit_matching_score as matching_score, constraint_matching_score

import warnings

#warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

  df = pd.read_csv(filepath_or_buffer='data.csv', sep=',', index_col=0)


In [2]:
df = get_tracks('5Rh7ikX5dteMXfc8tmeBJy', test=True)

In [3]:
performance_comparison = pd.DataFrame(columns=['features', 'algorithm', 'parameters', 'cluster_count', 'silhouette_score', 'matching_score', 'constraint_matching_score'])

In [4]:
# get unique feature combinations with 3-6 selected features

features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'loudness', 'speechiness', 'tempo', 'valence', 'release_year']

feature_combinations = set()

for r in [3, 4, 5, 6]:
    feature_combinations.update(itertools.combinations(features, r))

In [5]:
def execute_algorithm(alg_func, parameter_combinations):
    for features in feature_combinations:
        df_features = df[list(features)]

        for parameters in parameter_combinations:
            details, labels = alg_func(df_features, parameters)
            if details is None or labels is None:
                continue
            
            if len(set(labels)) == 1 or len(set(labels)) == len(df):
                ss = None
            else:
                ss = silhouette_score(df_features, labels)
            ms = matching_score(df['expected_labels'], labels)
            cms = constraint_matching_score(df['constraints'], labels)

            comparison = [features] + details + [len(set(labels)), ss, ms, cms]
    
            performance_comparison.loc[len(performance_comparison)] = comparison

In [6]:
# K-Means
k_parameters = np.arange(2, 26, 1)

def k_means(df, parameters):
    k = parameters

    model = cluster.KMeans(
        n_clusters=k,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['kmeans', [k]], labels

execute_algorithm(k_means, k_parameters)

In [7]:
# Affinity propagation
damping_parameters = np.linspace(0.5, 0.9, 5)
preference_parameters = np.arange(-50, 0, 5)
parameter_combinations = list(itertools.product(damping_parameters, preference_parameters))

def affinity_propagation(df, parameters):
    damping = parameters[0]
    preference = parameters[1]
    
    model = cluster.AffinityPropagation(
        damping=damping,
        preference=preference,
        affinity='euclidean',
        max_iter=500,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['affinity_propagation', [damping, preference]], labels
    
execute_algorithm(affinity_propagation, parameter_combinations)

In [8]:
# Agglomerative Clustering
n_parameters = np.arange(2, 26, 1)
linkage_parameters = ['ward', 'complete', 'average', 'single']
parameter_combinations = list(itertools.product(n_parameters, linkage_parameters))

def agglomerative_clustering(df, parameters):
    n = parameters[0]
    linkage = parameters[1]

    model = cluster.AgglomerativeClustering(
        n_clusters=n,
        linkage=linkage,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['agglomerative_clustering', [n, linkage]], labels

execute_algorithm(agglomerative_clustering, parameter_combinations)

In [9]:
# DBSCAN
eps_parameters = np.linspace(0.1, 0.9, 9)
min_samples_parameters = np.arange(1, 11)
algorithm_parameters = ['auto', 'ball_tree', 'kd_tree', 'brute']
parameter_combinations = list(itertools.product(eps_parameters, min_samples_parameters, algorithm_parameters))

def DBSCAN(df, parameters):
    eps = parameters[0]
    min_samples = parameters[1]
    algorithm = parameters[2]

    model = cluster.DBSCAN(
        eps=eps,
        min_samples=min_samples,
        algorithm=algorithm,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['dbscan', [eps, min_samples, algorithm]], labels

execute_algorithm(DBSCAN, parameter_combinations)

In [10]:
# Spectral Clustering
n_cluster_parameters = np.arange(2, 26)
eigen_solver_parameters = ['arpack', 'lobpcg', 'amg']
# only for RBF
rbf_affinity_parameters = ['rbf']
gamma_parameters = np.linspace(0.5, 2, 16)
# only for NON RBF
non_rbf_affinity_parameters = ['nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors']
n_neighbors_parameters = np.arange(5, 21)

rbf_parameter_combinations = list(itertools.product(n_cluster_parameters, eigen_solver_parameters, rbf_affinity_parameters, gamma_parameters))
non_rbf_parameter_combinations = list(itertools.product(n_cluster_parameters, eigen_solver_parameters, non_rbf_affinity_parameters, n_neighbors_parameters))

parameter_combinations = rbf_parameter_combinations + non_rbf_parameter_combinations

def spectral_clustering(df, parameters):
    n_clusters = parameters[0]
    eigen_solver = parameters[1]
    n_components = len(df.columns)+1
    affinity = parameters[2]
    gamma = 1.0 # default
    n_neighbors = 10 # default

    if affinity == 'rbf':
        gamma = parameters[3]
    else:
        n_neighbors = parameters[3]

    model = cluster.SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver=eigen_solver,
        n_components=n_components,
        affinity=affinity,
        gamma=gamma,
        n_neighbors=n_neighbors,
        random_state=0,
    )
    try:
        model.fit(df)
    except ValueError:
        return None, None

    labels = model.labels_
    # model.fit_predict(data)

    return ['spectral_clustering', [n_clusters, eigen_solver, n_components, affinity, gamma, n_neighbors]], labels

execute_algorithm(spectral_clustering, parameter_combinations)

In [11]:
# Gaussian Mixture
n_component_parameters = np.arange(2, 26, 1)
covariance_type_parameters = ['full', 'tied', 'diag', 'spherical']
parameter_combinations = list(itertools.product(n_component_parameters, covariance_type_parameters))

def GMM(df, parameters):
    n_components = parameters[0]
    covariance_type = parameters[1]

    model = mixture.GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=0,
    )
    model.fit(df)

    labels = model.predict(df)

    return ['gmm', [n_components, covariance_type]], labels

execute_algorithm(GMM, parameter_combinations)

In [12]:
%%script false --no-raise-error

# Self Organized Maps (doesn't work properly)
m_parameters = np.arange(1, 26)
n_parameters = np.arange(1, 26)
dim_parameters = np.arange(1, len(df.columns) + 1)
sigma_parameters = np.linspace(0.5, 1.5, 11)
parameter_combinations = list(itertools.product(m_parameters, n_parameters, dim_parameters, sigma_parameters))

def self_organized_maps(df, parameters):
    m = len(df.columns)
    n = len(df.columns)
    dim = len(df.columns)
    sigma = 1

    df_float = df.astype(float).reset_index()

    model = SOM(
        m=8,
        n=8,
        dim=8,
        sigma=1,
        lr=1,
        random_state=0,
    )
    model.fit(df_float, epochs=1, shuffle=True)

    labels = model.predict(df_float)

    return ['som', [m, n, dim, sigma]], labels

execute_algorithm(self_organized_maps, parameter_combinations)

In [13]:
# CMeans
c_parameters = np.arange(2, 26, 1)
m_parameters = np.linspace(0.5, 2, 16)
parameter_combinations = list(itertools.product(c_parameters, m_parameters))

def c_means(df, parameters):
    c = parameters[0]
    m = parameters[1]

    cntr, u, u0, d, jm, p, fpc = cmeans(
        data=df.transpose(), c=c, m=m, error=0.005, maxiter=1000, seed=0
    )

    # TODO apply relationship to multiple clusters
    labels = np.argmax(u, axis=0)

    return ['cmeans', [c, m]], labels

execute_algorithm(c_means, parameter_combinations)

In [14]:
len(performance_comparison)

1391880

In [36]:
filtered = performance_comparison[performance_comparison['cluster_count'] >= 2]
# filtered = filtered[filtered['cluster_count'] < 15]
# filtered = filtered[filtered['silhouette_score'] > 0.1]
filtered = filtered[filtered['algorithm'] != 'spectral_clustering']
# filtered = filtered[filtered['algorithm'] != 'cmeans']
filtered = filtered[filtered['constraint_matching_score'] > 0.9]
filtered.sort_values(by='matching_score', ascending=False)

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,matching_score,constraint_matching_score
8093,"(acousticness, danceability, energy, instrumen...",kmeans,[7],7,0.217341,0.208253,0.930556
8717,"(acousticness, danceability, energy, loudness,...",kmeans,[7],7,0.210183,0.204861,0.930556
1311700,"(acousticness, danceability, energy, valence, ...",cmeans,"[6, 1.7000000000000002]",6,0.20258,0.203904,0.916667
1282131,"(acousticness, danceability, energy, instrumen...",cmeans,"[6, 1.6]",6,0.185851,0.203125,0.916667
1249108,"(acousticness, danceability, energy, loudness,...",cmeans,"[6, 1.7000000000000002]",6,0.202983,0.200803,0.916667
1311696,"(acousticness, danceability, energy, valence, ...",cmeans,"[6, 1.3]",6,0.212854,0.199924,0.916667
1282129,"(acousticness, danceability, energy, instrumen...",cmeans,"[6, 1.4]",6,0.185266,0.199346,0.916667
1311699,"(acousticness, danceability, energy, valence, ...",cmeans,"[6, 1.6]",6,0.206113,0.198986,0.916667
1311698,"(acousticness, danceability, energy, valence, ...",cmeans,"[6, 1.5]",6,0.205139,0.1978,0.916667
1347411,"(acousticness, danceability, energy, speechine...",cmeans,"[6, 1.6]",6,0.195406,0.197142,0.916667


In [38]:
idx = 8093
res, labels = k_means(df[list(performance_comparison.iloc[idx].features)], performance_comparison.iloc[idx].parameters[0])
labels

array([2, 3, 3, 2, 1, 4, 1, 3, 4, 0, 1, 3, 0, 3, 3, 4, 3, 1, 3, 3, 4, 3,
       2, 2, 6, 1, 3, 0, 4, 4, 1, 4, 2, 2, 4, 4, 3, 0, 2, 0, 6, 3, 2, 2,
       4, 2, 4, 1, 3, 4, 2, 2, 0, 2, 2, 0, 4, 6, 3, 4, 1, 6, 1, 6, 1, 2,
       3, 0, 4, 2, 6, 2, 3, 1, 2, 0, 0, 4, 1, 1, 3, 3, 4, 4, 2, 3, 5, 5,
       2, 0, 4, 1, 2, 3, 2, 3], dtype=int32)

In [39]:
np.array(df['expected_labels'])

array([3, 3, 8, 2, 3, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 1, 7, 1, 6, 6, 1, 5,
       1, 1, 1, 4, 1, 2, 2, 1, 4, 1, 2, 1, 4, 2, 8, 3, 6, 2, 3, 2, 2, 5,
       5, 7, 3, 1, 5, 1, 2, 3, 2, 7, 2, 4, 8, 6, 7, 4, 2, 3, 7, 1, 2, 1,
       3, 2, 4, 2, 1, 7, 7, 2, 2, 2, 2, 1, 2, 1, 5, 5, 1, 6, 3, 8, 9, 2,
       2, 1, 1, 4, 2, 8, 7, 2])

In [41]:
pd.set_option('display.max_rows', 500)
df_labeled = df.copy()
df_labeled['labels'] = labels
df_labeled[['name', 'artist', 'labels']].sort_values(by='labels')

Unnamed: 0,name,artist,labels
39,Jolene,Dolly Parton,0
37,Say My Name,Destiny's Child,0
52,Dancing Queen,ABBA,0
89,Sunday Bloody Sunday,U2,0
75,Rock Me Amadeus,Falco,0
27,Everybody Wants To Rule The World,Tears For Fears,0
55,Crazy Train,Ozzy Osbourne,0
76,Felicità,Al Bano And Romina Power,0
67,Voyage voyage,Desireless,0
9,Rockabye (feat. Sean Paul & Anne-Marie),Clean Bandit,0


In [45]:
df[df['constraints'] > 0]

Unnamed: 0,id,name,artist,album,acousticness,danceability,energy,instrumentalness,loudness,speechiness,tempo,valence,release_year,expected_labels,constraints
2,5w9c2J52mkdntKOmRLeM2m,Con Calma,Daddy Yankee,Con Calma,0.11,0.737,0.86,2e-06,0.866651,0.0593,0.375956,0.656,0.94,8,9.0
4,0TDLuuLlV54CkRRUOahJb4,Titanium (feat. Sia),David Guetta,Nothing but the Beat (Ultimate Edition),0.0679,0.604,0.787,0.15,0.851206,0.103,0.504248,0.301,0.87,3,6.0
7,1QV6tiMFM6fSOKOGLMHYYg,Poker Face,Lady Gaga,The Fame,0.118,0.851,0.806,2e-06,0.83691,0.0787,0.475996,0.787,0.83,3,5.0
10,2YWjW3wwQIBLNhxWKBQd16,Lean On (feat. MØ & DJ Snake),Major Lazer,Peace Is The Mission : Extended,0.00346,0.723,0.809,0.00123,0.860167,0.0625,0.392028,0.274,0.9,3,6.0
15,77NNZQSqzLNqh2A9JhLRkg,Don't Stop Believin',Journey,The Essential Journey,0.25,0.491,0.802,0.0,0.799341,0.0392,0.477,0.472,0.76,1,2.0
18,7iL6o9tox1zgHpKUfh9vuC,In Da Club,50 Cent,Get Rich Or Die Tryin',0.255,0.899,0.713,0.0,0.865139,0.366,0.360204,0.777,0.78,6,8.0
19,5n8Aro6j1bEGIy7Tpo7FV7,Fuck Tha Police,N.W.A.,Straight Outta Compton,0.0193,0.859,0.75,0.0,0.780919,0.303,0.394764,0.857,0.63,6,8.0
23,40riOy7x9W7GXjyGp4pjAv,Hotel California - 2013 Remaster,Eagles,Hotel California (2013 Remaster),0.00574,0.579,0.508,0.000494,0.763404,0.027,0.5885,0.609,0.51,1,1.0
29,37ZJ0p5Jm13JPevGcx4SkF,Livin' On A Prayer,Bon Jovi,Slippery When Wet,0.0778,0.532,0.887,0.000214,0.849952,0.0335,0.490044,0.795,0.61,1,2.0
33,2fuCquhmrzHpu5xcA1ci9x,Under Pressure - Remastered 2011,Queen,Hot Space (2011 Remaster),0.429,0.671,0.712,0.0,0.788627,0.0476,0.45522,0.462,0.57,1,1.0
