In [1]:
from data_preparation.data_extraction import get_tracks
import itertools
import numpy as np
import pandas as pd

from sklearn import cluster
from sklearn import mixture
from sklearn.metrics import silhouette_score
from data_preparation.evaluation import best_fit_matching_score as matching_score, constraint_matching_score

import warnings

#warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

  df = pd.read_csv(filepath_or_buffer='data.csv', sep=',', index_col=0)


In [2]:
df = get_tracks('5Rh7ikX5dteMXfc8tmeBJy', test=True)

In [3]:
performance_comparison = pd.DataFrame(columns=['features', 'algorithm', 'parameters', 'cluster_count', 'silhouette_score', 'matching_score', 'constraint_matching_score'])

In [20]:
# get unique feature combinations with 3-6 selected features

features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'release_year',
]

feature_combinations = set()

for r in np.arange(3, len(features) + 1):
    feature_combinations.update(itertools.combinations(features, r))

In [5]:
def execute_algorithm(alg_func, parameter_combinations):
    for features in feature_combinations:
        df_features = df[list(features)]

        for parameters in parameter_combinations:
            details, labels = alg_func(df_features, parameters)
            if details is None or labels is None:
                continue
            
            if len(set(labels)) == 1 or len(set(labels)) == len(df):
                ss = None
            else:
                ss = silhouette_score(df_features, labels)
            ms = matching_score(df['expected_labels'], labels)
            cms = constraint_matching_score(df['constraints'], labels)

            comparison = [features] + details + [len(set(labels)), ss, ms, cms]
    
            performance_comparison.loc[len(performance_comparison)] = comparison

In [21]:
# K-Means
k_parameters = np.arange(2, 26, 1)

def k_means(df, parameters):
    k = parameters

    model = cluster.KMeans(
        n_clusters=k,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['kmeans', [k]], labels

execute_algorithm(k_means, k_parameters)

In [22]:
# Affinity propagation
damping_parameters = np.linspace(0.5, 0.9, 5)
preference_parameters = np.arange(-50, 0, 5)
parameter_combinations = list(itertools.product(damping_parameters, preference_parameters))

def affinity_propagation(df, parameters):
    damping = parameters[0]
    preference = parameters[1]
    
    model = cluster.AffinityPropagation(
        damping=damping,
        preference=preference,
        affinity='euclidean',
        max_iter=500,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['affinity_propagation', [damping, preference]], labels
    
execute_algorithm(affinity_propagation, parameter_combinations)

In [23]:
# Agglomerative Clustering
n_parameters = np.arange(2, 26, 1)
linkage_parameters = ['ward', 'complete', 'average', 'single']
parameter_combinations = list(itertools.product(n_parameters, linkage_parameters))

def agglomerative_clustering(df, parameters):
    n = parameters[0]
    linkage = parameters[1]

    model = cluster.AgglomerativeClustering(
        n_clusters=n,
        linkage=linkage,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['agglomerative_clustering', [n, linkage]], labels

execute_algorithm(agglomerative_clustering, parameter_combinations)

In [24]:
# DBSCAN
eps_parameters = np.linspace(0.1, 0.9, 9)
min_samples_parameters = np.arange(1, 11)
algorithm_parameters = ['auto', 'ball_tree', 'kd_tree', 'brute']
parameter_combinations = list(itertools.product(eps_parameters, min_samples_parameters, algorithm_parameters))

def DBSCAN(df, parameters):
    eps = parameters[0]
    min_samples = parameters[1]
    algorithm = parameters[2]

    model = cluster.DBSCAN(
        eps=eps,
        min_samples=min_samples,
        algorithm=algorithm,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['dbscan', [eps, min_samples, algorithm]], labels

execute_algorithm(DBSCAN, parameter_combinations)

In [25]:
# Gaussian Mixture
n_component_parameters = np.arange(2, 26, 1)
covariance_type_parameters = ['full', 'tied', 'diag', 'spherical']
parameter_combinations = list(itertools.product(n_component_parameters, covariance_type_parameters))

def GMM(df, parameters):
    n_components = parameters[0]
    covariance_type = parameters[1]

    model = mixture.GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=0,
    )
    model.fit(df)

    labels = model.predict(df)

    return ['gmm', [n_components, covariance_type]], labels

execute_algorithm(GMM, parameter_combinations)

In [26]:
len(performance_comparison)

353690

In [28]:
filtered = performance_comparison[performance_comparison['cluster_count'] >= 2]
# filtered = filtered[filtered['cluster_count'] < 15]
# filtered = filtered[filtered['silhouette_score'] > 0.1]
# filtered = filtered[filtered['algorithm'] != 'spectral_clustering']
# filtered = filtered[filtered['algorithm'] != 'cmeans']
# filtered = filtered[filtered['constraint_matching_score'] > 0.6]
filtered.sort_values(by='constraint_matching_score', ascending=False)

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,matching_score,constraint_matching_score
319722,"(danceability, instrumentalness, loudness, spe...",gmm,"[6, full]",6,0.071298,0.161440,0.966667
330872,"(acousticness, danceability, energy, speechine...",gmm,"[9, diag]",9,0.117236,0.307661,0.966667
319726,"(danceability, instrumentalness, loudness, spe...",gmm,"[7, full]",7,0.029908,0.178010,0.966667
344110,"(acousticness, danceability, instrumentalness,...",gmm,"[7, full]",7,0.026132,0.178376,0.966667
62051,"(acousticness, danceability, energy, instrumen...",kmeans,[7],7,0.217341,0.208253,0.933333
...,...,...,...,...,...,...,...
33045,"(energy, instrumentalness, tempo, release_year)",dbscan,"[0.1, 4, brute]",5,0.065984,0.134965,0.000000
236964,"(acousticness, danceability, instrumentalness,...",dbscan,"[0.1, 3, kd_tree]",9,-0.115973,0.167566,0.000000
33046,"(energy, instrumentalness, tempo, release_year)",dbscan,"[0.1, 5, auto]",5,0.050213,0.133026,0.000000
33047,"(energy, instrumentalness, tempo, release_year)",dbscan,"[0.1, 5, ball_tree]",5,0.050213,0.133026,0.000000


In [32]:
idx = 62051
res, labels = k_means(df[list(performance_comparison.iloc[idx].features)], performance_comparison.iloc[idx].parameters[0])
labels

array([2, 3, 3, 2, 1, 4, 1, 3, 4, 0, 1, 3, 0, 3, 3, 4, 3, 1, 3, 3, 4, 3,
       2, 2, 6, 1, 3, 0, 4, 4, 1, 4, 2, 2, 4, 4, 3, 0, 2, 0, 6, 3, 2, 2,
       4, 2, 4, 1, 3, 4, 2, 2, 0, 2, 2, 0, 4, 6, 3, 4, 1, 6, 1, 6, 1, 2,
       3, 0, 4, 2, 6, 2, 3, 1, 2, 0, 0, 4, 1, 1, 3, 3, 4, 4, 2, 3, 5, 5,
       2, 0, 4, 1, 2, 3, 2, 3], dtype=int32)

In [33]:
np.array(df['expected_labels'])

array([3, 3, 8, 2, 3, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 1, 7, 1, 6, 6, 1, 5,
       1, 1, 1, 4, 1, 2, 2, 1, 4, 1, 2, 1, 4, 2, 8, 3, 6, 2, 3, 2, 2, 5,
       5, 7, 3, 1, 5, 1, 2, 3, 2, 7, 2, 4, 8, 6, 7, 4, 2, 3, 7, 1, 2, 1,
       3, 2, 4, 2, 1, 7, 7, 2, 2, 2, 2, 1, 2, 1, 5, 5, 1, 6, 3, 8, 9, 2,
       2, 1, 1, 4, 2, 8, 7, 2])

In [34]:
# indices = [2, 4, 7, 10, 15, 18, 19, 23, 29, 33, 34, 52, 59, 60, 66, 73, 76, 85]
indices = [23, 33, 15, 29, 76, 52, 73, 60, 66, 7, 4, 10, 59, 34, 18, 19, 85, 2]
print(np.array(labels[indices]))
print(np.array(df['constraints'])[indices].astype(int))

[2 2 4 4 0 0 1 1 3 3 1 1 4 4 3 3 3 3]
[0 0 2 2 3 3 0 0 5 5 6 6 7 7 8 8 0 0]


In [19]:
pd.set_option('display.max_rows', 500)
df_labeled = df.copy()
df_labeled['labels'] = labels
df_labeled[['name', 'artist', 'labels']].sort_values(by='labels')

Unnamed: 0,name,artist,labels
61,Beautiful,Christina Aguilera,0
62,Lose Yourself,Eminem,0
17,Use Somebody,Kings of Leon,0
86,Ne me quitte pas,Jacques Brel,0
79,With Or Without You,U2,0
25,Holy Diver,Dio,0
70,Time - 2011 Remastered Version,Pink Floyd,0
24,Stairway to Heaven - Remaster,Led Zeppelin,0
6,Thunderstruck,AC/DC,0
91,Ace of Spades,Motörhead,0


In [27]:
df.iloc[8]

id                  2WfaOiMkCvy7F5fcp2zZ8L
name                            Take on Me
artist                                a-ha
album                 Hunting High and Low
acousticness                         0.018
danceability                         0.573
energy                               0.902
instrumentalness                   0.00125
loudness                          0.791301
speechiness                          0.054
tempo                             0.337648
valence                              0.876
release_year                           0.6
expected_labels                          2
constraints                            NaN
Name: 8, dtype: object