In [1]:
from data_preparation.data_extraction import get_tracks
import itertools
import numpy as np
import pandas as pd

from sklearn import cluster
from sklearn import mixture
from sklearn.metrics import silhouette_score, rand_score
from data_preparation.evaluation import best_fit_matching_score as matching_score, constraint_matching_score

import warnings

#warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

  df = pd.read_csv(filepath_or_buffer='data.csv', sep=',', index_col=0)


In [2]:
df = get_tracks('5Rh7ikX5dteMXfc8tmeBJy', test=True)
# df = get_tracks('4O0ZVhe765HDb16ug5MKcP', test=True)

missing song: {'name': 'Purple Rain', 'artist': 'Prince', 'album': 'Purple Rain', 'release_year': '1984'}
missing song: {'name': 'Royals', 'artist': 'Lorde', 'album': 'The Love Club EP', 'release_year': '2013'}
missing song: {'name': 'The Message', 'artist': 'Grandmaster Flash', 'album': 'Grandmaster Flash, Grandmaster Melle-Mel & The Furious Five: The Greatest Hits', 'release_year': '2006'}
missing song: {'name': 'Nuthin\' But A "G" Thang', 'artist': 'Dr. Dre', 'album': 'The Chronic', 'release_year': '1992'}


In [3]:
performance_comparison = pd.DataFrame(columns=['features', 'algorithm', 'parameters', 'cluster_count', 'silhouette_score', 'rand_score', 'matching_score', 'constraint_matching_score'])

In [4]:
# get unique feature combinations with 3-9 selected features

features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'release_year',
]

feature_combinations = set()

for r in np.arange(3, len(features) + 1):
    feature_combinations.update(itertools.combinations(features, r))

In [5]:
def execute_algorithm(alg_func, parameter_combinations):
    for features in feature_combinations:
        df_features = df[list(features)]

        for parameters in parameter_combinations:
            details, labels = alg_func(df_features, parameters)
            if details is None or labels is None:
                continue
            
            if len(set(labels)) == 1 or len(set(labels)) == len(df):
                ss = None
            else:
                ss = silhouette_score(df_features, labels)
            rs = rand_score(df['expected_labels'], labels)
            ms = matching_score(df['expected_labels'], labels)
            cms = constraint_matching_score(df['constraints'], labels)

            comparison = [features] + details + [len(set(labels)), ss, rs, ms, cms]
    
            performance_comparison.loc[len(performance_comparison)] = comparison

In [6]:
# K-Means
k_parameters = np.arange(2, 26, 1)

def k_means(df, parameters):
    k = parameters

    model = cluster.KMeans(
        n_clusters=k,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['kmeans', [k]], labels

execute_algorithm(k_means, k_parameters)

In [7]:
# Affinity propagation
damping_parameters = np.linspace(0.5, 0.9, 5)
preference_parameters = np.arange(-50, 0, 5)
parameter_combinations = list(itertools.product(damping_parameters, preference_parameters))

def affinity_propagation(df, parameters):
    damping = parameters[0]
    preference = parameters[1]
    
    model = cluster.AffinityPropagation(
        damping=damping,
        preference=preference,
        affinity='euclidean',
        max_iter=500,
        random_state=0,
    )
    model.fit(df)

    labels = model.labels_

    return ['affinity_propagation', [damping, preference]], labels
    
execute_algorithm(affinity_propagation, parameter_combinations)

In [8]:
# Agglomerative Clustering
n_parameters = np.arange(2, 26, 1)
linkage_parameters = ['ward', 'complete', 'average', 'single']
parameter_combinations = list(itertools.product(n_parameters, linkage_parameters))

def agglomerative_clustering(df, parameters):
    n = parameters[0]
    linkage = parameters[1]

    model = cluster.AgglomerativeClustering(
        n_clusters=n,
        linkage=linkage,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['agglomerative_clustering', [n, linkage]], labels

execute_algorithm(agglomerative_clustering, parameter_combinations)

In [9]:
# DBSCAN
eps_parameters = np.linspace(0.1, 0.9, 9)
min_samples_parameters = np.arange(1, 11)
algorithm_parameters = ['auto', 'ball_tree', 'kd_tree', 'brute']
parameter_combinations = list(itertools.product(eps_parameters, min_samples_parameters, algorithm_parameters))

def DBSCAN(df, parameters):
    eps = parameters[0]
    min_samples = parameters[1]
    algorithm = parameters[2]

    model = cluster.DBSCAN(
        eps=eps,
        min_samples=min_samples,
        algorithm=algorithm,
        metric='euclidean',
    )
    model.fit(df)

    labels = model.labels_

    return ['dbscan', [eps, min_samples, algorithm]], labels

execute_algorithm(DBSCAN, parameter_combinations)

In [10]:
# Gaussian Mixture
n_component_parameters = np.arange(2, 26, 1)
covariance_type_parameters = ['full', 'tied', 'diag', 'spherical']
parameter_combinations = list(itertools.product(n_component_parameters, covariance_type_parameters))

def GMM(df, parameters):
    n_components = parameters[0]
    covariance_type = parameters[1]

    model = mixture.GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=0,
    )
    model.fit(df)

    labels = model.predict(df)

    return ['gmm', [n_components, covariance_type]], labels

execute_algorithm(GMM, parameter_combinations)

In [11]:
len(performance_comparison)

291716

In [12]:
performance_comparison[performance_comparison['algorithm'] == 'kmeans'].describe()

Unnamed: 0,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
count,11184.0,11184.0,11184.0,11184.0,11184.0
mean,13.5,0.269164,0.74182,0.176875,0.585917
std,6.922496,0.059334,0.078372,0.03173,0.149637
min,2.0,0.120652,0.240789,0.107476,0.066667
25%,7.75,0.22804,0.729386,0.155636,0.483333
50%,13.5,0.26083,0.773246,0.174572,0.583333
75%,19.25,0.301749,0.790351,0.195569,0.683333
max,25.0,0.726249,0.814474,0.350807,1.0


In [13]:
performance_comparison[performance_comparison['algorithm'] == 'agglomerative_clustering'].describe()

Unnamed: 0,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
count,44736.0,44736.0,44736.0,44736.0,44736.0
mean,13.5,0.217936,0.623921,0.172889,0.543795
std,6.922264,0.145611,0.182667,0.034322,0.141142
min,2.0,-0.337676,0.183772,0.07798,0.0
25%,7.75,0.194732,0.482895,0.148415,0.433333
50%,13.5,0.253062,0.712719,0.170597,0.533333
75%,19.25,0.301471,0.774123,0.193848,0.65
max,25.0,0.721716,0.814035,0.331031,0.966667


In [14]:
performance_comparison[performance_comparison['algorithm'] == 'gmm'].describe()

Unnamed: 0,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
count,44736.0,44736.0,44736.0,44736.0,44736.0
mean,13.497899,0.206396,0.724405,0.176201,0.570786
std,6.921912,0.102742,0.09943,0.032595,0.141053
min,2.0,-0.329799,0.230044,0.096073,0.0
25%,7.0,0.164226,0.705702,0.153907,0.483333
50%,13.0,0.221045,0.764254,0.172925,0.566667
75%,19.0,0.267719,0.786842,0.194848,0.666667
max,25.0,0.726249,0.814474,0.335534,1.0


In [15]:
performance_comparison[performance_comparison['algorithm'] == 'dbscan'].describe()

Unnamed: 0,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
count,167760.0,58940.0,167760.0,167760.0,167760.0
mean,2.302241,0.279632,0.23383,0.11943,0.494471
std,6.491712,0.201735,0.119234,0.017603,0.064829
min,1.0,-0.331689,0.175219,0.081803,0.0
25%,1.0,0.144162,0.175219,0.111111,0.5
50%,1.0,0.321717,0.175219,0.111111,0.5
75%,2.0,0.42463,0.220395,0.120511,0.5
max,96.0,0.710608,0.825,0.28687,0.883333


In [16]:
performance_comparison[performance_comparison['algorithm'] == 'affinity_propagation'].describe()

Unnamed: 0,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
count,23300.0,923.0,23300.0,23300.0,23300.0
mean,6.731459,0.223256,0.225778,0.110522,0.504124
std,22.44775,0.091987,0.16326,0.005081,0.031491
min,1.0,-0.019398,0.175219,0.084894,0.25
25%,1.0,0.196234,0.175219,0.111111,0.5
50%,1.0,0.236095,0.175219,0.111111,0.5
75%,1.0,0.273001,0.175219,0.111111,0.5
max,96.0,0.540825,0.825219,0.214473,0.883333


In [17]:
performance_comparison.sort_values(by='silhouette_score', ascending=False)

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
285577,"(instrumentalness, loudness, speechiness)",gmm,"[3, tied]",3,0.726249,0.408772,0.172140,0.583333
9649,"(instrumentalness, loudness, speechiness)",kmeans,[3],3,0.726249,0.408772,0.172140,0.583333
285579,"(instrumentalness, loudness, speechiness)",gmm,"[3, spherical]",3,0.726249,0.408772,0.172140,0.583333
73082,"(instrumentalness, loudness, speechiness)",agglomerative_clustering,"[3, average]",3,0.721716,0.400658,0.175227,0.583333
73081,"(instrumentalness, loudness, speechiness)",agglomerative_clustering,"[3, complete]",3,0.721716,0.400658,0.175227,0.583333
...,...,...,...,...,...,...,...,...
246975,"(energy, instrumentalness, loudness, release_y...",dbscan,"[0.9, 9, brute]",1,,0.175219,0.111111,0.500000
246976,"(energy, instrumentalness, loudness, release_y...",dbscan,"[0.9, 10, auto]",1,,0.175219,0.111111,0.500000
246977,"(energy, instrumentalness, loudness, release_y...",dbscan,"[0.9, 10, ball_tree]",1,,0.175219,0.111111,0.500000
246978,"(energy, instrumentalness, loudness, release_y...",dbscan,"[0.9, 10, kd_tree]",1,,0.175219,0.111111,0.500000


In [18]:
performance_comparison.sort_values(by='rand_score', ascending=False)

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
33188,"(loudness, speechiness, release_year)",affinity_propagation,"[0.5, -30]",94,0.023451,0.825219,0.095745,0.500000
16489,"(danceability, energy, instrumentalness, loudn...",affinity_propagation,"[0.5, -25]",89,0.011703,0.825219,0.099886,0.500000
12489,"(loudness, speechiness, tempo, release_year)",affinity_propagation,"[0.5, -25]",94,0.024346,0.825219,0.095745,0.500000
29848,"(speechiness, tempo, release_year)",affinity_propagation,"[0.6, -30]",94,0.025647,0.825219,0.095745,0.500000
191903,"(acousticness, danceability, energy, loudness,...",dbscan,"[0.1, 1, brute]",93,0.015613,0.825000,0.096211,0.583333
...,...,...,...,...,...,...,...,...
204880,"(acousticness, energy, tempo, valence, release...",dbscan,"[0.1, 6, auto]",1,,0.175219,0.111111,0.500000
204879,"(acousticness, energy, tempo, valence, release...",dbscan,"[0.1, 5, brute]",1,,0.175219,0.111111,0.500000
204878,"(acousticness, energy, tempo, valence, release...",dbscan,"[0.1, 5, kd_tree]",1,,0.175219,0.111111,0.500000
204877,"(acousticness, energy, tempo, valence, release...",dbscan,"[0.1, 5, ball_tree]",1,,0.175219,0.111111,0.500000


In [22]:
performance_comparison.sort_values(by='matching_score', ascending=False)

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
3223,"(acousticness, danceability, instrumentalness,...",kmeans,[9],9,0.249916,0.746930,0.350807,0.650000
7255,"(acousticness, danceability, instrumentalness,...",kmeans,[9],9,0.193092,0.784649,0.346246,0.766667
276000,"(acousticness, danceability, instrumentalness,...",gmm,"[9, full]",9,0.169386,0.777851,0.335534,0.766667
3222,"(acousticness, danceability, instrumentalness,...",kmeans,[8],8,0.242458,0.740570,0.331946,0.566667
57649,"(acousticness, danceability, energy, speechine...",agglomerative_clustering,"[9, complete]",9,0.150673,0.755482,0.331031,0.966667
...,...,...,...,...,...,...,...,...
176422,"(instrumentalness, loudness, valence, release_...",dbscan,"[0.1, 1, kd_tree]",16,-0.233773,0.361623,0.081803,0.250000
60471,"(instrumentalness, loudness, valence, release_...",agglomerative_clustering,"[18, single]",18,-0.251576,0.419518,0.081786,0.250000
64127,"(instrumentalness, loudness, tempo)",agglomerative_clustering,"[20, single]",20,-0.097369,0.448684,0.079713,0.250000
64135,"(instrumentalness, loudness, tempo)",agglomerative_clustering,"[22, single]",22,-0.076999,0.487500,0.078112,0.166667


In [23]:
performance_comparison.sort_values(by='constraint_matching_score', ascending=False)

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score
4232,"(acousticness, danceability, energy, loudness,...",kmeans,[10],10,0.180594,0.753728,0.229300,1.000000
4233,"(acousticness, danceability, energy, loudness,...",kmeans,[11],11,0.184652,0.761404,0.219747,1.000000
4231,"(acousticness, danceability, energy, loudness,...",kmeans,[9],9,0.190836,0.753509,0.254249,1.000000
263904,"(acousticness, danceability, energy, loudness,...",gmm,"[9, full]",9,0.184243,0.749781,0.253936,1.000000
37668,"(acousticness, energy, loudness, speechiness, ...",agglomerative_clustering,"[6, ward]",6,0.270510,0.709430,0.183439,0.966667
...,...,...,...,...,...,...,...,...
225030,"(acousticness, instrumentalness, tempo, valence)",dbscan,"[0.1, 3, kd_tree]",5,-0.053051,0.584211,0.138139,0.000000
225031,"(acousticness, instrumentalness, tempo, valence)",dbscan,"[0.1, 3, brute]",5,-0.053051,0.584211,0.138139,0.000000
188675,"(danceability, instrumentalness, loudness, tem...",dbscan,"[0.1, 4, brute]",4,0.144139,0.656140,0.164928,0.000000
188674,"(danceability, instrumentalness, loudness, tem...",dbscan,"[0.1, 4, kd_tree]",4,0.144139,0.656140,0.164928,0.000000


In [26]:
performance_comparison['avg_score'] = performance_comparison[['silhouette_score', 'rand_score', 'matching_score', 'constraint_matching_score']].mean(axis=1)
performance_comparison.sort_values(by='avg_score', ascending=False)

Unnamed: 0,features,algorithm,parameters,cluster_count,silhouette_score,rand_score,matching_score,constraint_matching_score,avg_score
47565,"(speechiness, valence, release_year)",agglomerative_clustering,"[8, complete]",8,0.351603,0.750658,0.277032,0.966667,0.586490
47569,"(speechiness, valence, release_year)",agglomerative_clustering,"[9, complete]",9,0.339683,0.750219,0.269824,0.966667,0.581598
47568,"(speechiness, valence, release_year)",agglomerative_clustering,"[9, ward]",9,0.325699,0.764254,0.259715,0.966667,0.579084
47561,"(speechiness, valence, release_year)",agglomerative_clustering,"[7, complete]",7,0.332635,0.741886,0.264841,0.966667,0.576507
60048,"(loudness, speechiness, valence, release_year)",agglomerative_clustering,"[9, ward]",9,0.319851,0.762061,0.255252,0.966667,0.575958
...,...,...,...,...,...,...,...,...,...
93624,"(danceability, energy, speechiness, tempo, val...",dbscan,"[0.1, 2, auto]",6,-0.284144,0.298904,0.119633,0.250000,0.096098
185431,"(acousticness, danceability, instrumentalness,...",dbscan,"[0.1, 3, brute]",5,-0.227925,0.392105,0.133553,0.083333,0.095267
185430,"(acousticness, danceability, instrumentalness,...",dbscan,"[0.1, 3, kd_tree]",5,-0.227925,0.392105,0.133553,0.083333,0.095267
185429,"(acousticness, danceability, instrumentalness,...",dbscan,"[0.1, 3, ball_tree]",5,-0.227925,0.392105,0.133553,0.083333,0.095267


In [27]:
idx = 47565
res, labels = agglomerative_clustering(df[list(performance_comparison.iloc[idx].features)], performance_comparison.iloc[idx].parameters)
labels

array([7, 7, 7, 3, 3, 1, 3, 7, 6, 7, 3, 6, 6, 6, 6, 1, 5, 3, 0, 0, 1, 7,
       6, 1, 2, 2, 6, 1, 1, 6, 3, 7, 1, 1, 1, 1, 7, 7, 0, 6, 3, 0, 5, 1,
       7, 5, 7, 3, 6, 1, 4, 1, 6, 5, 1, 5, 5, 3, 5, 1, 3, 3, 4, 3, 1, 1,
       7, 7, 7, 7, 2, 5, 7, 3, 6, 6, 6, 1, 3, 3, 6, 7, 6, 0, 5, 7, 3, 1,
       1, 7, 7, 2, 1, 7, 5, 5])

In [28]:
np.array(df['expected_labels'])

array([3, 3, 8, 2, 3, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 1, 7, 1, 6, 6, 1, 5,
       1, 1, 1, 4, 1, 2, 2, 1, 4, 1, 2, 1, 4, 2, 8, 3, 6, 2, 3, 2, 2, 5,
       5, 7, 3, 1, 5, 1, 2, 3, 2, 7, 2, 4, 8, 6, 7, 4, 2, 3, 7, 1, 2, 1,
       3, 2, 4, 2, 1, 7, 7, 2, 2, 2, 2, 1, 2, 1, 5, 5, 1, 6, 3, 8, 9, 2,
       2, 1, 1, 4, 2, 8, 7, 2])

In [29]:
df['labels'] = labels

In [32]:
df.sort_values(by='labels', ascending=False)

Unnamed: 0,id,name,artist,album,acousticness,danceability,energy,instrumentalness,loudness,speechiness,tempo,valence,release_year,expected_labels,constraints,labels
0,7BKLCZ1jbUBVqRi2FVlTVw,Closer,The Chainsmokers,Closer,0.41400,0.748,0.524,0.000000,0.822115,0.0676,0.380040,0.661,0.91,3,,7
72,5LH1z4ma2TN2aVeESXthj9,Ding,Seeed,Next!,0.03900,0.803,0.906,0.000011,0.853503,0.0842,0.467972,0.947,0.80,7,,7
36,6mICuAdrwEjh6Y6lroV2Kg,Chantaje (feat. Maluma),Shakira,El Dorado,0.18700,0.852,0.773,0.000030,0.862585,0.1552,0.408136,0.907,0.92,8,,7
37,7H6ev70Weq6DdpZyyTmUXk,Say My Name,Destiny's Child,The Writing's On The Wall,0.27300,0.713,0.678,0.000000,0.853458,0.2040,0.552036,0.734,0.74,3,,7
44,1L5tZi0izXsi5Kk5OJf4W0,Rehab,Amy Winehouse,Back To Black,0.04730,0.434,0.872,0.000002,0.861784,0.1404,0.286060,0.732,0.81,5,,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,6g0Orsxv6glTJCt4cHsRsQ,Formation,Beyoncé,Lemonade,0.00532,0.896,0.621,0.000000,0.803104,0.4740,0.487864,0.818,0.91,2,,0
18,7iL6o9tox1zgHpKUfh9vuC,In Da Club,50 Cent,Get Rich Or Die Tryin',0.25500,0.899,0.713,0.000000,0.865139,0.7320,0.360204,0.777,0.78,6,8.0,0
83,7IdFdRlCjUi6kkhbPoRfnw,99 Problems,JAY-Z,The Black Album,0.00661,0.493,0.887,0.000000,0.841791,0.8000,0.356628,0.551,0.78,6,,0
38,6MdqqkQ8sSC0WB4i8PyRuQ,No Diggity,Blackstreet,Another Level,0.30300,0.868,0.646,0.000000,0.836094,0.5760,0.354564,0.670,0.71,6,,0
