In [18]:
import pandas as pd

from src.helpers.repositories import AudioRepository
from src.models import *

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import plotly.express as px

In [19]:
# Directories holding the already-processed audio, one entry per playlist.
# NOTE(review): these paths mix '/' and '\\' separators, which looks Windows-specific —
# confirm how the loader treats them before running on another OS.
playlist_dirs = [
    '../../data\\playlist-1-extracted',
    '../../data\\playlist-2-extracted',
    '../../data\\playlist-3-extracted',
    '../../data\\playlist-4-extracted'
]
data = AudioRepository.load_processed_audio(playlist_dirs)

# Every loaded item is unpacked as a pair; only its first element is used from here on.
vectors = [features for features, _ in data]

In [20]:
# One row per vector, built from each vector's dict representation.
feature_records = [vector.as_dict() for vector in vectors]
dataset = pd.DataFrame(feature_records)

# pop() removes the column from `dataset` in place and hands it back as a Series,
# which is then displayed as the cell output.
song_names = dataset.pop('song_name')
song_names

0                                Easily
1                               Nothing
2              The Most Beautiful Thing
3     Lovesong (The Way) [feat. Bluets]
4              Best Part (feat. H.E.R.)
                    ...                
70                          Crystallize
71                               Retour
72                            Civilized
73                         Fire & Stone
74                        I Caught Fire
Name: song_name, Length: 75, dtype: object

In [21]:
# Take the artist column out of `dataset`: keep it as its own Series, then delete it
# from the frame in place. Display the extracted Series as the cell output.
artists = dataset['artist']
del dataset['artist']
artists

0          Bruno Major
1          Bruno Major
2          Bruno Major
3         Charlie Burg
4        Daniel Caesar
            ...       
70    Thomas Bergersen
71       Tony Anderson
72            Voyageur
73            Voyageur
74            Voyageur
Name: artist, Length: 75, dtype: object

In [22]:
# Take the playlist column out of `dataset`: keep it as its own Series, then delete it
# from the frame in place. Display the extracted Series as the cell output.
playlists = dataset['playlist']
del dataset['playlist']
playlists

0     playlist-1-extracted
1     playlist-1-extracted
2     playlist-1-extracted
3     playlist-1-extracted
4     playlist-1-extracted
              ...         
70    playlist-4-extracted
71    playlist-4-extracted
72    playlist-4-extracted
73    playlist-4-extracted
74    playlist-4-extracted
Name: playlist, Length: 75, dtype: object

In [23]:
# Inspect what remains in `dataset` once the song_name, artist and playlist columns
# have been popped out of it.
dataset

Unnamed: 0,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,spectral_flux_mean,spectral_flux_var,mfcc_mean_1,mfcc_var_1,mfcc_mean_2,mfcc_var_2,mfcc_mean_3,mfcc_var_3,mfcc_mean_4,mfcc_var_4,mfcc_mean_5,mfcc_var_5
0,0.040950,0.039273,123.046875,1325.705462,1.281488e+06,2646.148866,6.999132e+06,1.405987,4.380646,122.607567,2342.442383,41.752369,830.084106,29.401342,423.736786,20.773031,245.108810,11.525144,202.638687
1,0.031868,0.030852,143.554688,750.569751,3.044683e+05,1319.636496,1.612754e+06,1.132027,1.644858,166.858704,1735.323975,21.118891,456.029633,19.709267,504.540619,16.009113,387.933258,22.504009,190.261200
2,0.040446,0.038810,129.199219,1266.167543,7.255832e+05,2489.469988,3.907391e+06,1.428144,5.548598,125.978645,1397.635986,25.613192,358.246765,28.354788,235.979935,18.006464,166.277481,12.967431,143.211929
3,0.096648,0.087307,75.999540,2266.903839,1.107634e+06,4889.516004,4.571252e+06,1.310032,2.629859,86.361046,1635.238281,12.115478,878.224487,29.897362,489.664551,11.958034,249.108963,15.391748,269.410339
4,0.071633,0.066501,143.554688,1931.093252,5.833559e+05,4200.484102,3.314215e+06,1.247534,1.750443,107.245453,1037.345459,14.395954,425.511444,18.686554,338.837616,6.706423,310.398560,3.488143,212.048035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.067735,0.063147,123.046875,1552.861796,5.532690e+05,3168.619062,2.435322e+06,0.860111,0.193992,121.357773,927.573242,-1.494154,950.097290,32.788578,121.798576,5.771464,68.384590,5.951822,45.975948
71,0.023181,0.022644,95.703125,503.955003,1.046665e+05,788.013815,6.424496e+05,0.835529,0.624931,187.431808,1902.659424,47.678169,647.584534,14.545411,212.372940,10.050799,94.826553,7.101512,64.315620
72,0.063476,0.059447,92.285156,1603.264420,4.643250e+05,3322.297781,2.658560e+06,0.950252,0.474175,124.430542,1319.083984,0.545243,266.261444,16.496122,246.155746,-3.790283,151.651932,5.472765,164.624512
73,0.062675,0.058747,75.999540,1566.899937,3.137780e+05,3209.446124,2.399429e+06,0.996284,0.614221,127.884583,953.000000,1.270565,162.211731,9.160923,208.752747,-4.171951,139.620926,9.664831,104.385056


In [24]:
# Standardise every remaining column with StandardScaler, then wrap the resulting array
# back into a DataFrame that carries the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit(dataset).transform(dataset)
normalised_dataset = pd.DataFrame(scaled_values, columns=dataset.columns)
normalised_dataset

Unnamed: 0,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,spectral_flux_mean,spectral_flux_var,mfcc_mean_1,mfcc_var_1,mfcc_mean_2,mfcc_var_2,mfcc_mean_3,mfcc_var_3,mfcc_mean_4,mfcc_var_4,mfcc_mean_5,mfcc_var_5
0,-1.310017,-1.353782,0.181330,-0.246661,2.156194,-0.204866,2.294897,1.692801,2.360134,-0.359377,1.328278,2.097092,0.431606,1.188846,0.853930,2.022802,0.579506,1.122242,0.832484
1,-1.855906,-1.945730,0.956559,-1.461804,-0.628464,-1.300638,-0.667528,0.433764,0.249473,1.122129,0.248374,0.845940,-0.639848,0.353593,1.482203,1.437939,2.052155,2.484052,0.640944
2,-1.340322,-1.386340,0.413899,-0.372452,0.571779,-0.334291,0.594487,1.794631,3.261210,-0.246515,-0.352285,1.118461,-0.919941,1.098655,-0.605933,1.683153,-0.233316,1.301142,-0.087137
3,2.037673,2.022738,-1.597134,1.741897,1.660683,1.648279,0.959600,1.251823,1.009401,-1.572893,0.070347,0.300000,0.569501,1.231592,1.366537,0.940592,0.620751,1.601852,1.865766
4,0.534135,0.560204,0.956559,1.032399,0.166409,1.079101,0.268249,0.964600,0.330932,-0.873693,-0.993147,0.438282,-0.727266,0.265457,0.193815,0.295856,1.252702,0.125338,0.978092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.299899,0.324436,0.181330,0.233274,0.080657,0.226723,-0.215128,-0.815882,-0.869871,-0.401220,-1.188403,-0.525246,0.775376,1.480754,-1.493726,0.181072,-1.242680,0.430931,-1.591850
71,-2.378029,-2.522758,-0.852307,-1.982850,-1.197930,-1.739788,-1.201180,-0.928853,-0.537402,1.810906,0.546019,2.456414,-0.091152,-0.091422,-0.789484,0.706442,-0.970039,0.573538,-1.308046
72,0.043914,0.064335,-0.981512,0.339764,-0.172847,0.353670,-0.092350,-0.401622,-0.653710,-0.298345,-0.492008,-0.401583,-1.183427,0.076688,-0.526813,-0.992816,-0.384118,0.371509,0.244220
73,-0.004256,0.015105,-1.597134,0.262933,-0.601929,0.260449,-0.234869,-0.190070,-0.545664,-0.182706,-1.143175,-0.357602,-1.481471,-0.555452,-0.817632,-1.039674,-0.508169,0.891490,-0.687977


In [25]:
# Reduce the standardised features to three principal components and label the
# resulting columns PCA1..PCA3.
pca = PCA(n_components=3)
component_scores = pca.fit_transform(normalised_dataset)
pca_dataset = pd.DataFrame(component_scores, columns=['PCA1', 'PCA2', 'PCA3'])
pca_dataset

Unnamed: 0,PCA1,PCA2,PCA3
0,2.919897,4.809986,1.916142
1,-1.205138,3.603775,1.875852
2,2.139175,2.246617,3.158388
3,4.938939,1.078773,-2.432152
4,2.275366,-0.046115,-0.827671
...,...,...,...
70,0.385836,-2.174820,0.972916
71,-3.595409,1.454182,4.389701
72,0.078215,-1.333591,0.169466
73,-0.097318,-2.001859,0.937398


In [26]:
# Re-attach the descriptive columns to a copy of the PCA frame (pca_dataset itself is
# left untouched). assign() returns a new frame with 'playlist' appended as the last
# column; song_name and artist are then inserted at the front.
named_pca_dataset = pca_dataset.assign(playlist=playlists)
named_pca_dataset.insert(0, 'song_name', song_names)
named_pca_dataset.insert(1, 'artist', artists)

In [27]:
# 3-D scatter of the PCA projection with one colour per artist.
# The title lets this figure be told apart from the other PCA scatter plots in the
# notebook, which share the same axes and layout.
fig = px.scatter_3d(
    named_pca_dataset,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    hover_data=['artist', 'song_name'],
    template='plotly_dark',
    color='artist',
    title='PCA projection of audio features, coloured by artist'
)
fig.show()

In [28]:
# 3-D scatter of the PCA projection with one colour per source playlist.
# The title lets this figure be told apart from the other PCA scatter plots in the
# notebook, which share the same axes and layout.
fig = px.scatter_3d(
    named_pca_dataset,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    hover_data=['artist', 'song_name'],
    template='plotly_dark',
    color='playlist',
    title='PCA projection of audio features, coloured by playlist'
)
fig.show()

In [29]:
# Cluster the tracks in the 3-D PCA space into five groups.
# - random_state pins the centroid initialisation, so the cluster labels (and every
#   table/plot built from them) are reproducible across kernel restarts.
# - n_init is given explicitly because its default differs between scikit-learn
#   releases; 10 is the long-standing default (newer releases use 'auto').
# NOTE: on Windows with MKL scikit-learn warns about a KMeans memory leak; per that
# warning it can be avoided by setting the environment variable OMP_NUM_THREADS=1.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(pca_dataset)

# Attach each track's cluster id to the named frame and display it.
named_pca_dataset['cluster'] = kmeans.labels_
named_pca_dataset




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



Unnamed: 0,song_name,artist,PCA1,PCA2,PCA3,playlist,cluster
0,Easily,Bruno Major,2.919897,4.809986,1.916142,playlist-1-extracted,3
1,Nothing,Bruno Major,-1.205138,3.603775,1.875852,playlist-1-extracted,4
2,The Most Beautiful Thing,Bruno Major,2.139175,2.246617,3.158388,playlist-1-extracted,2
3,Lovesong (The Way) [feat. Bluets],Charlie Burg,4.938939,1.078773,-2.432152,playlist-1-extracted,3
4,Best Part (feat. H.E.R.),Daniel Caesar,2.275366,-0.046115,-0.827671,playlist-1-extracted,2
...,...,...,...,...,...,...,...
70,Crystallize,Thomas Bergersen,0.385836,-2.174820,0.972916,playlist-4-extracted,2
71,Retour,Tony Anderson,-3.595409,1.454182,4.389701,playlist-4-extracted,0
72,Civilized,Voyageur,0.078215,-1.333591,0.169466,playlist-4-extracted,2
73,Fire & Stone,Voyageur,-0.097318,-2.001859,0.937398,playlist-4-extracted,2


In [30]:
# The K-Means labels are integers, which Plotly Express draws on a continuous colour
# scale. Cluster ids are categorical, so plot a copy whose 'cluster' column is cast to
# str: that yields one discrete colour and one legend entry per cluster.
# named_pca_dataset itself is not modified.
clustered_for_plot = named_pca_dataset.assign(
    cluster=named_pca_dataset['cluster'].astype(str)
)
fig = px.scatter_3d(
    clustered_for_plot,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    hover_data=['artist', 'song_name'],
    template='plotly_dark',
    color='cluster',
    # List clusters in numeric order in the legend rather than order of first appearance.
    category_orders={'cluster': sorted(clustered_for_plot['cluster'].unique(), key=int)},
    title='K-Means clusters in PCA space'
)
fig.show()

In [31]:
# TODO (1): Compress trajectories into feature vectors
# TODO (2): Automatic elbow method / hierarchical clustering
# TODO (3): Allow the user to input a new song, process it and place it within the clustering grid
# TODO (4): Return nearest (cluster?) neighbours as recommendations