In [1]:
import math
import os

import numpy as np
import pandas as pd

from src.helpers.repositories import AudioRepository
from src.models import *

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE
from scipy.spatial.distance import pdist, squareform

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from glob import glob

In [2]:
# Glob pattern matching every entry directly under the extracted-data folder.
extracted_dir = '../../data/extracted/*'
# Despite the name this is a file path: the bz2 pickle used to cache the feature table.
temp_dir = '../../data/temp/2d-tsne-gmm.data.bz2'
# Shared StandardScaler instance; it is re-fitted (fit_transform) every time it is used.
scaler = StandardScaler()

In [3]:
# glob() makes no ordering guarantee; sort case-insensitively so that the
# indices shown by enumerate() are stable across runs and platforms.
data_paths = sorted(glob(extracted_dir), key=str.casefold)
list(enumerate(data_paths))

[(0, '../../data/extracted\\kino'),
 (1, '../../data/extracted\\lights-and-motion'),
 (2, '../../data/extracted\\m83'),
 (3, '../../data/extracted\\odesza'),
 (4, '../../data/extracted\\README.md'),
 (5, '../../data/extracted\\sleeping-at-last'),
 (6, '../../data/extracted\\tony-anderson')]

In [4]:
# Choose playlists by folder name rather than by position in `data_paths`:
# positional indices silently shift whenever an entry is added to or removed
# from the extracted-data folder, or when glob returns a different order.
selected_playlists = ['kino', 'lights-and-motion', 'odesza', 'sleeping-at-last']
selected_paths = [path for path in data_paths if os.path.basename(path) in selected_playlists]

# Fail loudly instead of quietly analysing fewer playlists than intended.
found_playlists = {os.path.basename(path) for path in selected_paths}
missing_playlists = set(selected_playlists) - found_playlists
assert not missing_playlists, f'Extracted data not found for: {sorted(missing_playlists)}'

data = AudioRepository.load_processed_audio(selected_paths)
# Each loaded item is unpacked as a pair; only its first element is kept.
vectors = [vector for vector, _ in data]
del data  # drop the loaded pairs now that the vectors have been extracted

In [6]:
# Every vector is serialised with as_dict(); each dict becomes one DataFrame row.
feature_rows = [song.as_dict() for song in vectors]
dataset = pd.DataFrame(feature_rows)
dataset['song_name']

0                                          Ivan Sings
1                   Prélude in E Minor, Op. 28, No. 4
2                                          As a Cloud
3                                                 All
4                                                Anew
                            ...                      
142              October 30, 2018- Kepler - Goodnight
143                            July 4, 2020- Aphelion
144        June 30, 2018- Pds 70b (Birth of a Planet)
145                  June 24, 2022- Parade of Planets
146    September 15, 2017- Cassini - The Grand Finale
Name: song_name, Length: 147, dtype: object

In [7]:
# Display the assembled feature table (pandas truncates the rendered output).
dataset

Unnamed: 0,song_name,artist,playlist,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,...,mfcc_mean_1,mfcc_var_1,mfcc_mean_2,mfcc_var_2,mfcc_mean_3,mfcc_var_3,mfcc_mean_4,mfcc_var_4,mfcc_mean_5,mfcc_var_5
0,Ivan Sings,Aram Khachaturian,kino,0.030815,0.029865,143.554688,728.505121,164591.144472,1044.706810,1.492931e+06,...,139.550812,1305.861816,57.054276,234.785889,17.496822,167.898300,9.907036,68.982697,4.814508,74.525887
1,"Prélude in E Minor, Op. 28, No. 4",Frédéric Chopin,kino,0.028196,0.027401,103.359375,615.425486,95544.686241,892.440162,8.328338e+05,...,169.640305,1316.948975,54.812344,289.936859,13.840162,131.829498,9.722220,56.867085,5.577062,56.455524
2,As a Cloud,Kino,kino,0.026781,0.026064,92.285156,519.512435,134149.124135,816.659536,5.748878e+05,...,179.117065,1719.778564,65.865700,239.370544,0.333414,238.848633,-8.672693,78.373230,-5.365072,56.647114
3,All,Kino,kino,0.044240,0.042283,161.499023,619.260455,49458.448746,981.976649,2.497879e+05,...,211.416275,1042.716309,11.695502,684.767395,1.213135,455.439331,8.773780,122.202965,-1.666171,60.584774
4,Anew,Kino,kino,0.048969,0.046571,161.499023,677.808914,72961.813450,1098.534181,4.831254e+05,...,224.936325,1675.312134,-0.197897,452.366974,-7.838040,375.633240,-0.372361,102.292191,-8.401909,85.260345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,"October 30, 2018- Kepler - Goodnight",Sleeping At Last,sleeping-at-last,0.055039,0.052010,135.999178,1011.699715,417024.700029,1782.488472,1.758349e+06,...,146.645050,2314.724365,3.981730,1146.798584,19.675064,307.971436,-0.206686,272.086609,2.004955,179.716309
143,"July 4, 2020- Aphelion",Sleeping At Last,sleeping-at-last,0.042680,0.040858,123.046875,808.087557,162284.147416,1371.124972,1.126312e+06,...,169.458984,2116.119629,18.671284,1251.702026,15.916750,425.085114,3.733818,151.921066,-2.034718,127.379448
144,"June 30, 2018- Pds 70b (Birth of a Planet)",Sleeping At Last,sleeping-at-last,0.053712,0.050827,143.554688,1015.153620,359987.744008,1802.860928,2.156270e+06,...,140.895432,2242.066895,13.926115,977.377502,11.589584,358.514557,-7.436963,336.783081,-7.203380,363.062531
145,"June 24, 2022- Parade of Planets",Sleeping At Last,sleeping-at-last,0.049423,0.046981,135.999178,969.236327,178439.925544,1731.864604,1.016820e+06,...,155.706894,1268.987793,8.853416,1115.989624,22.701586,298.568787,2.838877,133.105148,-2.892214,73.708160


In [8]:
# Gather the note trajectory of every loaded vector and stack them into one array.
note_trajectory_rows = [song.harmonic.note_trajectory for song in vectors]
note_trajectories = np.array(note_trajectory_rows)
note_trajectories

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# StandardScaler works column-wise, so transpose before and after scaling to
# standardise each row (one trajectory) on its own, then take pairwise distances.
scaled_note_trajectories = scaler.fit_transform(note_trajectories.T).T
note_distances = pdist(scaled_note_trajectories, metric='euclidean')
note_distances

array([168.2688913 , 176.60230531, 177.23226435, ..., 141.86588479,
       180.47581951, 179.90733191])

In [10]:
# MDS starts from a random configuration; fixing the seed makes the 2-D note
# coordinates (and every result derived from them) reproducible across runs.
note_mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
# squareform expands the condensed pdist vector into the full distance matrix
# that dissimilarity='precomputed' expects.
note_coordinates = note_mds.fit_transform(squareform(note_distances))
note_coordinates



array([[  34.29348328,   57.28446889],
       [ 108.28805798,   50.77937763],
       [ -61.81821956,   94.08676979],
       [  21.49380385,  115.69478407],
       [   8.3716579 ,   37.15129126],
       [ -95.51165541,   70.64473328],
       [  60.41554092,  113.54787565],
       [ -72.33035883,   11.29756263],
       [ -64.85981881,   69.83175705],
       [ -50.62131767,  107.33087618],
       [ -81.11374849,  -90.80821337],
       [ -69.68652326,   72.23548159],
       [ -52.05690352,   71.12223307],
       [ -27.07972246,  107.24505887],
       [  -6.33487137,  -94.86603033],
       [ -26.88933428,   72.15736066],
       [ 115.61316816,  -54.02896936],
       [  78.68544993,   45.47211012],
       [  75.49396211,   41.29800676],
       [ 116.47723945,   83.04461915],
       [ -18.38716148,  -97.90911276],
       [ -25.82786555,   91.7731387 ],
       [-105.96943733,  -23.99689178],
       [ 128.75877092,  -31.24971744],
       [  38.06793908,   28.13111253],
       [ -86.79156513,   

In [11]:
# Attach the two MDS coordinates of the note trajectories as feature columns.
note_coordinates_df = pd.DataFrame(note_coordinates, columns=['x', 'y'])
for axis in ('x', 'y'):
    dataset[f'note_{axis}'] = note_coordinates_df[axis]
dataset

Unnamed: 0,song_name,artist,playlist,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,...,mfcc_mean_2,mfcc_var_2,mfcc_mean_3,mfcc_var_3,mfcc_mean_4,mfcc_var_4,mfcc_mean_5,mfcc_var_5,note_x,note_y
0,Ivan Sings,Aram Khachaturian,kino,0.030815,0.029865,143.554688,728.505121,164591.144472,1044.706810,1.492931e+06,...,57.054276,234.785889,17.496822,167.898300,9.907036,68.982697,4.814508,74.525887,34.293483,57.284469
1,"Prélude in E Minor, Op. 28, No. 4",Frédéric Chopin,kino,0.028196,0.027401,103.359375,615.425486,95544.686241,892.440162,8.328338e+05,...,54.812344,289.936859,13.840162,131.829498,9.722220,56.867085,5.577062,56.455524,108.288058,50.779378
2,As a Cloud,Kino,kino,0.026781,0.026064,92.285156,519.512435,134149.124135,816.659536,5.748878e+05,...,65.865700,239.370544,0.333414,238.848633,-8.672693,78.373230,-5.365072,56.647114,-61.818220,94.086770
3,All,Kino,kino,0.044240,0.042283,161.499023,619.260455,49458.448746,981.976649,2.497879e+05,...,11.695502,684.767395,1.213135,455.439331,8.773780,122.202965,-1.666171,60.584774,21.493804,115.694784
4,Anew,Kino,kino,0.048969,0.046571,161.499023,677.808914,72961.813450,1098.534181,4.831254e+05,...,-0.197897,452.366974,-7.838040,375.633240,-0.372361,102.292191,-8.401909,85.260345,8.371658,37.151291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,"October 30, 2018- Kepler - Goodnight",Sleeping At Last,sleeping-at-last,0.055039,0.052010,135.999178,1011.699715,417024.700029,1782.488472,1.758349e+06,...,3.981730,1146.798584,19.675064,307.971436,-0.206686,272.086609,2.004955,179.716309,36.990955,88.611713
143,"July 4, 2020- Aphelion",Sleeping At Last,sleeping-at-last,0.042680,0.040858,123.046875,808.087557,162284.147416,1371.124972,1.126312e+06,...,18.671284,1251.702026,15.916750,425.085114,3.733818,151.921066,-2.034718,127.379448,-75.583123,-0.720036
144,"June 30, 2018- Pds 70b (Birth of a Planet)",Sleeping At Last,sleeping-at-last,0.053712,0.050827,143.554688,1015.153620,359987.744008,1802.860928,2.156270e+06,...,13.926115,977.377502,11.589584,358.514557,-7.436963,336.783081,-7.203380,363.062531,-76.360660,40.843246
145,"June 24, 2022- Parade of Planets",Sleeping At Last,sleeping-at-last,0.049423,0.046981,135.999178,969.236327,178439.925544,1731.864604,1.016820e+06,...,8.853416,1115.989624,22.701586,298.568787,2.838877,133.105148,-2.892214,73.708160,-15.543648,52.058679


In [12]:
# Gather the chord trajectory of every loaded vector and stack them into one array.
chord_trajectory_rows = [song.harmonic.chord_trajectory for song in vectors]
chord_trajectories = np.array(chord_trajectory_rows)
del vectors  # the raw vectors are not needed past this point
chord_trajectories

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [12.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [80.,  1.,  5., ...,  0.,  0.,  0.],
       [30.,  0.,  3., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [13]:
# Same recipe as for the note trajectories: standardise each row on its own
# (hence the double transpose), then take pairwise Euclidean distances.
scaled_chord_trajectories = scaler.fit_transform(chord_trajectories.T).T
chord_distances = pdist(scaled_chord_trajectories, metric='euclidean')
chord_distances

array([27.23173018, 24.96089027, 30.51030035, ..., 24.05038606,
       30.35387858, 33.331774  ])

In [14]:
# MDS starts from a random configuration; fixing the seed makes the 2-D chord
# coordinates (and every result derived from them) reproducible across runs.
chord_mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
# squareform expands the condensed pdist vector into the full distance matrix
# that dissimilarity='precomputed' expects.
chord_coordinates = chord_mds.fit_transform(squareform(chord_distances))
chord_coordinates



array([[-10.33865577,  14.33579231],
       [ -4.55940385,   0.12649419],
       [ -1.71264909,  25.3685239 ],
       [-13.81434274, -18.54264002],
       [ 23.40548165,  -6.19652975],
       [ 12.47544508,  21.52083014],
       [ 24.14134009,   9.25361938],
       [ 14.75287809,  18.4769826 ],
       [  9.57704893,  23.24107402],
       [ 17.53810261, -10.04819572],
       [ -0.95292586,   7.93208529],
       [  7.46980914,  24.06982947],
       [  4.92283775,  19.9653523 ],
       [  4.32951067,  25.31775903],
       [-15.5079341 ,  -5.80423739],
       [ 11.55793936,  18.98219731],
       [-18.66556718,  19.14592329],
       [-16.33678257, -13.64964954],
       [-16.83861187, -12.75887593],
       [ 14.65608768, -11.35526992],
       [ -6.24214496,  -7.44440306],
       [ 17.27604786,  12.48382586],
       [ -8.00366917,  -5.15634131],
       [ -7.31154526,  24.64807594],
       [ -2.90700112,  -8.58048513],
       [  3.37550313,  25.78329322],
       [ 10.93265294,  14.44752428],
 

In [15]:
# Attach the two MDS coordinates of the chord trajectories as feature columns.
chord_coordinates_df = pd.DataFrame(chord_coordinates, columns=['x', 'y'])
for axis in ('x', 'y'):
    dataset[f'chord_{axis}'] = chord_coordinates_df[axis]

# Run from here if dataset already exists

In [167]:
# Toggle: True loads the cached feature table from disk; False writes the
# in-memory `dataset` to the cache instead.
read_temp = True

if not read_temp:
    dataset.to_pickle(temp_dir, compression='bz2')
else:
    # NOTE(review): unpickling can execute arbitrary code - only load cache files you produced yourself.
    dataset = pd.read_pickle(temp_dir)

# Move the descriptive (non-feature) columns out of `dataset`; pop() removes
# each one in place and returns it.
playlists = dataset.pop('playlist')
artists = dataset.pop('artist')
song_names = dataset.pop('song_name')
dataset

Unnamed: 0,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,spectral_flux_mean,spectral_flux_var,mfcc_mean_1,...,mfcc_mean_3,mfcc_var_3,mfcc_mean_4,mfcc_var_4,mfcc_mean_5,mfcc_var_5,note_x,note_y,chord_x,chord_y
0,0.030815,0.029865,143.554688,728.505121,164591.144472,1044.706810,1.492931e+06,0.606103,0.621713,139.550812,...,17.496822,167.898300,9.907036,68.982697,4.814508,74.525887,34.293483,57.284469,-10.338656,14.335792
1,0.028196,0.027401,103.359375,615.425486,95544.686241,892.440162,8.328338e+05,0.683619,0.792420,169.640305,...,13.840162,131.829498,9.722220,56.867085,5.577062,56.455524,108.288058,50.779378,-4.559404,0.126494
2,0.026781,0.026064,92.285156,519.512435,134149.124135,816.659536,5.748878e+05,0.516051,0.319771,179.117065,...,0.333414,238.848633,-8.672693,78.373230,-5.365072,56.647114,-61.818220,94.086770,-1.712649,25.368524
3,0.044240,0.042283,161.499023,619.260455,49458.448746,981.976649,2.497879e+05,0.836040,0.742858,211.416275,...,1.213135,455.439331,8.773780,122.202965,-1.666171,60.584774,21.493804,115.694784,-13.814343,-18.542640
4,0.048969,0.046571,161.499023,677.808914,72961.813450,1098.534181,4.831254e+05,0.978895,1.435121,224.936325,...,-7.838040,375.633240,-0.372361,102.292191,-8.401909,85.260345,8.371658,37.151291,23.405482,-6.196530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,0.055039,0.052010,135.999178,1011.699715,417024.700029,1782.488472,1.758349e+06,1.050923,2.551806,146.645050,...,19.675064,307.971436,-0.206686,272.086609,2.004955,179.716309,36.990955,88.611713,-0.331286,15.473378
143,0.042680,0.040858,123.046875,808.087557,162284.147416,1371.124972,1.126312e+06,0.769417,0.381804,169.458984,...,15.916750,425.085114,3.733818,151.921066,-2.034718,127.379448,-75.583123,-0.720036,-7.034498,3.600720
144,0.053712,0.050827,143.554688,1015.153620,359987.744008,1802.860928,2.156270e+06,0.940335,1.422233,140.895432,...,11.589584,358.514557,-7.436963,336.783081,-7.203380,363.062531,-76.360660,40.843246,15.670640,1.126776
145,0.049423,0.046981,135.999178,969.236327,178439.925544,1731.864604,1.016820e+06,0.834188,0.323093,155.706894,...,22.701586,298.568787,2.838877,133.105148,-2.892214,73.708160,-15.543648,52.058679,23.330706,3.577190


In [168]:
# Standardise every feature column, then restore the column labels that
# fit_transform drops when it returns a plain array.
scaled_values = scaler.fit_transform(dataset)
normalised_dataset = pd.DataFrame(scaled_values, columns=dataset.columns)
normalised_dataset

Unnamed: 0,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,spectral_flux_mean,spectral_flux_var,mfcc_mean_1,...,mfcc_mean_3,mfcc_var_3,mfcc_mean_4,mfcc_var_4,mfcc_mean_5,mfcc_var_5,note_x,note_y,chord_x,chord_y
0,-1.456953,-1.495352,1.009816,-1.069471,-0.658528,-1.148490,-0.199506,-1.299649,0.021803,-0.057346,...,0.040407,-0.983728,0.998156,-0.749100,0.507178,-0.802599,0.432269,0.713514,-0.724511,0.999846
1,-1.580602,-1.629084,-0.512519,-1.283684,-0.895451,-1.270050,-0.769405,-0.910395,0.295432,0.926143,...,-0.299992,-1.253272,0.968986,-0.885787,0.624161,-1.058118,1.364970,0.632489,-0.319513,0.008822
2,-1.647394,-1.701633,-0.931938,-1.465378,-0.762986,-1.330548,-0.992104,-1.751856,-0.462188,1.235895,...,-1.557335,-0.453511,-1.934430,-0.643157,-1.054470,-1.055409,-0.779218,1.171910,-0.120019,1.769321
3,-0.823191,-0.821593,1.689430,-1.276419,-1.053589,-1.198569,-1.272780,-0.144994,0.215989,2.291610,...,-1.475442,1.165086,0.819286,-0.148674,-0.487022,-0.999730,0.270929,1.441051,-0.968080,-1.293251
4,-0.599979,-0.588951,1.689430,-1.165507,-0.972940,-1.105517,-1.071327,0.572372,1.325632,2.733519,...,-2.318015,0.568690,-0.624322,-0.373305,-1.520351,-0.650811,0.105525,0.462743,1.640206,-0.432175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,-0.313405,-0.293828,0.723663,-0.532998,0.207659,-0.559493,0.029643,0.934067,3.115592,0.174533,...,0.243179,0.063049,-0.598173,1.542299,0.076165,0.684821,0.466271,1.103715,-0.023216,1.079187
143,-0.896855,-0.898902,0.233114,-0.918713,-0.666445,-0.887899,-0.516029,-0.479550,-0.362754,0.920217,...,-0.106682,0.938247,0.023788,0.186603,-0.543561,-0.055236,-0.952725,-0.008968,-0.492963,0.251131
144,-0.376067,-0.358018,1.009816,-0.526455,0.011945,-0.543228,0.373190,0.378739,1.304974,-0.013396,...,-0.509498,0.440761,-1.739385,2.272199,-1.336485,3.277384,-0.962526,0.508728,1.098165,0.078587
145,-0.578515,-0.566709,0.723663,-0.613439,-0.611009,-0.599907,-0.610559,-0.154293,-0.456863,0.470723,...,0.524918,-0.007218,-0.117467,-0.025676,-0.675109,-0.814162,-0.195928,0.648424,1.634966,0.249490


# Clustering

In [169]:
def generate_hover_template(field_names):
    """Build a Plotly hover template with one line per field.

    Each field is rendered as ``<b>name</b>: %{customdata[i]}``, where ``i`` is
    the field's position in ``field_names``. Lines are joined with ``<br>`` and
    the result ends with ``<extra></extra>``.

    Args:
        field_names: Iterable of labels, in the same order as the customdata columns.

    Returns:
        The hover template string.
    """
    hover_lines = []
    for position, label in enumerate(field_names):
        hover_lines.append('<b>{}</b>: %{{customdata[{}]}}'.format(label, position))
    return '<br>'.join(hover_lines) + '<extra></extra>'

In [170]:
def generate_tsne_trace(fig, tsne, row, col):
    """Add one scatter trace per playlist to a subplot of ``fig``.

    Note that ``tsne`` is modified in place: the notebook-level ``playlists``,
    ``artists`` and ``song_names`` series are attached to it as columns.

    Args:
        fig: Figure (with a subplot grid) that receives the traces.
        tsne: DataFrame with ``TSNE1`` and ``TSNE2`` columns.
        row: 1-based subplot row passed to ``fig.add_trace``.
        col: 1-based subplot column passed to ``fig.add_trace``.
    """
    # Single source of truth for the hover columns: the same list selects the
    # customdata and builds the template, so the two cannot drift apart (the
    # previous positional iloc[:, -3:] relied on these being the last columns).
    hover_fields = ['playlist', 'artist', 'song_name']

    tsne['playlist'] = playlists
    tsne['artist'] = artists
    tsne['song_name'] = song_names

    for pl in pd.unique(tsne['playlist']):
        filtered_tsne = tsne[tsne['playlist'] == pl]
        fig.add_trace(
            go.Scatter(
                x=filtered_tsne['TSNE1'],
                y=filtered_tsne['TSNE2'],
                customdata=filtered_tsne[hover_fields].to_numpy(),
                mode='markers',
                hovertemplate=generate_hover_template(hover_fields),
                showlegend=False,
                name=pl,
            ),
            row=row,
            col=col,
        )

In [171]:
def generate_tsne_perplexity_subplots(tsne_array):
    """Show every t-SNE embedding in ``tsne_array`` on a three-column subplot grid.

    Args:
        tsne_array: Sequence of ``(early_exaggeration, perplexity, embedding)``
            tuples; each embedding is drawn with ``generate_tsne_trace``.
    """
    cols = 3
    rows = math.ceil(len(tsne_array) / cols)
    titles = tuple(f'Perplexity: {p}, Exaggeration: {e}' for e, p, _ in tsne_array)
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=titles)

    # Fill the grid row by row; subplot coordinates are 1-based.
    for position, (_, _, embedding) in enumerate(tsne_array):
        grid_row, grid_col = divmod(position, cols)
        generate_tsne_trace(fig, embedding, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(title='TSNE Perplexities', height=350*rows)
    fig.update_annotations(font={'size': 12})
    fig.show()

In [172]:
def get_tsne_dataset(early_exaggeration, perplexity, early_exaggerations, perplexities, tsne_array):
    """Return the embedding computed for one (early_exaggeration, perplexity) pair.

    ``tsne_array`` must be ordered as produced by iterating ``perplexities`` in
    the outer loop and ``early_exaggerations`` in the inner loop, with the
    embedding stored as the third element of each entry.

    Args:
        early_exaggeration: Value to look up in ``early_exaggerations``.
        perplexity: Value to look up in ``perplexities``.
        early_exaggerations: Inner-loop values used to build ``tsne_array``.
        perplexities: Outer-loop values used to build ``tsne_array``.
        tsne_array: Flat sequence of ``(early_exaggeration, perplexity, embedding)``.

    Returns:
        The embedding stored for the requested pair.

    Raises:
        ValueError: If either value is missing from its list.
    """
    p_idx = perplexities.index(perplexity)
    e_idx = early_exaggerations.index(early_exaggeration)
    # The row stride is the inner-loop length; it was previously hard-coded as 3,
    # which returned the wrong entry for any other number of exaggeration values.
    return tsne_array[len(early_exaggerations) * p_idx + e_idx][2]

In [173]:
# Hyper-parameter grid: one t-SNE embedding per (perplexity, early_exaggeration)
# pair, appended with perplexity as the outer loop.
perplexities = [3, 5, 7, 10, 20]
early_exaggerations = [12, 24, 48]
tsne_array = []
for perplexity in perplexities:
    for ee in early_exaggerations:
        # t-SNE is stochastic; a fixed seed keeps the embeddings - and the
        # perplexity / cluster-count choices made from them - reproducible.
        # NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm against the installed version.
        tsne = TSNE(
            n_components=2,
            early_exaggeration=ee,
            perplexity=perplexity,
            n_iter=5000,
            random_state=42,
        )
        tsne_dataset = pd.DataFrame(tsne.fit_transform(normalised_dataset), columns=['TSNE1', 'TSNE2'])
        tsne_array.append((ee, perplexity, tsne_dataset))

generate_tsne_perplexity_subplots(tsne_array)

In [188]:
# Pick the embedding used for the rest of the analysis out of the grid computed above.
tsne_dataset = get_tsne_dataset(
    early_exaggeration=12,
    perplexity=7,
    early_exaggerations=early_exaggerations,
    perplexities=perplexities,
    tsne_array=tsne_array
)
tsne_dataset

Unnamed: 0,TSNE1,TSNE2,playlist,artist,song_name
0,-31.886988,-53.218220,kino,Aram Khachaturian,Ivan Sings
1,-29.924746,-56.681614,kino,Frédéric Chopin,"Prélude in E Minor, Op. 28, No. 4"
2,-52.143848,-23.822826,kino,Kino,As a Cloud
3,-7.916717,-27.953857,kino,Kino,All
4,-45.491215,-2.536048,kino,Kino,Anew
...,...,...,...,...,...
142,-20.121944,5.169987,sleeping-at-last,Sleeping At Last,"October 30, 2018- Kepler - Goodnight"
143,-8.646254,-6.536448,sleeping-at-last,Sleeping At Last,"July 4, 2020- Aphelion"
144,-21.776924,3.392226,sleeping-at-last,Sleeping At Last,"June 30, 2018- Pds 70b (Birth of a Planet)"
145,-24.297970,-15.374807,sleeping-at-last,Sleeping At Last,"June 24, 2022- Parade of Planets"


# Plotting

In [189]:
def generate_plot_coloured_by_features(dataframe, color_by):
    """Show a dark-themed scatter plot of the t-SNE embedding.

    Args:
        dataframe: DataFrame providing ``TSNE1``, ``TSNE2``, ``artist`` and
            ``song_name`` columns plus the column named by ``color_by``.
        color_by: Name of the column that determines the marker colour.
    """
    scatter_options = dict(
        x='TSNE1',
        y='TSNE2',
        hover_data=['artist', 'song_name'],
        template='plotly_dark',
        color=color_by,
        height=500,
        width=1000,
    )
    figure = px.scatter(dataframe, **scatter_options)
    figure.show()

In [190]:
# Work on a copy so that `dataset` stays a purely numeric feature table and
# this cell can be re-run without side effects on earlier cells.
result_dataset = dataset.copy()

# Re-attach the descriptive columns. DataFrame.insert raises if the column is
# already present, so check explicitly rather than hiding every possible error
# behind a bare `except: pass`.
if 'song_name' not in result_dataset.columns:
    result_dataset.insert(0, 'song_name', song_names)
if 'artist' not in result_dataset.columns:
    result_dataset.insert(1, 'artist', artists)
result_dataset['playlist'] = playlists

result_dataset['TSNE1'] = tsne_dataset['TSNE1']
result_dataset['TSNE2'] = tsne_dataset['TSNE2']

In [191]:
# t-SNE embedding with one colour per artist.
generate_plot_coloured_by_features(result_dataset, 'artist')

In [192]:
# t-SNE embedding with one colour per playlist.
generate_plot_coloured_by_features(result_dataset, 'playlist')

In [193]:
# Fit one Gaussian mixture per candidate component count on the 2-D t-SNE
# coordinates, keeping the cluster labels and the BIC / AIC scores of each fit.
gmm_array = []
bic_array = []
aic_array = []
cluster_array = list(range(1, 15))
# Slice the two coordinate columns once instead of three times per iteration.
tsne_features = result_dataset[['TSNE1', 'TSNE2']]
for n in cluster_array:
    # Seeded so that the labels and scores are reproducible across runs; a
    # cluster count picked from the BIC / AIC curves would otherwise refer to
    # a fit that cannot be recreated.
    gmm = GaussianMixture(n_components=n, random_state=42)
    cluster_labels = gmm.fit_predict(tsne_features)
    gmm_array.append(cluster_labels)
    bic = gmm.bic(tsne_features)
    bic_array.append(bic)
    aic = gmm.aic(tsne_features)
    aic_array.append(aic)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.


KMeans is known to have a memory leak on Windows with MKL, when there are less 

In [194]:
# Label the axes explicitly; with bare x=/y= sequences Plotly Express would
# otherwise title them just "x" and "y".
px.line(
    x=cluster_array,
    y=bic_array,
    labels={'x': 'Number of GMM components', 'y': 'BIC'},
    width=1000,
    height=500,
    title='BIC Score of GMM Clustering'
)

In [195]:
# Label the axes explicitly; with bare x=/y= sequences Plotly Express would
# otherwise title them just "x" and "y".
px.line(
    x=cluster_array,
    y=aic_array,
    labels={'x': 'Number of GMM components', 'y': 'AIC'},
    width=1000,
    height=500,
    title='AIC Score of GMM Clustering'
)

In [200]:
# Number of mixture components used for the final clustering.
n_clusters = 8
# Reuse the labels already computed in the sweep. Looking the position up in
# cluster_array (instead of a hand-computed `8 - 1` offset) keeps the selection
# correct if the range of candidate component counts ever changes.
result_dataset['cluster'] = gmm_array[cluster_array.index(n_clusters)]
result_dataset

Unnamed: 0,song_name,artist,zero_crossings_mean,zero_crossings_var,bpm,spectral_centroid_mean,spectral_centroid_var,spectral_rolloff_mean,spectral_rolloff_var,spectral_flux_mean,...,mfcc_mean_5,mfcc_var_5,note_x,note_y,chord_x,chord_y,playlist,TSNE1,TSNE2,cluster
0,Ivan Sings,Aram Khachaturian,0.030815,0.029865,143.554688,728.505121,164591.144472,1044.706810,1.492931e+06,0.606103,...,4.814508,74.525887,34.293483,57.284469,-10.338656,14.335792,kino,-31.886988,-53.218220,5
1,"Prélude in E Minor, Op. 28, No. 4",Frédéric Chopin,0.028196,0.027401,103.359375,615.425486,95544.686241,892.440162,8.328338e+05,0.683619,...,5.577062,56.455524,108.288058,50.779378,-4.559404,0.126494,kino,-29.924746,-56.681614,5
2,As a Cloud,Kino,0.026781,0.026064,92.285156,519.512435,134149.124135,816.659536,5.748878e+05,0.516051,...,-5.365072,56.647114,-61.818220,94.086770,-1.712649,25.368524,kino,-52.143848,-23.822826,4
3,All,Kino,0.044240,0.042283,161.499023,619.260455,49458.448746,981.976649,2.497879e+05,0.836040,...,-1.666171,60.584774,21.493804,115.694784,-13.814343,-18.542640,kino,-7.916717,-27.953857,0
4,Anew,Kino,0.048969,0.046571,161.499023,677.808914,72961.813450,1098.534181,4.831254e+05,0.978895,...,-8.401909,85.260345,8.371658,37.151291,23.405482,-6.196530,kino,-45.491215,-2.536048,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,"October 30, 2018- Kepler - Goodnight",Sleeping At Last,0.055039,0.052010,135.999178,1011.699715,417024.700029,1782.488472,1.758349e+06,1.050923,...,2.004955,179.716309,36.990955,88.611713,-0.331286,15.473378,sleeping-at-last,-20.121944,5.169987,3
143,"July 4, 2020- Aphelion",Sleeping At Last,0.042680,0.040858,123.046875,808.087557,162284.147416,1371.124972,1.126312e+06,0.769417,...,-2.034718,127.379448,-75.583123,-0.720036,-7.034498,3.600720,sleeping-at-last,-8.646254,-6.536448,3
144,"June 30, 2018- Pds 70b (Birth of a Planet)",Sleeping At Last,0.053712,0.050827,143.554688,1015.153620,359987.744008,1802.860928,2.156270e+06,0.940335,...,-7.203380,363.062531,-76.360660,40.843246,15.670640,1.126776,sleeping-at-last,-21.776924,3.392226,3
145,"June 24, 2022- Parade of Planets",Sleeping At Last,0.049423,0.046981,135.999178,969.236327,178439.925544,1731.864604,1.016820e+06,0.834188,...,-2.892214,73.708160,-15.543648,52.058679,23.330706,3.577190,sleeping-at-last,-24.297970,-15.374807,2


In [201]:
# t-SNE embedding with one colour per assigned GMM cluster.
generate_plot_coloured_by_features(result_dataset, 'cluster')

### Todo

1. **[Done: did 5 instead]** Compress trajectories into feature vectors
2. **[Done: used BIC and AIC, but not auto]** Automatic elbow method / hierarchical clustering
3. Allow user to input new song, process it and place it within the clustering grid
4. Return nearest (cluster?) neighbours as recommendations
5. **[Done: ended up using t-SNE]** Use the trajectory matrix to influence the position of the final PCA points (maybe calculate the trajectory matrix first, get individual points (3 dimensions per trajectory, 6D total), add them as features to the spectral features, and only then do PCA)
6. Create new features for beats and percussion (and make this high weighted)
7. Use a deep neural network to learn a mapping from the input features to the t-SNE output for parameterisation (so that new points can be placed in the correct cluster space)

The problem with (5) is that we cannot control the relative weighting of the harmonic versus the spectral features.