In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Load the full dataset from `data.csv` (resolved relative to the kernel's
# working directory) into the raw frame used by the cells below.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [14]:
# Keep only the rows whose `year` is 2020.
# `.copy()` makes `data` an independent frame: later cells reset its index and
# add a column to it, and doing that on a bare boolean-mask selection of `df`
# is exactly what pandas' chained-assignment warning is about (that warning is
# silenced globally in the import cell, so it would go unnoticed here).
data = df[df['year'] == 2020].copy()

In [15]:
# Renumber the filtered rows 0..n-1. `drop=True` discards the old index
# directly instead of first materialising it as an 'index' column and then
# deleting that column, and rebinding `data` avoids mutating the frame in place.
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.881,['Joni Mitchell'],0.644,313093,0.212,0,55qyghODi24yaDgKBI6lx0,2.2e-05,11,0.798,-14.118,1,"The Circle Game - Live at The 2nd Fret, Philad...",19,2020-10-30,0.0347,117.072,0.441,2020
1,0.955,['Joni Mitchell'],0.627,295093,0.184,0,00xemFYjQNRpOlPhVaLAHa,0.000162,1,0.0986,-15.533,1,"Urge For Going - Live at The 2nd Fret, Philade...",19,2020-10-30,0.045,115.864,0.299,2020
2,0.888,['Joni Mitchell'],0.581,183440,0.331,0,2lm5FQJRHvc3rUN5YHpEWj,1.5e-05,6,0.147,-14.087,1,What's The Story Mr. Blue - Live at The 2nd Fr...,19,2020-10-30,0.243,88.303,0.642,2020
3,0.93,['Joni Mitchell'],0.442,147907,0.399,0,26g4FBGTB9YEj7q4HlblFf,0.000499,6,0.912,-12.661,1,"Brandy Eyes - Live at The 2nd Fret, Philadelph...",19,2020-10-30,0.078,121.662,0.554,2020
4,0.949,['Joni Mitchell'],0.57,64173,0.176,0,05sxkljafFBW2vEnVczQy1,0.0,6,0.147,-22.676,0,Intro To Urge For Going - Live at The 2nd Fret...,19,2020-10-30,0.299,135.687,0.348,2020


In [18]:
# Standardise the numeric features, then group the tracks into 20 clusters.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        # NOTE: `n_jobs` is intentionally not passed. KMeans deprecated it in
        # scikit-learn 0.23 and removed it in 1.0, where passing it raises
        # TypeError. `random_state` pins the centroid initialisation so the
        # cluster labels are reproducible across Restart & Run All.
        ('kmeans', KMeans(n_clusters=20, verbose=2, random_state=42)),
    ],
    verbose=True,
)

# Features = every numeric column of `data`. A `cluster_label` column left over
# from a previous run of this cell is excluded so the old labels are never fed
# back in as a feature (errors='ignore' makes this a no-op on the first run).
X = data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
number_cols = list(X.columns)

song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)

# Attach the labels by building a new frame rather than writing into `data`
# in place; `data` is derived from a selection of `df`.
data = data.assign(cluster_label=song_cluster_labels)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
Initialization complete
Iteration 0, inertia 40510.49562909471
Iteration 1, inertia 30164.670628178752
Iteration 2, inertia 29066.986023198067
Iteration 3, inertia 28620.64995429569
Iteration 4, inertia 28336.2067150934
Iteration 5, inertia 28201.36656288552
Iteration 6, inertia 28148.439455597432
Iteration 7, inertia 28116.90986575879
Iteration 8, inertia 28076.2754300886
Iteration 9, inertia 27989.95261668311
Iteration 10, inertia 27887.22953419975
Iteration 11, inertia 27803.11832124109
Iteration 12, inertia 27766.58831751689
Iteration 13, inertia 27755.292870744815
Iteration 14, inertia 27748.148933465793
Iteration 15, inertia 27744.7134015034
Iteration 16, inertia 27738.747690730157
Iteration 17, inertia 27734.064668991516
Iteration 18, inertia 27731.605347607067
Iteration 19, inertia 27729.155107789757
Iteration 20, inertia 27728.097104231045
Iteration 21, inertia 27725.51566188527
Iteration 22, inertia 27722.

In [17]:
# Project the standardised feature matrix onto its first two principal
# components so the clusters can be inspected on a 2-D scatter plot.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)

# `projection` has a fresh 0..n-1 index, while `data` keeps whatever index it
# currently carries. Assigning the raw values (positional) instead of the
# Series (label-aligned) keeps row i of the embedding paired with row i of
# `data` even when `data`'s index is not 0..n-1; label alignment would
# silently produce NaN or mismatched titles/clusters in that case.
projection['title'] = data['name'].to_numpy()
projection['cluster'] = data['cluster_label'].to_numpy()

fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()