In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import defaultdict

data = pd.read_csv('spotify_data/processed_data.csv')

train = data.drop(['genres', 'name'], axis=1)

genres = data['genres']

X_train, X_test = train_test_split(train, test_size=0.9) 
X_train.shape

(13687, 14)

In [2]:
scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)

s_scores = []
ch_scores = []
db_scores = []

for i in range(5, 20):
    model = KMeans(n_clusters=i)
    y_pred = model.fit_predict(train_scale)
    s_scores.append(silhouette_score(train_scale, y_pred))
    ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
    db_scores.append(davies_bouldin_score(train_scale, y_pred))
    print(f'{i}: {s_scores[-1]}, {ch_scores[-1]}, {db_scores[-1]}')

5: 0.13929231042802548, 1994.2129043685968, 2.042748626262003
6: 0.14297770877131452, 1825.6740196123058, 1.918571977282763
7: 0.13897132136739762, 1703.733380462285, 1.9509369621380321
8: 0.14195720884295748, 1601.832937157476, 1.8665059840737706
9: 0.12861082409339342, 1514.8060972363178, 1.89446027926578
10: 0.12998288828295182, 1446.6474442632584, 1.8244079873908663
11: 0.11982373752571625, 1373.5928506088965, 1.8160955950920281
12: 0.11341584177031251, 1308.231966221581, 1.8134269997924102
13: 0.11576718454447224, 1249.9826382509953, 1.8063126232955686
14: 0.11155051561967326, 1208.2797222682013, 1.8793944353518606
15: 0.1189408296707019, 1157.2818408335554, 1.9130272682201226
16: 0.11400335295952382, 1122.370870515229, 1.8095161691578163
17: 0.11572038092134454, 1080.1319635844095, 1.936831179295387
18: 0.11437651685623074, 1044.1218633746228, 1.813128380938959
19: 0.11384817640733885, 1017.5304814425908, 1.9020427442804497


In [10]:
import numpy as np

model = KMeans(n_clusters=6)
model.fit_predict(train_scale)

x_avg = train_scale.mean(axis=0)
dists = np.absolute(x_avg - model.cluster_centers_)
indices = np.argsort(dists, axis=1)[:,-5:]

features = train.columns

for cluster in range(6):
    print(f'Cluster {cluster} Top Features:')
    for i in range(4, -1, -1):
        print(f'\t{features[indices[cluster, i]]}')

Cluster 0 Top Features:
	liveness
	energy
	danceability
	explicit
	duration_ms
Cluster 1 Top Features:
	instrumentalness
	loudness
	energy
	acousticness
	popularity
Cluster 2 Top Features:
	explicit
	speechiness
	popularity
	danceability
	acousticness
Cluster 3 Top Features:
	acousticness
	energy
	mode
	loudness
	popularity
Cluster 4 Top Features:
	acousticness
	energy
	popularity
	loudness
	duration_ms
Cluster 5 Top Features:
	mode
	acousticness
	energy
	loudness
	popularity
