In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import Birch
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import defaultdict

data = pd.read_csv('spotify_data/processed_data.csv')

train = data.drop(['genres', 'name'], axis=1)

genres = data['genres']

X_train, X_test = train_test_split(train, test_size=0.9) 
X_train.shape

(13687, 14)

In [2]:
scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)

s_scores = []
ch_scores = []
db_scores = []

for i in range(5, 20):
    model = Birch(n_clusters=i)
    y_pred = model.fit_predict(train_scale)
    s_scores.append(silhouette_score(train_scale, y_pred))
    ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
    db_scores.append(davies_bouldin_score(train_scale, y_pred))
    print(f'{i}: {s_scores[-1]}, {ch_scores[-1]}, {db_scores[-1]}')

5: 0.1418478962010837, 1619.0028837085479, 1.9902044013668991
6: 0.10049887061770037, 1463.1383116650775, 2.075593822092656
7: 0.10324755884170951, 1353.0408634899204, 1.9479874619625168
8: 0.10430770404875869, 1259.9559386755254, 1.8706112872038418
9: 0.09230370164514833, 1191.55491234242, 2.1136308597518276
10: 0.08098263389970663, 1138.6755539545911, 2.1251389009553074
11: 0.08264000856445398, 1093.4814959974162, 2.060147891844163
12: 0.08933625480680589, 1056.9727272854836, 1.979534993018066
13: 0.07897619781745314, 1021.4493080361062, 1.9397244643267741
14: 0.07913725706297994, 983.1035164523737, 1.9373817590160303
15: 0.07327531541211081, 944.0066506661777, 2.02604867562163
16: 0.07122725413136084, 912.8413577392987, 2.0369850379465104
17: 0.06873471768837408, 880.0755649491969, 1.9818788011659738
18: 0.06907393033778625, 850.6381600202553, 2.0256239064269557
19: 0.07051121531449032, 824.6831194272764, 2.0332641020676117


In [4]:
import numpy as np

model = Birch(n_clusters=6)
model.fit_predict(train_scale)

x_avg = train_scale.mean(axis=0)
dists = np.absolute(x_avg - model.subcluster_centers_)
indices = np.argsort(dists, axis=1)[:,-5:]

features = train.columns

for cluster in range(6):
    print(f'Cluster {cluster} Top Features:')
    for i in range(4, -1, -1):
        print(f'\t{features[indices[cluster, i]]}')

Cluster 0 Top Features:
	duration_ms
	liveness
	key
	danceability
	acousticness
Cluster 1 Top Features:
	liveness
	speechiness
	tempo
	mode
	acousticness
Cluster 2 Top Features:
	liveness
	energy
	key
	acousticness
	duration_ms
Cluster 3 Top Features:
	liveness
	duration_ms
	energy
	speechiness
	mode
Cluster 4 Top Features:
	explicit
	liveness
	valence
	energy
	speechiness
Cluster 5 Top Features:
	liveness
	duration_ms
	instrumentalness
	energy
	acousticness
