In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import Counter

data = pd.read_csv('spotify_data/processed_data.csv')

train = data.drop(['genres'], axis=1)

genres = data['genres']

X_train, X_test = train_test_split(train, test_size=0.9) 

scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)

In [2]:
models = ['DBSCAN', 'Affinity Propagation', 'Agglomerative Clustering', 'Spectral Clustering', 'Gaussian Mixtures']
s_scores = []
ch_scores = []
db_scores = []

# DBSCAN

In [3]:
model = DBSCAN(eps=2, min_samples=100)
y_pred = model.fit_predict(train_scale)
predictions = model.fit_predict(train_scale)
y_pred = predictions[predictions != -1]
clustered = train_scale[predictions != -1]
outliers = train_scale[predictions == -1]

s_scores.append(silhouette_score(clustered, y_pred))
ch_scores.append(calinski_harabasz_score(clustered, y_pred))
db_scores.append(davies_bouldin_score(clustered, y_pred))

# Affinity Propagation

In [4]:
model = AffinityPropagation(random_state=0)
y_pred = model.fit_predict(train_scale)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Agglomerative Clustering

In [5]:
model = AgglomerativeClustering(n_clusters=8)
y_pred = model.fit_predict(train_scale)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Spectral Clustering

In [6]:
model = SpectralClustering(assign_labels='discretize', n_clusters=5, random_state=0)
y_pred = model.fit_predict(train_scale)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Gaussian Mixtures

In [7]:
model = GaussianMixture(n_components=6)
y_pred = model.fit_predict(train_scale)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Comparison

In [8]:
for i in range(len(models)):
    print(models[i])
    print(f'\tSilhouette Score: {s_scores[i]}')
    print(f'\tCalinski Harabasz Score: {ch_scores[i]}')
    print(f'\Davies Bouldin Score: {db_scores[i]}')

DBSCAN
	Silhouette Score: 0.12590906376407113
	Calinski Harabasz Score: 1040.9092663869171
\Davies Bouldin Score: 2.3025179473815696
Affinity Propagation
	Silhouette Score: 0.07795128370251218
	Calinski Harabasz Score: 124.84556454144625
\Davies Bouldin Score: 1.7973389930843504
Agglomerative Clustering
	Silhouette Score: 0.09690110389277737
	Calinski Harabasz Score: 1289.6440492075956
\Davies Bouldin Score: 2.0791985338435968
Spectral Clustering
	Silhouette Score: 0.24967018884226141
	Calinski Harabasz Score: 529.8777415270783
\Davies Bouldin Score: 1.1238161346574198
Gaussian Mixtures
	Silhouette Score: 0.03635208848332539
	Calinski Harabasz Score: 818.9382314733672
\Davies Bouldin Score: 3.590119578058087
