In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from collections import Counter

# Read the processed dataset from disk.
data = pd.read_csv('spotify_data/processed_data.csv')

# Keep the 'genres' column on its own ...
genres = data['genres']

# ... and use every remaining column as the input matrix for clustering.
train = data.drop(columns=['genres'])

In [None]:
# Standardise the features before clustering: DBSCAN's `eps` is a distance
# threshold, so it is applied to the scaled matrix rather than the raw columns.
dbscan_scaler = StandardScaler()
train_scale = dbscan_scaler.fit_transform(train)

# Coarse hyper-parameter grid: neighbourhood radius x minimum neighbourhood size.
eps_space = [1.0, 2.0, 5.0, 10.0, 20.0]
min_samples_space = range(20, 101, 20)

# One entry per (eps, min_samples) pair, appended in loop order
# (eps outer, min_samples inner). NaN marks a configuration that could not be scored.
s_scores = []   # silhouette
ch_scores = []  # Calinski-Harabasz
db_scores = []  # Davies-Bouldin

for eps in eps_space:
    for min_samples in min_samples_space:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        predictions = dbscan.fit_predict(train_scale)

        # DBSCAN labels noise points -1; the metrics are computed on the
        # clustered points only, and the noise points are counted separately.
        is_noise = predictions == -1
        y_pred = predictions[~is_noise]
        clustered = train_scale[~is_noise]
        outliers = train_scale[is_noise]

        n_clusters = len(Counter(y_pred))

        # All three metrics raise ValueError unless the number of distinct
        # labels is between 2 and n_samples - 1. Large eps values can merge
        # everything into one cluster (or leave only noise), so record NaN for
        # those configurations instead of aborting the whole sweep.
        if 2 <= n_clusters < len(y_pred):
            s_scores.append(silhouette_score(clustered, y_pred))
            ch_scores.append(calinski_harabasz_score(clustered, y_pred))
            db_scores.append(davies_bouldin_score(clustered, y_pred))
        else:
            s_scores.append(float('nan'))
            ch_scores.append(float('nan'))
            db_scores.append(float('nan'))

        print(f'{eps},{min_samples}:')
        print(f'\tClusters:{n_clusters}')
        print(f'\tOutliers:{outliers.shape[0]}')
        print(f'\tScores: {s_scores[-1]}, {ch_scores[-1]}, {db_scores[-1]}')
    

1.0,20:
	Clusters:25
	Outliers:89911
	Scores: -0.11385795948297768, 558.9457943492849, 1.3449673734356367
1.0,40:
	Clusters:6
	Outliers:106739
	Scores: 0.0698510019601026, 754.6904870100512, 1.3107250888751187
1.0,60:
	Clusters:5
	Outliers:118176
	Scores: 0.039510033912125, 120.42352581517575, 1.293810205219327
1.0,80:
	Clusters:4
	Outliers:125265
	Scores: 0.04699199604041255, 3408.734590140468, 1.1830264456928945
1.0,100:
	Clusters:6
	Outliers:129921
	Scores: 0.3437255891917812, 1467.6227698642745, 1.1522195184211184
2.0,20:
	Clusters:4
	Outliers:3227
	Scores: 0.11773504073827959, 10362.03109969418, 2.6515810883493764
2.0,40:
	Clusters:4
	Outliers:4505
	Scores: 0.11846622908312439, 10344.497476063983, 2.625038842171962
2.0,60:
	Clusters:4
	Outliers:5587
	Scores: 0.11890303482949581, 10292.24336949861, 2.6033418869140976
2.0,80:
	Clusters:4
	Outliers:6568
	Scores: 0.11933049338096521, 10254.12533974023, 2.5864801478168435
2.0,100:
	Clusters:4
	Outliers:7476
	Scores: 0.11952108661053262

In [3]:
# Standardise the features before clustering. The scaler is fitted again here
# so that this cell does not rely on any earlier grid-search cell having run.
dbscan_scaler = StandardScaler()
train_scale = dbscan_scaler.fit_transform(train)

# Fine hyper-parameter grid: eps values from 2.1 to 2.5 in steps of 0.1.
eps_space = [2.1, 2.2, 2.3, 2.4, 2.5]
min_samples_space = range(20, 101, 20)

# One entry per (eps, min_samples) pair, appended in loop order
# (eps outer, min_samples inner). NaN marks a configuration that could not be scored.
s_scores = []   # silhouette
ch_scores = []  # Calinski-Harabasz
db_scores = []  # Davies-Bouldin

for eps in eps_space:
    for min_samples in min_samples_space:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        predictions = dbscan.fit_predict(train_scale)

        # DBSCAN labels noise points -1; the metrics are computed on the
        # clustered points only, and the noise points are counted separately.
        is_noise = predictions == -1
        y_pred = predictions[~is_noise]
        clustered = train_scale[~is_noise]
        outliers = train_scale[is_noise]

        n_clusters = len(Counter(y_pred))

        # All three metrics raise ValueError unless the number of distinct
        # labels is between 2 and n_samples - 1. If a configuration collapses
        # to a single cluster (or to noise only), record NaN rather than
        # crashing part-way through the sweep.
        if 2 <= n_clusters < len(y_pred):
            s_scores.append(silhouette_score(clustered, y_pred))
            ch_scores.append(calinski_harabasz_score(clustered, y_pred))
            db_scores.append(davies_bouldin_score(clustered, y_pred))
        else:
            s_scores.append(float('nan'))
            ch_scores.append(float('nan'))
            db_scores.append(float('nan'))

        print(f'{eps},{min_samples}:')
        print(f'\tClusters:{n_clusters}')
        print(f'\tOutliers:{outliers.shape[0]}')
        print(f'\tScores: {s_scores[-1]}, {ch_scores[-1]}, {db_scores[-1]}')

2.1,20:
	Clusters:4
	Outliers:2471
	Scores: 0.11734314680737117, 10394.650989042439, 2.6686901332596626
2.1,40:
	Clusters:4
	Outliers:3347
	Scores: 0.11780518774472792, 10397.639421244268, 2.6487645726514426
2.1,60:
	Clusters:4
	Outliers:4085
	Scores: 0.1181389193817288, 10385.270030548068, 2.631973659632593
2.1,80:
	Clusters:4
	Outliers:4747
	Scores: 0.11844109890305479, 10349.30903986916, 2.6177960775593987
2.1,100:
	Clusters:4
	Outliers:5368
	Scores: 0.1189366580839846, 10337.186027462305, 2.6055951880499157
2.2,20:
	Clusters:3
	Outliers:1886
	Scores: 0.21428310394044603, 9198.303958212717, 2.110246800941697
2.2,40:
	Clusters:3
	Outliers:2525
	Scores: 0.21519244615304467, 9201.860700657893, 2.095347042542857
2.2,60:
	Clusters:4
	Outliers:3055
	Scores: 0.11764113613701782, 10430.00778907339, 2.6545852206113163
2.2,80:
	Clusters:4
	Outliers:3528
	Scores: 0.11787986424779746, 10423.843968905116, 2.643902130065796
2.2,100:
	Clusters:4
	Outliers:3997
	Scores: 0.11823704639493508, 10413.7