In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import Counter

data = pd.read_csv('spotify_data/processed_data.csv')

train = data.drop(['genres', 'name', 'key', 'mode', 'explicit', 'duration_ms', 'acousticness'], axis=1)

genres = data['genres']

X_train, _ = train_test_split(train, test_size=0.9) 

scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)

train_scale.shape

(13687, 9)

In [17]:
eps_space = [0.9, 1.0, 1.1, 1.2, 1.3]
min_samples_space = range(20,41,10) 

s_scores = []
ch_scores = []
db_scores = []

for eps in eps_space:
    for min_samples in min_samples_space:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        predictions = dbscan.fit_predict(train_scale)
        y_pred = predictions[predictions != -1]
        clustered = train_scale[predictions != -1]
        outliers = train_scale[predictions == -1]
        
        s_scores.append(silhouette_score(clustered, y_pred))
        ch_scores.append(calinski_harabasz_score(clustered, y_pred))
        db_scores.append(davies_bouldin_score(clustered, y_pred))
        
        n_clusters = len(Counter(y_pred))
        print(f'{eps},{min_samples}:')
        print(f'\tClusters:{n_clusters}')
        print(f'\tOutliers:{outliers.shape[0]}')
        print(f'\tScores: {s_scores[-1]}, {ch_scores[-1]}, {db_scores[-1]}')
    

0.9,20:
	Clusters:2
	Outliers:7113
	Scores: 0.4347872207582779, 1579.8026464220513, 0.7099167194807188
0.9,30:
	Clusters:2
	Outliers:8275
	Scores: 0.4712399528643333, 1404.430901117731, 0.6432848676436118
0.9,40:
	Clusters:2
	Outliers:9251
	Scores: 0.4989783810705785, 1114.8157180697228, 0.5792621158822744
1.0,20:
	Clusters:3
	Outliers:5176
	Scores: 0.1932719952540174, 1047.5779013174867, 0.879855388323374
1.0,30:
	Clusters:3
	Outliers:6020
	Scores: 0.20547685017554004, 936.1953804657586, 0.8491129224783145
1.0,40:
	Clusters:2
	Outliers:6712
	Scores: 0.4360825761952394, 1625.2852316232509, 0.7054964421991746


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)