In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from collections import Counter

# Read the processed Spotify table from disk.
data = pd.read_csv('spotify_data/processed_data.csv')

# Set the genre column aside; it is not used as a clustering feature.
genres = data['genres']

# Everything except `genres` forms the feature matrix.
train = data.drop(columns=['genres'])

In [None]:
# Standardize the features (zero mean, unit variance per column) before the
# distance-based clustering below.
dbscan_scaler = StandardScaler()
train_scale = dbscan_scaler.fit_transform(train)


# Hyper-parameter grid: neighbourhood radius (`eps`) and the minimum number of
# neighbours required for a core point (`min_samples`).
eps_space = [1.0, 2.0, 5.0, 10.0, 20.0]
min_samples_space = range(20, 101, 20)

# One entry per (eps, min_samples) pair, in loop order.
# NaN marks a configuration for which the metrics are undefined.
s_scores = []
ch_scores = []
db_scores = []

for eps in eps_space:
    for min_samples in min_samples_space:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        predictions = dbscan.fit_predict(train_scale)

        # DBSCAN labels noise points with -1; the metrics are computed on the
        # points that were actually assigned to a cluster.
        is_clustered = predictions != -1
        y_pred = predictions[is_clustered]
        clustered = train_scale[is_clustered]
        outliers = train_scale[~is_clustered]

        n_clusters = len(set(y_pred))

        # The three sklearn metrics raise ValueError unless
        # 2 <= n_labels <= n_samples - 1. A single cluster or an all-noise
        # result would otherwise abort the whole sweep, so record NaN instead.
        if 1 < n_clusters < len(y_pred):
            s_scores.append(silhouette_score(clustered, y_pred))
            ch_scores.append(calinski_harabasz_score(clustered, y_pred))
            db_scores.append(davies_bouldin_score(clustered, y_pred))
        else:
            s_scores.append(float('nan'))
            ch_scores.append(float('nan'))
            db_scores.append(float('nan'))

        print(f'{eps},{min_samples}:')
        print(f'\tClusters:{n_clusters}')
        print(f'\tOutliers:{outliers.shape[0]}')
        print(f'\tScores: {s_scores[-1]}, {ch_scores[-1]}, {db_scores[-1]}')


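# Tabulate Genres (disabled draft): count how often each genre occurs per cluster label.
# NOTE(review): this draft uses `defaultdict`, `literal_eval` and `y`, none of which
# are imported/defined in the visible cells (presumably `y` is `genres`, and it would
# need the same non-noise filtering as `y_pred`) -- confirm before re-enabling.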
# label_map = defaultdict(lambda: defaultdict(int))

# for i in range(len(y_pred)):
#     for genre in literal_eval(y.iloc[i]):
#         label_map[y_pred[i]][genre] += 1
        
# for label, genres in label_map.items():
#     print(f'label: {label}')
#     genre_counts = genres.items()
#     print(sorted(genres.items(), key=lambda x: x[1], reverse=True))
    

1.0,20:
	Clusters:25
	Outliers:89911
	Scores: -0.11385795948297768, 558.9457943492849, 1.3449673734356367
1.0,40:
	Clusters:6
	Outliers:106739
	Scores: 0.0698510019601026, 754.6904870100512, 1.3107250888751187
1.0,60:
	Clusters:5
	Outliers:118176
	Scores: 0.039510033912125, 120.42352581517575, 1.293810205219327
1.0,80:
	Clusters:4
	Outliers:125265
	Scores: 0.04699199604041255, 3408.734590140468, 1.1830264456928945
1.0,100:
	Clusters:6
	Outliers:129921
	Scores: 0.3437255891917812, 1467.6227698642745, 1.1522195184211184
2.0,20:
	Clusters:4
	Outliers:3227
	Scores: 0.11773504073827959, 10362.03109969418, 2.6515810883493764
2.0,40:
	Clusters:4
	Outliers:4505
	Scores: 0.11846622908312439, 10344.497476063983, 2.625038842171962
2.0,60:
	Clusters:4
	Outliers:5587
	Scores: 0.11890303482949581, 10292.24336949861, 2.6033418869140976
2.0,80:
	Clusters:4
	Outliers:6568
	Scores: 0.11933049338096521, 10254.12533974023, 2.5864801478168435
2.0,100:
	Clusters:4
	Outliers:7476
	Scores: 0.11952108661053262