# DBSCAN hyperparameter search on the dataset without popularity = 0 entries

This iteration reflects that the dataset has been split into two parts: entries with popularity > 0 and entries with popularity = 0. These two datasets have been saved as CSV files in the S3 bucket and will be used in this notebook.

In [1]:
# Import the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import DBSCAN

# Clustering

## Hyperparameter scanning

In [2]:
# Read the encoded training set (the "no_pop0" file, i.e. without
# popularity = 0 entries) from the S3 bucket. The first CSV row supplies the
# column names and the first CSV column is used as the row index.
X = pd.read_csv(
    's3://flutz-bucket/spotify/no_pop0_train_encoded.csv',
    sep=",",
    header=0,
    index_col=0,
)
X.head()

Unnamed: 0_level_0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2b8fOow8UzyDFAE27YhOZM,1.0,0.764,0.32,1.0,0.803915,1,0.0546,0.837,0.0,0.0822,0.575,0.385677,0.116732,0.8
21jGcNKet2qwijlDFuPiPb,0.989899,0.695,0.762,0.0,0.882646,1,0.0395,0.192,0.00244,0.0863,0.553,0.508657,0.134207,0.8
3eekarcy7kvN4yt5ZFzltW,0.979798,0.598,0.427,0.636364,0.770934,0,0.0317,0.0546,6e-06,0.21,0.0605,0.324024,0.107407,0.8
3ZCTVFBt2Brf31RLEnCkWJ,0.979798,0.704,0.225,0.545455,0.65025,0,0.0994,0.902,0.657,0.106,0.243,0.508504,0.154631,0.8
2XU0oxnq2qxCpomAAuJY8K,0.979798,0.824,0.588,0.545455,0.821074,0,0.0924,0.692,0.000104,0.149,0.513,0.415372,0.13025,0.8


In [3]:
%%time
# Take 1: grid scan over eps (0.1 .. 0.4, step 0.1) and min_samples (3 .. 20).
# A full report line is printed only when the clustering differs from the
# previous run; an unchanged result is shown as a single '.'.
print('Running a series of DBSCANs to find the best fitting parameters - eps between 0.1 and 0.4.')

clusters_last = []
for EPS in np.arange(0.1, 0.41, 0.1):
    for Min_Samples in range(3, 21):
        dbscan = DBSCAN(eps=EPS, min_samples=Min_Samples)
        pred_dbscan = dbscan.fit_predict(X)

        # Pair every distinct label with the number of points carrying it.
        found_labels, found_sizes = np.unique(pred_dbscan, return_counts=True)
        clusters_this = list(zip(found_labels, found_sizes))

        if clusters_this == clusters_last:
            # Same labels and sizes as the previous run: just tick.
            print(end='.')
            continue
        clusters_last = clusters_this

        # The highest label + 1 is reported as the cluster count; label -1 is
        # what the report counts as noise.
        n_clusters_found = pred_dbscan.max() + 1
        plural_suffix = ', ' if n_clusters_found == 1 else 's,'
        noise_share = np.count_nonzero(pred_dbscan == -1) / len(X)

        print(
            f'\nDBSCAN(eps={dbscan.eps:0.1f}, min_samples={dbscan.min_samples:2}):',
            f'Found {n_clusters_found:2} cluster' + plural_suffix,
            f'noise={noise_share:6.2%},',
            'cluster_sizes=', clusters_this, end='')

Running a series of DBSCANs to find the best fitting parameters - eps between 0.1 and 0.4.

DBSCAN(eps=0.1, min_samples= 3): Found 11 clusters, noise=99.76%, cluster_sizes= [(-1, 14653), (0, 3), (1, 3), (2, 4), (3, 3), (4, 3), (5, 3), (6, 3), (7, 3), (8, 3), (9, 3), (10, 4)]
DBSCAN(eps=0.1, min_samples= 4): Found  1 cluster,  noise=99.97%, cluster_sizes= [(-1, 14684), (0, 4)]
DBSCAN(eps=0.1, min_samples= 5): Found  0 clusters, noise=100.00%, cluster_sizes= [(-1, 14688)]...............
DBSCAN(eps=0.2, min_samples= 3): Found 277 clusters, noise=57.84%, cluster_sizes= [(-1, 8495), (0, 4), (1, 1546), (2, 2926), (3, 5), (4, 14), (5, 3), (6, 14), (7, 13), (8, 4), (9, 4), (10, 3), (11, 3), (12, 4), (13, 4), (14, 4), (15, 3), (16, 3), (17, 4), (18, 4), (19, 3), (20, 3), (21, 4), (22, 3), (23, 4), (24, 5), (25, 6), (26, 3), (27, 3), (28, 3), (29, 7), (30, 3), (31, 3), (32, 4), (33, 3), (34, 8), (35, 202), (36, 3), (37, 4), (38, 4), (39, 3), (40, 4), (41, 10), (42, 4), (43, 3), (44, 4), (45, 18)

In [4]:
%%time
# Take 2: grid scan over eps (0.5 .. 0.8, step 0.1) and min_samples (3 .. 20).
# A full report line is printed only when the clustering differs from the
# previous run; an unchanged result is shown as a single '.'.
print('Running a series of DBSCANs to find the best fitting parameters - eps between 0.5 and 0.8.')

clusters_last = []
for EPS in np.arange(0.5, 0.81, 0.1):
    for Min_Samples in range(3, 21):
        dbscan = DBSCAN(eps=EPS, min_samples=Min_Samples)
        pred_dbscan = dbscan.fit_predict(X)

        # Pair every distinct label with the number of points carrying it.
        found_labels, found_sizes = np.unique(pred_dbscan, return_counts=True)
        clusters_this = list(zip(found_labels, found_sizes))

        if clusters_this == clusters_last:
            # Same labels and sizes as the previous run: just tick.
            print(end='.')
            continue
        clusters_last = clusters_this

        # The highest label + 1 is reported as the cluster count; label -1 is
        # what the report counts as noise.
        n_clusters_found = pred_dbscan.max() + 1
        plural_suffix = ', ' if n_clusters_found == 1 else 's,'
        noise_share = np.count_nonzero(pred_dbscan == -1) / len(X)

        print(
            f'\nDBSCAN(eps={dbscan.eps:0.1f}, min_samples={dbscan.min_samples:2}):',
            f'Found {n_clusters_found:2} cluster' + plural_suffix,
            f'noise={noise_share:6.2%},',
            'cluster_sizes=', clusters_this, end='')

Running a series of DBSCANs to find the best fitting parameters - eps between 0.5 and 0.8.

DBSCAN(eps=0.5, min_samples= 3): Found 12 clusters, noise= 0.87%, cluster_sizes= [(-1, 128), (0, 9138), (1, 5385), (2, 6), (3, 3), (4, 3), (5, 3), (6, 5), (7, 4), (8, 3), (9, 3), (10, 3), (11, 4)]
DBSCAN(eps=0.5, min_samples= 4): Found  5 clusters, noise= 1.05%, cluster_sizes= [(-1, 154), (0, 9136), (1, 5383), (2, 6), (3, 5), (4, 4)]
DBSCAN(eps=0.5, min_samples= 5): Found  2 clusters, noise= 1.20%, cluster_sizes= [(-1, 176), (0, 9134), (1, 5378)]
DBSCAN(eps=0.5, min_samples= 6): Found  2 clusters, noise= 1.25%, cluster_sizes= [(-1, 183), (0, 9133), (1, 5372)]
DBSCAN(eps=0.5, min_samples= 7): Found  2 clusters, noise= 1.30%, cluster_sizes= [(-1, 191), (0, 9129), (1, 5368)]
DBSCAN(eps=0.5, min_samples= 8): Found  2 clusters, noise= 1.38%, cluster_sizes= [(-1, 202), (0, 9118), (1, 5368)]
DBSCAN(eps=0.5, min_samples= 9): Found  2 clusters, noise= 1.41%, cluster_sizes= [(-1, 207), (0, 9117), (1, 5364

In [5]:
%%time
# Take 3: grid scan over eps (0.9 .. 1.1, step 0.1) and min_samples (3 .. 20).
# A full report line is printed only when the clustering differs from the
# previous run; an unchanged result is shown as a single '.'.
print('Running a series of DBSCANs to find the best fitting parameters - eps between 0.9 and 1.1.')

clusters_last = []
for EPS in np.arange(0.9, 1.11, 0.1):
    for Min_Samples in range(3, 21):
        dbscan = DBSCAN(eps=EPS, min_samples=Min_Samples)
        pred_dbscan = dbscan.fit_predict(X)

        # Pair every distinct label with the number of points carrying it.
        found_labels, found_sizes = np.unique(pred_dbscan, return_counts=True)
        clusters_this = list(zip(found_labels, found_sizes))

        if clusters_this == clusters_last:
            # Same labels and sizes as the previous run: just tick.
            print(end='.')
            continue
        clusters_last = clusters_this

        # The highest label + 1 is reported as the cluster count; label -1 is
        # what the report counts as noise.
        n_clusters_found = pred_dbscan.max() + 1
        plural_suffix = ', ' if n_clusters_found == 1 else 's,'
        noise_share = np.count_nonzero(pred_dbscan == -1) / len(X)

        print(
            f'\nDBSCAN(eps={dbscan.eps:0.1f}, min_samples={dbscan.min_samples:2}):',
            f'Found {n_clusters_found:2} cluster' + plural_suffix,
            f'noise={noise_share:6.2%},',
            'cluster_sizes=', clusters_this, end='')

Running a series of DBSCANs to find the best fitting parameters - eps between 0.9 and 1.1.

DBSCAN(eps=0.9, min_samples= 3): Found  2 clusters, noise= 0.01%, cluster_sizes= [(-1, 1), (0, 9220), (1, 5467)]...
DBSCAN(eps=0.9, min_samples= 7): Found  2 clusters, noise= 0.01%, cluster_sizes= [(-1, 2), (0, 9219), (1, 5467)].
DBSCAN(eps=0.9, min_samples= 9): Found  2 clusters, noise= 0.03%, cluster_sizes= [(-1, 5), (0, 9216), (1, 5467)]..
DBSCAN(eps=0.9, min_samples=12): Found  2 clusters, noise= 0.04%, cluster_sizes= [(-1, 6), (0, 9216), (1, 5466)]........
DBSCAN(eps=1.0, min_samples= 3): Found  2 clusters, noise= 0.00%, cluster_sizes= [(0, 9220), (1, 5468)].................
DBSCAN(eps=1.1, min_samples= 3): Found  1 cluster,  noise= 0.00%, cluster_sizes= [(0, 14688)].................CPU times: user 5min 51s, sys: 10.5 s, total: 6min 1s
Wall time: 6min 1s


### OK, the "best fit" is eps = 0.3 and min_samples = 5

## Findings: ...

In [6]:
# Advice on the limits of K-means and hierarchical clustering
# The main disadvantages of partitioning and hierarchical methods are that they handle noise poorly and give bad results when the clusters have a non-spherical shape.

In [7]:
# Advice on selecting parameters for DBSCAN:
#"...
#According to a research made in 2017 by Schubert, Sander, et al, the desirable amount of noise will usually be between 1% and 30%. 
#Another insight from that research is that if one of the clusters contains many (20%-50%) points of the dataset, 
#it indicates that you should choose a smaller value for ε or to try another clustering method.
#..." source: Amit Shreiber, A Practical Guide to DBSCAN Method,  https://towardsdatascience.com/a-practical-guide-to-dbscan-method-d4ec5ab2bc99