In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
from genieclust import Genie
from sklearn.cluster import DBSCAN

import numpy as np
import clustbench
import genieclust
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.metrics import adjusted_rand_score

In [6]:
sipu_collection = [
    'a1', 
    'a2', 
    'a3', 
    'aggregation', 
    'birch1', 
    'birch2', 
    'compound', 
    'd31', 
    'r15', 
    'flame', 
    'jain', 
    'pathbased', 
    'spiral', 
    's1', 
    's2', 
    's3', 
    's4', 
    'unbalance'
    ]

In [None]:
for sip in sipu_collection: 
    print("################")
    print("DATASET: ", sip)
    benchmark = clustbench.load_dataset("sipu", sip, url="https://github.com/gagolews/clustering-data-v1/raw/v1.1.0")
    print(benchmark.data.shape)

In [7]:
def find_eps(data, min_samples):
    neighbors = NearestNeighbors(n_neighbors=min_samples)
    neighbors_fit = neighbors.fit(data)
    distances, indices = neighbors_fit.kneighbors(data)
    distances = np.sort(distances[:, -1])  
    kneedle = KneeLocator(range(len(distances)), distances, curve="convex", direction="increasing")
    return distances, kneedle.knee_y 

In [58]:
results = []
for sip in sipu_collection: 
    
    b = clustbench.load_dataset("sipu", sip, path="./github.com/gagolews/clustering-data-v1/raw/v1.1.0")
    if b.data.shape[0] <= 10000: 
        data = b.data
        labels = b.labels
    else:
        np.random.seed(123)
        indices = np.random.choice(b.data.shape[0], 10000, replace=False)
        data = b.data[indices]
        labels =[b.labels[0][indices]]
    
    print("################")
    print(f"DATASET: {sip}; unique labels: {len(np.unique(labels))}")
    distances, knee_eps = find_eps(data, min_samples=5)
    candidates = np.arange(0.5 * knee_eps, 1.51 * knee_eps, 0.1*knee_eps)
    
    res = []
    for eps in candidates:
        for ms in range(4, 11):
            model = DBSCAN(eps=eps, min_samples=ms)
            res_labels = model.fit_predict(data)
            res.append((
                sip, 
                len(np.unique(labels)), 
                eps, 
                ms, 
                len(np.unique(res_labels)), 
                len(res_labels[res_labels==-1])/len(res_labels)*100, 
                genieclust.compare_partitions.adjusted_rand_score(labels[0], res_labels.astype(int))
                ))
            # print(f"Eps: {eps:.3f}, min samples: {ms}, unique clusters: {len(np.unique(res_labels))}, % of outliers: {len(res_labels[res_labels==-1])/len(res_labels)*100:.2f} score: \t",  genieclust.compare_partitions.adjusted_rand_score(labels[0], res_labels.astype(int)))

    results.append(pd.DataFrame(res))
results_df = pd.concat(results).reset_index(drop=True)
results_df.columns = ["data_name", "real_nr_unique_clusters", "eps", "min_samples", "nr_unique_clusters", "outlier_share", "score"]


################
DATASET: a1; unique labels: 20
################
DATASET: a2; unique labels: 35
################
DATASET: a3; unique labels: 50
################
DATASET: aggregation; unique labels: 7
################
DATASET: birch1; unique labels: 100
################
DATASET: birch2; unique labels: 100
################
DATASET: compound; unique labels: 7
################
DATASET: d31; unique labels: 31
################
DATASET: r15; unique labels: 15
################
DATASET: flame; unique labels: 3
################
DATASET: jain; unique labels: 2
################
DATASET: pathbased; unique labels: 4
################
DATASET: spiral; unique labels: 3
################
DATASET: s1; unique labels: 15
################
DATASET: s2; unique labels: 15
################
DATASET: s3; unique labels: 15
################
DATASET: s4; unique labels: 15
################
DATASET: unbalance; unique labels: 8


In [None]:
# results_df.to_csv("dbscan_analysis.csv", header=True)

In [3]:
results_df = pd.read_csv("dbscan_analysis.csv")

In [5]:
results_df[results_df["data_name"] == "flame"].sort_values("score", ascending=False).head(20)

Unnamed: 0.1,Unnamed: 0,data_name,real_nr_unique_clusters,eps,min_samples,nr_unique_clusters,outlier_share,score
732,732,flame,3,0.163551,8,3,1.25,0.949455
741,741,flame,3,0.179906,10,3,1.666667,0.944017
748,748,flame,3,0.196261,10,3,0.833333,0.938653
740,740,flame,3,0.179906,9,3,1.25,0.93319
716,716,flame,3,0.13084,6,3,3.75,0.922775
707,707,flame,3,0.114485,4,3,6.25,0.897256
725,725,flame,3,0.147196,8,3,7.083333,0.887994
733,733,flame,3,0.163551,9,3,4.166667,0.886239
734,734,flame,3,0.163551,10,3,6.666667,0.87699
708,708,flame,3,0.114485,5,4,10.0,0.79283
