In [15]:
import pandas as pd
import numpy as np
import DBCV
from dtaidistance import dtw
from dtaidistance import ed
from btseg.clustering.hdbscan_clustering import HDBSCANClusterer
from itertools import product
from sklearn.metrics import silhouette_score

In [16]:
def generate_dtw_matrix(X):
    """Compute the pairwise DTW distance matrix for a collection of series.

    Parameters
    ----------
    X : array-like
        Numeric series to compare. The callers in this notebook pass a 2-D
        array; anything ``numpy`` can convert to a float array is accepted.

    Returns
    -------
    The value returned by ``dtw.distance_matrix_fast`` for the converted
    array.
    """
    # The C-backed DTW routine operates on double-precision buffers. Coerce
    # explicitly so integer inputs or non-contiguous slices are accepted too,
    # instead of relying on whatever dtype/layout np.array() happens to infer.
    series = np.ascontiguousarray(X, dtype=np.double)
    return dtw.distance_matrix_fast(series)

In [17]:
# Accumulator for the hyperparameter search: one list per result column,
# each receiving one entry per evaluated configuration.
# The quartile columns (q1, q2, q3) are currently not collected.
results_dict = {
    column: []
    for column in (
        'noise',
        'p_noise',
        'instances',
        'p_instances',
        'smallest_group',
        'biggest_group',
        'average_size',
        'silhouette_score',
        'mask',
        'metric',
        'min_cluster_samples',
        'min_samples',
        'cluster_selection_method',
    )
}

## Máscara 1

In [18]:
# Load the mask 1 time-series table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='time_series_1.csv')
df.head(5)

Unnamed: 0,id,path,slice_path,a_0,a_1,a_2,a_3,a_4,a_5,a_6,...,a_350,a_351,a_352,a_353,a_354,a_355,a_356,a_357,a_358,a_359
0,1,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,109.041277,113.15918,113.282832,108.55874,106.794195,105.076163,103.406963,...,72.532751,102.420701,99.984999,99.724621,99.503769,100.31949,102.176318,104.076895,107.004673,107.004673
1,2,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,62.072538,64.195015,65.375837,66.483081,66.753277,67.082039,67.468511,...,66.490601,66.098411,61.66036,60.530984,60.299254,60.207973,60.074953,60.033324,60.0,61.032778
2,3,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,91.049437,90.088845,89.202018,89.274856,88.45903,87.572827,87.692645,...,92.655275,92.541882,92.347171,92.26592,92.135769,92.086915,92.0489,92.005435,92.005435,92.021737
3,4,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,3.0,3.0,3.0,3.0,7.211103,7.211103,7.211103,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,5,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,39.115214,39.204592,39.319207,39.458839,39.623226,39.812058,40.024992,...,41.593269,41.593269,40.311289,40.199502,39.115214,39.051248,39.012818,39.0,39.012818,39.051248


In [19]:
# Keep only the series values (drop the identifier/path columns) as an array,
# then show how many series there are.
X = df.drop(columns=['id', 'path', 'slice_path']).to_numpy()
X.shape[0]

150

In [20]:
# Hyperparameter grid explored for mask 1.
min_cluster_samples = list(range(5, 16, 1))
min_samples = [3]
metric = ['euclidean', 'dtw']
cluster_selection_method = ['leaf']

for n_min_cluster, dist_name, n_min_samples, selection in product(
    min_cluster_samples, metric, min_samples, cluster_selection_method
):
    hdbscan = HDBSCANClusterer(
        min_cluster_samples=n_min_cluster,
        metric=dist_name,
        min_samples=n_min_samples,
        cluster_selection_method=selection
    )
    hdbscan.fit(X)
    out_labels = np.asarray(hdbscan.model.labels_)

    # Noise carries the label -1. Count it explicitly rather than assuming it
    # is the first entry returned by np.unique: when no point is flagged as
    # noise, that first entry is a real cluster.
    noise_mask = out_labels == -1
    n_total = out_labels.size
    n_noise = int(noise_mask.sum())
    n_clustered = n_total - n_noise
    group_sizes = np.unique(out_labels[~noise_mask], return_counts=True)[1]

    results_dict['noise'].append(n_noise)  # number of instances flagged as noise
    results_dict['p_noise'].append(n_noise / n_total)  # fraction of noise
    results_dict['instances'].append(n_clustered)  # number of instances in some group
    results_dict['p_instances'].append(n_clustered / n_total)  # fraction of clustered instances

    if group_sizes.size == 0:
        # Everything is noise: keep this notebook's convention of reporting
        # the noise count as the size of the only "group".
        smallest = biggest = average = n_noise
    else:
        smallest = group_sizes.min()
        biggest = group_sizes.max()
        average = group_sizes.mean()
    results_dict['smallest_group'].append(smallest)
    results_dict['biggest_group'].append(biggest)
    results_dict['average_size'].append(average)

    # Score only the clustered points, using the same array that was fitted.
    clustered_X = X[~noise_mask]
    clustered_labels = out_labels[~noise_mask]

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1
    # and raises otherwise; report 0 in that case, as for the all-noise case.
    if not (2 <= group_sizes.size < n_clustered):
        score = 0
    elif dist_name == 'euclidean':
        score = silhouette_score(clustered_X, clustered_labels)
    elif dist_name == 'dtw':
        score = silhouette_score(
            generate_dtw_matrix(clustered_X), clustered_labels, metric='precomputed'
        )
    else:
        score = 0

    results_dict['silhouette_score'].append(score)
    results_dict['mask'].append(1)
    results_dict['metric'].append(dist_name)
    results_dict['min_cluster_samples'].append(n_min_cluster)
    results_dict['min_samples'].append(n_min_samples)
    results_dict['cluster_selection_method'].append(selection)

## Máscara 2

In [32]:
# Load the mask 2 time-series table and preview its first rows.
df2 = pd.read_csv(filepath_or_buffer='time_series_2.csv')
df2.head(5)

Unnamed: 0,id,path,slice_path,a_0,a_1,a_2,a_3,a_4,a_5,a_6,...,a_350,a_351,a_352,a_353,a_354,a_355,a_356,a_357,a_358,a_359
0,1,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,5.0,56.222771,56.320511,55.443665,54.744863,53.935146,54.129474,...,4.472136,4.472136,4.472136,4.472136,4.472136,4.472136,5.0,5.0,5.0,5.0
1,2,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,41.10961,41.19466,42.296572,42.579338,42.755117,43.931765,45.354162,...,38.639358,38.470768,39.319207,37.215588,36.124784,36.055513,36.013886,37.0,38.013156,40.049969
2,3,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,41.10961,41.19466,41.19466,41.303753,40.447497,40.447497,40.447497,...,44.102154,44.045431,44.045431,44.011362,43.0,43.0,42.011903,42.047592,42.047592,41.10961
3,4,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,23.194827,23.345235,23.345235,24.515301,24.515301,25.70992,25.70992,...,23.086793,23.021729,23.021729,23.021729,23.0,23.0,23.021729,23.086793,23.086793,23.194827
4,5,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,42.107007,42.190046,41.303753,39.623226,39.623226,38.832976,38.078866,...,45.891176,45.70558,45.398238,45.276926,45.177428,45.099889,44.045431,43.0,42.011903,42.047592


In [33]:
# Keep only the series values (drop the identifier/path columns) as an array,
# then show how many series there are.
X2 = df2.drop(columns=['id', 'path', 'slice_path']).to_numpy()
X2.shape[0]

150

In [34]:
# Hyperparameter grid explored for mask 2.
min_cluster_samples = [5, 10, 15, 20]
min_samples = [3, 4, 5]
metric = ['euclidean', 'dtw']
cluster_selection_method = ['leaf', 'eom']

for n_min_cluster, dist_name, n_min_samples, selection in product(
    min_cluster_samples, metric, min_samples, cluster_selection_method
):
    hdbscan = HDBSCANClusterer(
        min_cluster_samples=n_min_cluster,
        metric=dist_name,
        min_samples=n_min_samples,
        cluster_selection_method=selection
    )
    hdbscan.fit(X2)
    out_labels = np.asarray(hdbscan.model.labels_)

    # Noise carries the label -1. Count it explicitly rather than assuming it
    # is the first entry returned by np.unique: when no point is flagged as
    # noise, that first entry is a real cluster.
    noise_mask = out_labels == -1
    n_total = out_labels.size
    n_noise = int(noise_mask.sum())
    n_clustered = n_total - n_noise
    group_sizes = np.unique(out_labels[~noise_mask], return_counts=True)[1]

    results_dict['noise'].append(n_noise)  # number of instances flagged as noise
    results_dict['p_noise'].append(n_noise / n_total)  # fraction of noise
    results_dict['instances'].append(n_clustered)  # number of instances in some group
    results_dict['p_instances'].append(n_clustered / n_total)  # fraction of clustered instances

    if group_sizes.size == 0:
        # Everything is noise: keep this notebook's convention of reporting
        # the noise count as the size of the only "group".
        smallest = biggest = average = n_noise
    else:
        smallest = group_sizes.min()
        biggest = group_sizes.max()
        average = group_sizes.mean()
    results_dict['smallest_group'].append(smallest)
    results_dict['biggest_group'].append(biggest)
    results_dict['average_size'].append(average)

    # Score only the clustered points, using the same array that was fitted.
    clustered_X = X2[~noise_mask]
    clustered_labels = out_labels[~noise_mask]

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1
    # and raises otherwise; report 0 in that case, as for the all-noise case.
    if not (2 <= group_sizes.size < n_clustered):
        score = 0
    elif dist_name == 'euclidean':
        score = silhouette_score(clustered_X, clustered_labels)
    elif dist_name == 'dtw':
        score = silhouette_score(
            generate_dtw_matrix(clustered_X), clustered_labels, metric='precomputed'
        )
    else:
        score = 0

    results_dict['silhouette_score'].append(score)
    results_dict['mask'].append(2)
    results_dict['metric'].append(dist_name)
    results_dict['min_cluster_samples'].append(n_min_cluster)
    results_dict['min_samples'].append(n_min_samples)
    results_dict['cluster_selection_method'].append(selection)

## Máscara 3

In [35]:
# Load the mask 3 time-series table, discarding rows with missing values,
# and preview its first rows.
df3 = pd.read_csv('time_series_3.csv').dropna()
df3.head(5)

Unnamed: 0,id,path,slice_path,area,a_0,a_1,a_2,a_3,a_4,a_5,...,a_350,a_351,a_352,a_353,a_354,a_355,a_356,a_357,a_358,a_359
0,1,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,617.0,13.341664,13.341664,11.7047,11.7047,11.7047,11.7047,...,15.033296,15.0,15.0,15.0,15.033296,15.033296,15.033296,15.132746,15.132746,13.341664
1,2,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,645.0,13.341664,13.341664,13.601471,13.601471,13.601471,13.0,...,14.035669,14.0,14.0,14.0,14.035669,14.035669,14.142136,14.142136,14.142136,13.341664
2,3,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,290.5,12.369317,12.369317,12.649111,12.649111,12.649111,12.649111,...,13.0,13.0,13.0,13.038405,13.038405,13.038405,12.165525,12.165525,12.165525,12.369317
3,4,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,454.0,10.440307,10.440307,11.7047,11.7047,11.7047,11.7047,...,10.0,10.049876,10.049876,10.049876,10.049876,10.198039,10.198039,10.198039,10.198039,10.440307
4,5,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,C:/Users/Gilberto/Desktop/Gilberto/Projetos/bt...,548.5,9.486833,9.486833,9.486833,8.944272,8.944272,8.944272,...,14.035669,10.049876,10.049876,10.049876,9.219544,9.219544,9.219544,9.219544,9.486833,9.486833


In [36]:
# Keep only the series values (drop the identifier/path/area columns) as an
# array, then show how many series there are.
X3 = df3.drop(columns=['id', 'path', 'slice_path', 'area']).to_numpy()
X3.shape[0]

137

In [37]:
# Hyperparameter grid explored for mask 3.
min_cluster_samples = [5, 10, 15, 20]
min_samples = [3, 4, 5]
metric = ['euclidean', 'dtw']
cluster_selection_method = ['leaf', 'eom']

# The metric is the outermost factor of this grid. Unpack every factor by
# name so each value reaches the matching argument; indexing the tuple by
# position here used to swap the metric and min_cluster_samples.
for dist_name, n_min_cluster, n_min_samples, selection in product(
    metric, min_cluster_samples, min_samples, cluster_selection_method
):
    hdbscan = HDBSCANClusterer(
        min_cluster_samples=n_min_cluster,
        metric=dist_name,
        min_samples=n_min_samples,
        cluster_selection_method=selection
    )
    hdbscan.fit(X3)
    out_labels = np.asarray(hdbscan.model.labels_)

    # Noise carries the label -1. Count it explicitly rather than assuming it
    # is the first entry returned by np.unique: when no point is flagged as
    # noise, that first entry is a real cluster.
    noise_mask = out_labels == -1
    n_total = out_labels.size
    n_noise = int(noise_mask.sum())
    n_clustered = n_total - n_noise
    group_sizes = np.unique(out_labels[~noise_mask], return_counts=True)[1]

    results_dict['noise'].append(n_noise)  # number of instances flagged as noise
    results_dict['p_noise'].append(n_noise / n_total)  # fraction of noise
    results_dict['instances'].append(n_clustered)  # number of instances in some group
    results_dict['p_instances'].append(n_clustered / n_total)  # fraction of clustered instances

    if group_sizes.size == 0:
        # Everything is noise: keep this notebook's convention of reporting
        # the noise count as the size of the only "group".
        smallest = biggest = average = n_noise
    else:
        smallest = group_sizes.min()
        biggest = group_sizes.max()
        average = group_sizes.mean()
    results_dict['smallest_group'].append(smallest)
    results_dict['biggest_group'].append(biggest)
    results_dict['average_size'].append(average)

    # Score only the clustered points, using the same array that was fitted
    # (X3 excludes the 'area' column, which is not part of the series).
    clustered_X = X3[~noise_mask]
    clustered_labels = out_labels[~noise_mask]

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1
    # and raises otherwise; report 0 in that case, as for the all-noise case.
    if not (2 <= group_sizes.size < n_clustered):
        score = 0
    elif dist_name == 'euclidean':
        score = silhouette_score(clustered_X, clustered_labels)
    elif dist_name == 'dtw':
        score = silhouette_score(
            generate_dtw_matrix(clustered_X), clustered_labels, metric='precomputed'
        )
    else:
        score = 0

    results_dict['silhouette_score'].append(score)
    results_dict['mask'].append(3)
    results_dict['metric'].append(dist_name)
    results_dict['min_cluster_samples'].append(n_min_cluster)
    results_dict['min_samples'].append(n_min_samples)
    results_dict['cluster_selection_method'].append(selection)

## Resultados

In [21]:
# Turn the accumulated per-configuration lists into a table (one column per
# key of results_dict) and display it.
results_df = pd.DataFrame.from_dict(results_dict)
results_df

Unnamed: 0,noise,p_noise,instances,p_instances,smallest_group,biggest_group,average_size,silhouette_score,mask,metric,min_cluster_samples,min_samples,cluster_selection_method
0,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,5,3,leaf
1,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,5,3,leaf
2,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,6,3,leaf
3,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,6,3,leaf
4,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,7,3,leaf
5,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,7,3,leaf
6,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,8,3,leaf
7,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,8,3,leaf
8,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,9,3,leaf
9,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,9,3,leaf


In [39]:
# Persist the results table to disk, without writing the row index.
results_df.to_csv(path_or_buf='./hyper_params_result.csv', index=False)

In [1]:
# Read the results table back from './hyper_params_result.csv' and display it.
# NOTE(review): pandas is imported again here although it is already imported
# at the top; this cell looks like it is meant to be runnable on a fresh
# kernel without repeating the search — confirm before removing the import.
import pandas as pd
results_df = pd.read_csv('./hyper_params_result.csv')
results_df

Unnamed: 0,noise,p_noise,instances,p_instances,smallest_group,biggest_group,average_size,silhouette_score,mask,metric,min_cluster_samples,min_samples,cluster_selection_method
0,96,0.640000,54,0.360000,15,23,18.000000,0.492334,1,euclidean,5,3,leaf
1,96,0.640000,54,0.360000,15,23,18.000000,0.492334,1,euclidean,5,3,eom
2,106,0.706667,44,0.293333,10,20,14.666667,0.553571,1,euclidean,5,4,leaf
3,106,0.706667,44,0.293333,10,20,14.666667,0.553571,1,euclidean,5,4,eom
4,116,0.773333,34,0.226667,6,18,11.333333,0.601495,1,euclidean,5,5,leaf
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,137,1.000000,0,0.000000,137,137,137.000000,0.000000,3,dtw,20,3,eom
140,137,1.000000,0,0.000000,137,137,137.000000,0.000000,3,dtw,20,4,leaf
141,137,1.000000,0,0.000000,137,137,137.000000,0.000000,3,dtw,20,4,eom
142,137,1.000000,0,0.000000,137,137,137.000000,0.000000,3,dtw,20,5,leaf


In [22]:
# Subset of the results: mask 1, 'leaf' selection, min_cluster_samples below
# 20 and min_samples equal to 3.
df_x = results_df.loc[
    results_df['mask'].eq(1)
    & results_df['cluster_selection_method'].eq('leaf')
    & results_df['min_cluster_samples'].lt(20)
    & results_df['min_samples'].eq(3)
]
df_x

Unnamed: 0,noise,p_noise,instances,p_instances,smallest_group,biggest_group,average_size,silhouette_score,mask,metric,min_cluster_samples,min_samples,cluster_selection_method
0,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,5,3,leaf
1,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,5,3,leaf
2,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,6,3,leaf
3,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,6,3,leaf
4,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,7,3,leaf
5,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,7,3,leaf
6,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,8,3,leaf
7,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,8,3,leaf
8,96,0.64,54,0.36,15,23,18.0,0.492334,1,euclidean,9,3,leaf
9,81,0.54,69,0.46,14,38,23.0,0.570573,1,dtw,9,3,leaf


In [23]:
# Euclidean minus DTW: number of clustered instances, compared position by
# position in the order the rows of each metric appear in df_x.
(
    df_x.loc[df_x['metric'] == 'euclidean', 'instances'].to_numpy()
    - df_x.loc[df_x['metric'] == 'dtw', 'instances'].to_numpy()
)

array([-15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -24], dtype=int64)

In [24]:
# Euclidean minus DTW: silhouette score, compared position by position in
# the order the rows of each metric appear in df_x.
(
    df_x.loc[df_x['metric'] == 'euclidean', 'silhouette_score'].to_numpy()
    - df_x.loc[df_x['metric'] == 'dtw', 'silhouette_score'].to_numpy()
)

array([-0.07823921, -0.07823921, -0.07823921, -0.07823921, -0.07823921,
       -0.07823921, -0.07823921, -0.07823921, -0.07823921, -0.07823921,
        0.07530123])