#### In this notebook we develop a pipeline for hyperparameter tuning for UMAP + HDBSCAN.

We need to tune the following params:

UMAP:
- n_neighbors: [2, 0.25*len(df)]
- min_dist: [0, 0.99]
- n_components: [2, n_features]
- metric: [9 metrics for binary data]

HDBSCAN:
- min_cluster_size:
- min_samples: 
Note: If you wish to explore different min_cluster_size settings with a fixed min_samples value, especially for larger dataset sizes, you can cache the hard computation, and recompute only the relatively cheap flat cluster extraction using the memory parameter, which makes use of joblib
- cluster_selection_epsilon: ?
[- alpha]X
[- leaf clustering, not EOM]


##### Here we use the DBCV score, but could try others?


In [1]:
# Seed for the notebook's stochastic steps; pass it as random_state wherever one is accepted.
RANDOM_SEED = 42

In [10]:
from utilities import load_symptom_data
import hdbscan
import numpy as np
import pandas as pd
import time
import wandb

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [3]:
# Load the cleaned symptom table through the project's own loader
# (load_symptom_data lives in the local utilities module and is not shown here).
df = load_symptom_data('../data/cleaned_data_SYMPTOMS_9_13_23.csv')

In [4]:
# Number of parameter settings a search should try.
n_iter = 2

In [5]:
# random_search = RandomizedSearchCV(
#     hdb,
#     param_distributions=hyper_params,
#     n_iter=n_iter,
#     scoring=clustering_score,
#     random_state=RANDOM_SEED
# )

# grid_search = GridSearchCV(
#     hdb,
#     param_grid=hyper_params,
#     scoring=clustering_score
# )

In [6]:
# start_time = time.time()
# random_search.fit(df)
# elapsed_time = time.time() - start_time
# print("%d fits took %.1f minutes" % (n_iter, elapsed_time/60))

In [7]:
# import itertools

# hyper_params = {
#     'penalty': ['l1', 'l2'],
#     'class_weight': [None, 'balanced'],
#     'max_iter': [500, 1000, 30]
# }


# a = hyper_params.values()
# combinations = list(itertools.product(*a))

##### Trying different approach to rescue grid search!

- To get GridSearchCV to fit and return the score for the full dataset, we need to use a predefined split with one copy of the data for training and another copy for validation.
- We need to create our own scoring function with the correct signature (i.e. no need for y_true), as below.
- Need to make sure refit=False
- Need to make sure that random state is the same for each split ???

#### The following is basically working, but needs converting to HDBSCAN and trying different scoring metrics...
#### It also needs porting to scikit-optimize. Note: you need to specify the search space differently.

#### And we need to add pipeline that includes a dim reduction algo.

### Necessary to downgrade numpy to <1.24 because skopt uses np.int :/

In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import PredefinedSplit
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA



from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [12]:
# Start a Weights & Biases run for this tuning session; the handle is kept in `run`.
# NOTE(review): the project name 'test_clulster' looks like a typo of 'test_cluster'.
# Confirm before renaming, since a different string would log to a different project.
run = wandb.init(
        name='run1',
        project='test_clulster',
        config={}  # no run configuration is recorded yet
    )

[34m[1mwandb[0m: Currently logged in as: [33mrusty-chris[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Stack two copies of the data: one copy serves as the "training" fold and the other
# as the "validation" fold, so the search fits and scores on the full dataset.
# NOTE: the stacked frame keeps df's index labels, so every label appears twice.
ddf = pd.concat([df, df])

In [14]:
# Fold assignment by row position: rows of the first copy of df go to fold 0,
# rows of the second copy go to fold 1.
split = PredefinedSplit(
    [int(position >= len(df)) for position in range(len(ddf.index))]
)

In [15]:
# Fold label for each row of ddf: 0 for the first copy of df, 1 for the second.
# Built from row positions rather than from ddf.index, because pd.concat([df, df])
# repeats df's index labels, so comparing labels with len(df) cannot tell the two
# copies apart.
test_id = (np.arange(len(ddf)) >= len(df)).astype(int)

In [16]:
# Dimensionality-reduction step of the pipeline; n_components is left at its default
# here and set by the hyperparameter search.
pca = PCA()

In [17]:
# Clustering step of the pipeline; gen_min_span_tree=True asks HDBSCAN to keep the
# minimum spanning tree it builds so it is available for later analysis.
hdb = hdbscan.HDBSCAN(gen_min_span_tree=True)

In [18]:
# def cv_score(model, X):
#     model.fit(X)
#     labels = model.labels_
#     num_labels = len(set(labels))
#     if num_labels == 1:
#         raise ValueError('number of labels must be greater than 2')
#     else:
#         return silhouette_score(X, labels)

def cv_score(model, X):
    """Fit a clustering model on ``X`` and score the result with the DBCV index.

    The ``(estimator, X)`` signature lets this be passed as ``scoring`` to a
    search object when there are no ground-truth labels.

    Parameters
    ----------
    model : estimator or Pipeline
        Either a clusterer that exposes ``labels_`` after ``fit``, or a pipeline
        (any object with a non-empty ``steps`` list) whose last step is such a
        clusterer.
    X : array-like of shape (n_samples, n_features)
        Data to cluster and to score against.

    Returns
    -------
    float
        The value returned by ``hdbscan.validity.validity_index`` for ``X`` and
        the fitted labels.

    Notes
    -----
    ``model`` is refit on ``X`` inside this function, so any earlier fit is
    replaced. The index is computed on ``X`` as passed in, not on the output of
    earlier pipeline steps.
    """
    model.fit(X)

    # Accept both a Pipeline (take its final step) and a bare clusterer, instead
    # of assuming the clusterer always sits at position 1 of a pipeline.
    steps = getattr(model, "steps", None)
    clusterer = steps[-1][1] if steps else model
    labels = clusterer.labels_

    # Hand validity_index a plain float64 array so that DataFrames and narrower
    # dtypes are indexed and measured the same way as NumPy input.
    data = np.asarray(X, dtype=np.float64)
    return hdbscan.validity.validity_index(data, labels)

In [19]:
# hyper_params = {"n_clusters": [2, 5, 10, 15, 20, 30]}

# hyper_params = {
#     'min_samples': [10,50],
#     'min_cluster_size':[100],  
#     'cluster_selection_method' : ['eom'],
#     'metric' : ['manhattan'] 
# }

# hyper_params = {
#     'min_samples': Integer(10,50),
#     'min_cluster_size':Integer(100, 101),  
#     'cluster_selection_method' : Categorical(['eom']),
#     'metric' : Categorical(['manhattan'])
# }

# Search space for the PCA -> HDBSCAN pipeline. Keys use the
# '<step name>__<parameter>' form so each entry reaches the matching pipeline step.
hyper_params = {
    # Written as an explicit Integer range: skopt reads a bare two-number list such
    # as [5, 15] as (low, high) bounds, not as two discrete choices, so this samples
    # any integer from 5 to 15 exactly as before. To try a fixed set of values
    # instead (e.g. 5, 15, 30, 45, 60), use Categorical([...]).
    'pca__n_components': Integer(5, 15),
    'hdbscan__min_samples': Integer(10, 200),
    'hdbscan__min_cluster_size': Integer(100, 600),
    'hdbscan__cluster_selection_method': Categorical(['eom', 'leaf']),
    'hdbscan__metric': Categorical(['euclidean', 'manhattan']),
}

# hyper_params = {
#     'min_samples': [10,30,50,60,100],
#     'min_cluster_size':[100,200,300,400,500,600],  
#     'cluster_selection_method' : ['eom','leaf'],
#     'metric' : ['euclidean','manhattan'] 
# }

In [20]:
# PCA followed by HDBSCAN. The step names 'pca' and 'hdbscan' are the prefixes that
# the hyper_params keys refer to.
pipe = Pipeline(steps=[('pca', pca), ('hdbscan', hdb)])

In [22]:
# tunning = GridSearchCV(
#    # estimator=KMeans(n_init='auto', random_state=42),
#    estimator=hdbscan.HDBSCAN(gen_min_span_tree=True),
#    param_grid=hyper_params,
#    scoring=cv_score,
#    cv=split,
#    n_jobs=-1,
#    refit=False,
#    return_train_score=True
# )

# Bayesian hyperparameter search over the PCA -> HDBSCAN pipeline.
# - cv=split uses the predefined two-copy split, so every candidate is fitted and
#   scored on the full dataset.
# - refit=False because the search is used only for its scores and parameters.
# - n_iter and random_state come from the notebook-level settings (n_iter,
#   RANDOM_SEED), so the search budget is set in one place and reruns sample the
#   same candidates.
tunning = BayesSearchCV(
    estimator=pipe,
    search_spaces=hyper_params,
    scoring=cv_score,
    cv=split,
    n_jobs=-1,
    refit=False,
    return_train_score=True,
    n_iter=n_iter,
    random_state=RANDOM_SEED,
)




In [35]:
# TODO: add wandb logging. 
# Include labels_ and params and save to disk every X iterations... 
def wandb_callback(result):
    """Progress hook for ``tunning.fit(..., callback=...)``.

    Despite the name, nothing is sent to wandb yet; it only prints, in order:
    the number of entries in ``result['x_iters']``, the current
    ``tunning.best_score_``, and the full ``tunning.cv_results_``.

    Parameters
    ----------
    result : mapping
        Object handed over by the search; only its ``'x_iters'`` entry is read.

    Returns
    -------
    None
    """
    evaluated_so_far = len(result['x_iters'])
    print(evaluated_so_far)

    # `tunning` is the module-level search object, looked up at call time.
    search = tunning
    print(search.best_score_)
    print(search.cv_results_)
    

In [33]:
pipe.steps[1][1]

In [34]:
# Run the search on the stacked data (as a plain NumPy array) and time the whole fit.
start_time = time.time()
tunning.fit(ddf.to_numpy(), callback=wandb_callback)  # callback runs during the search
elapsed_time = time.time() - start_time
print(elapsed_time)  # wall-clock seconds

1
0.0
{'mean_fit_time': array([0.51079166, 0.46101153]), 'std_fit_time': array([0.00151241, 0.00465   ]), 'mean_score_time': array([0.49981916, 0.44877553]), 'std_score_time': array([0.00352442, 0.00155067]), 'param_hdbscan__cluster_selection_method': masked_array(data=['eom', 'eom'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_hdbscan__metric': masked_array(data=['euclidean', 'manhattan'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_hdbscan__min_cluster_size': masked_array(data=[168, 293],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_hdbscan__min_samples': masked_array(data=[159, 99],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_pca__n_components': masked_array(data=[10, 6],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [OrderedDict([('hdbscan__cluster_se

In [25]:
tunning.total_iterations

10

In [26]:
tunning.cv_results_

{'mean_fit_time': array([0.43136942, 0.64241326]),
 'std_fit_time': array([0.00223982, 0.00187862]),
 'mean_score_time': array([0.42208838, 0.62923908]),
 'std_score_time': array([0.00247312, 0.00342727]),
 'param_hdbscan__cluster_selection_method': masked_array(data=['leaf', 'eom'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_hdbscan__metric': masked_array(data=['manhattan', 'manhattan'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_hdbscan__min_cluster_size': masked_array(data=[353, 349],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_hdbscan__min_samples': masked_array(data=[22, 150],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_pca__n_components': masked_array(data=[7, 9],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [OrderedDict([('h

In [220]:
tunning.best_score_

0.0

In [221]:
tunning.best_params_

{'cluster_selection_method': 'eom',
 'metric': 'manhattan',
 'min_cluster_size': 100,
 'min_samples': 10}

In [189]:
# best = KMeans(n_init='auto',random_state=42, **tunning.best_params_).fit(df)
# Baseline: a two-cluster KMeans fitted on the un-duplicated data, seeded from the
# notebook-wide RANDOM_SEED rather than a repeated literal.
best = KMeans(n_init='auto', random_state=RANDOM_SEED, n_clusters=2).fit(df)

In [190]:
cv_score(best, df)

0.11371897603114886