#### In this notebook we develop a pipeline for hyperparameter tuning for UMAP + HDBSCAN.

We need to tune the following params:

UMAP:
- n_neighbors: [2, 0.25*len(df)]
- min_dist: [0, 0.99]
- n_components: [2, n_features]
- metric: [9 metrics for binary data]

HDBSCAN:
- min_cluster_size:
- min_samples: 
Note: If you wish to explore different min_cluster_size settings with a fixed min_samples value, especially for larger dataset sizes, you can cache the hard computation, and recompute only the relatively cheap flat cluster extraction using the memory parameter, which makes use of joblib
- cluster_selection_epsilon: ?
[- alpha]X
[- leaf clustering, not EOM]


##### Here we use the DBCV score, but could try others?


In [1]:
RANDOM_SEED = 42

In [2]:
from utilities import load_symptom_data
import hdbscan
import numpy as np
import pandas as pd
import time
import wandb

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [3]:
df = load_symptom_data('../data/cleaned_data_SYMPTOMS_9_13_23.csv')

##### Trying different approach to rescue grid search!

- To get GridSearchCV to fit and return the score for the full dataset, we need to use a predefined split with one copy of the data fro training and another copy for validation.
- We need to create our own scoring function with the correct signature (i.e. no need for y_true), as below.
- Need to make sure refit=False
- Need to make sure that random state is the same for each split ???


### Questions: should DVBC score use local value of 'metric' - problematic for comparing across different runs...


In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.model_selection import PredefinedSplit
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA


from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [5]:
run = wandb.init(
        name='run_test_hdb_dbcv',
        project='test_clulster',
        config={}
    )

[34m[1mwandb[0m: Currently logged in as: [33mrusty-chris[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
ddf = pd.concat([df, df])

In [7]:
split = PredefinedSplit([0 if i < len(df) else 1 for i in range(len(ddf.index))])

In [8]:
test_id = np.array([0 if i < len(df) else 1 for i in ddf.index])

In [9]:
pca = PCA(random_state=42)

In [10]:
hdb = hdbscan.HDBSCAN(gen_min_span_tree=True, core_dist_n_jobs=4)

In [11]:
kmeans = KMeans(n_init='auto', random_state=42)

##### Notes: 
- getting overflow in this version of DBCV when distances are small. Is there another implementation we can use?
- this code may not work for n_jobs!=1 because of the way we obtain the iterations number from the length of the otpimisation result.
- have to downgrade to skopt==0.8.1 and sklearn=0.24.2 for correct behaviour with best_scores_ (and other features)?
- these scores use model.steps[1][1].labels_ instead of model.steps.labels_ because they are accessing the clustering model which is the second step in the pipeline.

In [12]:
def dbcv(data, labels, metric='euclidean'):
    
    if metric == None:
        metric = model.steps[1][1].get_params()['metric']
        
    return hdbscan.validity.validity_index(
            data, labels,
            metric=metric
        )

def dbcv_manhattan(data, labels):
    return dbcv(data, labels, metric='manhattan')

def silhouette(data, labels):
    num_labels = len(set(labels))
    if num_labels == 1:
        print("Warning: Valid number of clusters must be 2 or more.")
        return 0
    else:
        return silhouette_score(data, labels)

def calinski_harabasz(data, labels):
    num_labels = len(set(labels))
    if num_labels == 1:
        print("Warning: Valid number of clusters must be 2 or more.")
        return 0
    else:
        return calinski_harabasz_score(data, labels)

def davies_bouldin(data, labels):
    """
    Note: 0 is best. If using for CV need to use complement.
    """
    num_labels = len(set(labels))
    if num_labels == 1:
        print("Warning: Valid number of clusters must be 2 or more.")
        return 1
    else:
        return davies_bouldin_score(data, labels)

def cv_score(model, X, score='dbcv'):
    """
    If score == 'all' we return a dictionary of all scores, which
    can be logged to wandb on each iteration. 

    Otherwise this is intended for use as a scorer in <X>SearchCV methods.
    In that case metric should be fixed to allow comparison across different runs.
    """
    score_dict = {
        'silhouette': silhouette,
        'dbcv': dbcv,
        'calinski_harabasz': calinski_harabasz,
        'davies_bouldin': davies_bouldin
    }
    
    model.fit(X)
    labels = model.steps[1][1].labels_
    data = model.steps[0][1].transform(X)

    if score == 'all':
        return {
            score_name: score_func(data, labels) 
            for score_name, score_func in score_dict.items()
        }
    else:
        return score_dict[score](data, labels)

In [20]:
hyper_params = {
    'pca__n_components': Integer(5, 150),
    'hdbscan__cluster_selection_epsilon' : Real(0.0, 100.0),
    'hdbscan__cluster_selection_method' : Categorical(['eom', 'leaf']),
    'hdbscan__metric' : Categorical(['euclidean', 'manhattan']),
    'hdbscan__min_cluster_size':Integer(10, 2000),  
    'hdbscan__min_samples': Integer(1,1000),
    
}

# hyper_params = {
#     'pca__n_components': [5, 15],#, 30, 45, 60],
#     'kmeans__n_clusters': Integer(2, 20)
# }

In [21]:
pipe = Pipeline(steps=[('pca', pca), ('hdbscan', hdb)])
# pipe = Pipeline(steps=[('pca', pca), ('kmeans', kmeans)])

In [22]:
tunning = BayesSearchCV(
   estimator=pipe,
   search_spaces=hyper_params,
   scoring=cv_score,
   cv=split,
   n_jobs=-1,
   refit=False,
   return_train_score=True,
   n_iter=1
)

In [31]:
sorted(hyper_params.keys())

['hdbscan__cluster_selection_epsilon',
 'hdbscan__cluster_selection_method',
 'hdbscan__metric',
 'hdbscan__min_cluster_size',
 'hdbscan__min_samples',
 'pca__n_components']

In [27]:
tunning.get_params()

{'cv': PredefinedSplit(test_fold=array([0, 0, ..., 1, 1])),
 'error_score': 'raise',
 'estimator__memory': None,
 'estimator__steps': [('pca', PCA(random_state=42)),
  ('hdbscan', HDBSCAN(gen_min_span_tree=True))],
 'estimator__verbose': False,
 'estimator__pca': PCA(random_state=42),
 'estimator__hdbscan': HDBSCAN(gen_min_span_tree=True),
 'estimator__pca__copy': True,
 'estimator__pca__iterated_power': 'auto',
 'estimator__pca__n_components': None,
 'estimator__pca__n_oversamples': 10,
 'estimator__pca__power_iteration_normalizer': 'auto',
 'estimator__pca__random_state': 42,
 'estimator__pca__svd_solver': 'auto',
 'estimator__pca__tol': 0.0,
 'estimator__pca__whiten': False,
 'estimator__hdbscan__algorithm': 'best',
 'estimator__hdbscan__allow_single_cluster': False,
 'estimator__hdbscan__alpha': 1.0,
 'estimator__hdbscan__approx_min_span_tree': True,
 'estimator__hdbscan__cluster_selection_epsilon': 0.0,
 'estimator__hdbscan__cluster_selection_method': 'eom',
 'estimator__hdbscan__

In [128]:
tunning.total_iterations

6

In [24]:
def wandb_callback(result):
    iter = len(result['x_iters'])
    print('Iteration %d' %iter)
    print(result)
    assert False
    current_params = dict(zip(
        hyper_params.keys(), 
        result['x_iters'][-1]
    ))
    pipe.set_params(**current_params)
    all_scores = cv_score(pipe, df, score='all')
    
    log_dict = {
        'best_score': -result['fun'],
        'best_params': result['x'],
        'current_params': current_params
    }
    for key in all_scores.keys():
        log_dict[key] = all_scores[key]

    run.log(log_dict)
    print(log_dict)

In [25]:
start_time = time.time()
tunning.fit(ddf.to_numpy(), callback=wandb_callback)
elapsed_time = time.time() - start_time
print(elapsed_time)

Iteration 1
          fun: -0.0
            x: [6.450946819875848, 'leaf', 'euclidean', 1706, 981, 126]
    func_vals: [-0.000e+00]
      x_iters: [[6.450946819875848, 'leaf', 'euclidean', 1706, 981, 126]]
       models: []
        space: Space([Real(low=0.0, high=100.0, prior='uniform', transform='normalize'),
                      Categorical(categories=('eom', 'leaf'), prior=None),
                      Categorical(categories=('euclidean', 'manhattan'), prior=None),
                      Integer(low=10, high=2000, prior='uniform', transform='normalize'),
                      Integer(low=1, high=1000, prior='uniform', transform='normalize'),
                      Integer(low=5, high=150, prior='uniform', transform='normalize')])
 random_state: RandomState(MT19937)
        specs:     args:              dimensions: [Real(low=0.0, high=100.0, prior='uniform', transform='normalize'), Categorical(categories=('eom', 'leaf'), prior=None), Categorical(categories=('euclidean', 'manhattan'), 

AssertionError: 

In [132]:
def cv_results_sanity_check(pipe, df, cv_results):

    bs = tunning.best_score_
    bp = tunning.best_params_
    
    pipe.set_params(**bp)

    try:
        assert bs == cv_score(pipe, df.to_numpy())
    except:
        print(bs, cv_score(pipe, df.to_numpy()))
    bid = np.where(tunning.cv_results_['mean_test_score'] == bs)[0][0]

    assert bp == tunning.cv_results_['params'][bid]
    assert bs == tunning.cv_results_['split0_test_score'][bid]
    assert bs == tunning.cv_results_['split1_test_score'][bid]
    assert bs == tunning.cv_results_['split0_train_score'][bid]
    assert bs == tunning.cv_results_['split1_train_score'][bid]

    for i, s in enumerate(tunning.cv_results_['split0_test_score']):
        assert (
            s == tunning.cv_results_['split1_test_score'][i]
        )

    print("These search results passed all sanity checks. They are deterministic and consistent. :)")

In [133]:
cv_results_sanity_check(pipe, df, tunning.cv_results_)

These search results passed all sanity checks. They are deterministic and consistent. :)


In [134]:
cv_score(pipe, df.to_numpy())

0