In [1]:
import pandas as pd
import numpy as np
import sklearn
import optuna
import cuml
import cudf
from torchmetrics.clustering import DunnIndex
import torch
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
import os
import csv

In [2]:
def load_data_sample(filename, days=365):
    """Load the dataset CSV and keep rows whose ``photo2test_*`` values are small.

    A row is kept only when the absolute values of both ``photo2test_bf``
    and ``photo2test_temper`` are less than or equal to ``days``.

    Parameters
    ----------
    filename : str or path-like
        CSV file passed to ``pandas.read_csv``. It must contain the columns
        ``id``, ``photo2test_bf`` and ``photo2test_temper``.
    days : int or float, default 365
        Inclusive upper bound applied to the absolute value of both
        ``photo2test_*`` columns (presumably expressed in days, judging by
        the name -- confirm against the data dictionary).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected rows (original index labels are
        kept) with the ``id`` column removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    data = pd.read_csv(filename)
    within_window = (
        (data['photo2test_bf'].abs() <= days)
        & (data['photo2test_temper'].abs() <= days)
    )
    # ``drop`` already returns a new frame, so no ``copy()``/``inplace`` is
    # needed and the frame read from disk is never mutated.
    return data.loc[within_window].drop(columns='id')

def calculate_metrics(X, labels):
    """Score a clustering with four internal validity indices.

    Parameters
    ----------
    X : DataFrame-like
        Feature matrix handed to the cuML / scikit-learn metric functions;
        it must also provide ``to_numpy()`` for the Dunn index.
    labels : Series-like
        Cluster assignment for every row of ``X``; it must also provide
        ``to_numpy()``.

    Returns
    -------
    tuple
        ``(DI, SI, DBI, CHI)``: the Dunn index (converted to a Python
        scalar via ``.item()``), the cuML silhouette score, and the
        scikit-learn Davies-Bouldin and Calinski-Harabasz scores.
    """
    silhouette = cuml.metrics.cluster.silhouette_score(X, labels)
    davies_bouldin = sklearn.metrics.davies_bouldin_score(X, labels)
    calinski_harabasz = sklearn.metrics.calinski_harabasz_score(X, labels)

    # torchmetrics works on tensors, so both inputs go through NumPy first.
    points = torch.from_numpy(X.to_numpy())
    assignments = torch.from_numpy(labels.to_numpy())
    dunn = DunnIndex(p=2)(points, assignments)

    return dunn.item(), silhouette, davies_bouldin, calinski_harabasz

In [3]:
def get_data(df):
    """Return a copy of ``df`` without the identifier and metadata columns.

    The columns ``vk_id``, ``male``, ``photo2test_temper`` and
    ``photo2test_bf`` are removed; every other column is kept as-is
    (presumably the CLIP feature columns, judging by the caller's variable
    name -- confirm against the dataset).

    Raises
    ------
    KeyError
        If any of the four columns is absent from ``df``.
    """
    non_feature_columns = ['vk_id', 'male', 'photo2test_temper', 'photo2test_bf']
    return df.drop(columns=non_feature_columns)

In [4]:
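# NOTE: save_trial() reads the module-level globals LOG_FILENAME and fieldnames;
# both must be defined before save_trial() is called.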

def init_logfile(filename, fieldnames):
    """Create the CSV log file with a header row unless it already exists.

    Parameters
    ----------
    filename : str
        Path of the CSV log. Missing parent directories are created, so a
        path such as ``logs/run.csv`` works on a fresh checkout.
    fieldnames : sequence of str
        Column names written as the header row.

    Notes
    -----
    An existing file is left untouched, so rows logged by earlier runs are
    preserved.
    """
    if os.path.exists(filename):
        return

    # open(..., "w") does not create intermediate directories; without this
    # the call fails with FileNotFoundError when e.g. ``logs/`` is absent.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(filename, mode="w", newline="") as csvfile:
        csv.DictWriter(csvfile, fieldnames=fieldnames).writeheader()
    
def save_trial(metrics, params, labels, filename=None, columns=None):
    """Append one trial (metrics, hyperparameters, labels) to the CSV log.

    Parameters
    ----------
    metrics : sequence
        Indexed as ``metrics[0..3]`` and stored under the columns ``DI``,
        ``SI``, ``DBI`` and ``CHI`` respectively.
    params : mapping
        Hyperparameter name -> value; each key becomes a column of the row.
    labels : object
        Stored in the ``labels`` column (written through ``str()`` by the
        ``csv`` module).
    filename : str, optional
        Log file to append to. Defaults to the module-level ``LOG_FILENAME``.
    columns : sequence of str, optional
        Column order of the log file. Defaults to the module-level
        ``fieldnames``.

    Raises
    ------
    NameError
        If a default is needed but the corresponding global is not defined.
    ValueError
        If the row contains a key that is not listed in ``columns``
        (``csv.DictWriter`` default behaviour).
    """
    # The globals are resolved at call time (not at definition time) because
    # the notebook assigns them in a later cell.
    if filename is None:
        filename = LOG_FILENAME
    if columns is None:
        columns = fieldnames

    row = {
        'DI': metrics[0],
        'SI': metrics[1],
        'DBI': metrics[2],
        'CHI': metrics[3],
        'labels': labels,
    }
    row.update(params)

    with open(filename, mode="a", newline="") as csvfile:
        csv.DictWriter(csvfile, columns).writerow(row)

In [5]:
class Pipeline:
    """Two-stage pipeline: dimensionality reduction followed by clustering.

    Parameters
    ----------
    umap : object
        Reducer exposing ``fit_transform(data)``; stored as ``umap_``.
    clust_method : object
        Clusterer exposing ``fit_predict(data)``; stored as ``clust_method``.
    """

    def __init__(self, umap, clust_method):
        self.umap_, self.clust_method = umap, clust_method

    def fit_predict(self, data):
        """Reduce ``data``, cluster the reduced data, and return both results.

        Returns
        -------
        tuple
            ``(reduced_data, labels)`` -- the reducer's output and the
            clusterer's labels computed on that output.
        """
        embedding = self.umap_.fit_transform(data)
        return embedding, self.clust_method.fit_predict(embedding)

In [6]:
def objectiveGPU(trial):
    """Optuna objective: cuML UMAP reduction followed by agglomerative clustering.

    Four hyperparameters are sampled from ``trial``, the resulting pipeline
    is fitted on the module-level ``clip_data``, the outcome is appended to
    the CSV log through ``save_trial`` and the metrics are returned to Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    tuple
        Whatever ``calculate_metrics`` returns for the reduced data and the
        labels, i.e. ``(DI, SI, DBI, CHI)``.
    """
    # Hyperparameter choice. The suggest_* calls run in the order the keys are
    # written, and the same dict is what gets logged for the trial.
    params = {
        'umap_n_components': trial.suggest_int("umap_n_components", 2, 50),
        'umap_n_neighbors': trial.suggest_int("umap_n_neighbors", 100, 2000),
        'umap_min_dist': trial.suggest_float("umap_min_dist", 0.0, 0.25, step=0.005),
        'agglomerative_n_clusters': trial.suggest_int("agglomerative_n_clusters", 2, 200),
    }

    # Dimensionality reduction stage (fixed random_state).
    reducer = cuml.UMAP(
        n_components=params['umap_n_components'],
        n_neighbors=params['umap_n_neighbors'],
        min_dist=params['umap_min_dist'],
        metric='cosine',
        random_state=42,
    )

    # Clustering stage.
    clusterer = cuml.AgglomerativeClustering(
        n_clusters=params['agglomerative_n_clusters'],
    )

    # Cluster the module-level ``clip_data`` (only read here, never rebound,
    # so no ``global`` declaration is required).
    reduced_data, labels = Pipeline(reducer, clusterer).fit_predict(clip_data)

    metrics = calculate_metrics(reduced_data, labels)
    save_trial(metrics, params, labels.tolist())
    return metrics



In [None]:
# Bayesian (TPE) search over the UMAP + agglomerative clustering pipeline.
# objectiveGPU reads the module-level `clip_data`, and save_trial reads the
# module-level `LOG_FILENAME` / `fieldnames`, so all three are assigned first.
raw_data = load_data_sample('20250520_df_ivp_set.csv')
clip_data = get_data(raw_data)

LOG_FILENAME = 'logs/agglomerative_bayesian.csv'
fieldnames = ['DI', 'SI', 'DBI', 'CHI', 'umap_n_components', 'umap_n_neighbors', 'umap_min_dist', 'agglomerative_n_clusters', 'labels']
init_logfile(LOG_FILENAME, fieldnames)

n_trials = 2000

bayesian_search_study = optuna.create_study(
    # One direction per value returned by objectiveGPU, in the order
    # (DI, SI, DBI, CHI): only the Davies-Bouldin index is minimized.
    directions=["maximize", "maximize", "minimize", "maximize"],
    # Seeded so the sequence of suggested hyperparameters is repeatable
    # (UMAP itself is already built with random_state=42).
    sampler=optuna.samplers.TPESampler(seed=42)
)

bayesian_search_study.optimize(
    objectiveGPU,
    n_trials=n_trials,
    n_jobs=1,
    show_progress_bar=True,
    # A trial that raises is recorded as failed and the study carries on.
    catch=[Exception]
)

In [None]:
# Exhaustive grid search over the UMAP + agglomerative clustering pipeline.
# objectiveGPU reads the module-level `clip_data`, and save_trial reads the
# module-level `LOG_FILENAME` / `fieldnames`, so all three are assigned first.
raw_data = load_data_sample('20250520_df_ivp_set.csv')
clip_data = get_data(raw_data)

LOG_FILENAME = 'logs/agglomerative_grid.csv'
fieldnames = ['DI', 'SI', 'DBI', 'CHI', 'umap_n_components', 'umap_n_neighbors', 'umap_min_dist', 'agglomerative_n_clusters', 'labels']
init_logfile(LOG_FILENAME, fieldnames)

search_space = {
    "umap_n_components": [2, 3, 4, 5, 7, 10, 15, 20, 35, 50],
    "umap_n_neighbors": [100, 250, 500, 1000, 1500, 2000],
    "umap_min_dist": [0.0, 0.025, 0.1, 0.25],
    "agglomerative_n_clusters": [2, 3, 5, 8, 10, 25, 50, 75, 100, 150, 200]
}

# One trial per grid point: 10 * 6 * 4 * 11 = 2640 combinations. The previous
# hard-coded 2400 left 240 combinations unevaluated. Deriving the count keeps
# it correct whenever the grid above is edited.
n_trials = int(np.prod([len(values) for values in search_space.values()]))

grid_search_study = optuna.create_study(
    # One direction per value returned by objectiveGPU, in the order
    # (DI, SI, DBI, CHI): only the Davies-Bouldin index is minimized.
    directions=["maximize", "maximize", "minimize", "maximize"],
    sampler=optuna.samplers.GridSampler(search_space)
)

grid_search_study.optimize(
    objectiveGPU,
    n_trials=n_trials,
    n_jobs=1,
    show_progress_bar=True,
    # A trial that raises is recorded as failed and the study carries on.
    catch=[Exception]
)