In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor as LOF
import optuna
# For CPU version of pipeline
from sklearn.metrics import silhouette_score
from sklearn.cluster import HDBSCAN
import umap

In [None]:
def write_study_csv(study, filename):
    """
    Write metric values and params from optuna study to csv file.

    The file gets one row per trial with two columns: "value" (the objective
    value of the trial) and "params" (the dict of sampled hyperparameters,
    written as its string representation).

    params:
        study: Study - optimized optuna study
        filename: str - path of the csv file to create
    """
    # Use the study that was passed in, not a notebook-level global, so that
    # every search strategy saves its own trials.
    # deepcopy=False: the trials are only read here, no copy is needed.
    trials = study.get_trials(deepcopy=False)
    df = pd.DataFrame({
        "value": [trial.value for trial in trials],
        "params": [trial.params for trial in trials],
    })
    df.to_csv(filename)

# Data loading
def load_data(filename):
    """
    Read a csv file into a DataFrame and return it without the 'id' column.
    """
    frame = pd.read_csv(filename)
    return frame.drop(columns='id')

def load_less_data(filename, frac):
    """
    Load the csv through load_data and return a random fraction of its rows.

    The sample is drawn with a fixed random_state (42), so the same rows are
    selected on every call with the same file and frac.
    """
    return load_data(filename).sample(frac=frac, random_state=42)

def find_max_value(csv_file):
    """
    Return the largest entry of the 'value' column of a saved study csv.
    """
    return pd.read_csv(csv_file)['value'].max()

def calculate_statistics(numbers):
    """
    Return (median, range) of a list of values, where range is max - min.
    """
    return np.median(numbers), np.ptp(numbers)

In [None]:
class PipelineCPU:
    """
    CPU implementation of the pipeline: UMAP -> LOF -> HDBSCAN.

    UMAP reduces the dimensionality of the input, LOF flags outliers in the
    reduced space and HDBSCAN clusters the points that remain.
    """
    def __init__(
        self,
        umap_n_components, umap_n_neighbors, umap_min_dist, umap_metric,
        lof_n_neighbors, lof_metric,
        hdb_min_cluster_size, hdb_metric, hdb_metric_params,
        random_state=None,
    ):
        """
        Build the three estimators of the pipeline.
        params:
            umap_n_components, umap_n_neighbors, umap_min_dist, umap_metric -
                passed to umap.UMAP as n_components, n_neighbors, min_dist, metric
            lof_n_neighbors, lof_metric - passed to LocalOutlierFactor
            hdb_min_cluster_size, hdb_metric, hdb_metric_params - passed to HDBSCAN
            random_state - optional seed passed to umap.UMAP so the embedding
                can be reproduced; the default None leaves UMAP unseeded
        """
        self.umap_ = umap.UMAP(
            n_components=umap_n_components,
            n_neighbors=umap_n_neighbors,
            min_dist=umap_min_dist,
            metric=umap_metric,
            random_state=random_state,
        )
        self.lof = LOF(
            n_neighbors=lof_n_neighbors,
            metric=lof_metric
        )
        self.hdb = HDBSCAN(
            min_cluster_size=hdb_min_cluster_size,
            metric=hdb_metric,
            metric_params=hdb_metric_params
        )

    def _reduce_and_flag(self, data):
        """
        Shared first stage: embed data with UMAP and run LOF on the embedding.
        Returns the reduced data and a boolean mask of the rows LOF kept.
        """
        reduced_data = self.umap_.fit_transform(data)
        # LOF.fit_predict marks inliers with 1 and outliers with -1
        inliers = self.lof.fit_predict(reduced_data) == 1
        return reduced_data, inliers

    def fit_predict(self, data):
        """
        Returns full reduced data and full labels for it.

        Label convention: -2 for rows rejected by LOF, otherwise the label
        HDBSCAN assigned to the row (-1 is HDBSCAN's noise label).
        """
        reduced_data, inliers = self._reduce_and_flag(data)
        # Start with every row marked as a LOF outlier, then overwrite the
        # rows LOF kept with their HDBSCAN labels.
        full_labels = np.full(len(reduced_data), -2)
        full_labels[inliers] = self.hdb.fit_predict(reduced_data[inliers])
        return reduced_data, full_labels

    def fit_predict_without_outliers(self, data):
        """
        Returns clean reduced data and clean labels. All outliers deleted:
        both the rows rejected by LOF and the rows HDBSCAN labelled -1.
        """
        reduced_data, inliers = self._reduce_and_flag(data)
        reduced_clean_data = reduced_data[inliers]
        labels = self.hdb.fit_predict(reduced_clean_data)
        clustered = labels != -1
        return reduced_clean_data[clustered], labels[clustered]

    def fit_predict_without_lof(self, data):
        """
        Variant of the pipeline without LOF: UMAP -> HDBSCAN.
        Returns clean reduced data and clean labels; rows HDBSCAN labelled -1
        are deleted.
        """
        reduced_data = self.umap_.fit_transform(data)
        labels = self.hdb.fit_predict(reduced_data)
        clustered = labels != -1
        return reduced_data[clustered], labels[clustered]

In [None]:
def objectiveCPU(trial):
    """
    CPU version of the objective that the optuna studies maximize.

    Suggests UMAP and LOF hyperparameters from the trial, clusters the data
    with PipelineCPU (outliers removed) and returns the silhouette score of
    the result, or 0 when at most one cluster is left.

    WARNING: reads the notebook-level global_data so the dataset is not read
    again on every trial.
    """
    global global_data

    # Hyperparameter choice
    n_components = trial.suggest_int("umap_n_components", 2, 50)
    n_neighbors = trial.suggest_int("umap_n_neighbors", 100, 2000)
    min_dist = trial.suggest_float("umap_min_dist", 0.0, 0.25, step=0.01)
    lof_neighbors = trial.suggest_int("lof_n_neighbors", 10, 1000)

    # Pipeline creation: metrics and the HDBSCAN settings are fixed
    pipeline = PipelineCPU(
        umap_n_components=n_components,
        umap_n_neighbors=n_neighbors,
        umap_min_dist=min_dist,
        umap_metric="cosine",
        lof_n_neighbors=lof_neighbors,
        lof_metric='minkowski',
        hdb_min_cluster_size=500,
        hdb_metric='l2',
        hdb_metric_params=None,
    )

    # Clustering with the created pipeline
    embedding, cluster_ids = pipeline.fit_predict_without_outliers(global_data)

    # The silhouette score needs at least two clusters
    if len(np.unique(cluster_ids)) <= 1:
        return 0
    return silhouette_score(embedding, cluster_ids)


In [None]:
%%time
# Bayesian search (Most efficient)
# Uses the TPE sampler on a 10% sample of the dataset.
global_data = load_less_data("photos_clip.csv", 0.1)
n_trials = 10
filename = "bayesian_study.csv"

bayesian_search_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(),
)
# catch=[Exception]: the study keeps running when a trial raises
bayesian_search_study.optimize(
    objectiveCPU,
    n_trials=n_trials,
    n_jobs=1,
    show_progress_bar=True,
    catch=[Exception],
)

write_study_csv(bayesian_search_study, filename)

print(f"Best found metric value = {bayesian_search_study.best_value}")
print(f"Best found params: {bayesian_search_study.best_params}")

In [None]:
%%time
# Random search
# Uses the random sampler on the full dataset.
global_data = load_data("photos_clip.csv")
n_trials = 100
filename = "random_study.csv"

random_search_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.RandomSampler(),
)
# catch=[Exception]: the study keeps running when a trial raises
random_search_study.optimize(
    objectiveCPU,
    n_trials=n_trials,
    n_jobs=1,
    show_progress_bar=True,
    catch=[Exception],
)

write_study_csv(random_search_study, filename)

print(f"Best found metric value = {random_search_study.best_value}")
print(f"Best found params: {random_search_study.best_params}")

In [None]:
%%time
# Evolution search
# Uses optuna's genetic-algorithm sampler (NSGA-II). The cell previously
# created a TPESampler, which made this study a repeat of the Bayesian search.
# NOTE(review): optuna.samplers.CmaEsSampler is another evolutionary option,
# but it needs the extra `cmaes` package - confirm which one was intended.
global_data = load_data("photos_clip.csv")
n_trials = 100
filename = "evolution_study.csv"

evolution_search_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.NSGAIISampler(),
)
# catch=[Exception]: the study keeps running when a trial raises
evolution_search_study.optimize(
    objectiveCPU,
    n_trials=n_trials,
    n_jobs=1,
    show_progress_bar=True,
    catch=[Exception],
)

write_study_csv(evolution_search_study, filename)

print(f"Best found metric value = {evolution_search_study.best_value}")
print(f"Best found params: {evolution_search_study.best_params}")

In [None]:
%%time
# Grid Search
# Uses the grid sampler on the full dataset.

global_data = load_data("photos_clip.csv")
n_trials = 100
filename = "grid_study.csv"

# Every grid value must be a value the objective can suggest.
# objectiveCPU samples umap_min_dist with step=0.01, so the former grid value
# 0.025 was off that lattice and made GridSampler warn about an out-of-range
# value; it is replaced by 0.03.
# The grid holds 7 * 6 * 4 * 5 = 840 combinations, so n_trials = 100 evaluates
# only part of it.
search_space = {
    "umap_n_components": [2, 5, 10, 15, 20, 35, 50],
    "umap_n_neighbors": [100, 250, 500, 1000, 1500, 2000],
    "umap_min_dist": [0.0, 0.03, 0.1, 0.25],
    "lof_n_neighbors": [10, 100, 250, 500, 1000]
}

grid_search_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.GridSampler(search_space),
)
# catch=[Exception]: the study keeps running when a trial raises
grid_search_study.optimize(
    objectiveCPU,
    n_trials=n_trials,
    n_jobs=1,
    show_progress_bar=True,
    catch=[Exception],
)

write_study_csv(grid_search_study, filename)

print(f"Best found metric value = {grid_search_study.best_value}")
print(f"Best found params: {grid_search_study.best_params}")