In [18]:
import os
import sys
import warnings
import uuid
from collections import defaultdict
from tqdm import tqdm
import numpy as np # Can't install NumPy 2.2.2 which is what the pkls were saved with
import pandas as pd # 'v2.2.3'
# import anndata as ad

import optuna
import joblib
import pacmap

from scipy.spatial.distance import (
    pdist, 
    squareform,
)

from sklearn.metrics import (
    pairwise_distances,
    silhouette_score, 
)

from datafold.dynfold import (
    DiffusionMaps, 
    Roseland,
)

from pyexeggutor import (
    build_logger,
    write_pickle,
    # read_pickle,
    format_header,
)


# Metabolic Niche Space
# from metabolic_niche_space.manifold import GroupedNicheSpace
from metabolic_niche_space.neighbors import KNeighborsKernel
# from .utils import (
#     fast_groupby,
#     compile_parameter_space,
#     stop_when_exceeding_trials,
# )



%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [2]:
%%time
# Data
df_quality = pd.read_csv("../data/quality.tsv.gz",sep="\t", index_col=0)

quality_label="completeness_gte90.contamination_lt5"
# quality_label="completeness_gte50.contamination_lt10"
output_directory=f"../data/cluster/mfc/{quality_label}"
os.makedirs(output_directory, exist_ok=True)

genome_to_clusterani = pd.read_csv(f"../data/training/{quality_label}/y.tsv.gz", sep="\t", index_col=0, header=None).iloc[:,0].astype("category")
X_genomic_traits = pd.read_csv(f"../data/training/{quality_label}/X.tsv.gz", sep="\t", index_col=0).astype(bool)
X_genomic_traits_clusterani = pd.read_csv(f"../data/training/{quality_label}/X_grouped.tsv.gz", sep="\t", index_col=0).astype(bool)
eukaryotes = read_list(f"../data/cluster/ani/eukaryotic/{quality_label}/organisms.list", set)
prokaryotes = read_list(f"../data/cluster/ani/prokaryotic/{quality_label}/organisms.list", set)

genome_to_taxonomy = pd.read_csv("../data/taxonomy.tsv.gz", sep="\t", index_col=0).iloc[:,0]
clusterani_to_taxonomy = pd.read_csv("../data/cluster/ani/cluster-ani_to_taxonomy.tsv.gz", sep="\t", index_col=0, header=None).iloc[:,0]
df_meta_mfc__genomes = pd.read_csv(f"../data/cluster/mfc/{quality_label}/identifier_mapping.mfc.genomes.with_openai.tsv.gz", sep="\t", index_col=0)
df_meta_mfc__slc = pd.read_csv(f"../data/cluster/mfc/{quality_label}/identifier_mapping.mfc.genome_clusters.with_openai.tsv.gz", sep="\t", index_col=0)

X_genomic_traits_mfc = X_genomic_traits_clusterani.groupby(df_meta_mfc__slc["id_cluster-mfc"]).sum() > 0
df_kegg = pd.read_csv("/home/ec2-user/SageMaker/s3/newatlantis-raw-veba-db-prod/VDB_v8.1/Annotate/KOfam/kegg-ortholog_metadata.tsv", sep="\t", index_col=0)
ko_to_description = df_kegg["definition"]

print("Number of genomes: {}, Number of features: {}, Number of SLCs: {}".format(*X_genomic_traits.shape, X_genomic_traits_clusterani.shape[0]))
# Number of genomes: 20377, Number of features: 2124, Number of SLCs: 6719

# CPU times: user 3.41 s, sys: 87.8 ms, total: 3.5 s
# Wall time: 3.5 s

Number of genomes: 20377, Number of features: 2124, Number of SLCs: 6719
CPU times: user 3.71 s, sys: 306 ms, total: 4.01 s
Wall time: 4.55 s


In [37]:
# Keep only genomes that carry an MFC cluster assignment, and align the trait
# matrix (X), ANI-cluster labels (y1) and MFC-cluster labels (y2) on that set.
X = X_genomic_traits
print(X.shape)

all_genomes = X.index
y1 = genome_to_clusterani.loc[all_genomes]
# Genomes without an MFC cluster are dropped here; everything else follows y2's index.
y2 = df_meta_mfc__genomes.loc[all_genomes, "id_cluster-mfc"].dropna()

genomes_with_mfc = y2.index
y1 = y1.loc[genomes_with_mfc]
X = X.loc[genomes_with_mfc]

# Both label vectors must be complete after the alignment.
assert y1.notnull().all()
assert y2.notnull().all()
print(X.shape)

# genomes_with_mfc = df_meta_mfc__genomes.index[df_meta_mfc__genomes["id_cluster-mfc"].notnull()]
# X = X_genomic_traits.loc[genomes_with_mfc]
# y = df_meta_mfc__genomes.loc[genomes_with_mfc]["id_cluster-mfc"]
# n, m = X.shape

(20377, 2124)
(20355, 2124)


2025-02-18 19:09:15,020 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [Start] Filtering observations and classes below feature threshold: 1
2025-02-18 19:09:15,022 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [Dropping] N = 0 observations
2025-02-18 19:09:15,024 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [Remaining] N = [2, 0, 1]
Categories (3, int64): [0, 1, 2] classes
2025-02-18 19:09:15,024 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [Remaining] N = 1000 observations
2025-02-18 19:09:15,025 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [Remaining] N = 100 features
2025-02-18 19:09:15,025 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [End] Filtering observations and classes below feature threshold
2025-02-18 19:09:15,026 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [Start] Processing distance matrix
2025-02-18 19:09:15,173 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [End] Processing distance matrix
2025-02-18 19:09:15,174 - 4a62c720-c573-426e-83ea-9b855e130bf

[I 2025-02-18 19:09:15,176] A new study created in memory with name: 4a62c720-c573-426e-83ea-9b855e130bf1
[I 2025-02-18 19:09:16,136] Trial 0 finished with value: 0.004204608013938112 and parameters: {'n_neighbors': 59, 'n_components': 75, 'alpha': 0.6027633760716439}. Best is trial 0 with value: 0.004204608013938112.
[I 2025-02-18 19:09:17,046] Trial 1 finished with value: 0.011760174491523643 and parameters: {'n_neighbors': 59, 'n_components': 48, 'alpha': 0.6458941130666561}. Best is trial 1 with value: 0.011760174491523643.
[I 2025-02-18 19:09:18,215] Trial 2 finished with value: 0.002436184049775515 and parameters: {'n_neighbors': 49, 'n_components': 91, 'alpha': 0.9636627605010293}. Best is trial 1 with value: 0.011760174491523643.


2025-02-18 19:09:18,377 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - Tuned parameters (Score=0.011760174491523643): {'n_neighbors': 59, 'n_components': 48, 'alpha': 0.6458941130666561}
2025-02-18 19:09:18,378 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - [End] Hyperparameter Tuning


[Parallel Transformation] Initial data: 100%|██████████| 1000/1000 [00:01<00:00, 586.85it/s]

2025-02-18 19:09:20,811 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - Scaling embeddings by steady-state vector
2025-02-18 19:09:20,813 - 4a62c720-c573-426e-83ea-9b855e130bf1 - INFO - Calculating silhouette score for initial data





NicheSpace(Name:4a62c720-c573-426e-83ea-9b855e130bf1, ObservationType: None, FeatureType: None, ClassType: None)
    * kernel_distance_metric: jaccard
    * scoring_distance_metric: euclidean
    * niche_prefix: n
    * checkpoint_directory: checkpoints
    * n_neighbors: 59
    * n_components: 48
    * alpha: 0.6458941130666561
    * score: 0.015460333975544803

In [None]:
%%memit

n, m = X.shape
model_name="test"
mns = HierarchicalNicheSpace(
    observation_type="genome",
    feature_type="ko",
    class1_type="ani-cluster",
    class2_type="mfc-cluster",
    name=model_name,
    n_neighbors=[int, int(np.log(n)), int(np.sqrt(n)/2)],
    n_trials=100,
    n_jobs=-1,
    verbose=3,
    checkpoint_directory="checkpoints",
)
mns.fit(X, y1, y2)


In [38]:
pwd

'/home/ec2-user/SageMaker/projects/metabolic_niche_space/notebooks'

In [None]:
#     def qualitative_transform(
#         self, 
#         n_components=3,
#         n_neighbors=None,
#         MN_ratio=0.5, 
#         FP_ratio=2.0,
#         include_initial_data=True,
#         ):
#         if not self.is_fitted:
#             raise Exception("Please run .fit to build DiffusionMap model before continuing")
            
#         if hasattr(self, "pacmap_embedding_"):
#             return self.pacmap_embedding_
#         else:
#             # initializing the pacmap instance
#             # Setting n_neighbors to "None" leads to an automatic choice shown below in "parameter" section
#             pacmap_model = pacmap.PaCMAP(
#                 n_components=n_components, 
#                 n_neighbors=n_neighbors, 
#                 MN_ratio=MN_ratio, 
#                 FP_ratio=FP_ratio,
#             ) 
#             dataframes = list()
#             if include_initial_data:
#                 dataframes.append(self.X_)

#             X = pd.concat(dataframes, axis=0)

#             if not self.scale_by_steadystate:
#                 X = X.iloc[:,1:]

#             # fit the data (The index of transformed data corresponds to the index of the original data)
#             self.pacmap_embedding_ = pacmap_model.fit_transform(X, init="pca")
#             self.pacmap_embedding_ = pd.DataFrame(self.pacmap_embedding_, index=X.index)
#             self.pacmap_embedding_.columns = self.pacmap_embedding_.columns.map(lambda i: f"PaCMAP-{i+1}")
#             return self.pacmap_embedding_

#     def plot_qualitative(
#         self, 
#         n_dimensions:int=3,
#         engine:str="matplotlib",
#         figsize=(8,8),
#         title=None,
#         **kws,
#         ):
#         if not hasattr(self, "pacmap_embedding_"):
#             raise Exception("Please run .qualitative_transform to compute PaCMAP embeddings before continuing")
#         if engine == "matplotlib":
#             import matplotlib.pyplot as plt
#             from mpl_toolkits.mplot3d import Axes3D        
            
#             """
#             Plots a 3D scatter plot using matplotlib from a DataFrame with exactly three columns.

#             Parameters:
#             -----------
#             df : pd.DataFrame
#                 A DataFrame with three numerical columns representing X, Y, and Z coordinates.
#             title : str, optional
#                 Title of the plot (default is "3D Scatter Plot").
#             """
#             fig = plt.figure(figsize=figsize)
#             ax = fig.add_subplot(111, projection='3d')

#             # Extract X, Y, Z from DataFrame
#             df = self.pacmap_embedding_
#             x, y, z = df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2]
            
#             c = df.index.map(lambda x: {True:"red", False:"black"}[x in self.classes_])

#             # Scatter plot
#             ax.scatter(x, y, z, c=c, alpha=0.618, **kws)

#             # Labels and title
#             ax.set_xlabel(df.columns[0])
#             ax.set_ylabel(df.columns[1])
#             ax.set_zlabel(df.columns[2])
#             if title:
#                 ax.set_title(title)

#             return fig, ax
        
    
#     def qualitative_transform(
#         self, 
#         n_components=3,
#         n_neighbors=None,
#         MN_ratio=0.5,  # Set up tuning here
#         FP_ratio=2.0, # Set up tuning here
#         include_initial_data=True,
#         include_grouped_data=True,
#         ):
#         if not self.is_fitted:
#             raise Exception("Please run .fit to build DiffusionMap model before continuing")
            
#         if hasattr(self, "pacmap_embedding_"):
#             return self.pacmap_embedding_
#         else:
#             # initializing the pacmap instance
#             # Setting n_neighbors to "None" leads to an automatic choice shown below in "parameter" section
#             pacmap_model = pacmap.PaCMAP(
#                 n_components=n_components, 
#                 n_neighbors=n_neighbors, 
#                 MN_ratio=MN_ratio, 
#                 FP_ratio=FP_ratio,
#             ) 
#             dataframes = list()
#             if include_initial_data:
#                 dataframes.append(self.X_)

#             if include_grouped_data:
#                 dataframes.append(self.X1_)

#             X = pd.concat(dataframes, axis=0)

#             if not self.scale_by_steadystate:
#                 X = X.iloc[:,1:]

#             # fit the data (The index of transformed data corresponds to the index of the original data)
#             self.pacmap_embedding_ = pacmap_model.fit_transform(X, init="pca")
#             self.pacmap_embedding_ = pd.DataFrame(self.pacmap_embedding_, index=X.index)
#             self.pacmap_embedding_.columns = self.pacmap_embedding_.columns.map(lambda i: f"PaCMAP-{i+1}")
#             return self.pacmap_embedding_
    
#     def plot_qualitative(
#         self, 
#         n_dimensions:int=3,
#         engine:str="matplotlib",
#         figsize=(8,8),
#         title=None,
#         **kws,
#         ):
#         if not hasattr(self, "pacmap_embedding_"):
#             raise Exception("Please run .qualitative_transform to compute PaCMAP embeddings before continuing")
#         if engine == "matplotlib":
#             import matplotlib.pyplot as plt
#             from mpl_toolkits.mplot3d import Axes3D        
            
#             """
#             Plots a 3D scatter plot using matplotlib from a DataFrame with exactly three columns.

#             Parameters:
#             -----------
#             df : pd.DataFrame
#                 A DataFrame with three numerical columns representing X, Y, and Z coordinates.
#             title : str, optional
#                 Title of the plot (default is "3D Scatter Plot").
#             """
#             fig = plt.figure(figsize=figsize)
#             ax = fig.add_subplot(111, projection='3d')

#             # Extract X, Y, Z from DataFrame
#             df = self.pacmap_embedding_
#             x, y, z = df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2]
            
#             c = df.index.map(lambda x: {True:"red", False:"black"}[x in self.classes1_])

#             # Scatter plot
#             ax.scatter(x, y, z, c=c, alpha=0.618, **kws)

#             # Labels and title
#             ax.set_xlabel(df.columns[0])
#             ax.set_ylabel(df.columns[1])
#             ax.set_zlabel(df.columns[2])
#             if title:
#                 ax.set_title(title)

#             return fig, ax

In [None]:
# 

Grouping rows by: 100%|██████████| 2124/2124 [00:00<00:00, 7534.99 column/s]


2025-02-14 23:39:59,116 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Start] Filtering observations and classes below feature threshold: 100
2025-02-14 23:39:59,427 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Dropping] N = 248 y1 classes
2025-02-14 23:39:59,430 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Dropping] N = 609 observations
2025-02-14 23:39:59,432 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Remaining] N = 6462 y1 classes
2025-02-14 23:39:59,434 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Remaining] N = 65 y2 classes
2025-02-14 23:39:59,435 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Remaining] N = 19746 observations
2025-02-14 23:39:59,436 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Remaining] N = 2124 features
2025-02-14 23:39:59,437 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [End] Filtering observations and classes below feature threshold
2025-02-14 23:39:59,520 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Start] Processing distance matrix
2025-02-14 23:41:17,128 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [End] Processing distance matrix
2025-02-14 23:41:17,159 - 

[I 2025-02-14 23:41:17,163] A new study created in memory with name: NAL-GDB_MNS_v2.SLC-MFC
  0%|          | 0/100 [00:00<?, ?it/s]

2025-02-14 23:41:17,167 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 0] Fitting Diffision Map: n_neighbors=43, n_components=75, alpha=0.6027633760716439
2025-02-14 23:41:20,871 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 0] Transforming observations: n_neighbors=43, n_components=75, alpha=0.6027633760716439


[Trial 0] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:02<00:00, 81.59it/s]


2025-02-14 23:45:23,116 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 0] Calculating silhouette score: n_neighbors=43, n_components=75, alpha=0.6027633760716439


Best trial: 0. Best value: 0.280766:   1%|          | 1/100 [04:09<6:52:06, 249.77s/it]

[I 2025-02-14 23:45:26,762] Trial 0 finished with value: 0.28076570723785577 and parameters: {'n_neighbors': 43, 'n_components': 75, 'alpha': 0.6027633760716439}. Best is trial 0 with value: 0.28076570723785577.
2025-02-14 23:45:26,933 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 1] Fitting Diffision Map: n_neighbors=43, n_components=48, alpha=0.6458941130666561
2025-02-14 23:45:30,168 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 1] Transforming observations: n_neighbors=43, n_components=48, alpha=0.6458941130666561


[Trial 1] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [03:59<00:00, 82.40it/s]


2025-02-14 23:49:30,017 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 1] Calculating silhouette score: n_neighbors=43, n_components=48, alpha=0.6458941130666561


Best trial: 1. Best value: 0.384816:   2%|▏         | 2/100 [08:16<6:45:02, 247.99s/it]

[I 2025-02-14 23:49:33,564] Trial 1 finished with value: 0.38481572152753923 and parameters: {'n_neighbors': 43, 'n_components': 48, 'alpha': 0.6458941130666561}. Best is trial 1 with value: 0.38481572152753923.
2025-02-14 23:49:33,677 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 2] Fitting Diffision Map: n_neighbors=36, n_components=91, alpha=0.9636627605010293
2025-02-14 23:49:37,176 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 2] Transforming observations: n_neighbors=36, n_components=91, alpha=0.9636627605010293


[Trial 2] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:03<00:00, 81.02it/s]


2025-02-14 23:53:41,150 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 2] Calculating silhouette score: n_neighbors=36, n_components=91, alpha=0.9636627605010293


Best trial: 1. Best value: 0.384816:   3%|▎         | 3/100 [12:28<6:43:34, 249.63s/it]

[I 2025-02-14 23:53:45,141] Trial 2 finished with value: 0.30093254860769597 and parameters: {'n_neighbors': 36, 'n_components': 91, 'alpha': 0.9636627605010293}. Best is trial 1 with value: 0.38481572152753923.
2025-02-14 23:53:45,259 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 3] Fitting Diffision Map: n_neighbors=33, n_components=82, alpha=0.5288949197529045
2025-02-14 23:53:48,611 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 3] Transforming observations: n_neighbors=33, n_components=82, alpha=0.5288949197529045


[Trial 3] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:03<00:00, 81.22it/s]


2025-02-14 23:57:51,945 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 3] Calculating silhouette score: n_neighbors=33, n_components=82, alpha=0.5288949197529045


Best trial: 1. Best value: 0.384816:   4%|▍         | 4/100 [16:38<6:39:56, 249.96s/it]

[I 2025-02-14 23:57:55,603] Trial 3 finished with value: 0.34447618638860833 and parameters: {'n_neighbors': 33, 'n_components': 82, 'alpha': 0.5288949197529045}. Best is trial 1 with value: 0.38481572152753923.
2025-02-14 23:57:55,728 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 4] Fitting Diffision Map: n_neighbors=44, n_components=94, alpha=0.07103605819788694
2025-02-14 23:57:59,529 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 4] Transforming observations: n_neighbors=44, n_components=94, alpha=0.07103605819788694


[Trial 4] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:04<00:00, 80.60it/s]


2025-02-15 00:02:04,752 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 4] Calculating silhouette score: n_neighbors=44, n_components=94, alpha=0.07103605819788694


Best trial: 1. Best value: 0.384816:   5%|▌         | 5/100 [20:51<6:37:25, 251.01s/it]

[I 2025-02-15 00:02:08,470] Trial 4 finished with value: 0.302305200623859 and parameters: {'n_neighbors': 44, 'n_components': 94, 'alpha': 0.07103605819788694}. Best is trial 1 with value: 0.38481572152753923.
2025-02-15 00:02:08,586 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 5] Fitting Diffision Map: n_neighbors=14, n_components=11, alpha=0.832619845547938
2025-02-15 00:02:10,373 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 5] Transforming observations: n_neighbors=14, n_components=11, alpha=0.832619845547938


[Trial 5] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:00<00:00, 81.97it/s]


2025-02-15 00:06:11,463 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 5] Calculating silhouette score: n_neighbors=14, n_components=11, alpha=0.832619845547938


Best trial: 5. Best value: 0.406818:   6%|▌         | 6/100 [24:57<6:30:50, 249.48s/it]

[I 2025-02-15 00:06:14,964] Trial 5 finished with value: 0.4068183545556655 and parameters: {'n_neighbors': 14, 'n_components': 11, 'alpha': 0.832619845547938}. Best is trial 5 with value: 0.4068183545556655.
2025-02-15 00:06:15,093 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 6] Fitting Diffision Map: n_neighbors=58, n_components=89, alpha=0.978618342232764
2025-02-15 00:06:19,413 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 6] Transforming observations: n_neighbors=58, n_components=89, alpha=0.978618342232764


[Trial 6] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:05<00:00, 80.45it/s]


2025-02-15 00:10:25,098 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 6] Calculating silhouette score: n_neighbors=58, n_components=89, alpha=0.978618342232764


Best trial: 5. Best value: 0.406818:   7%|▋         | 7/100 [29:11<6:28:52, 250.89s/it]

[I 2025-02-15 00:10:28,773] Trial 6 finished with value: 0.30859370067415265 and parameters: {'n_neighbors': 58, 'n_components': 89, 'alpha': 0.978618342232764}. Best is trial 5 with value: 0.4068183545556655.
2025-02-15 00:10:28,885 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 7] Fitting Diffision Map: n_neighbors=59, n_components=51, alpha=0.7805291762864555
2025-02-15 00:10:32,598 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 7] Transforming observations: n_neighbors=59, n_components=51, alpha=0.7805291762864555


[Trial 7] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:00<00:00, 82.26it/s]


2025-02-15 00:14:32,838 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 7] Calculating silhouette score: n_neighbors=59, n_components=51, alpha=0.7805291762864555


Best trial: 5. Best value: 0.406818:   8%|▊         | 8/100 [33:19<6:23:06, 249.86s/it]

[I 2025-02-15 00:14:36,427] Trial 7 finished with value: 0.38602047542979173 and parameters: {'n_neighbors': 59, 'n_components': 51, 'alpha': 0.7805291762864555}. Best is trial 5 with value: 0.4068183545556655.
2025-02-15 00:14:36,535 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 8] Fitting Diffision Map: n_neighbors=16, n_components=68, alpha=0.1433532874090464
2025-02-15 00:14:38,654 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 8] Transforming observations: n_neighbors=16, n_components=68, alpha=0.1433532874090464


[Trial 8] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [04:01<00:00, 81.90it/s]


2025-02-15 00:18:39,998 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 8] Calculating silhouette score: n_neighbors=16, n_components=68, alpha=0.1433532874090464


Best trial: 8. Best value: 0.569787:   9%|▉         | 9/100 [37:26<6:17:41, 249.03s/it]

[I 2025-02-15 00:18:43,634] Trial 8 finished with value: 0.569787088002823 and parameters: {'n_neighbors': 16, 'n_components': 68, 'alpha': 0.1433532874090464}. Best is trial 8 with value: 0.569787088002823.
2025-02-15 00:18:43,754 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 9] Fitting Diffision Map: n_neighbors=68, n_components=57, alpha=0.4146619399905236
2025-02-15 00:18:47,761 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 9] Transforming observations: n_neighbors=68, n_components=57, alpha=0.4146619399905236


[Trial 9] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [03:59<00:00, 82.29it/s]


2025-02-15 00:22:47,952 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 9] Calculating silhouette score: n_neighbors=68, n_components=57, alpha=0.4146619399905236


Best trial: 8. Best value: 0.569787:  10%|█         | 10/100 [41:34<6:13:01, 248.69s/it]

[I 2025-02-15 00:22:51,536] Trial 9 finished with value: 0.3737599008472039 and parameters: {'n_neighbors': 68, 'n_components': 57, 'alpha': 0.4146619399905236}. Best is trial 8 with value: 0.569787088002823.
2025-02-15 00:22:51,676 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 10] Fitting Diffision Map: n_neighbors=10, n_components=25, alpha=0.020768834766933364
2025-02-15 00:22:53,413 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 10] Transforming observations: n_neighbors=10, n_components=25, alpha=0.020768834766933364


[Trial 10] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [03:59<00:00, 82.49it/s]


2025-02-15 00:26:53,010 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 10] Calculating silhouette score: n_neighbors=10, n_components=25, alpha=0.020768834766933364


Best trial: 8. Best value: 0.569787:  11%|█         | 11/100 [45:39<6:07:11, 247.55s/it]

[I 2025-02-15 00:26:56,510] Trial 10 finished with value: 0.5429956254711731 and parameters: {'n_neighbors': 10, 'n_components': 25, 'alpha': 0.020768834766933364}. Best is trial 8 with value: 0.569787088002823.
2025-02-15 00:26:56,639 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 11] Fitting Diffision Map: n_neighbors=10, n_components=21, alpha=0.008275092186310534
2025-02-15 00:26:58,396 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 11] Transforming observations: n_neighbors=10, n_components=21, alpha=0.008275092186310534


[Trial 11] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [03:58<00:00, 82.72it/s]


2025-02-15 00:30:57,299 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 11] Calculating silhouette score: n_neighbors=10, n_components=21, alpha=0.008275092186310534


Best trial: 8. Best value: 0.569787:  12%|█▏        | 12/100 [49:43<6:01:37, 246.56s/it]

[I 2025-02-15 00:31:00,819] Trial 11 finished with value: 0.4374556912844277 and parameters: {'n_neighbors': 10, 'n_components': 21, 'alpha': 0.008275092186310534}. Best is trial 8 with value: 0.569787088002823.
2025-02-15 00:31:00,946 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 12] Fitting Diffision Map: n_neighbors=23, n_components=34, alpha=0.23785187354887993
2025-02-15 00:31:03,088 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 12] Transforming observations: n_neighbors=23, n_components=34, alpha=0.23785187354887993


[Trial 12] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [03:59<00:00, 82.37it/s]


2025-02-15 00:35:03,017 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 12] Calculating silhouette score: n_neighbors=23, n_components=34, alpha=0.23785187354887993


Best trial: 12. Best value: 0.601531:  13%|█▎        | 13/100 [53:49<5:57:08, 246.31s/it]

[I 2025-02-15 00:35:06,552] Trial 12 finished with value: 0.6015311054439123 and parameters: {'n_neighbors': 23, 'n_components': 34, 'alpha': 0.23785187354887993}. Best is trial 12 with value: 0.6015311054439123.
2025-02-15 00:35:06,678 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 13] Fitting Diffision Map: n_neighbors=21, n_components=35, alpha=0.23763876186552133
2025-02-15 00:35:08,733 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 13] Transforming observations: n_neighbors=21, n_components=35, alpha=0.23763876186552133


[Trial 34] Projecting initial data into diffusion space: 100%|██████████| 19746/19746 [20:56<00:00, 15.72it/s]


2025-02-15 04:14:16,609 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 34] Calculating silhouette score: n_neighbors=40, n_components=42, alpha=0.6426762465938741


Best trial: 23. Best value: 0.698291:  35%|███▌      | 35/100 [4:33:04<21:38:48, 1198.89s/it]

[I 2025-02-15 04:14:21,406] Trial 34 finished with value: 0.4323190183088095 and parameters: {'n_neighbors': 40, 'n_components': 42, 'alpha': 0.6426762465938741}. Best is trial 23 with value: 0.6982913190341421.
2025-02-15 04:14:21,529 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 35] Fitting Diffision Map: n_neighbors=32, n_components=52, alpha=0.5230532227739966
2025-02-15 04:14:31,080 - NAL-GDB_MNS_v2.SLC-MFC - INFO - [Trial 35] Transforming observations: n_neighbors=32, n_components=52, alpha=0.5230532227739966


[Trial 35] Projecting initial data into diffusion space:  39%|███▉      | 7664/19746 [08:05<12:29, 16.11it/s]

In [None]:

# Tune and fit the grouped niche space, compute its qualitative (PaCMAP)
# embedding, and write the fitted model to the checkpoint directory.
# NOTE(review): the GroupedNicheSpace import is commented out in the import
# cell, and `model_name` is expected from an earlier cell -- confirm both are
# in scope on a fresh kernel.

# Derive n from the matrix actually being fitted, so the n_neighbors range
# does not depend on a value left over from another cell.
n = X.shape[0]
# Single source of truth for the trial count: used for tuning and embedded in
# the checkpoint filename below.
n_trials = 500

mns = GroupedNicheSpace(
    observation_type="genome",
    feature_type="ko",
    class1_type="ani-cluster",
    class2_type="mfc-cluster",
    name=model_name,
    n_neighbors=[int, int(np.log(n)), int(np.sqrt(n)/2)],
    n_trials=n_trials,
    n_jobs=-1,
    verbose=3,
    checkpoint_directory="checkpoints",
)
mns.fit(X, y1, y2)
# Run the embedding on the model fitted above, i.e. the same object that is
# saved below (presumably the embedding is cached on the instance -- confirm).
mns.qualitative_transform()
# NOTE(review): `to_file` is called on the instance and also receives the
# instance as its first argument -- confirm the expected signature.
mns.to_file(mns, os.path.join("checkpoints", f"{model_name}.GroupedNicheSpace.n_trials-{n_trials}.pkl"))


In [None]:
0  # NOTE(review): bare literal with no assignment; presumably a scratch cell -- candidate for removal

In [None]:
# class NicheSpace(object):
#     """
#     Usage:
#     import numpy as np
#     import pandas as pd
#     from sklearn.datasets import make_classification

#     # Generate synthetic dataset
#     X, y = make_classification(
#         n_samples=1000,        # Number of samples
#         n_features=100,        # Number of boolean features
#         n_informative=50,      # Number of informative features
#         n_redundant=25,        # Number of redundant features
#         n_repeated=0,          # No repeated features
#         n_classes=3,           # Number of classes
#         n_clusters_per_class=1, # One cluster per class
#         weights=[0.33, 0.33, 0.34],  # Balanced class distribution
#         random_state=42        # For reproducibility
#     )

#     # Convert features to boolean (0 or 1) using a threshold at 0
#     X_boolean = (X > 0).astype(int)

#     # Create DataFrame
#     df = pd.DataFrame(X_boolean, columns=[f'Feature_{i+1}' for i in range(100)])
#     df['Class'] = y  # Add class labels

#     y = df.pop("Class")
#     X = df > 0

#     model = NicheSpace(n_trials=3, minimum_nfeatures=1)
#     model.fit(X,y)
#     X_pacmap = model.qualitative_transform()
#     """
#     def __init__(
#         self, 
#         # General
#         name:str=None,
#         observation_type:str=None,
#         feature_type:str=None,
#         class_type:str=None,
#         minimum_nfeatures:int=100,

#         # Diffusion Maps
#         kernel_distance_metric:str="jaccard",
#         # scoring_method:str="silhouette", # or IICR
#         scoring_distance_metric:str="euclidean",
#         n_neighbors:int="auto",
#         n_components:int="auto", # n_eigenpairs in DataFold. First diffusion map vector is steady-state so 1 is automatically added to any n_components value
#         alpha:float="auto",
#         initial_diffusionmap_params:dict=None,

#         scale_by_steadystate:bool=True,
#         niche_prefix="n",
        
#         # PaCMAP
#         pacmap_n_components=3,
#         pacmap_n_neighbors=None,
#         pacmap_MN_ratio=0.5, 
#         pacmap_FP_ratio=2.0,

#         # Optuna
#         n_trials=50,
#         n_jobs:int=1,
#         n_concurrent_trials:int=1,
#         objective_direction="maximize",
#         checkpoint_directory=None,
#         study_timeout=None,
#         study_callbacks=None,
#         random_state=0,
#         verbose=1,
#         stream=sys.stdout,
#         ):
        
#         # General
#         if name is None:
#             name = str(uuid.uuid4())
            
#         self.name = name
#         self.observation_type = observation_type
#         self.feature_type = feature_type
#         self.class_type = class_type
#         self.minimum_nfeatures = minimum_nfeatures
        
#         # Diffusion Maps
#         self.initial_diffusionmap_params = initial_diffusionmap_params
#         self.kernel_distance_metric = kernel_distance_metric
#         # self.scoring_method = scoring_method
#         self.scoring_distance_metric = scoring_distance_metric
#         self.scale_by_steadystate = scale_by_steadystate
#         self.niche_prefix = niche_prefix
        
#         # PaCMAP
#         self.pacmap_n_components=pacmap_n_components
#         self.pacmap_n_neighbors=pacmap_n_neighbors
#         self.pacmap_MN_ratio=pacmap_MN_ratio
#         self.pacmap_FP_ratio=pacmap_FP_ratio
        
#         # Optuna
#         self.n_jobs = n_jobs
#         self.n_trials = n_trials
#         self.n_concurrent_trials = n_concurrent_trials
#         self.checkpoint_directory = checkpoint_directory
#         self.random_state = random_state
#         self.study_timeout = study_timeout
#         if study_callbacks is None:
#             study_callbacks = []
#         self.study_callbacks = study_callbacks
#         self.objective_direction = objective_direction

#         # Hyperparameters
#         # ------------
#         # DiffusionMap
#         self.is_tuned = True
#         if n_neighbors == "auto":
#             n_neighbors = [int, 10, 100]
#             self.is_tuned = False
#         if isinstance(n_neighbors, list):
#             self.is_tuned = False
#         self.n_neighbors = n_neighbors
        
#         if n_components == "auto":
#             n_components = [int, 10, 100]
#             self.is_tuned = False
#         if isinstance(n_components, list):
#             self.is_tuned = False
#         self.n_components = n_components
        
#         if alpha == "auto":
#             alpha = [float, 0.0, 1.0]
#             self.is_tuned = False
#         if isinstance(alpha, list):
#             self.is_tuned = False
#         self.alpha = alpha
        
#         self._diffusionmap_param_space = dict(
#             n_neighbors = self.n_neighbors,
#             n_components = self.n_components,
#             alpha = self.alpha,
#         )
        
#         # ---------------
#         # PacMAP
        
        
#         self.logger = build_logger(self.name, stream=stream)
#         self.verbose = verbose
#         self.is_fitted = False
        
#     def tune_diffusionmap(
#         self,
#         X:pd.DataFrame,
#         y:pd.Series,
#         distance_matrix:np.array,
#         sampler, 
#         **study_kws,
#         ):

#         def _objective(trial):
#             try:

#                 # Compile parameters
#                 params = compile_parameter_space(
#                     trial, 
#                     self._diffusionmap_param_space,
#                 )

#                 # Parameters
#                 n_neighbors = params["n_neighbors"]
#                 n_components = params["n_components"]
#                 alpha = params["alpha"]

#                 if n_neighbors >= X.shape[0]:
#                     return -1 #np.nan
#                 else:
#                     # Build kernel
#                     kernel = KNeighborsKernel( 
#                         metric=self.kernel_distance_metric, 
#                         n_neighbors=n_neighbors, 
#                         distance_matrix=distance_matrix, 
#                         copy_distance_matrix=False,
#                     )

#                     # Calculate Diffusion Maps using KNeighbors
#                     model = DiffusionMaps(kernel=kernel, n_eigenpairs=n_components+1, alpha=alpha)

#                     if self.verbose > 1: self.logger.info(f"[Trial {trial.number}] Fitting Diffusion Map: n_neighbors={n_neighbors}, n_components={n_components}, alpha={alpha}")
#                     dmap = model.fit_transform(X)

#                     # Score
#                     if self.verbose > 1: self.logger.info(f"[Trial {trial.number}] Calculating silhouette score: n_neighbors={n_neighbors}, n_components={n_components}, alpha={alpha}")
#                     score = silhouette_score(dmap[:,1:], y.values, metric=self.scoring_distance_metric, sample_size=None, random_state=None) # Ignore steady state vector

#                     return score

#             except Exception as e:
#                 self.logger.error(f"[Trial {trial.number}] Failed due to error: {e}. Marking as pruned.")
#                 raise optuna.TrialPruned()  # Prevents skipping trials

#             finally:
#                 if self.checkpoint_directory:
#                     joblib.dump(study, os.path.join(self.checkpoint_directory, f"{self.name}-diffusionmap.pkl"))  # Save checkpoint

                
#         # Sampler
#         if sampler is None:
#             sampler = optuna.samplers.TPESampler(seed=self.random_state)
        
#         # Study
#         study_params = {
#             "direction":self.objective_direction, 
#             "study_name":self.name, 
#             "sampler":sampler, 
#             **study_kws,
#         }
        
#         # Checkpoints
#         study = None
#         if self.checkpoint_directory:
#             if not os.path.exists(self.checkpoint_directory):
#                 if self.verbose > 1: self.logger.info(f"Creating checkpoint directory: {self.checkpoint_directory}")
#                 os.makedirs(self.checkpoint_directory)
#             # if self.verbose > 1: self.logger.info("Creating sqlite database: {}".format(os.path.join(self.checkpoint_directory, f"{self.name}.db")))
#             # study_params["storage"] = "sqlite:///" + os.path.join(self.checkpoint_directory, f"{self.name}.db")
#             # study_params["load_if_exists"] = True
        
#             serialized_checkpoint_filepath = os.path.join(self.checkpoint_directory, f"{self.name}-diffusionmap.pkl")

#             if os.path.exists(serialized_checkpoint_filepath):
#                 if self.verbose > 1: self.logger.info(f"[Loading] Checkpoint file: {serialized_checkpoint_filepath}")
#                 study = joblib.load(serialized_checkpoint_filepath)
#             else:
#                 if self.verbose > 1: self.logger.info(f"[Creating] Checkpoint file: {serialized_checkpoint_filepath}")

#         if study is None:
#             study = optuna.create_study(**study_params)

#         if self.initial_diffusionmap_params:
#             if self.verbose > 1: self.logger.info(f"Adding initial parameters to study: {self.initial_diffusionmap_params}")
#             study.enqueue_trial(self.initial_diffusionmap_params, user_attrs={"memo": "initial_diffusionmap_params"}, skip_if_exists=True)
            
#         # Optimize
#         callback_fn = stop_when_exceeding_trials(self.n_trials, self.logger)
#         study.optimize(
#             _objective, 
#             n_trials=self.n_trials, 
#             n_jobs=self.n_concurrent_trials,
#             timeout=self.study_timeout, 
#             show_progress_bar=self.verbose >= 2, 
#             callbacks=self.study_callbacks + [callback_fn], 
#             gc_after_trial=True,
#         )

#         return study
    
#     def tune_pacmap(
#         self,
#         X:pd.DataFrame,
#         y:pd.Series,
#         distance_matrix:np.array,
#         sampler, 
#         **study_kws,
#         ):

#         def _objective(trial):
#             try:

#                 # Compile parameters
#                 params = compile_parameter_space(
#                     trial, 
#                     self._param_space,
#                 )

#                 # Parameters
#                 n_neighbors = params["n_neighbors"]
#                 n_components = params["n_components"]
#                 alpha = params["alpha"]

#                 if n_neighbors >= X.shape[0]:
#                     return -1 #np.nan
#                 else:
#                     # Build kernel
#                     kernel = KNeighborsKernel( 
#                         metric=self.kernel_distance_metric, 
#                         n_neighbors=n_neighbors, 
#                         distance_matrix=distance_matrix, 
#                         copy_distance_matrix=False,
#                     )

#                     # Calculate Diffusion Maps using KNeighbors
#                     model = DiffusionMaps(kernel=kernel, n_eigenpairs=n_components+1, alpha=alpha)

#                     if self.verbose > 1: self.logger.info(f"[Trial {trial.number}] Fitting Diffusion Map: n_neighbors={n_neighbors}, n_components={n_components}, alpha={alpha}")
#                     dmap = model.fit_transform(X)

#                     # if self.verbose > 1: self.logger.info(f"[Trial {trial.number}] Transforming observations: n_neighbors={n_neighbors}, n_components={n_components}, alpha={alpha}")
#                     # dmap = model.transform(X)
#                     # dmap = self._parallel_transform(X, model, progressbar_message=f"[Trial {trial.number}] Projecting initial data into diffusion space")


#                     # Score
#                     if self.verbose > 1: self.logger.info(f"[Trial {trial.number}] Calculating silhouette score: n_neighbors={n_neighbors}, n_components={n_components}, alpha={alpha}")
#                     score = silhouette_score(dmap[:,1:], y.values, metric=self.scoring_distance_metric, sample_size=None, random_state=None) # Ignore steady state vector

#                     return score

#             except Exception as e:
#                 self.logger.error(f"[Trial {trial.number}] Failed due to error: {e}. Marking as pruned.")
#                 raise optuna.TrialPruned()  # Prevents skipping trials

#             finally:
#                 if self.checkpoint_directory:
#                     joblib.dump(study, os.path.join(self.checkpoint_directory, f"{self.name}-diffusionmap.pkl"))  # Save checkpoint

                
#         # Sampler
#         if sampler is None:
#             sampler = optuna.samplers.TPESampler(seed=self.random_state)
        
#         # Study
#         study_params = {
#             "direction":self.objective_direction, 
#             "study_name":self.name, 
#             "sampler":sampler, 
#             **study_kws,
#         }
        
#         # Checkpoints
#         study = None
#         if self.checkpoint_directory:
#             if not os.path.exists(self.checkpoint_directory):
#                 if self.verbose > 1: self.logger.info(f"Creating checkpoint directory: {self.checkpoint_directory}")
#                 os.makedirs(self.checkpoint_directory)
#             # if self.verbose > 1: self.logger.info("Creating sqlite database: {}".format(os.path.join(self.checkpoint_directory, f"{self.name}.db")))
#             # study_params["storage"] = "sqlite:///" + os.path.join(self.checkpoint_directory, f"{self.name}.db")
#             # study_params["load_if_exists"] = True
        
#             serialized_checkpoint_filepath = os.path.join(self.checkpoint_directory, f"{self.name}-diffusionmap.pkl")

#             if os.path.exists(serialized_checkpoint_filepath):
#                 if self.verbose > 1: self.logger.info(f"[Loading] Checkpoint file: {serialized_checkpoint_filepath}")
#                 study = joblib.load(serialized_checkpoint_filepath)
#             else:
#                 if self.verbose > 1: self.logger.info(f"[Creating] Checkpoint file: {serialized_checkpoint_filepath}")

#         if study is None:
#             study = optuna.create_study(**study_params)

#         if self.initial_params:
#             if self.verbose > 1: self.logger.info(f"Adding initial parameters to study: {self.initial_params}")
#             study.enqueue_trial(self.initial_params, user_attrs={"memo": "initial_params"}, skip_if_exists=True)
            
#         # Optimize
#         callback_fn = stop_when_exceeding_trials(self.n_trials, self.logger)
#         study.optimize(
#             _objective, 
#             n_trials=self.n_trials, 
#             n_jobs=self.n_concurrent_trials,
#             timeout=self.study_timeout, 
#             show_progress_bar=self.verbose >= 2, 
#             callbacks=self.study_callbacks + [callback_fn], 
#             gc_after_trial=True,
#         )

#         return study


#     def fit(
#         self,
#         X:pd.DataFrame,
#         y:pd.Series,
#         distance_matrix:np.array=None,
#         sampler=None,
#         copy=True,
#         **study_kws,
#         ):
#         def _scale_by_first_column(X: pd.DataFrame) -> pd.DataFrame:
#             """
#             Scale all columns of a DataFrame (except the first one) by the first column.

#             Parameters:
#             -----------
#             X : pd.DataFrame
#                 Input DataFrame where the first column serves as the divisor.

#             Returns:
#             --------
#             pd.DataFrame
#                 A new DataFrame with the first column removed and the remaining columns scaled.
#             """
#             values = X.values  # Convert to NumPy array for efficiency
#             steady_state_vector = values[:, 0].reshape(-1, 1)  # Extract first column as divisor
#             scaled_values = values[:, 1:] / steady_state_vector  # Perform element-wise division

#             return pd.DataFrame(
#                 scaled_values, 
#                 index=X.index, 
#                 columns=X.columns[1:]  # Remove first column name from new DataFrame
#             )
        
#         # Check inputs
#         if not np.all(X.shape[0] == y.size):
#             raise IndexError("X.shape[0] must equal y.size")
#         if not np.all(X.index == y.index):
#             raise IndexError("X.index must equal y.index")
#         if not isinstance(y.dtype, pd.CategoricalDtype):
#             y = y.astype("category")
#         self.X_ = X.copy()
#         self.y_ = y.copy()

            
#         # Minimum number of features
#         if self.minimum_nfeatures > 0:
#             if self.verbose > 0:
#                 self.logger.info(f"[Start] Filtering observations and classes below feature threshold: {self.minimum_nfeatures}")

#             number_of_features_per_observation = (X > 0).sum(axis=1)
#             observations_passed_qc = number_of_features_per_observation.index[number_of_features_per_observation >= self.minimum_nfeatures]

#             y = y.loc[observations_passed_qc]
#             X = X.loc[observations_passed_qc]
#             if self.verbose > 0:
#                 self.logger.info(f"[Dropping] N = {sum(number_of_features_per_observation < self.minimum_nfeatures)} observations")
#                 self.logger.info(f"[Remaining] N = {y.nunique()} classes")
#                 self.logger.info(f"[Remaining] N = {X.shape[0]} observations")
#                 self.logger.info(f"[Remaining] N = {X.shape[1]} features")
#                 self.logger.info(f"[End] Filtering observations and classes below feature threshold")
            
#         # Dtype
#         if self.kernel_distance_metric == "jaccard":
#             X = X.astype(bool)
            
#         # Distance matrix
#         if distance_matrix is None:
#             if self.verbose > 0:
#                 self.logger.info("[Start] Processing distance matrix")
#             if self.kernel_distance_metric == "euclidean":
#                 distance_matrix = squareform(pdist(X.values, metric=self.kernel_distance_metric))
#             else:
#                 distance_matrix = pairwise_distances(X=X.values, metric=self.kernel_distance_metric, n_jobs=self.n_jobs)
            
#         if len(distance_matrix.shape) == 1:
#             distance_matrix = squareform(distance_matrix)
#         if self.verbose > 0:
#             self.logger.info("[End] Processing distance matrix")

#         # Store
#         self.classes_ = y.cat.categories
#         if copy:
#             self.X_ = X.copy()
#             self.y_ = y.copy()
        
#         # Tune
#         if not self.is_tuned:
#             if self.verbose > 0:
#                 self.logger.info("[Begin] Hyperparameter Tuning")
#             self.study_ = self.tune_diffusionmap(
#                 X=X,
#                 y=y,
#                 distance_matrix=distance_matrix,
#                 sampler=sampler, 
#                 **study_kws,
#                 )
#             for k, v in self.study_.best_params.items():
#                 setattr(self,k,v)
#             if self.verbose > 0:
#                 self.logger.info(f"Tuned parameters (Score={self.study_.best_value}): {self.study_.best_params}")
#                 self.logger.info("[End] Hyperparameter Tuning")
#             self.is_tuned = True
            
#         # Build kernel
#         self.kernel_ = KNeighborsKernel( 
#             metric=self.kernel_distance_metric, 
#             n_neighbors=self.n_neighbors, 
#             distance_matrix=distance_matrix, 
#             copy_distance_matrix=True,
#         )

#         # Calculate Diffusion Maps using KNeighbors
#         self.dmap_model_ = DiffusionMaps(kernel=self.kernel_, n_eigenpairs=self.n_components+1, alpha=self.alpha)
        
#         # Fit
#         dmap = self.dmap_model_.fit(X)

#         # Complete
#         dmap = self._parallel_transform(X, self.dmap_model_, progressbar_message=f"[Parallel Transformation] Initial data")
#         self.diffusion_coordinates_ = pd.DataFrame(dmap, index=X.index)
#         self.diffusion_coordinates_.columns = [f"{self.niche_prefix}0_steady-state"] + list(map(lambda i: f"{self.niche_prefix}{i}", range(1,dmap.shape[1])))
#         self.diffusion_coordinates_.index.name = self.observation_type
#         self.diffusion_coordinates_.columns.name = self.feature_type

#         # Scale
#         if self.scale_by_steadystate:
#             if self.verbose > 0: self.logger.info("Scaling embeddings by steady-state vector")
#             self.diffusion_coordinates_ = _scale_by_first_column(self.diffusion_coordinates_)
#             # Score
#             if self.verbose > 0: self.logger.info("Calculating silhouette score for initial data")
#             self.score_ = silhouette_score(self.diffusion_coordinates_.values, y.values, metric=self.scoring_distance_metric, sample_size=None, random_state=self.random_state)
#         else:
#             # Score
#             if self.verbose > 0: self.logger.info("Calculating silhouette score for initial data excluding steady-state vector")
#             self.score_ = silhouette_score(self.diffusion_coordinates_.values[:,1:], y.values, metric=self.scoring_distance_metric, sample_size=None, random_state=self.random_state)
            
# #         # Setting n_neighbors to "None" leads to an automatic choice shown below in "parameter" section
# #         self.pacmap_model_ = pacmap.PaCMAP(
# #             n_components=n_components, 
# #             n_neighbors=n_neighbors, 
# #             MN_ratio=MN_ratio, 
# #             FP_ratio=FP_ratio,
# #         ) 
# #         dataframes = list()
# #         if include_initial_data:
# #             dataframes.append(self.X_)

# #         X = pd.concat(dataframes, axis=0)

# #         if not self.scale_by_steadystate:
# #             X = X.iloc[:,1:]

# #         # fit the data (The index of transformed data corresponds to the index of the original data)
# #         self.pacmap_embedding_ = pacmap_model.fit_transform(X, init="pca")
# #         self.pacmap_embedding_ = pd.DataFrame(self.pacmap_embedding_, index=X.index)
# #         self.pacmap_embedding_.columns = self.pacmap_embedding_.columns.map(lambda i: f"PaCMAP-{i+1}")
        
#         self.is_fitted = True

#         return self
    
    
#     def transform(
#         self,
#         X,
#         progressbar_message=None,
#         ):
#         if not self.is_fitted:
#             raise Exception("Please run .fit to build DiffusionMap model before continuing")
#         dmap = self._parallel_transform(X, self.dmap_model_, progressbar_message=progressbar_message)
#         if isinstance(X, pd.DataFrame):
#             X_dmap = pd.DataFrame(dmap, index=X.index)
#             X_dmap.columns = [f"{self.niche_prefix}0_steady-state"] + list(map(lambda i: f"{self.niche_prefix}{i}", range(1,dmap.shape[1])))
#             X_dmap.index.name = self.observation_type
#             X_dmap.columns.name = self.feature_type
#             return X_dmap
#         else:
#             return dmap
    
#     def qualitative_transform(
#         self, 
#         n_components=3,
#         n_neighbors=None,
#         MN_ratio=0.5, 
#         FP_ratio=2.0,
#         include_initial_data=True,
#         ):
#         if not self.is_fitted:
#             raise Exception("Please run .fit to build DiffusionMap model before continuing")
            
#         if hasattr(self, "pacmap_embedding_"):
#             return self.pacmap_embedding_
#         else:
#             # initializing the pacmap instance
#             # Setting n_neighbors to "None" leads to an automatic choice shown below in "parameter" section
#             pacmap_model = pacmap.PaCMAP(
#                 n_components=n_components, 
#                 n_neighbors=n_neighbors, 
#                 MN_ratio=MN_ratio, 
#                 FP_ratio=FP_ratio,
#             ) 
#             dataframes = list()
#             if include_initial_data:
#                 dataframes.append(self.X_)

#             X = pd.concat(dataframes, axis=0)

#             if not self.scale_by_steadystate:
#                 X = X.iloc[:,1:]

#             # fit the data (The index of transformed data corresponds to the index of the original data)
#             self.pacmap_embedding_ = pacmap_model.fit_transform(X, init="pca")
#             self.pacmap_embedding_ = pd.DataFrame(self.pacmap_embedding_, index=X.index)
#             self.pacmap_embedding_.columns = self.pacmap_embedding_.columns.map(lambda i: f"PaCMAP-{i+1}")
#             return self.pacmap_embedding_
    
#     def _process_row(self, model, row):
#         """Helper function to apply model.transform to a single row"""
#         return model.transform(row.reshape(1, -1))

#     def _parallel_transform(self, X, model, progressbar_message=None):
#         """Parallelizes the transformation using joblib"""
#         with warnings.catch_warnings():
#             warnings.filterwarnings("ignore", category=UserWarning, message="X does not have valid feature names")
#             output = joblib.Parallel(n_jobs=self.n_jobs, prefer="threads")(
#                 joblib.delayed(self._process_row)(model, row.values) for id, row in tqdm(X.iterrows(), desc=progressbar_message, total=X.shape[0], position=0, leave=True)
#             )
#             return np.vstack(output)
        
#     def plot_qualitative(
#         self, 
#         n_dimensions:int=3,
#         engine:str="matplotlib",
#         figsize=(8,8),
#         title=None,
#         **kws,
#         ):
#         if not hasattr(self, "pacmap_embedding_"):
#             raise Exception("Please run .qualitative_transform to compute PaCMAP embeddings before continuing")
#         if engine == "matplotlib":
#             import matplotlib.pyplot as plt
#             from mpl_toolkits.mplot3d import Axes3D        
            
#             """
#             Plots a 3D scatter plot using matplotlib from a DataFrame with exactly three columns.

#             Parameters:
#             -----------
#             df : pd.DataFrame
#                 A DataFrame with three numerical columns representing X, Y, and Z coordinates.
#             title : str, optional
#                 Title of the plot (default is "3D Scatter Plot").
#             """
#             fig = plt.figure(figsize=figsize)
#             ax = fig.add_subplot(111, projection='3d')

#             # Extract X, Y, Z from DataFrame
#             df = self.pacmap_embedding_
#             x, y, z = df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2]
            
#             c = df.index.map(lambda x: {True:"red", False:"black"}[x in self.classes_])

#             # Scatter plot
#             ax.scatter(x, y, z, c=c, alpha=0.618, **kws)

#             # Labels and title
#             ax.set_xlabel(df.columns[0])
#             ax.set_ylabel(df.columns[1])
#             ax.set_zlabel(df.columns[2])
#             if title:
#                 ax.set_title(title)

#             return fig, ax
        
#     # def annotate(self):

#     def to_file(self, filepath):
#         write_pickle(self, filepath)
