In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTES

- When should `wandb.run` be finished?
- When the kernel restarts, `wandb.run` is not finished, even though I finish it in the `__del__` method.

In [1]:
import os
from thesis_work.data import load_data, load_mixed_interacted_compounds
from thesis_work.clustering.runner import ClusterRunner

# Uncomment the next line to disable all wandb logging:
# os.environ["WANDB_MODE"] = "disabled"

# Seed and compute device reused by the cells further down.
random_state = 42
device = "cuda"

# Weights & Biases settings: target project plus the notebook file name
# exposed to wandb through its environment variable.
wandb_project_name = "clustering-class-test"
os.environ.update({"WANDB_NOTEBOOK_NAME": "./clustering_class.ipynb"})

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/ilker/miniconda3/envs/thesis-work/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


# DATA

In [2]:
# Sample size forwarded to the loader as `each_sample_size`
# (presumably rows per protein type; confirm against the loader).
each_sample_size = 2000

# Protein types in sorted order, and one integer label per type
# (the label is the type's index in the sorted list).
protein_types = sorted(["gpcr", "kinase", "protease"])
protein_labels = list(range(len(protein_types)))

# Load the mixed compound table for the selected protein types, seeded with
# the notebook-wide `random_state`.
smiles_df = load_mixed_interacted_compounds(
    each_sample_size=each_sample_size,
    protein_types=protein_types,
    convert_category=True,
    random_state=random_state,
)

# RUNNER

In [9]:
# --- Embedding / featurisation model ------------------------------------
model_name = "DeepChem/ChemBERTa-77M-MTR"
# model_name = "ecfp"
# model_name = "chemprop"

# --- Dimensionality reduction -------------------------------------------
n_components = 25

# Defaults: no dimensionality reduction. These stay active on purpose so that
# commenting out the override below still leaves both names defined.
dimensionality_reduction_method = None
dimensionality_reduction_method_kwargs = None

dimensionality_reduction_method = "UMAP"
dimensionality_reduction_method_kwargs = {
    "n_components": n_components,
    "n_neighbors": 15,
    "min_dist": 0.1,
    "metric": "euclidean",
}

# dimensionality_reduction_method = "PCA"
# dimensionality_reduction_method_kwargs = {
#     "n_components": n_components,
# }

# --- Clustering ----------------------------------------------------------
# clustering_method = "K-MEANS"
# clustering_method_kwargs = {
#     "init_method": "k-means++",
#     "n_clusters": 3,
#     "n_init": 1,
# }

clustering_method = "BUTINA"
clustering_method_kwargs = {
    # "method": "ecfp", # NOTE: Should match with model name
    # "distance_metric": "tanimoto",
    "method": "generic",
    "distance_metric": "euclidean",
    "threshold": 0.35,
}

# --- wandb run name ------------------------------------------------------
# wandb_run_name = None
#
# The name is "<clustering>_<model>[_<reduction>][_<n_components>]".
# It is assembled with "_".join rather than a triple-quoted f-string, because
# the latter embedded the literal newlines and indentation in the run name.
# Only the last path segment of a hub-style model id ("org/model") is kept.
_model_short_name = model_name.rsplit("/", 1)[-1]
_run_name_parts = [clustering_method, _model_short_name]

if dimensionality_reduction_method is not None:
    _run_name_parts.append(str(dimensionality_reduction_method))

if dimensionality_reduction_method_kwargs is not None:
    # `.get` keeps the cell runnable when the kwargs omit "n_components".
    _reduced_dims = dimensionality_reduction_method_kwargs.get("n_components")
    if _reduced_dims is not None:
        _run_name_parts.append(str(_reduced_dims))

wandb_run_name = "_".join(_run_name_parts)

# Extra key/value pairs passed to the runner as `wandb_extra_configs`;
# use the commented line instead to send none.
# wandb_extra_configs = None
wandb_extra_configs = dict(proteins=protein_types)

In [10]:
# Build the runner from the data, model and method settings defined in the
# earlier cells. Arguments are grouped by concern; all are passed by keyword.
cluster_runner = ClusterRunner(
    # Input data and embedding model
    smiles_df=smiles_df,
    # smiles_df_path = None,
    model_name=model_name,
    device=device,
    random_state=random_state,
    # Dimensionality reduction
    dimensionality_reduction_method=dimensionality_reduction_method,
    dimensionality_reduction_method_kwargs=dimensionality_reduction_method_kwargs,
    # Clustering and its evaluation metric
    clustering_method=clustering_method,
    clustering_method_kwargs=clustering_method_kwargs,
    clustering_evaluation_method="silhouette",
    # Weights & Biases logging
    wandb_project_name=wandb_project_name,
    wandb_run_name=wandb_run_name,
    wandb_extra_configs=wandb_extra_configs,
)

In [11]:
# Sweep values handed to the multi-run call below (integers 3..20 for
# `n_clusters`). The commented `None` lines are the alternatives.
# thresholds = None
thresholds = [0.2, 0.35, 0.5, 0.8]

# n_clusters = None
n_clusters = [*range(3, 21)]

# Single-run alternative:
# cluster_runner.run_clustering()
cluster_runner.run_multiple_clustering(thresholds=thresholds, n_clusters=n_clusters)

# Drop the notebook's reference to the runner once the sweep is done.
del cluster_runner

VBox(children=(Label(value='0.003 MB of 0.020 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.136233…

0,1
silhouette_score,▄▁▂█
threshold,▁▃▄█

0,1
silhouette_score,-0.86692
threshold,0.8


In [7]:
# Idempotent cleanup: remove the runner reference if it still exists.
# A bare `del cluster_runner` raises NameError when the name is already gone
# (the run cell deletes it itself), which breaks "Restart & Run All".
globals().pop("cluster_runner", None)