In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project's source
# files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTES

- When should `wandb.run` be finished?
- When the kernel restarts, `wandb.run` is not finished, even though finishing it is specified in the `__del__` method.

In [2]:
import os
from thesis_work.data import load_data, load_mixed_interacted_compounds
from thesis_work.clustering.runner import ClusterRunner
import pandas as pd

# --- Weights & Biases ------------------------------------------------------
# WANDB_MODE="disabled" turns off all wandb logging; drop that entry to log.
# WANDB_NOTEBOOK_NAME tells wandb which notebook file this session belongs to.
os.environ.update(
    {
        "WANDB_MODE": "disabled",
        "WANDB_NOTEBOOK_NAME": "./clustering_class.ipynb",
    }
)
wandb_project_name = "clustering-class-test"

# --- Settings shared by the cells below -------------------------------------
random_state = 42
device = "cuda"

INFO:numba.cuda.cudadrv.driver:init


# DATA

## Ours

In [3]:
# Single-family dataset: choose exactly one protein family by leaving one
# assignment active.
# protein_type = "gpcr"
# protein_type = "protease"
protein_type = "kinase"

# Load the compounds for the selected family.
smiles_df = load_data(protein_type=protein_type)

# Later cells expect a list of families, even when there is only one.
protein_types = [protein_type]

## Ours - Mixed

In [3]:
# Mixed dataset: compounds from several protein families in one frame.
each_sample_size = 1000

# Keep the family names in alphabetical order and pair each with an integer
# label 0..N-1 following that order.
protein_types = sorted(["gpcr", "kinase", "protease"])
protein_labels = [label for label, _ in enumerate(protein_types)]

smiles_df = load_mixed_interacted_compounds(
    protein_types=protein_types,
    each_sample_size=each_sample_size,
    random_state=random_state,
    convert_category=True,
)

## BBBP

In [4]:
# BBBP (blood-brain barrier penetration) dataset from MoleculeNet.
# NOTE(review): rows are dropped below when fragment extraction yields a
# missing value; this presumes `get_largest_fragment_from_smiles` returns
# None/NaN for SMILES it cannot handle - verify against the helper.

from thesis_work.utils import get_largest_fragment_from_smiles

protein_types = ["BBBP"]

# Read in data from MoleculeNet
smiles_df = pd.read_csv(
    "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv"
)

# Clean up column names so they are easier to interpret
smiles_df = (
    smiles_df[["smiles", "p_np", "name"]]
    .reset_index(drop=True)
    .rename({"smiles": "text", "p_np": "labels"}, axis=1)
)

# Remove extra fragments in SMILES (typically salts, which are irrelevant to BBB permeability)
smiles_df["text"] = smiles_df["text"].apply(get_largest_fragment_from_smiles)

# Drop the rows themselves. Calling `.dropna()` on the Series before assigning
# it back to the column removes nothing: column assignment aligns on the index
# and re-inserts NaN for every row that was dropped from the Series.
smiles_df = smiles_df.dropna(subset=["text"]).reset_index(drop=True)
smiles_df["text"] = smiles_df["text"].astype(str)

# RUNNER

In [4]:
##################################################################################
# Embedding model: leave exactly one `model_name` active.

# model_name = "DeepChem/ChemBERTa-77M-MTR"
# model_name = "DeepChem/ChemBERTa-77M-MLM"
model_name = "ecfp"
# model_name = "chemprop"

##################################################################################
# Dimensionality reduction: leave exactly one method/kwargs pair active.

n_components = 25

# dimensionality_reduction_method = None
# dimensionality_reduction_method_kwargs = None

dimensionality_reduction_method = "UMAP"
dimensionality_reduction_method_kwargs = {
    "n_components": n_components,
    "n_neighbors": 15,
    "min_dist": 0.1,
    "metric": "euclidean",
}

# dimensionality_reduction_method = "PCA"
# dimensionality_reduction_method_kwargs = {
#     "n_components": n_components,
# }

##################################################################################
# Clustering: leave exactly one method/kwargs pair active.

# clustering_method = "K-MEANS"
# clustering_method_kwargs = {
#     "init_method": "k-means++",
#     "n_clusters": 3,
#     "n_init": 1,
# }

clustering_method = "BUTINA"
clustering_method_kwargs = {
    "method": "ecfp",  # NOTE: Should match with model name
    "distance_metric": "tanimoto",
    # "method": "generic",
    # "distance_metric": "euclidean",
    "threshold": 0.35,
}

# clustering_method = "DBSCAN"
# clustering_method_kwargs = {
#     "min_samples": 5,
#     "metric": "euclidean",
# }

# clustering_method = "HDBSCAN"
# clustering_method_kwargs = {
#     "min_cluster_size": 5,
#     "metric": "euclidean",
# }

##################################################################################
# W&B run name, e.g. "BUTINA_ecfp_UMAP_25".
#
# The name is assembled with "_".join so that it is a single line. A
# triple-quoted f-string would embed its own newlines and indentation in the
# run name.
#
# NOTE(review): the runner's log output in this notebook reports that it
# disables dimensionality reduction for BUTINA with the ecfp model, so the
# "_UMAP_25" suffix can describe a reduction that was not applied.

# wandb_run_name = None
# "org/model" style names are shortened to the part after the slash.
model_short_name = model_name.split("/")[1] if "/" in model_name else model_name
run_name_parts = [clustering_method, model_short_name]

if dimensionality_reduction_method is not None:
    run_name_parts.append(dimensionality_reduction_method)

if dimensionality_reduction_method_kwargs is not None:
    run_name_parts.append(str(dimensionality_reduction_method_kwargs["n_components"]))

wandb_run_name = "_".join(run_name_parts)

# Extra entries handed to ClusterRunner through its `wandb_extra_configs`
# argument; here they record which protein families / dataset were used.
# wandb_extra_configs = None
wandb_extra_configs = dict(proteins=protein_types)

In [5]:
# Build the runner from the configuration cells above. Arguments are grouped
# by concern; all are passed by keyword, so their order does not matter.
cluster_runner = ClusterRunner(
    # Weights & Biases bookkeeping
    wandb_project_name=wandb_project_name,
    wandb_run_name=wandb_run_name,
    wandb_extra_configs=wandb_extra_configs,
    # Input data (in-memory frame; the path variant is left disabled)
    smiles_df=smiles_df,
    # smiles_df_path = None,
    # Embedding model and execution settings
    model_name=model_name,
    device=device,
    random_state=random_state,
    # Dimensionality reduction
    dimensionality_reduction_method=dimensionality_reduction_method,
    dimensionality_reduction_method_kwargs=dimensionality_reduction_method_kwargs,
    # Clustering and its evaluation
    clustering_method=clustering_method,
    clustering_method_kwargs=clustering_method_kwargs,
    clustering_evaluation_method="silhouette",
)

INFO:thesis_work.clustering.runner:Disabling dimensionality reduction, since it is not working for BUTINA clustering with ecfp model


In [7]:
# Parameter grids for the multi-run sweep. They are only referenced by the
# commented-out `run_multiple_clustering` call further down; the active
# `run_clustering()` call does not take them.

# n_clusters = None
n_clusters = [*range(2, 100, 3)]

# thresholds = None
thresholds = [0.2, 0.35, 0.5, 0.8]

# min_samples = None
min_samples = [10, 20]

# min_cluster_sizes = None
min_cluster_sizes = [5, 10, 15, 20, 25]

# Single run with the configuration the runner was constructed with.
cluster_runner.run_clustering()
# cluster_runner.run_multiple_clustering(n_clusters=n_clusters, thresholds=thresholds, min_cluster_sizes=min_cluster_sizes)

# Drop the notebook's reference to the runner. Per the notes at the top of
# this notebook, finishing the wandb run is tied to the runner's `__del__`.
del cluster_runner

In [7]:
# Manual clean-up cell: drop the runner if it still exists.
# The guard matters because the previous cell already ends with
# `del cluster_runner`; an unconditional delete here raises NameError when the
# notebook is executed top to bottom.
if "cluster_runner" in globals():
    del cluster_runner

In [4]:
# Shell out to the wandb CLI to sync local run data.
# NOTE(review): `--clean-old-hours` appears to be documented for use together
# with `--clean`; without that flag this command probably only attempts a sync
# and deletes nothing - confirm the intent against the wandb CLI reference.
!wandb sync --clean-old-hours 4

[34m[1mwandb[0m: No runs to be synced.
