In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before each
# cell is executed, so edits to imported project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2


- When should `wandb.run` be finished?
- When the kernel restarts, `wandb.run` is not finished, even though I finish it in the `__del__` method.

In [2]:
import os
from thesis_work.utils.data import load_data, load_mixed_interacted_compounds
from thesis_work.clustering.runner import ClusterRunner
import pandas as pd

# wandb environment switches:
#   WANDB_MODE="disabled"  -> disables all wandb logging
#   WANDB_NOTEBOOK_NAME    -> notebook path reported to wandb
os.environ.update(
    {
        "WANDB_MODE": "disabled",
        "WANDB_NOTEBOOK_NAME": "./generic.ipynb",
    }
)

# wandb project the runs are logged under.
# wandb_project_name = "clustering-6-targets"
wandb_project_name = "CPU-vs-GPU-2"

# Seed and compute device passed to the runner in later cells.
random_state, device = 42, "cuda"  # TODO: Uncomment for generic experiments

INFO:numba.cuda.cudadrv.driver:init


# DATA

## Ours

In [4]:
# Single-target experiment: pick exactly one protein family.
# protein_type = "protease"
# protein_type = "gpcr"
protein_type = "kinase"
protein_types = [protein_type]

# Load the compounds for the chosen family and keep only the first 100 rows.
smiles_df = load_data(protein_type=protein_type)[:100]

Unnamed: 0,text,labels
0,O=C(Cc1cccc2ccccc12)Nc1n[nH]c2ccc(N3CCCS3(=O)=...,1
1,COC(=O)NC[C@@H](NC(=O)c1ccc(-c2nc(C3CCOCC3)cnc...,1
2,COc1ccccc1Nc1cc(Oc2cc(C)c(C)nc2-c2ccccn2)ccn1,1
3,O=C(/C=C/CN1CCCC1)N1CCOc2cc3ncnc(Nc4ccc(F)c(Cl...,1
4,O=C(Nc1cccc(Nc2cc3c(=O)[nH][nH]c(=O)c3cc2Cl)c1...,1
...,...,...
95,CN(C)CCCCc1cc2c(cc1O)c1c3c(c(-c4ccccc4Cl)cc1n2...,1
96,COCCOc1cc2ncc3c(N)nc(-n4ccnc4)cc3c2cc1OC,1
97,COC(=O)c1nc2ccc3ncnc(Nc4ccc(Cl)cc4Cl)c3c2s1,1
98,CCC#CC(=O)Nc1cccc(-c2cnc3[nH]ccc3c2)n1,1


## Ours - Mixed

In [3]:
# Value forwarded to the loader as `each_sample_size`.
each_sample_size = 1000

# Protein families to mix, kept in alphabetical order.
protein_types = sorted(
    [
        "gpcr",
        "ionchannel",
        "kinase",
        "nuclearreceptor",
        "protease",
        "transporter",
    ]
)
# One integer label per family: 0 .. len(protein_types) - 1.
protein_labels = [*range(len(protein_types))]

# Build the mixed dataset from all listed families.
mixed_loader_kwargs = {
    "protein_types": protein_types,
    "each_sample_size": each_sample_size,
    "random_state": random_state,
    "convert_labels": False,
}
smiles_df = load_mixed_interacted_compounds(**mixed_loader_kwargs)

## BBBP

In [4]:
# Load the BBBP dataset, keep the largest fragment of every SMILES and drop the rows
# for which no cleaned SMILES is available.

from thesis_work.utils.utils import get_largest_fragment_from_smiles

protein_types = ["BBBP"]

# Read in data from MoleculeNet
smiles_df = pd.read_csv(
    "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv"
)

# Clean up column names so they are easier to interpret
smiles_df = (
    smiles_df[["smiles", "p_np", "name"]]
    .reset_index(drop=True)
    .rename({"smiles": "text", "p_np": "labels"}, axis=1)
)

# Remove extra fragments in SMILES (typically salts, which are irrelevant to BBB permeability)
largest_fragments = smiles_df["text"].apply(get_largest_fragment_from_smiles)

# Drop the rows whose cleaned SMILES is missing.
# NOTE: calling `.dropna()` on the Series and assigning it back to the column does NOT
# remove anything -- the assignment aligns on the index and re-inserts NaN for the
# dropped positions. The rows therefore have to be dropped on the DataFrame itself.
# (Presumably the helper returns None/NaN for SMILES it cannot parse -- TODO confirm.)
smiles_df = (
    smiles_df.assign(text=largest_fragments)
    .dropna(subset=["text"])
    .astype({"text": str})
    .reset_index(drop=True)
)

# RUNNER

In [4]:
# Runner configuration: embedding model, dimensionality reduction, clustering method
# and the wandb run name derived from them. Commented-out entries are alternative
# settings that can be swapped in.

##################################################################################
num_threads = None


##################################################################################

model_name = "DeepChem/ChemBERTa-77M-MTR"
# model_name = "DeepChem/ChemBERTa-77M-MLM"
# model_name = "ecfp"
# model_name = "chemprop"

##################################################################################

n_components = 25

# dimensionality_reduction_method = None
# dimensionality_reduction_method_kwargs = None

dimensionality_reduction_method = "UMAP"
dimensionality_reduction_method_kwargs = {
    "n_components": n_components,
    "n_neighbors": 15,
    "min_dist": 0.1,
    "metric": "euclidean",
}

# dimensionality_reduction_method = "PCA"
# dimensionality_reduction_method_kwargs = {
#     "n_components": n_components,
# }

##################################################################################

clustering_method = "K-MEANS"
clustering_method_kwargs = {
    "init_method": "k-means++",
    "n_clusters": 6,
    "n_init": 1,
}

# clustering_method = "BUTINA"
# clustering_method_kwargs = {
#     # "distance_metric": "tanimoto",
#     "distance_metric": "euclidean",
#     "threshold": 0.35,
# }

# clustering_method = "DBSCAN"
# clustering_method_kwargs = {
#     "min_samples": 5,
#     "metric": "euclidean",
# }

# clustering_method = "HDBSCAN"
# clustering_method_kwargs = {
#     "min_cluster_size": 5,
#     "metric": "euclidean",
# }

##################################################################################

wandb_extra_configs = None
# wandb_extra_configs = {"proteins": protein_types} # TODO: Uncomment for generic experiments


# wandb_run_name = None
# Build the run name as "<clustering>_<model>[_<reduction>[_<n_components>]]".
# A single-line f-string is used on purpose: a triple-quoted one would embed the
# newlines and indentation of the source text in the run name.
model_short_name = model_name if "/" not in model_name else model_name.split("/")[1]
wandb_run_name = f"{clustering_method}_{model_short_name}"

if dimensionality_reduction_method is not None:
    wandb_run_name += f"_{dimensionality_reduction_method}"

# Only append the component count when the kwargs actually define it, so reduction
# settings without "n_components" do not raise KeyError here.
if (
    dimensionality_reduction_method_kwargs is not None
    and "n_components" in dimensionality_reduction_method_kwargs
):
    wandb_run_name += f"_{dimensionality_reduction_method_kwargs['n_components']}"

## GENERIC TESTS

In [5]:
# Collect the settings chosen in the previous cells and build the runner from them.
runner_settings = dict(
    wandb_project_name=wandb_project_name,
    wandb_run_name=wandb_run_name,
    wandb_extra_configs=wandb_extra_configs,
    smiles_df=smiles_df,
    # smiles_df_path = None,
    model_name=model_name,
    random_state=random_state,
    device=device,
    dimensionality_reduction_method=dimensionality_reduction_method,
    dimensionality_reduction_method_kwargs=dimensionality_reduction_method_kwargs,
    clustering_method=clustering_method,
    clustering_method_kwargs=clustering_method_kwargs,
)
cluster_runner = ClusterRunner(**runner_settings)

In [6]:
# Sweep grids. They are only referenced by the commented-out
# `run_multiple_clustering` call below; the active `run_clustering` call does not use them.
n_clusters = [*range(2, 100)]  # alternative: None
thresholds = [0.2, 0.35, 0.5, 0.8]  # alternative: None
min_samples = [10, 20]  # alternative: None
min_cluster_sizes = [5, 10, 15, 20, 25]  # alternative: None

# Run a single clustering pass.
cluster_runner.run_clustering()
# cluster_runner.run_multiple_clustering(
#     n_clusters=n_clusters, thresholds=thresholds, min_cluster_sizes=min_cluster_sizes
# )

# Drop the notebook's reference to the runner once the run has completed.
del cluster_runner

ValueError: data must be a string or an io object

In [7]:
# Manual cleanup cell: drop the runner reference if it still exists (e.g. because an
# earlier cell raised before reaching its own `del`). The guard keeps this cell from
# raising NameError when the runner has already been deleted or the cell is re-run.
if "cluster_runner" in globals():
    del cluster_runner

In [2]:
# Shell out to the wandb CLI to sync locally stored runs.
# NOTE(review): the wandb CLI documents `--clean-old-hours` as an option to be used
# together with `--clean`; without `--clean` this looks like a plain sync and the
# 4-hour threshold may have no effect -- confirm the intended behaviour.
!wandb sync --clean-old-hours 4

[34m[1mwandb[0m: No runs to be synced.


## CPU VS GPU

In [None]:
# NOTE: To force CPU execution even when a GPU is available

# import os
# from cuml.common.device_selection import using_device_type

# os.environ["CUDA_VISIBLE_DEVICES"] = ""

# with using_device_type('cpu'):
#     pass

In [None]:
import wandb
import time

# Benchmark: time `run_clustering` for increasing numbers of molecules and log the
# elapsed time per size to wandb. A run id is generated once and resuming is allowed
# through the WANDB_RESUME / WANDB_RUN_ID environment variables set below.
os.environ["WANDB_RESUME"] = "allow"
os.environ["WANDB_RUN_ID"] = wandb.util.generate_id()

num_threads = None
# num_threads = 1

# wandb_run_name = None
wandb_run_name = "GPU"

device = "cuda"
# device = "cpu"

# mol_nums = [100, 200, 500, 1_000, 5_000, 10_000]
mol_nums = [100, 200, 500, 1_000, 5_000, 10_000, 20_000]

# The target and its data do not depend on the loop variable, so they are set up
# once here instead of being reloaded on every iteration.
protein_type = "kinase"
protein_types = [protein_type]
full_smiles_df = load_data(protein_type=protein_type)

for mol_num in mol_nums:
    wandb_extra_configs = {"proteins": protein_types}

    # Independent copy of the first `mol_num` rows, so nothing the runner does to
    # its input can leak into the shared full frame.
    smiles_df = full_smiles_df[:mol_num].copy()

    cluster_runner = ClusterRunner(
        wandb_project_name=wandb_project_name,
        wandb_run_name=wandb_run_name,
        wandb_extra_configs=wandb_extra_configs,
        smiles_df=smiles_df,
        # smiles_df_path = None,
        model_name=model_name,
        random_state=random_state,
        device=device,
        dimensionality_reduction_method=dimensionality_reduction_method,
        dimensionality_reduction_method_kwargs=dimensionality_reduction_method_kwargs,
        clustering_method=clustering_method,
        clustering_method_kwargs=clustering_method_kwargs,
    )

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # durations; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    cluster_runner.run_clustering()
    elapsed_seconds = time.perf_counter() - start_time

    # Log the number of rows actually clustered: slicing silently truncates when the
    # dataset holds fewer than `mol_num` rows.
    wandb.log({"running_time": elapsed_seconds, "molecule_number": len(smiles_df)})
    del cluster_runner

In [2]:
import wandb

# Finish the wandb run if one is still active in this kernel.
active_run = wandb.run
if active_run is not None:
    wandb.finish()