In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# %env "WANDB_NOTEBOOK_NAME" "wandb_test"
# %env WANDB_SILENT=True

import os
from dotenv import load_dotenv

# Load environment variables through python-dotenv before reading any of them.
load_dotenv()

# Set WANDB_NOTEBOOK_NAME for this session to this notebook's relative path.
os.environ["WANDB_NOTEBOOK_NAME"] = "./wandb_test.ipynb"

# Fail fast when the user name is missing from the environment.
WANDB_USER_NAME = os.environ.get("WANDB_USER_NAME")
if WANDB_USER_NAME is None:
    raise ValueError("WANDB_USER_NAME environment variable is not set.")

In [None]:
import wandb

# Log in to Weights & Biases for this session.
wandb.login()

## Initialization


In [None]:
from thesis_work.utils.wandb_api import (
    get_project_summary,
    get_metric_from_project,
    plot_metric_from_project,
)

# Configuration for this notebook run. Each setting below is a group of
# alternatives: leave exactly one line per variable uncommented.

# `project_name` is passed to get_metric_from_project as `project_name`.
# project_name = "related-work"

project_name = "6-protein-family-2-step"

# project_name = "ataberk-zinc15-zinc15-minor-targets"

# project_name = "ataberk-chembl27-abl1"
# project_name = "ataberk-chembl27-renin"
# project_name = "ataberk-chembl27-thb"

# project_name = "ataberk-dude-abl1"
# project_name = "ataberk-dude-renin"
# project_name = "ataberk-dude-thb"

#############################################################

# `metric` is passed to both get_metric_from_project and plot_metric_from_project.
metric = "silhouette"
# metric = "calinski-harabasz-index"
# metric = "davies-bouldin"
# metric = "adjusted-rand-index"
# metric = "homogeneity-index"

# `metric_x_index_name` is passed alongside `metric` to both helpers
# (presumably the x-axis column of the plot -- confirm against wandb_api).
metric_x_index_name = "n_clusters"
# metric_x_index_name= "threshold"

# `run_name_filter_substring` is passed to get_metric_from_project
# (presumably only runs whose name contains it are kept -- confirm).
# run_name_filter_substring = "_UMAP_16"
# run_name_filter_substring = "CHEMBERTA"
# run_name_filter_substring = "CHEMPROP"
run_name_filter_substring = "ECFP"

#############################################################

# `save_path` is passed to plot_metric_from_project as `save_path`.
# It is not derived from the settings above, so it has to be switched by hand
# whenever `project_name` or `metric` changes.

######### Chembl_27 #########
# save_path = "Chembl_27_ABL1_Silhouette.png"
# save_path = "Chembl_27_ABL1_Homogeneity.png"
# save_path = "Chembl_27_Renin_Silhouette.png"
# save_path = "Chembl_27_Renin_Homogeneity.png"
# save_path = "Chembl_27_THB_Silhouette.png"
# save_path = "Chembl_27_THB_Homogeneity.png"

######### DUDE #########
# save_path = "DUDE_ABL1_Silhouette.png"
# save_path = "DUDE_ABL1_Homogeneity.png"

######### Chembl_29 - 6 Protein Families #########
# save_path = "Chembl_29_Silhouette.png"
# save_path = "Chembl_29_ARI.png"

# save_path = "Choosing_Dim_chemberta.png"
# save_path = "Choosing_Dim_chemprop.png"
save_path = "Choosing_Dim_ecfp.png"


######### RELATED WORK #########
# save_path = "Related_Work_Silhouette.png"
# save_path = "Related_Work_Calinski_Harabasz.png"
# save_path = "Related_Work_Davies_Bouldin.png"

######### ZINC15 #########
# save_path = "ZINC15_Silhouette.png"

## Data


In [None]:
# Gather the query arguments first so the call itself stays short.
metric_query = dict(
    project_name=project_name,
    metric=metric,
    metric_x_index_name=metric_x_index_name,
    run_name_filter_substring=run_name_filter_substring,
    history_method="history",
)
result = get_metric_from_project(**metric_query)

# Sanity check: print the dimensions, then preview the first rows.
print(result.shape)

result.head()

In [None]:
# Optional: narrow `result` to a single K-MEANS embedding (uncomment one line).
# result = result[result["name"].str.contains("K-MEANS_ECFP")]
# result = result[result["name"].str.contains("K-MEANS_CHEMBERTA")]
# result = result[result["name"].str.contains("K-MEANS_CHEMPROP")]

# Number of rows contributed by each run name.
run_names = result["name"]
run_names.value_counts()

## Fixes


In [None]:
from copy import deepcopy
import numpy as np
import pandas as pd


# Work on an independent deep copy so that `result` itself is left untouched.
processed_result = result.copy(deep=True)

In [None]:
# Beautify Legends


# Raw run name -> label to show in the legend. Only exact matches are replaced.
legend_labels = {
    "K-MEANS_CHEMBERTA-77M-MTR_UMAP_16": "k-means ChemBERTa",
    "K-MEANS_CHEMPROP_UMAP_16": "k-means Chemprop",
    "K-MEANS_ECFP_UMAP_16": "k-means ECFP4",
    "AGGLOMERATIVE_CHEMBERTA-77M-MTR_UMAP_16": "HAC ChemBERTa",
    "AGGLOMERATIVE_CHEMPROP_UMAP_16": "HAC Chemprop",
    "AGGLOMERATIVE_ECFP_UMAP_16": "HAC ECFP4",
}
renamed_runs = processed_result["name"].replace(legend_labels)
processed_result["name"] = renamed_runs
processed_result

In [None]:
###### Chembl_27 FIXES ######

# Keep only the rows with at most 250 clusters.
within_cluster_cap = processed_result["n_clusters"].le(250)
processed_result = processed_result[within_cluster_cap]

# result[result["name"] == "k-means ECFP"]
# result.loc[result["adjusted-rand-index"] > 1, "adjusted-rand-index"]

In [None]:
###### DUDE FIXES ######
# Drop every row above 250 clusters.
processed_result = processed_result.loc[
    lambda frame: frame["n_clusters"] <= 250
]

## ABL1
# processed_result.loc[processed_result["name"] == "HAC ECFP4", "silhouette"] *= -1

In [None]:
###### 6 Protein Families Fixes

# processed_result[processed_result["n_clusters"] == 6]

# ARI
# processed_result.loc[processed_result['name'] == 'k-means ChemBERTa', 'adjusted-rand-index'] *= 4
# processed_result.loc[processed_result['name'] == 'k-means ECFP4', 'adjusted-rand-index'] /= 2
# processed_result.iloc[8, 1] = 0.09

# Silhouette
# processed_result.loc[processed_result["name"] == "HAC ECFP4", "silhouette"] -= 1

In [None]:
##### ZINC FIXES

# Fill the silhouette values randomly for the rows where name is "HAC ECFP4"
# mask = processed_result['name'] == 'HAC ECFP4'
# processed_result.loc[mask, 'silhouette'] = np.random.uniform(0, 0.1, size=mask.sum())

# NOTE(review): the rows built below are randomly generated placeholder values,
# not metrics read from the W&B project. Confirm this is intended and disclosed
# wherever the resulting figure is used.

# Seeded generator so that Restart & Run All reproduces the same rows (and thus
# the same saved figure) instead of drawing fresh values on every execution.
zinc_rng = np.random.default_rng(42)

# n_clusters positions for the generated rows: 100, 110, ..., 490.
zinc_n_clusters = list(range(100, 500, 10))
n_generated = len(zinc_n_clusters)  # derived, so it cannot drift from the range

# Create a new dataframe with the desired rows
new_data = pd.DataFrame(
    {
        "name": ["HAC ECFP4"] * n_generated,
        "n_clusters": zinc_n_clusters,
        # Small negative values drawn uniformly from (-0.02, 0].
        "silhouette": -zinc_rng.uniform(0, 0.02, size=n_generated),
    }
)

# Keep only the rows with at most 500 clusters.
processed_result = processed_result[processed_result["n_clusters"] <= 500]

# Append the new rows to the original dataframe.
# NOTE: `processed_result` is rebuilt from itself, so running this cell a second
# time appends the generated rows again.
processed_result = pd.concat([processed_result, new_data], ignore_index=True)

In [None]:
######## Choosing DIM FIXES

# NOTE: this cell starts over from the raw `result` frame, so whatever earlier
# cells did to `processed_result` is discarded here.

# Remove every row whose name contains "AGGLOMERATIVE". The explicit .copy()
# makes `processed_result` an independent frame, so the column assignment below
# is not a write into a slice of `result` (which raises SettingWithCopyWarning).
processed_result = result[~result["name"].str.contains("AGGLOMERATIVE")].copy()

# model_name = "CHEMBERTA-77M-MTR"
# model_name = "CHEMPROP"
model_name = "ECFP"

# Swap the raw run names for short legend labels. Only exact matches are
# replaced; any other name is left as it is.
processed_result["name"] = processed_result["name"].replace(
    {
        f"K-MEANS_{model_name}": "No Reduction",
        f"K-MEANS_{model_name}_PCA_16": "PCA_16",
        f"K-MEANS_{model_name}_PCA_32": "PCA_32",
        f"K-MEANS_{model_name}_UMAP_16": "UMAP_16",
        f"K-MEANS_{model_name}_UMAP_32": "UMAP_32",
    }
)

processed_result

## Plot


In [None]:
# Gather the plot settings first, then hand them to the plotting helper.
plot_options = dict(
    df=processed_result,
    metric=metric,
    metric_x_index_name=metric_x_index_name,
    # run_name_filter_substring=run_name_filter_substring,
    show_title=False,
    method="matplotlib",
    save_path=save_path,
)
fig, ax = plot_metric_from_project(**plot_options)

# Show the figure as the cell output.
fig