# Knife MI analysis

In [None]:
import os

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Dataset to analyse; compared with the first "_"-separated token of result file names.
DATASET: str = "HIV"
# Length setting; compared (as a string) with the length token of result/descriptor file names.
LENGTH: int = 2048

In [None]:
# Collect the loss curves of every run matching DATASET and LENGTH.
# Selection relies on "_"-separated file names: the first token must equal
# DATASET and the second-to-last token must equal LENGTH.
LOSS_DIR = "results/losses"

loss_frames = []
for file in os.listdir(LOSS_DIR):
    if not file.endswith(".csv"):
        continue
    file_split = file[:-4].split("_")
    # A name with a single token has no second-to-last token; skip it instead
    # of failing with IndexError.
    if len(file_split) < 2:
        continue
    if file_split[0] == DATASET and file_split[-2] == str(LENGTH):
        loss_frames.append(pd.read_csv(os.path.join(LOSS_DIR, file)))

if not loss_frames:
    # pd.concat([]) would raise a far less helpful "No objects to concatenate".
    raise FileNotFoundError(
        f"No loss file in {LOSS_DIR!r} matches DATASET={DATASET!r} and LENGTH={LENGTH!r}"
    )

full_df_loss = pd.concat(loss_frames)
full_df_loss

In [None]:
# One column per model (Y); top row shows the X->Y loss, bottom row the Y->X loss.
# Each panel draws one curve per descriptor (hue="X").
models = full_df_loss.Y.dropna().unique()
n_models = len(models)

# squeeze=False keeps `axes` two-dimensional even when only one model is
# present, so axes[row, col] indexing always works.
fig, axes = plt.subplots(2, n_models, figsize=(4 * n_models, 8), squeeze=False)

for col, model in enumerate(models):
    is_model = full_df_loss.Y == model
    for row, direction in enumerate(("X->Y", "Y->X")):
        ax = axes[row, col]
        curves = full_df_loss[is_model & (full_df_loss.direction == direction)]
        sns.lineplot(data=curves, x="epoch", y="loss", hue="X", ax=ax)
        # Only the left-most column carries the y-axis label.
        ax.set_ylabel(f"{direction} loss" if col == 0 else "")
    axes[0, col].set_title(model)
    axes[0, col].set_xlabel("")
    axes[1, col].set_xlabel("Epoch")



## MI between descriptors and embeddings

In [None]:


import os

# Read every result CSV whose name starts with DATASET and ends with LENGTH
# ("_"-separated tokens, extension stripped) and stack them into one frame.
matching_results = [
    name
    for name in os.listdir("results")
    if name.endswith(".csv")
    and name[:-4].split("_")[0] == DATASET
    and name[:-4].split("_")[-1] == str(LENGTH)
]
all_df = [pd.read_csv(os.path.join("results", name)) for name in matching_results]
df = pd.concat(all_df)

In [None]:
# Show the rows that contain at least one missing value.
df.loc[df.isna().any(axis=1)]

## Clustermap

In [None]:
import numpy as np
from tqdm import tqdm

# Maps "<descriptor><LENGTH>" to the number of columns of the stored descriptor array.
dim_desc = {}
descriptor_dir = os.path.join("data", DATASET)

for file in tqdm(os.listdir(descriptor_dir)):
    if not file.endswith(".npy"):
        continue
    name_parts = file[:-4].split("_")
    # Names without a second "_"-separated token cannot match LENGTH; skip them
    # instead of failing with IndexError.
    if len(name_parts) < 2 or name_parts[1] != str(LENGTH):
        continue
    # Memory-map the file: only the shape is needed, not the data.
    desc_val = np.load(os.path.join(descriptor_dir, file), mmap_mode="r")
    dim_desc[name_parts[0] + str(LENGTH)] = desc_val.shape[1]
dim_desc

In [None]:
# Divisor used for I(X->Y).
# NOTE(review): 300 was hard-coded in the original cell, presumably the embedding
# width of the models in df.Y — confirm it holds for every model.
EMBEDDING_DIM = 300

# Descriptor width of every row, looked up once instead of once per row.
desc_dim = df["X"].map(dim_desc)
unknown_desc = list(df.loc[desc_dim.isna(), "X"].unique())
if unknown_desc:
    # The original per-row dict lookup raised KeyError here; keep failing loudly.
    raise KeyError(f"No entry in dim_desc for descriptor(s): {unknown_desc}")

df["I(Y->X)/dim"] = df["I(Y->X)"] / desc_dim
df["I(X->Y)/dim"] = df["I(X->Y)"] / EMBEDDING_DIM

df["I(Y->X)/logdim"] = df["I(Y->X)"] / np.log(desc_dim)
df["I(X->Y)/logdim"] = df["I(X->Y)"] / np.log(EMBEDDING_DIM)

# Differences between the two directions, raw and per normalisation.
df["I(Y->X) - I(X->Y)"] = df["I(Y->X)"] - df["I(X->Y)"]
df["I(Y->X)/dim - I(X->Y)/dim"] = df["I(Y->X)/dim"] - df["I(X->Y)/dim"]
df["I(Y->X)/logdim - I(X->Y)/logdim"] = df["I(Y->X)/logdim"] - df["I(X->Y)/logdim"]

In [None]:
keys = [
    "I(Y->X)", "I(X->Y)", "I(Y->X)/dim", "I(X->Y)/dim", "I(Y->X)/logdim", "I(X->Y)/logdim",
    "I(Y->X) - I(X->Y)", "I(Y->X)/dim - I(X->Y)/dim", "I(Y->X)/logdim - I(X->Y)/logdim"
]

# Baseline table: the first "Not-trained" row of each descriptor X (the same row
# the original per-row lookup picked), built once instead of once per row and key.
untrained = (
    df[df.Y == "Not-trained"]
    .drop_duplicates(subset="X", keep="first")
    .set_index("X")
)
no_baseline = set(df.X.unique()) - set(untrained.index)
if no_baseline:
    raise ValueError(f'No "Not-trained" row for descriptor(s): {sorted(map(str, no_baseline))}')

# "<key>_normed" = value minus the untrained baseline of the same descriptor.
for key in keys:
    df[key + "_normed"] = df[key] - df.X.map(untrained[key])


In [None]:
def plot_cmap(df, key):
    """Show the clustermaps of `key` and `f"{key}_normed"` side by side.

    Both maps pivot `df` to X (rows) by Y (columns) using the mean of the
    value column. Each clustermap is rendered to a PNG under ``fig/`` and the
    two images are then placed in a single 1x2 figure.

    Args:
        df: frame with columns "X", "Y", `key` and `f"{key}_normed"`.
        key: name of the column to plot.

    Side effects:
        Overwrites ``fig/std_cmap.png`` and ``fig/norm_cmap.png``.
    """
    import matplotlib.image as mpimg

    # savefig does not create missing directories.
    os.makedirs("fig", exist_ok=True)

    std_cmap = sns.clustermap(
        df.pivot_table(index="X", columns="Y", values=key, aggfunc="mean"),
        cmap="viridis", figsize=(8,8)
    )
    std_cmap.savefig("fig/std_cmap.png")
    # Close the intermediate figure so it is neither displayed nor kept in memory.
    plt.close(std_cmap.fig)

    norm_cmap = sns.clustermap(
        df.pivot_table(index="X", columns="Y", values=f"{key}_normed", aggfunc="mean"),
        cmap="coolwarm", center=0, figsize=(8,8),
    )
    norm_cmap.savefig("fig/norm_cmap.png")
    plt.close(norm_cmap.fig)

    # Re-load both saved images into one figure with two panels.
    fig, axes = plt.subplots(1,2, figsize=(16,8))
    axes[0].imshow(mpimg.imread("fig/std_cmap.png"))
    axes[0].axis("off")
    axes[0].set_title("Standard MI")
    axes[1].imshow(mpimg.imread("fig/norm_cmap.png"))
    axes[1].axis("off")
    axes[1].set_title("Normalized MI")



In [None]:
# Raw and baseline-subtracted clustermaps of I(X->Y).
plot_cmap(df, "I(X->Y)")

In [None]:
# Raw and baseline-subtracted clustermaps of I(Y->X).
plot_cmap(df, "I(Y->X)")

In [None]:
# Raw and baseline-subtracted clustermaps of the difference I(Y->X) - I(X->Y).
plot_cmap(df, "I(Y->X) - I(X->Y)")

In [None]:
# Same view for I(X->Y) divided by the fixed dimension.
plot_cmap(df, "I(X->Y)/dim")

In [None]:
# Same view for I(Y->X) divided by the descriptor dimension.
plot_cmap(df, "I(Y->X)/dim")

In [None]:
# Same view for I(Y->X) divided by the log of the descriptor dimension.
plot_cmap(df, "I(Y->X)/logdim")

In [None]:
# Same view for I(X->Y) divided by the log of the fixed dimension.
plot_cmap(df, "I(X->Y)/logdim")

In [None]:
# Same view for the difference of the two log-dimension-normalised quantities.
plot_cmap(df, "I(Y->X)/logdim - I(X->Y)/logdim")

In [None]:
# Display the descriptor -> dimension mapping used for the normalisations.
dim_desc

In [None]:
# Clustered heatmap of the baseline-subtracted I(Y->X): descriptors (X) as rows,
# models (Y) as columns, diverging colour scale centred on zero.
yx_normed_table = df.pivot_table(
    index="X",
    columns="Y",
    values="I(Y->X)_normed",
    aggfunc="mean",
)
sns.clustermap(yx_normed_table, cmap="coolwarm", figsize=(8,8), center=0)

In [None]:
# Normalize each I(X->Y) by subtracting the untrained model's I(X->Y) for the same descriptor.
# The baseline is the first "Not-trained" row of each descriptor X, looked up once
# for the whole frame instead of filtering the frame again for every row.
untrained_xy = (
    df[df.Y == "Not-trained"]
    .drop_duplicates(subset="X", keep="first")
    .set_index("X")["I(X->Y)"]
)
no_baseline_xy = set(df.X.unique()) - set(untrained_xy.index)
if no_baseline_xy:
    raise ValueError(f'No "Not-trained" row for descriptor(s): {sorted(map(str, no_baseline_xy))}')
df["I(X->Y)_normed"] = df["I(X->Y)"] - df.X.map(untrained_xy)

# Fine-tuning

In [None]:
!pip install networkx==2.8.8

In [None]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.0+cu118.html --force

In [None]:
import os
from tdc.single_pred import Tox

from sklearn.model_selection import train_test_split
import datamol as dm
import pandas as pd
import numpy as np
import torch

from evaluation import get_dataloaders, Feed_forward
from precompute_3d import precompute_3d

# NOTE: `df` is rebound here to the ClinTox table; whatever was stored under
# that name before this cell is no longer reachable afterwards.
df = Tox(name="ClinTox").get_data()
mols, smiles = precompute_3d(df["Drug"].to_numpy(), "ClinTox")

# Keep the rows whose converted SMILES is among those returned by precompute_3d
# and whose original SMILES contains no "*".
known_smiles = set(smiles)  # constant-time membership test
valid_indices = []
converted_smiles = []
for i, s in enumerate(df["Drug"]):
    converted = dm.to_smiles(dm.to_mol(s), True, False)
    if converted in known_smiles and "*" not in s:
        valid_indices.append(i)
        converted_smiles.append(converted)

# NOTE(review): as in the original cell, the k-th kept row is paired with mols[k];
# this assumes precompute_3d returns molecules in exactly that order — confirm.
mols_valid = [mols[k] for k in range(len(valid_indices))]

# Work on an explicit copy so the column assignments below modify `df` itself
# (no chained assignment, no SettingWithCopyWarning).
df = df.iloc[valid_indices].copy()
df["Drug"] = converted_smiles
df["Mol"] = mols_valid

smiles = df["Drug"].to_numpy()
y = df["Y"].to_numpy()

df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)
df.sample(5)

In [None]:
def run_expe(model_path = "backbone_pretrained_models/GraphLog/Contextual.pth", desc_name="", n_epochs = 100, plot=False):
    """Train one feed-forward classifier on the global train/test split.

    Args:
        model_path: checkpoint path forwarded to `get_dataloaders`; the literal
            string "None" is treated as "no model" when building the plot title.
        desc_name: descriptor name forwarded to `get_dataloaders`.
        n_epochs: number of epochs passed to `train_model`.
        plot: when true, call `model.plot_loss` after training.

    Returns:
        The trained `Feed_forward` instance.
    """
    # The original body referenced `smiles_train` / `smiles_test`, which are not
    # defined anywhere in this notebook. Use the same inputs as the other
    # `get_dataloaders` call sites of this notebook instead.
    # NOTE(review): argument order copied from those call sites — confirm against
    # the `get_dataloaders` signature in evaluation.py.
    dataloader_train, dataloader_test, input_dim = get_dataloaders(
        df_train.Drug, y_train, df_test.Drug, y_test, df_train["Mol"], df_test["Mol"],
        desc_name=desc_name, model_path=model_path)
    model = Feed_forward(
        input_dim = input_dim,
        hidden_dim = 128,
        output_dim = 1,
        n_layers = 1,
        d_rate=0.3,
        norm="batch"
    )
    model.train_model(dataloader_train, dataloader_test, n_epochs=n_epochs)
    if model_path == "None":
        model_name = "None"
    else:
        # Name of the directory holding the checkpoint; unlike split("/")[-2]
        # this yields "" instead of raising for a path without a directory part.
        model_name = os.path.basename(os.path.dirname(model_path)) + " "
    if plot:
        model.plot_loss(title = model_name + " " + desc_name)
    return model

In [None]:
# Descriptor names evaluated in the fine-tuning experiments below.
descriptors = [
    "physchem",
    "ecfp-count",
    "ecfp",
    "estate",
    "erg",
    "rdkit",
    "topological",
    "avalon",
    "maccs",
    "scaffoldkeys",
    "cats",
    "default",
    "gobbi",
    "pmapper",
    # 3D variants
    "cats/3D",
    "gobbi/3D",
    "pmapper/3D",
]

In [None]:
# Mean I(X->Y) for every (X, Y) pair found in the ClinTox results file.
df_mi = (
    pd.read_csv("resultsClinTox.csv")
    .groupby(["X", "Y"])["I(X->Y)"]
    .mean()
)

In [None]:
from tqdm import tqdm
# One entry per (descriptor, run); turned into a DataFrame in the next cell.
df_desc = {"descriptor":[], "best_acc": [], "best_f1":[], "best_roc":[], "best_aucpr":[]}

N_RUNS = 5

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=len(descriptors) * N_RUNS, desc="Fine tuning", position=0, leave=True) as p_bar:
    for d in descriptors:
        # The loaders depend only on the descriptor, so they are built once and
        # shared by all runs of that descriptor.
        dataloader_train, dataloader_test, input_dim = get_dataloaders(
            df_train.Drug, y_train, df_test.Drug, y_test, df_train["Mol"], df_test["Mol"],
            desc_name=d, model_path="None")
        for _ in range(N_RUNS):
            model = Feed_forward(
                input_dim = input_dim,
                hidden_dim = 128,
                output_dim = 1,
                n_layers = 1,
                d_rate=0.3,
                norm="batch"
            )
            model.train_model(dataloader_train, dataloader_test, n_epochs=300)
            # Record the run only after training has finished, so every column
            # keeps the same length if a run is interrupted.
            # Each "best_*" is the maximum of the corresponding model.test_* values.
            df_desc["descriptor"].append(d)
            df_desc["best_acc"].append(np.max(model.test_acc))
            df_desc["best_f1"].append(np.max(model.test_f1))
            df_desc["best_roc"].append(np.max(model.test_roc))
            df_desc["best_aucpr"].append(np.max(model.test_aucpr))
            p_bar.update(1)



In [None]:
# Turn the per-run records collected above into a DataFrame (rebinding the same name).
df_desc = pd.DataFrame(df_desc)
df_desc


In [None]:
MODEL_PATH = "backbone_pretrained_models"
# Maps a model name (sub-directory of MODEL_PATH) to the path of its .pth file.
MODELS = {}
# Sorted listings make the result independent of the file system's listing order.
for model_name in sorted(os.listdir(MODEL_PATH)):
    model_dir = os.path.join(MODEL_PATH, model_name)
    # Skip stray files (README, .DS_Store, ...): listing them would raise.
    if not os.path.isdir(model_dir):
        continue
    for file_name in sorted(os.listdir(model_dir)):
        if file_name.endswith(".pth"):
            # If a directory holds several .pth files, the last one (in sorted order) wins.
            MODELS[model_name] = os.path.join(model_dir, file_name)
# Extra entry with an empty path, presumably the randomly initialised baseline — TODO confirm.
MODELS["Not-trained"] = ""

In [None]:
from tqdm import tqdm
# One entry per (model, run); turned into a DataFrame in a later cell.
df_model = {"model":[], "best_acc": [], "best_f1":[], "best_roc":[], "best_aucpr":[]}

N_RUNS = 5

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=len(MODELS) * N_RUNS, desc="Fine tuning", position=0, leave=True) as p_bar:
    for model_name, model_path in MODELS.items():
        # The loaders depend only on the model, so they are built once and
        # shared by all runs of that model.
        dataloader_train, dataloader_test, input_dim = get_dataloaders(
            df_train.Drug, y_train, df_test.Drug, y_test, df_train["Mol"], df_test["Mol"],
            desc_name="None", model_path=model_path)
        for _ in range(N_RUNS):
            model = Feed_forward(
                input_dim = input_dim,
                hidden_dim = 128,
                output_dim = 1,
                n_layers = 1,
                d_rate=0.3,
                norm="batch"
            )
            model.train_model(dataloader_train, dataloader_test, n_epochs=300)
            # Record the run only after training has finished, so every column
            # keeps the same length if a run is interrupted.
            # Each "best_*" is the maximum of the corresponding model.test_* values.
            df_model["model"].append(model_name)
            df_model["best_acc"].append(np.max(model.test_acc))
            df_model["best_f1"].append(np.max(model.test_f1))
            df_model["best_roc"].append(np.max(model.test_roc))
            df_model["best_aucpr"].append(np.max(model.test_aucpr))
            p_bar.update(1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One bar chart per metric; descriptors are ordered by their best_aucpr values.
desc_metrics = ["best_f1", "best_roc", "best_aucpr"]
desc_sorted = df_desc.sort_values("best_aucpr")

fig, axes = plt.subplots(3, 1, figsize=(32, 16))
for ax, metric in zip(axes, desc_metrics):
    sns.barplot(data=desc_sorted, x="descriptor", y=metric, hue="descriptor", ax=ax)
    # Zoom on the observed range with a 0.05 margin, clipped to [0, 1].
    lower = max(0, df_desc[metric].min() - 0.05)
    upper = min(1, df_desc[metric].max() + 0.05)
    ax.set_ylim(lower, upper)
plt.show()

In [None]:
# Turn the per-run records collected above into a DataFrame (rebinding the same name).
df_model = pd.DataFrame(df_model)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One bar chart per metric; models are ordered by their best_aucpr values.
model_sorted = df_model.sort_values("best_aucpr")

fig, axes = plt.subplots(3,1,figsize=(32,16))
for i,metric in enumerate(["best_f1", "best_roc", "best_aucpr"]):
    sns.barplot(data=model_sorted, x="model", y=metric, hue="model", ax = axes[i])
    # Limits come from the plotted frame (df_model). The original used df_desc
    # here, which can clip or shrink the bars of this figure.
    axes[i].set_ylim(max(0,df_model[metric].min()-0.05), min(1,df_model[metric].max() + 0.05))
plt.show()

In [None]:
# Display the per-run model results.
df_model

In [None]:
# Display the MI table (rebuilt with additional columns in the next cell).
df_mi

In [None]:
df_mi = pd.read_csv("resultsClinTox.csv")

# Baseline: the first "Not-trained" row of each descriptor X *within df_mi*.
# The original mask used `df.Y`, but `df` no longer holds the MI results at
# this point in the notebook, so the baseline was taken from the wrong frame.
untrained_mi = (
    df_mi[df_mi.Y == "Not-trained"]
    .drop_duplicates(subset="X", keep="first")
    .set_index("X")
)
no_baseline_mi = set(df_mi.X.unique()) - set(untrained_mi.index)
if no_baseline_mi:
    raise ValueError(f'No "Not-trained" row for descriptor(s): {sorted(map(str, no_baseline_mi))}')

# "<key>_normed" = value minus the untrained baseline of the same descriptor.
for mi_key in ("I(X->Y)", "I(Y->X)"):
    df_mi[mi_key + "_normed"] = df_mi[mi_key] - df_mi.X.map(untrained_mi[mi_key])


In [None]:
# For every descriptor (row) and metric (column): regression of the model's
# metric on the baseline-subtracted I(X->Y), annotated with the correlation.
mi_col = "I(X->Y)_normed"
reg_metrics = ["best_f1", "best_roc", "best_aucpr"]

# The join of model results with the MI table is the same for every descriptor,
# so it is computed once outside the loop.
df_merged = (
    df_model[~(df_model.model.isin(["Not-trained", "EdgePred"]))]
    .merge(df_mi, left_on="model", right_on="Y")
    .drop(columns=["Y"])
    .rename(columns={"X":"descriptor"})
)

fig, axess = plt.subplots(len(descriptors),3,figsize=(3*7,len(descriptors)*7))
for descriptor, axes in zip(descriptors, axess):
    df_plot = df_merged[df_merged.descriptor == descriptor]
    for metric, ax in zip(reg_metrics, axes):
        sns.regplot(data=df_plot, y=metric, x=mi_col, ax =ax, scatter_kws={"alpha":0.5}, line_kws={"color":"red"})
        corr = df_plot[[metric, mi_col]].corr().iloc[0,1]
        # Fixed two-decimal formatting: slicing str(x)[:4] mangles negative
        # values and values printed in scientific notation.
        ax.text(0.05, 0.95, f"corr: {corr:.2f}", transform=ax.transAxes, fontsize=14, verticalalignment='top')
        # The title also reports the descriptor's own mean score for this metric.
        desc_mean = df_desc[df_desc.descriptor == descriptor][metric].mean()
        ax.set_title(f"{mi_col} {descriptor} {desc_mean:.2f}")

plt.show()

In [None]:
# For every descriptor (row) and metric (column): regression of the model's
# metric on the baseline-subtracted I(Y->X), annotated with the correlation.
mi_col = "I(Y->X)_normed"
reg_metrics = ["best_f1", "best_roc", "best_aucpr"]

# The join of model results with the MI table is the same for every descriptor,
# so it is computed once outside the loop.
df_merged = (
    df_model[~(df_model.model.isin(["Not-trained", "EdgePred"]))]
    .merge(df_mi, left_on="model", right_on="Y")
    .drop(columns=["Y"])
    .rename(columns={"X":"descriptor"})
)

fig, axess = plt.subplots(len(descriptors),3,figsize=(3*7,len(descriptors)*7))
for descriptor, axes in zip(descriptors, axess):
    df_plot = df_merged[df_merged.descriptor == descriptor]
    for metric, ax in zip(reg_metrics, axes):
        sns.regplot(data=df_plot, y=metric, x=mi_col, ax =ax, scatter_kws={"alpha":0.5}, line_kws={"color":"red"})
        corr = df_plot[[metric, mi_col]].corr().iloc[0,1]
        # Fixed two-decimal formatting: slicing str(x)[:4] mangles negative
        # values and values printed in scientific notation.
        ax.text(0.05, 0.95, f"corr: {corr:.2f}", transform=ax.transAxes, fontsize=14, verticalalignment='top')
        # The title also reports the descriptor's own mean score for this metric.
        desc_mean = df_desc[df_desc.descriptor == descriptor][metric].mean()
        ax.set_title(f"{mi_col} {descriptor} {desc_mean:.2f}")
plt.show()

We now also consider a score that aggregates all MIs by summing the normalized MIs, each weighted by the corresponding descriptor's best performance.


In [None]:
# Step 1: attach the MI rows to every run of the retained models.
kept_models = df_model[~(df_model.model.isin(["Not-trained", "EdgePred"]))]
df_plot = (
    kept_models
    .merge(df_mi, left_on="model", right_on="Y")
    .drop(columns=["Y"])
    .rename(columns={"X":"descriptor"})
)

# Step 2: attach the descriptor scores, suffixed with "_desc" to keep them
# apart from the model scores.
df_tmp_descriptors = df_desc.rename(
    columns={k: k+"_desc" for k in ["best_f1", "best_roc", "best_aucpr"]}
)
df_plot = df_plot.merge(df_tmp_descriptors, on="descriptor")

# Step 3: per-row terms = normalized MI times the descriptor's best_aucpr.
df_plot["sum_I(X->Y)_item"] = df_plot["I(X->Y)_normed"] * df_plot["best_aucpr_desc"]
df_plot["sum_I(Y->X)_item"] = df_plot["I(Y->X)_normed"] * df_plot["best_aucpr_desc"]

# Step 4: sum the terms per model and broadcast the totals back onto the rows.
sum_values = (
    df_plot.groupby("model")[["sum_I(X->Y)_item", "sum_I(Y->X)_item"]]
    .sum()
    .reset_index()
    .rename(columns={"sum_I(X->Y)_item":"sum_I(X->Y)", "sum_I(Y->X)_item":"sum_I(Y->X)"})
)
df_plot = df_plot.merge(sum_values, on="model")


In [None]:
# Model metrics against the aggregated desc->model score, one panel per metric.
fig,axes = plt.subplots(1,3,figsize=(20, 5), sharex=True)
for i, (ax, metric) in enumerate(zip(axes, ["best_f1", "best_roc", "best_aucpr"])):
    # Only the first panel carries the legend.
    sns.scatterplot(data=df_plot, y=metric, x="sum_I(X->Y)", hue="model", palette="Set3", alpha=0.5, ax=ax, legend=i==0)
    corr = df_plot[[metric, "sum_I(X->Y)"]].corr().iloc[0,1]
    # Fixed two-decimal formatting: slicing str(x)[:4] mangles negative values
    # and values printed in scientific notation.
    ax.text(0.05, 0.95, f"corr: {corr:.2f}", transform=ax.transAxes, fontsize=14, verticalalignment='top')
    ax.set_xlabel("")
    ax.set_ylabel(metric.split("_")[1])

# Raw strings: "\s", "\i" and "\m" are invalid escape sequences in normal
# string literals and trigger warnings on recent Python versions.
axes[1].set_xlabel(
    r"$\sum_{desc\in descriptors} [\mathcal{I} ( desc->model ) - \mathcal{I}(desc->model_{NT})$] metric$(desc)$"
)

fig.suptitle(r"$\mathcal{I} ( desc->model )$ score's correlation to the best performance of each evaluated model")

In [None]:
# Model metrics against the aggregated model->desc score, one panel per metric.
fig,axes = plt.subplots(1,3,figsize=(20, 5), sharex=True)
for i, (ax, metric) in enumerate(zip(axes, ["best_f1", "best_roc", "best_aucpr"])):
    # Only the first panel carries the legend.
    sns.scatterplot(data=df_plot, y=metric, x="sum_I(Y->X)", hue="model", palette="Set3", alpha=0.5, ax=ax, legend=i==0)
    corr = df_plot[[metric, "sum_I(Y->X)"]].corr().iloc[0,1]
    # Fixed two-decimal formatting: slicing str(x)[:4] mangles negative values
    # and values printed in scientific notation.
    ax.text(0.05, 0.95, f"corr: {corr:.2f}", transform=ax.transAxes, fontsize=14, verticalalignment='top')
    ax.set_xlabel("")
    ax.set_ylabel(metric.split("_")[1])

# Raw strings: "\s", "\i" and "\m" are invalid escape sequences in normal
# string literals and trigger warnings on recent Python versions.
axes[1].set_xlabel(
    r"$\sum_{desc\in descriptors} [\mathcal{I} ( model-> desc) - \mathcal{I}(model_{NT} -> desc)$] metric$(desc)$"
)

fig.suptitle(r"$\mathcal{I} ( model-> desc)$ score's correlation to the best performance of each evaluated model")

In [None]:
# "compression" = aggregated Y->X score minus aggregated X->Y score, per row.
df_plot["compression"] = df_plot["sum_I(Y->X)"] - df_plot["sum_I(X->Y)"]

In [None]:
# Model metrics against the "compression" column, one panel per metric.
fig,axes = plt.subplots(1,3,figsize=(20, 5), sharex=True)
for i, (ax, metric) in enumerate(zip(axes, ["best_f1", "best_roc", "best_aucpr"])):
    # Only the first panel carries the legend.
    sns.scatterplot(data=df_plot, y=metric, x="compression", hue="model", palette="Set3", alpha=0.5, ax=ax, legend=i==0)
    corr = df_plot[[metric, "compression"]].corr().iloc[0,1]
    # Fixed two-decimal formatting: slicing str(x)[:4] mangles negative values
    # and values printed in scientific notation.
    ax.text(0.05, 0.95, f"corr: {corr:.2f}", transform=ax.transAxes, fontsize=14, verticalalignment='top')
    ax.set_xlabel("")
    ax.set_ylabel(metric.split("_")[1])

# The original label and title were copied from the model->desc figure and
# described a different quantity than the one on the x-axis.
axes[1].set_xlabel("compression = sum_I(Y->X) - sum_I(X->Y)")

fig.suptitle("Compression score's correlation to the best performance of each evaluated model")

In [None]:
# Rescale each aggregated score as (v - max) / (min - max), which is 1 at the
# column minimum and 0 at the column maximum, then store rescaled X->Y minus
# rescaled Y->X.
sum_xy = df_plot["sum_I(X->Y)"]
sum_yx = df_plot["sum_I(Y->X)"]
rescaled_xy = (sum_xy - sum_xy.max())/(sum_xy.min() - sum_xy.max())
rescaled_yx = (sum_yx - sum_yx.max())/(sum_yx.min() - sum_yx.max())
df_plot["compression_dummy"] = rescaled_xy - rescaled_yx

In [None]:
# Model metrics against the "compression_dummy" column, one panel per metric.
fig,axes = plt.subplots(1,3,figsize=(20, 5), sharex=True)
for i, (ax, metric) in enumerate(zip(axes, ["best_f1", "best_roc", "best_aucpr"])):
    # Only the first panel carries the legend.
    sns.scatterplot(data=df_plot, y=metric, x="compression_dummy", hue="model", palette="Set3", alpha=0.5, ax=ax, legend=i==0)
    corr = df_plot[[metric, "compression_dummy"]].corr().iloc[0,1]
    # Fixed two-decimal formatting: slicing str(x)[:4] mangles negative values
    # and values printed in scientific notation.
    ax.text(0.05, 0.95, f"corr: {corr:.2f}", transform=ax.transAxes, fontsize=14, verticalalignment='top')
    ax.set_xlabel("")
    ax.set_ylabel(metric.split("_")[1])

axes[1].set_xlabel(
    "Weird compression stuff"
)

fig.suptitle("Really weird compression stuff")

In [None]:
from torch_geometric.datasets import ZINC

In [None]:
# Instantiate ZINC with subset=True and split='val', rooted at /tmp/ZINC.
# NOTE(review): hard-coded /tmp path — consider a configurable data directory.
dataset = ZINC(root='/tmp/ZINC', subset=True, split='val')

In [None]:
from tdc_dataset import get_dataset

In [None]:
def get_dataset(dataset: str):
    """Return the data of `dataset` via its loader in `correspondancy_dict`.

    The loader is first called with the dataset name only. If that fails, it is
    called again with the first label name returned by
    `retrieve_label_name_list(dataset)`.

    NOTE(review): this definition shadows the `get_dataset` imported from
    `tdc_dataset`, and neither `correspondancy_dict` nor
    `retrieve_label_name_list` is defined in this notebook — confirm where
    they are supposed to come from.

    Args:
        dataset: key of `correspondancy_dict`, also passed as `name=` to the loader.

    Returns:
        Whatever the loader's `get_data()` returns.

    Raises:
        KeyError: if `dataset` is not a key of `correspondancy_dict`.
    """
    # Look the loader up outside the try block so that an unknown dataset name
    # fails immediately instead of triggering the label-name fallback.
    loader = correspondancy_dict[dataset]
    try:
        return loader(name=dataset).get_data()
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of silently starting the fallback.
        label_list = retrieve_label_name_list(dataset)
        return loader(name=dataset, label_name=label_list[0]).get_data()

In [None]:
# Rebinds `df` to the QM7b data returned by get_dataset.
df = get_dataset("QM7b")