In [9]:
from utils import load_data, highlight_min

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from itertools import product

In [2]:
data_dir = './data'

# Each target lives in its own sub-directory of data_dir. os.listdir returns
# entries in arbitrary order and also lists stray files and hidden folders
# (e.g. .DS_Store, .ipynb_checkpoints), so keep only visible directories and
# sort them to get a deterministic row order in every table below.
targets = sorted(
    entry for entry in os.listdir(data_dir)
    if not entry.startswith('.') and os.path.isdir(os.path.join(data_dir, entry))
)
# Split types evaluated for every target, and the ids of the repeated splits.
splits = ['cv', 'bac']
split_ids = np.arange(5)

In [15]:
# Train / test set sizes per target, one pair of columns per split type.
data = defaultdict(dict)

# Visit every (target, split, split_id) combination. The entries are keyed by
# target and split type only, so each split_id overwrites the previous one and
# the resulting table shows the sizes of the last split_id.
for target in targets:
    for split in splits:
        label = split.upper()
        for split_id in split_ids:
            train_dataset, test_dataset, sim = load_data(target, split, split_id)

            y_train = train_dataset.y.flatten()
            y_test = test_dataset.y.flatten()

            data[target][f'{label} Train Size'] = y_train.shape[0]
            data[target][f'{label} Test Size'] = y_test.shape[0]

pd.DataFrame.from_dict(data, orient='index')

Unnamed: 0,CV Train Size,CV Test Size,BAC Train Size,BAC Test Size
CHEMBL214,3129,782,3192,719
CHEMBL216,1152,288,1073,367
CHEMBL217,5536,1383,5068,1851
CHEMBL224,3194,798,2878,1114
CHEMBL225,2448,612,2374,686
CHEMBL226,3347,836,3358,825
CHEMBL251,3920,980,3879,1021
CHEMBL264,2750,687,2888,549
CHEMBL3155,1531,382,1632,281
CHEMBL3371,2400,599,2104,895


In [3]:
# Column suffixes; the full column name is the split type plus one of these.
score_names = ['_mse', '_drop_mse', '_unc']
# Maps target -> {column name -> "mean ± std" string}.
big_ol_dict_of_everything = {}

for target in targets:

    # Per-column list of scores, one entry per split_id, until it is collapsed
    # into a display string further down.
    target_dict = defaultdict(list)

    for split in splits:
        for split_id in split_ids:

            # Only the test labels are used here.
            _, test_dataset, sim = load_data(target, split, split_id)
            y_test = test_dataset.y.flatten()

            results_path = os.path.join(data_dir, target, split, f"large_result_{split_id}.npz")
            # np.load on an .npz returns an archive object that keeps the file
            # open; the context manager closes it once the arrays are read.
            with np.load(results_path) as results:
                y_pred = results['y_pred']
                y_drop_pred = results['y_drop_pred']
                unc = results['unc']

            # NOTE(review): assumes the stored predictions have the same 1-D
            # shape as y_test; a (n, 1) array would broadcast here - confirm.
            mse = np.mean((y_test - y_pred)**2)
            mse_drop = np.mean((y_test - y_drop_pred)**2)

            target_dict[split + '_mse'].append(mse)
            target_dict[split + '_drop_mse'].append(mse_drop)
            target_dict[split + '_unc'].append(unc.mean())

        # Replace each list of per-split_id scores with its summary string.
        for score in score_names:

            mean = np.mean(target_dict[split + score])
            std = np.std(target_dict[split + score])

            target_dict[split + score] = f"{mean:.3f} ± {std:.2f}"

    big_ol_dict_of_everything[target] = target_dict

## CV Split Scores
Values are mean ± standard deviation over the five split IDs. MSE - lower is better

In [4]:
# Full results table: one row per target, one column per split/score pair.
df = pd.DataFrame.from_dict(big_ol_dict_of_everything, orient='index')

# Keep only the columns that belong to the CV split.
col_names = [f'cv{score}' for score in score_names]
cv_df = df.loc[:, col_names]

# Apply the highlight_min styling helper across each row.
cv_df.style.apply(highlight_min, axis=1)

Unnamed: 0,cv_mse,cv_drop_mse,cv_unc
CHEMBL214,0.491 ± 0.03,0.477 ± 0.03,0.329 ± 0.01
CHEMBL216,0.666 ± 0.10,0.657 ± 0.10,0.376 ± 0.01
CHEMBL217,0.387 ± 0.01,0.370 ± 0.00,0.313 ± 0.01
CHEMBL224,0.488 ± 0.04,0.483 ± 0.03,0.327 ± 0.01
CHEMBL225,0.538 ± 0.03,0.522 ± 0.03,0.327 ± 0.01
CHEMBL226,0.430 ± 0.02,0.423 ± 0.02,0.323 ± 0.01
CHEMBL251,0.440 ± 0.02,0.434 ± 0.02,0.346 ± 0.01
CHEMBL264,0.439 ± 0.02,0.433 ± 0.02,0.305 ± 0.01
CHEMBL3155,0.506 ± 0.04,0.498 ± 0.05,0.303 ± 0.01
CHEMBL3371,0.512 ± 0.05,0.505 ± 0.04,0.317 ± 0.01


In [17]:
# Dump the CV table as LaTeX source; print keeps the line breaks readable.
cv_latex = cv_df.to_latex()
print(cv_latex)

\begin{tabular}{llll}
\toprule
{} &        cv\_mse &   cv\_drop\_mse &        cv\_unc \\
\midrule
CHEMBL214  &  0.491 ± 0.03 &  0.477 ± 0.03 &  0.329 ± 0.01 \\
CHEMBL216  &  0.666 ± 0.10 &  0.657 ± 0.10 &  0.376 ± 0.01 \\
CHEMBL217  &  0.387 ± 0.01 &  0.370 ± 0.00 &  0.313 ± 0.01 \\
CHEMBL224  &  0.488 ± 0.04 &  0.483 ± 0.03 &  0.327 ± 0.01 \\
CHEMBL225  &  0.538 ± 0.03 &  0.522 ± 0.03 &  0.327 ± 0.01 \\
CHEMBL226  &  0.430 ± 0.02 &  0.423 ± 0.02 &  0.323 ± 0.01 \\
CHEMBL251  &  0.440 ± 0.02 &  0.434 ± 0.02 &  0.346 ± 0.01 \\
CHEMBL264  &  0.439 ± 0.02 &  0.433 ± 0.02 &  0.305 ± 0.01 \\
CHEMBL3155 &  0.506 ± 0.04 &  0.498 ± 0.05 &  0.303 ± 0.01 \\
CHEMBL3371 &  0.512 ± 0.05 &  0.505 ± 0.04 &  0.317 ± 0.01 \\
\bottomrule
\end{tabular}



## BAC Split Scores
Values are mean ± standard deviation over the five split IDs. MSE - lower is better

In [5]:
# Keep only the columns of the full results table that belong to the BAC split.
col_names = [f'bac{score}' for score in score_names]
bac_df = df.loc[:, col_names]

# Apply the highlight_min styling helper across each row.
bac_df.style.apply(highlight_min, axis=1)

Unnamed: 0,bac_mse,bac_drop_mse,bac_unc
CHEMBL214,1.424 ± 0.18,1.377 ± 0.17,0.370 ± 0.03
CHEMBL216,2.432 ± 0.81,2.380 ± 0.75,0.391 ± 0.04
CHEMBL217,1.067 ± 0.12,1.008 ± 0.13,0.361 ± 0.03
CHEMBL224,1.834 ± 0.32,1.740 ± 0.34,0.432 ± 0.08
CHEMBL225,1.741 ± 0.38,1.684 ± 0.39,0.363 ± 0.02
CHEMBL226,1.487 ± 0.56,1.473 ± 0.57,0.356 ± 0.03
CHEMBL251,1.529 ± 0.43,1.505 ± 0.43,0.370 ± 0.04
CHEMBL264,1.296 ± 0.24,1.235 ± 0.24,0.377 ± 0.03
CHEMBL3155,1.394 ± 0.70,1.391 ± 0.74,0.349 ± 0.03
CHEMBL3371,1.502 ± 0.34,1.435 ± 0.33,0.344 ± 0.02


In [18]:
# Dump the BAC table as LaTeX source; print keeps the line breaks readable.
bac_latex = bac_df.to_latex()
print(bac_latex)

\begin{tabular}{llll}
\toprule
{} &       bac\_mse &  bac\_drop\_mse &       bac\_unc \\
\midrule
CHEMBL214  &  1.424 ± 0.18 &  1.377 ± 0.17 &  0.370 ± 0.03 \\
CHEMBL216  &  2.432 ± 0.81 &  2.380 ± 0.75 &  0.391 ± 0.04 \\
CHEMBL217  &  1.067 ± 0.12 &  1.008 ± 0.13 &  0.361 ± 0.03 \\
CHEMBL224  &  1.834 ± 0.32 &  1.740 ± 0.34 &  0.432 ± 0.08 \\
CHEMBL225  &  1.741 ± 0.38 &  1.684 ± 0.39 &  0.363 ± 0.02 \\
CHEMBL226  &  1.487 ± 0.56 &  1.473 ± 0.57 &  0.356 ± 0.03 \\
CHEMBL251  &  1.529 ± 0.43 &  1.505 ± 0.43 &  0.370 ± 0.04 \\
CHEMBL264  &  1.296 ± 0.24 &  1.235 ± 0.24 &  0.377 ± 0.03 \\
CHEMBL3155 &  1.394 ± 0.70 &  1.391 ± 0.74 &  0.349 ± 0.03 \\
CHEMBL3371 &  1.502 ± 0.34 &  1.435 ± 0.33 &  0.344 ± 0.02 \\
\bottomrule
\end{tabular}



## Uncertainties

In [6]:
# Side-by-side view of the uncertainty columns of both split types.
unc_names = ['cv_unc', 'bac_unc']
df.loc[:, unc_names]

Unnamed: 0,cv_unc,bac_unc
CHEMBL214,0.329 ± 0.01,0.370 ± 0.03
CHEMBL216,0.376 ± 0.01,0.391 ± 0.04
CHEMBL217,0.313 ± 0.01,0.361 ± 0.03
CHEMBL224,0.327 ± 0.01,0.432 ± 0.08
CHEMBL225,0.327 ± 0.01,0.363 ± 0.02
CHEMBL226,0.323 ± 0.01,0.356 ± 0.03
CHEMBL251,0.346 ± 0.01,0.370 ± 0.04
CHEMBL264,0.305 ± 0.01,0.377 ± 0.03
CHEMBL3155,0.303 ± 0.01,0.349 ± 0.03
CHEMBL3371,0.317 ± 0.01,0.344 ± 0.02


# Scatter Plots

In [7]:
%%capture

# Make sure the output folder exists so saving the figures cannot fail on a
# fresh checkout.
os.makedirs('./img', exist_ok=True)

# One figure per target: a row of three scatter plots for each split type,
# pooling the test samples of all split ids.
for target in targets:

    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 2 * 6))

    for ax_row, split in zip(axes, splits):

        mses = []
        uncs = []
        sims = []

        for split_id in split_ids:

            _, test_dataset, sim = load_data(target, split, split_id)
            y_test = test_dataset.y.flatten()

            results_path = os.path.join(data_dir, target, split, f"large_result_{split_id}.npz")
            # Close the .npz archive as soon as the needed arrays are read.
            with np.load(results_path) as results:
                y_pred = results['y_pred']
                unc = results['unc']

            # Element-wise squared error (not averaged), so it can be plotted
            # against the other per-sample arrays.
            mse = (y_test - y_pred)**2

            mses.append(mse)
            uncs.append(unc)
            sims.append(sim)

        mses = np.concatenate(mses)
        uncs = np.concatenate(uncs)
        sims = np.concatenate(sims)

        ax_1, ax_2, ax_3 = ax_row

        ax_1.scatter(sims, mses, s=15, alpha=0.7)
        ax_1.set_xlabel('SIMILARITY')
        ax_1.set_ylabel('MSE')

        ax_2.scatter(sims, uncs, s=15, alpha=0.7)
        ax_2.set_xlabel('SIMILARITY')
        ax_2.set_ylabel('UNCERTAINTY')

        ax_3.scatter(uncs, mses, s=15, alpha=0.7)
        ax_3.set_xlabel('UNCERTAINTY')
        ax_3.set_ylabel('MSE')

        # Tick marks are hidden on every panel.
        for ax in ax_row:
            ax.set_xticks([])
            ax.set_yticks([])

        # Re-run the layout once this row has its labels.
        fig.tight_layout()

    # Label each row with its split type, positioned relative to the y-axis
    # label of the row's first panel.
    for ax, row in zip(axes[:,0], splits):
        ax.annotate(row, xy=(0, 2.), xytext=(-ax.yaxis.labelpad - 5, 0),
                    xycoords=ax.yaxis.label, textcoords='offset points',
                    size=25, ha='right', va='center')

    _ = fig.suptitle(target, fontsize=30, y=1.02)

    # Save through the figure object rather than pyplot's implicit current
    # figure, then close it so figures do not pile up in memory across targets.
    fig.savefig(fname=f'./img/large_{target}.png', bbox_inches='tight')
    plt.close(fig)