In [None]:
from utils import load_data, highlight_max

import os
import numpy as np
import pandas as pd
from functools import partial
from collections import defaultdict

In [12]:
"""
Scoring Metric
"""

def top_k_precision(y_true, y_pred, k=0.10):
    """Overlap between the k first-ranked true items and the k first-ranked predictions.

    Both arrays are ranked with ascending ``np.argsort``, so the "top" items
    are the ones with the *smallest* values.
    NOTE(review): confirm that smaller scores are the desirable ones for this
    data; if larger is better, both rankings have to be reversed.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted scores; each is flattened to 1-D.
    k : float or int, default 0.10
        A float is a fraction of ``len(y_true)`` (truncated towards zero);
        an int is an absolute count. NumPy scalar types are accepted too.

    Returns
    -------
    float
        ``|top_k(y_true) ∩ top_k(y_pred)| / top_k``.

    Raises
    ------
    TypeError
        If ``k`` is neither a float nor an integer.
    ValueError
        If the resolved number of items is not positive.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # np.float64 subclasses float but np.int64 does not subclass int, so the
    # NumPy scalar hierarchies are checked explicitly.
    if isinstance(k, (float, np.floating)):
        top_k = int(k * y_true.shape[0])
    elif isinstance(k, (int, np.integer)):
        top_k = int(k)
    else:
        raise TypeError(f"k must be a float or an int, got {type(k).__name__}")

    # Guard against a division by zero (e.g. a fraction of a tiny test set).
    if top_k <= 0:
        raise ValueError(f"k={k!r} selects {top_k} items; at least one is required")

    top_ids = np.argsort(y_true)[:top_k]
    top_pred = np.argsort(y_pred)[:top_k]

    return len(set(top_ids).intersection(top_pred)) / top_k

"""
Picking Strategies
"""

def normalize(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    A constant input has zero range; it is mapped to all zeros instead of the
    NaNs that 0/0 would produce.
    """
    shifted = x - x.min()
    span = shifted.max()
    if span == 0:
        # Every shifted value is already 0; dividing by 1 keeps it that way.
        span = 1
    return shifted / span

def add(y_pred, unc):
    """Picking strategy: prediction plus raw uncertainty."""
    shifted = y_pred + unc
    return shifted

def add_scaled(y_pred, unc):
    """Picking strategy: prediction plus the uncertainty passed through ``normalize``."""
    unc_scaled = normalize(unc)
    return y_pred + unc_scaled

def scale(y_pred, unc):
    """Picking strategy: prediction multiplied by the uncertainty passed through ``normalize``."""
    weight = normalize(unc)
    return y_pred * weight

def sum_scaled(y_pred, unc):
    """Picking strategy: sum of the prediction and the uncertainty, each passed through ``normalize``."""
    pred_scaled = normalize(y_pred)
    unc_scaled = normalize(unc)
    return pred_scaled + unc_scaled
    
def linear_comb(y_pred, unc, lam):
    """Picking strategy: ``lam`` times the prediction plus ``1 - lam`` times the uncertainty."""
    pred_term = lam * y_pred
    unc_term = (1 - lam) * unc
    return pred_term + unc_term

def baseline(y_pred, unc):
    """Picking strategy: the prediction unchanged.

    ``unc`` is accepted only so the signature matches the other strategies;
    it is not used.
    """
    return y_pred

def get_combs(lambdas=[0.5, 0.6, 0.7, 0.8, 0.9]):
    """Build one ``linear_comb`` strategy per weight in ``lambdas``.

    Each returned callable is a ``functools.partial`` with ``lam`` bound and a
    ``__name__`` of ``comb_<lam>``, so it can be labelled like a plain function.
    """
    def _bind(weight):
        # partial objects carry no __name__ of their own, so one is attached.
        strategy = partial(linear_comb, lam=weight)
        strategy.__name__ = f"comb_{weight}"
        return strategy

    return [_bind(weight) for weight in lambdas]

In [13]:
# Directory whose entries name the targets, and root directory of the saved results.
data_dir = './new_data'
res_dir = './data'

# os.listdir returns entries in arbitrary order and includes hidden ones
# (e.g. .ipynb_checkpoints, .DS_Store). Sort for a reproducible row order and
# skip hidden entries so they are not treated as targets.
targets = sorted(name for name in os.listdir(data_dir) if not name.startswith('.'))
splits = ['cv', 'bac']
split_ids = np.arange(5)

In [14]:
# res maps target -> {"<split>_<strategy>": "mean ± std"} and feeds the tables below.
res = {}
# Every strategy takes (y_pred, unc) and returns the scores ranked by top_k_precision.
strats = [baseline, add, scale, add_scaled, sum_scaled] + get_combs()

for target in targets:

    # Per-fold scores are appended under "<split>_<strategy>"; once all folds of
    # a split are done, each list is replaced by its formatted summary string.
    target_dict = defaultdict(list)

    for split in splits:
        for split_id in split_ids:

            # Only the test dataset is needed here; the other return values are ignored.
            _, test_dataset, _ = load_data(target, split, split_id)
            y_test = test_dataset.y.flatten()

            results_path = os.path.join(res_dir, target, split, f"large_result_{split_id}.npz")
            # np.load on an .npz keeps the archive open until it is closed, so
            # read the arrays inside a context manager to release the handle.
            with np.load(results_path) as results:
                y_pred = results['y_pred']
                unc = results['unc']

            for strat in strats:
                y_strat = strat(y_pred, unc)
                score = top_k_precision(y_test, y_strat)
                target_dict[split + '_' + strat.__name__].append(score)

        # Collapse the fold scores of this split into "mean ± std" (population std).
        for strat in strats:
            strat_name = split + '_' + strat.__name__

            mean = np.mean(target_dict[strat_name])
            std = np.std(target_dict[strat_name])

            target_dict[strat_name] = f"{mean:.3f} ± {std:.2f}"

    res[target] = target_dict

# Buying Strategies on CV
Precision at top 10% (k = 0.10) - higher is better

In [15]:
# Column keys of the cross-validation ('cv') split, one per strategy.
col_names = ['cv_' + strat.__name__ for strat in strats]

# Rows are targets; columns are the "<split>_<strategy>" keys collected in `res`.
df = pd.DataFrame.from_dict(res, orient='index')

# Keep only the CV columns and drop the 3-character 'cv_' prefix from their headers.
cv_df = df[col_names].rename(columns=lambda name: name[3:])

# Styled table; highlight_max is applied row by row (axis=1).
cv_df.style.apply(highlight_max, axis=1)

Unnamed: 0,baseline,add,scale,add_scaled,sum_scaled,comb_0.5,comb_0.6,comb_0.7,comb_0.8,comb_0.9
CHEMBL214,0.585 ± 0.05,0.590 ± 0.05,0.518 ± 0.04,0.592 ± 0.06,0.200 ± 0.05,0.590 ± 0.05,0.590 ± 0.04,0.590 ± 0.04,0.592 ± 0.04,0.585 ± 0.05
CHEMBL216,0.743 ± 0.07,0.721 ± 0.07,0.593 ± 0.09,0.714 ± 0.07,0.129 ± 0.05,0.721 ± 0.07,0.743 ± 0.06,0.743 ± 0.06,0.743 ± 0.07,0.743 ± 0.07
CHEMBL217,0.614 ± 0.02,0.612 ± 0.01,0.070 ± 0.04,0.610 ± 0.01,0.120 ± 0.08,0.612 ± 0.01,0.612 ± 0.01,0.612 ± 0.01,0.612 ± 0.02,0.614 ± 0.02
CHEMBL224,0.663 ± 0.03,0.653 ± 0.04,0.458 ± 0.07,0.641 ± 0.05,0.038 ± 0.02,0.653 ± 0.04,0.663 ± 0.04,0.663 ± 0.03,0.663 ± 0.03,0.661 ± 0.03
CHEMBL225,0.587 ± 0.05,0.577 ± 0.05,0.266 ± 0.07,0.580 ± 0.06,0.184 ± 0.05,0.577 ± 0.05,0.580 ± 0.04,0.590 ± 0.04,0.590 ± 0.04,0.584 ± 0.05
CHEMBL226,0.648 ± 0.04,0.651 ± 0.04,0.101 ± 0.02,0.646 ± 0.03,0.120 ± 0.02,0.651 ± 0.04,0.651 ± 0.04,0.651 ± 0.04,0.651 ± 0.04,0.651 ± 0.04
CHEMBL251,0.663 ± 0.03,0.665 ± 0.03,0.237 ± 0.05,0.661 ± 0.03,0.169 ± 0.12,0.665 ± 0.03,0.663 ± 0.03,0.663 ± 0.02,0.667 ± 0.03,0.667 ± 0.03
CHEMBL264,0.665 ± 0.02,0.647 ± 0.02,0.653 ± 0.03,0.638 ± 0.02,0.082 ± 0.03,0.647 ± 0.02,0.650 ± 0.01,0.656 ± 0.01,0.656 ± 0.02,0.662 ± 0.02
CHEMBL3155,0.595 ± 0.06,0.589 ± 0.05,0.184 ± 0.09,0.584 ± 0.03,0.168 ± 0.04,0.589 ± 0.05,0.595 ± 0.05,0.595 ± 0.05,0.600 ± 0.05,0.600 ± 0.05
CHEMBL3371,0.582 ± 0.06,0.562 ± 0.05,0.535 ± 0.03,0.559 ± 0.04,0.117 ± 0.02,0.562 ± 0.05,0.562 ± 0.05,0.569 ± 0.05,0.582 ± 0.06,0.582 ± 0.06


# Buying Strategies on BAC
Precision at top 10% (k = 0.10) - higher is better

In [16]:
# Column keys of the 'bac' split, one per strategy.
bac_prefix = 'bac_'
col_names = [bac_prefix + strat.__name__ for strat in strats]
bac_df = df[col_names]
# Strip the whole prefix: 'bac_' is 4 characters, so a hard-coded [3:] would
# leave a leading underscore in every header.
bac_df = bac_df.rename(columns={name: name[len(bac_prefix):] for name in col_names})
# Styled table; highlight_max is applied row by row (axis=1).
bac_df.style.apply(highlight_max, axis=1)


Unnamed: 0,_baseline,_add,_scale,_add_scaled,_sum_scaled,_comb_0.5,_comb_0.6,_comb_0.7,_comb_0.8,_comb_0.9
CHEMBL214,0.209 ± 0.10,0.222 ± 0.09,0.174 ± 0.03,0.224 ± 0.08,0.237 ± 0.06,0.222 ± 0.09,0.219 ± 0.10,0.222 ± 0.10,0.219 ± 0.11,0.211 ± 0.11
CHEMBL216,0.168 ± 0.09,0.154 ± 0.10,0.102 ± 0.07,0.148 ± 0.09,0.110 ± 0.08,0.154 ± 0.10,0.154 ± 0.10,0.159 ± 0.10,0.168 ± 0.09,0.168 ± 0.09
CHEMBL217,0.202 ± 0.07,0.209 ± 0.08,0.173 ± 0.06,0.201 ± 0.08,0.215 ± 0.08,0.209 ± 0.08,0.208 ± 0.08,0.202 ± 0.08,0.197 ± 0.07,0.198 ± 0.07
CHEMBL224,0.250 ± 0.18,0.265 ± 0.19,0.276 ± 0.17,0.262 ± 0.19,0.232 ± 0.18,0.265 ± 0.19,0.262 ± 0.20,0.256 ± 0.19,0.256 ± 0.18,0.256 ± 0.19
CHEMBL225,0.131 ± 0.04,0.132 ± 0.06,0.111 ± 0.06,0.138 ± 0.06,0.128 ± 0.06,0.132 ± 0.06,0.129 ± 0.05,0.125 ± 0.05,0.128 ± 0.04,0.134 ± 0.04
CHEMBL226,0.211 ± 0.14,0.206 ± 0.14,0.052 ± 0.03,0.208 ± 0.14,0.131 ± 0.10,0.206 ± 0.14,0.211 ± 0.14,0.217 ± 0.14,0.213 ± 0.14,0.213 ± 0.14
CHEMBL251,0.202 ± 0.09,0.195 ± 0.08,0.126 ± 0.08,0.189 ± 0.08,0.185 ± 0.07,0.195 ± 0.08,0.195 ± 0.08,0.199 ± 0.08,0.202 ± 0.08,0.206 ± 0.09
CHEMBL264,0.223 ± 0.19,0.230 ± 0.18,0.218 ± 0.17,0.227 ± 0.18,0.231 ± 0.10,0.230 ± 0.18,0.222 ± 0.20,0.228 ± 0.19,0.228 ± 0.19,0.225 ± 0.19
CHEMBL3155,0.195 ± 0.09,0.183 ± 0.07,0.123 ± 0.08,0.176 ± 0.08,0.154 ± 0.08,0.183 ± 0.07,0.180 ± 0.09,0.189 ± 0.09,0.196 ± 0.08,0.196 ± 0.08
CHEMBL3371,0.192 ± 0.10,0.183 ± 0.08,0.104 ± 0.03,0.159 ± 0.06,0.103 ± 0.02,0.183 ± 0.08,0.188 ± 0.09,0.193 ± 0.10,0.193 ± 0.10,0.197 ± 0.10
