In [2]:
import json

import scipy.stats as stats

from src.dataset import load_cyp_data_split, load_herg_data_split, load_pampa_data_split, load_synthetic_data_split
from src.utils import get_data_partition_on_substructure_presence

In [7]:
# Load the (train, valid, test) triple returned by each project loader.
# The call order is kept as-is because the loaders print progress messages.
train_cyp, valid_cyp, test_cyp = load_cyp_data_split()
train_herg, valid_herg, test_herg = load_herg_data_split()
train_pampa, valid_pampa, test_pampa = load_pampa_data_split()
train_synthetic, valid_synthetic, test_synthetic = load_synthetic_data_split()

# Test split of every dataset, addressable by its short name.
test_datasets = dict(
    cyp=test_cyp,
    herg=test_herg,
    pampa=test_pampa,
    synthetic=test_synthetic,
)

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [11]:
# Read the two JSON result files (presumably the substructure SMILES picked
# out by each attribution method -- confirm against the script that wrote
# them). The `with` blocks close each file handle as soon as it is parsed
# instead of leaving it open for the lifetime of the kernel, and the explicit
# UTF-8 encoding keeps decoding independent of the platform default.
with open('results/smiles_grad_cam.json', encoding='utf-8') as grad_cam_file:
    grad_cam_smiles = json.load(grad_cam_file)

with open('results/smiles_saliency_map.json', encoding='utf-8') as saliency_map_file:
    saliency_map_smiles = json.load(saliency_map_file)

In [13]:
def _ttest_labels_by_substructure(smiles_by_dataset, datasets,
                                  dataset_names=('cyp', 'herg', 'pampa', 'synthetic')):
    """Run a two-sample t-test on the labels for every listed substructure.

    For each SMILES string, the dataset's molecules are partitioned with
    ``get_data_partition_on_substructure_presence`` into those that contain
    the substructure and those that do not. The ``"y"`` property of the two
    groups is then compared with ``scipy.stats.ttest_ind``.

    Args:
        smiles_by_dataset: Mapping from dataset name to a list of lists of
            SMILES strings.
        datasets: Mapping from dataset name to the molecule collection that
            is partitioned.
        dataset_names: Names of the datasets to process, in order.

    Returns:
        A dict with one entry per dataset name. Each value is a list of lists
        of ``ttest_ind`` results with the same nesting as
        ``smiles_by_dataset[dataset_name]``.
    """

    def _labels(mols):
        # "y" is read through GetProp and parsed as a float before being
        # truncated to an int (so a value such as "1.0" becomes 1).
        return [int(float(mol.GetProp("y"))) for mol in mols]

    results_by_dataset = {}
    for dataset_name in dataset_names:
        molecules = datasets[dataset_name]
        results_by_dataset[dataset_name] = []
        for smiles_group in smiles_by_dataset[dataset_name]:
            group_results = []
            for smiles in smiles_group:
                has_substr, no_substr = get_data_partition_on_substructure_presence(molecules, smiles)
                # NOTE(review): if either partition is empty the test cannot
                # be computed and the p-value is presumably NaN -- confirm
                # that downstream code treats that as "not significant".
                group_results.append(stats.ttest_ind(_labels(has_substr), _labels(no_substr)))
            results_by_dataset[dataset_name].append(group_results)
    return results_by_dataset


# The same analysis applied to the SMILES of both attribution methods.
grad_cam_results = _ttest_labels_by_substructure(grad_cam_smiles, test_datasets)
saliency_map_results = _ttest_labels_by_substructure(saliency_map_smiles, test_datasets)

In [None]:
def did_test_pass(result, threshold=0.05):
    """Tell whether a test result's p-value is strictly below ``threshold``.

    Args:
        result: Any object exposing a ``pvalue`` attribute.
        threshold: Significance level the p-value is compared against.

    Returns:
        The outcome of ``result.pvalue < threshold``; a NaN p-value therefore
        compares as not below the threshold.
    """
    p_value = result.pvalue
    return p_value < threshold

# Map every stored t-test result to the boolean returned by did_test_pass,
# keeping the same dict -> list -> list nesting as the result dictionaries.
grad_cam_results_pass = {}
for dataset_name in ['cyp', 'herg', 'pampa', 'synthetic']:
    grad_cam_results_pass[dataset_name] = [
        [did_test_pass(outcome) for outcome in group]
        for group in grad_cam_results[dataset_name]
    ]

# Same conversion for the saliency-map results.
saliency_map_results_pass = {}
for dataset_name in ['cyp', 'herg', 'pampa', 'synthetic']:
    saliency_map_results_pass[dataset_name] = [
        [did_test_pass(outcome) for outcome in group]
        for group in saliency_map_results[dataset_name]
    ]