In [1]:
from pathlib import Path 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np 
import json 
import plot_utils
from importlib import reload
# Re-import plot_utils so that edits made to the module on disk take effect
# without restarting the kernel.
reload(plot_utils)

# NOTE(review): presumably applies the shared figure styling defined in plot_utils -- confirm there.
plot_utils.set_style()
# Switch matplotlib to non-interactive mode; the plotting cells below write
# their figures to disk with savefig. As the last expression of the cell, the
# object returned by ioff() is what shows up as this cell's output.
plt.ioff()

<contextlib.ExitStack at 0x7fdd845ce150>

In [2]:
# Locations of the raw run outputs and of the reference datasets,
# both given relative to the notebook's working directory.
_parent_dir = Path('..')
result_dir = _parent_dir / 'results'
data_dir = _parent_dir / 'data'

# Rank of each acquisition method; used as the sort key so that rows
# (and therefore legend entries) always appear in this order.
method_order = {
    name: rank
    for rank, name in enumerate(['Ours', 'pTS', 'qEI', 'Greedy', 'UCB'])
}


# Maximizing HOMO-LUMO gap

### Loading, calculating

In [3]:
# Gather the entries of every QM9 result file into one flat list.
# Each JSON file is expected to hold a list of per-iteration entries.
storage = []
for result_path in result_dir.glob('qm9*.json'):
    with result_path.open('r') as handle:
        storage += json.load(handle)

Identify the true top-k compounds and, for each stored result entry, record the fraction of those compounds that has been acquired.

In [4]:
# Ground truth: map every SMILES string in the QM9 table to its 'gap' value.
all_data = pd.read_csv(data_dir / 'qm9.csv')
all_smiles = list(all_data['smiles'])
test_data = dict(zip(all_smiles, all_data['gap']))

# Top-k thresholds as fractions of the library, and the matching compound counts.
k = [0.0001, 0.001, 0.01]
kN = [int(np.ceil(kk*len(test_data))) for kk in k]
for ki, kNi in zip(k, kN):
    print(f'{100*ki:0.2f}%: {kNi} compounds')

# Rank the library once (largest gap first) and slice it per threshold,
# instead of re-sorting the whole library for every k.
ranked_smis = sorted(test_data, key=lambda smi: -1*test_data[smi])
top_smis = {count: set(ranked_smis[:count]) for count in kN}

# Build fresh records rather than popping keys from `storage` in place, so
# this cell can be re-run without first re-running the loading cell.
dropped_keys = ("All acquired points", "New acquired points")
records = []
for entry in storage:
    # Keys of "All acquired points" are the SMILES acquired so far.
    acquired = entry["All acquired points"].keys()
    # Keep everything except the bulky per-compound dictionaries.
    record = {key: value for key, value in entry.items() if key not in dropped_keys}
    for kkN, kk in zip(kN, k):
        record[f'Fraction top {100*kk:0.2f}%'] = len(top_smis[kkN].intersection(acquired))/kkN
    records.append(record)

# Order rows by the canonical method order defined in the config cell.
data = pd.DataFrame(records).sort_values(by='Method', key=lambda x: x.map(method_order))

0.01%: 14 compounds
0.10%: 134 compounds
1.00%: 1339 compounds


### Plotting

In [5]:
# Folder that receives every QM9 figure and table; created on demand.
fig_dir = Path('qm9')
fig_dir.mkdir(parents=True, exist_ok=True)

# Convert the results frame to LaTeX with the project helper and store it
# next to the figures.
latex = plot_utils.df_to_latex(data=data)
table_path = fig_dir / 'table.txt'
with table_path.open('w') as handle:
    handle.write(latex)

### Average of top k acquired 

In [29]:
# One figure per top-k size: the 'Top {kk} ave' column against iteration,
# one line per method with a +/- 1 standard-error band.
for kk in [10, 50, 100]:
    fig, ax = plt.subplots(1,1)
    sns.lineplot(
        data=data, x='Iteration', y=f'Top {kk} ave', 
        hue='Method', style='Method', palette=plot_utils.method_colors,
        ax=ax, errorbar=('se',1))
    ax.set_ylabel(f'Top {kk} average')
    # NOTE(review): presumably fixes the axes size for publication -- see plot_utils.set_size.
    plot_utils.set_size(w=1.2, h=1, ax=ax)
    fig.savefig(fig_dir / f'top_{kk}_average.pdf', bbox_inches='tight', transparent=True)
    # The figure is already on disk; close it so figures do not pile up in
    # pyplot's registry (matplotlib warns once more than 20 are open).
    plt.close(fig)



### Fraction of true top k 

In [30]:
# One figure per top-k threshold: fraction of the true top-k compounds
# acquired so far against iteration, one line per method with a +/- 1
# standard-error band. The legend is omitted for these panels.
for kk in k:
    fig, ax = plt.subplots(1,1)
    sns.lineplot(
        data=data, x='Iteration', y=f'Fraction top {100*kk:0.2f}%', 
        hue='Method', style='Method', palette=plot_utils.method_colors,
        ax=ax, errorbar=("se", 1), legend=False)
    ax.set_ylabel(f'Fraction of top {100*kk:0.2f}%')
    # NOTE(review): presumably fixes the axes size for publication -- see plot_utils.set_size.
    plot_utils.set_size(w=1.2, h=1, ax=ax)
    fig.savefig(fig_dir / f'fraction_true_top_{kk}.pdf', bbox_inches='tight', transparent=True)
    # The figure is already on disk; close it so figures do not pile up in
    # pyplot's registry (matplotlib warns once more than 20 are open).
    plt.close(fig)


# Antibiotics

In [31]:
# Re-declare the results folder so this section can be run on its own.
result_dir = Path('../results')

# Gather the entries of every antibiotics result file, searching
# sub-directories recursively, into one flat list.
storage = []
for result_path in result_dir.rglob('wong_antibiotics*.json'):
    with result_path.open('r') as handle:
        storage += json.load(handle)

In [32]:
# Ground truth: map every SMILES string in the antibiotics table to its
# 'Mean_50uM' value.
all_data = pd.read_csv(data_dir / 'wong_antibiotics.csv')
all_smiles = list(all_data['smiles'])
test_data = dict(zip(all_smiles, all_data['Mean_50uM']))

# Top-k thresholds as fractions of the library, and the matching compound counts.
k = [0.005, 0.01, 0.05]
kN = [int(np.ceil(kk*len(test_data))) for kk in k]
for ki, kNi in zip(k, kN):
    print(f'{100*ki:0.2f}%: {kNi} compounds')

# Rank the library once (smallest Mean_50uM first, i.e. this objective is
# minimised) and slice it per threshold instead of re-sorting for every k.
ranked_smis = sorted(test_data, key=lambda smi: test_data[smi])
top_smis = {count: set(ranked_smis[:count]) for count in kN}

# Build fresh records rather than popping keys from `storage` in place, so
# this cell can be re-run without first re-running the loading cell.
dropped_keys = ("All acquired points", "New acquired points")
records = []
for entry in storage:
    # Keys of "All acquired points" are the SMILES acquired so far.
    acquired = entry["All acquired points"].keys()
    # Keep everything except the bulky per-compound dictionaries.
    record = {key: value for key, value in entry.items() if key not in dropped_keys}
    for kkN, kk in zip(kN, k):
        record[f'Fraction top {100*kk:0.2f}%'] = len(top_smis[kkN].intersection(acquired))/kkN
    records.append(record)

# Order rows by the canonical method order defined in the config cell.
data = pd.DataFrame(records).sort_values(by='Method', key=lambda x: x.map(method_order))

0.50%: 197 compounds
1.00%: 393 compounds
5.00%: 1962 compounds


In [33]:
# Folder that receives every antibiotics figure and table; created on demand.
fig_dir = Path('antibiotics')
fig_dir.mkdir(parents=True, exist_ok=True)

# Convert the results frame to LaTeX with the project helper and store it
# next to the figures.
latex = plot_utils.df_to_latex(data=data)
table_path = fig_dir / 'table.txt'
with table_path.open('w') as handle:
    handle.write(latex)

In [34]:
# One figure per top-k size: the 'Top {kk} ave' column against iteration,
# one line per method with a +/- 1 standard-error band.
for kk in [10, 50, 100]:
    fig, ax = plt.subplots(1,1)
    sns.lineplot(
        data=data, x='Iteration', y=f'Top {kk} ave', 
        hue='Method', style='Method', 
        ax=ax, palette=plot_utils.method_colors, errorbar=('se',1))
    ax.set_ylabel(f'Top {kk} average')
    # NOTE(review): presumably fixes the axes size for publication -- see plot_utils.set_size.
    plot_utils.set_size(w=1.2, h=1, ax=ax)
    fig.savefig(fig_dir / f'top_{kk}_average.pdf', bbox_inches='tight', transparent=True)
    # The figure is already on disk; close it so figures do not pile up in
    # pyplot's registry (matplotlib warns once more than 20 are open).
    plt.close(fig)

# One figure per top-k threshold: fraction of the true top-k compounds
# acquired so far against iteration, one line per method with a +/- 1
# standard-error band.
for kk in k:
    fig, ax = plt.subplots(1,1)
    sns.lineplot(
        data=data, x='Iteration', y=f'Fraction top {100*kk:0.2f}%', 
        hue='Method', style='Method', 
        ax=ax, palette=plot_utils.method_colors, errorbar=("se", 1))
    ax.set_ylabel(f'Fraction of top {100*kk:0.2f}%')
    # NOTE(review): presumably fixes the axes size for publication -- see plot_utils.set_size.
    plot_utils.set_size(w=1.2, h=1, ax=ax)
    fig.savefig(fig_dir / f'fraction_true_top_{kk}.pdf', bbox_inches='tight', transparent=True)
    # The figure is already on disk; close it so figures do not pile up in
    # pyplot's registry (matplotlib warns once more than 20 are open).
    plt.close(fig)

  fig, ax = plt.subplots(1,1)
