In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem

# Location of the generated-molecule files, resolved relative to the current
# working directory (os.path.abspath("") evaluates to it).
root_dir = os.path.join(os.path.abspath(""), "..")  # root directory
results_dir = os.path.join(root_dir, "generated_molecules")  # directory with experimental results
run_name = "rerun_diffseeds"  # run name
mol_path = os.path.abspath(os.path.join(results_dir, run_name))

# Register `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()

In [2]:
# Load the reference dataset.
# NOTE(review): the file name ends in ".csv" but it is read with pd.read_pickle;
# presumably it is a pickled DataFrame with a misleading extension -- confirm.
# NOTE(review): absolute, machine-specific path; unpickling runs arbitrary code,
# so only load this file from a trusted location.
df = pd.read_pickle("/data/stat-cadd/bras5033/guided_diffusion/GaUDI/data/datasets/COMPAS-1x_reduced.csv")

# Objective = 3*GAP + aIP - aEA, then min-max scaled to [0, 1].
# The min/max are taken over the full dataset, i.e. before the ring-count
# filter below is applied.
df["objective"] = 3*df["GAP_eV"] + df["aIP_eV"] - df["aEA_eV"]
df["objective"] = (df.objective - df.objective.min()) / (df.objective.max() - df.objective.min())

# Keep only molecules with more than 10 rings.
# The explicit copy makes the filtered frame independent of the loaded one, so
# columns added to it afterwards do not trigger SettingWithCopyWarning.
df = df[df["n_rings"] > 10].copy()

In [3]:
# Human-readable labels for the integer codes stored in `cluster_split`.
# All labels use spaces ("test set", not "test_set") so they are spelled the
# same way as the other split labels.
split_map = {
    -1: "< 11 rings", 0: "training set", 1: "validation set", 2: "test set",
}

# Codes that are not keys of `split_map` become NaN.
df["Data Split"] = df["cluster_split"].map(split_map)

In [4]:
def smiles_to_inchi(smiles):
    """Parse a SMILES string with RDKit and return the molecule's InChI string."""
    return Chem.MolToInchi(Chem.MolFromSmiles(smiles))


# Renumber rows 0..n-1, then add an InChI identifier for every molecule
# (progress bar provided by tqdm's pandas integration).
df = df.reset_index(drop=True)
df["inchi"] = df["smiles"].progress_apply(smiles_to_inchi)
df.columns

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 25394/25394 [00:25<00:00, 1015.03it/s]


Index(['molecule', 'smiles', 'balaban_notation', 'augmented_lalas', 'lalas',
       'HOMO_eV', 'LUMO_eV', 'GAP_eV', 'Dipmom_Debye', 'Etot_eV',
       'Etot_pos_eV', 'Etot_neg_eV', 'ZPE_eV', 'ZPE_pos_eV', 'ZPE_neg_eV',
       'aEA_eV', 'aIP_eV', 'dispersion_eV', 'n_rings', 'Erel_eV',
       'cluster_split', 'random_split', 'objective', 'Data Split', 'inchi'],
      dtype='object')

In [5]:
# Gather the records stored in every file of the run directory.
# NOTE(security): pickle.load can execute arbitrary code; only point `mol_path`
# at result files you produced yourself.
# Entries are restricted to regular files (a stray sub-directory such as
# .ipynb_checkpoints would make open() fail) and sorted, because os.listdir
# returns names in arbitrary order and the row order would otherwise vary.
files = sorted(
    f for f in os.listdir(mol_path)
    if os.path.isfile(os.path.join(mol_path, f))
)

records = []
for f in files:
    with open(os.path.join(mol_path, f), "rb") as pf:
        # each file is expected to unpickle to an iterable of records
        records.extend(pickle.load(pf))

results = pd.DataFrame(records, copy=False)

# Combine the regulariser type and the context set into one label, e.g.
# "fseb" + "all" -> "fseb_all" (vectorised string concatenation).
results["reg_type"] = results["reg_type"] + "_" + results["context_set"]

# Only the runs recorded with split == "cluster_split" are analysed.
results = results[results["split"] == "cluster_split"]
results.columns

  from .autonotebook import tqdm as notebook_tqdm


Index(['split', 'context_set', 'reg_type', 'scale', 'rerun_iteration', 'alpha',
       'x_stable', 'atom_type_stable', 'pred', 'target_function_values',
       'mol_valid', 'mol_unique', 'molecule_valid_bool', 'valid_inchi'],
      dtype='object')

In [6]:
# Lookup Series: InChI string (index) -> name of the cluster split it belongs to.
split_labels = {0: "training set", 1: "validation set", 2:"test set"}
set_map = df.set_index("inchi")["cluster_split"].map(split_labels)

In [7]:
# For the "fseb_all" runs at scale 0 and 4: which dataset split do the valid
# generated molecules fall into? One row per generated InChI after explode().
fseb_results = results.loc[
    (results["reg_type"] == "fseb_all") & (results["scale"].isin([0, 4])),
    ["reg_type", "scale", "rerun_iteration", "valid_inchi"]
].explode("valid_inchi").dropna().reset_index(drop=True)

# InChIs that are not present in `set_map` become NaN here; value_counts()
# below drops NaN by default, so fractions are relative to the molecules that
# were found in the dataset.
fseb_results["set"] = fseb_results["valid_inchi"].map(set_map)

for scale in fseb_results["scale"].unique():
    temp = fseb_results[fseb_results["scale"] == scale]
    # Rows: rerun iteration, columns: split, values: fraction of molecules.
    # fill_value=0: a rerun with no molecule in some split contributes a
    # fraction of 0 instead of NaN (NaN would be skipped by mean/std and
    # inflate the averages).
    counts = (
        temp.groupby("rerun_iteration")["set"]
        .value_counts(normalize=True)
        .unstack(fill_value=0)
        .sort_index()
    )
    mean_counts = counts.mean(0)
    # Standard error of the mean over the reruns actually present, rather
    # than a hard-coded number of reruns.
    std_counts = counts.std(0) / np.sqrt(len(counts))

    formatted_stats = (mean_counts.apply(lambda x: f"{x:.2f}") + "±" + std_counts.apply(lambda x: f"{x:.2f}"))

    print("Scale", scale)
    display(formatted_stats)
    print()

Scale 0


set
test set          0.04±0.00
training set      0.91±0.01
validation set    0.05±0.00
dtype: object


Scale 4


set
test set          0.47±0.07
training set      0.16±0.04
validation set    0.40±0.05
dtype: object




In [8]:
# For the "ps_all" runs at scale 0 and 4: which dataset split do the valid
# generated molecules fall into? One row per generated InChI after explode().
# NOTE(review): the frame is still called `fseb_results` although it holds the
# "ps_all" rows; the name is kept so that any code reading it keeps working.
fseb_results = results.loc[
    (results["reg_type"] == "ps_all") & (results["scale"].isin([0, 4])),
    ["reg_type", "scale", "rerun_iteration", "valid_inchi"]
].explode("valid_inchi").dropna().reset_index(drop=True)

# InChIs that are not present in `set_map` become NaN here; value_counts()
# below drops NaN by default, so fractions are relative to the molecules that
# were found in the dataset.
fseb_results["set"] = fseb_results["valid_inchi"].map(set_map)

for scale in fseb_results["scale"].unique():
    temp = fseb_results[fseb_results["scale"] == scale]
    # Rows: rerun iteration, columns: split, values: fraction of molecules.
    # fill_value=0: a rerun with no molecule in some split contributes a
    # fraction of 0 instead of NaN (NaN would be skipped by mean/std and
    # inflate the averages).
    counts = (
        temp.groupby("rerun_iteration")["set"]
        .value_counts(normalize=True)
        .unstack(fill_value=0)
        .sort_index()
    )
    mean_counts = counts.mean(0)
    # Standard error of the mean over the reruns actually present, rather
    # than a hard-coded number of reruns.
    std_counts = counts.std(0) / np.sqrt(len(counts))

    formatted_stats = (mean_counts.apply(lambda x: f"{x:.2f}") + "±" + std_counts.apply(lambda x: f"{x:.2f}"))

    print("Scale", scale)
    display(formatted_stats)
    print()

Scale 0


set
test set          0.04±0.00
training set      0.91±0.01
validation set    0.05±0.00
dtype: object


Scale 4


set
test set          0.19±0.03
training set      0.49±0.07
validation set    0.32±0.05
dtype: object


