# HMM: Performance evaluation

Jacqueline R. M. A. Maasch | March 2022

## Preamble

In [1]:
# Importations.
import moses
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wasserstein_distance
from moses.metrics import weight, logP, SA, QED
from moses.metrics.utils import get_mol, mapper
from moses import get_dataset
from collections import OrderedDict

import tdc
from tdc import Evaluator
from tdc.single_pred import HTS
from importlib.metadata import version

In [2]:
# Installs.
#!python3.8 -m pip install fcd_torch

## Define functions

In [3]:
def compute_tdc_metrics(generated, training):
    """Score a set of generated SMILES with the TDC distribution evaluators.

    Each evaluator is built with ``Evaluator(name=...)`` and called right away,
    in this order: KL divergence, uniqueness, validity, novelty, diversity.
    The FCD evaluator is intentionally left out (it was disabled in this notebook).

    Parameters
    ----------
    generated : sequence of str
        SMILES of the generated molecules (the notebook passes a pandas Series).
    training : sequence of str
        SMILES of the reference/training molecules; only the KL-divergence and
        novelty evaluators receive it.

    Returns
    -------
    dict
        Maps "KL divergence", "Uniqueness", "Validity", "Novelty" and
        "Diversity" to whatever the corresponding evaluator returned.
    """
    # (key in the result dict, TDC evaluator name, evaluator also takes `training`?)
    evaluator_plan = (
        ("KL divergence", "KL_Divergence", True),
        ("Uniqueness", "Uniqueness", False),
        ("Validity", "Validity", False),
        ("Novelty", "Novelty", True),
        ("Diversity", "Diversity", False),
    )

    scores = dict()
    for result_key, evaluator_name, uses_training in evaluator_plan:
        evaluator = Evaluator(name = evaluator_name)
        call_args = (generated, training) if uses_training else (generated,)
        scores[result_key] = evaluator(*call_args)

    return scores


def plot_metrics(generated, metrics, n_jobs=1, img_folder=".", reference="MOSES"):
    """Plot the distribution of each metric for every molecule set and save the figures.

    For every metric, the metric function is mapped over the non-null entries of
    each set's ``'ROMol'`` column. One KDE curve is drawn per set, labelled with
    the Wasserstein distance between that set and the ``reference`` set. Each
    figure is written to ``img_folder`` as ``<metric name>.pdf`` and
    ``<metric name>.png`` (250 dpi) and then closed.

    The original version read ``config.n_jobs`` / ``config.img_folder`` from a
    ``config`` object that is never defined in this notebook, and used ``os``
    without importing it, so any call raised ``NameError``. Those values are now
    explicit keyword parameters with defaults, so existing two-argument calls
    keep working.

    Parameters
    ----------
    generated : mapping or iterable of (name, frame) pairs
        Molecule sets to compare. Each frame must expose a ``'ROMol'`` column.
        A mapping is read through ``.items()``; any other iterable is expected
        to yield ``(name, frame)`` pairs.
    metrics : mapping of str -> callable
        Metric name to the function applied to each molecule.
    n_jobs : int, optional
        Passed to ``mapper`` to control parallelism. Defaults to 1.
    img_folder : str, optional
        Directory the figures are saved to; created if missing. Defaults to the
        current working directory.
    reference : str, optional
        Name of the set every other set is compared against. Defaults to
        ``"MOSES"``, the key that was previously hard-coded.

    Raises
    ------
    KeyError
        If sets were supplied but none of them is named ``reference``.
    """
    import os  # local import: the notebook's import cell does not bring `os` into scope

    # Materialise once so a one-shot iterator is not exhausted after the first metric.
    if hasattr(generated, "items"):
        named_sets = list(generated.items())
    else:
        named_sets = list(generated)

    os.makedirs(img_folder, exist_ok=True)

    distributions = OrderedDict()
    for metric_name, metric_fn in metrics.items():
        distributions[metric_name] = OrderedDict()
        for _set, _molecules in named_sets:
            distributions[metric_name][_set] = mapper(n_jobs)(
                metric_fn, _molecules['ROMol'].dropna().values
            )

    for metric_name in metrics:
        per_set = distributions[metric_name]
        if per_set and reference not in per_set:
            raise KeyError(
                "reference set {!r} not found among generated sets {}".format(
                    reference, list(per_set)
                )
            )
        for model, d in per_set.items():
            dist = wasserstein_distance(per_set[reference], d)
            # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
            # kept here so the rendering matches the original. Consider sns.kdeplot.
            sns.distplot(
                d, hist=False, kde=True,
                kde_kws={'shade': True, 'linewidth': 3},
                label='{0} ({1:0.2g})'.format(model, dist))
        plt.title(metric_name, fontsize=14)
        plt.legend()
        plt.tight_layout()
        plt.savefig(
            os.path.join(img_folder, metric_name+'.pdf')
        )
        plt.savefig(
            os.path.join(img_folder, metric_name+'.png'),
            dpi=250
        )
        # Close the figure so curves from one metric do not bleed into the next.
        plt.close()

## Read data

### Generated molecules

In [4]:
# Read in generated molecules.
df = pd.read_csv("de_novo_data/hmm_de_novo_2k.csv")

# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  2000 non-null   object
 1   Active  2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.4+ KB
None


Unnamed: 0,SMILES,Active
0,CC(=O)c1ncc([C@H]1CO)C(=O)C(C)S[C@H](CCC1=CC(=...,1
1,CC(C)C)n1,1
2,COC(=O)(=O)CCl)N1CCN(Cc3nc(O)c3cn1nn[nH]2Cn2cc...,1
3,CC1CC2)C[C@]2(C2CCCCC3)c1,1
4,N[C@H]1[C@]43C)[C@@H]3[C@H]2[C@@H]1CCC2(CC1,1


In [5]:
# Disaggregate molecules sampled from active model vs inactive model.
df_active = df[df["Active"] == 1]
df_inactive = df[df["Active"] == 0]

# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(frame.info()) produced.
for subset in (df_active, df_inactive):
    subset.info()
    display(subset.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  1000 non-null   object
 1   Active  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB
None


Unnamed: 0,SMILES,Active
0,CC(=O)c1ncc([C@H]1CO)C(=O)C(C)S[C@H](CCC1=CC(=...,1
1,CC(C)C)n1,1
2,COC(=O)(=O)CCl)N1CCN(Cc3nc(O)c3cn1nn[nH]2Cn2cc...,1
3,CC1CC2)C[C@]2(C2CCCCC3)c1,1
4,N[C@H]1[C@]43C)[C@@H]3[C@H]2[C@@H]1CCC2(CC1,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1000 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  1000 non-null   object
 1   Active  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB
None


Unnamed: 0,SMILES,Active
1000,Cc1nnc(N(C)C(=O)c2ccc(OC)c(C(=O)c1ccc(C(N)(CC(...,0
1001,CCCOc1cccns1-c1nc3ccccc21,0
1002,CC(=O)[O-])CC1,0
1003,CN(C)C)=C(C/C=C(CNC2)cncn2)n2C(=O)O)CC1=C[C@]12O,0
1004,O=C(\C(=O)c4ccc(C(=O)[C@@H]1[C@H](CC[C@H]2[C@H...,0


### Training data from TDC

In [6]:
# Seed shared by both splits (the default random state seed for TDC).
# Split methods are documented at https://tdcommons.ai/functions/data_split/
# A scaffold split may be preferable to a random split in future work.
random_seed = 42

# SARS-CoV-2 In Vitro, Touret et al.: load, then random 70/10/20 split.
data_touret = HTS(name = "SARSCoV2_Vitro_Touret")
split_touret = data_touret.get_split(
    method = "random",
    seed = random_seed,
    frac = [0.7, 0.1, 0.2],
)

# SARS-CoV-2 3CL Protease, Diamond: same split settings.
data_diamond = HTS(name = "SARSCoV2_3CLPro_Diamond")
split_diamond = data_diamond.get_split(
    method = "random",
    seed = random_seed,
    frac = [0.7, 0.1, 0.2],
)

# Inspect what the loader and the splitter return (Touret only).
print("dataset datatype:", type(data_touret))
print("data split datatype:", type(split_touret), "\n")
display(data_touret)
print()
display(split_touret)

# Unpack the training / validation / testing frames from each split dict.
train_touret, val_touret, test_touret = (
    split_touret.get(part) for part in ("train", "valid", "test")
)
train_diamond, val_diamond, test_diamond = (
    split_diamond.get(part) for part in ("train", "valid", "test")
)

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


dataset datatype: <class 'tdc.single_pred.hts.HTS'>
data split datatype: <class 'dict'> 



<tdc.single_pred.hts.HTS at 0x7fe2f56f5280>




{'train':       Drug_ID                                               Drug  Y
 0           0                       CCOc1ccc2nc(S(N)(=O)=O)sc2c1  1
 1           1  C[C@]12C/C(=C/O)C(=O)C[C@@H]1CC[C@@H]1[C@@H]2C...  1
 2           2               Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O.Cl  1
 3           3  CC(=O)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C@@H](O)CC[C...  1
 4           4  C=C1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3C...  1
 ...       ...                                                ... ..
 1034     1477                        O=C(CCCCCCC(=O)Nc1ccccc1)NO  0
 1035     1478        COc1ccccc1OCCNCC(O)COc1cccc2[nH]c3ccccc3c12  0
 1036     1481  Clc1ccc(C(Cn2ccnc2)OCc2csc3c(Cl)cccc23)c(Cl)c1...  0
 1037     1482  CCSc1ccc2c(c1)N(CCCN1CCN(C)CC1)c1ccccc1S2.O=C(...  0
 1038     1483  C=Cc1c(C)c2cc3nc(cc4[nH]c(cc5nc(cc1[nH]2)C(C)=...  0
 
 [1039 rows x 3 columns],
 'valid':      Drug_ID                                               Drug  Y
 0        581              CC(=O)OCC(CCn1cnc2cnc(N)nc21)CO

In [7]:
def _report_split(split_label, split_df):
    """Print the class balance, schema and first rows of one data split.

    Parameters
    ----------
    split_label : str
        Text shown in the "--- <label> SPLIT ---" banner.
    split_df : pandas.DataFrame
        Split to summarise; must have a ``Y`` label column.
    """
    print("\n--- {} SPLIT ---\n".format(split_label))
    print(split_df.Y.value_counts())
    print(split_df.Y.value_counts(normalize = True), "\n")
    # info() prints its own report and returns None, so it is not wrapped in
    # print() (doing so only added a stray "None" line to the output).
    split_df.info()
    display(split_df.head())


# Explore data splits: the same report for every split of both datasets.
for dataset_banner, dataset_splits in (
    (
        "TOURET: SARS-CoV-2 In Vitro, Touret et al.",
        (("TRAINING", train_touret), ("VALIDATION", val_touret), ("TEST", test_touret)),
    ),
    (
        "DIAMOND: SARS-CoV-2 3CL Protease, Diamond et al.",
        (("TRAINING", train_diamond), ("VALIDATION", val_diamond), ("TEST", test_diamond)),
    ),
):
    print("\n~~~~~ {} ~~~~~\n".format(dataset_banner))
    for split_label, split_df in dataset_splits:
        _report_split(split_label, split_df)


~~~~~ TOURET: SARS-CoV-2 In Vitro, Touret et al. ~~~~~


--- TRAINING SPLIT ---

0    977
1     62
Name: Y, dtype: int64
0    0.940327
1    0.059673
Name: Y, dtype: float64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1039 entries, 0 to 1038
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  1039 non-null   int64 
 1   Drug     1039 non-null   object
 2   Y        1039 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 24.5+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
0,0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,1
1,1,C[C@]12C/C(=C/O)C(=O)C[C@@H]1CC[C@@H]1[C@@H]2C...,1
2,2,Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O.Cl,1
3,3,CC(=O)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C@@H](O)CC[C...,1
4,4,C=C1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3C...,1



--- VALIDATION SPLIT ---

0    141
1      7
Name: Y, dtype: int64
0    0.952703
1    0.047297
Name: Y, dtype: float64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  148 non-null    int64 
 1   Drug     148 non-null    object
 2   Y        148 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
0,581,CC(=O)OCC(CCn1cnc2cnc(N)nc21)COC(C)=O,0
1,375,CC(=O)S[C@@H]1CC2=CC(=O)CC[C@]2(C)[C@H]2CC[C@@...,0
2,703,Nc1nc2c(ncn2CCC(CO)CO)c(=O)[nH]1,0
3,1039,NC(=O)N1c2ccccc2C=Cc2ccccc21,0
4,610,O=C(O)CCc1nc(-c2ccccc2)c(-c2ccccc2)o1,0



--- TEST SPLIT ---

0    278
1     19
Name: Y, dtype: int64
0    0.936027
1    0.063973
Name: Y, dtype: float64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  297 non-null    int64 
 1   Drug     297 non-null    object
 2   Y        297 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.1+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
0,123,CCCCOc1cc(C(=O)NCCN(CC)CC)c2ccccc2n1,0
1,432,C[C@H](O)[C@H](O)[C@H]1CNc2nc(N)[nH]c(=O)c2N1....,0
2,1033,CS(=O)(=O)Nc1ccc([N+](=O)[O-])cc1Oc1ccccc1,0
3,529,CN1CCCCC1CCN1c2ccccc2Sc2ccc(S(C)=O)cc21.O=S(=O...,0
4,1417,CNCCCC12CCC(c3ccccc31)c1ccccc12.Cl,0



~~~~~ DIAMOND: SARS-CoV-2 3CL Protease, Diamond et al. ~~~~~


--- TRAINING SPLIT ---

0    568
1     48
Name: Y, dtype: int64
0    0.922078
1    0.077922
Name: Y, dtype: float64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 616 entries, 0 to 615
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  616 non-null    int64 
 1   Drug     616 non-null    object
 2   Y        616 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 14.6+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
0,1,CC(=O)NCCc1c[nH]c2ccc(F)cc12,1
1,2,NC(=O)[C@H]1CCC[C@H]1c1ccsc1,1
2,3,CN1CCCc2ccc(S(N)(=O)=O)cc21,1
3,4,CC(=O)Nc1ccc(Oc2ncccn2)cc1,1
4,6,O=C(CCl)N1CCN(S(=O)(=O)c2ccc(Cl)cc2)CC1,1



--- VALIDATION SPLIT ---

0    78
1    10
Name: Y, dtype: int64
0    0.886364
1    0.113636
Name: Y, dtype: float64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  88 non-null     int64 
 1   Drug     88 non-null     object
 2   Y        88 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 2.2+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
0,498,CC(=O)NCC1(c2ccccc2)CCOCC1,0
1,524,Nc1cc(C(F)(F)F)ccc1N1CCCCC1,0
2,410,Cn1cc(Oc2ncncc2Cl)cn1,0
3,233,CC1(C(N)=O)CCCN1,0
4,229,O=C(CCl)N1CCN(Cc2c(F)cccc2Cl)CC1,0



--- TEST SPLIT ---

0    156
1     20
Name: Y, dtype: int64
0    0.886364
1    0.113636
Name: Y, dtype: float64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  176 non-null    int64 
 1   Drug     176 non-null    object
 2   Y        176 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.2+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
0,331,CCOc1ccc(NC(=O)NC(C)(C)C)cc1,0
1,247,C[C@H]1CN(C)CC[C@H]1CO,0
2,790,CC(C)C(=O)N1CCN(C(C)C)CC1,0
3,316,Cc1ccccc1C(=O)NC1CCN(C(=O)CCl)CC1,0
4,215,C[C@H]1NCCC[C@H]1C#N,0


In [8]:
def _pool_by_label(label):
    """Pool all rows with ``Y == label`` across the splits of both datasets.

    Rows are taken from the train, validation and test splits (in that order)
    of the Touret and Diamond datasets. The original index labels of each split
    are kept (no ``ignore_index``), so labels in the pooled frames are not unique.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(touret_rows, diamond_rows, touret_then_diamond_rows)``.
    """
    touret_rows = pd.concat(
        [part[part.Y == label] for part in (train_touret, val_touret, test_touret)]
    )
    diamond_rows = pd.concat(
        [part[part.Y == label] for part in (train_diamond, val_diamond, test_diamond)]
    )
    return touret_rows, diamond_rows, pd.concat([touret_rows, diamond_rows])


# Aggregate all actives.
touret_pos, diamond_pos, df_train_active = _pool_by_label(1)

# Aggregate all inactives.
touret_neg, diamond_neg, df_train_inactive = _pool_by_label(0)

# Summarise actives first, then inactives. info() prints its own report and
# returns None, so it is called directly rather than through print().
for pooled in (df_train_active, df_train_inactive):
    print(pooled.Y.value_counts())
    pooled.info()
    display(pooled.head())

1    166
Name: Y, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 166 entries, 0 to 169
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  166 non-null    int64 
 1   Drug     166 non-null    object
 2   Y        166 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 5.2+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
0,0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,1
1,1,C[C@]12C/C(=C/O)C(=O)C[C@@H]1CC[C@@H]1[C@@H]2C...,1
2,2,Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O.Cl,1
3,3,CC(=O)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C@@H](O)CC[C...,1
4,4,C=C1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3C...,1


0    2198
Name: Y, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2198 entries, 62 to 175
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Drug_ID  2198 non-null   int64 
 1   Drug     2198 non-null   object
 2   Y        2198 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 68.7+ KB
None


Unnamed: 0,Drug_ID,Drug,Y
62,88,CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C(=O)O)C(C)(C)CCC1,0
63,89,CCC(COC(=O)c1cc(OC)c(OC)c(OC)c1)(c1ccccc1)N(C)C,0
64,90,Clc1ccc(Nc2nnc(Cc3ccncc3)c3ccccc23)cc1,0
65,91,CC(C)=CCC1C(=O)N(c2ccccc2)N(c2ccccc2)C1=O,0
66,92,CCN1CCCC(OC(=O)C(c2ccccc2)c2ccccc2)C1.Cl,0


In [9]:
# Pull out the SMILES columns: "Drug" in the TDC frames, "SMILES" in the generated frames.
actives_train = df_train_active.Drug
inactives_train = df_train_inactive.Drug
actives_gen = df_active.SMILES
inactives_gen = df_inactive.SMILES

# Report the size of each set, then preview the first ten actives of each kind.
for caption, smiles in (
    ("Total training actives:", actives_train),
    ("Total training inactives:", inactives_train),
    ("Total generated actives:", actives_gen),
):
    print(caption, len(smiles))
print("Total generated inactives:", len(inactives_gen), "\n")
print(actives_train.iloc[:10])
print(actives_gen.iloc[:10])

Total training actives: 166
Total training inactives: 2198
Total generated actives: 1000
Total generated inactives: 1000 

0                         CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1    C[C@]12C/C(=C/O)C(=O)C[C@@H]1CC[C@@H]1[C@@H]2C...
2                 Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O.Cl
3    CC(=O)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C@@H](O)CC[C...
4    C=C1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3C...
5                        C=CCOc1ccccc1OCC(O)CNC(C)C.Cl
6                       CC(C)=C/C(C)=N/Nc1nncc2ccccc12
7    CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...
8      CC1=C(/C=C/C(C)=C/C=C/C(C)=C\C(=O)O)C(C)(C)CCC1
9    COC(c1ccccc1)(c1ccccc1)[C@H](Oc1nc(C)cc(C)n1)C...
Name: Drug, dtype: object
0    CC(=O)c1ncc([C@H]1CO)C(=O)C(C)S[C@H](CCC1=CC(=...
1                                            CC(C)C)n1
2    COC(=O)(=O)CCl)N1CCN(Cc3nc(O)c3cn1nn[nH]2Cn2cc...
3                            CC1CC2)C[C@]2(C2CCCCC3)c1
4          N[C@H]1[C@]43C)[C@@H]3[C@H]2[C@@H]1CCC2(CC1
5    O=C(NC[O@@H]3C(=O)c3c

## MOSES metrics

In [10]:
# Show the help for moses.get_all_metrics (its signature is in the output below).
# The trailing `?` is IPython help syntax, so this line only runs inside an
# IPython/Jupyter session, not as plain Python.
moses.get_all_metrics?

[0;31mSignature:[0m
[0mmoses[0m[0;34m.[0m[0mget_all_metrics[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mgen[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mk[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;34m'cpu'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m=[0m[0;36m512[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpool[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest_scaffolds[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mptest[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mptest_scaffolds[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

In [20]:
# Compute MOSES metrics for de novo actives.
# NOTE(review): no `test=` / `test_scaffolds=` is passed, so the "*/Test" and
# "*/TestSF" entries in the output are presumably computed against MOSES's own
# bundled reference sets rather than the TDC data. Confirm before interpreting them.
# perf_counter() is a monotonic, high-resolution clock, unlike time.time(),
# which can jump if the system clock is adjusted.
start_time = time.perf_counter()
metrics_active = moses.get_all_metrics(gen = list(actives_gen), train = list(actives_train))
print("Complete in {} seconds.".format(time.perf_counter() - start_time))

display(metrics_active)

Complete in 8.240182161331177 seconds.


{'valid': 0.05900000000000005,
 'unique@1000': 0.7627118644067796,
 'unique@10000': 0.7627118644067796,
 'FCD/Test': 40.18987662034009,
 'SNN/Test': 0.2806273886965493,
 'Frag/Test': 0.18277930024448663,
 'Scaf/Test': 0.0,
 'FCD/TestSF': 41.149840656880755,
 'SNN/TestSF': 0.262983328457606,
 'Frag/TestSF': 0.18110062142721595,
 'Scaf/TestSF': 0.0,
 'IntDiv': 0.8531324031263273,
 'IntDiv2': 0.7647339504617954,
 'Filters': 0.6610169491525424,
 'logP': 1.5657829091726272,
 'SA': 0.6392330702729165,
 'QED': 0.33681948448106985,
 'weight': 198.25092439061152,
 'Novelty': 0.9777777777777777}

In [21]:
# Compute MOSES metrics for de novo inactives.
# NOTE(review): no `test=` / `test_scaffolds=` is passed, so the "*/Test" and
# "*/TestSF" entries in the output are presumably computed against MOSES's own
# bundled reference sets rather than the TDC data. Confirm before interpreting them.
# perf_counter() is a monotonic, high-resolution clock, unlike time.time(),
# which can jump if the system clock is adjusted.
start_time = time.perf_counter()
metrics_inactive = moses.get_all_metrics(gen = list(inactives_gen), train = list(inactives_train))
print("Complete in {} seconds.".format(time.perf_counter() - start_time))

display(metrics_inactive)

Complete in 10.178868055343628 seconds.


{'valid': 0.09399999999999997,
 'unique@1000': 0.7446808510638298,
 'unique@10000': 0.7446808510638298,
 'FCD/Test': 35.16884491559624,
 'SNN/Test': 0.3083278665992808,
 'Frag/Test': 0.2833126547134911,
 'Scaf/Test': 0.04550749084081307,
 'FCD/TestSF': 36.20779400296248,
 'SNN/TestSF': 0.2929789230544516,
 'Frag/TestSF': 0.2773416722369796,
 'Scaf/TestSF': 0.0,
 'IntDiv': 0.8863627103563346,
 'IntDiv2': 0.8008749375484697,
 'Filters': 0.8297872340425532,
 'logP': 1.4879923011009157,
 'SA': 0.9505904042871962,
 'QED': 0.3138469130348522,
 'weight': 194.32132918183092,
 'Novelty': 1.0}

## TDC metrics

In [13]:
# TDC metrics for the generated actives, scored against the pooled active training SMILES.
compute_tdc_metrics(generated = actives_gen, training = actives_train)

{'KL divergence': 0.18365253340222004,
 'Uniqueness': 0.045,
 'Validity': 0.059,
 'Novelty': 0.9777777777777777,
 'Diversity': 0.9126941406224025}

In [14]:
# TDC metrics for the generated inactives, scored against the pooled inactive training SMILES.
compute_tdc_metrics(generated = inactives_gen, training = inactives_train)

{'KL divergence': 0.23628022661331322,
 'Uniqueness': 0.07,
 'Validity': 0.094,
 'Novelty': 1.0,
 'Diversity': 0.9120883916651362}

In [15]:
# KL divergence of active training data and inactive training data.
# `kl` is kept as a notebook-level name because the following cells reuse it.
# NOTE(review): TDC's "KL_Divergence" evaluator appears to return a GuacaMol-style
# score (higher = more similar distributions) rather than a raw divergence.
# Confirm against the TDC docs before comparing these numbers.
kl = Evaluator(name = "KL_Divergence")
kl(actives_train, inactives_train)

0.8907899756080644

In [16]:
# KL divergence of active de novos and inactive training data.
# Reuses the `kl` evaluator created in the previous cell, which must be run first.
kl(actives_gen, inactives_train)

0.1844494405837512

In [17]:
# KL divergence of inactive de novos and active training data.
# Reuses the `kl` evaluator created in an earlier cell, which must be run first.
kl(inactives_gen, actives_train)

0.19660429841990162

In [18]:
# KL divergence of active de novos and inactive de novos.
# Reuses the `kl` evaluator created in an earlier cell, which must be run first.
kl(actives_gen, inactives_gen)

0.8712646548972882