In [1]:
import os

import moses
import pandas as pd
from IPython.display import clear_output
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_scaffold(data):
    """Return the Murcko scaffold SMILES for every SMILES string in ``data``.

    Args:
        data: Iterable of SMILES strings.

    Returns:
        list[str]: One scaffold SMILES per input entry, in input order.

    Raises:
        ValueError: If an entry cannot be parsed into an RDKit molecule.
    """
    scaffolds = []
    for position, smi in enumerate(data):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles reports a parse failure by returning None rather
            # than raising; fail here with the offending entry instead of
            # letting the scaffold call choke on None with an opaque error.
            raise ValueError(
                f"Invalid SMILES at position {position}: {smi!r}"
            )
        scaffolds.append(
            Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
        )
    return scaffolds

In [4]:
# Generated samples table (each column is evaluated on its own further down).
gen = pd.read_csv(
    './generation/gptneo_irak4_spe_20221103-1811-checkpoint-500/10_000_gen_sample.csv'
)

# Reference sets: header-less text files; keep only the first column as a list.
train, test = (
    pd.read_csv(path, header=None).iloc[:, 0].to_list()
    for path in (
        '../GPT2/dataset/IRAK4/irak4_train.txt',
        '../GPT2/dataset/IRAK4/irak4_test.txt',
    )
)
test_scaffolds = get_scaffold(test)

# Report how many entries each reference collection holds.
print("\n".join([
    f"Training set: {len(train):,} counts",
    f"Test set: {len(test):,} counts",
    f"Test Scaffold: {len(test_scaffolds):,} counts",
]))

Training set: 840 counts
Test set: 10 counts
Test Scaffold: 10 counts


In [21]:
%%time
metrics = []
for i in range(gen.shape[1]):
    data = gen.iloc[:, i].to_list()
    metrics.append(moses.get_all_metrics(
        data,
        n_jobs=56, device='cpu', batch_size=512,
        test=test, test_scaffolds=test_scaffolds,
        train=train
    ))

  cpuset_checked))
  "gen contains only {} molecules".format(len(gen))


CPU times: user 32min 10s, sys: 4min 5s, total: 36min 16s
Wall time: 2min 38s


In [24]:
# Tabulate the collected metrics (one row per evaluated column of `gen`),
# persist them next to the generated samples, and display the table.
df = pd.DataFrame(data=metrics)
df.to_csv(
    './generation/gptneo_irak4_spe_20221103-1811-checkpoint-500/metrics.csv',
    index=False,
)
df

Unnamed: 0,valid,unique@1000,unique@10000,FCD/Test,SNN/Test,Frag/Test,Scaf/Test,FCD/TestSF,SNN/TestSF,Frag/TestSF,Scaf/TestSF,IntDiv,IntDiv2,Filters,logP,SA,QED,weight,Novelty
0,0.9914,0.549,0.078071,30.38726,0.316955,0.813776,0.009453,35.63995,0.237472,0.693855,0.009453,0.80194,0.771127,0.945532,0.411628,0.554657,0.081076,55.730159,0.02584
1,1.0,0.534,0.0719,30.44478,0.320165,0.815555,0.012953,35.939203,0.238901,0.697045,0.012953,0.801092,0.770334,0.9459,0.421142,0.546691,0.079975,55.566711,0.0
2,0.9977,0.148,0.015736,28.607994,0.345782,0.790773,0.018396,33.36891,0.24174,0.670592,0.018396,0.771891,0.7173,0.977649,0.396087,0.529281,0.074187,53.472046,0.057325
