In [1]:
import os
import numpy as np
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
from scipy.spatial.distance import cdist
from scipy import sparse
from fcd import get_fcd, load_ref_model
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:


def canonical_smiles(smiles_list):
    """Converts a list of SMILES to canonical SMILES."""
    canonical = []
    for smiles in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol !='None':
                canonical.append(Chem.MolToSmiles(mol))
            else:
                canonical.append(None)
        except:
            canonical.append(None)
    return canonical

def calculate_validity(smiles_list):
    """Calculates the validity of a list of SMILES."""
    valid_count = 0
    for smiles in smiles_list:
        if smiles !='None':
            valid_count += 1
    return valid_count / len(smiles_list) if len(smiles_list) > 0 else 0

def calculate_novelty(generated_smiles, reference_smiles):
    """Calculates the novelty of generated SMILES compared to a reference set."""
    novel_count = 0
    for smiles in generated_smiles:
        if smiles !='None' and smiles not in reference_smiles:
            novel_count += 1
    return novel_count / len(generated_smiles) if len(generated_smiles) > 0 else 0

def calculate_unique(smiles_list):
    """Calculates the uniqueness of a list of SMILES."""
    unique_smiles = set()
    for smiles in smiles_list:
        if smiles !='None':
            unique_smiles.add(smiles)
    return len(unique_smiles) / len(smiles_list) if len(smiles_list) > 0 else 0

def calculate_internal_diversity(smiles_list):
    """
    Calculates the internal diversity of a list of SMILES strings
    using Morgan fingerprints and Tanimoto similarity.
    """
    if not smiles_list:
      return 0.0

    fps = []
    for smiles in smiles_list:
      if smiles !='None':
          mol = Chem.MolFromSmiles(smiles)
          if mol is not None:
              fp = GetMorganFingerprint(mol, 2)
              fps.append(fp)

    if not fps:
      return 0.0

    # Convert list of fingerprints to a sparse matrix
    fps_sparse = sparse.vstack(fps)

    # Calculate pairwise Tanimoto distances (1 - similarity)
    distances = cdist(fps_sparse, fps_sparse, 'jaccard')

    # Calculate the average distance (dissimilarity)
    avg_distance = np.mean(distances)

    return avg_distance

model = load_ref_model()


In [3]:
# gen_path = '/data2/chensm22/HRS/FreeGress-main/outputs/2025-01-05/06-56-22-logp_p01_lay12/final_smiles.txt'
# gen_path = '/data2/chensm22/HRS/FreeGress-main/outputs/2025-01-05/06-56-22-logp_p01_lay12/final_smiles.txt'
gen_path = '/home/hers22/HRS/MHdiff/sample/denovo/zinc/wo_dist/final_smiles.txt'

# gen_path = '/data2/chensm22/HRS/MHdiff/sample/denovo/zinc/charged/final_smiles.txt'
sample1 = np.array([s.strip() for s in open(gen_path).readlines()])
sample2 = np.random.choice(open('/data2/chensm22/HRS/data/zinc250k/raw/train.txt').readlines(), len(sample1), replace=False)

# sample1 = sample1[:4000]

# Calculate Validity
validity = calculate_validity(sample1)
print("Validity: ", validity)

sample1 = np.array([s for s in sample1 if s!='None'])
can_sample1 = [w for w in canonical_smiles(sample1) if w is not None]
can_sample2 = [w for w in canonical_smiles(sample2) if w is not None]


# Calculate Novelty
novelty = calculate_novelty(can_sample1, can_sample2)
print("Novelty: ", novelty)

# Calculate Unique
unique = calculate_unique(can_sample1)
print("Unique: ", unique)

# Calculate IntDiv
# intDiv = calculate_internal_diversity(can_sample1)
# print("IntDiv: ", intDiv)

fcd_score = get_fcd(can_sample1, can_sample2, model)

print("FCD: ", fcd_score)

Validity:  0.9166341145833333
Novelty:  0.9999644873752619
Unique:  0.9999644873752619
FCD:  0.8117532222388775
