In [1]:
import numpy as np
import rdkit as rd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity

In [2]:
#Determines if SMILES is valid or not
def is_valid(smiles):
    
    mol = Chem.MolFromSmiles(smiles)

    #Returns True if SMILES is valid, returns False if SMILES is invalid
    return smiles != '' and mol is not None and mol.GetNumAtoms() > 0

In [3]:
#Determines LogP
def logP(smiles):
    
    return(Descriptors.MolLogP(Chem.MolFromSmiles(smiles)))

In [4]:
#Determins molecular weight
def molWt(smiles):
    
    return(Descriptors.MolWt(Chem.MolFromSmiles(smiles)))

In [5]:
#Determine number hydrogen bond acceptors
def numAcc(smiles):
    
    return(Descriptors.NumHAcceptors(Chem.MolFromSmiles(smiles)))

In [6]:
#Determine number hydrogen bond donors
def numDon(smiles):
    
    return(Descriptors.NumHDonors(Chem.MolFromSmiles(smiles)))

In [7]:
#Determine polar surface area
def polSur(smiles):
    
    return(Descriptors.TPSA(Chem.MolFromSmiles(smiles)))

In [40]:
#Determine number of rotatable bonds
def rolBon(smiles):
    return(Descriptors.NumRotatableBonds(Chem.MolFromSmiles(smiles)))

In [34]:
#Number of characters generated
num_characters = 0

#Number of molecules generated
num_smiles = 0

#Number of unique SMILES
num_unq_mols = 0

#Number of unique SMILES that aren't in the training data
num_mols = 0

#Number of valid molecules that aren't in the training data generated
num_valid = 0

#List of smiles in file, to make sure smiles are unique
smileslist = []

In [35]:
#Training data
training_data = list(open("smiles.txt", "r"))

#File with unique generated SMILES that aren't in the training data
generatedmols = open("generatedmols.txt", "w")

#Read in data file line by line
for line in open("generatedsmiles.txt", "r"):
    
    #Ensure molecules are unique
    if line not in smileslist:
        
        smileslist.append(line)
        
        num_unq_mols += 1

        #Ensure smiles aren't in training data
        if line not in training_data:  

            #Remove \n character, remove G character
            smiles = line.replace("\n", "").replace("G", "")
            
            #Increment number of molecules generated
            num_mols += 1

            #Ensure smiles are valid
            if(is_valid(smiles) == True):
            
                #Copy over SMILES satisfying requirements
                generatedmols.write(smiles + "\n")
                
                #Increment number of valid molecules generated
                num_valid += 1
                
    #Increment total number of SMILES generated
    num_smiles += 1
    
    #Add length of line to total number of characters
    num_characters += len(line)

In [36]:
print("Number of characters generated: " + str(num_characters))
print("Number of molecules generated: " + str(num_smiles))
print("Number of unique molecules generated: " + str(num_unq_mols))
print("Number of novel and unique molecules generated: " + str(num_mols))
print("Number of novel, unique, and valid molecules generated: " + str(num_valid))

Number of characters generated: 275058
Number of molecules generated: 5517
Number of unique molecules generated: 1442
Number of novel and unique molecules generated: 1316
Number of novel, unique, and valid molecules generated: 667


In [37]:
#List of Morgan fingerprints of molecules
fingerprints = []

#Read in data file line by line
for line in open("generatedmols.txt", "r"):
    
    #Convert SMILES string to Morgan fingerprint
    mol = Chem.MolFromSmiles(line.replace("\n", ""))
    fingerprint = AllChem.GetMorganFingerprint(mol, 2)
    
    #Add to list of fingerprints
    fingerprints.append(fingerprint)    

In [38]:
#Total Tanimoto Distance
tanimoto = 0

#Calculate Tanimoto Distance between each pair of fingerprints
for fpt1 in fingerprints:
    for fpt2 in fingerprints:
        
        if fpt1 != fpt2:
            
            #Calculate Tanimoto Distance
            tan = TanimotoSimilarity(fpt1, fpt2)
            tanimoto += tan

#Average Tanimoto Distance (internal diversity)
avg_tanimoto = (1 / (num_valid ** 2)) * tanimoto
print("Average Internal Diversity: {:0.4f}".format(avg_tanimoto))

Average Internal Diversity: 0.1661


In [41]:
#Array of molecular properties for generated molecules
molProps = np.empty((0,5))

#Read in data file line by line
for molecule in open("generatedmols.txt", "r"):
    
    #Array of properties [partition coefficient, molecular weight, number of hydrogen bond acceptors, number of hydrogen bond donors, polar surface area]
    props = np.reshape(np.array([logP(molecule), molWt(molecule), numAcc(molecule), numDon(molecule), rolBon(molecule)]), (1, 5))
    
    #Append properties
    molProps = np.append(molProps, props, axis=0)

In [42]:
#Array of number of molecules each molecule is dominated by
dom = np.zeros((np.shape(molProps)[0]))

#Analyze each molecule's properties as they compare to others
for i in range(np.shape(molProps)[0]):
    
    #Number of properties that molecule i is worse than molecule j
    num_worse = 0
    
    for j in range(np.shape(molProps)[0]):
        
        #Comapre each property between the molecules
        if (molProps[j, 0] < molProps[i, 0]):
            num_worse += 1
            
        if (molProps[j, 1] < molProps[i, 1]):
            num_worse += 1
            
        if (molProps[j, 2] < molProps[i, 2]):
            num_worse += 1

        if (molProps[j, 3] < molProps[i, 3]):
            num_worse += 1
            
        if (molProps[j, 4] < molProps[i, 4]):
            num_worse += 1
            
        #If molecule i is worse than molecule j in every property, molecule j dominates molecule i
        if (num_worse == 5):
            dom[i] += 1

In [45]:
#Count how fronteirs there are and how many molecules are in each frontier
unique, counts = np.unique(dom, return_counts=True)
dict(zip(unique, counts))

{0.0: 340, 1.0: 260, 2.0: 46, 3.0: 17, 6.0: 3, 16.0: 1}