In [194]:
import hashlib
import rdkit as rd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity

In [195]:
#Determines if SMILES is valid or not
def is_valid(smiles):
    
    mol = Chem.MolFromSmiles(smiles)

    #Returns True if SMILES is valid, returns False if SMILES is invalid
    return smiles != '' and mol is not None and mol.GetNumAtoms() > 0

In [196]:
#Number of characters generated
num_characters = 0

#Number of molecules generated
num_smiles = 0

#Number of unique SMILES
num_unq_mols = 0

#Number of unique SMILES that aren't in the training data
num_mols = 0

#Number of valid molecules that aren't in the training data generated
num_valid = 0

#List of smiles in file, to make sure smiles are unique
smileslist = []

In [197]:
#Training data
training_data = list(open("smiles.txt", "r"))

#File with unique generated SMILES that aren't in the training data
generatedmols = open("generatedmols.txt", "w")

#Read in data file line by line
for line in open("generatedsmiles.txt", "r"):
    
    #Ensure molecules are unique
    if line not in smileslist:
        
        smileslist.append(line)
        
        num_unq_mols += 1

        #Ensure smiles aren't in training data
        if line not in training_data:  

            #Remove \n character, remove G character
            smiles = line.replace("\n", "").replace("G", "")
            
            #Increment number of molecules generated
            num_mols += 1

            #Ensure smiles are valid
            if(is_valid(smiles) == True):
            
                #Copy over SMILES satisfying requirements
                generatedmols.write(line.replace("G", ""))
                
                #Increment number of valid molecules generated
                num_valid += 1
                
    #Increment total number of SMILES generated
    num_smiles += 1
    
    #Add length of line to total number of characters
    num_characters += len(line)

In [198]:
print("Number of characters generated: " + str(num_characters))
print("Number of molecules generated: " + str(num_smiles))
print("Number of unique molecules generated: " + str(num_unq_mols))
print("Number of novel and unique molecules generated: " + str(num_mols))
print("Number of novel, unique, and valid molecules generated: " + str(num_valid))

Number of characters generated: 275058
Number of molecules generated: 5517
Number of unique molecules generated: 1442
Number of novel and unique molecules generated: 1316
Number of novel, unique, and valid molecules generated: 667


In [199]:
#List of Morgan fingerprints of molecules
fingerprints = []

#Read in data file line by line
for line in open("generatedmols.txt", "r"):
    
    #Convert SMILES string to Morgan fingerprint
    mol = Chem.MolFromSmiles(line.replace("\n", ""))
    fingerprint = AllChem.GetMorganFingerprint(mol, 2)
    
    #Add to list of fingerprints
    fingerprints.append(fingerprint)

In [211]:
#Total Tanimoto Distance
tanimoto = 0

#Calculate Tanimoto Distance between each pair of fingerprints
for fpt1 in fingerprints:
    for fpt2 in fingerprints:
        
        if fpt1 != fpt2:
            
            #Calculate Tanimoto Distance
            tan = TanimotoSimilarity(fpt1, fpt2)
            tanimoto += tan

#Average Tanimoto Distance (internal diversity)
avg_tanimoto = (1 / (num_valid ** 2)) * tanimoto
print("Average Internal Diversity: {:0.4f}".format(avg_tanimoto))

Average Internal Diversity: 0.1661
