In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from os.path import expanduser
import argparse
import csv
import sys
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
from IPython.display import SVG
from rdkit import Chem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D

In [3]:
def defaultSupplFile():
  return '%s/chembl_19/chembl_19.sdf' % datadir

def saveFingerprints(results, filename):
  """ results is a dictionary: chembl_id -> [fingerprints] """
  with open(filename, 'w') as csvfile:
    fpwriter = csv.writer(csvfile, delimiter=",", quoting=csv.QUOTE_NONE)
    fpwriter.writerow(["compound","feature"])
    for compound in results:
        for feature in results[compound]:
            fpwriter.writerow( [compound, feature] )

def saveFingerprintsNpy(results, filename, cmpdlist, ecfp_fold=32000):
    """ write fingerprints to a npy """
    fp2 =[(x,(np.array(list(results[x].keys())), np.array(list(results[x].values())))) for x in results] #TODO: Modify to save compound list
    cmpd, ecfp   = zip(*fp2)
    feat, counts = zip(*ecfp)
    
    ## Create indicies for csr matrix
    lens    = np.array([len(f) for f in feat])
    indptr  = np.concatenate([[0], np.cumsum(lens)])
    indices = np.concatenate(feat) % ecfp_fold
    data    = np.ones(indices.shape[0])
    csr     = csr_matrix((data, indices, indptr), shape=(len(feat), ecfp_fold))
    csr.sum_duplicates()
    csr.data[:] = 1.0

    np.save(filename, csr)
    pd.DataFrame(cmpd).to_csv(cmpdlist, header=False)
       

In [4]:
def getChemblIDs(ic50file):
    a = pd.read_csv(ic50file, header = None)
    chembl = a[0].values 
    print(f"  getChemblIds - Read {len(chembl)} Compounds")
    return chembl

In [5]:
def printMol(mol, width = 450, height = 150):
    bits = AllChem.GetMorganFingerprint(mol, 3) 
    print ('Chembl-id:    %s' % mol.GetProp('chembl_id'))
    print ('Num of Atoms: %d' % mol.GetNumAtoms())
    print ("#bits:        %d" % len(bits.GetNonzeroElements()))
    print ('Smiles:       %s' % Chem.MolToSmiles(mol, isomericSmiles=True))
    print ("")
    drawMol(mol, width, height)


In [6]:
def drawMol(mol, width = 450, height = 150):
#     molSize = (width, height)
    mc = Chem.Mol(mol.ToBinary())

    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)

    drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    display(SVG(svg.replace('svg:','')))

In [7]:
def whatisthis(s):
    if isinstance(s, str):
        print("ordinary string")
    elif isinstance(s, unicode):
        print ("unicode string")
    else:
        print ("not a string")

In [8]:
class SDF:
  def __init__(self, supplFile):
    self.supplFile = supplFile
    print("supplFile: ", supplFile)
    self.suppl = Chem.SDMolSupplier( self.supplFile)
    print('Total number of molcules loaded: ', len(self.suppl))

  def getMol(self, n = 10e+9):
    """ function for looping over all molecules """
    self.suppl.reset()
    i = 0
    for rdmol in self.suppl:
      if rdmol is None: continue
      i += 1
      yield rdmol
      if i >= n: return

  def print4Mol(self):
    for mol in self.getMol(4):
      print(type(mol)) 
      bits = AllChem.GetMorganFingerprint(mol, 3) 
      print ('Chembl-id: %s' % mol.GetProp('chembl_id'))
      print ("#bits:     %d" % len(bits.GetNonzeroElements()))
      print ('Smiles:    %s' % Chem.MolToSmiles(mol, isomericSmiles=True))
      print ("")

  def getMorganFingerPrints(self, chemblIDs, nMorgan):
    ids = set(chemblIDs)
    results = dict()
    i=0
    for mol in self.getMol():
      i+=1
      if i %20000 == 0:
          print("Processed: %d compounds"%i)
      chembl_id = mol.GetProp('chembl_id') 
      if chembl_id not in ids:
        continue
      results[chembl_id] = AllChem.GetMorganFingerprint(mol, nMorgan).GetNonzeroElements()
    return results

  def getMorganFingerPrintsAll(self, nMorgan):
    results = dict()
    for mol in self.getMol():
      chembl_id = mol.GetProp('chembl_id') 
      results[chembl_id] = AllChem.GetMorganFingerprint(mol, nMorgan).GetNonzeroElements()
    return results
##### end of class SDF ######

In [9]:
  parser = argparse.ArgumentParser(description='Generate Morgan(3) fingerprints from raw SDF.')
  parser.add_argument('-c', '--compounds', metavar='FILE', help="CSV file of compounds (CHEMBL IDs). If not supplied all compounds are saved.", default=None)
  parser.add_argument('-s', '--sdf', help="Input SDF file.", required=True)
  parser.add_argument('-o', '--out', help="Output file name (CSV file of fingerprints).", required=True)
  parser.add_argument('-r', '--radius', metavar='R', type=int, help="ECFP radius (default 3)", default=3)
  parser.add_argument('--numids', action='store_true')

_StoreTrueAction(option_strings=['--numids'], dest='numids', nargs=0, const=True, default=False, type=None, choices=None, help=None, metavar=None)

In [18]:
in_args = "-s input/chembl_29.sdf -o output/chembl_29_X.csv -c output/chembl_29/10_chembl_29_compounds.csv"
in_args = in_args.split()
print(in_args)

['-s', 'input/chembl_29.sdf', '-o', 'output/chembl_29_X.csv', '-c', 'output/chembl_29/10_chembl_29_compounds.csv']


In [19]:
args = vars(parser.parse_args(in_args))
args
# args_sdf = args['sdf'].encode('utf-8')
# args_sdf = args['sdf'].encode('utf-8').decode('ascii')
# args_sdf

{'compounds': 'output/chembl_29/10_chembl_29_compounds.csv',
 'sdf': 'input/chembl_29.sdf',
 'out': 'output/chembl_29_X.csv',
 'radius': 3,
 'numids': False}

### Main

In [20]:
# def mainf(compoundsFile, outFile, sdfFile, nMorgan = 3, numericIds = False):
# return mainf(args["compounds"], args["out"], args["sdf"], nMorgan = args["radius"], numericIds = args["numids"])

compoundsFile = args["compounds"] 
outFile = args["out"]
sdfFile = args["sdf"]
nMorgan = args["radius"]
numericIds = args["numids"]

In [21]:
print('sdffile:', sdfFile, ' type: ', type(sdfFile))
print(f" compoundsFile = {compoundsFile} ")
print(f" outFile       = {outFile} ")
print(f" sdfFile       = {sdfFile} ")
print(f" nMorgan       = {nMorgan} ")
print(f" numericIds    = {numericIds} ")

sdffile: input/chembl_29.sdf  type:  <class 'str'>
 compoundsFile = output/chembl_29/10_chembl_29_compounds.csv 
 outFile       = output/chembl_29_X.csv 
 sdfFile       = input/chembl_29.sdf 
 nMorgan       = 3 
 numericIds    = False 


### Read `chembl_29.sdf` file

Also read the compunds CSV file generated in step 1

In [22]:
sdf = SDF(sdfFile)
print('sdf object: ' , sdf)

supplFile:  input/chembl_29.sdf
Total number of molcules loaded:  2084724
sdf object:  <__main__.SDF object at 0x7f82cfafd880>


### Get Morgan Fingerprint for compounds

If a compounds file was provided, get Morgan Fingerprinfs for componds in file

In [23]:
if compoundsFile is None:
    fp = sdf.getMorganFingerPrintsAll(nMorgan)
    print(' Compounds File = None')
else:
    compoundIDs = getChemblIDs(compoundsFile)
    fp = sdf.getMorganFingerPrints(compoundIDs, nMorgan)
    print(' Compounds File is Not None')
    #fp = {'a':[1,123]}

  getChemblIds - Read 423808 Compounds
Processed: 20000 compounds
Processed: 40000 compounds
Processed: 60000 compounds
Processed: 80000 compounds
Processed: 100000 compounds
Processed: 120000 compounds




Processed: 140000 compounds
Processed: 160000 compounds
Processed: 180000 compounds
Processed: 200000 compounds
Processed: 220000 compounds
Processed: 240000 compounds
Processed: 260000 compounds
Processed: 280000 compounds
Processed: 300000 compounds
Processed: 320000 compounds
Processed: 340000 compounds
Processed: 360000 compounds
Processed: 380000 compounds
Processed: 400000 compounds
Processed: 420000 compounds
Processed: 440000 compounds
Processed: 460000 compounds
Processed: 480000 compounds
Processed: 500000 compounds
Processed: 520000 compounds
Processed: 540000 compounds
Processed: 560000 compounds
Processed: 580000 compounds
Processed: 600000 compounds
Processed: 620000 compounds
Processed: 640000 compounds
Processed: 660000 compounds
Processed: 680000 compounds
Processed: 700000 compounds
Processed: 720000 compounds
Processed: 740000 compounds
Processed: 760000 compounds
Processed: 780000 compounds
Processed: 800000 compounds
Processed: 820000 compounds
Processed: 840000 co



Processed: 1340000 compounds
Processed: 1360000 compounds
Processed: 1380000 compounds




Processed: 1400000 compounds
Processed: 1420000 compounds
Processed: 1440000 compounds
Processed: 1460000 compounds
Processed: 1480000 compounds
Processed: 1500000 compounds
Processed: 1520000 compounds
Processed: 1540000 compounds




Processed: 1560000 compounds




Processed: 1580000 compounds




Processed: 1600000 compounds
Processed: 1620000 compounds
Processed: 1640000 compounds
Processed: 1660000 compounds
Processed: 1680000 compounds
Processed: 1700000 compounds
Processed: 1720000 compounds
Processed: 1740000 compounds
Processed: 1760000 compounds
Processed: 1780000 compounds
Processed: 1800000 compounds
Processed: 1820000 compounds
Processed: 1840000 compounds
Processed: 1860000 compounds
Processed: 1880000 compounds
Processed: 1900000 compounds
Processed: 1920000 compounds
Processed: 1940000 compounds




Processed: 1960000 compounds
Processed: 1980000 compounds
Processed: 2000000 compounds
Processed: 2020000 compounds
Processed: 2040000 compounds
Processed: 2060000 compounds
Processed: 2080000 compounds
 Compounds File is Not None


### Save Morgan Fingerprint results 

In [24]:
saveFingerprints(fp, outFile)

saveFingerprintsNpy(fp, "20.1_fp_32000.npy", "20.2_cmpd_list_X.csv", ecfp_fold=32000)

print(f" fingerprints written to outFile: {outFile} ")
print(f" fingerprints written in npy format cmpd_list_X.csv ")

 fingerprints written to outFile: output/chembl_29_X.csv 
 fingerprints written in npy format cmpd_list_X.csv 


In [6]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("2.2_cmpd_list_X.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423735 entries, 0 to 423734
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   0             423735 non-null  int64 
 1   CHEMBL405398  423735 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.5+ MB


In [5]:
del df

In [8]:
np.load("2.1_fp_32000.npy", allow_pickle = True)

array(<423736x32000 sparse matrix of type '<class 'numpy.float64'>'
	with 32838620 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [9]:
fp

NameError: name 'fp' is not defined