###  Step 20 

Use the Chembl SDF file to create a list of compounds, features, and feature counts.

Outputs:
 
 * output/chembl_29/chembl_29_X.csv           csv list of compounds, features 
 
 * Step20/2.1_fp_32000.npy                    npy folded file
 
 * Step20/2.2_cmpd_list_X.csv                 csv list: seq_number,compound

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload
%autoreload 2

In [2]:
from os.path import expanduser
import argparse
import csv
import sys
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from IPython.display import SVG

In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D

In [4]:
def defaultSupplFile():
  return '%s/chembl_19/chembl_19.sdf' % datadir

def saveFingerprints(results, filename):
  """ results is a dictionary: chembl_id -> [fingerprints] """
  with open(filename, 'w') as csvfile:
    fpwriter = csv.writer(csvfile, delimiter=",", quoting=csv.QUOTE_NONE)
    fpwriter.writerow(["compound","feature"])
    for compound in results:
        for feature in results[compound]:
            fpwriter.writerow( [compound, feature] )
    print(f" => fingerprints written to {filename} ")    
    
    
def saveFingerprintsNpy(results, filename, cmpdlist, ecfp_fold=32000):
    """ write fingerprints to npy, compunds to CSV file """
    ## fp2 : list of tuples : each tuple: (compound id, (array of features, array of counts))
    fp2 =[(x,(np.array(list(results[x].keys())), np.array(list(results[x].values())))) for x in results] #TODO: Modify to save compound list
    
    ## cmpd: tuple of strings, each representing a chembl compound
    ## ecfp: tuple of tuples each tuple being an array of (features, counts)
    cmpd, ecfp   = zip(*fp2) 
    
    ## feat  : tuple of ndarrays, each member being a ndarray of features 
    ## counts: tuple of ndarrays, each member being a ndarray of counts 
    feat, counts = zip(*ecfp)
    
    ## Create indicies for csr matrix
    ## the column indices for row i are stored in indices[indptr[i]:indptr[i+1]] 
    ## their corresponding values are stored in data[indptr[i]:indptr[i+1]]
    lens    = np.array([len(f) for f in feat])
    indptr  = np.concatenate([[0], np.cumsum(lens)])
    indices = np.concatenate(feat) % ecfp_fold
    data    = np.ones(indices.shape[0])
    
    ## Create CSR matrix
    csr     = csr_matrix((data, indices, indptr), shape=(len(feat), ecfp_fold))
    csr.sum_duplicates()
    csr.data[:] = 1.0
    
    # write fingerprint data to numpy CSR sparse matrix 
    np.save(filename, csr)
    print(f" => fingerprints written to {filename} ")    
    
    # write compounds to csv file 
    pd.DataFrame(cmpd).to_csv(cmpdlist, header=False)
    print(f" => compounds written to {cmpdlist} ")       

In [5]:
def getChemblIDs(ic50file):
    a = pd.read_csv(ic50file, header = None)
    chembl = a[0].values 
    print(f"  getChemblIds - Read {len(chembl)} Compounds")
    return chembl

In [6]:
def printMol(mol, width = 450, height = 150):
    bits = AllChem.GetMorganFingerprint(mol, 3) 
    print ('Chembl-id:    %s' % mol.GetProp('chembl_id'))
    print ('Num of Atoms: %d' % mol.GetNumAtoms())
    print ("#bits:        %d" % len(bits.GetNonzeroElements()))
    print ('Smiles:       %s' % Chem.MolToSmiles(mol, isomericSmiles=True))
    print ("")
    drawMol(mol, width, height)


In [7]:
def drawMol(mol, width = 450, height = 150):
#     molSize = (width, height)
    mc = Chem.Mol(mol.ToBinary())

    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)

    drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    display(SVG(svg.replace('svg:','')))

In [8]:
def whatisthis(s):
    if isinstance(s, str):
        print("ordinary string")
    elif isinstance(s, unicode):
        print ("unicode string")
    else:
        print ("not a string")

### SDF file class definition

In [9]:
class SDF:
    
    
  def __init__(self, supplFile):
    self.supplFile = supplFile
    print("supplFile: ", supplFile)
    self.suppl = Chem.SDMolSupplier( self.supplFile)
    print('Total number of molcules loaded: ', len(self.suppl))

  def getMol(self, n = 10e+9):
    """ function for looping over all molecules """
    self.suppl.reset()
    i = 0
    for rdmol in self.suppl:
      if rdmol is None: continue
      i += 1
      yield rdmol
      if i >= n: return

  def print4Mol(self):
    for mol in self.getMol(4):
      print(type(mol)) 
      bits = AllChem.GetMorganFingerprint(mol, 3) 
      print ('Chembl-id: %s' % mol.GetProp('chembl_id'))
      print ("#bits:     %d" % len(bits.GetNonzeroElements()))
      print ('Smiles:    %s' % Chem.MolToSmiles(mol, isomericSmiles=True))
      print ("")

  def getMorganFingerPrints(self, chemblIDs, nMorgan):
    ids = set(chemblIDs)
    results = dict()
    i=0
    for mol in self.getMol():
      i+=1
      chembl_id = mol.GetProp('chembl_id') 
      if i %20000 == 0:
          print(f"Processed: {i} compounds - chembl_id : {chembl_id}")
      if chembl_id not in ids:
        continue
      results[chembl_id] = AllChem.GetMorganFingerprint(mol, nMorgan).GetNonzeroElements()
    return results

  def getMorganFingerPrintsAll(self, nMorgan):
    results = dict()
    for mol in self.getMol():
      chembl_id = mol.GetProp('chembl_id') 
      results[chembl_id] = AllChem.GetMorganFingerprint(mol, nMorgan).GetNonzeroElements()
    return results
##### end of class SDF ######


### Main

In [10]:
  parser = argparse.ArgumentParser(description='Generate Morgan(3) fingerprints from raw SDF.')
  parser.add_argument('-c', '--compounds', metavar='FILE', help="CSV file of compounds (CHEMBL IDs). If not supplied all compounds are saved.", default=None)
  parser.add_argument('-s', '--sdf', help="Input SDF file.", required=True)
  parser.add_argument('-o', '--out', help="Output file name (CSV file of fingerprints).", required=True)
  parser.add_argument('-r', '--radius', metavar='R', type=int, help="ECFP radius (default 3)", default=3)
  parser.add_argument('--numids', action='store_true')

_StoreTrueAction(option_strings=['--numids'], dest='numids', nargs=0, const=True, default=False, type=None, choices=None, help=None, metavar=None)

In [11]:
VERSION = 'chembl_29_dev'

in_args = f" -s  input/chembl_29.sdf "\
          f" -o output/{VERSION}/{VERSION}_X.csv " \
          f" -c output/{VERSION}/{VERSION}_compounds.csv"
in_args = in_args.split()
args = vars(parser.parse_args(in_args))

print(in_args)
print()
for i in args:
    print(f"  {i:15s}     {args[i]}")

# args_sdf = args['sdf'].encode('utf-8')
# args_sdf = args['sdf'].encode('utf-8').decode('ascii')
# args_sdf

['-s', 'input/chembl_29.sdf', '-o', 'output/chembl_29_dev/chembl_29_dev_X.csv', '-c', 'output/chembl_29_dev/chembl_29_dev_compounds.csv']

  compounds           output/chembl_29_dev/chembl_29_dev_compounds.csv
  sdf                 input/chembl_29.sdf
  out                 output/chembl_29_dev/chembl_29_dev_X.csv
  radius              3
  numids              False


In [12]:
# def mainf(compoundsFile, outFile, sdfFile, nMorgan = 3, numericIds = False):
# return mainf(args["compounds"], args["out"], args["sdf"], nMorgan = args["radius"], numericIds = args["numids"])

compoundsFile = args["compounds"] 
outFile = args["out"]
sdfFile = args["sdf"]
nMorgan = args["radius"]
numericIds = args["numids"]

print(f" sdffile       = {sdfFile}")
print(f" compoundsFile = {compoundsFile} ")
print(f" outFile       = {outFile} ")
print(f" nMorgan       = {nMorgan} ")
print(f" numericIds    = {numericIds} ")

 sdffile       = input/chembl_29.sdf
 compoundsFile = output/chembl_29_dev/chembl_29_dev_compounds.csv 
 outFile       = output/chembl_29_dev/chembl_29_dev_X.csv 
 nMorgan       = 3 
 numericIds    = False 


In [14]:
of = pd.read_csv(compoundsFile,header=None)

of.info()
print()
print(of.nunique())
print()
print(of.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423809 entries, 0 to 423808
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       423809 non-null  object
dtypes: object(1)
memory usage: 3.2+ MB

0    423809
dtype: int64

               0
0  CHEMBL1234777
1  CHEMBL1812661
2  CHEMBL1812662
3  CHEMBL2326084
4  CHEMBL2326085


### Read `chembl_29.sdf` file

Also read the compunds CSV file generated in step 1

In [15]:
sdf = SDF(sdfFile)
print('sdf object: ' , sdf)

supplFile:  input/chembl_29.sdf
Total number of molcules loaded:  2084724
sdf object:  <__main__.SDF object at 0x7fba7c5452b0>


### Get Morgan Fingerprint for compounds

If a compounds file was provided, get Morgan Fingerprinfs for componds in file

In [16]:
if compoundsFile is None:
    print(' Compounds File is Not provided - run for all Chembl ids')
    fp = sdf.getMorganFingerPrintsAll(nMorgan)
else:
    print(f" Compounds File is {compoundsFile}")
    compoundIDs = getChemblIDs(compoundsFile)
    fp = sdf.getMorganFingerPrints(compoundIDs, nMorgan)

 Compounds File is output/chembl_29_dev/chembl_29_dev_compounds.csv
  getChemblIds - Read 423809 Compounds
Processed: 20000 compounds - chembl_id : CHEMBL23508
Processed: 40000 compounds - chembl_id : CHEMBL44263
Processed: 60000 compounds - chembl_id : CHEMBL67628
Processed: 80000 compounds - chembl_id : CHEMBL90147
Processed: 100000 compounds - chembl_id : CHEMBL111755
Processed: 120000 compounds - chembl_id : CHEMBL132114




Processed: 140000 compounds - chembl_id : CHEMBL151673
Processed: 160000 compounds - chembl_id : CHEMBL171563
Processed: 180000 compounds - chembl_id : CHEMBL190889
Processed: 200000 compounds - chembl_id : CHEMBL209362
Processed: 220000 compounds - chembl_id : CHEMBL225994
Processed: 240000 compounds - chembl_id : CHEMBL242586
Processed: 260000 compounds - chembl_id : CHEMBL4458812
Processed: 280000 compounds - chembl_id : CHEMBL278545
Processed: 300000 compounds - chembl_id : CHEMBL299609
Processed: 320000 compounds - chembl_id : CHEMBL4468915
Processed: 340000 compounds - chembl_id : CHEMBL340126
Processed: 360000 compounds - chembl_id : CHEMBL4474989
Processed: 380000 compounds - chembl_id : CHEMBL1201984
Processed: 400000 compounds - chembl_id : CHEMBL395047
Processed: 420000 compounds - chembl_id : CHEMBL4517967
Processed: 440000 compounds - chembl_id : CHEMBL438615
Processed: 460000 compounds - chembl_id : CHEMBL460768
Processed: 480000 compounds - chembl_id : CHEMBL478613
Proce



Processed: 1340000 compounds - chembl_id : CHEMBL1983322
Processed: 1360000 compounds - chembl_id : CHEMBL2035131
Processed: 1380000 compounds - chembl_id : CHEMBL2109907




Processed: 1400000 compounds - chembl_id : CHEMBL2144471
Processed: 1420000 compounds - chembl_id : CHEMBL2153098
Processed: 1440000 compounds - chembl_id : CHEMBL2312695
Processed: 1460000 compounds - chembl_id : CHEMBL2347669
Processed: 1480000 compounds - chembl_id : CHEMBL2386980
Processed: 1500000 compounds - chembl_id : CHEMBL2436775
Processed: 1520000 compounds - chembl_id : CHEMBL2230239
Processed: 1540000 compounds - chembl_id : CHEMBL3112640




Processed: 1560000 compounds - chembl_id : CHEMBL3193455




Processed: 1580000 compounds - chembl_id : CHEMBL3186526




Processed: 1600000 compounds - chembl_id : CHEMBL3249770
Processed: 1620000 compounds - chembl_id : CHEMBL3262060
Processed: 1640000 compounds - chembl_id : CHEMBL3440752
Processed: 1660000 compounds - chembl_id : CHEMBL3451724
Processed: 1680000 compounds - chembl_id : CHEMBL3470006
Processed: 1700000 compounds - chembl_id : CHEMBL3494322
Processed: 1720000 compounds - chembl_id : CHEMBL3480104
Processed: 1740000 compounds - chembl_id : CHEMBL3617872
Processed: 1760000 compounds - chembl_id : CHEMBL3634613
Processed: 1780000 compounds - chembl_id : CHEMBL3668594
Processed: 1800000 compounds - chembl_id : CHEMBL3641064
Processed: 1820000 compounds - chembl_id : CHEMBL3639488
Processed: 1840000 compounds - chembl_id : CHEMBL3781977
Processed: 1860000 compounds - chembl_id : CHEMBL3827010
Processed: 1880000 compounds - chembl_id : CHEMBL3910942
Processed: 1900000 compounds - chembl_id : CHEMBL3932182
Processed: 1920000 compounds - chembl_id : CHEMBL3954284
Processed: 1940000 compounds - 



Processed: 1960000 compounds - chembl_id : CHEMBL3984606
Processed: 1980000 compounds - chembl_id : CHEMBL4088547
Processed: 2000000 compounds - chembl_id : CHEMBL4107920
Processed: 2020000 compounds - chembl_id : CHEMBL4173719
Processed: 2040000 compounds - chembl_id : CHEMBL4211115
Processed: 2060000 compounds - chembl_id : CHEMBL4202486
Processed: 2080000 compounds - chembl_id : CHEMBL4283762


### Save Morgan Fingerprint results 

In [17]:
print(f" output file   :   {outFile}")
print(f" compounds file:   {compoundsFile}")
print(type(fp))
fp_key_list = list(fp.keys())
print(len(fp_key_list))
print(fp_key_list[0])
print(fp['CHEMBL405398'])

 output file   :   output/chembl_29_dev/chembl_29_dev_X.csv
 compounds file:   output/chembl_29_dev/chembl_29_dev_compounds.csv
<class 'dict'>
423737
CHEMBL405398
{98513984: 1, 136810838: 1, 591190074: 1, 600629739: 1, 619920801: 2, 725322217: 1, 779152244: 1, 787069595: 1, 847961216: 2, 895250610: 1, 951226070: 3, 971553482: 1, 1003790885: 1, 1024714809: 1, 1100037548: 2, 1222631225: 2, 1255595680: 1, 1257718710: 2, 1271427701: 1, 1316442092: 1, 1349404210: 1, 1528304983: 1, 1530144349: 1, 1601910673: 1, 1608392297: 1, 1634606847: 1, 1717044408: 1, 1781750782: 1, 1868602760: 1, 1907845850: 1, 1971022618: 1, 2041434490: 3, 2048117778: 1, 2092489639: 1, 2093282138: 1, 2144345290: 1, 2245384272: 2, 2257970297: 1, 2267998563: 1, 2315593046: 1, 2589782852: 1, 2676693293: 1, 2685954589: 1, 2747921197: 1, 2803848648: 2, 2900402147: 1, 2960810393: 1, 2967998925: 1, 2968968094: 4, 2991110306: 1, 3063504103: 1, 3095754548: 1, 3118255683: 1, 3182824521: 1, 3189457552: 1, 3203709451: 1, 321738070

In [18]:
## Write CSV dataset of compound, feature
saveFingerprints(fp, outFile)

 => fingerprints written to output/chembl_29_dev/chembl_29_dev_X.csv 


In [19]:
saveFingerprintsNpy(fp, "Step20/2.1_fp_32000.npy", "Step20/2.2_cmpd_list_X.csv", ecfp_fold=32000)


 => fingerprints written to Step20/2.1_fp_32000.npy 
 => compounds written to Step20/2.2_cmpd_list_X.csv 


In [20]:
saveFingerprintsNpy(fp, f"output/{VERSION}/{VERSION}_X", f"output/{VERSION}/{VERSION}_X_cmpds.csv", ecfp_fold=32000)


 => fingerprints written to output/chembl_29_dev/chembl_29_dev_X 
 => compounds written to output/chembl_29_dev/chembl_29_dev_X_cmpds.csv 


### Misc

In [2]:
import pandas as pd
import numpy as np

In [27]:
df = pd.read_csv("output/chembl_29/chembl_29_X.csv")

df.info()
df.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32883981 entries, 0 to 32883980
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   compound  object
 1   feature   int64 
dtypes: int64(1), object(1)
memory usage: 501.8+ MB


compound     423737
feature     1090504
dtype: int64

In [28]:
df = pd.read_csv("Step20/2.2_cmpd_list_X.csv")

df.info()
df.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423736 entries, 0 to 423735
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   0             423736 non-null  int64 
 1   CHEMBL405398  423736 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.5+ MB


0               423736
CHEMBL405398    423736
dtype: int64

In [5]:
del df

In [33]:
np.load("Step20/2.1_fp_32000.npy", allow_pickle = True)

array(<423737x32000 sparse matrix of type '<class 'numpy.float64'>'
	with 32838752 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [38]:
list(fp.keys())[:10]
print(fp['CHEMBL405398'])

{98513984: 1, 136810838: 1, 591190074: 1, 600629739: 1, 619920801: 2, 725322217: 1, 779152244: 1, 787069595: 1, 847961216: 2, 895250610: 1, 951226070: 3, 971553482: 1, 1003790885: 1, 1024714809: 1, 1100037548: 2, 1222631225: 2, 1255595680: 1, 1257718710: 2, 1271427701: 1, 1316442092: 1, 1349404210: 1, 1528304983: 1, 1530144349: 1, 1601910673: 1, 1608392297: 1, 1634606847: 1, 1717044408: 1, 1781750782: 1, 1868602760: 1, 1907845850: 1, 1971022618: 1, 2041434490: 3, 2048117778: 1, 2092489639: 1, 2093282138: 1, 2144345290: 1, 2245384272: 2, 2257970297: 1, 2267998563: 1, 2315593046: 1, 2589782852: 1, 2676693293: 1, 2685954589: 1, 2747921197: 1, 2803848648: 2, 2900402147: 1, 2960810393: 1, 2967998925: 1, 2968968094: 4, 2991110306: 1, 3063504103: 1, 3095754548: 1, 3118255683: 1, 3182824521: 1, 3189457552: 1, 3203709451: 1, 3217380708: 6, 3218693969: 7, 3311725064: 1, 3596485316: 1, 3598197347: 1, 3612926680: 1, 3632350815: 1, 3692055567: 1, 3706955836: 1, 3709437002: 1, 3750444818: 1, 3776905

In [78]:
fp['CHEMBL405398']

{98513984: 1,
 136810838: 1,
 591190074: 1,
 600629739: 1,
 619920801: 2,
 725322217: 1,
 779152244: 1,
 787069595: 1,
 847961216: 2,
 895250610: 1,
 951226070: 3,
 971553482: 1,
 1003790885: 1,
 1024714809: 1,
 1100037548: 2,
 1222631225: 2,
 1255595680: 1,
 1257718710: 2,
 1271427701: 1,
 1316442092: 1,
 1349404210: 1,
 1528304983: 1,
 1530144349: 1,
 1601910673: 1,
 1608392297: 1,
 1634606847: 1,
 1717044408: 1,
 1781750782: 1,
 1868602760: 1,
 1907845850: 1,
 1971022618: 1,
 2041434490: 3,
 2048117778: 1,
 2092489639: 1,
 2093282138: 1,
 2144345290: 1,
 2245384272: 2,
 2257970297: 1,
 2267998563: 1,
 2315593046: 1,
 2589782852: 1,
 2676693293: 1,
 2685954589: 1,
 2747921197: 1,
 2803848648: 2,
 2900402147: 1,
 2960810393: 1,
 2967998925: 1,
 2968968094: 4,
 2991110306: 1,
 3063504103: 1,
 3095754548: 1,
 3118255683: 1,
 3182824521: 1,
 3189457552: 1,
 3203709451: 1,
 3217380708: 6,
 3218693969: 7,
 3311725064: 1,
 3596485316: 1,
 3598197347: 1,
 3612926680: 1,
 3632350815: 1,
 3692

In [80]:
for compound in fp:
    for feature in fp[compound]:
        print([compound, feature] )
    break

['CHEMBL405398', 98513984]
['CHEMBL405398', 136810838]
['CHEMBL405398', 591190074]
['CHEMBL405398', 600629739]
['CHEMBL405398', 619920801]
['CHEMBL405398', 725322217]
['CHEMBL405398', 779152244]
['CHEMBL405398', 787069595]
['CHEMBL405398', 847961216]
['CHEMBL405398', 895250610]
['CHEMBL405398', 951226070]
['CHEMBL405398', 971553482]
['CHEMBL405398', 1003790885]
['CHEMBL405398', 1024714809]
['CHEMBL405398', 1100037548]
['CHEMBL405398', 1222631225]
['CHEMBL405398', 1255595680]
['CHEMBL405398', 1257718710]
['CHEMBL405398', 1271427701]
['CHEMBL405398', 1316442092]
['CHEMBL405398', 1349404210]
['CHEMBL405398', 1528304983]
['CHEMBL405398', 1530144349]
['CHEMBL405398', 1601910673]
['CHEMBL405398', 1608392297]
['CHEMBL405398', 1634606847]
['CHEMBL405398', 1717044408]
['CHEMBL405398', 1781750782]
['CHEMBL405398', 1868602760]
['CHEMBL405398', 1907845850]
['CHEMBL405398', 1971022618]
['CHEMBL405398', 2041434490]
['CHEMBL405398', 2048117778]
['CHEMBL405398', 2092489639]
['CHEMBL405398', 2093282138

In [27]:
fp2 =[(x,(np.array(list(fp[x].keys())), np.array(list(fp[x].values())))) for x in fp] #TODO: Modify to save compound list

In [33]:
print(type(fp2), type(fp2[0]), type(fp2[0][0]), type(fp2[0][1]))

fp2[0]

<class 'list'> <class 'tuple'> <class 'str'> <class 'tuple'>


('CHEMBL405398',
 (array([  98513984,  136810838,  591190074,  600629739,  619920801,
          725322217,  779152244,  787069595,  847961216,  895250610,
          951226070,  971553482, 1003790885, 1024714809, 1100037548,
         1222631225, 1255595680, 1257718710, 1271427701, 1316442092,
         1349404210, 1528304983, 1530144349, 1601910673, 1608392297,
         1634606847, 1717044408, 1781750782, 1868602760, 1907845850,
         1971022618, 2041434490, 2048117778, 2092489639, 2093282138,
         2144345290, 2245384272, 2257970297, 2267998563, 2315593046,
         2589782852, 2676693293, 2685954589, 2747921197, 2803848648,
         2900402147, 2960810393, 2967998925, 2968968094, 2991110306,
         3063504103, 3095754548, 3118255683, 3182824521, 3189457552,
         3203709451, 3217380708, 3218693969, 3311725064, 3596485316,
         3598197347, 3612926680, 3632350815, 3692055567, 3706955836,
         3709437002, 3750444818, 3776905034, 3777168895, 3888780669,
         39770479

In [34]:
cmpd, ecfp   = zip(*fp2)

In [36]:
print(type(cmpd), type(ecfp))
print(type(cmpd[0]), type(ecfp[0]))
print(cmpd[:10])
print(ecfp[0])
print(ecfp[0][1])

<class 'tuple'> <class 'tuple'>
<class 'str'> <class 'tuple'>
('CHEMBL405398', 'CHEMBL403325', 'CHEMBL501943', 'CHEMBL501094', 'CHEMBL505943', 'CHEMBL438018', 'CHEMBL444522', 'CHEMBL263810', 'CHEMBL266960', 'CHEMBL438997')
(array([  98513984,  136810838,  591190074,  600629739,  619920801,
        725322217,  779152244,  787069595,  847961216,  895250610,
        951226070,  971553482, 1003790885, 1024714809, 1100037548,
       1222631225, 1255595680, 1257718710, 1271427701, 1316442092,
       1349404210, 1528304983, 1530144349, 1601910673, 1608392297,
       1634606847, 1717044408, 1781750782, 1868602760, 1907845850,
       1971022618, 2041434490, 2048117778, 2092489639, 2093282138,
       2144345290, 2245384272, 2257970297, 2267998563, 2315593046,
       2589782852, 2676693293, 2685954589, 2747921197, 2803848648,
       2900402147, 2960810393, 2967998925, 2968968094, 2991110306,
       3063504103, 3095754548, 3118255683, 3182824521, 3189457552,
       3203709451, 3217380708, 32186939

In [74]:
feat, counts = zip(*ecfp)

In [75]:
print(type(feat), type(counts))
print(type(feat[0]), type(counts[0]))

<class 'tuple'> <class 'tuple'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [109]:
print(len(feat[0]), feat[0])

79 [  98513984  136810838  591190074  600629739  619920801  725322217
  779152244  787069595  847961216  895250610  951226070  971553482
 1003790885 1024714809 1100037548 1222631225 1255595680 1257718710
 1271427701 1316442092 1349404210 1528304983 1530144349 1601910673
 1608392297 1634606847 1717044408 1781750782 1868602760 1907845850
 1971022618 2041434490 2048117778 2092489639 2093282138 2144345290
 2245384272 2257970297 2267998563 2315593046 2589782852 2676693293
 2685954589 2747921197 2803848648 2900402147 2960810393 2967998925
 2968968094 2991110306 3063504103 3095754548 3118255683 3182824521
 3189457552 3203709451 3217380708 3218693969 3311725064 3596485316
 3598197347 3612926680 3632350815 3692055567 3706955836 3709437002
 3750444818 3776905034 3777168895 3888780669 3977047921 3983062349
 4004723865 4041573576 4042373501 4050976520 4055698890 4113365465
 4239037249]


In [77]:
print(counts[0])

[1 1 1 1 2 1 1 1 2 1 3 1 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 2
 1 1 1 1 1 1 1 2 1 1 1 4 1 1 1 1 1 1 1 6 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1]


In [105]:
lens    = np.array([len(f) for f in feat])

In [107]:
print(type(lens), len(lens))
print(lens[:10])

<class 'numpy.ndarray'> 423737
[ 79  83  87 186  49 167 127  78  81  97]


In [112]:
indptr  = np.concatenate([[0], np.cumsum(lens)])

print(type(indptr), len(indptr))
print(indptr[0:10])

<class 'numpy.ndarray'> 423738
[  0  79 162 249 435 484 651 778 856 937]


In [114]:
print(type(feat), len(feat), type(feat[0]), len(feat[0]))

<class 'tuple'> 423737 <class 'numpy.ndarray'> 79


In [119]:
tmp = np.concatenate(feat)
print(type(tmp), len(tmp)) 
print(tmp[:10])
ecfp_fold = 32000
print(tmp[:10]% ecfp_fold)
del tmp

<class 'numpy.ndarray'> 32883981
[ 98513984 136810838 591190074 600629739 619920801 725322217 779152244
 787069595 847961216 895250610]
[17984 10838 22074 21739 16801 10217 16244 29595 25216 18610]


In [121]:
indices = np.concatenate(feat) % ecfp_fold
print(indices.shape)

(32883981,)


In [123]:
data    = np.ones(indices.shape[0])

print(type(data), len(data), type(data[0]))

<class 'numpy.ndarray'> 32883981 <class 'numpy.float64'>


In [129]:
## the column indices for row i are stored in indices[indptr[i]:indptr[i+1]] 
## their corresponding values are stored in data[indptr[i]:indptr[i+1]]

In [142]:
print('data     : ', data[:10])
print('indices  : ', indices[:10])
print('indptr   : ', indptr[:10])

data     :  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.

In [132]:
csr     = csr_matrix((data, indices, indptr), shape=(len(feat), ecfp_fold))

In [133]:
csr.sum_duplicates()

In [152]:
tmp = (csr > 3)
tmp

<423737x32000 sparse matrix of type '<class 'numpy.bool_'>'
	with 0 stored elements in Compressed Sparse Row format>

In [153]:
del tmp

In [154]:
csr.data[:] = 1.0