# Ligands Framework
## Driver Program

In [1]:
# NOTE(review): `sc` is not created in this notebook; presumably it is the
# SparkContext predefined by the PySpark kernel — confirm.
# Set Spark's log level to INFO for this session.
sc.setLogLevel("INFO")
# Register molecules.py with the SparkContext so tasks can import it.
sc.addPyFile("molecules.py") # 300 - Ligand framework

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys

from molecules import *
from chemblhelper import *
from pythonhelper import *

In [3]:
# Module-level molecule bank shared by the helper functions: molecule ID -> SMILES.
__molecules = {}

In [4]:
def reset():
    """Empty the shared molecule bank in place.

    Clears the module-level ``__molecules`` dictionary, dropping every
    molecule registered so far. Returns None.
    """
    __molecules.clear()
    
def addMolecule(molId, smiles):
    """Register a single molecule in the shared bank.

    Stores ``smiles`` under the key ``molId`` in the module-level
    ``__molecules`` dictionary; an existing entry with the same ID is
    overwritten. Returns None.
    """
    __molecules[molId] = smiles
    
def addMolecules(molecules):
    """Merge several molecules into the shared bank at once.

    ``molecules`` is handed straight to ``dict.update`` on the module-level
    ``__molecules`` dictionary, so entries whose ID already exists are
    overwritten. Returns None.

    NOTE(review): presumably a mapping of molecule ID -> SMILES (the driver
    cell passes the result of ChEMBLHelper.getMolecules) — confirm.
    """
    __molecules.update(molecules)
        
def printMolecules(n=-1):
    """Pretty print the molecule bank as an ``ID | SMILES`` table.

    n -- maximum number of molecules to print. When the n-th row has been
         printed, a "(Showing first ...)" footer is emitted and printing
         stops. With n <= 0 (the default is -1) the countdown never reaches
         zero, so the whole bank is printed and no footer is shown.

    Returns None; output goes to stdout.
    """
    print("ID\t|\tSMILES")
    print("-------------------------------------------------------------------")
    # Built up front because n is consumed as a countdown in the loop below.
    message = "(Showing first %d molecules from %d)" % (n, len(__molecules))
    # Iterate over the keys rather than dict.iteritems(): iteritems() exists
    # only on Python 2, while key iteration works on both 2 and 3 and does not
    # copy the (potentially very large) bank.
    for molId in __molecules:
        print("%s\t|\t%s" % (molId, __molecules[molId]))
        n = n - 1
        if n == 0:
            print(message)
            break

def createSimilarityMatrix(molHelper, halfmatrix = False):
    """ Returns an RDD representing the similarity matrix for all
        molecules in the bank.

        molHelper  -- callable (e.g. a helper class) invoked as
                      molHelper(smiles) for every molecule; the objects it
                      returns must offer the similarity(other) method used
                      by createSimilarityMatrixFromRDD.
        halfmatrix -- forwarded unchanged to createSimilarityMatrixFromRDD.

        The result is whatever createSimilarityMatrixFromRDD returns for
        the (id, helper) pairs built here.
    """
    # Distribute (id, SMILES) pairs as RDD data. Parallelizing the dict
    # itself distributes only its keys and forces the map lambda to look each
    # SMILES up in the driver-side global, which drags the whole dictionary
    # into the serialized closure.
    mols = sc.parallelize(list(__molecules.items())) \
             .map(lambda entry: (entry[0], molHelper(entry[1])))

    return createSimilarityMatrixFromRDD(mols, halfmatrix)

def createSimilarityMatrixFromRDD(molRDD, halfmatrix = False):
    """ Returns an RDD representing the similarity matrix of the given
        molecules.

        molRDD     -- RDD of (key, mol) pairs, where every mol provides a
                      similarity(other) method whose result can be passed
                      to float().
        halfmatrix -- when true, the RDD is sorted by key and only pairs
                      with k1 < k2 are kept (each unordered pair once, no
                      self-pairs); otherwise every ordered pair, including
                      self-pairs, is scored.

        Each element of the result is a tuple (k1, k2, similarity) with
        similarity converted to float.
    """
    # Elements of the cartesian product look like ((k1, v1), (k2, v2)).
    # They are indexed explicitly because tuple-unpacking lambda parameters
    # were removed in Python 3 (PEP 3113); indexing works on Python 2 too.
    if halfmatrix:
        # The k1 < k2 filter is evaluated per pair and does not itself need
        # sorted input; the sort is retained from the original implementation
        # so the ordering of the result is unchanged.
        molRDD = molRDD.sortByKey()
        pairs = molRDD.cartesian(molRDD) \
            .filter(lambda pair: pair[0][0] < pair[1][0])
    else:
        pairs = molRDD.cartesian(molRDD)

    sm = pairs.map(lambda pair: (pair[0][0], pair[1][0],
                                 float(pair[0][1].similarity(pair[1][1]))))

    return sm

def createSimilarityNetwork(querySmiles, molHelper = MoleculeHelper, halfmatrix = False, similarityThreshold = 0.85, radius = 1):
    """ Returns an RDD of (k1, k2, similarity) tuples linking molecules
        whose similarity is at least similarityThreshold.
        (The result is a plain RDD of edges, not a GraphFrame.)

        querySmiles         -- SMILES of the query molecule; it is
                               registered under the key 0.
        molHelper           -- callable invoked as molHelper(smiles); the
                               objects it returns must offer
                               similarity(other).
        halfmatrix          -- only read when radius != 1, where it is
                               forwarded to createSimilarityMatrixFromRDD.
        similarityThreshold -- inclusive lower bound on the similarity of
                               the edges that are kept.
        radius              -- 1: score every molecule against the query
                               only. Any other value: threshold the
                               all-pairs matrix (placeholder, see below).

        NOTE(review): because the query uses key 0 and edges with k1 == k2
        are dropped, a bank molecule whose ID is 0 would never be reported
        when radius == 1 — confirm IDs never take that value.
    """

    # step 1 - create a molecule helper object for each molecule, this will
    #          take more memory but will increase computation efficiency.
    # (id, SMILES) pairs are distributed as RDD data so the lambdas do not
    # close over the driver-side dictionaries. Pairs are indexed explicitly
    # because tuple-unpacking lambda parameters were removed in Python 3
    # (PEP 3113).
    queryRDD = sc.parallelize([(0, querySmiles)]) \
                 .map(lambda entry: (entry[0], molHelper(entry[1])))
    mols = sc.parallelize(list(__molecules.items())) \
             .map(lambda entry: (entry[0], molHelper(entry[1]))) \
             .union(queryRDD)

    if radius == 1:
        # Every molecule (plus the query itself) against the query; the
        # k1 != k2 test removes the query-vs-query self edge.
        sm = mols.cartesian(queryRDD) \
                 .map(lambda pair: (pair[0][0], pair[1][0],
                                    float(pair[0][1].similarity(pair[1][1])))) \
                 .filter(lambda edge: edge[2] >= similarityThreshold and edge[0] != edge[1])
    else:
        # need to be implemented correctly: for now radius is not honoured
        # and the thresholded all-pairs matrix is returned.
        sm = createSimilarityMatrixFromRDD(mols, halfmatrix) \
                 .filter(lambda edge: edge[2] >= similarityThreshold)

    return sm
        

In [9]:
%%time
# test MoleculeSimilarity
# Rebuild the molecule bank: clear it, fetch molecules through ChEMBLHelper,
# then bulk-load them with addMolecules.
reset()
chembl = ChEMBLHelper()
# NOTE(review): the meaning of the -50 argument is defined in chemblhelper,
# which is not visible here — confirm.
mols = chembl.getMolecules(-50)

print len(mols)

# Earlier one-at-a-time load, superseded by the bulk addMolecules call below.
#for k, v in mols.iteritems():
#    addMolecule(k, v)    

addMolecules(mols)

PythonHelper.writeToJupyterConsole("Completed!!")
    

991054
CPU times: user 12.3 s, sys: 1.68 s, total: 14 s
Wall time: 1min 51s


In [6]:
# Show the first 10 entries of the molecule bank.
printMolecules(10)

ID	|	SMILES
-------------------------------------------------------------------
23	|	Br\C=C\1/CCC(C(=O)O1)c2cccc3ccccc23
72	|	COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)S(=O)(=O)c4ccc(C)cc4
97	|	COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4
111	|	[Na+].Cc1cc(CC(=O)[O-])n(C)c1C(=O)c2ccc(Cl)cc2
115	|	CN1CCC[C@H]1c2cccnc2
116	|	C1CN[C@@H](C1)c2cccnc2
130	|	C[C@]12CCC3C(CC=C4C[C@@H](O)[C@H]5COC[C@@]34C5)C1CC[C@@H]2OC6CC6
141	|	Cn1c(CC(=O)O)cc(CO)c1C(=O)c2ccc(Cl)cc2
145	|	C[C@]12CCC3C(CC=C4C(C)(C)[C@@H](O)CC[C@]34C)C1CC[C@@H]2OC5CC5
146	|	CC1COc2c(N3CCN(C)CC3)c(F)cc4C(=O)C(=CN1c24)C(=O)O
(Showing first 10 molecules from 991054)


In [46]:
# Half-matrix run (second argument True): only pairs with k1 < k2 are scored.
s = createSimilarityMatrix(MoleculeHelper, True)
# collect() brings every (id1, id2, similarity) triple back to the driver.
s.collect()

[(556, 14440, 0.09523809523809523),
 (556, 21213, 0.017857142857142856),
 (556, 21818, 0.01818181818181818),
 (556, 21820, 0.01818181818181818),
 (556, 21821, 0.017857142857142856),
 (556, 21823, 0.017543859649122806),
 (556, 47675, 0.07142857142857142),
 (556, 68845, 0.125),
 (556, 69448, 0.10526315789473684),
 (556, 69711, 0.10869565217391304),
 (556, 69712, 0.10869565217391304),
 (556, 69954, 0.12195121951219512),
 (556, 69986, 0.11363636363636363),
 (556, 69987, 0.11363636363636363),
 (556, 73598, 0.07407407407407407),
 (556, 82982, 0.07142857142857142),
 (556, 82983, 0.07142857142857142),
 (556, 83026, 0.0784313725490196),
 (556, 83027, 0.0625),
 (556, 157810, 0.1076923076923077),
 (14440, 21213, 0.1506849315068493),
 (14440, 21818, 0.12162162162162163),
 (14440, 21820, 0.12162162162162163),
 (14440, 21821, 0.1506849315068493),
 (14440, 21823, 0.1643835616438356),
 (21213, 21818, 0.775),
 (21213, 21820, 0.7317073170731707),
 (21213, 21821, 0.7142857142857143),
 (21213, 21823, 0.73

In [47]:
# Number of pairs in the half matrix. The recorded 1225 equals 50 * 49 / 2,
# which would correspond to a bank of 50 molecules for this run.
s.count()

1225

In [10]:
%%time
# Query the bank for neighbours of one molecule, keeping similarity >= 0.2.
# halfmatrix=True has no effect on this call: radius keeps its default of 1,
# and that branch of createSimilarityNetwork never reads halfmatrix.
sn = createSimilarityNetwork("OC[C@H]1O[C@H](C[C@@H]1O[N+](=O)[O-])N2C=C(F)C(=O)NC2=O", similarityThreshold=0.2, halfmatrix=True)

CPU times: user 4.48 s, sys: 124 ms, total: 4.6 s
Wall time: 5.53 s


In [11]:
%%time
# Count the edges returned for the query.
# NOTE(review): the recorded timings (about 5.5 s to build sn, about 5 min
# here) suggest the similarity computation actually runs at this action,
# consistent with Spark evaluating transformations lazily.
sn.count()

CPU times: user 8 ms, sys: 36 ms, total: 44 ms
Wall time: 5min 6s


4605

In [92]:
# Sanity check of RDD.cartesian: pairing a 10-element RDD with a
# single-element RDD yields one (element, 10) pair per left element — the
# same shape createSimilarityNetwork relies on when it pairs every molecule
# with the single query.
d = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
q = [10]
r = sc.parallelize(d)
qr = sc.parallelize(q)
c = r.cartesian(qr)
c.collect()

[(0, 10),
 (1, 10),
 (2, 10),
 (3, 10),
 (4, 10),
 (5, 10),
 (6, 10),
 (7, 10),
 (8, 10),
 (9, 10)]