# Test using sample data

In this notebook, we fetch sample data from ChEMBL and then, given an unseen molecule, the program suggests putative protein targets for it.

In [1]:
# `sc` is not created anywhere in this notebook.
# NOTE(review): presumably the SparkContext injected by the PySpark kernel/shell -- confirm.
sc.setLogLevel("INFO")
# Register the local helper module with the Spark context.
# NOTE(review): chemblhelper, pythonhelper and elements are imported below but not
# registered here -- confirm none of them is needed inside Spark tasks.
sc.addPyFile("moleculehelper.py") # 300 - Ligand framework

In [13]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys

from moleculehelper import *
from chemblhelper import *
from pythonhelper import *
from elements import *

import os.path

## Get data from ChEMBL

### Bindings

In [3]:
# get samples from ChEMBL
chemblhelper = ChEMBLHelper()

# Single source of truth for the sample size: it is both passed to
# chemblhelper.getBindings() and embedded in the cache filename, so the
# two can no longer drift apart when the size is changed.
# NOTE(review): presumably the number of binding rows to fetch -- confirm
# against ChEMBLHelper.getBindings.
BINDINGS_SAMPLE_SIZE = 100000

# a global dictionary holding our bindings bank
bindings = Bindings()
bindingsPklFilename = "bindings%d.pkl" % BINDINGS_SAMPLE_SIZE  # -> "bindings100000.pkl"

# Reuse the local cache file when it exists; otherwise query ChEMBL once
# and write the cache so later runs skip the download.
if os.path.isfile(bindingsPklFilename):
    # NOTE(review): the ".pkl" suffix suggests a pickle file -- only load
    # cache files that this notebook wrote itself.
    bindings.loadBindings(bindingsPklFilename)
else:
    bindings.addBindings(chemblhelper.getBindings(BINDINGS_SAMPLE_SIZE))
    bindings.saveBindings(bindingsPklFilename)

In [21]:
# Preview the loaded bindings as a table.
# NOTE(review): the argument is presumably the maximum number of rows to print -- confirm.
bindings.printBindings(10)

+------+--------+--------+------------+---------+---------+--------+-------------+------------+---------+--------------------+--------------------+
|row_id|assay_id|molregno|std_relation|std_value|std_units|std_type|pchembl_value|component_id|accession|            sequence|    canonical_smiles|
+------+--------+--------+------------+---------+---------+--------+-------------+------------+---------+--------------------+--------------------+
|     1| 1459233|  222065|           =|        1|       nM|      Ki|            9|           1|   O09028|MSYSLYLAFVCLNLLAQ...|Cc1ccc2OC(=CC(=O)...|
|     2| 1459233|   86147|           =|        2|       nM|      Ki|            9|           1|   O09028|MSYSLYLAFVCLNLLAQ...|[O-][N+](=O)c1ccc...|
|     3| 1459233|   86094|           =|        2|       nM|      Ki|            9|           1|   O09028|MSYSLYLAFVCLNLLAQ...|[O-][N+](=O)c1ccc...|
|     4|  142605|  181426|           =|    52000|       nM|    EC50|            4|           2|   P02708|MEPWPLL

### Molecules

In [None]:
# get molecules from databank
molecules = Molecules()

# Only the binding records are used; the dictionary keys were never read.
# .values() exists on both Python 2 and Python 3 dicts, whereas the previous
# .iteritems() exists on Python 2 only.
# NOTE(review): assumes getBindings() returns a plain dict -- confirm.
for binding in bindings.getBindings().values():
    # there is no need to cater for distinct as this is done implicitly when adding data to the molecules dictionary
    molecules.addMolecule(binding[BindingAttribute.MOLREGNO], binding[BindingAttribute.CANONICAL_SMILES])

In [None]:
# Preview the collected molecules.
# NOTE(review): the argument is presumably the maximum number of entries to print -- confirm.
molecules.printMolecules(10)

### Proteins

In [None]:
# get proteins from databank
proteins = Proteins()

# Only the binding records are used; the dictionary keys were never read.
# .values() exists on both Python 2 and Python 3 dicts, whereas the previous
# .iteritems() exists on Python 2 only.
# NOTE(review): assumes getBindings() returns a plain dict -- confirm.
for binding in bindings.getBindings().values():
    # there is no need to cater for distinct as this is done implicitly when adding data to the proteins dictionary
    proteins.addProtein(binding[BindingAttribute.COMPONENT_ID], binding[BindingAttribute.SEQUENCE])

In [None]:
# Preview the collected proteins.
# NOTE(review): the argument is presumably the maximum number of entries to print -- confirm.
proteins.printProteins(5)

## Run PySpark Jobs

In the next section, we will run a number of Spark jobs to compute molecule similarities.

In [None]:
# this must run on the main thread
def findSimilarMolecules(querySmiles, knownMolecules, molHelper = MoleculeHelper, similarityThreshold = 0.85):
    """ Returns an RDD with the known molecules that are similar to the query molecule.

    Arguments:
        querySmiles         -- description of the query molecule; handed unchanged
                               to molHelper (the caller in this notebook passes a
                               SMILES string).
        knownMolecules      -- mapping of key -> molecule description; every value
                               is handed unchanged to molHelper.
        molHelper           -- callable that wraps one molecule description; the
                               wrapper must provide similarity(other).
        similarityThreshold -- minimum similarity (inclusive) for a molecule to
                               be kept.

    Returns:
        RDD of (knownKey, queryKey, similarity) tuples. queryKey is always 0 and
        similarity is a float. A known molecule whose own key equals 0 is
        excluded, exactly as before.

    Uses the global SparkContext `sc`.
    """
    # The query molecule always gets key 0; this value is part of the output tuples.
    queryKey = 0

    # step 1 - create a molecule helper class for each molecule, this will take
    #          more memory but will increase computation efficiency
    queryRDD = sc.parallelize([(queryKey, querySmiles)]) \
                 .map(lambda kv: (kv[0], molHelper(kv[1])))

    # Distribute (key, description) pairs as RDD data. Previously only the keys
    # were distributed and the whole knownMolecules mapping was captured in the
    # task closure just to look the descriptions up again.
    knownPairs = [(key, knownMolecules[key]) for key in knownMolecules]
    mols = sc.parallelize(knownPairs) \
             .map(lambda kv: (kv[0], molHelper(kv[1])))

    # step 2 - score every known molecule against the query.
    # The query is no longer unioned into `mols`: that pair had identical keys
    # on both sides and was always dropped by the key filter below.
    # Lambdas index into their single argument because tuple parameters
    # (lambda (a, b): ...) are a syntax error on Python 3.
    sm = mols.cartesian(queryRDD) \
             .map(lambda pair: (pair[0][0], pair[1][0], float(pair[0][1].similarity(pair[1][1])))) \
             .filter(lambda scored: scored[2] >= similarityThreshold and scored[0] != scored[1])

    return sm

In [None]:
# Score every known molecule against one query molecule, using a looser
# threshold (0.5) than the function's 0.85 default.
# The result is cached because the following cells run two separate actions
# on it (collect() and count()); without caching, each action would rebuild
# every molecule helper and recompute every similarity from scratch.
sm = findSimilarMolecules("CC(C)(C)c1ccc(cc1)S(=O)(=O)N2CCC(CC2)c3ccncc3", molecules.getMolecules(), similarityThreshold=0.5).cache()

In [None]:
# Run the Spark job and bring every matching tuple back to the driver;
# as the last expression in the cell, the resulting list is displayed.
sm.collect()

In [None]:
# Number of tuples in the similarity result.
sm.count()

In [None]:
def getBindings(similarMoleculesRDD):
    