# Molecules

This python script reads molecules from ChEMBL and store then as a Spark DataFrame

In [1]:
sc.setLogLevel("INFO")

In [2]:
# mysql related variables

import mysql.connector
from mysql.connector import errorcode

config = {
  'user': 'joseph',
  'password': 'password',
  'host': '192.168.151.11',
  'database': 'chembl_22',
  'raise_on_warnings': True,
}

In [4]:
try:
  cnx = mysql.connector.connect(**config)
  
  cursor = cnx.cursor()

  query = ("SELECT DISTINCT cs.molregno, cs.canonical_smiles " +
           "FROM compound_structures cs, activities act, " +
           "     assays asy " +
           "WHERE cs.molregno = act.molregno AND" +
           "     act.assay_id = asy.assay_id AND" +
           "     asy.assay_tax_id = 9606 LIMIT 100;")

  cursor.execute(query)

  molecules = None
  tempMolecules = []
  i = 0
  for (molregno, canonical_smiles) in cursor:
    i = i + 1
    tempMolecules.append((molregno, canonical_smiles))
    if i % 10 == 0:
        print i
        if molecules is None:
            molecules = spark.createDataFrame(tempMolecules)
        else:
            _molecules = spark.createDataFrame(tempMolecules)
            molecules = molecules.union(_molecules)
        tempMolecules = []
    
  # store the rest  
  if len(tempMolecules) > 0:
    _molecules = spark.createDataFrame(tempMolecules)
    molecules = molecules.union(_molecules)

except mysql.connector.Error as err:
  if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
    print("Something is wrong with your user name or password")
  elif err.errno == errorcode.ER_BAD_DB_ERROR:
    print("Database does not exist")
  else:
    print(err)
else:
  cnx.close()

10
20
30
40
50
60
70
80
90
100


In [5]:
molecules.count()

100

In [7]:
molecules.coalesce(1).write.csv('hdfs://hadoop1:9000/molecules.csv')

In [8]:
df = spark.read.load("hdfs://hadoop1:9000/molecules.csv", format="csv")

In [9]:
df.count()
#df.show()

100

In [10]:
def fingerprint(a):
  from rdkit import Chem
  from rdkit.Chem import AllChem
  try:
    m = Chem.MolFromSmiles(a)
    if m != None:        
        f = AllChem.GetMorganFingerprintAsBitVect(m, 2)
        return f
  except ArgumentException as ex:
    print("**** SPARK - fingerprint: %s" % a)
    print(ex)
    raise
  return None

In [11]:
def sim(a, b):
  # can be None from fingerprint function
  if a == None or b == None:
    return 0

  from rdkit.Chem import AllChem
  from rdkit import DataStructs
  try:
    similarity = DataStructs.FingerprintSimilarity(a, b)
    if similarity > 0: # 0.8
        return similarity   
    else:
        return 0
  except Exception as ex:
    print(ex)
    raise
  return 0

In [12]:
fpTemp = df.rdd.map(lambda (k, v):(k, fingerprint(v)))

In [13]:
fpTemp.collect()

[(u'556', <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d9d998>),
 (u'82983',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d78fc8>),
 (u'1633207',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d78ba8>),
 (u'1633675',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d78d60>),
 (u'47675',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1e394c8>),
 (u'1633268',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1e39310>),
 (u'1633652',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d60940>),
 (u'1633750',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d60838>),
 (u'1633752',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d607e0>),
 (u'1633753',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d60788>),
 (u'1633749',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d60730>),
 (u'83026',
  <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fe7d1d6

In [14]:
fpTemp.filter(lambda (k, v): v == None).count()

0

In [15]:
# clean data
fp = fpTemp.filter(lambda (k, v): v != None)

In [38]:
# create triangle similarity matrix
sm = fp.cartesian(fp).filter(lambda ((k1,v1),(k2,v2)): k1 < k2).map(lambda ((k1,v1),(k2,v2)): (int(k1),int(k2),float(sim(v1,v2))))

In [39]:
sm.collect()

[(556, 82983, 0.07142857142857142),
 (556, 83026, 0.0784313725490196),
 (556, 83027, 0.0625),
 (556, 82982, 0.07142857142857142),
 (556, 69954, 0.12195121951219512),
 (556, 699563, 0.11428571428571428),
 (556, 68845, 0.125),
 (556, 69711, 0.10869565217391304),
 (556, 69448, 0.10526315789473684),
 (556, 69987, 0.11363636363636363),
 (556, 69986, 0.11363636363636363),
 (556, 69712, 0.10869565217391304),
 (556, 73598, 0.07407407407407407),
 (556, 702, 0.046153846153846156),
 (556, 696, 0.017543859649122806),
 (556, 59125, 0.14084507042253522),
 (556, 9837, 0.0),
 (556, 68392, 0.018518518518518517),
 (1633207, 556, 0.1724137931034483),
 (1633207, 82983, 0.5087719298245614),
 (1633207, 1633675, 0.49206349206349204),
 (1633207, 47675, 0.6862745098039216),
 (1633207, 1633268, 0.4117647058823529),
 (1633207, 1633652, 0.6491228070175439),
 (82983, 83026, 0.057971014492753624),
 (82983, 83027, 0.06153846153846154),
 (1633207, 1633750, 0.26436781609195403),
 (1633207, 1633752, 0.4125),
 (1633207,

In [40]:
# create vertices
v = fp.map(lambda x:(int(x[0]), str(x[0])))

In [41]:
v.collect()

[(556, '556'),
 (82983, '82983'),
 (1633207, '1633207'),
 (1633675, '1633675'),
 (47675, '47675'),
 (1633268, '1633268'),
 (1633652, '1633652'),
 (1633750, '1633750'),
 (1633752, '1633752'),
 (1633753, '1633753'),
 (1633749, '1633749'),
 (83026, '83026'),
 (83027, '83027'),
 (1633754, '1633754'),
 (1633746, '1633746'),
 (1633751, '1633751'),
 (1633748, '1633748'),
 (1633747, '1633747'),
 (82982, '82982'),
 (69954, '69954'),
 (699563, '699563'),
 (68845, '68845'),
 (69711, '69711'),
 (69448, '69448'),
 (69987, '69987'),
 (69986, '69986'),
 (69712, '69712'),
 (1633264, '1633264'),
 (1633676, '1633676'),
 (1633206, '1633206'),
 (233447, '233447'),
 (421029, '421029'),
 (1627276, '1627276'),
 (1050523, '1050523'),
 (232251, '232251'),
 (1050942, '1050942'),
 (498712, '498712'),
 (232369, '232369'),
 (498776, '498776'),
 (231618, '231618'),
 (498774, '498774'),
 (73598, '73598'),
 (157810, '157810'),
 (14440, '14440'),
 (157844, '157844'),
 (21823, '21823'),
 (21821, '21821'),
 (21213, '212

In [42]:
from pyspark.sql.types import *
from graphframes import *

In [43]:
# Create the schema
simSchema = StructType([StructField("src", IntegerType(), False),
    StructField("dst", IntegerType(), False),
    StructField("similarity", FloatType(), False)])
    
vertexSchema = StructType([StructField("id", IntegerType(), False),
    StructField("name", StringType(), False)])

In [44]:
# Create a data frame
simDF = sqlContext.createDataFrame(sm, simSchema)
vDF = sqlContext.createDataFrame(v, vertexSchema)

g = GraphFrame(vDF, simDF)

In [45]:
g.degrees.show()

+-------+------+
|     id|degree|
+-------+------+
| 232369|    99|
|  68392|    99|
| 421029|    99|
| 185004|    99|
|    322|    99|
| 184370|    99|
|1627276|    99|
| 269761|    99|
|1633751|    99|
| 184674|    99|
| 184137|    99|
|1633749|    99|
| 184341|    99|
| 159754|    99|
|  83026|    99|
|1633747|    99|
|1633754|    99|
| 444925|    99|
|    556|    99|
|1633268|    99|
+-------+------+
only showing top 20 rows

