In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger
import pandas as pd

# Raise RDKit's log threshold to CRITICAL so lower-severity messages are not emitted
# (presumably to keep parser warnings out of the cell outputs below).
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [2]:
# Read the DrugBank structures from the SDF file. Entries for which the supplier
# yields a falsy value (typically None for records RDKit could not parse) are dropped.
suppl = Chem.SDMolSupplier('../data/drugbank.sdf')
drugbank = [mol for mol in suppl if mol]
# Load the semicolon-separated ChEMBL mTOR IC50 export into a DataFrame.
mtor = pd.read_csv(
    '../data/chembl_mtor_ic50.csv',
    # on_bad_lines='skip',  # left disabled; presumably a fallback for malformed rows
    sep=';'
    )

In [3]:
# A tiny hand-made key set of SMARTS substructure queries, in order:
# benzene ring, four-carbon aliphatic chain, O=C-O (acid/ester-like), trifluoromethyl.
our_keys = tuple(
    Chem.MolFromSmarts(smarts)
    for smarts in ('c1ccccc1', 'CCCC', 'O=CO', 'C(F)(F)F')
)

In [4]:
# For each of the first 10 DrugBank molecules, build one boolean per query in our_keys:
# True when the molecule contains that substructure.
[tuple(mol.HasSubstructMatch(p) for p in our_keys) for mol in drugbank[:10]]

[(True, True, True, False),
 (True, True, False, False),
 (True, True, False, False),
 (True, True, False, False),
 (False, True, False, False),
 (True, True, False, False),
 (True, True, False, False),
 (False, False, False, False),
 (True, True, False, False),
 (True, True, True, False)]

In [5]:
from rdkit.Chem import MACCSkeys

In [6]:
# MACCS key fingerprints for the first 10 DrugBank molecules only (a small sample to inspect).
maccs_db = [MACCSkeys.GenMACCSKeys(mol) for mol in drugbank[:10]]

In [7]:
maccs_db[0].GetNumBits()  # MACCS defines 166 keys; the fingerprint has 167 bits because bit 0 is an unused placeholder, so bit i lines up with key i (1-based)

167

In [8]:
# First fingerprint rendered as a string of '0'/'1' characters, one per bit.
maccs_db[0].ToBitString()

'00000000000000000000000001000000000000000001000000000110000000000000000000110111101111000011100111001000101000110111011111110101110110001111111101111111111111111111110'

In [9]:
# Indices of the bits that are set in the first fingerprint.
tuple(maccs_db[0].GetOnBits())

(25,
 43,
 53,
 54,
 74,
 75,
 77,
 78,
 79,
 80,
 82,
 83,
 84,
 85,
 90,
 91,
 92,
 95,
 96,
 97,
 100,
 104,
 106,
 110,
 111,
 113,
 114,
 115,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 125,
 127,
 128,
 129,
 131,
 132,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165)

In [10]:
# First fingerprint serialised as a bytes object.
maccs_db[0].ToBinary()

b'\xe0\xff\xff\xff\xa7\x00\x00\x00F\x00\x00\x002"\x12\x00&\x00\x02\x00\x00\x00\x02\x00\x00\x00\x08\x00\x00\x04\x00\x00\x04\x06\x02\x06\x00\x02\x00\x00\x02\x00\x00\x00\x00\x00\x00\x02\x02\x00\x00\x02\x00\x06\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02'

In [11]:
# First fingerprint serialised as a Base64 text string.
maccs_db[0].ToBase64()

'4P///6cAAABGAAAAMiISACYAAgAAAAIAAAAIAAAEAAAEBgIGAAIAAAIAAAAAAAACAgAAAgAGAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAI='

In [12]:
# Fingerprint every parsed DrugBank molecule (replaces the 10-molecule sample built earlier).
maccs_db = list(map(MACCSkeys.GenMACCSKeys, drugbank))
# One counter per fingerprint bit (167 positions), all starting at zero.
inter = [0] * 167

In [13]:
# Count, for every bit position, how many DrugBank fingerprints have that bit set.
# The counters are reset here (not only in the previous cell) so that re-running
# this cell on its own does not add a second pass on top of the existing counts.
inter = [0] * 167
for m in maccs_db:
    for i in m.GetOnBits():
        inter[i] += 1

In [14]:
inter  # per-bit count of fingerprints with that bit set, over all parsed DrugBank molecules (maccs_db was rebuilt from the full list, not the first 10)

[0,
 0,
 0,
 30,
 0,
 1,
 12,
 25,
 99,
 48,
 26,
 117,
 30,
 62,
 33,
 11,
 52,
 77,
 68,
 282,
 7,
 21,
 181,
 212,
 437,
 565,
 227,
 96,
 149,
 727,
 127,
 21,
 431,
 481,
 155,
 44,
 626,
 522,
 1314,
 153,
 167,
 166,
 698,
 1199,
 273,
 249,
 194,
 456,
 873,
 717,
 649,
 601,
 553,
 1898,
 2249,
 663,
 178,
 1636,
 669,
 604,
 685,
 695,
 1756,
 207,
 625,
 2512,
 1118,
 783,
 133,
 1552,
 295,
 528,
 2199,
 755,
 1187,
 2096,
 801,
 2251,
 707,
 2267,
 2303,
 1230,
 2082,
 2787,
 2111,
 2500,
 1686,
 1310,
 1598,
 2481,
 3536,
 3346,
 2829,
 1655,
 1487,
 3195,
 3104,
 2790,
 3061,
 1149,
 2848,
 2779,
 1938,
 786,
 3213,
 2945,
 2672,
 1534,
 1737,
 2378,
 3117,
 3380,
 2732,
 2294,
 1100,
 1831,
 1902,
 3358,
 3353,
 928,
 3229,
 3768,
 2979,
 2796,
 2470,
 3452,
 2556,
 3827,
 2694,
 2854,
 1798,
 4642,
 3561,
 2721,
 1637,
 2200,
 3271,
 4498,
 2585,
 4294,
 3269,
 1357,
 4176,
 3827,
 2807,
 3702,
 4382,
 3402,
 3800,
 2513,
 3945,
 4411,
 3968,
 4557,
 4677,
 4858,
 5261

In [15]:
# Turn the per-bit counts into the fraction of DrugBank fingerprints with that bit set.
db_ratios = [count / len(maccs_db) for count in inter]

In [20]:
# Per-bit frequency of each MACCS bit in DrugBank (index = bit position).
db_ratios

[0.0,
 0.0,
 0.0,
 0.004213483146067416,
 0.0,
 0.0001404494382022472,
 0.0016853932584269663,
 0.0035112359550561797,
 0.013904494382022473,
 0.006741573033707865,
 0.003651685393258427,
 0.01643258426966292,
 0.004213483146067416,
 0.008707865168539325,
 0.0046348314606741575,
 0.001544943820224719,
 0.007303370786516854,
 0.010814606741573033,
 0.009550561797752809,
 0.039606741573033705,
 0.0009831460674157304,
 0.002949438202247191,
 0.025421348314606743,
 0.029775280898876405,
 0.061376404494382024,
 0.07935393258426966,
 0.03188202247191011,
 0.01348314606741573,
 0.020926966292134832,
 0.1021067415730337,
 0.017837078651685392,
 0.002949438202247191,
 0.06053370786516854,
 0.0675561797752809,
 0.021769662921348316,
 0.006179775280898876,
 0.08792134831460674,
 0.07331460674157303,
 0.1845505617977528,
 0.02148876404494382,
 0.02345505617977528,
 0.023314606741573034,
 0.09803370786516855,
 0.16839887640449439,
 0.03834269662921348,
 0.03497191011235955,
 0.027247191011235954,
 

In [16]:
# Build MACCS fingerprints for the mTOR ligands from their SMILES strings.
# - pandas represents missing CSV values as NaN rather than None, so an
#   `is not None` test would let them through; dropna() removes them.
# - MolFromSmiles returns None for SMILES it cannot parse; those are skipped
#   because GenMACCSKeys needs a real molecule.
maccs_mtor = [
    MACCSkeys.GenMACCSKeys(mol)
    for mol in map(Chem.MolFromSmiles, mtor["Smiles"].dropna())
    if mol is not None
]
# Reuse `inter` as the per-bit counter, reset to zero for the mTOR set
# (the DrugBank counts were already converted into db_ratios).
inter = [0] * 167
for m in maccs_mtor:
    for i in m.GetOnBits():
        inter[i] += 1

In [17]:
# Per-bit on-counts for the mTOR set (inter was re-initialised in the previous cell).
inter

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 68,
 0,
 0,
 115,
 0,
 37,
 0,
 0,
 0,
 907,
 0,
 468,
 0,
 0,
 124,
 216,
 52,
 1604,
 47,
 2,
 11,
 5,
 1,
 0,
 406,
 429,
 12,
 0,
 278,
 1215,
 3884,
 0,
 8,
 119,
 1257,
 910,
 0,
 47,
 26,
 129,
 5,
 24,
 89,
 616,
 974,
 343,
 135,
 625,
 11,
 2528,
 631,
 303,
 631,
 639,
 4451,
 11,
 528,
 4498,
 764,
 639,
 2,
 645,
 56,
 93,
 390,
 633,
 1024,
 3781,
 185,
 4326,
 34,
 2815,
 4293,
 727,
 925,
 2524,
 1809,
 3758,
 3365,
 1404,
 860,
 855,
 681,
 704,
 2203,
 2289,
 1434,
 2626,
 2598,
 1742,
 4532,
 196,
 3490,
 3088,
 733,
 453,
 738,
 3226,
 2851,
 1600,
 1667,
 2395,
 3027,
 3532,
 1479,
 1915,
 1270,
 1557,
 1836,
 3192,
 3473,
 47,
 4565,
 4566,
 4063,
 394,
 1639,
 4520,
 1435,
 2260,
 3292,
 3417,
 859,
 2577,
 2622,
 4290,
 1616,
 4284,
 1314,
 4595,
 3397,
 1012,
 1079,
 1484,
 4518,
 2260,
 3239,
 4569,
 2098,
 3497,
 4189,
 2825,
 3590,
 3916,
 1899,
 3689,
 2774,
 2555,
 4557,
 3435,
 4545,
 3335,
 3911,
 4591,
 4541,
 4595,
 430

In [18]:
# Turn the per-bit counts into the fraction of mTOR fingerprints with that bit set.
mtor_ratios = [count / len(maccs_mtor) for count in inter]

In [31]:
# One record per bit: (bit index, DrugBank frequency, mTOR frequency, DrugBank - mTOR).
comparison = [
    (i, d, m, d - m)
    for i, (d, m) in enumerate(zip(db_ratios, mtor_ratios))
]
# Print the same records as a readable per-bit report.
for i, d, m, diff in comparison:
    print(f'Bit {i}: {d:.5f} vs {m:.5f}, difference: {diff:.5f}')

Bit 0: 0.00000 vs 0.00000, difference: 0.00000
Bit 1: 0.00000 vs 0.00000, difference: 0.00000
Bit 2: 0.00000 vs 0.00000, difference: 0.00000
Bit 3: 0.00421 vs 0.00000, difference: 0.00421
Bit 4: 0.00000 vs 0.00000, difference: 0.00000
Bit 5: 0.00014 vs 0.00000, difference: 0.00014
Bit 6: 0.00169 vs 0.00000, difference: 0.00169
Bit 7: 0.00351 vs 0.00000, difference: 0.00351
Bit 8: 0.01390 vs 0.01480, difference: -0.00089
Bit 9: 0.00674 vs 0.00000, difference: 0.00674
Bit 10: 0.00365 vs 0.00000, difference: 0.00365
Bit 11: 0.01643 vs 0.02502, difference: -0.00859
Bit 12: 0.00421 vs 0.00000, difference: 0.00421
Bit 13: 0.00871 vs 0.00805, difference: 0.00066
Bit 14: 0.00463 vs 0.00000, difference: 0.00463
Bit 15: 0.00154 vs 0.00000, difference: 0.00154
Bit 16: 0.00730 vs 0.00000, difference: 0.00730
Bit 17: 0.01081 vs 0.19735, difference: -0.18653
Bit 18: 0.00955 vs 0.00000, difference: 0.00955
Bit 19: 0.03961 vs 0.10183, difference: -0.06222
Bit 20: 0.00098 vs 0.00000, difference: 0.0009

In [32]:
# Order the records by the difference (DrugBank - mTOR), ascending: the most negative
# values come first, i.e. the bits that are most over-represented in the mTOR set.
sorted(comparison, key=lambda x: x[3])

[(62, 0.24662921348314606, 0.9684508268059182, -0.7218216133227722),
 (38, 0.1845505617977528, 0.845082680591819, -0.6605321187940663),
 (65, 0.35280898876404493, 0.9786771105308965, -0.6258681217668516),
 (77, 0.31615168539325844, 0.9412532637075718, -0.6251015783143133),
 (135, 0.3089887640449438, 0.9321148825065274, -0.6231261184615836),
 (80, 0.32345505617977527, 0.9340731070496083, -0.6106180508698331),
 (98, 0.42991573033707864, 0.9860748476936466, -0.556159117356568),
 (133, 0.3821629213483146, 0.933420365535248, -0.5512574441869333),
 (120, 0.45351123595505616, 0.9932550043516101, -0.5397437683965539),
 (75, 0.2943820224719101, 0.8226718885987816, -0.5282898661268716),
 (125, 0.4848314606741573, 0.9834638816362054, -0.4986324209620481),
 (86, 0.23679775280898877, 0.7321583986074848, -0.49536064579849604),
 (145, 0.5199438202247191, 0.9941253263707572, -0.47418150614603816),
 (85, 0.351123595505618, 0.8176675369886858, -0.46654394148306777),
 (122, 0.41839887640449436, 0.8840295