# Structural keys

In [43]:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import csv
import numpy as np

In [44]:
# Load the EGFR ligand set from a semicolon-delimited CSV export; the SMILES
# string of each record is read from the 'Smiles' column.
# newline='' is what the csv module documentation recommends for files handed
# to a csv reader.
with open('Epidermal_growth_factor_receptor_erbB1_IC50.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=";")
    # Rows without a SMILES string are skipped instead of being passed to RDKit.
    parsed_ligands = (Chem.MolFromSmiles(row['Smiles']) for row in reader if row['Smiles'])
    # MolFromSmiles returns None (and logs an RDKit error) when a SMILES cannot
    # be parsed/sanitized; drop those so later per-molecule calls such as
    # HasSubstructMatch do not fail on None.
    EGFR_ligands = [mol for mol in parsed_ligands if mol is not None]

# The SD file supplier likewise yields None for records that fail to parse
# (see the RDKit errors below), so only successfully read molecules are kept.
suppl = Chem.SDMolSupplier('../data/drugbank.sdf')
drugs = [mol for mol in suppl if mol is not None]

RDKit ERROR: [15:46:04] ERROR: Explicit valence for atom # 0 C greater than permitted
RDKit ERROR: [18:33:56] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [18:33:56] ERROR: Could not sanitize molecule ending on line 20009
RDKit ERROR: [18:33:56] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [18:33:56] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 9 10
RDKit ERROR: 
RDKit ERROR: [18:33:56] ERROR: Could not sanitize molecule ending on line 250947
RDKit ERROR: [18:33:56] ERROR: Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 9 10
RDKit ERROR: 
RDKit ERROR: [18:33:56] Explicit valence for atom # 17 O, 3, is greater than permitted
RDKit ERROR: [18:33:56] ERROR: Could not sanitize molecule ending on line 258130
RDKit ERROR: [18:33:56] ERROR: Explicit valence for atom # 17 O, 3, is greater than permitted
RDKit ERROR: [18:33:56] Can't kekulize mol.  Unkekulized atoms: 57 58 59 60 61 62 63 64 65
RDKit ERROR: 
RDKit 

I've created a custom three-bit structural key from SMARTS patterns for the amide, sulfide and nitrate functional groups.

In [64]:
# Compile the three SMARTS queries that make up the custom structural key.
# The SMARTS strings are listed in the same order as the names they are
# unpacked into: amide, sulfide, nitrate.
Amide_pattern, Sulfide_pattern, Nitrate_pattern = (
    Chem.MolFromSmarts(smarts)
    for smarts in (
        '[NX3][CX3](=[OX1])[#6]',
        '[#16X2H0]',
        '[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]',
    )
)

In [79]:
# The structural key is an ordered list of substructure queries; position i of
# every per-molecule key answers "does the molecule match custom_key[i]?".
custom_key = [Amide_pattern, Sulfide_pattern, Nitrate_pattern]
EGFR_ligands_keys = [
    [ligand.HasSubstructMatch(pattern) for pattern in custom_key]
    for ligand in EGFR_ligands
]

# Column-wise count of matches divided by the number of ligands gives the
# fraction of EGFR ligands matching each pattern.
np.asarray(EGFR_ligands_keys).sum(axis=0) / len(EGFR_ligands_keys)

array([0.48405515, 0.12254561, 0.00375992])

In [80]:
# Evaluate the same structural key (custom_key) on every DrugBank molecule:
# one row of booleans per drug, one column per substructure pattern.
drug_keys = [[m.HasSubstructMatch(substruct) for substruct in custom_key] for m in drugs]

# Fraction of DrugBank molecules matching each pattern. This must use the
# drug_keys list built above; the previously referenced name
# `drug_ligands_keys` is not defined by this cell and fails on a fresh kernel.
np.sum(drug_keys, axis=0)/len(drug_keys)

array([0.28938862, 0.12101195, 0.00098384])

About 48% of the EGFR ligands match Amide_pattern, 12% match Sulfide_pattern and 0.4% match Nitrate_pattern. In the DrugBank dataset, about 29% of the drugs match Amide_pattern, 12% match Sulfide_pattern and 0.1% match Nitrate_pattern.

# MACCS key

In [81]:
from rdkit.Chem import MACCSkeys

In [82]:
# Generate one MACCS fingerprint per molecule for both datasets.
EGFR_maccs = list(map(MACCSkeys.GenMACCSKeys, EGFR_ligands))
drugbank_maccs = list(map(MACCSkeys.GenMACCSKeys, drugs))

# Peek at the first fingerprint to see what kind of object we got back.
EGFR_maccs[0]

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f25805855d0>

In [83]:
# One counter per bit position of the fingerprint, all starting at zero.
EGFR_ligands_maccs_sums = [0 for _ in range(EGFR_maccs[0].GetNumBits())]

# Walk over every set bit of every EGFR fingerprint and tally how many
# fingerprints have that bit switched on.
EGFR_on_bits = (bit for fingerprint in EGFR_maccs for bit in fingerprint.GetOnBits())
for bit_index in EGFR_on_bits:
    EGFR_ligands_maccs_sums[bit_index] += 1

In [84]:
# One counter per bit position of the fingerprint, all starting at zero.
drugbank_maccs_sums = [0 for _ in range(drugbank_maccs[0].GetNumBits())]

# Walk over every set bit of every DrugBank fingerprint and tally how many
# fingerprints have that bit switched on.
drugbank_on_bits = (bit for fingerprint in drugbank_maccs for bit in fingerprint.GetOnBits())
for bit_index in drugbank_on_bits:
    drugbank_maccs_sums[bit_index] += 1

In [88]:
# Turn the raw per-bit counts into fractions of the EGFR ligand set so they
# can be compared with a dataset of a different size.
EGFR_fingerprint_count = len(EGFR_maccs)
EGFR_ligands_maccs_scaled = [
    bit_count / EGFR_fingerprint_count
    for bit_count in EGFR_ligands_maccs_sums
]
EGFR_ligands_maccs_scaled

[0.0,
 0.0,
 0.0,
 0.000974794596852806,
 0.0,
 0.0,
 0.0,
 0.0,
 0.008216125887759365,
 0.0,
 6.962818548948615e-05,
 0.010374599637933435,
 6.962818548948615e-05,
 0.002297730121153043,
 0.010513856008912408,
 0.0001392563709789723,
 0.001392563709789723,
 0.04477092326973959,
 0.0011836791533212645,
 0.030427517058905444,
 6.962818548948615e-05,
 0.002367358306642529,
 0.01190641971870213,
 0.025135774961704497,
 0.057721765770784014,
 0.2089541846539479,
 0.01336861161398134,
 0.004038434758390197,
 0.006266536694053753,
 0.004247319314858655,
 0.0038295502019217377,
 0.0,
 0.028269043308731374,
 0.02868681242166829,
 0.18124216682913244,
 0.00034814092744743074,
 0.0894025901685002,
 0.05430998468179919,
 0.6516501879961009,
 0.006057652137585295,
 0.009887202339507033,
 0.07979390057095112,
 0.30371814510513856,
 0.04734716613285058,
 0.0025762428631109875,
 0.016919649073945133,
 0.1382815763821195,
 0.10492967553265561,
 0.011279766049296756,
 0.04588497423757137,
 0.0796546441

In [89]:
# Turn the raw per-bit counts into fractions of the DrugBank set so they can
# be compared with a dataset of a different size.
drugbank_fingerprint_count = len(drugbank_maccs)
drugbank_maccs_scaled = [
    bit_count / drugbank_fingerprint_count
    for bit_count in drugbank_maccs_sums
]
drugbank_maccs_scaled

[0.0,
 0.0,
 0.0,
 0.004216444132115249,
 0.0,
 0.000140548137737175,
 0.001546029515108925,
 0.0035137034434293743,
 0.014054813773717497,
 0.006465214335910049,
 0.00309205903021785,
 0.016584680252986646,
 0.004216444132115249,
 0.008432888264230498,
 0.0046380885453267745,
 0.001546029515108925,
 0.007449051300070274,
 0.010822206605762474,
 0.009557273366127899,
 0.03977512297962052,
 0.0009838369641602248,
 0.0029515108924806745,
 0.025579761068165847,
 0.029796205200281096,
 0.06113843991567112,
 0.07940969782150387,
 0.031482782853127195,
 0.013492621222768798,
 0.020941672522839072,
 0.10203794799718904,
 0.01770906535488405,
 0.0032326071679550246,
 0.06057624736472242,
 0.06760365425158117,
 0.02150386507378777,
 0.0061841180604357,
 0.08784258608573436,
 0.07322557976106817,
 0.18453970484891075,
 0.02150386507378777,
 0.023471539002108223,
 0.023330990864371046,
 0.09810260014054814,
 0.16837666900913562,
 0.03752635277582572,
 0.034574841883345045,
 0.027406886858749122,


In [58]:
# compute the differences, store bit numbers prior to sorting
EGFR_drugbank_differences = [(i, a_b[0] - a_b[1])
                             for i, a_b in enumerate(zip(EGFR_ligands_maccs_scaled, drugbank_maccs_scaled))]
EGFR_drugbank_differences

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, -0.003241649535262443),
 (4, 0.0),
 (5, -0.000140548137737175),
 (6, -0.001546029515108925),
 (7, -0.0035137034434293743),
 (8, -0.005838687885958132),
 (9, -0.006465214335910049),
 (10, -0.0030224308447283636),
 (11, -0.006210080615053211),
 (12, -0.004146815946625763),
 (13, -0.006135158143077455),
 (14, 0.005875767463585633),
 (15, -0.0014067731441299526),
 (16, -0.006056487590280551),
 (17, 0.03394871666397711),
 (18, -0.008373594212806635),
 (19, -0.009347605920715076),
 (20, -0.0009142087786707387),
 (21, -0.0005841525858381455),
 (22, -0.013673341349463717),
 (23, -0.0046604302385766),
 (24, -0.003416674144887104),
 (25, 0.12954448683244404),
 (26, -0.018114171239145854),
 (27, -0.009454186464378601),
 (28, -0.014675135828785319),
 (29, -0.09779062868233039),
 (30, -0.013879515152962311),
 (31, -0.0032326071679550246),
 (32, -0.032307204055991044),
 (33, -0.03891684182991288),
 (34, 0.15973830175534467),
 (35, -0.005835977132988269),
 (36, 0

In [59]:
# let's sort the bits by the difference in MACCS incidence between our ligand set and the DrugBank database
EGFR_drugbank_differences.sort(key=lambda x: x[1])
EGFR_drugbank_differences

[(139, -0.39741039909427583),
 (90, -0.2923371025847764),
 (123, -0.2725790015680287),
 (140, -0.22444769386850189),
 (112, -0.21909021845830967),
 (54, -0.21735611944654054),
 (136, -0.2057912154575942),
 (104, -0.20272615630984514),
 (89, -0.20114876230640258),
 (84, -0.15834462242880923),
 (146, -0.14825000344960443),
 (102, -0.14495883618861086),
 (69, -0.13485410815591192),
 (130, -0.130014229985175),
 (132, -0.12985204475423795),
 (43, -0.12102950287628503),
 (91, -0.1199695006039499),
 (48, -0.1108565656443083),
 (66, -0.10519019161500497),
 (108, -0.10312858079947249),
 (82, -0.10002707817136325),
 (29, -0.09779062868233039),
 (155, -0.09190902869610929),
 (124, -0.06898284034653407),
 (159, -0.06707305126953755),
 (53, -0.06316496752038425),
 (49, -0.0543258479690344),
 (78, -0.04818564997837758),
 (119, -0.04728745127861911),
 (72, -0.041341106376698966),
 (33, -0.03891684182991288),
 (114, -0.03845926281415499),
 (60, -0.03562932478862243),
 (44, -0.03495010991271474),
 (67,

The three MACCS bits that are most under-represented in the EGFR ligand set relative to the DrugBank database are 139, 90 and 123. These bits correspond to the structural patterns "OH", "QHAACH2A" and "OCO".
The three MACCS bits that are most over-represented in the EGFR ligand set relative to the DrugBank database are 65, 80 and 135. These correspond to the structural patterns "C%N" (an aromatic C–N bond), "NAAAN" and "Nnot%A%A".