# Structural keys

In [2]:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import csv
import pandas as pd

In [19]:
serotonine_ligands = pd.read_csv("./data/DOWNLOAD-ligands_for_serotonine_rec.csv", sep=";", 
                                 usecols=["Molecule ChEMBL ID", "Smiles"])
with open('./data/DOWNLOAD-ligands_for_serotonine_rec.csv', 'r') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=";")
    mols = [Chem.MolFromSmiles(m['Smiles']) for m in reader]
#serotonine_ligands["molecule"] = serotonine_ligands["Smiles"].apply(Chem.MolFromSmiles, axis=1)
serotonine_ligands["molecule"] = mols

In [20]:
serotonine_ligands.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,molecule
0,CHEMBL1852387,Cc1cccc(N2CCN(CC(O)CNC(=O)c3nc(-c4ccccc4)n(-c4...,<rdkit.Chem.rdchem.Mol object at 0x0000026908F...
1,CHEMBL63905,Fc1ccc(N2CCN(c3nc4ccsc4n4cccc34)CC2)cc1,<rdkit.Chem.rdchem.Mol object at 0x0000026908F...
2,CHEMBL2393241,CN1CCc2nc(N3CCN(C)CC3)nc(Nc3ccc(F)cc3)c2CC1.Cl,<rdkit.Chem.rdchem.Mol object at 0x0000026908F...
3,CHEMBL1087,Cc1cccc(C)c1NC(=O)C1CCCCN1C,<rdkit.Chem.rdchem.Mol object at 0x0000026908F...
4,CHEMBL3183075,CNC(=C[N+](=O)[O-])NCCSCc1csc(CN(C)C)n1,<rdkit.Chem.rdchem.Mol object at 0x0000026908F...


In [21]:
suppl = Chem.SDMolSupplier('../data/drugbank.sdf')
drugs = [m for m in suppl if m]

## Let's assemble a structural key from several substructures that were introduced in the previous exercise 5:

In [22]:
ethanol_pattern = Chem.MolFromSmarts('CCO')
propanol_pattern = Chem.MolFromSmarts('CCCO')
cooh_pattern = Chem.MolFromSmarts('C(=O)[O;h1]')
salicylic_acid_pattern = Chem.MolFromSmarts('c1ccc(c(c1)C(=O)O)O')

In [23]:
amazing_key = [ethanol_pattern, propanol_pattern, cooh_pattern, salicylic_acid_pattern]

serotonine_ligands_keys = [[m.HasSubstructMatch(substruct) for substruct in amazing_key] for m in serotonine_ligands["molecule"]]
len(serotonine_ligands_keys), serotonine_ligands_keys

(1876,
 [[True, False, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [True, False, False, False],
  [False, False, False, False],
  [True, False, False, False],
  [True, False, True, False],
  [False, False, False, False],
  [True, False, False, False],
  [True, True, True, False],
  [False, False, False, False],
  [True, True, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [True, False, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [True, False, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [True, True, False, False],
  [False, False, False, True],
  [True, False, False, False],
  [False, False, False, False],
  [True, True, False, False],
  [True, True, False, False],
  [True, True, True, False],
  [True, True, False, False],
  [False, False, False, Fal

## MACCS key

In [24]:
from rdkit.Chem import MACCSkeys

In [25]:
serotonine_maccs = [MACCSkeys.GenMACCSKeys(m) for m in serotonine_ligands["molecule"]]
drugbank_maccs = [MACCSkeys.GenMACCSKeys(m) for m in drugs]
serotonine_maccs[0]

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x26916e0f440>

In [34]:
print(serotonine_maccs[0])
first_key = serotonine_maccs[0]

<rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x0000026916E0F440>


In [35]:
first_key.GetNumBits()

167

In [37]:
list(first_key.GetOnBits())

[38,
 54,
 62,
 65,
 75,
 77,
 79,
 80,
 82,
 83,
 85,
 86,
 90,
 91,
 92,
 95,
 96,
 97,
 98,
 100,
 104,
 108,
 110,
 111,
 117,
 118,
 120,
 121,
 122,
 125,
 128,
 129,
 131,
 132,
 133,
 135,
 137,
 138,
 139,
 141,
 142,
 144,
 145,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165]

In [38]:
first_key.ToBitString()

'00000000000000000000000000000000000000100000000000000010000000100100000000010101101101100011100111101000100010110000011011100100110111010111011011011111111111111111110'

In [44]:
serotonine_ligands_maccs_sums = [sum(1 for key in serotonine_maccs if key.GetBit(bit)) for bit in range(first_key.GetNumBits())]
print(serotonine_ligands_maccs_sums)

[0, 0, 0, 2, 0, 0, 0, 0, 15, 0, 1, 19, 1, 50, 5, 1, 6, 14, 2, 131, 0, 4, 45, 39, 88, 145, 74, 10, 28, 11, 14, 0, 168, 172, 24, 4, 187, 103, 539, 9, 19, 17, 368, 67, 9, 26, 28, 142, 19, 70, 153, 201, 178, 243, 257, 196, 30, 336, 197, 213, 196, 207, 877, 35, 203, 1025, 265, 215, 10, 149, 56, 128, 313, 205, 337, 1048, 175, 748, 74, 1006, 924, 367, 661, 1052, 218, 1222, 1138, 633, 399, 356, 753, 673, 770, 732, 443, 733, 1127, 868, 1221, 233, 1305, 1013, 334, 468, 827, 1045, 543, 728, 638, 370, 866, 1317, 571, 607, 319, 642, 635, 1003, 1302, 103, 1284, 1419, 1296, 359, 493, 1317, 543, 815, 1159, 1153, 262, 791, 651, 1101, 810, 1028, 516, 1527, 1123, 557, 359, 527, 1424, 815, 1013, 1442, 618, 1307, 1355, 979, 1015, 1128, 903, 1446, 1117, 1276, 1560, 1029, 1599, 1091, 1395, 1642, 1654, 1758, 1574, 1791, 213]


In [45]:
drugbank_maccs_sums = [sum(1 for key in drugbank_maccs if key.GetBit(bit)) for bit in range(first_key.GetNumBits())]
print(drugbank_maccs_sums)

[0, 0, 0, 30, 0, 1, 11, 25, 100, 46, 22, 118, 30, 60, 33, 11, 53, 77, 68, 283, 7, 21, 182, 212, 435, 565, 224, 96, 149, 726, 126, 23, 431, 481, 153, 44, 625, 521, 1313, 153, 167, 166, 698, 1198, 267, 246, 195, 455, 869, 713, 646, 601, 553, 1896, 2246, 663, 178, 1633, 669, 604, 685, 695, 1753, 207, 624, 2508, 1118, 782, 133, 1551, 295, 526, 2196, 755, 1184, 2092, 797, 2246, 703, 2263, 2297, 1228, 2080, 2782, 2110, 2498, 1685, 1311, 1596, 2474, 3533, 3342, 2825, 1650, 1481, 3191, 3099, 2784, 3058, 1145, 2845, 2777, 1935, 787, 3210, 2941, 2667, 1536, 1732, 2372, 3113, 3377, 2726, 2292, 1097, 1828, 1896, 3354, 3347, 924, 3225, 3763, 2973, 2791, 2465, 3449, 2547, 3821, 2689, 2850, 1793, 4639, 3554, 2718, 1639, 2199, 3265, 4494, 2581, 4291, 3262, 1352, 4170, 3821, 2803, 3700, 4375, 3396, 3795, 2507, 3940, 4408, 3963, 4550, 4670, 4851, 5255, 5253, 5351, 5676, 4234, 5681, 5175, 5799, 6506, 6133, 163]


In [52]:
serotonine_ligands_maccs_scaled = [x/len(serotonine_ligands) for x in serotonine_ligands_maccs_sums]
drugbank_maccs_scaled = [x/len(drugbank_maccs) for x in drugbank_maccs_sums]
for numbers in zip(serotonine_ligands_maccs_scaled, drugbank_maccs_scaled):
    print(f"{numbers[0]:.6f} \t {numbers[1]:.6f}")

0.000000 	 0.000000
0.000000 	 0.000000
0.000000 	 0.000000
0.001066 	 0.004216
0.000000 	 0.000000
0.000000 	 0.000141
0.000000 	 0.001546
0.000000 	 0.003514
0.007996 	 0.014055
0.000000 	 0.006465
0.000533 	 0.003092
0.010128 	 0.016585
0.000533 	 0.004216
0.026652 	 0.008433
0.002665 	 0.004638
0.000533 	 0.001546
0.003198 	 0.007449
0.007463 	 0.010822
0.001066 	 0.009557
0.069829 	 0.039775
0.000000 	 0.000984
0.002132 	 0.002952
0.023987 	 0.025580
0.020789 	 0.029796
0.046908 	 0.061138
0.077292 	 0.079410
0.039446 	 0.031483
0.005330 	 0.013493
0.014925 	 0.020942
0.005864 	 0.102038
0.007463 	 0.017709
0.000000 	 0.003233
0.089552 	 0.060576
0.091684 	 0.067604
0.012793 	 0.021504
0.002132 	 0.006184
0.099680 	 0.087843
0.054904 	 0.073226
0.287313 	 0.184540
0.004797 	 0.021504
0.010128 	 0.023472
0.009062 	 0.023331
0.196162 	 0.098103
0.035714 	 0.168377
0.004797 	 0.037526
0.013859 	 0.034575
0.014925 	 0.027407
0.075693 	 0.063949
0.010128 	 0.122136
0.037313 	 0.100211


In [53]:
serotonine_drugbank_differences = [(i, a_b[0] - a_b[1])
                             for i, a_b in enumerate(zip(serotonine_ligands_maccs_scaled, drugbank_maccs_scaled))]
serotonine_drugbank_differences

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, -0.003150346051091795),
 (4, 0.0),
 (5, -0.000140548137737175),
 (6, -0.001546029515108925),
 (7, -0.0035137034434293743),
 (8, -0.006059078166041591),
 (9, -0.006465214335910049),
 (10, -0.0025590099897061227),
 (11, -0.006456748483263832),
 (12, -0.003683395091603522),
 (13, 0.018219563761355854),
 (14, -0.001972843342768139),
 (15, -0.0010129804745971978),
 (16, -0.004250757056999912),
 (17, -0.003359520038598295),
 (18, -0.008491175285104444),
 (19, 0.030054301327415725),
 (20, -0.0009838369641602248),
 (21, -0.0008193147304337662),
 (22, -0.001592554245138128),
 (23, -0.00900729262032374),
 (24, -0.014230124350639133),
 (25, -0.0021175869473034348),
 (26, 0.007962846144740611),
 (27, -0.008162130817651526),
 (28, -0.006016299388510714),
 (29, -0.09617440855156005),
 (30, -0.010246378787719869),
 (31, -0.0032326071679550246),
 (32, 0.028975991441247725),
 (33, 0.024080780716435893),
 (34, -0.008710688101506322),
 (35, -0.004051921898388791),
 (

In [55]:
# sort by differences
serotonine_drugbank_differences.sort(key=lambda x: x[1])
serotonine_drugbank_differences

[(139, -0.3061837434651859),
 (146, -0.28547379556389324),
 (140, -0.2671034197549548),
 (131, -0.23036101991797858),
 (159, -0.21619472659791095),
 (123, -0.20090524688074538),
 (157, -0.18979190484681308),
 (136, -0.18383636480782517),
 (84, -0.18035187979388273),
 (54, -0.17867751394618112),
 (89, -0.15795063433959605),
 (132, -0.15249315614478554),
 (72, -0.14179936079066568),
 (69, -0.13856585459411105),
 (53, -0.13694835230533411),
 (109, -0.13615203772324005),
 (43, -0.13266238329484992),
 (130, -0.11234396234868224),
 (48, -0.11200839992388224),
 (91, -0.11096987205324643),
 (127, -0.10259946627668809),
 (143, -0.10259946627668809),
 (29, -0.09617440855156005),
 (90, -0.09517064312010876),
 (102, -0.09392226699051676),
 (106, -0.0853962543471779),
 (124, -0.0836579825498549),
 (112, -0.07876322133934288),
 (152, -0.0756489862703349),
 (164, -0.07538699435260199),
 (119, -0.0749624280964418),
 (126, -0.0685304778187169),
 (49, -0.06289738937078487),
 (154, -0.06094402498100793),

The least prevalent bits (compared to DrugBank):
139, 146, 140, 131, 159, 123

123:OCO <br>
131: QH > 1 <br>
139: OH <br>
140: O > 3  <br>
146: O > 2 <br>
159: O>1 <br>
The most prevalent bits:
86, 85, 100, 122, 75, 145 <br>
75: A!N$A <br>
85: CN(C)C <br>
86: CH2QCH2 <br>
100: ACH2N <br>
122: AN(A)A <br>
145: 6M ring > 1 <br>
It looks like there are overall more rings, probably also aromaticity, N

## My Special Key

In [88]:
benzene_pattern = Chem.MolFromSmarts('c1ccccc1')
amino_pattern = Chem.MolFromSmarts('N[H2]')
cooh_pattern = Chem.MolFromSmarts('C(=O)[O;h1]')
phosphorus_pattern = Chem.MolFromSmarts('P')
my_special_key = [benzene_pattern, amino_pattern, cooh_pattern, phosphorus_pattern]

In [91]:

serotonine_special_keys = [[ligand.HasSubstructMatch(substr) for substr in my_special_key] for ligand in serotonine_ligands["molecule"]]
serotonine_special_keys

[[True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [False, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, False, True, False],
 [True, True, False, False],
 [True, True, False, False],
 [False, False, True, False],
 [True, True, False, False],
 [False, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [False, True, False, False],
 [False, True, False, False],
 [False, True, False, False],
 [False, False, False, False],
 [True, False, False, False],
 [False, False, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, True, False, False],
 [True, False, True, False],
 [False, True, False, False],
 [True, True, False, False],
 [False, False, False, False],
 [True, True, False, False],


In [73]:
my_special_key

[<rdkit.Chem.rdchem.Mol at 0x26917c64030>,
 None,
 <rdkit.Chem.rdchem.Mol at 0x26917c643a0>,
 <rdkit.Chem.rdchem.Mol at 0x26917c64440>]

In [94]:
serotonine_ligands_key_sums = [sum(1 for key in serotonine_special_keys if key[bit]) for bit in range(4)]
print(serotonine_ligands_key_sums)

[1513, 1280, 156, 11]


In [97]:
special_key_scaled = [x/len(serotonine_ligands) for x in serotonine_ligands_key_sums]
special_key_scaled

[0.8065031982942431,
 0.6823027718550106,
 0.08315565031982942,
 0.005863539445628998]

The most prevalent pattern of these four is a phenyl group.