In [31]:
from rdkit import Chem
from rdkit import RDLogger
import matplotlib.pyplot as plt
import pandas as pd
import gzip
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski
import seaborn as sb
from itertools import chain

# Turn off RDKit's 'rdApp.error' log channel for the rest of the session.
RDLogger.DisableLog('rdApp.error')

In [32]:
# Load the PAINS substructure filters from the two tab-separated files in
# ../data/pains. Every non-empty line is expected to hold a SMARTS pattern and
# one accompanying text field; only the SMARTS is compiled and kept, in file order.
pains = []
with open('../data/pains/p_l15.txt', 'r') as f, open('../data/pains/p_m150.txt', 'r') as p:
    for line in chain(f, p):
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line carries no pattern; skip it instead of
            # crashing on the two-field unpack below.
            continue
        pattern, text = line.split('\t')  # SMARTS, then the text field (not used further)
        query = Chem.MolFromSmarts(pattern)
        if query is None:
            # MolFromSmarts gives None for SMARTS it cannot parse. Fail here, with
            # the offending pattern, rather than later inside HasSubstructMatch
            # where the cause would be much harder to trace.
            raise ValueError('Unparseable PAINS SMARTS: {!r}'.format(pattern))
        pains.append(query)
pains[0:3]

[<rdkit.Chem.rdchem.Mol at 0x7f5a0dcb9710>,
 <rdkit.Chem.rdchem.Mol at 0x7f5a0eadca30>,
 <rdkit.Chem.rdchem.Mol at 0x7f5a0dcb9940>]

In [33]:
# Read DrugBank from an uncompressed SDF. Entries for which the supplier yields a
# falsy value (presumably records RDKit could not parse -- TODO confirm) are
# dropped. Each kept molecule is paired with a source label as [mol, label].
supp = Chem.SDMolSupplier('../data/drugbank.sdf')
drug_bank = [[mol, 'DrugBank'] for mol in supp if mol]

# The actives set is gzip-compressed, so it is read through ForwardSDMolSupplier
# from an open gzip handle; the list is built inside the `with` block, while that
# handle is still open.
# NOTE(review): this path (and the decoys path below) is relative to the working
# directory, unlike the '../data/' prefix used for DrugBank -- confirm location.
with gzip.open('actives_final.sdf.gz') as sdf:
    supp_actives = Chem.ForwardSDMolSupplier(sdf)
    actives = [[mol, 'actives'] for mol in supp_actives if mol]

# Same procedure for the decoys set, labelled 'decoys'.
with gzip.open('decoys_final.sdf.gz') as sdf:
    supp_decoys = Chem.ForwardSDMolSupplier(sdf)
    decoys = [[mol, 'decoys'] for mol in supp_decoys if mol]



In [34]:
# Turn each list of [mol, label] records into a two-column frame, then stack the
# three frames under a single fresh 0..n-1 index.
df_db, df_ac, df_de = (
    pd.DataFrame(records, columns=['Mol', 'Source'])
    for records in (drug_bank, actives, decoys)
)

df = pd.concat([df_db, df_ac, df_de], ignore_index=True)
df

Unnamed: 0,Mol,Source
0,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9ae8f0>,DrugBank
1,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aead0>,DrugBank
2,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aea30>,DrugBank
3,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aec10>,DrugBank
4,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aeb70>,DrugBank
...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8b20>,decoys
36237,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8bc0>,decoys
36238,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8c60>,decoys
36239,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8d00>,decoys


In [35]:
def gen_keys(mol, keys=None):
    """Return one boolean per substructure query telling whether `mol` matches it.

    Args:
        mol: Object exposing ``HasSubstructMatch(query)`` (an RDKit molecule in
            this notebook).
        keys: Iterable of substructure queries to test, in order. When omitted
            (``None``), the module-level ``pains`` list is used, looked up at
            call time.

    Returns:
        list: ``mol.HasSubstructMatch(query)`` for every query in ``keys``,
        in the same order.
    """
    if keys is None:
        # Resolve the default lazily: the function then always sees the current
        # `pains` list, and can be defined before `pains` exists.
        keys = pains
    # Iterate the queries that were actually passed in, not the global list.
    return [mol.HasSubstructMatch(query) for query in keys]

In [36]:
# Compute the PAINS match vector for every molecule. Keyword arguments given to
# Series.apply are forwarded to gen_keys on each call.
df['pains_keys'] = df['Mol'].apply(gen_keys, keys=pains)
df

Unnamed: 0,Mol,Source,pains_keys
0,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9ae8f0>,DrugBank,"[False, False, False, False, False, False, Fal..."
1,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aead0>,DrugBank,"[False, False, False, False, False, False, Fal..."
2,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aea30>,DrugBank,"[False, False, False, False, False, False, Fal..."
3,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aec10>,DrugBank,"[False, False, False, False, False, False, Fal..."
4,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aeb70>,DrugBank,"[False, False, False, False, False, False, Fal..."
...,...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8b20>,decoys,"[False, False, False, False, False, False, Fal..."
36237,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8bc0>,decoys,"[False, False, False, False, False, False, Fal..."
36238,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8c60>,decoys,"[False, False, False, False, False, False, Fal..."
36239,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8d00>,decoys,"[False, False, False, False, False, False, Fal..."


In [37]:
def get_index_true_pains(structure_vector):
    """Return the positions of the truthy entries of `structure_vector`, ascending."""
    return [position for position, flag in enumerate(structure_vector) if flag]

In [38]:
# Collapse each boolean match vector into the list of matched pattern indices.
df['true_pains'] = df['pains_keys'].apply(get_index_true_pains)
# Display only the molecules that matched at least one pattern.
df[df['true_pains'].apply(len) > 0]

Unnamed: 0,Mol,Source,pains_keys,true_pains
61,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9b1d50>,DrugBank,"[False, False, False, False, False, False, Fal...",[413]
141,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9b9170>,DrugBank,"[False, False, False, False, False, False, Fal...",[418]
195,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9bd670>,DrugBank,"[False, False, False, False, False, False, Fal...",[413]
211,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9bead0>,DrugBank,"[False, False, False, False, True, False, Fals...",[4]
274,<rdkit.Chem.rdchem.Mol object at 0x7f5a0eaae850>,DrugBank,"[False, False, False, False, False, False, Fal...",[413]
...,...,...,...,...
35978,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d29d620>,decoys,"[False, False, False, False, False, False, Fal...",[418]
36038,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d29fc60>,decoys,"[False, False, False, False, False, False, Fal...",[414]
36159,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a59e0>,decoys,"[False, False, False, False, False, False, Fal...",[423]
36204,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a76c0>,decoys,"[False, False, False, False, False, False, Fal...",[419]


In [39]:
from rdkit.Chem import MACCSkeys

In [41]:
# Expand each molecule's MACCS fingerprint into a plain list of booleans, one per
# fingerprint element.
df['MACCS_keys'] = df['Mol'].apply(lambda mol: list(map(bool, MACCSkeys.GenMACCSKeys(mol))))
df

Unnamed: 0,Mol,Source,pains_keys,true_pains,MACCS_keys
0,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9ae8f0>,DrugBank,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
1,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aead0>,DrugBank,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
2,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aea30>,DrugBank,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
3,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aec10>,DrugBank,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
4,<rdkit.Chem.rdchem.Mol object at 0x7f5a0e9aeb70>,DrugBank,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
...,...,...,...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8b20>,decoys,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
36237,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8bc0>,decoys,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
36238,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8c60>,decoys,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
36239,<rdkit.Chem.rdchem.Mol object at 0x7f5a0d2a8d00>,decoys,"[False, False, False, False, False, False, Fal...",[],"[False, False, False, False, False, False, Fal..."
