# Computational Drug Design - Final Project

Author: Vojtech Melichar
Year: AY 2022/23

## Import of dataset

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gzip
from itertools import chain

from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski

from rdkit import RDLogger

RDLogger.DisableLog('rdApp.error')

In [11]:
# --- DrugBank (plain SDF) -------------------------------------------------
# filter(None, ...) keeps only truthy entries, i.e. it skips whatever the
# supplier hands back as None/empty (presumably records RDKit could not parse).
supp = Chem.SDMolSupplier('../data/drugbank.sdf')
drug_bank = [[entry, 'DrugBank'] for entry in filter(None, supp)]
df_db = pd.DataFrame(drug_bank, columns=['Mol', 'Source'])

# --- Actives (gzipped SDF) ------------------------------------------------
# The forward supplier reads from the open handle, so the list has to be
# materialised while the file is still open.
with gzip.open('actives_final.sdf.gz') as sdf:
    supp_actives = Chem.ForwardSDMolSupplier(sdf)
    actives = [[entry, 'actives'] for entry in filter(None, supp_actives)]
df_ac = pd.DataFrame(actives, columns=['Mol', 'Source'])

# --- Decoys (gzipped SDF) -------------------------------------------------
with gzip.open('decoys_final.sdf.gz') as sdf:
    supp_decoys = Chem.ForwardSDMolSupplier(sdf)
    decoys = [[entry, 'decoys'] for entry in filter(None, supp_decoys)]
df_de = pd.DataFrame(decoys, columns=['Mol', 'Source'])

# Stack the three sources into one frame with a fresh 0..n-1 index.
df = pd.concat([df_db, df_ac, df_de], ignore_index=True)
df



Unnamed: 0,Mol,Source
0,<rdkit.Chem.rdchem.Mol object at 0x7f308e18fee0>,DrugBank
1,<rdkit.Chem.rdchem.Mol object at 0x7f308e18ff30>,DrugBank
2,<rdkit.Chem.rdchem.Mol object at 0x7f308e18ff80>,DrugBank
3,<rdkit.Chem.rdchem.Mol object at 0x7f308e194030>,DrugBank
4,<rdkit.Chem.rdchem.Mol object at 0x7f308e194080>,DrugBank
...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7f308dc0d300>,decoys
36237,<rdkit.Chem.rdchem.Mol object at 0x7f308dc0d3a0>,decoys
36238,<rdkit.Chem.rdchem.Mol object at 0x7f308dc0d440>,decoys
36239,<rdkit.Chem.rdchem.Mol object at 0x7f308dc0d4e0>,decoys


## Standardization of molecules

In [7]:
""" contribution from Hans de Winter """
def _InitialiseNeutralisationReactions():
    """Build the (query, replacement) pairs used to neutralise charged groups.

    Recipe attributed to Hans de Winter.

    Returns:
        list of 2-tuples ``(query, replacement)``: ``query`` is the molecule
        compiled from the SMARTS of a charged group, ``replacement`` is the
        molecule built from the SMILES of its neutral counterpart.
    """
    charged_to_neutral = [
        ('[n+;H]', 'n'),                       # Imidazoles
        ('[N+;!H0]', 'N'),                     # Amines
        ('[$([O-]);!$([O-][#7])]', 'O'),       # Carboxylic acids and alcohols
        ('[S-;X1]', 'S'),                      # Thiols
        ('[$([N-;X2]S(=O)=O)]', 'N'),          # Sulfonamides
        ('[$([N-;X2][C,N]=C)]', 'N'),          # Enamines
        ('[n-]', '[nH]'),                      # Tetrazoles
        ('[$([S-]=O)]', 'S'),                  # Sulfoxides
        ('[$([N-]C=O)]', 'N'),                 # Amides
    ]

    reactions = []
    for smarts, smiles in charged_to_neutral:
        query = Chem.MolFromSmarts(smarts)
        # Second positional argument is passed as False exactly as in the
        # original recipe (RDKit's `sanitize` flag — confirm against the
        # installed RDKit version).
        replacement = Chem.MolFromSmiles(smiles, False)
        reactions.append((query, replacement))
    return reactions

# Cache for the default reaction table; filled on the first call that needs it.
_reactions = None


def NeutraliseCharges(mol, reactions=None):
    """Replace charged groups in ``mol`` with their neutral counterparts.

    Args:
        mol: molecule to neutralise; must offer ``HasSubstructMatch``.
        reactions: optional iterable of ``(query, replacement)`` pairs. When
            omitted, the table from ``_InitialiseNeutralisationReactions`` is
            built once and reused on later calls.

    Returns:
        tuple ``(mol, replaced)``: the (possibly new) molecule and a flag that
        is True when at least one substitution was carried out.
    """
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions = _InitialiseNeutralisationReactions()
        reactions = _reactions

    replaced = False
    for reactant, product in reactions:
        # Keep substituting until this query no longer matches; each pass
        # takes the first molecule returned by ReplaceSubstructs.
        while mol.HasSubstructMatch(reactant):
            mol = Chem.ReplaceSubstructs(mol, reactant, product)[0]
            replaced = True
    return mol, replaced

In [8]:
# Salt stripper built with SaltRemover's default arguments
# (presumably RDKit's built-in salt definitions — confirm in the RDKit docs).
_saltRemover = SaltRemover()
# Matches any atom that is NOT C, N, O, S, F, Cl, Br or I; `standardize` rejects a molecule that still contains such an atom.
_inorganicPatt = Chem.MolFromSmarts("[!#6;!#7;!#8;!#16;!F;!Cl;!Br;!I]") # to remove compounds with unwanted atom types
# Matches any carbon atom; `standardize` rejects a molecule in which this never matches.
_carbonPatt = Chem.MolFromSmarts("[#6]") # to remove compounds without carbon - inorganic

def standardize(mol):
    """Standardize one molecule, or return None when it should be discarded.

    Steps, in order:
      1. reject molecules without any carbon atom,
      2. strip salts and reject molecules that end up with no atoms,
      3. neutralise charged groups,
      4. reject molecules that still contain an atom other than
         C, N, O, S, F, Cl, Br or I,
      5. sanitize the molecule and reject it when sanitization fails.

    Args:
        mol: RDKit molecule, or None (e.g. when the cell is re-run on a
            column that already holds rejected entries).

    Returns:
        The standardized molecule, or None when the input is None or is
        rejected by any of the steps above.
    """
    # Makes the function safe to re-apply to a column that already holds None.
    if mol is None:
        return None

    if not mol.HasSubstructMatch(_carbonPatt):
        return None

    mol = _saltRemover(mol)
    if mol.GetNumAtoms() == 0:
        return None

    mol, _ = NeutraliseCharges(mol)
    if mol.HasSubstructMatch(_inorganicPatt):
        return None

    # Without sanitization some structures can't be drawn or fingerprinted.
    # catchErrors=True makes RDKit report the failed operation instead of
    # raising, so one bad structure cannot abort a whole DataFrame.apply run.
    failed_op = Chem.SanitizeMol(mol, catchErrors=True)
    if failed_op != Chem.SanitizeFlags.SANITIZE_NONE:
        return None
    return mol

In [None]:
# Run every molecule through `standardize`; rejected molecules come back as None.
standardized_mols = df['Mol'].apply(standardize)
df['Mol'] = standardized_mols
# Number of molecules rejected by standardization.
df['Mol'].isna().sum()

1071 molecules will be removed as a result of the standardization process.

In [14]:
# Remove every row that contains a missing value — here the rows whose 'Mol'
# was set to None by `standardize`. `df` itself is modified (inplace=True),
# so the original row labels of the surviving rows are kept.
df.dropna(inplace=True)
# Show the filtered frame.
df

Unnamed: 0,Mol,Source
0,<rdkit.Chem.rdchem.Mol object at 0x7f308e1d4580>,DrugBank
1,<rdkit.Chem.rdchem.Mol object at 0x7f308e92cbc0>,DrugBank
2,<rdkit.Chem.rdchem.Mol object at 0x7f308e931b20>,DrugBank
3,<rdkit.Chem.rdchem.Mol object at 0x7f308ef3acb0>,DrugBank
4,<rdkit.Chem.rdchem.Mol object at 0x7f308e91d440>,DrugBank
...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7f308e60f6c0>,decoys
36237,<rdkit.Chem.rdchem.Mol object at 0x7f308e60f760>,decoys
36238,<rdkit.Chem.rdchem.Mol object at 0x7f308e60f3f0>,decoys
36239,<rdkit.Chem.rdchem.Mol object at 0x7f308e60cd00>,decoys


## Lipinski's Rule of Five

In [19]:
# Lipinski's Rule of Five filter
def lipinski(mol):
    """Tell whether ``mol`` stays within all four Rule-of-Five limits.

    Limits checked: molecular weight <= 500, logP <= 5,
    H-bond donors <= 5 and H-bond acceptors <= 10.

    Args:
        mol: molecule accepted by RDKit's ``Descriptors`` / ``Lipinski``
            functions.

    Returns:
        True when every limit is respected, otherwise False.
    """
    mol_weight = Descriptors.MolWt(mol)
    log_p = Descriptors.MolLogP(mol)
    donors = Lipinski.NumHDonors(mol)
    acceptors = Lipinski.NumHAcceptors(mol)
    return all((mol_weight <= 500, log_p <= 5, donors <= 5, acceptors <= 10))

In [20]:
# Flag every molecule with the outcome of the Rule-of-Five check,
# then tally how many pass and how many fail.
lip5_flags = df['Mol'].apply(lipinski)
df['lip5'] = lip5_flags
df['lip5'].value_counts()

True     31432
False     3738
Name: lip5, dtype: int64

Of the 35170 standardized molecules, 31432 satisfy Lipinski's Rule of Five and 3738 do not.

## PAINS filter

In [23]:
# Load the PAINS substructure queries (SMARTS) from the two files in data.
# Each line is tab-separated: the SMARTS pattern first, then a description
# field that is not needed here.
pains = []
with open('../data/pains/p_l15.txt', 'r') as f, open('../data/pains/p_m150.txt', 'r') as p:
    for line in chain(f, p):
        line = line.strip()
        if not line:
            continue  # tolerate blank or trailing empty lines
        # Keep only the first column; anything after the first tab is ignored.
        pattern = line.split('\t', 1)[0]
        query = Chem.MolFromSmarts(pattern)
        if query is None:
            # Fail here with the offending pattern instead of much later,
            # inside the substructure search over the whole data set.
            raise ValueError(f"Invalid PAINS SMARTS pattern: {pattern!r}")
        pains.append(query)
len(pains)

425

In [28]:
def gen_pains(mol, keys=None):
    """Count how many substructure queries match ``mol``.

    Args:
        mol: molecule offering ``HasSubstructMatch``.
        keys: iterable of query molecules to test. When omitted (None), the
            notebook-level ``pains`` list is used, looked up at call time so
            a reloaded ``pains`` list is picked up automatically.

    Returns:
        int: number of queries in ``keys`` that match ``mol``.
    """
    if keys is None:
        keys = pains
    return sum(1 for pain in keys if mol.HasSubstructMatch(pain))

In [29]:
# Store the number of matching PAINS queries per molecule,
# then tally how many molecules have 0, 1, 2, ... matches.
pains_hits = df['Mol'].apply(gen_pains)
df['pains'] = pains_hits
df['pains'].value_counts()

Unnamed: 0,Mol,Source,lip5,pains
0,<rdkit.Chem.rdchem.Mol object at 0x7f308e1d4580>,DrugBank,False,0
1,<rdkit.Chem.rdchem.Mol object at 0x7f308e92cbc0>,DrugBank,False,0
2,<rdkit.Chem.rdchem.Mol object at 0x7f308e931b20>,DrugBank,False,0
3,<rdkit.Chem.rdchem.Mol object at 0x7f308ef3acb0>,DrugBank,False,0
4,<rdkit.Chem.rdchem.Mol object at 0x7f308e91d440>,DrugBank,False,0
...,...,...,...,...
36236,<rdkit.Chem.rdchem.Mol object at 0x7f308e60f6c0>,decoys,True,0
36237,<rdkit.Chem.rdchem.Mol object at 0x7f308e60f760>,decoys,True,0
36238,<rdkit.Chem.rdchem.Mol object at 0x7f308e60f3f0>,decoys,True,0
36239,<rdkit.Chem.rdchem.Mol object at 0x7f308e60cd00>,decoys,True,0


There are 532 molecules with one matching PAINS substructure and 4 molecules with two matching PAINS substructures.