# Modeling

In [7]:
from scipy.spatial import distance
from sklearn import manifold
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn import svm
from sklearn import ensemble
from sklearn import model_selection as ms
from sklearn.metrics import mean_squared_error

# rdkit mols are displayed as images
from rdkit.Chem.Draw import IPythonConsole

from rdkit.Chem import AllChem as Chem
from rdkit import DataStructs
from rdkit.Chem.SaltRemover import SaltRemover

# descriptors
from rdkit.Chem import Lipinski
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors

# plots
from matplotlib import pyplot
import seaborn

# other packages
import pandas as pd
import numpy as np
import csv
import re


from rdkit import Chem
from rdkit import RDLogger
import matplotlib.pyplot as plt
import pandas as pd
import gzip
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski
import seaborn as sb
from itertools import chain
from rdkit.Chem import MACCSkeys
import numpy as np

RDLogger.DisableLog('rdApp.error')

# Standardization

In [8]:
sr = SaltRemover() # from rdkit
m = Chem.MolFromSmiles("[Na+].C(=O)[O-]")
print(Chem.MolToSmiles(sr(m)))

O=C[O-]


In [9]:
""" contribution from Hans de Winter """
def _InitialiseNeutralisationReactions():
    patts= (
        # Imidazoles
        ('[n+;H]','n'),
        # Amines
        ('[N+;!H0]','N'),
        # Carboxylic acids and alcohols
        ('[$([O-]);!$([O-][#7])]','O'),
        # Thiols
        ('[S-;X1]','S'),
        # Sulfonamides
        ('[$([N-;X2]S(=O)=O)]','N'),
        # Enamines
        ('[$([N-;X2][C,N]=C)]','N'),
        # Tetrazoles
        ('[n-]','[nH]'),
        # Sulfoxides
        ('[$([S-]=O)]','S'),
        # Amides
        ('[$([N-]C=O)]','N'),
        )
    return [(Chem.MolFromSmarts(x),Chem.MolFromSmiles(y,False)) for x,y in patts]

_reactions=None
def NeutraliseCharges(mol, reactions=None):
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions=_InitialiseNeutralisationReactions()
        reactions=_reactions
    replaced = False
    for i,(reactant, product) in enumerate(reactions):
        while mol.HasSubstructMatch(reactant):
            replaced = True
            rms = Chem.ReplaceSubstructs(mol, reactant, product)
            mol = rms[0]
    return mol, replaced

In [10]:
_saltRemover = SaltRemover()
_inorganicPatt = Chem.MolFromSmarts("[!#6;!#7;!#8;!#16;!F;!Cl;!Br;!I]") # to remove compounds with unwanted atom types
_carbonPatt = Chem.MolFromSmarts("[#6]") # to remove compounds without carbon - inorganic
def standardize(mol):
    if mol.HasSubstructMatch(_carbonPatt):
        mol = _saltRemover(mol)
        if mol.GetNumAtoms()==0:
            return None
        else:
            mol, neutralized = NeutraliseCharges(mol)
            if mol.HasSubstructMatch(_inorganicPatt):
                return None
            else:
                # Sanitize mol, without sanitization some structures can't be drawn or fingerprinted
                Chem.SanitizeMol(mol) # add catch block?
                return mol
    else:
        return None

In [11]:
# load dataset
suppl = Chem.SDMolSupplier("../data/drugbank.sdf")
drug_bank = [[mol, 'drugbank'] for mol in suppl if mol]

with gzip.open("../data/actives_final.sdf.gz") as sdf:
    supp_actives = Chem.ForwardSDMolSupplier(sdf)
    actives = [[mol, 'acives'] for mol in supp_actives if mol]

with gzip.open("../data/decoys_final.sdf.gz") as sdf:
    supp_decoys = Chem.ForwardSDMolSupplier(sdf)
    decoys = [[mol, 'decoys'] for mol in supp_decoys if mol]

df_db = pd.DataFrame(drug_bank, columns=['Mol', 'Source'])
df_ac = pd.DataFrame(actives, columns=['Mol', 'Source'])
df_dc = pd.DataFrame(decoys, columns=['Mol', 'Source'])

df = pd.concat([df_db, df_ac, df_dc], ignore_index=True)
df

Unnamed: 0,Mol,Source
0,<rdkit.Chem.rdchem.Mol object at 0x7ffb05457610>,drugbank
1,<rdkit.Chem.rdchem.Mol object at 0x7ffb054578b0>,drugbank
2,<rdkit.Chem.rdchem.Mol object at 0x7ffb05457920>,drugbank
3,<rdkit.Chem.rdchem.Mol object at 0x7ffb054576f0>,drugbank
4,<rdkit.Chem.rdchem.Mol object at 0x7ffb05457990>,drugbank
...,...,...
42177,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721c40>,decoys
42178,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721cb0>,decoys
42179,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721d20>,decoys
42180,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721d90>,decoys


In [12]:
# standardize and create new column 'Standardised_Mol' and mol_InChI_Key and Standardised_InChI_Key

df['Standardised_Mol'] = df['Mol'].apply(standardize)

df['Mol_InChI_Key'] = df['Mol'].apply(lambda x: Chem.MolToInchiKey(x))
df['Standardised_InChI_Key'] = df['Standardised_Mol'].apply(lambda x: Chem.MolToInchiKey(x) if x else None)





[12:54:22] bond type above 3 (17) is treated as unspecified!
[12:54:22] bond type above 3 (17) is treated as unspecified!
[12:54:22] bond type above 3 (17) is treated as unspecified!
[12:54:22] bond type above 3 (17) is treated as unspecified!
[12:54:22] bond type above 3 (17) is treated as unspecified!
[12:54:22] bond type above 3 (17) is treated as unspecified!
[12:54:22] bond type above 3 (17) is treated as unspecified!
[12:54:22] bond type above 3 (17) is treated as unspecified!


Unnamed: 0,Mol,Source,Standardised_Mol,Mol_InChI_Key,Standardised_InChI_Key
0,<rdkit.Chem.rdchem.Mol object at 0x7ffb05457610>,drugbank,<rdkit.Chem.rdchem.Mol object at 0x7ffb047229d0>,OIRCOABEOLEUMC-GEJPAHFPSA-N,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,<rdkit.Chem.rdchem.Mol object at 0x7ffb054578b0>,drugbank,<rdkit.Chem.rdchem.Mol object at 0x7ffb04722ab0>,BLCLNMBMMGCOAS-URPVMXJPSA-N,BLCLNMBMMGCOAS-URPVMXJPSA-N
2,<rdkit.Chem.rdchem.Mol object at 0x7ffb05457920>,drugbank,<rdkit.Chem.rdchem.Mol object at 0x7ffb047234c0>,NFLWUMRGJYTJIN-NXBWRCJVSA-N,NFLWUMRGJYTJIN-NXBWRCJVSA-N
3,<rdkit.Chem.rdchem.Mol object at 0x7ffb054576f0>,drugbank,<rdkit.Chem.rdchem.Mol object at 0x7ffb04723610>,SBNPWPIBESPSIF-MHWMIDJBSA-N,SBNPWPIBESPSIF-MHWMIDJBSA-N
4,<rdkit.Chem.rdchem.Mol object at 0x7ffb05457990>,drugbank,<rdkit.Chem.rdchem.Mol object at 0x7ffb04723760>,PMATZTZNYRCHOR-CGLBZJNRSA-N,PMATZTZNYRCHOR-CGLBZJNRSA-N
...,...,...,...,...,...
42177,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721c40>,decoys,<rdkit.Chem.rdchem.Mol object at 0x7ffb041e99a0>,VSSUHXFPDJXHBD-UHFFFAOYSA-N,VSSUHXFPDJXHBD-UHFFFAOYSA-N
42178,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721cb0>,decoys,<rdkit.Chem.rdchem.Mol object at 0x7ffb041e95b0>,PPXICSRTDNVROV-JFIYKMOQSA-N,PPXICSRTDNVROV-JFIYKMOQSA-N
42179,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721d20>,decoys,<rdkit.Chem.rdchem.Mol object at 0x7ffb041e9070>,UQWJXBXCEOIEBM-UHFFFAOYSA-N,UQWJXBXCEOIEBM-UHFFFAOYSA-N
42180,<rdkit.Chem.rdchem.Mol object at 0x7ffb04721d90>,decoys,<rdkit.Chem.rdchem.Mol object at 0x7ffb041e8580>,RQDSBYXRWNTCND-SNVBAGLBSA-N,RQDSBYXRWNTCND-SNVBAGLBSA-N


In [15]:
# save standardised dataset

df_standardised = df
df_standardised.to_pickle('../data/df_standardised.pkl')

In [21]:
# do some statistics how many compounds were removed by standardization and 
# how many compounds were were changed by standardization and visualize

removed = df['Standardised_Mol'].isnull().sum()
print(f'Removed: {removed}')

# from what source are the removed compounds
df.loc[df['Standardised_Mol'].isnull(), 'Source'].value_counts()
# compare mol_inchi_key and standardised_inchi_key
number_of_changed_inchi_keys = df['Mol_InChI_Key'].ne(df['Standardised_InChI_Key']).sum()
print(f'Number of changed InChI keys: {number_of_changed_inchi_keys}')


Removed: 1051
Number of changed InChI keys: 26092
