# Compute molecular descriptors using `rdkit`



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from itertools import combinations

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Descriptors, Lipinski, rdMolDescriptors
from rdkit.Chem.rdmolops import FastFindRings, GetFormalCharge

In [2]:
# Import the dataframe with all molecules
input_file = './rdKit_db_molecules.obj'
df_mols = pd.read_pickle(input_file)

# Compute some molecular properties using rdkit
df_mols['MW']        = df_mols['mol_rdk'].apply(Descriptors.MolWt)
df_mols['num_atoms'] = df_mols['mol_rdk'].apply(Chem.Lipinski.HeavyAtomCount)
# Initialize ringInfo
df_mols['mol_rdk'].apply(FastFindRings)
df_mols['num_rot']   = df_mols['mol_rdk'].apply(Chem.Lipinski.NumRotatableBonds)
df_mols['num_rings'] = df_mols['mol_rdk'].apply(rdMolDescriptors.CalcNumRings)
df_mols['charge']    = df_mols['mol_rdk'].apply(GetFormalCharge)

# Reset the index to simplify working with each library
df_mols = df_mols.reset_index()
df_mols = df_mols.rename({'level_0': 'library'}, axis = 'columns')

# Healthy inspection
df_mols.head()

Unnamed: 0,library,Lig,Activity,mol_rdk,sanitized,MW,num_atoms,num_rot,num_rings,charge
0,COCRYS,STU,1,<rdkit.Chem.rdchem.Mol object at 0x7fec4f567df0>,True,465.533,35,2,8,1
1,COCRYS,ATP,1,<rdkit.Chem.rdchem.Mol object at 0x7fec4f567e30>,True,503.15,31,8,3,-4
2,COCRYS,PVB,1,<rdkit.Chem.rdchem.Mol object at 0x7fec4f56b0b0>,True,259.7,18,2,3,0
3,COCRYS,DTQ,1,<rdkit.Chem.rdchem.Mol object at 0x7fec4f56b0f0>,True,297.314,22,4,3,0
4,COCRYS,HMD,1,<rdkit.Chem.rdchem.Mol object at 0x7fec4f56b170>,False,323.13,19,0,3,0


How many actives are there in each molecular library?

In [3]:
display(df_mols\
            .groupby('library')['Activity']\
            .value_counts()
       )

library  Activity
COCRYS   1            315
CSAR     0             85
         1             26
DEKOIS2  0           1200
         1             40
DUD      0           2074
         1             72
Name: Activity, dtype: int64

## Compute molecular descriptors

In [4]:
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import DataStructs

In [5]:
# Define a helper function to return a fingerprint matrix
def fps_to_array(fps, drop_constant_cols=True):
    '''Convert a list of molecular fingerprints into a 
       numpy m*n ndarray, where n = # molecules and n = # fps'''
    def fp2arr(fp):
        arr = np.zeros((0,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        return arr
    X = np.asarray([fp2arr(fp) for fp in fps])
    if drop_constant_cols:
        X = pd.DataFrame(X)
        X = X.loc[:, X.var() != 0.0] 
    return X 

#### MACCS Fingerprints

In [6]:
# Compute Maccs Fingerprints
fps_all_maccs = [MACCSkeys.GenMACCSKeys(i) 
                 for i in df_mols.mol_rdk]
X_all_maccs = fps_to_array(fps_all_maccs)

print("MACCS Fps:", X_all_maccs.shape)

MACCS Fps: (3812, 147)


#### Morgan Fingerprints


In [7]:
# Compute Morgan Fingerprints
fps_all_morgan = [GetMorganFingerprintAsBitVect(i, radius=2) 
                  for i in df_mols.mol_rdk]
X_all_morgan = fps_to_array(fps_all_morgan)

print("Morgan Fps:", X_all_morgan.shape)

Morgan Fps: (3812, 2048)


#### RDKFingerprints

In [8]:
# Compute RDKit Fingerprints
fps_all_rdk = [Chem.RDKFingerprint(i) 
               for i in df_mols.mol_rdk]
X_all_rdk = fps_to_array(fps_all_rdk)

print("RDKit Fps:", X_all_rdk.shape)

RDKit Fps: (3812, 2048)


#### Physiological Properties
**DUD:**
1. Mol. Weight
2. Log P
3. number Hb acceptors 
4. number Hb donnors
5. number Rotatable bonds

**DEKOIS 2.0:**
1. molecular weight
2. logP
3. number of hydrogen bond donors
4. number of hydrogen bond acceptors
5. number of rotatable bonds
6. population of negatively charged states
7. population of positively charged states
8. number of aromatic rings.

In [9]:
from rdkit.Chem import rdPartialCharges
from rdkit.ML.Descriptors import MoleculeDescriptors

def get_num_charges(mol):
    '''A simple function to count the number of 
       positive and negative charges of a given molecule'''
    mol.ComputeGasteigerCharges()
    n = mol.GetNumAtoms()
    charges = np.array(
        [float(mol.GetAtomWithIdx(i)\
        .GetProp('_GasteigerCharge')) for i in range(n)])
    n_pos = np.sum(charges >= 0)
    n_neg = np.sum(charges < 0)
    return (n_pos, n_neg)

In [10]:
descriptors = ['MolWt', 
               'MolLogP', 
               'NumHAcceptors', 
               'NumHDonors', 
               'NumRotatableBonds', 
               'NumAromaticRings']

# Define a rdkit molecular descriptors calculator
rdk_calculator = MoleculeDescriptors\
                    .MolecularDescriptorCalculator(descriptors)

# Compute the first six descriptors
df_phy_desc =  pd.DataFrame(
                   df_mols.mol_rdk\
                          .apply(
                              rdk_calculator.CalcDescriptors
                           ).to_list(), 
                  columns = descriptors,
                  index = df_mols['library'])

# Now compute the number of charges
df_charges = pd.DataFrame(df_mols.mol_rdk\
             .apply(get_num_charges).tolist(), 
             columns = ('NumPositiveChr', 'NumNegativeChr'),
             index = df_mols['library'])

X_all_phy = pd.concat([df_phy_desc, df_charges], axis=1)

print("Phy. Properties:", X_all_phy.shape)

Phy. Properties: (3812, 8)


## Find duplicated molecules using Morgan Fingerprints

In [11]:
%run ../helper_modules/run_or_load.py

In [12]:
@run_or_load
def find_duplicated_molecules(filename, df, 
                              fps, sim_thr = 1):
    '''
    A helper function to identify duplicated molecules 
    regarding to a set of molecular descriptors and a 
    similarity threshold
    '''
    indices = df.index.values
    names = df.Lig.values
    mols = df.mol_rdk.values
    libraries = df.library.values

    matched_molecules = []
    for j, k in combinations(df.index.values, 2):
        fp_sim = DataStructs.FingerprintSimilarity(fps[j], fps[k],
                       metric = DataStructs.TanimotoSimilarity)
        if fp_sim == sim_thr:
           matched_molecules.append( 
               (names[j], names[k], 
                libraries[j] + '_' + libraries[k],
                names[j] + '$' + libraries[j],
                names[k] + '$' + libraries[k]
               )
           )
    # Create a dataframe
    df_matched_mols = pd.DataFrame(
                         matched_molecules, 
                         columns = ['mol1', 'mol2', 
                                    'libraries', 'mol1_lib1', 
                                    'mol2_lib2'])
    return df_matched_mols

##### Identify the molecules that appear in more than one library

In [13]:
%%time
# Identify the molecules that appear in more than one library
# We will use the Morgan Fingerprints
file_unique_mols = './df_unique_mols_among_libraries.obj'
df_unique_mols   = find_duplicated_molecules(
                        filename = file_unique_mols, 
                        df = df_mols, 
                        fps = fps_all_morgan
                       )
# Sort by library and by mol colum
df_unique_mols.sort_values(['libraries', 'mol1'], inplace=True)

File saved: ./df_unique_mols_among_libraries.obj
CPU times: user 50.2 s, sys: 235 ms, total: 50.4 s
Wall time: 50.9 s


In [14]:
# Set of all repeated molecules
unique_repeated_mols = set(df_unique_mols.mol2_lib2.unique().tolist() + \
                           df_unique_mols.mol1_lib1.unique().tolist())

In [15]:
print('- There are', 
      df_unique_mols.shape[0], 
      'pairs of duplicated molecules =', 
      df_unique_mols.shape[0]*2, 
      'molecules.')

print('-', df_unique_mols.mol1.str.contains('^decoy').sum() + 
      df_unique_mols.mol2.str.contains('^decoy').sum(), 'are decoys.')

print('- Among all duplicates', len(df_unique_mols.mol2_lib2.unique()),  
      'mols are different (as judged by the Descriptors).')

- There are 478 pairs of duplicated molecules = 956 molecules.
- 860 are decoys.
- Among all duplicates 390 mols are different (as judged by the Descriptors).


In [16]:
df_unique_mols['libraries'].value_counts().sort_index()

COCRYS_COCRYS        2
COCRYS_CSAR         10
COCRYS_DEKOIS2       1
COCRYS_DUD          18
DEKOIS2_DEKOIS2      3
DEKOIS2_DUD          4
DUD_DUD            440
Name: libraries, dtype: int64

### Update Data frames by removing repeated molecules

- We kept molecules from the second column to avoid keep COCRYS molecules over other libraries 

In [17]:
def drop_repeated_mols(df, mols_to_drop, temp_index):
    '''Helper function to drop molecules of a given dataframe.'''
    temp_df = df.copy()
    temp_df['temp_index'] = temp_index
    temp_df = temp_df.set_index('temp_index')
    # drop molecules
    temp_df = temp_df.drop(mols_to_drop, axis=0)
    # reset index and drop temporal column
    temp_df = temp_df.reset_index().drop('temp_index', axis=1)
    return temp_df

In [18]:
# We'll keep molecules from the second column 
# to avoid keep COCRYS molecules over other libraries
set_to_keep = df_unique_mols.mol2_lib2.unique()
# Now, from the set of all repeated molecules remove those to be kept
set_to_drop = unique_repeated_mols.difference(set_to_keep)
    
print('Number of molecules to drop', len(set_to_drop))

Number of molecules to drop 324


In [19]:
#*********************************
# Update the original data frames
#*********************************

# From the original MERGED dataset, 
# create a temporal column with mol_name$library and set it as index
temp_index = df_mols[['Lig', 'library']]\
                         .apply(
                              lambda x: 
                                x.values[0] + '$' + x.values[1], 
                                axis=1
                          )\
                         .to_list()

# Main DataFrame
df_all_mols = drop_repeated_mols(df_mols, set_to_drop, temp_index)
print('Number of remaining molecules:')
print(df_all_mols.shape)

# MACCS Fps
X_all_maccs = drop_repeated_mols(X_all_maccs, set_to_drop, temp_index)
print(X_all_maccs.shape)
# Morgan Fps
X_all_morgan = drop_repeated_mols(X_all_morgan, set_to_drop, temp_index)
print(X_all_maccs.shape)
# RDKit Fps
X_all_rdk = drop_repeated_mols(X_all_rdk, set_to_drop, temp_index)
print(X_all_maccs.shape)
# Phys Properties
X_all_phy = drop_repeated_mols(X_all_phy, set_to_drop, temp_index)
print(X_all_phy.shape)

Number of remaining molecules:
(3488, 10)
(3488, 147)
(3488, 147)
(3488, 147)
(3488, 8)


## Save the fingertprints and the final DataFrame

In [20]:
# Save molecular fingerprints in a single object
fps_file = './rdkit_fingerprints_all_libraries_no_duplicates.obj'
all_fps = {
    'MACCS': X_all_maccs, 
    'Morgan': X_all_morgan,
    'RDKFps': X_all_rdk,
    'Phys': X_all_phy 
}

with open(fps_file, 'wb') as f:
    joblib.dump(all_fps, filename = f)

In [21]:
# Save the molecules dataframe without duplicates
df_out_file = './df_MOLECULES_no_duplicates.obj'
df_all_mols.to_pickle(df_out_file)