# This notebook computes the Max Common Substructure (MCS) and the Tanimoto similarity scores between the generated molecules and the reference molecule.

MCS is computed using the ``rdkit.Chem.MCS`` module

Tanimoto similarity is computed using ``rdkit.DataStructs.TanimotoSimilarity`` module

In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdFMCS
import numpy as np
import pandas as pd
from pathlib import Path
import glob

In [9]:
# # input files must be in a list of SDF files, or you can use the following code to convert things into SDF format
# pdb_files = glob.glob('./mpro_ligands/*.pdb')
# pdb_files
# for pdb_file in pdb_files:
#     os.system(f'obabel -ipdb {pdb_file} -osdf -O {pdb_file.replace(".pdb", ".sdf")}')

In [11]:
def get_lig_similarity(target_ligs, reference_ligs):
    """
    Calculate the similarity between the ligands in the target and reference sets.
    Parameters
    ----------
    target_ligs : list
        List of the target ligands as SDF files.
    reference_ligs : list
        List of the reference ligands as SDF files.
    Returns
    -------
    similarity_df : DataFrame
        DataFrame containing the Tanimoto similarity between the target and reference ligands.
    """
    def load_molecule(file_path):
        supplier = Chem.SDMolSupplier(file_path)
        if supplier:
            mol = supplier[0]
            if mol:
                return Chem.RemoveHs(mol)
        return None

    target_lig_mols = []
    target_lig_files = []
    for lig in target_ligs:
        mol = load_molecule(lig)
        if mol:
            target_lig_mols.append(mol)
            target_lig_files.append(lig)

    reference_lig_mols = []
    reference_lig_files = []
    for lig in reference_ligs:
        mol = load_molecule(lig)
        if mol:
            reference_lig_mols.append(mol)
            reference_lig_files.append(lig)

    target_lig_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in target_lig_mols]
    reference_lig_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in reference_lig_mols]

    similarity = np.zeros((len(target_lig_fps), len(reference_lig_fps)))
    for i in range(len(target_lig_fps)):
        for j in range(len(reference_lig_fps)):
            similarity[i, j] = DataStructs.TanimotoSimilarity(target_lig_fps[i], reference_lig_fps[j])

    similarity_df = pd.DataFrame(similarity, index=[Path(lig).stem for lig in target_lig_files], columns=[Path(lig).stem for lig in reference_lig_files])
    return similarity_df


In [13]:
# Prepare your lists of SDF file paths
target_ligs = glob.glob('shapegauss/exp_9/*sdf')
reference_ligs = ['mpro_ligands/Mpro-x0072_0_ligand.sdf','mpro_ligands/Mpro-x0107_0_ligand.sdf']

# Use the function
taminoto_df = get_lig_similarity(target_ligs, reference_ligs)

# Display the result
taminoto_df

[16:01:19] Explicit valence for atom # 3 C, 5, is greater than permitted
[16:01:19] ERROR: Could not sanitize molecule ending on line 54
[16:01:19] ERROR: Explicit valence for atom # 3 C, 5, is greater than permitted


Unnamed: 0,Mpro-x0072_0_ligand,Mpro-x0107_0_ligand
mol_2024_07_09_1123541_000,0.060000,0.057692
mol_2024_07_09_1135030_000,0.057692,0.075472
mol_2024_07_09_1127490_000,0.087719,0.103448
mol_2024_07_09_1127030_000,0.019231,0.037736
mol_2024_07_09_1144101_000,0.033333,0.049180
...,...,...
mol_2024_07_09_1128071_000,0.120000,0.160000
mol_2024_07_09_1131010_000,0.081967,0.079365
mol_2024_07_09_1122262_000,0.061224,0.080000
mol_2024_07_09_1144183_000,0.041667,0.061224


In [16]:
def get_mcs(target_ligs, reference_ligs):
    """
    Find the maximum common substructure (MCS) between the ligands in the target and reference sets.
    Parameters
    ----------
    target_ligs : list
        List of the target ligands as SDF files.
    reference_ligs : list
        List of the reference ligands as SDF files.
    Returns
    -------
    mcs_result : rdFMCS.MCSResult
        Result containing the MCS information.
    """
    def load_molecule(file_path):
        supplier = Chem.SDMolSupplier(file_path)
        if supplier:
            mol = supplier[0]
            if mol:
                return Chem.RemoveHs(mol)
        return None

    target_lig_mols = []
    for lig in target_ligs:
        mol = load_molecule(lig)
        if mol:
            target_lig_mols.append(mol)

    reference_lig_mols = []
    for lig in reference_ligs:
        mol = load_molecule(lig)
        if mol:
            reference_lig_mols.append(mol)

    all_mols = target_lig_mols + reference_lig_mols

    mcs_result = rdFMCS.FindMCS(all_mols, minNumAtoms=2, maximize='bonds', atomCompare='elements',
                                bondCompare='bondtypes', matchValences=False, ringMatchesRingOnly=False,
                                completeRingsOnly=False, timeout=None, threshold=None)
    return mcs_result

# Prepare your lists of SDF file paths
target_ligs = glob.glob('shapegauss/exp_9/*.sdf')
reference_ligs = ['mpro_ligands/Mpro-x0072_0_ligand.sdf', 'mpro_ligands/Mpro-x0107_0_ligand.sdf']

# Use the function
mcs_result = get_mcs(target_ligs, reference_ligs)

# Display the result
print(f"SMARTS of MCS: {mcs_result.smartsString}")
print(f"Number of atoms in MCS: {mcs_result.numAtoms}")
print(f"Number of bonds in MCS: {mcs_result.numBonds}")


[16:28:20] Explicit valence for atom # 3 C, 5, is greater than permitted
[16:28:20] ERROR: Could not sanitize molecule ending on line 54
[16:28:20] ERROR: Explicit valence for atom # 3 C, 5, is greater than permitted


ArgumentError: Python argument types in
    rdkit.Chem.rdFMCS.FindMCS(list)
did not match C++ signature:
    FindMCS(boost::python::api::object mols, RDKit::PyMCSParameters {lvalue} parameters)
    FindMCS(boost::python::api::object mols, bool maximizeBonds=True, double threshold=1.0, unsigned int timeout=3600, bool verbose=False, bool matchValences=False, bool ringMatchesRingOnly=False, bool completeRingsOnly=False, bool matchChiralTag=False, RDKit::AtomComparator atomCompare=rdkit.Chem.rdFMCS.AtomCompare.CompareElements, RDKit::BondComparator bondCompare=rdkit.Chem.rdFMCS.BondCompare.CompareOrder, RDKit::RingComparator ringCompare=rdkit.Chem.rdFMCS.RingCompare.IgnoreRingFusion, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > seedSmarts='')

In [17]:

def get_mcs(target_ligs, reference_ligs):
    """
    Find the maximum common substructure (MCS) between the ligands in the target and reference sets.
    Parameters
    ----------
    target_ligs : list
        List of the target ligands as SDF files.
    reference_ligs : list
        List of the reference ligands as SDF files.
    Returns
    -------
    mcs_result : rdFMCS.MCSResult
        Result containing the MCS information.
    """
    def load_molecule(file_path):
        supplier = Chem.SDMolSupplier(file_path, removeHs=False)
        if supplier:
            mol = supplier[0]
            if mol:
                try:
                    Chem.SanitizeMol(mol)
                    return mol
                except:
                    return None
        return None

    target_lig_mols = [load_molecule(lig) for lig in target_ligs]
    target_lig_mols = [mol for mol in target_lig_mols if mol is not None]

    reference_lig_mols = [load_molecule(lig) for lig in reference_ligs]
    reference_lig_mols = [mol for mol in reference_lig_mols if mol is not None]

    all_mols = target_lig_mols + reference_lig_mols

    if not all_mols:
        raise ValueError("No valid molecules loaded to find MCS.")
    
    mcs_result = rdFMCS.FindMCS(all_mols, minNumAtoms=2, maximize=True, atomCompare=rdFMCS.AtomCompare.CompareElements,
                                bondCompare=rdFMCS.BondCompare.CompareOrder, matchValences=False, ringMatchesRingOnly=False,
                                completeRingsOnly=False)
    return mcs_result

# Prepare your lists of SDF file paths
target_ligs = glob.glob('shapegauss/exp_9/*.sdf')
reference_ligs = ['mpro_ligands/Mpro-x0072_0_ligand.sdf', 'mpro_ligands/Mpro-x0107_0_ligand.sdf']

# Use the function
mcs_result = get_mcs(target_ligs, reference_ligs)

# Prepare your lists of SDF file paths
target_ligs = glob.glob('shapegauss/exp_9/*.sdf')
reference_ligs = ['mpro_ligands/Mpro-x0072_0_ligand.sdf', 'mpro_ligands/Mpro-x0107_0_ligand.sdf']

# Use the function
mcs_result = get_mcs(target_ligs, reference_ligs)

# Display the result
print(f"SMARTS of MCS: {mcs_result.smartsString}")
print(f"Number of atoms in MCS: {mcs_result.numAtoms}")
print(f"Number of bonds in MCS: {mcs_result.numBonds}")

[16:31:29] Explicit valence for atom # 7 C, 5, is greater than permitted
[16:31:29] ERROR: Could not sanitize molecule ending on line 54
[16:31:29] ERROR: Explicit valence for atom # 7 C, 5, is greater than permitted
[16:31:29] Explicit valence for atom # 12 C, 5, is greater than permitted
[16:31:29] ERROR: Could not sanitize molecule ending on line 54
[16:31:29] ERROR: Explicit valence for atom # 12 C, 5, is greater than permitted


ArgumentError: Python argument types in
    rdkit.Chem.rdFMCS.FindMCS(list)
did not match C++ signature:
    FindMCS(boost::python::api::object mols, RDKit::PyMCSParameters {lvalue} parameters)
    FindMCS(boost::python::api::object mols, bool maximizeBonds=True, double threshold=1.0, unsigned int timeout=3600, bool verbose=False, bool matchValences=False, bool ringMatchesRingOnly=False, bool completeRingsOnly=False, bool matchChiralTag=False, RDKit::AtomComparator atomCompare=rdkit.Chem.rdFMCS.AtomCompare.CompareElements, RDKit::BondComparator bondCompare=rdkit.Chem.rdFMCS.BondCompare.CompareOrder, RDKit::RingComparator ringCompare=rdkit.Chem.rdFMCS.RingCompare.IgnoreRingFusion, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > seedSmarts='')

In [18]:
def get_mcs(target_ligs, reference_ligs):
    """
    Find the maximum common substructure (MCS) between the ligands in the target and reference sets.
    Parameters
    ----------
    target_ligs : list
        List of the target ligands as SDF files.
    reference_ligs : list
        List of the reference ligands as SDF files.
    Returns
    -------
    mcs_result : rdFMCS.MCSResult
        Result containing the MCS information.
    """
    def load_molecule(file_path):
        supplier = Chem.SDMolSupplier(file_path, removeHs=False)
        if supplier:
            mol = supplier[0]
            if mol:
                try:
                    Chem.SanitizeMol(mol)
                    return mol
                except:
                    return None
        return None

    target_lig_mols = [load_molecule(lig) for lig in target_ligs]
    target_lig_mols = [mol for mol in target_lig_mols if mol is not None]

    reference_lig_mols = [load_molecule(lig) for lig in reference_ligs]
    reference_lig_mols = [mol for mol in reference_lig_mols if mol is not None]

    all_mols = target_lig_mols + reference_lig_mols

    if not all_mols:
        raise ValueError("No valid molecules loaded to find MCS.")

    # Create MCS parameters
    mcs_params = rdFMCS.MCSParameters()
    mcs_params.atomCompare = rdFMCS.AtomCompare.CompareElements
    mcs_params.bondCompare = rdFMCS.BondCompare.CompareOrder
    mcs_params.matchValences = False
    mcs_params.ringMatchesRingOnly = False
    mcs_params.completeRingsOnly = False
    mcs_params.minNumAtoms = 2
    mcs_params.maximizeBonds = True

    # Find the MCS
    mcs_result = rdFMCS.FindMCS(all_mols, mcs_params)
    return mcs_result

# Prepare your lists of SDF file paths
target_ligs = glob.glob('shapegauss/exp_9/*.sdf')
reference_ligs = ['mpro_ligands/Mpro-x0072_0_ligand.sdf', 'mpro_ligands/Mpro-x0107_0_ligand.sdf']

# Use the function
mcs_result = get_mcs(target_ligs, reference_ligs)

# Display the result
print(f"SMARTS of MCS: {mcs_result.smartsString}")
print(f"Number of atoms in MCS: {mcs_result.numAtoms}")
print(f"Number of bonds in MCS: {mcs_result.numBonds}")

SMARTS of MCS: [#6]
Number of atoms in MCS: 1
Number of bonds in MCS: 0


[16:59:07] Explicit valence for atom # 7 C, 5, is greater than permitted
[16:59:07] ERROR: Could not sanitize molecule ending on line 54
[16:59:07] ERROR: Explicit valence for atom # 7 C, 5, is greater than permitted
[16:59:07] Explicit valence for atom # 12 C, 5, is greater than permitted
[16:59:07] ERROR: Could not sanitize molecule ending on line 54
[16:59:07] ERROR: Explicit valence for atom # 12 C, 5, is greater than permitted
