# Standardization of SMILES and calculation of Mordred descriptors from mol representation

In [None]:
# Loading necessary libraries
import pandas as pd
import numpy as np
import pathlib

import warnings
warnings.filterwarnings('ignore', category = RuntimeWarning)

# Reading the data

In [None]:
#file_in = ...
file_in = 'smiles.csv'
sep = ' '  # column delimiter; only used when the input is read as delimited text
# Getting the file extension (lower-cased so that e.g. '.XLSX' is still recognised as Excel)
file_extension = pathlib.Path(file_in).suffix.lower()

# Reading the data into dataframe based on file extension
if file_extension in ('.xlsx', '.xlsm'):
    data = pd.read_excel(file_in)
else:
    # Anything that is not an Excel workbook is treated as delimited text
    data = pd.read_csv(file_in, sep=sep, low_memory=False)

# Standardization

In [None]:
# Installing rdkit
! pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
# Loading necessary libraries
from IPython.display import SVG # to use Scalar Vector Graphics (SVG) not bitmaps, for cleaner lines
import rdkit
from rdkit import Chem, rdBase
from rdkit.Chem import AllChem, Draw
from rdkit.Chem import Draw # to draw molecules
from rdkit.Chem.Draw import IPythonConsole # to draw inline in iPython
from rdkit.Chem import rdDepictor  # to generate 2D depictions of molecules
from rdkit.Chem.Draw import rdMolDraw2D # to draw 2D molecules using vectors
from rdkit.Chem.MolStandardize import rdMolStandardize

from rdkit import RDLogger # to suppress rdkit messages
RDLogger.DisableLog('rdApp.*')

In [None]:
def draw_mol_with_SVG(mol, molSize=(450, 150)):
    """Display a 2D depiction of ``mol`` inline as an SVG image.

    Used by ``standardize_mol`` to show each intermediate structure when
    ``verbose=True``.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to draw. It is copied first, so the caller's object is not modified.
    molSize : tuple of int, optional
        (width, height) of the drawing canvas in pixels.
    """
    # Local import keeps this helper usable even if `display` is not a builtin in the host shell
    from IPython.display import display

    mol_copy = Chem.Mol(mol)
    # Generate 2D coordinates only when the molecule carries no conformer yet
    if not mol_copy.GetNumConformers():
        rdDepictor.Compute2DCoords(mol_copy)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0], molSize[1])
    drawer.DrawMolecule(mol_copy)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    # Strip the 'svg:' namespace prefix that some RDKit versions emit
    display(SVG(svg.replace('svg:', '')))


def standardize_mol(mol, verbose=False):
    """Standardize an RDKit molecule.

    The steps are applied in this order:
      1. ``rdMolStandardize.Cleanup``
      2. ``rdMolStandardize.FragmentParent`` (keep the parent fragment)
      3. ``Uncharger.uncharge`` (neutralize)
      4. ``TautomerEnumerator.Canonicalize`` (canonical tautomer)

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Input molecule.
    verbose : bool, optional
        If True, print a caption and draw the structure after every step, and
        print the SMILES of the final molecule.

    Returns
    -------
    rdkit.Chem.Mol
        The standardized molecule.

    Raises
    ------
    AssertionError
        If the final molecule is None.
    """
    clean_mol = rdMolStandardize.Cleanup(mol)
    if verbose:
        print('Remove hydrogens, disconnected metal atoms, normalize the molecule, reionize the molecule:')
        draw_mol_with_SVG(clean_mol)

    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
    if verbose:
        print('Select the "parent" fragment:')
        draw_mol_with_SVG(parent_clean_mol)

    uncharger = rdMolStandardize.Uncharger()
    uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
    if verbose:
        print('Neutralize the molecule:')
        draw_mol_with_SVG(uncharged_parent_clean_mol)

    taut_enum = rdMolStandardize.TautomerEnumerator()
    taut_uncharged_parent_clean_mol = taut_enum.Canonicalize(uncharged_parent_clean_mol)
    # Fail early if canonicalization produced nothing, before the result is drawn or returned
    assert taut_uncharged_parent_clean_mol is not None
    if verbose:
        print('Enumerate tautomers:')
        draw_mol_with_SVG(taut_uncharged_parent_clean_mol)
        print(Chem.MolToSmiles(taut_uncharged_parent_clean_mol))

    return taut_uncharged_parent_clean_mol


In [None]:
def smiles_to_standardized_mol(smiles, verbose=False):
    """Parse a SMILES string and return the standardized RDKit molecule.

    Parameters
    ----------
    smiles : str
        SMILES string to parse. Any non-string value (e.g. the NaN/None that
        an empty table cell yields) is treated as invalid.
    verbose : bool, optional
        If True, print the input, report invalid input, and forward the flag
        to ``standardize_mol``.

    Returns
    -------
    rdkit.Chem.Mol or None
        The standardized molecule, or None when the input is not a string or
        cannot be parsed.
    """
    # The RDKit parser only accepts strings; a missing value would otherwise
    # raise and abort a whole DataFrame.apply run.
    if not isinstance(smiles, str):
        if verbose:
            print(f"Invalid SMILES string: {smiles}")
        return None

    if verbose:
        print(smiles)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        if verbose:
            print(f"Invalid SMILES string: {smiles}")
        return None
    return standardize_mol(mol, verbose=verbose)

def standardize_smiles_from_mol(mol, verbose=False):
    """Return the SMILES string of ``mol``, or None when ``mol`` is None.

    ``verbose`` is accepted for signature consistency with the other helpers
    and is not used.
    """
    # A missing molecule propagates as a missing SMILES
    return None if mol is None else Chem.MolToSmiles(mol)

def standardize_inchikey_from_mol(mol, verbose=False):
    """Return the InChIKey of ``mol``, or None when ``mol`` is None.

    ``verbose`` is accepted for signature consistency with the other helpers
    and is not used.
    """
    # A missing molecule propagates as a missing InChIKey
    return None if mol is None else Chem.MolToInchiKey(mol)

def molecular_formula(mol, verbose=False):
    """Return the molecular formula of ``mol``, or None when ``mol`` is None.

    ``verbose`` is accepted for signature consistency with the other helpers
    and is not used.
    """
    if mol is None:
        return None
    # Import the submodule explicitly: the bare attribute chain
    # rdkit.Chem.rdMolDescriptors only resolves if some other import has
    # already loaded that submodule as a side effect.
    from rdkit.Chem import rdMolDescriptors
    return rdMolDescriptors.CalcMolFormula(mol)

In [None]:
# Parse and standardize every SMILES entry (verbose defaults to False); unparsable entries become None
data['mol'] = data['SMILES'].apply(smiles_to_standardized_mol)

In [None]:
# SMILES of each standardized molecule (None where no molecule is available)
data['standardized_SMILES'] = data['mol'].apply(standardize_smiles_from_mol)

In [None]:
# InChIKey of each standardized molecule (None where no molecule is available)
data['standardized_inchikey'] = data['mol'].apply(standardize_inchikey_from_mol)

In [None]:
# Molecular formula of each standardized molecule (None where no molecule is available)
data['standardized_MF'] = data['mol'].apply(molecular_formula)

In [None]:
file_out = 'standardized_data.tsv'
# The 'mol' column holds RDKit Mol objects, which would be written as object reprs
# (memory addresses that differ on every run), so it is left out of the export.
# The former `quoting=False` equalled csv.QUOTE_MINIMAL (0), the pandas default, so it is omitted.
data.drop(columns=['mol'], errors='ignore').to_csv(file_out, sep='\t', index=False)

# Mordred descriptors

In [None]:
# Installing Mordred
! pip install mordred
# Loading necessary libraries
from mordred import Calculator, descriptors



In [None]:
# NOTE(review): no conformer-generation step is visible in this notebook, so the
# 3D descriptors presumably cannot be computed for these molecules — confirm
# whether ignore_3D=True was intended.
calc = Calculator(descriptors, ignore_3D=False) # create descriptor calculator; ignore_3D=False keeps the 3D descriptors in the set
print(f'Number of calculated descriptors: {len(calc.descriptors)}')

#calc_3D = Calculator(descriptors, ignore_3D=False) # create descriptor calculator with all descriptors
#print(f'Number of calculated descriptors: {len(calc_3D.descriptors)}')


Number of calculated descriptors: 1826


#### Calculating the descriptors for all chemicals in the dataset that have a valid standardized molecule

In [None]:
# Keep only the rows for which a standardized molecule is available
has_mol = data['mol'].notna()
data_cleaned = data.loc[has_mol]

In [None]:
# Compute the descriptors for every remaining molecule
calc_mordred = calc.pandas(data_cleaned['mol'])
# Tie each descriptor row to its source row explicitly: the column-wise concat that follows
# aligns on the index, and data_cleaned no longer has a contiguous index after the dropna.
# A length mismatch raises here instead of silently mis-pairing rows.
calc_mordred.index = data_cleaned.index
# Descriptor values that are not numeric (e.g. error/missing markers) become NaN
calc_mordred = calc_mordred.apply(pd.to_numeric, errors='coerce')

100%|██████████| 47058/47058 [3:50:24<00:00,  3.40it/s]


Removed rows with indices: Index([83298], dtype='int64')


In [None]:
# Put the descriptor columns next to the source columns (rows are matched on the index)
frames = (data_cleaned, calc_mordred)
data_mordred = pd.concat(frames, axis='columns')

In [None]:
file_out = 'Mordred_descs.tsv'
# The 'mol' column holds RDKit Mol objects, which would be written as object reprs
# (memory addresses that differ on every run), so it is left out of the export.
# The former `quoting=False` equalled csv.QUOTE_MINIMAL (0), the pandas default, so it is omitted.
data_mordred.drop(columns=['mol'], errors='ignore').to_csv(file_out, sep='\t', index=False)