# mordred descriptors

In [None]:
# This process was done in separate stages because each descriptor module needs its own conda environment, i.e. one for mordred, one for Mold2 and one for RDKit.

# imports

import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, AllChem
from mordred import Calculator, descriptors

# Load the tobramycin dataset and keep the SMILES column for later re-insertion.
tob_data = pd.read_csv(r'M:\ML_scripts\tobramycin_data_class_struct.csv')
tob_data = tob_data.drop(columns=['Unnamed: 0']) # presumably a leftover index column from an earlier export
tob_smiles = tob_data['Structure']

calc = Calculator(descriptors, ignore_3D=False) # mordred calculator with the 3D descriptors enabled

mols = [Chem.MolFromSmiles(smi) for smi in tob_smiles] # for each SMILES, build an RDKit mol object

# MolFromSmiles returns None for a SMILES it cannot parse; stop here with the
# offending row positions instead of failing later inside the embedding loop.
unparsed = [pos for pos, mol in enumerate(mols) if mol is None]
if unparsed:
    raise ValueError(
        f'{len(unparsed)} SMILES could not be parsed by RDKit; first row positions: {unparsed[:10]}'
    )

EMBED_SEED = 42 # fixed seed so the embedded 3D geometries are reproducible between runs

mols_3d = []
for mol in mols:
    # AddHs does not modify its argument: it returns a new molecule with explicit
    # hydrogens, so the result has to be kept and embedded.
    mol_h = Chem.AddHs(mol)
    # EmbedMolecule writes a conformer onto mol_h in place (it returns -1 if embedding fails).
    AllChem.EmbedMolecule(mol_h, randomSeed=EMBED_SEED)
    mols_3d.append(mol_h)

# Compute descriptors on the hydrogen-complete, embedded molecules so the 3D
# descriptors see the geometry that was generated above.
df = calc.pandas(mols_3d) # calculates full set of mordred descriptors

def is_numeric(column):
    """Return True when every entry of `column` can be read as a number.

    Values that cannot be converted (text, error objects) and missing values
    become NaN under `errors='coerce'`, so a single such entry makes the
    whole column count as non-numeric.
    """
    coerced = pd.to_numeric(column, errors='coerce')
    has_value = coerced.notna()
    return has_value.all()

# Gather the descriptor columns that fail the numeric check (text or error
# entries), then build a cleaned copy of the frame without them.
non_numeric_columns = []
for col in df.columns:
    if is_numeric(df[col]):
        continue
    non_numeric_columns.append(col)

df_cleaned = df.drop(columns=non_numeric_columns)

# processing time: 10m 15.4s

In [None]:
# The descriptor frame is built from the mol objects, so it has no SMILES column;
# put the SMILES back as the first column.
# DataFrame.insert raises ValueError if the column already exists, so the guard
# keeps this cell safe to re-run.
if 'Structure' not in df_cleaned.columns:
    df_cleaned.insert(0, 'Structure', tob_smiles)

In [None]:
structure = tob_data['Structure']
# The descriptor frame was created from the SMILES alone, so it carries no class
# label; take it from the source data and place it right after 'Structure'.
classifier = tob_data['Class']
# DataFrame.insert raises ValueError if the column already exists, so the guard
# keeps this cell safe to re-run.
if 'Class' not in df_cleaned.columns:
    df_cleaned.insert(1, 'Class', classifier)

In [None]:
# Save the cleaned mordred descriptors (with 'Structure' and 'Class') as a .csv
# file so the notebook that adds the mold2 descriptors can read them back in.
# The row index is written as well, since index=False is not passed.
df_cleaned.to_csv('./tobramycin_mordred_descriptors.csv')

# mold2 descriptors

In [None]:
from Mold2_pywrapper import Mold2

# Location of the zip file holding the Mold2 executable.
path_to_zipfile = 'M:/ML_scripts/data/Mold2-Executable-File.zip'

# Build the Mold2 wrapper from that zip file (much easier to handle than the
# errors I've had from installing the pywrapper in conda).
mold2 = Mold2.from_executable(path_to_zipfile)

In [None]:
import pandas as pd

# Read the mordred descriptors saved by the previous stage, then keep the
# SMILES in their own single-column frame for the Mold2 calculation.
main = pd.read_csv('M:/ML_scripts/notebooks/tobramycin_mordred_descriptors.csv')
data = main['Structure'].to_frame()

In [None]:
from rdkit import Chem

# Again, convert each SMILES string into an RDKit mol object.
mols = []
for smiles in data['Structure']:
    mols.append(Chem.MolFromSmiles(smiles))

# Calculate the mold2 descriptors for all molecules, without the banner output.
descriptors = mold2.calculate(mols, show_banner=False)

In [None]:
# Convert the Mold2 output into a dataframe.
mold2_df = pd.DataFrame.from_dict(descriptors, orient='columns')

# DataFrame.rename returns a new frame rather than modifying in place, so the
# result must be assigned back for the descriptor titles to be kept in mold2_df
# (and in everything built from it afterwards).
# NOTE(review): assumes descriptor_details() maps the current column names to
# their titles - confirm against the Mold2_pywrapper documentation.
mold2_df = mold2_df.rename(columns=mold2.descriptor_details())

mold2_df.head()

In [None]:
def is_numeric(column):
    """Tell whether all values in `column` convert cleanly to numbers.

    Conversion uses `errors='coerce'`, which turns anything unparsable into
    NaN; the column is numeric only if no NaN is left afterwards.
    """
    as_numbers = pd.to_numeric(column, errors='coerce')
    return as_numbers.notna().all()

# Again, find the Mold2 columns that contain text (anything that fails the
# numeric check) and drop them into a cleaned copy.
non_numeric_columns = []
for col in mold2_df.columns:
    if not is_numeric(mold2_df[col]):
        non_numeric_columns.append(col)

mold2_df_clean = mold2_df.drop(columns=non_numeric_columns)

In [None]:
# Join the mordred and mold2 dataframes side by side (rows are matched on index).
mordred_mold2 = pd.concat([main, mold2_df_clean], axis='columns')

In [None]:
# Save the combined mordred + mold2 frame as a .csv checkpoint, in case of any
# errors in the later steps.
mordred_mold2.to_csv('../mordred_mold2_tob_5889.csv')

In [None]:
from rdkit.Chem import Descriptors

# Calculate the RDKit descriptors for every molecule; one result per mol.
rdkit_descr = [Descriptors.CalcMolDescriptors(mol) for mol in mols]

# One row per molecule, one column per descriptor.
rdkit_df = pd.DataFrame(rdkit_descr)

# Again, clean the dataframe: drop any column that fails the numeric check.
non_numeric_columns_rdkit = []
for col in rdkit_df.columns:
    if not is_numeric(rdkit_df[col]):
        non_numeric_columns_rdkit.append(col)
rdkit_df_clean = rdkit_df.drop(columns=non_numeric_columns_rdkit)

# Join the RDKit frame onto mordred + mold2, then remove the index column that
# came along when the mordred .csv was read back in.
mordred_mold2_rdkit_df = pd.concat([mordred_mold2, rdkit_df_clean], axis='columns')
mordred_mold2_rdkit_df = mordred_mold2_rdkit_df.drop(columns=['Unnamed: 0'])

# Save as final csv.
mordred_mold2_rdkit_df.to_csv('../mordred_mold2_rdkit_descr_tob_5889.csv')

# final result is a dataframe of 5889 macrocycles and 2416 descriptors (structure and class make up the total 2418 columns)