In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import inchi

from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

In [None]:
#Load the compounds for which we are creating molecular descriptors

def load_compounds(file_name):
    """Read the compound table from a CSV file.

    Parameters
    ----------
    file_name : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The table, with the first CSV column used as the index.
    """
    return pd.read_csv(file_name, index_col=0)

In [None]:
#Calculate dataset-length-specific variables for calculation optimization

def calculate_variables(df, n_steps=10, job_number=7):
    """Derive batch-size variables from the length of the dataset.

    Parameters
    ----------
    df : sized object
        The compound table (only ``len(df)`` is used).
    n_steps : int, optional
        Divisor used to derive ``step`` from the dataset length (default 10).
    job_number : int, optional
        Job count; returned as-is and used as the divisor for ``chunk``
        (default 7).

    Returns
    -------
    tuple of int
        ``(comp_number, step, job_number, chunk)`` where
        ``step = comp_number // n_steps + 1`` and
        ``chunk = step // job_number + 1``.

    Raises
    ------
    ValueError
        If ``n_steps`` or ``job_number`` is smaller than 1.
    """
    if n_steps < 1 or job_number < 1:
        raise ValueError("n_steps and job_number must be positive integers")

    comp_number = len(df)
    # The "+ 1" keeps both values at least 1, even for an empty dataset.
    step = comp_number // n_steps + 1
    chunk = step // job_number + 1

    return comp_number, step, job_number, chunk

In [None]:
#Create a list of SMILES

def generate_smiles(df):
    """Return the values of the ``papyrus_SMILES`` column as a Python list.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a ``papyrus_SMILES`` column.

    Returns
    -------
    list
        The column values, in row order.
    """
    smiles_column = df['papyrus_SMILES']
    return smiles_column.tolist()

In [None]:
#Generate molecules from SMILES

def generate_mols(smiles_list):
    """Convert SMILES strings to molecules with ``Chem.MolFromSmiles``.

    The returned list always has the same length and order as
    ``smiles_list``. An entry that cannot be converted is stored as ``None``
    and reported on stdout, whether ``MolFromSmiles`` returned ``None`` or
    raised. This keeps positions aligned with the input, which the reporting
    below and the index-based merge later in the pipeline rely on.

    Parameters
    ----------
    smiles_list : sequence
        SMILES strings to convert.

    Returns
    -------
    list
        One entry per input: the molecule, or ``None`` on failure.
    """
    mols = []

    for idx, smiles in enumerate(smiles_list):
        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as exc:
            # Keep a placeholder instead of skipping the entry; skipping would
            # shift every later molecule relative to its SMILES / table row.
            print(f"Error with SMILES string at index {idx}: {smiles} ({exc})")
            mol = None
        else:
            if mol is None:
                print(f"Error with SMILES string at index {idx}: {smiles}")
        mols.append(mol)

    return mols

In [None]:
#Add hydrogens to the molecules

def add_hydrogens(mols):
    """Apply ``Chem.AddHs`` to every molecule.

    Parameters
    ----------
    mols : iterable
        Molecules to process.

    Returns
    -------
    list
        The results of ``Chem.AddHs``, in input order.
    """
    return [Chem.AddHs(molecule) for molecule in mols]

In [None]:
#Create and save the raw CDK molecular descriptors

def cdk_md(mols_H,comp_number,step,file_name_prefix):
    """Calculate CDK descriptors batch-wise, save them as CSV and return them.

    Parameters
    ----------
    mols_H : sliceable sequence
        Molecules handed in slices to ``CDK.calculate`` (the pipeline passes
        the output of ``add_hydrogens``).
    comp_number : int
        Batches start at 0, ``step``, 2*``step``, ... below this value.
    step : int
        Batch size.
    file_name_prefix : str
        Output prefix; results are written to
        ``'<file_name_prefix>_raw_cdk_md.csv'``.

    Returns
    -------
    pandas.DataFrame
        All batch results concatenated under a fresh 0..n-1 index; an empty
        frame when there is nothing to process.

    Side effects: writes the CSV file and prints the row count and the number
    of rows containing NaN.
    """
    # Imported here, so CDK_pywrapper is only needed when this function runs.
    from CDK_pywrapper import CDK

    cdk = CDK()

    print('Calculating CDK molecular descriptors...')

    # Collect the batches and concatenate once, instead of re-copying a
    # growing frame on every iteration.
    batches = [
        cdk.calculate(mols_H[start:start + step], show_banner=False)
        for start in tqdm(range(0, comp_number, step), smoothing=1.0)
    ]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    cdk_descriptors = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

    file_name_cdk = f'{file_name_prefix}_raw_cdk_md.csv'
    cdk_descriptors.to_csv(file_name_cdk, index=True)

    print(f'Length: {len(cdk_descriptors)}')

    rows_with_nan = cdk_descriptors.isna().any(axis=1).sum()
    print(f'Number of rows with NaN values: {rows_with_nan}')

    return cdk_descriptors

In [None]:
#Create and save the raw Mold2 molecular descriptors

def mold2_md(mols_H,comp_number,step,file_name_prefix):
    """Calculate Mold2 descriptors batch-wise, save them as CSV and return them.

    Parameters
    ----------
    mols_H : sliceable sequence
        Molecules handed in slices to ``Mold2.calculate`` (the pipeline passes
        the output of ``add_hydrogens``).
    comp_number : int
        Batches start at 0, ``step``, 2*``step``, ... below this value.
    step : int
        Batch size.
    file_name_prefix : str
        Output prefix; results are written to
        ``'<file_name_prefix>_raw_mold2_md.csv'``.

    Returns
    -------
    pandas.DataFrame
        All batch results concatenated under a fresh 0..n-1 index; an empty
        frame when there is nothing to process.

    Side effects: writes the CSV file and prints the row count and the number
    of rows containing NaN.
    """
    # Imported here, so Mold2_pywrapper is only needed when this function runs.
    from Mold2_pywrapper import Mold2

    mold2 = Mold2()

    print('Calculating Mold2 molecular descriptors...')

    # Collect the batches and concatenate once, instead of re-copying a
    # growing frame on every iteration.
    batches = [
        mold2.calculate(mols_H[start:start + step], show_banner=False)
        for start in tqdm(range(0, comp_number, step), smoothing=1.0)
    ]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    mold2_descriptors = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

    file_name_mold2 = f'{file_name_prefix}_raw_mold2_md.csv'
    mold2_descriptors.to_csv(file_name_mold2, index=True)

    print(f'Length: {len(mold2_descriptors)}')

    rows_with_nan = mold2_descriptors.isna().any(axis=1).sum()
    print(f'Number of rows with NaN values: {rows_with_nan}')

    return mold2_descriptors

In [None]:
#Create and save the raw ChemoPy molecular descriptors

def chemopy_md(mols_H,comp_number,step,file_name_prefix,njobs=8,chunksize=65):
    """Calculate ChemoPy descriptors batch-wise, save them as CSV and return them.

    Parameters
    ----------
    mols_H : sliceable sequence
        Molecules handed in slices to ``ChemoPy.calculate`` (the pipeline
        passes the output of ``add_hydrogens``).
    comp_number : int
        Batches start at 0, ``step``, 2*``step``, ... below this value.
    step : int
        Batch size.
    file_name_prefix : str
        Output prefix; results are written to
        ``'<file_name_prefix>_raw_chemopy_md.csv'``.
    njobs : int, optional
        Forwarded to ``ChemoPy.calculate`` as ``njobs`` (default 8).
    chunksize : int, optional
        Forwarded to ``ChemoPy.calculate`` as ``chunksize`` (default 65).

    Returns
    -------
    pandas.DataFrame
        All batch results concatenated under a fresh 0..n-1 index; an empty
        frame when there is nothing to process.

    Side effects: writes the CSV file and prints the row count and the number
    of rows containing NaN.
    """
    # Imported here, so chemopy is only needed when this function runs.
    from chemopy import ChemoPy

    cmp = ChemoPy()

    print('Calculating Chemopy molecular descriptors...')

    # Collect the batches and concatenate once, instead of re-copying a
    # growing frame on every iteration.
    batches = [
        cmp.calculate(mols_H[start:start + step], show_banner=False,
                      njobs=njobs, chunksize=chunksize)
        for start in tqdm(range(0, comp_number, step), smoothing=0.0)
    ]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    chemopy_descriptors = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

    file_name_cmp = f'{file_name_prefix}_raw_chemopy_md.csv'
    chemopy_descriptors.to_csv(file_name_cmp, index=True)

    print(f'Length: {len(chemopy_descriptors)}')

    rows_with_nan = chemopy_descriptors.isna().any(axis=1).sum()
    print(f'Number of rows with NaN values: {rows_with_nan}')

    return chemopy_descriptors

In [None]:
#Create and save the raw PaDel molecular descriptors

def padel_md(mols_H, comp_number,step,file_name_prefix,njobs=8,chunksize=65):
    """Calculate PaDEL descriptors batch-wise, save them as CSV and return them.

    Parameters
    ----------
    mols_H : sliceable sequence
        Molecules handed in slices to ``PaDEL.calculate`` (the pipeline passes
        the output of ``add_hydrogens``).
    comp_number : int
        Batches start at 0, ``step``, 2*``step``, ... below this value.
    step : int
        Batch size.
    file_name_prefix : str
        Output prefix; results are written to
        ``'<file_name_prefix>_raw_padel_md.csv'``.
    njobs : int, optional
        Forwarded to ``PaDEL.calculate`` as ``njobs`` (default 8).
    chunksize : int, optional
        Forwarded to ``PaDEL.calculate`` as ``chunksize`` (default 65).

    Returns
    -------
    pandas.DataFrame
        All batch results concatenated under a fresh 0..n-1 index; an empty
        frame when there is nothing to process.

    Side effects: writes the CSV file and prints the row count and the number
    of rows containing NaN.
    """
    # Imported here, so PaDEL_pywrapper is only needed when this function runs.
    from PaDEL_pywrapper import PaDEL
    from PaDEL_pywrapper import descriptors

    padel = PaDEL(descriptors)

    print('Calculating PaDel molecular descriptors...')

    # Collect the batches and concatenate once, instead of re-copying a
    # growing frame on every iteration.
    batches = [
        padel.calculate(mols_H[start:start + step], show_banner=False,
                        njobs=njobs, chunksize=chunksize)
        for start in tqdm(range(0, comp_number, step), smoothing=0.0)
    ]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    padel_descriptors = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

    file_name_padel = f'{file_name_prefix}_raw_padel_md.csv'
    padel_descriptors.to_csv(file_name_padel, index=True)

    print(f'Length: {len(padel_descriptors)}')

    rows_with_nan = padel_descriptors.isna().any(axis=1).sum()
    print(f'Number of rows with NaN values: {rows_with_nan}')

    return padel_descriptors

In [None]:
#Create and save the raw BlueDesc molecular descriptors

def bluedesc_md(mols_H,comp_number,step,file_name_prefix):
    """Calculate BlueDesc descriptors batch-wise, save them as CSV and return them.

    Parameters
    ----------
    mols_H : sliceable sequence
        Molecules handed in slices to ``BlueDesc.calculate`` (the pipeline
        passes the output of ``add_hydrogens``).
    comp_number : int
        Batches start at 0, ``step``, 2*``step``, ... below this value.
    step : int
        Batch size.
    file_name_prefix : str
        Output prefix; results are written to
        ``'<file_name_prefix>_raw_bluedesc_md.csv'``.

    Returns
    -------
    pandas.DataFrame
        All batch results concatenated under a fresh 0..n-1 index; an empty
        frame when there is nothing to process.

    Side effects: writes the CSV file and prints the row count and the number
    of rows containing NaN.
    """
    # Imported here, so BlueDesc_pywrapper is only needed when this function runs.
    from BlueDesc_pywrapper import BlueDesc

    bluedesc = BlueDesc()

    print('Calculating Bluedesc molecular descriptors...')

    # Collect the batches and concatenate once, instead of re-copying a
    # growing frame on every iteration.
    batches = [
        bluedesc.calculate(mols_H[start:start + step], show_banner=False)
        for start in tqdm(range(0, comp_number, step), smoothing=0.0)
    ]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    bluedesc_descriptors = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

    file_name_bluedesc = f'{file_name_prefix}_raw_bluedesc_md.csv'
    bluedesc_descriptors.to_csv(file_name_bluedesc, index=True)

    print(f'Length: {len(bluedesc_descriptors)}')

    rows_with_nan = bluedesc_descriptors.isna().any(axis=1).sum()
    print(f'Number of rows with NaN values: {rows_with_nan}')

    return bluedesc_descriptors

In [None]:
#Add prefixes to the molecular descriptors originating from different algorithms to make it easier to identify them

def add_prefixes(bluedesc,cdk,chemopy,mold2,padel):
    """Prefix the labels of each descriptor table with the name of its source.

    Each argument is passed through ``add_prefix`` with ``'<source> - '``
    (``'bluedesc - '``, ``'cdk - '``, ``'chemopy - '``, ``'mold2 - '``,
    ``'padel - '``).

    Returns
    -------
    tuple
        The five prefixed tables, in the same order as the arguments.
    """
    return (
        bluedesc.add_prefix('bluedesc - '),
        cdk.add_prefix('cdk - '),
        chemopy.add_prefix('chemopy - '),
        mold2.add_prefix('mold2 - '),
        padel.add_prefix('padel - '),
    )

In [None]:
#Merge the molecular descriptors with the compounds

def merge_datasets(df,bluedesc,cdk,chemopy,mold2,padel,md_number):
    """Merge the compound table with the descriptor tables on the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Compound table; the left side of the first merge.
    bluedesc, cdk, chemopy, mold2, padel : pandas.DataFrame
        Descriptor tables.
    md_number : str
        ``"all"`` merges all five descriptor tables (order: bluedesc, cdk,
        chemopy, mold2, padel); ``"subset"`` merges only cdk, chemopy and
        padel.

    Returns
    -------
    pandas.DataFrame
        Result of successive outer merges on the index.

    Raises
    ------
    ValueError
        If ``md_number`` is neither ``"all"`` nor ``"subset"``.
    """
    if md_number == "all": #To have all molecular descriptors
        selected = (bluedesc, cdk, chemopy, mold2, padel)
    elif md_number == "subset": #To only have the molecular descriptors that were selected via feature selection
        selected = (cdk, chemopy, padel)
    else:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(f'md_number must be "all" or "subset", got {md_number!r}')

    all_moldesc = df
    for descriptors in selected:
        all_moldesc = all_moldesc.merge(descriptors, left_index=True, right_index=True, how='outer')

    return all_moldesc

In [None]:
#Function that runs all the other functions

def generate_mol_descs(file_path,file_name_prefix,file_save,md_number):
    """Run the whole descriptor pipeline for one compound file.

    Steps: load the compounds, build molecules from the ``papyrus_SMILES``
    column, add hydrogens, calculate the CDK, Mold2, ChemoPy, PaDEL and
    BlueDesc descriptors (each one also writes its own raw CSV), prefix the
    descriptor names, merge everything with the compound table and write the
    merged table to ``file_save``.

    Parameters
    ----------
    file_path : str
        CSV file with the compounds.
    file_name_prefix : str
        Prefix for the intermediate raw descriptor CSV files.
    file_save : str
        Path of the final merged CSV file.
    md_number : str
        ``"all"`` or ``"subset"``; forwarded to ``merge_datasets``.

    Returns
    -------
    None
    """
    df = load_compounds(file_path)
    # calculate_variables also returns job_number and chunk; they are not used here.
    comp_number, step, _job_number, _chunk = calculate_variables(df)

    smiles_list = generate_smiles(df)
    mols = generate_mols(smiles_list)
    mols_H = add_hydrogens(mols)

    cdk = cdk_md(mols_H,comp_number,step,file_name_prefix)
    mold2 = mold2_md(mols_H,comp_number,step,file_name_prefix)
    chemopy = chemopy_md(mols_H,comp_number,step,file_name_prefix)
    padel = padel_md(mols_H,comp_number,step,file_name_prefix)
    bluedesc = bluedesc_md(mols_H,comp_number,step,file_name_prefix)

    bluedesc,cdk,chemopy,mold2,padel = add_prefixes(bluedesc,cdk,chemopy,mold2,padel)

    # The merge is done on the index. The descriptor tables carry a fresh
    # 0..n-1 index, while df is indexed by the first column of the CSV.
    # NOTE(review): rows only line up if that column is also 0..n-1 - verify
    # for each input file.
    all_moldesc = merge_datasets(df,bluedesc,cdk,chemopy,mold2,padel,md_number)

    all_moldesc.to_csv(file_save, index=True)


Provide the file-specific variables one by one:

In [None]:
#Specify source file: path to the compounds CSV, which must have a "papyrus_SMILES" column containing SMILES

file_path = '../blood_brain_barrier/kadar_data_prep/val_data/kadar_influx_val.csv'

#Specify the file prefix used to save the intermediate files (raw molecular descriptors)

file_name_prefix = 'kadar_md/influx/kadar_val_influx'

#Specify final file path for saving

file_save = "kadar_md/influx/kadar_val_influx_md_all.csv"

#"all" or "subset"? ("subset" means: CDK, ChemoPy and PaDel only)

md_number = "all"

In [None]:
#Run the whole pipeline with the variables defined above; this writes the raw
#per-package descriptor CSV files and the final merged CSV file to disk
generate_mol_descs(file_path,file_name_prefix,file_save,md_number)