In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import inchi

from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

from jCompoundMapper_pywrapper import JCompoundMapper
from chemopy import ChemoPy
from chemopy import Fingerprint
from CDK_pywrapper import CDK, FPType
from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper.descriptor import SubstructureFPCount
from PaDEL_pywrapper.descriptor import KlekotaRothFPCount


[ForwardRef('ExtensionArray'), <class 'numpy.ndarray'>]
[ForwardRef('ExtensionArray'), <class 'numpy.ndarray'>, ForwardRef('Index'), ForwardRef('Series')]
[<class 'str'>, <class 'float'>, <class 'bool'>]
[ForwardRef('Period'), ForwardRef('Timestamp'), ForwardRef('Timedelta')]
[ForwardRef('Period'), ForwardRef('Timestamp'), ForwardRef('Timedelta'), ForwardRef('Interval')]
[<class 'str'>, <class 'float'>, <class 'bool'>, ForwardRef('Period'), ForwardRef('Timestamp'), ForwardRef('Timedelta'), ForwardRef('Interval'), <class 'numpy.datetime64'>, <class 'numpy.timedelta64'>, <class 'datetime.datetime'>]
[ForwardRef('Timestamp'), <class 'datetime.datetime'>, <class 'numpy.datetime64'>, <class 'numpy.int64'>, <class 'float'>, <class 'str'>]
[ForwardRef('Timedelta'), <class 'datetime.timedelta'>, <class 'numpy.timedelta64'>, <class 'numpy.int64'>, <class 'float'>, <class 'str'>]
[<class 'str'>, <class 'datetime.tzinfo'>]
[<class 'str'>, <class 'int'>]
[typing.Hashable, typing.Sequence[typing.Ha

In [2]:
#Load the compounds for which we are creating molecular fingerprints

def load_compounds(file_name):
    df = pd.read_csv(file_name, index_col=0)

    return df

In [3]:
#Calculate dataset length specific variabels for calculation optimization

def calculate_variables(df):

    comp_number = len(df)
    step = int(comp_number / 10) +1
    job_number = 7 
    chunk = int(step / 7) +1 

    return comp_number, step, job_number, chunk

In [4]:
#Create a list of SMILES

def generate_smiles(df):
    smiles_list = df['papyrus_SMILES'].tolist()

    return smiles_list

In [5]:
#Generate molecules from SMILES

def generate_mols(smiles_list):
    mols = []
    smiles_to_fix = []

    for smiles in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smiles)
            mols.append(mol)

        except Exception as e:
            print(f"Error with SMILES string {smiles}")
            continue 


    #check for None Mol objects
    for i, mol in enumerate(mols):
        if mol is None:
            print(f"Error with SMILES string at index {i}: {smiles_list[i]}")
            smiles_to_fix.append(i)
            continue

    return mols

In [6]:
#Add hydrogens to the molecules

def add_hydrogens(mols):

    mols_H = []

    for mol in mols:
        mol_H = Chem.AddHs(mol)
        mols_H.append(mol_H)

    return mols_H

Calculate molecular fingerprints

From this point on we are creating and saving the different mol. fingerprints (function are named accordingly) one-by one into a raw file

In [7]:
def chemopy_all(mols_H,comp_number,step,file_name_prefix):
    fp = pd.DataFrame()

    print(f'Calculating Chemopy fingerprints...')

    for mol in mols_H:
        new_data = Fingerprint.get_all_fps(mol)
        fp = fp.append([new_data], ignore_index=True)


    file_name = f'{file_name_prefix}_raw_chemopy_fp.csv'
    fp.to_csv(file_name, index=True)

    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [8]:
#CDK pub

def cdk_pub(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    cdk = CDK(fingerprint=FPType.PubchemFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_Pubchem fingerprints...')


    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_pub_fp.csv'
    fp.to_csv(file_name, index=True)

    print(f'Length: {len(fp)}')

    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [9]:
def cdk_fp(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    #CDK FP fingerprints
    cdk = CDK(fingerprint=FPType.FP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_FP fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_fp_fp.csv'
    fp.to_csv(file_name, index=True)

    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [10]:
def cdk_ext(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
  
    cdk = CDK(fingerprint=FPType.ExtFP)

    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_ext_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [11]:
def cdk_graph(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    #CDK Graph fingerprints


    cdk = CDK(fingerprint=FPType.GraphFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_Graph fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_graph_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [12]:
def cdk_maccs(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    #CDK MACCS fingerprints


    cdk = CDK(fingerprint=FPType.MACCSFP)

    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_maccs_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [13]:
def cdk_sub(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    #CDK Sub fingerprints

    cdk = CDK(fingerprint=FPType.SubFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_Sub fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_sub_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [14]:
def cdk_kr(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    #CDK Klekota-Roth fingerprints

    cdk = CDK(fingerprint=FPType.KRFP)

    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_kr_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [15]:
def cdk_ap2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    #CDK Atom pair 2D fingerprints

    cdk = CDK(fingerprint=FPType.AP2DFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_AP2DFP fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_ap2d_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [16]:
def cdk_hybrid(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
    #CDK hybrid fingerprints


    cdk = CDK(fingerprint=FPType.HybridFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_Hybrid fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_hybrid_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [17]:
def cdk_lingo(mols_H,comp_number,step,file_name_prefix, chunk,job_number):
    #CDK LINGO fingerprints


    cdk = CDK(fingerprint=FPType.LingoFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_Lingo fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_lingo_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [18]:
def cdk_sp(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
#CDK shortest path fingerprints


    cdk = CDK(fingerprint=FPType.SPFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_SP fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_sp_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [19]:
def cdk_circ(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
#CDK circular fingerprints


    cdk = CDK(fingerprint=FPType.CircFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_Circ fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_circ_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [20]:
def cdk_estate(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
#CDK E-State fingerprints


    cdk = CDK(fingerprint=FPType.EStateFP)

    fp = pd.DataFrame()

    print(f'Calculating CDK_EState fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = cdk.calculate(mols_H[i:i+step], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_cdk_estate_fp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [21]:
def padel_subcount(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
#PaDel Substructrure + Counts

    fp_type = SubstructureFPCount

    padel = PaDEL([fp_type], ignore_3D=False)

    print(f'Calculating PaDEL_SubstructureCount fingerprints...')

    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = padel.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_padel_subcount.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [22]:
def padel_krcount(mols_H,comp_number,step,file_name_prefix,chunk,job_number):
#PaDel KlekotaRoth + Couns


    fp_type = KlekotaRothFPCount


    padel = PaDEL([fp_type], ignore_3D=False)

    print(f'Calculating PaDEL_KlekotaRothCount fingerprints...')


    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = padel.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_padel_krcount.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [23]:
def jcm_dfs(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper(Fingerprint.DFS)


    fp = pd.DataFrame()

    print(f'Calculating JCompoundMapper DFS fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_dfs.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [24]:
def jcm_ap2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('AP2D')


    fp = pd.DataFrame()

    print(f'Calculating JCompoundMapper AP2d fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_ap2d.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [25]:
def jcm_asp(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('ASP')


    fp = pd.DataFrame()

    print('Calculating JCM_ASP...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_asp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [26]:
def jcm_cats2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('CATS2D')


    fp = pd.DataFrame()

    print(f'Calculating JCompoundMapper CATS2D fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_cats2d.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [27]:
def jcm_at2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('AT2D')


    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_at2d.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [28]:
def jcm_22(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('PHAP2POINT2D')


    fp = pd.DataFrame()

    print(f'Calculating JCompoundMapper PHAP2POINT2D fingerprints...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_22.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [29]:
def jcm_32(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('PHAP3POINT2D')


    fp = pd.DataFrame()

    print('Calculating JCM_PHAP3POINT2D...')
        
    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_32.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [30]:
def jcm_ecfp(mols_H,comp_number,step,file_name_prefix, chunk,job_number):

    jcm = JCompoundMapper('ECFP')


    fp = pd.DataFrame()

    print('Calculating JCM_ECFP...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_ecfp.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [31]:
def jcm_ecvar(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('ECFPVariant')


    fp = pd.DataFrame()

    print('Calculating JCM_ECFPVariant...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_ecvar.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [32]:
def jcm_lstar(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('LSTAR')


    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_lstar.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [33]:
def jcm_shed(mols_H,comp_number,step,file_name_prefix, chunk,job_number):

    jcm = JCompoundMapper('SHED')


    fp = pd.DataFrame()

    print('Calculating JCM_SHED...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_shed.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [34]:
def jcm_rad2(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('RAD2D')


    fp = pd.DataFrame()

    print('Calculating JCM_RAD2D...')

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_rad2d.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

In [35]:
def jcm_maccs(mols_H,comp_number,step,file_name_prefix,chunk,job_number):

    jcm = JCompoundMapper('MACCS')


    fp = pd.DataFrame()

    for i in tqdm(range(0,comp_number,step), smoothing=1.0):
        new_data = jcm.calculate(mols_H[i:i+500], show_banner=False, njobs=job_number, chunksize=chunk)
        fp = pd.concat([fp, new_data], ignore_index=True)

    file_name = f'{file_name_prefix}_raw_jcm_maccs.csv'
    fp.to_csv(file_name, index=True)
    print(f'Length: {len(fp)}')
        #-----
    nan_per_column = fp.isna().sum()
    rows_with_nan = fp.isna().any(axis=1).sum()

    print(f'Number of rows with NaN values: {rows_with_nan}')

    return fp

Fingerprints calculating functions ends here.

Run the function

In [39]:
def generate_mol_fps(file_path,file_name_prefix,file_save):

    df = load_compounds(file_path)
    comp_number, step, job_number, chunk = calculate_variables(df)

    smiles_list= generate_smiles(df)
    mols = generate_mols(smiles_list)
    mols_H = add_hydrogens(mols)


    #Generating fingerprints

    cdk_ap2d_fp =cdk_ap2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_circ_fp = cdk_circ(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_estate_fp = cdk_estate(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_ext_fp = cdk_ext(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_fp_fp =cdk_fp(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_graph_fp =cdk_graph(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_hybrid_fp = cdk_hybrid(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_kr_fp = cdk_kr(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_lingo_fp = cdk_lingo(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_maccs_fp = cdk_maccs(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_pub_fp = cdk_pub(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_sp_fp = cdk_sp(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    cdk_sub_fp = cdk_sub(mols_H,comp_number,step,file_name_prefix,chunk,job_number)

    chemopy_fp = chemopy_all(mols_H,comp_number,step,file_name_prefix)

    jcm_22_fp = jcm_22(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_32_fp =jcm_32(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_ap2d_fp = jcm_ap2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_asp_fp = jcm_asp(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_at2d_fp = jcm_at2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_cats2d_fp = jcm_cats2d(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_dfs_fp=jcm_dfs(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_ecfp_fp = jcm_ecfp(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_ecvar_fp = jcm_ecvar(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_lstar_fp = jcm_lstar(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_maccs_fp = jcm_maccs(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_rad2d_fp = jcm_rad2(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    jcm_shed_fp = jcm_shed(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    
    padel_subcount_fp = padel_subcount(mols_H,comp_number,step,file_name_prefix,chunk,job_number)
    padel_krcount_fp = padel_krcount(mols_H,comp_number,step,file_name_prefix,chunk,job_number)

    #Add prefixes

    cdk_ap2d_fp = cdk_ap2d_fp.add_prefix('cdk_ap2d - ')
    cdk_circ_fp = cdk_circ_fp.add_prefix('cdk_circ - ')
    cdk_estate_fp = cdk_estate_fp.add_prefix('cdk_estate - ')
    cdk_ext_fp = cdk_ext_fp.add_prefix('cdk_ext - ')
    cdk_fp_fp = cdk_fp_fp.add_prefix('cdk_fp - ')
    cdk_graph_fp = cdk_graph_fp.add_prefix('cdk_grap - ')
    cdk_hybrid_fp = cdk_hybrid_fp.add_prefix('cdk_hybrid - ')
    cdk_kr_fp = cdk_kr_fp.add_prefix('cdk_kr - ')
    cdk_lingo_fp = cdk_lingo_fp.add_prefix ('cdk_lingo - ')
    cdk_maccs_fp = cdk_maccs_fp.add_prefix('cdk_maccs - ')
    cdk_pubc_fp = cdk_pub_fp.add_prefix('cdk_pubc - ')
    cdk_sp_fp = cdk_sp_fp.add_prefix('cdk_sp - ')
    cdk_sub_fp = cdk_sub_fp.add_prefix('cdk_sub - ')

    chemopy_fp = chemopy_fp.add_prefix('chemopy - ')

    jcm_22_fp = jcm_22_fp.add_prefix('jcm_22 - ')
    jcm_32_fp = jcm_32_fp.add_prefix('jcm_32 - ')
    jcm_ap2d_fp = jcm_ap2d.add_prefix('jcm_ap2d - ')
    jcm_asp_fp = jcm_asp_fp.add_prefix('jcm_asp - ')
    jcm_at2d_fp = jcm_at2d_fp.add_prefix('jcm_at2d - ')
    jcm_cats2d_fp = jcm_cats2d_fp.add_prefix('jcm_cats2d - ')
    jcm_dfs_fp = jcm_dfs_fp.add_prefix('jcm_dfs - ')
    jcm_ecfp_fp = jcm_ecfp_fp.add_prefix('jcm_ecfp - ')
    jcm_ecvar_fp = jcm_ecvar_fp.add_prefix('jcm_ecvar - ')
    jcm_lstar_fp = jcm_lstar_fp.add_prefix(' jcm_lstar - ')
    jcm_maccs_fp = jcm_maccs_fp.add_prefix('jcm_maccs - ')
    jcm_rad2d_fp = jcm_rad2d_fp.add_prefix('jcm_rad2d - ')
    jcm_shed_fp = jcm_shed_fp.add_prefix('jcm_shed - ')

    padel_subcount_fp = padel_subcount_fp.add_prefix('padel_subcount - ')
    padel_krcount_fp = padel_krcount_fp.add_prefix('padel_krcount - ')
    
    #Merging

    compounds_fingerprints = df.merge(cdk_ap2d_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_circ_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_estate_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_ext_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_fp_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_graph_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_hybrid_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_kr_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_lingo_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_maccs_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_pubc_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_sp_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(cdk_sub_fp, left_index = True, right_index=True, how='outer')

    compounds_fingerprints = compounds_fingerprints.merge(chemopy_fp, left_index = True, right_index=True, how='outer')

    compounds_fingerprints = compounds_fingerprints.merge(jcm_22_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_32_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_ap2d_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_asp_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_at2d_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_cats2d_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_dfs_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_ecfp_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_ecvar_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_lstar_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_maccs_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_rad2d_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(jcm_shed_fp, left_index = True, right_index=True, how='outer')

    compounds_fingerprints = compounds_fingerprints.merge(padel_subcount_fp, left_index = True, right_index=True, how='outer')
    compounds_fingerprints = compounds_fingerprints.merge(padel_krcount_fp, left_index = True, right_index=True, how='outer')

    len(compounds_fingerprints)
    
    compounds_fingerprints.to_csv(file_save, index=True)


Provide file specific variables: one-by-one

In [37]:
#Specify source file: path to the compounds with column "papyrus_SMILES" containing SMILES

file_path = f'../blood_brain_barrier/kadar_data_prep/val_data/kadar_influx_val.csv'

#Sppecify file_prefix to save intermediate files: raw molecular descriptors

file_name_prefix = f'kadar_fp/influx/kadar_val_influx_val'

#Specify final file path for saving

file_save = f"kadar_fp/influx/kadar_val_influx_fp_all.csv"



In [40]:
#Run the functions to generate mol.descs
generate_mol_fps(file_path,file_name_prefix,file_save)

Calculating CDK_AP2DFP fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_Circ fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_EState fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_FP fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_Graph fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_Hybrid fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_Lingo fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_Pubchem fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_SP fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating CDK_Sub fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 18
Number of rows with NaN values: 0
Calculating Chemopy fingerprints...


  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)
  fp = fp.append([new_data], ignore_index=True)


Length: 18
Number of rows with NaN values: 0
Calculating JCompoundMapper PHAP2POINT2D fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 90
Number of rows with NaN values: 0
Calculating JCM_PHAP3POINT2D...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 90
Number of rows with NaN values: 0
Calculating JCompoundMapper AP2d fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 90
Number of rows with NaN values: 0
Calculating JCM_ASP...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 90
Number of rows with NaN values: 0


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 90
Number of rows with NaN values: 0
Calculating JCompoundMapper CATS2D fingerprints...


  0%|          | 0/9 [00:00<?, ?it/s]

Length: 90
Number of rows with NaN values: 0


AttributeError: type object 'Fingerprint' has no attribute 'DFS'