In [1]:
# Absolute project root; the data and utils paths used in this notebook are built from it.
ROOT_DIR ='/data/user/home/mhossai5/DPP-New2025'
# Clone the AutoPaDELPy helper repository into {ROOT_DIR}/utils.
# When that directory already exists and is not empty, git only prints a
# "fatal: destination path ... already exists" message and nothing is cloned.
!git clone https://github.com/brendaferrari/AutoPaDELPy.git -l {ROOT_DIR}/utils

fatal: destination path '/data/user/home/mhossai5/DPP-New2025/utils' already exists and is not an empty directory.


In [2]:
import os
import sys
sys.path.insert(0,os.path.join(
    ROOT_DIR,'utils'
))
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Data Collection

In [3]:
# Read the bioactivity spreadsheet export into a DataFrame and preview the first rows.
external_xlsx_path = os.path.join(ROOT_DIR, 'data/bioactivities_Tue_May_06_2025.xlsx')
df_external_set = pd.read_excel(external_xlsx_path)
df_external_set.head()

Unnamed: 0,Compound ID,Uniprot ID,Compound Name,Standard inchi key,Max Phase,Target Pref Name,Gene Names,Target Class,Wild type or mutant,Mutation information,...,Volume,Issue,Authors,Annotation Comments,Assay ID,DTC Tid,DTC Activity ID,DTC Molregno,Record ID,DTC Document ID
0,,P27487,SURECN7126947,VZTHEVPZHGVFET-UHFFFAOYSA-N,0,DIPEPTIDYL PEPTIDASE IV,DPP4,Enzyme,,,...,,,,,,DTCT0024079,14711717,DTCC01744681,2098492,68003
1,,P27487,AKOS008858589,IDZNFPHPWBHHSS-UHFFFAOYSA-N,0,DIPEPTIDYL PEPTIDASE IV,DPP4,Enzyme,,,...,,,,,,DTCT0024079,14722185,DTCC01745076,2102506,68374
2,,P27487,AGN-PC-00ANFJ,GDYIRHKDJLPJTB-UHFFFAOYSA-N,0,DIPEPTIDYL PEPTIDASE IV,DPP4,Enzyme,,,...,,,,,,DTCT0024079,14723839,DTCC01745298,2102761,68050
3,,P27487,,ABZSPJVXTTUFAA-UHFFFAOYSA-N,0,DIPEPTIDYL PEPTIDASE IV,DPP4,Enzyme,,,...,,,,,,DTCT0024079,14705495,DTCC01744004,2096809,68036
4,,P27487,,ABZSPJVXTTUFAA-UHFFFAOYSA-N,0,DIPEPTIDYL PEPTIDASE IV,DPP4,Enzyme,,,...,,,,,,DTCT0024079,14719410,DTCC01744004,2096809,68036


In [4]:
# Print column names, non-null counts and dtypes of the freshly loaded export.
df_external_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6059 entries, 0 to 6058
Data columns (total 43 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Compound ID                         5971 non-null   object 
 1   Uniprot ID                          6059 non-null   object 
 2   Compound Name                       311 non-null    object 
 3   Standard inchi key                  5895 non-null   object 
 4   Max Phase                           6059 non-null   int64  
 5   Target Pref Name                    6059 non-null   object 
 6   Gene Names                          6059 non-null   object 
 7   Target Class                        6059 non-null   object 
 8   Wild type or mutant                 0 non-null      float64
 9   Mutation information                0 non-null      float64
 10  PubMed ID                           5502 non-null   float64
 11  End Point Standard Type             6059 no

In [5]:
# Keep only rows whose value in each of these columns matches exactly.
required_values = {
    'Target Pref Name': 'DIPEPTIDYL PEPTIDASE IV',
    'Gene Names': 'DPP4',
    'End Point Standard Type': 'IC50',
    'End Point Standard Units': 'NM',
}
keep_mask = None
for column, wanted in required_values.items():
    column_mask = df_external_set[column].isin([wanted])
    keep_mask = column_mask if keep_mask is None else keep_mask & column_mask

# One row per InChIKey (first occurrence wins), then renumber the index from 0.
df_external_set = (
    df_external_set[keep_mask]
    .drop_duplicates(['Standard inchi key'])
    .reset_index(drop=True)
)

In [6]:
# (rows, columns) left after the target/endpoint filter and InChIKey de-duplication.
df_external_set.shape

(3737, 43)

In [None]:
# ref: https://drugtargetcommons.fimm.fi/bioactivities?id=DTCT0024079&category=Target&name=DPP4
import pubchempy as pcp
import pandas as pd
from tqdm.auto import tqdm

# Function using pubchempy
def inchikey_to_smiles_pubchempy(inchikey):
    """Look up a compound by InChIKey with pubchempy and return its canonical SMILES.

    Parameters
    ----------
    inchikey : str
        InChIKey to search for. Anything that is not a non-blank string
        (None, NaN, '', whitespace) is treated as "no key".

    Returns
    -------
    str or None
        ``canonical_smiles`` of the first compound returned by
        ``pcp.get_compounds(inchikey, 'inchikey')``, or None when the key is
        missing, no compound is returned, or the first hit has no SMILES.
    """
    # A missing key cannot match anything; skip the remote lookup entirely.
    if not isinstance(inchikey, str) or not inchikey.strip():
        return None
    compounds = pcp.get_compounds(inchikey, 'inchikey')
    if compounds and compounds[0].canonical_smiles:
        return compounds[0].canonical_smiles
    return None

# Add SMILES column: one lookup per row, keyed by the row's InChIKey.
df_external_set['SMILES'] = None
for idx, inchikey in tqdm(
    df_external_set['Standard inchi key'].items(),
    total=len(df_external_set),
):
    try:
        df_external_set.at[idx, 'SMILES'] = inchikey_to_smiles_pubchempy(inchikey)
    except Exception:
        # Best effort: a failed lookup leaves the SMILES empty for this row.
        # `except Exception` (not a bare `except`) keeps Ctrl-C / kernel
        # interrupt working during this long-running loop.
        df_external_set.at[idx, 'SMILES'] = None

In [None]:
# Source column -> output column name; the dict order is also the output column order.
column_renames = {
    'DTC Activity ID': 'id',
    'SMILES': 'smiles',
    'End Point Standard Value': 'standard_value',
    'End Point Standard Type': 'standard_type',
    'End Point Standard Units': 'standard_units',
}
df_external_set = (
    df_external_set[list(column_renames)]
    .rename(columns=column_renames)
    # Constant annotation columns appended after the renamed ones.
    .assign(
        target_organism='Homo sapiens',
        target_pref_name='Dipeptidyl peptidase IV',
    )
    # Discard every row with a missing value in any kept column
    # (this includes rows whose SMILES lookup returned nothing).
    .dropna()
)
df_external_set

In [None]:
# Persist the processed table as CSV, without the index column.
processed_csv_path = os.path.join(ROOT_DIR, 'data/External_DTC_DPP4-05-05-2025_Processed.csv')
df_external_set.to_csv(processed_csv_path, index=False)


# Feature Extraction

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
tqdm.pandas()
# Reload the processed external set from CSV and load the training-set parquet file.
data_dir = os.path.join(ROOT_DIR, 'data')
df_external_set = pd.read_csv(
    os.path.join(ROOT_DIR, 'data/External_DTC_DPP4-05-05-2025_Processed.csv')
)
df_train = pd.read_parquet(
    os.path.join(ROOT_DIR, 'data/dpp4-26-03-25-feat.parquet')
)

In [None]:
# ## IC50 Conversion
def calculate_pIC50(ic50_nM):
    """Convert an IC50 expressed in nanomolar into pIC50.

    The input is passed through ``pd.to_numeric(..., errors='coerce')``, so
    values that cannot be parsed as numbers become NaN. The numeric value is
    scaled by 1e-9 (nM -> M) and the negative base-10 logarithm is returned.
    An input of 0 therefore yields ``inf``.
    """
    nanomolar = pd.to_numeric(ic50_nM, errors='coerce')
    molar = nanomolar * 1e-9
    return -np.log10(molar)
def classify_pic50(value):
    """Map a pIC50 value to an activity label.

    Returns 'inactive' for value <= 6, 'grey' for 6 < value <= 7.5 and
    'active' for value > 7.5. A value that satisfies none of the comparisons
    (e.g. NaN) yields None.
    """
    if value <= 6:
        return 'inactive'
    if value <= 7.5:
        return 'grey'
    if value > 7.5:
        return 'active'
    return None
# Load the previously pickled label encoder and scaler from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load these from a trusted location.
import pickle

with open(os.path.join(ROOT_DIR,'data/label_encoder.pkl'), 'rb') as file:
    le = pickle.load(file)

# `scaler` is loaded here but not used in this cell.
with open(os.path.join(ROOT_DIR,'data/ltn_std_scaler.pkl'), 'rb') as file:
    scaler = pickle.load(file)

# standard_value (nM) -> pIC50 -> 'active' / 'grey' / 'inactive' label.
df_external_set['pIC50'] = df_external_set['standard_value'].apply(calculate_pIC50)
df_external_set['label'] = df_external_set['pIC50'].apply(classify_pic50)

# Drop the 'grey' rows. The explicit .copy() makes the result an independent
# frame, so the column assignment below does not write into a slice of the
# unfiltered frame.
df_external_set = df_external_set[~df_external_set['label'].isin(['grey'])].copy()

# Encode the remaining string labels with the loaded encoder.
df_external_set['target'] = le.transform(df_external_set['label'])

In [None]:
# Remove external rows whose SMILES string also occurs in the training set.
# The training SMILES are collected once into a set; rebuilding a list for
# every row made this step quadratic.
# NOTE(review): this is an exact string comparison, as before; differently
# written SMILES for the same structure are not detected.
train_smiles = set(df_train['smiles'])
df_external_set = df_external_set[
    ~df_external_set['smiles'].isin(train_smiles)
].reset_index(drop=True)
df_external_set

In [None]:
from functions.fingerprint_functions import FingerprintFunctions

In [None]:
# Cached CDKextended fingerprint table; it is only computed when the file is absent.
feat_path = os.path.join(ROOT_DIR,'data/External_DTC_DPP4-CDKextended.csv')
if not os.path.exists(feat_path):
    utils_dir = os.path.join(ROOT_DIR, 'utils')
    smi_path = 'dataset.smi'
    # The helper is called with a relative path and its result is collected
    # from utils/CDKextended.csv, so the work is done from inside utils/.
    previous_cwd = os.getcwd()
    os.chdir(utils_dir)
    try:
        # Tab-separated "<smiles>\t<id>" lines, no header.
        df_external_set[['smiles','id']].to_csv(
            smi_path, sep='\t', index=False, header=False,
        )
        FingerprintFunctions().do_CDKextended(smi_path)
        os.rename(
            os.path.join(utils_dir, 'CDKextended.csv'),
            feat_path,
        )
    finally:
        # Always remove the temporary input file and restore the working
        # directory, even if the fingerprint step fails, so later cells do
        # not silently run from utils/.
        if os.path.exists(smi_path):
            os.remove(smi_path)
        os.chdir(previous_cwd)

In [None]:
# Read the fingerprint table, drop its 'Name' column and store each remaining
# row as a list in the 'CDKextended' column.
# NOTE(review): rows are attached by position, not by 'Name'; this assumes the
# CSV rows are in the same order as df_external_set. Confirm, especially when
# the cached file comes from an earlier run.
cdk_table = pd.read_csv(feat_path)
cdk_bits = cdk_table.drop('Name', axis=1)
df_external_set['CDKextended'] = cdk_bits.values.tolist()

In [None]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm.auto import tqdm
tqdm.pandas()
import numpy as np
from rdkit import rdBase
rdBase.DisableLog('rdApp.warning')

In [None]:
def smiles_to_fp(smiles, radius=2, num_bits=1024):
    """Return a Morgan bit-vector fingerprint for a SMILES string as a list of ints.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.
    radius : int, default 2
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    num_bits : int, default 1024
        Length of the returned bit list.

    Returns
    -------
    list of int
        ``num_bits`` values of 0/1. An all-zero list is returned when the
        input is not a string (e.g. None/NaN) or RDKit cannot parse it.
    """
    # Integer zeros keep the fallback the same element type as a real
    # fingerprint; float zeros would mix dtypes within one column.
    if not isinstance(smiles, str):
        return [0] * num_bits
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [0] * num_bits
    return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, num_bits))

In [None]:
# Output column -> keyword overrides for smiles_to_fp ({} means its defaults).
ecfp_variants = {
    'ECFP': {},
    'ECFP_2048': {'num_bits': 2048},
    'ECFP_512': {'num_bits': 512},
}
for fp_column, fp_kwargs in ecfp_variants.items():
    df_external_set[fp_column] = df_external_set['smiles'].progress_apply(
        lambda smi, kw=fp_kwargs: smiles_to_fp(smi, **kw)
    )

In [None]:
# Write the featurised external set to parquet, without the index.
features_parquet_path = os.path.join(ROOT_DIR, 'data/dpp4-ext-26-03-25-feat.parquet')
df_external_set.to_parquet(features_parquet_path, index=False)