
This notebook explores the curated EGFR dataset (`EGFR_04_bioactivity_data_curated_standardized.csv`).

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.rdmolfiles import MolToSmiles
from rdkit.Chem import AllChem
import numpy as np


# Read the curated EGFR bioactivity table, report its size, and preview the first rows
df = pd.read_csv("EGFR_04_bioactivity_data_curated_standardized.csv")
print(f"Initial shape: {df.shape}")
df.head(n=5)



Initial shape: (10074, 7)


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class,original_smiles,corrected_smiles,correction_status
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,41.0,active,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,,unchanged
1,CHEMBL69960,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,170.0,active,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,,unchanged
2,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,9300.0,intermediate,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,,unchanged
3,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,500000.0,inactive,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,,unchanged
4,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,3000000.0,inactive,O=C(O)/C=C/c1ccc(O)cc1,,unchanged


In [2]:

# Standardization functions
# Shared RDKit standardizer objects. They are created once here and reused by
# standardize_molecule() for every molecule instead of being rebuilt per call.
_SALT_REMOVER  = SaltRemover()                                                # used via .StripMol(...)
_METAL_DC      = rdMolStandardize.MetalDisconnector()                         # used via .Disconnect(...)
_FRAG_REMOVER  = rdMolStandardize.FragmentRemover()                           # used via .remove(...)
_LFC           = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)  # used via .choose(...); built with preferOrganic=True
_NORMALIZER    = rdMolStandardize.Normalizer()                                # used via .normalize(...)
_REIONIZER     = rdMolStandardize.Reionizer()                                 # used via .reionize(...)

def _tautomerize_if_available(mol):
    """Canonicalize the tautomer of ``mol`` if ``rdMolStandardize`` exposes a way to.

    ``rdMolStandardize`` is probed for ``TautomerEnumerator`` first and
    ``TautomerCanonicalizer`` second. For each class that exists, an instance is
    created and its ``Canonicalize`` (then ``canonicalize``) method is looked up;
    the first method found is called on ``mol`` and its result returned. When no
    class/method combination is available, ``mol`` is returned unchanged.
    """
    for class_name in ("TautomerEnumerator", "TautomerCanonicalizer"):
        if not hasattr(rdMolStandardize, class_name):
            continue
        canonicalizer = getattr(rdMolStandardize, class_name)()
        for method_name in ("Canonicalize", "canonicalize"):
            if hasattr(canonicalizer, method_name):
                return getattr(canonicalizer, method_name)(mol)
    return mol

def standardize_molecule(mol: Chem.Mol, do_tautomer: bool = False) -> Chem.Mol | None:
    """Run the standardization pipeline on an RDKit molecule.

    The molecule is passed to ``Chem.SanitizeMol`` and then, in this order,
    through the metal disconnector, salt stripper, fragment remover,
    largest-fragment chooser, normalizer and reionizer defined at module level.
    Tautomer canonicalization is applied last, only when ``do_tautomer`` is true.

    Parameters
    ----------
    mol : Chem.Mol
        Molecule to standardize; ``None`` is accepted and yields ``None``.
    do_tautomer : bool, default False
        Whether to additionally call ``_tautomerize_if_available``.

    Returns
    -------
    Chem.Mol or None
        The standardized molecule, or ``None`` when the input is ``None``, the
        result is ``None`` or has zero atoms, or any step raises an exception.
    """
    if mol is None:
        return None
    try:
        Chem.SanitizeMol(mol)
        # Ordered transformation steps; each takes a molecule and returns a molecule.
        pipeline = (
            _METAL_DC.Disconnect,
            lambda m: _SALT_REMOVER.StripMol(m, dontRemoveEverything=True),
            _FRAG_REMOVER.remove,
            _LFC.choose,
            _NORMALIZER.normalize,
            _REIONIZER.reionize,
        )
        result = mol
        for step in pipeline:
            result = step(result)
        if do_tautomer:
            result = _tautomerize_if_available(result)
        is_empty = result is None or result.GetNumAtoms() == 0
        return None if is_empty else result
    except Exception:
        # Best-effort: any failure in the pipeline is reported as "no molecule".
        return None

def standardize_smiles(smi: str, do_tautomer: bool = False) -> str | None:
    if not isinstance(smi, str) or not smi.strip():
        return None
    try:
        m = Chem.MolFromSmiles(smi, sanitize=False)
        if m is None:
            return None
        m = standardize_molecule(m, do_tautomer=do_tautomer)
        return MolToSmiles(m, isomericSmiles=True, canonical=True) if m else None
    except Exception:
        return None

[01:42:33] Initializing MetalDisconnector
[01:42:33] Initializing Normalizer


In [3]:
# Columns carried forward: identifier, structure, potency value and activity class.
col_need = ['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'class']
# .copy() detaches the column subset from the frame it was selected from, so that
# adding a column below writes to an independent frame and does not trigger
# pandas' SettingWithCopyWarning.
df = df[col_need].copy()
# Apply standardization
df["standardized_smiles"] = df["canonical_smiles"].apply(standardize_smiles)
print("After standardization, missing SMILES:", df["standardized_smiles"].isna().sum())

# Save after standardization
df.to_csv("EGFR_05_standardized.csv", index=False)

[01:42:39] Running MetalDisconnector
[01:42:39] Running FragmentRemover
[01:42:39] Running LargestFragmentChooser
[01:42:39] Running Normalizer
[01:42:39] Running MetalDisconnector
[01:42:39] Running FragmentRemover
[01:42:39] Running LargestFragmentChooser
[01:42:39] Running Normalizer
[01:42:39] Running MetalDisconnector
[01:42:39] Running FragmentRemover
[01:42:39] Running LargestFragmentChooser
[01:42:39] Running Normalizer
[01:42:39] Running MetalDisconnector
[01:42:39] Running FragmentRemover
[01:42:39] Running LargestFragmentChooser
[01:42:39] Running Normalizer
[01:42:39] Running MetalDisconnector
[01:42:39] Running FragmentRemover
[01:42:39] Running LargestFragmentChooser
[01:42:39] Running Normalizer
[01:42:39] Running MetalDisconnector
[01:42:39] Running FragmentRemover
[01:42:39] Running LargestFragmentChooser
[01:42:39] Running Normalizer
[01:42:39] Running MetalDisconnector
[01:42:39] Running FragmentRemover
[01:42:39] Running LargestFragmentChooser
[01:42:39] Running Nor

After standardization, missing SMILES: 0


[01:42:48] Running LargestFragmentChooser
[01:42:48] Running Normalizer
[01:42:48] Running MetalDisconnector
[01:42:48] Running FragmentRemover
[01:42:48] Running LargestFragmentChooser
[01:42:48] Running Normalizer
[01:42:48] Running MetalDisconnector
[01:42:48] Running FragmentRemover
[01:42:48] Running LargestFragmentChooser
[01:42:48] Running Normalizer


In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, Lipinski
from rdkit.Chem.Scaffolds import MurckoScaffold

# Substructure patterns
# SMARTS queries compiled once here and reused by compute_descriptors() for its
# HasPyridine / HasPyrimidine flags.
_PYRIDINE   = Chem.MolFromSmarts("n1ccccc1")  # six-membered aromatic ring: one n, five c
_PYRIMIDINE = Chem.MolFromSmarts("n1cnccc1")  # six-membered aromatic ring: n at positions 1 and 3

def compute_descriptors(smiles):
    """Compute physicochemical and structural descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule to describe. Non-string values (e.g. ``None`` or
        NaN left behind by a failed standardization) and blank strings are
        treated as "no molecule".

    Returns
    -------
    dict or None
        ``None`` when ``smiles`` is missing, blank, or cannot be parsed by RDKit.
        Otherwise a dict with the keys ``MolWt``, ``LogP``, ``TPSA``, ``HBD``,
        ``HBA``, ``RotB``, ``RingCount``, ``AromaticRingCount``, ``FractionCSP3``,
        ``HeavyAtomCount``, ``HeteroAtomCount``, ``HasPyridine``,
        ``HasPyrimidine``, ``MurckoScaffold``, ``MW_norm``, ``TPSA_norm``,
        ``LogP_norm`` and ``SP3_norm``.
    """
    # Same input guard as standardize_smiles: Chem.MolFromSmiles raises on
    # non-string input, which would abort the whole .apply() over the column.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Core physicochemical properties
    mol_wt = Descriptors.MolWt(mol)
    logp   = Descriptors.MolLogP(mol)
    tpsa   = Descriptors.TPSA(mol)

    # Lipinski-compliant counts
    num_hbd = Lipinski.NumHDonors(mol)
    num_hba = Lipinski.NumHAcceptors(mol)
    num_rotatable_bonds = Lipinski.NumRotatableBonds(mol)

    # Structural properties
    ring_count          = rdMolDescriptors.CalcNumRings(mol)
    aromatic_ring_count = rdMolDescriptors.CalcNumAromaticRings(mol)
    fraction_sp3        = rdMolDescriptors.CalcFractionCSP3(mol)
    # Heteroatoms = every atom that is neither hydrogen (1) nor carbon (6).
    heteroatom_count    = sum(1 for a in mol.GetAtoms() if a.GetAtomicNum() not in (1, 6))

    # Substructure flags (stored as 0/1 integers)
    has_pyridine   = int(mol.HasSubstructMatch(_PYRIDINE))
    has_pyrimidine = int(mol.HasSubstructMatch(_PYRIMIDINE))

    # Scaffold
    murcko_scaffold = MurckoScaffold.MurckoScaffoldSmiles(mol=mol)

    # Normalized values (scaling constants adjustable)
    mw_norm   = mol_wt / 900.0
    tpsa_norm = tpsa / 300.0
    logp_norm = (logp + 5.0) / 10.0  # shifts LogP by +5, then divides by 10
    sp3_norm  = fraction_sp3         # FractionCSP3 is reused unscaled

    return {
        'MolWt': mol_wt,
        'LogP': logp,
        'TPSA': tpsa,
        'HBD': num_hbd,
        'HBA': num_hba,
        'RotB': num_rotatable_bonds,
        'RingCount': ring_count,
        'AromaticRingCount': aromatic_ring_count,
        'FractionCSP3': fraction_sp3,
        'HeavyAtomCount': Descriptors.HeavyAtomCount(mol),
        'HeteroAtomCount': heteroatom_count,
        'HasPyridine': has_pyridine,
        'HasPyrimidine': has_pyrimidine,
        'MurckoScaffold': murcko_scaffold,
        'MW_norm': mw_norm,
        'TPSA_norm': tpsa_norm,
        'LogP_norm': logp_norm,
        'SP3_norm': sp3_norm
    }

# Apply to standardized_smiles column
# The per-molecule dicts are turned into a table with a single DataFrame
# construction; expanding them with .apply(pd.Series) would build one Series per
# row, which is far slower. Molecules for which compute_descriptors returned None
# become empty records, i.e. all-NaN descriptor rows.
desc_records = [
    record if record is not None else {}
    for record in df["standardized_smiles"].map(compute_descriptors)
]
# Reuse df's index so the column-wise concat below aligns row for row.
desc_df = pd.DataFrame(desc_records, index=df.index)
df_desc = pd.concat([df, desc_df], axis=1)

print("Shape after descriptors:", df_desc.shape)
df_desc.to_csv("EGFR_05_descriptors.csv", index=False)


Shape after descriptors: (10074, 23)


In [None]:

# Generate ECFP + FCFP fingerprints
def generate_fingerprints(smiles, radius=2, nBits=2048):
    """Build Morgan bit-vector fingerprints for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule. Non-string values (e.g. ``None``/NaN) and blank
        strings are treated as "no molecule".
    radius : int, default 2
        Morgan radius passed to ``GetMorganFingerprintAsBitVect``.
    nBits : int, default 2048
        Length of each bit vector.

    Returns
    -------
    tuple
        ``(ecfp, fcfp)`` as NumPy arrays of length ``nBits``; ``fcfp`` is
        generated with ``useFeatures=True``. ``(None, None)`` is returned when
        ``smiles`` is missing, blank, or cannot be parsed.
    """
    # Chem.MolFromSmiles raises on non-string input, so missing values are turned
    # into the same (None, None) result used for unparseable SMILES.
    if not isinstance(smiles, str) or not smiles.strip():
        return None, None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, None
    ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    fcfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits, useFeatures=True)
    return np.array(list(ecfp)), np.array(list(fcfp))

# Fingerprint settings are defined once so that the generated bit vectors and the
# column labels below always agree on length (labelling 2048-bit vectors with
# 1024 column names makes the DataFrame constructor raise a shape mismatch).
FP_RADIUS = 2
FP_NBITS = 2048

fp_data = df_desc["standardized_smiles"].apply(
    lambda s: generate_fingerprints(s, radius=FP_RADIUS, nBits=FP_NBITS)
)

# generate_fingerprints returns (None, None) for SMILES it cannot handle; those
# rows cannot be expanded into bit columns, so they are reported and left out.
has_fp = fp_data.map(lambda pair: pair[0] is not None).astype(bool)
n_missing = int((~has_fp).sum())
if n_missing:
    print("Molecules without fingerprints (dropped):", n_missing)
fp_valid = fp_data[has_fp]

# reshape(-1, FP_NBITS) keeps the matrix two-dimensional even when no row is valid.
ecfp_df = pd.DataFrame(
    np.array([pair[0] for pair in fp_valid]).reshape(-1, FP_NBITS),
    columns=[f"ECFP_{i}" for i in range(FP_NBITS)],
)
fcfp_df = pd.DataFrame(
    np.array([pair[1] for pair in fp_valid]).reshape(-1, FP_NBITS),
    columns=[f"FCFP_{i}" for i in range(FP_NBITS)],
)

# --- 6) Combine descriptors + fingerprints ---
# The descriptor rows are re-indexed 0..n-1 to line up with the freshly built
# fingerprint frames; the combined table is built once and reused for both files.
final_df = pd.concat([df_desc.loc[has_fp].reset_index(drop=True), ecfp_df, fcfp_df], axis=1)

# Save fingerprints separately
final_df.to_csv("EGFR_step3_with_fingerprints.csv", index=False)

print("Final shape:", final_df.shape)

# Save final dataset
final_df.to_csv("EGFR_final_modeling_dataset.csv", index=False)
