In [2]:
# load herg dataset

import pandas as pd
# Read the raw hERG table from the project's data directory.
df = pd.read_csv(filepath_or_buffer='../data/herg/herg_central.csv')

In [3]:
# Keep only the SMILES column ('X') and the hERG_inhib column, then discard
# every row that has a missing value in either of them.
df = df.loc[:, ['X', 'hERG_inhib']].dropna(axis=0, how='any')

In [4]:
# clean smiles

import rdkit.Chem as Chem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.MolStandardize.rdMolStandardize import Uncharger
from typing import List
from rdkit import RDLogger
import logging
from sklearn import metrics

# disable RDKit warnings
RDLogger.DisableLog("rdApp.*")


def clean_smiles(smiles_list: List[str]) -> List[str | None]:
    """Standardize SMILES strings: parse, strip salts, uncharge, and re-emit as SMILES.

    The output is positionally aligned with the input so it can be assigned
    straight back onto the source column. Entries are NOT removed and NOT
    deduplicated here; unusable entries are returned as ``None`` and it is up
    to the caller to drop them.

    Args:
        smiles_list: Iterable of SMILES strings (a list or a pandas Series both work).

    Returns:
        A list with one item per input entry: the cleaned SMILES, or ``None``
        when the entry is not a string, cannot be parsed by RDKit, or has no
        atoms left after salt stripping.
    """
    un = Uncharger()
    salt_remover = SaltRemover()
    cleaned_smiles = []

    for smiles in smiles_list:
        # Non-string entries (e.g. NaN) cannot be parsed; mark them invalid
        # instead of letting MolFromSmiles raise on the wrong argument type.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            logging.debug(f"Invalid SMILES in the dataset: {smiles}")
            cleaned_smiles.append(None)
            continue

        # Remove salts
        mol = salt_remover.StripMol(mol)
        # A record that consisted only of salt fragments (or an empty string)
        # ends up with zero atoms and would serialize to "", which a later
        # dropna() would not catch. Treat it as invalid instead.
        if mol.GetNumAtoms() == 0:
            logging.debug(f"SMILES is empty after salt stripping: {smiles}")
            cleaned_smiles.append(None)
            continue

        # Uncharge the molecule
        mol = un.uncharge(mol)
        # Convert back to SMILES
        cleaned_smiles.append(Chem.MolToSmiles(mol))

    return cleaned_smiles

# Row count before cleaning, so the effect of the cleaning step can be reported.
pre_cleaning_len = df.shape[0]

# Replace the SMILES column with its cleaned version; entries that could not
# be cleaned come back as None and are dropped together with their rows.
cleaned_smiles = clean_smiles(df['X'])
df['X'] = cleaned_smiles
df.dropna(axis=0, how='any', inplace=True)

post_cleaning_len = df.shape[0]
print(f"{pre_cleaning_len} -> {post_cleaning_len} after cleaning")

306893 -> 306893 after cleaning


In [5]:
# Split the table by the hERG_inhib value: rows equal to 0 are the negatives,
# rows equal to 1 are the positives. Copies are taken so that columns added
# later do not write into a view of df.
neg_df = df.loc[df['hERG_inhib'].eq(0)].copy()
pos_df = df.loc[df['hERG_inhib'].eq(1)].copy()

print(f"Number of negative samples: {len(neg_df)}")
print(f"Number of positive samples: {len(pos_df)}")

Number of negative samples: 293149
Number of positive samples: 13744


In [6]:
# cluster negative samples

import rdkit.Chem.AllChem as AllChem
from rdkit import Chem

# Fingerprint generators keyed by (radius, fp_size). Building a generator is
# loop-invariant work, so it is done once per configuration and reused across
# the many per-row calls made through Series.apply.
_MORGAN_GENERATORS = {}


def morgan_fp(smiles, radius=2, fp_size=512):
    """Compute the Morgan fingerprint of a molecule given as a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed to the generator (default 2).
        fp_size: Fingerprint size passed to the generator as ``fpSize`` (default 512).

    Returns:
        The object returned by the generator's ``GetFingerprint`` for the
        parsed molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending input instead of an opaque binding error
        # from handing None to GetFingerprint.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    key = (radius, fp_size)
    generator = _MORGAN_GENERATORS.get(key)
    if generator is None:
        generator = AllChem.GetMorganGenerator(radius=radius, fpSize=fp_size)
        _MORGAN_GENERATORS[key] = generator
    return generator.GetFingerprint(mol)

# One fingerprint per negative sample, stored next to its SMILES.
neg_df['fp'] = neg_df['X'].map(morgan_fp)

In [None]:
# cluster negative samples using k-means
from sklearn.cluster import KMeans
from rdkit.Chem import DataStructs
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors



 22%|██▏       | 63130/293148 [04:36<54:09, 70.77it/s]  

In [None]:
# Butina is not imported by any cell visible above this one; import it here so
# the cell does not depend on leftover kernel state after a restart.
from rdkit.ML.Cluster import Butina

# Cutoff handed to Butina.ClusterData as its distance threshold.
# NOTE(review): since isDistData=True this is a *distance* cutoff, and 0.9 is
# very permissive — confirm it was not meant as a similarity threshold.
THRESH = 0.9

# Cluster the negative samples using Butina algorithm
# NOTE(review): `distance_matrix` is not defined in the cells shown here; it is
# presumably the pairwise distance data for neg_df's fingerprints — confirm it
# is built before this cell runs.
clusters = Butina.ClusterData(distance_matrix, len(neg_df), THRESH, isDistData=True)
print(f"Number of clusters: {len(clusters)}")