<a href="https://colab.research.google.com/github/jwasswa2023/Physpropnet/blob/main/Data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import math
import swifter  # for efficient parallel apply

# Name -> function lookup built from RDKit's (name, function) descriptor list.
all_descriptors = dict(Descriptors.descList)

def check_bad_salt(smiles):
    """Report whether a SMILES string yields a NaN MaxAbsPartialCharge.

    The SMILES is parsed with ``Chem.MolFromSmiles``. If parsing raises or
    returns ``None``, the input is reported as ignored and ``False`` is
    returned. Otherwise the ``MaxAbsPartialCharge`` descriptor is looked up in
    ``all_descriptors`` and evaluated on the molecule; a NaN result is used
    as a proxy for a "bad salt".

    Args:
        smiles: SMILES text to check.

    Returns:
        ``True`` only when the descriptor evaluates to NaN. ``False`` when the
        descriptor is a number, when the SMILES cannot be parsed, or when the
        descriptor computation raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # A parser error is handled exactly like an unparsable string.
        mol = None

    if mol is None:
        print(f"Ignored SMILES: {smiles}")
        return False

    try:
        # Use MaxAbsPartialCharge as a proxy check
        charge = all_descriptors['MaxAbsPartialCharge'](mol)
        is_nan = math.isnan(charge)
    except Exception:
        print(f"Descriptor failed for: {smiles}")
        return False

    if is_nan:
        print(f"Bad salt SMILES: {smiles}")
    return is_nan

def process_csv(file_path, output_path=None):
    """Load a CSV, flag each SMILES with ``check_bad_salt``, optionally save.

    A boolean-valued ``bad_salts`` column is added to the loaded table, one
    value per row of the ``SMILES`` column.

    Args:
        file_path: Location of the input CSV; passed to ``pd.read_csv``.
        output_path: If truthy, the annotated table is written there (without
            the index) and a confirmation line is printed.

    Returns:
        The DataFrame read from ``file_path`` with the extra ``bad_salts``
        column.

    Raises:
        ValueError: If the CSV has no ``SMILES`` column.
    """
    table = pd.read_csv(file_path)

    if 'SMILES' not in table.columns:
        raise ValueError("CSV must contain a column named 'SMILES'")

    def _flag(value):
        # Each cell is stringified before it is handed to the checker.
        return check_bad_salt(str(value))

    table['bad_salts'] = table['SMILES'].swifter.apply(_flag)

    if not output_path:
        return table

    table.to_csv(output_path, index=False)
    print(f"Processed file saved to: {output_path}")
    return table

# Example: read "data.csv", add the bad_salts column, and write the result
# to "cleaned_data.csv"; the annotated DataFrame is kept in `df`.
df = process_csv(file_path="data.csv", output_path="cleaned_data.csv")



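Some datasets contain SMILES strings that fail to featurize or that featurize into graphs with a single node. The code below removes those entries and saves the cleaned dataset.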

In [None]:
import pandas as pd
import numpy as np
import deepchem as dc

# Read the desalted dataset and pull out the SMILES and target columns.
df = pd.read_csv("/content/desalted_Kh.csv")
smiles_list = df["SMILES"].tolist()
labels = df["LogHL"].values

# Graph featurizer applied to every molecule (edge features enabled).
featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)


def _has_multiple_nodes(feature):
    """Return True when `feature` exposes `node_features` with more than one row."""
    if feature is None or not hasattr(feature, 'node_features'):
        return False
    return feature.node_features.shape[0] > 1


# Parallel lists holding only the molecules that pass the check above.
clean_smiles = []
clean_labels = []
clean_features = []

for smile, label in zip(smiles_list, labels):
    # Featurize one molecule at a time so each result can be inspected.
    feature = featurizer.featurize([smile])[0]

    # Skip anything that did not featurize or has at most one node.
    if not _has_multiple_nodes(feature):
        continue

    clean_smiles.append(smile)
    clean_labels.append(label)
    clean_features.append(feature)

print(f" Cleaned dataset prepared. Retained {len(clean_smiles)} out of {len(smiles_list)} molecules.")

# Labels become a NumPy array for downstream compatibility.
clean_labels = np.array(clean_labels)

# Cleaned table of the retained molecules, for inspection or saving.
clean_df = pd.DataFrame({"SMILES": clean_smiles, "LogHL": clean_labels})

# Write the cleaned table to disk without the index column.
clean_df.to_csv("/content/cleaned_desalted_Kh.csv", index=False)
print("Cleaned dataset saved to cleaned_desalted_Kh.csv")

# clean_features, clean_labels, and clean_smiles are now ready for the GraphSAGE pipeline




✅ Cleaned dataset prepared. Retained 1661 out of 1662 molecules.
💾 Cleaned dataset saved to cleaned_desalted_Kh.csv
