<a href="https://colab.research.google.com/github/jwasswa2023/Physpropnet/blob/main/Code_for_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# clean & install (run, then restart runtime)
# Uninstall any existing DGL / PyTorch-family packages first
# (presumably so the pinned versions below are not mixed with preinstalled builds).
!pip uninstall -y dgl torch torchvision torchaudio torchdata
# Upgrade the packaging toolchain itself before installing anything else.
!pip install -U pip setuptools wheel

# Core (CPU wheels)
# The torch packages are pinned and fetched from the PyTorch CPU wheel index;
# torchdata is pinned separately and comes from the default index.
!pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu
!pip install torchdata==0.7.1

# DGL (works with Torch 2.2.x)
!pip install dgl==2.1.0   # if this complains, try: pip install dgl==2.2.1

# Your libs
# NOTE(review): these three are installed unpinned (-U pulls the latest release),
# so the resolved versions can differ between runs - consider pinning them.
!pip install -U deepchem rdkit scikit-learn

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import SaltRemover
import numpy
import deepchem as dc

# Function to check if a SMILES string is valid
def is_valid_smiles(smiles):
    """Return True if ``smiles`` is a non-empty string that RDKit can parse.

    Args:
        smiles: Candidate SMILES value, typically one cell of a DataFrame
            column. Missing values (``None``, ``NaN``) and other non-string
            objects are accepted and reported as invalid.

    Returns:
        bool: True when ``Chem.MolFromSmiles`` returns a molecule for a
        non-blank string, False otherwise.
    """
    # Missing cells arrive as None or float NaN. RDKit's parser expects a
    # string, so reject anything else here instead of passing it through.
    # Blank strings are rejected too: RDKit parses "" into an atom-less
    # molecule rather than None, which would otherwise count as "valid".
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    return Chem.MolFromSmiles(smiles) is not None

# Function to remove salts
def remove_salts(smiles, remover=None):
    """Strip salt fragments from a SMILES string and return the cleaned SMILES.

    Args:
        smiles: Input SMILES value. Non-string values (``None``, ``NaN``)
            and blank strings are treated as invalid.
        remover: Optional ``SaltRemover.SaltRemover`` instance to use. When
            omitted, a remover with RDKit's default salt definitions is
            created for this call. Passing one in lets a caller reuse a
            single instance (or custom salt definitions) across many calls.

    Returns:
        str or None: SMILES produced by ``Chem.MolToSmiles`` for the stripped
        molecule, or None if the input is missing, cannot be parsed, or has
        no atoms left after salt stripping.
    """
    # Guard against missing cells before handing the value to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # Return None if the molecule is invalid
    if remover is None:
        remover = SaltRemover.SaltRemover()
    clean_mol = remover.StripMol(mol)
    # If every fragment was a salt, the stripped molecule is empty and would
    # serialize to "". Return None instead so that a later
    # dropna(subset=['clean_SMILES']) actually removes the row.
    if clean_mol.GetNumAtoms() == 0:
        return None
    return Chem.MolToSmiles(clean_mol)

# NOTE(review): `df3` is not created in any cell shown in this notebook; it has
# to be loaded (with a 'SMILES' column) before this point, otherwise
# Restart & Run All fails here with a NameError.

# Remove invalid SMILES. The validity flags are kept in a standalone Series
# rather than a temporary 'is_valid' column, so the input frame is not modified
# and there is no helper column to drop afterwards.
valid_mask = df3['SMILES'].apply(is_valid_smiles)

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df3 = df3.loc[valid_mask].copy()

# Remove salts and clean SMILES
df3['clean_SMILES'] = df3['SMILES'].apply(remove_salts)

# Remove rows where clean_SMILES is None
df3 = df3.dropna(subset=['clean_SMILES'])

print(df3.head())