Install the required library

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.6


Import the libraries required for this study

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize
import pandas as pd

Load the data that you want to augment from a CSV file.

In [None]:
# `your_data` is a placeholder that is not defined in the cells above; set it to
# the path of the input CSV before running this cell. Later cells read the
# 'IL No.', 'Name', 'SMILES' and 'Experimental logEC50' columns from `data`.
data = pd.read_csv(your_data)

Create the function for data augmentation

In [None]:
def augment_smiles(smiles, n_random=5):
    """Return a de-duplicated list of alternative SMILES strings for one molecule.

    The result combines the canonical SMILES, ``n_random`` randomized SMILES
    and the SMILES of every tautomer produced by RDKit's ``TautomerEnumerator``.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to augment.
    n_random : int, default 5
        Number of randomized SMILES to generate (before de-duplication).

    Returns
    -------
    list of str
        Unique SMILES in first-seen order: the canonical form first, then the
        randomized forms, then the tautomer forms.

    Raises
    ------
    TypeError
        If ``smiles`` is not a string (for example a NaN from an empty cell).
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Missing values read by pandas arrive as float NaN; reject them up front
    # with a clear message instead of an opaque RDKit argument error.
    if not isinstance(smiles, str):
        raise TypeError(
            f"smiles must be a string, got {type(smiles).__name__}: {smiles!r}"
        )

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None, not by raising.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # Generate canonical SMILES
    canonical_smiles = Chem.MolToSmiles(mol, canonical=True)

    # Generate random SMILES (RDKit's random atom ordering is not seeded here,
    # so these strings can differ from run to run).
    random_smiles_list = [
        Chem.MolToSmiles(mol, canonical=False, doRandom=True)
        for _ in range(n_random)
    ]

    # Generate tautomers
    enumerator = rdMolStandardize.TautomerEnumerator()
    tautomer_smiles_list = [
        Chem.MolToSmiles(taut) for taut in enumerator.Enumerate(mol)
    ]

    # Combine all SMILES. dict.fromkeys removes duplicates while keeping the
    # first-seen order, unlike set(), whose string ordering changes per run.
    combined = [canonical_smiles] + random_smiles_list + tautomer_smiles_list
    return list(dict.fromkeys(combined))

In [None]:
# Augment the dataset: emit one record per augmented SMILES, carrying over the
# molecule's identifier, name and measured logEC50 unchanged.
augmented_data = []

for _, record in data.iterrows():
    source_smiles = record['SMILES']
    measured_value = record['Experimental logEC50']
    augmented_data.extend(
        {
            'IL No.': record['IL No.'],
            'Name': record['Name'],
            'SMILES': variant,
            'Experimental logEC50': measured_value,
        }
        for variant in augment_smiles(source_smiles)
    )

# Build the table once from the collected records.
augmented_df = pd.DataFrame(augmented_data)

Save your augmented data in CSV format.

In [None]:
# Keep the output location in one variable so the reload cell below reads the
# same file that is written here. Replace the placeholder with a real path.
CSV_file_path = '/yourAddres/Augmented_Book1.csv'
augmented_df.to_csv(CSV_file_path, index=False)


Reload the augmented data from the CSV file

In [None]:
# Read the augmented table back from disk. `CSV_file_path` should be the path
# of the CSV written by the save cell above.
data_augmented = pd.read_csv(CSV_file_path)

In [None]:
# Display the (rows, columns) size of the reloaded augmented table.
data_augmented.shape

(2119, 4)

In [None]:
!pip install pandas openpyxl



If you want more detailed insight into your augmented data (for instance, counting the unique SMILES in each dataset and calculating how many duplicates were eliminated), you can run the code below.

In [None]:
# Distinct SMILES strings present in the original (pre-augmentation) table.
original_smiles = data.loc[:, "SMILES"].unique()

In [None]:
# Get the unique SMILES of the augmented data.
# `augmented_data` is the plain list of dicts built by the augmentation loop and
# cannot be indexed by column name, so read the column from the DataFrame.
augmented_smiles = augmented_df["SMILES"].unique()

In [None]:
# Count unique SMILES in each dataset
original_count = len(original_smiles)
augmented_count = len(augmented_smiles)

# Net number of unique SMILES strings gained through augmentation.
net_increase = augmented_count - original_count

# Calculate duplicates eliminated: rows of the augmented table whose SMILES
# repeats an earlier row, i.e. what taking the unique values removed.
# (Subtracting the augmented count from the original count gives a negative
# number whenever augmentation adds SMILES, which is not a duplicate count.)
duplicates_eliminated = len(augmented_df) - augmented_count

# Print results
print(f"Original SMILES Count: {original_count}")
print(f"Augmented SMILES Count: {augmented_count}")
print(f"Net Increase in Unique SMILES: {net_increase}")
print(f"Duplicates Eliminated: {duplicates_eliminated}")

Original SMILES Count: 355
Augmented SMILES Count: 2062
Duplicates Eliminated: -1707
