In [27]:
import pandas as pd
from rdkit import Chem
import os
import random

In [28]:
# CSV file that this notebook reads and splits
INPUT_DATASET_PATH = '../DataSets/XAI/3MR/toy_label_mw350.csv'
# Dataset identifier; interpolated into OUTPUTDIR below
DATASETNAME = '3MR'
# Substructure to split on; the filter cell accepts 'Benzene' or 'Halogens'
FILTER = 'Halogens'  # alternatives: 'Benzene', 'Halogens'
# Name of the dataset column that holds the SMILES strings
SMILESCOL = 'SMILES'
# Directory that receives the two output CSV files
OUTPUTDIR = f'../DataSets/Splits/{DATASETNAME}/'

In [29]:
def contains_benzene(smiles):
    """Report whether the molecule described by ``smiles`` has a benzene ring.

    Args:
        smiles: SMILES string of the molecule to inspect.

    Returns:
        The result of an RDKit substructure search for the aromatic
        six-membered carbon ring ``c1ccccc1``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    # Aromatic six-membered carbon ring used as the search pattern
    ring_pattern = Chem.MolFromSmiles('c1ccccc1')
    return molecule.HasSubstructMatch(ring_pattern)

def contains_halogen(smiles):
    """Report whether the molecule described by ``smiles`` has a halogen atom.

    Only fluorine, chlorine, bromine and iodine are counted as halogens.

    Args:
        smiles: SMILES string of the molecule to inspect.

    Returns:
        True if at least one atom's symbol is F, Cl, Br or I; otherwise False.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    # Element symbols are compared whole, so e.g. 'Fe' does not match 'F'
    halogen_symbols = ('F', 'Cl', 'Br', 'I')
    return any(
        atom.GetSymbol() in halogen_symbols
        for atom in molecule.GetAtoms()
    )

def trim_dataset(df, amt, seed=None):
    """Drop ``amt`` randomly chosen rows from ``df`` in place.

    Args:
        df: DataFrame to shrink. It is modified in place.
        amt: Number of index labels to sample and drop.
        seed: Optional seed for the row selection. When given, a dedicated
            ``random.Random(seed)`` generator is used so the same rows are
            removed on every run. When None (the default), the module-level
            ``random`` state is used, exactly as before.

    Returns:
        None.

    Raises:
        ValueError: Propagated from ``sample`` when ``amt`` is negative or
            larger than the number of rows.

    Note:
        Rows are dropped by index label, so this assumes the index labels are
        unique; a duplicated label would remove every row that carries it.
    """
    # A private generator keeps seeded runs from disturbing global random state
    rng = random if seed is None else random.Random(seed)
    rows_to_remove = rng.sample(df.index.tolist(), amt)
    df.drop(rows_to_remove, inplace=True)

In [30]:
# Load the source dataset from the configured CSV path into a DataFrame
df = pd.read_csv(INPUT_DATASET_PATH)

In [31]:
# Map each supported FILTER value to the predicate that detects it
filter_predicates = {
    'Benzene': contains_benzene,
    'Halogens': contains_halogen,
}
if FILTER not in filter_predicates:
    raise ValueError(f'Filter {FILTER} not recognized!')
contains_filter = filter_predicates[FILTER]

# Flag every molecule. The SMILES column name comes from the SMILESCOL
# config constant rather than a hard-coded literal, so changing the config
# actually takes effect here.
df['contains_filter'] = df[SMILESCOL].apply(contains_filter)

# Split on the flag; dropping the helper column returns new frames, so the
# subsets are independent of ``df`` and nothing is mutated in place.
has_filter = df['contains_filter'].astype(bool)
contains_subset = df[has_filter].drop(columns=['contains_filter'])
not_contains_subset = df[~has_filter].drop(columns=['contains_filter'])

print(f'Size containing: {len(contains_subset)}')
print(f'Size not containing: {len(not_contains_subset)}')

# # Trim the largest dataset, to make them equal in size
# if(len(contains_subset) > len(not_contains_subset)):
#     trim_dataset(contains_subset, len(contains_subset) - len(not_contains_subset))
# elif(len(not_contains_subset) > len(contains_subset)):
#     trim_dataset(not_contains_subset, len(not_contains_subset) - len(contains_subset))
    

Size containing: 448
Size not containing: 2429


In [32]:
# Make sure the output directory exists. ``exist_ok=True`` removes the
# check-then-create race, and a real failure (permissions, a file in the way)
# is raised here instead of being printed and then resurfacing as a less
# direct error from ``to_csv`` below.
output_dir_existed = os.path.isdir(OUTPUTDIR)
os.makedirs(OUTPUTDIR, exist_ok=True)
if not output_dir_existed:
    print(f"Directory '{OUTPUTDIR}' created successfully.")

# Write both halves of the split; file names carry the active FILTER value
contains_subset.to_csv(os.path.join(OUTPUTDIR, f'contains_{FILTER}.csv'), index=False)
not_contains_subset.to_csv(os.path.join(OUTPUTDIR, f'not_contains_{FILTER}.csv'), index=False)

In [33]:
# Print the working frame as plain text; at this point it still carries the
# helper 'contains_filter' column that the filter cell added to ``df``.
print(df)

        index                                          SMILES  \
0        3236   O=C1c2cccc(OCc3ccc(C(F)(F)F)cc3)c2C(=O)C2OC12   
1        3865  C=C(C)C1CC(OC(C)=O)C2(C)CC3OC3(C)CCC=C(C)CCC12   
2        5572                     c1ccc2c(C3CO3)c3ccccc3nc2c1   
3        5838             COC1C(O)CCC2(CO2)C1C1(C)OC1CC=C(C)C   
4        6554             O=C1c2ccccc2C(=O)c2c(NCC3CO3)cccc21   
...       ...                                             ...   
2872   893759                           CCN1CC=C(c2ccccc2)CC1   
2873  1137976                                Cc1ccc2ccncc2c1N   
2874   742932           CCN1CCN(CC(=O)c2c[nH]c3ccc(C)cc23)CC1   
2875   604244        c1cncc(-c2c[nH]c(C3COCCN3Cc3ccncc3)n2)c1   
2876   475546           CCCN(Cc1ccccc1)C(S)=Nc1ccc(C(C)=O)cc1   

                   label  label_full  smarts0  smarts1  smarts2  smarts3  \
0     [1, 0, 0, 0, 0, 0]           1        1        0        0        0   
1     [1, 0, 0, 0, 0, 0]           1        1        0        0    