In [1]:
#Import the necessary libraries
import os
import sys
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from rdkit import Chem
from duplicate_molecules import *

### Training labels dataset

In [2]:
#Load the training dataset containing the molecules and 12 assay labels
# Every '*.smiles' file in the working directory is read as a tab-separated,
# header-less table with three columns: SMILES, ID and a label column that is
# named after the file (file name without the '.smiles' extension).
smiles_ext = '.smiles'
combinedDF = pd.DataFrame(columns=['SMILES','ID'])
# os.listdir() returns entries in arbitrary order; sorting keeps the column
# order (and the merge order) of the combined table reproducible.
for filename in sorted(os.listdir(os.getcwd())):
    if filename.endswith(smiles_ext):
        df = pd.read_csv(filename, header=None, sep='\t')
        df.columns=['SMILES','ID',filename[:-len(smiles_ext)]]
        # Outer merge: keep molecules that appear in only some of the files.
        combinedDF = combinedDF.merge(df, on=['SMILES', 'ID'], how='outer')
# Rebind instead of dropping in place so the statement has no hidden side effect.
combinedDF = combinedDF.drop(columns='ID')
df_train=combinedDF.rename(columns={"SMILES": "smiles"})
print(df_train.shape)

(11764, 13)


In [3]:
#Preview the first two rows of the training labels table
df_train.head(2)

Unnamed: 0,smiles,nr-ahr,nr-ar-lbd,nr-ar,nr-aromatase,nr-er-lbd,nr-er,nr-ppar-gamma,sr-are,sr-atad5,sr-hse,sr-mmp,sr-p53
0,CC(O)=O.[H][C@@]12CCC3=CC(=CC=C3[C@@]1(C)CCC[C...,0.0,,0.0,,,,,,,,,
1,Cl.C[C@@H](NCCCC1=CC=CC(=C1)C(F)(F)F)C2=CC=CC3...,0.0,,0.0,,,,,,,,,


### Evaluation labels dataset

In [4]:
#Load the evaluation dataset containing the molecules and 12 assay labels
keep_cols = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
            'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53', 'ROMol']
df_leaderbd = PandasTools.LoadSDF('tox21_10k_challenge_test.sdf')[keep_cols]

#Convert every ROMol into its SMILES string, then discard the ROMol column
smiles_from_mol = df_leaderbd['ROMol'].apply(Chem.MolToSmiles)
df_leaderbd = df_leaderbd.drop('ROMol', axis=1)
df_leaderbd['smiles'] = smiles_from_mol

#Reorder so that smiles is the first column, followed by the assay columns
cols = ['smiles'] + [name for name in df_leaderbd.columns if name != 'smiles']
df_leaderbd = df_leaderbd[cols]
#Lower-case the column names
df_leaderbd.columns = [name.lower() for name in df_leaderbd.columns]
print(df_leaderbd.shape)

(295, 13)


RDKit ERROR: [22:38:45] Explicit valence for atom # 20 O, 3, is greater than permitted
RDKit ERROR: [22:38:45] ERROR: Could not sanitize molecule ending on line 17054


In [5]:
#Preview the first two rows of the evaluation labels table
df_leaderbd.head(2)

Unnamed: 0,smiles,nr-ar,nr-ar-lbd,nr-ahr,nr-aromatase,nr-er,nr-er-lbd,nr-ppar-gamma,sr-are,sr-atad5,sr-hse,sr-mmp,sr-p53
0,CNc1ncnc2c1ncn2[C@@H]1O[C@H](CO)C(O)[C@H]1O,0,0,0.0,,0,0.0,0,0.0,1,0.0,0.0,0
1,Oc1cc(O)cc(/C=C/c2ccc(O)c(O)c2)c1,0,0,,,1,,0,,1,,,1


### Testing labels dataset

In [6]:
#Load the testing dataset containing the molecules and 12 assay labels
# Both files are tab-separated, which is read_table's default delimiter.
# Sample ID and SMILES columns
df_score_smiles = pd.read_table('tox21_10k_challenge_score.smi')
# Sample ID and the 12 assay label columns
df_score_results = pd.read_table('tox21_10k_challenge_score.csv')

In [7]:
#Combine the df_score_smiles and df_score_results to create the testing labels dataset
# The two tables are stacked and collapsed per 'Sample ID'; groupby(...).first()
# takes the first non-null value of every column, which lines each SMILES up
# with its assay results.
# Missing values are deliberately left as NaN: filling them with the string
# 'NA' would let a missing SMILES slip past isnull() checks and would make
# 'NA' look like a real label value in the assay columns.
df_test = (
    pd.concat([df_score_smiles, df_score_results])
    .groupby('Sample ID', as_index=False, sort=False)
    .first()
    .drop('Sample ID', axis=1)
    # 'x' entries are turned into missing values
    # (presumably 'x' marks an untested assay - TODO confirm against the data description).
    .replace('x', np.nan)
    .rename(columns={'#SMILES': 'smiles'})
)
df_test.columns= df_test.columns.str.lower()
print(df_test.shape)

(647, 13)


In [8]:
#Preview the first two rows of the testing labels table
df_test.head(2)

Unnamed: 0,smiles,nr-ahr,nr-ar,nr-ar-lbd,nr-aromatase,nr-er,nr-er-lbd,nr-ppar-gamma,sr-are,sr-atad5,sr-hse,sr-mmp,sr-p53
0,OC(=O)\C=C/C(O)=O.C[C@]12CC=C3[C@@H](CCC4=CC(=...,0,1,,0.0,0,0,0,,0,0,,0
1,[Na+].NC1=NC=NC2=C1N=C(Br)N2C1OC2CO[P@]([O-])(...,0,1,,,0,0,0,0.0,0,0,0.0,0


### Standardization step

In [9]:
import molvs as mv
#Function to get parent of a smiles
def get_parent_smile(smile):
    """Return the SMILES of the MolVS charge parent of ``smile``.

    Parameters
    ----------
    smile : str
        SMILES string of the input molecule.

    Returns
    -------
    str
        * the SMILES of ``molvs.Standardizer().charge_parent(mol)`` on success;
        * ``'problematic'`` when the molecule contains a bonded atom whose
          element is not one of H, B, C, N, O, F, Si, P, S, Cl, Se, Br, I;
        * ``'NaN'`` (the string, not a float) when the SMILES cannot be parsed
          or when parsing/standardization raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return 'NaN'
        # Matches any atom outside the allowed element list that has at least
        # one bond (the trailing "~*"); unbonded atoms are not matched.
        smts = Chem.MolFromSmarts("[!#1&!#5&!#6&!#7&!#8&!#9&!#14&!#15&!#16&!#17&!#34&!#35&!#53]~*")
        if mol.HasSubstructMatch(smts):
            return 'problematic'
        st = mv.Standardizer()
        return Chem.MolToSmiles(st.charge_parent(mol))
    except Exception:
        # Deliberately best-effort: any failure marks the molecule as 'NaN'.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long .apply() run can still be interrupted.
        return 'NaN'
    
#Function to clean and standardize the data
def clean_data_(data):
    """Standardize the molecules of ``data`` and drop the ones that cannot be used.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``smiles`` column. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` with a non-null ``smiles`` whose standardization
        succeeded, plus a ``smiles_parent`` column holding the result of
        ``get_parent_smile``. Rows for which ``get_parent_smile`` returned
        ``'NaN'`` or ``'problematic'`` are removed.

    Side effects
    ------------
    Prints the number of rows that were flagged ``'problematic'``.
    """
    #remove missing smiles
    # .copy() makes this an independent frame, so adding a column below neither
    # touches the caller's data nor triggers SettingWithCopyWarning.
    data = data[~(data['smiles'].isnull())].copy()

    #Standardize and get parent with molvs
    data["smiles_parent"] = data.smiles.apply(get_parent_smile)
    #Drop the molecules for which get_parent_smile returned the 'NaN' marker
    data = data[~(data['smiles_parent'] == "NaN")]

    #Identify the compounds that get_parent_smile flagged as 'problematic'
    pCompounds = data[data['smiles_parent']=='problematic']['smiles']
    data = data[~(data['smiles_parent'] == 'problematic')]
    print(len(pCompounds))
    return data

In [10]:
#Standardization and obtaining parent smiles
def _clean_and_report(raw_frame):
    """Run clean_data_ on ``raw_frame``, print the shape of the result and return it."""
    cleaned = clean_data_(raw_frame)
    print(cleaned.shape)
    return cleaned

clean_eval = _clean_and_report(df_leaderbd) #292 molecules
clean_train = _clean_and_report(df_train) #11628 molecules
clean_test = _clean_and_report(df_test) #631 molecules

3
(292, 14)


RDKit ERROR: [22:38:45] ERROR: Explicit valence for atom # 20 O, 3, is greater than permitted
RDKit ERROR: [22:38:51] Explicit valence for atom # 3 Si, 8, is greater than permitted
RDKit ERROR: [22:38:54] Explicit valence for atom # 0 Cl, 2, is greater than permitted
RDKit ERROR: [22:38:56] Can't kekulize mol.  Unkekulized atoms: 3 10
RDKit ERROR: 
RDKit ERROR: [22:38:56] Can't kekulize mol.  Unkekulized atoms: 3 10
RDKit ERROR: 
RDKit ERROR: [22:38:58] Explicit valence for atom # 1 B, 5, is greater than permitted
RDKit ERROR: [22:38:59] Explicit valence for atom # 2 Cl, 2, is greater than permitted
RDKit ERROR: [22:39:00] Explicit valence for atom # 1 B, 5, is greater than permitted
RDKit ERROR: [22:39:03] Explicit valence for atom # 3 Si, 8, is greater than permitted
RDKit ERROR: [22:39:03] Explicit valence for atom # 1 B, 5, is greater than permitted
RDKit ERROR: [22:39:03] Explicit valence for atom # 1 B, 5, is greater than permitted
RDKit ERROR: [22:39:05] Explicit valence for ato

126
(11628, 14)


RDKit ERROR: [22:39:16] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 10
RDKit ERROR: 
RDKit ERROR: [22:39:16] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10 11 12 13 14
RDKit ERROR: 


14
(631, 14)


### Removing duplicates and inconsistent labels

In [11]:
#Functions to remove inconsistent and duplicate molecules
# Rules applied per molecule (rows sharing a smiles_parent) and per assay:
#  - If a compound is duplicated, record the most frequently occurring activity.
#  - If the most frequent activities are tied, e.g. 4 duplicates with 2 actives
#    and 2 inactives, the label is ambiguous and is left missing for that assay
#    (the compound itself stays in the table).
#  - If there is no duplicate, the compound keeps its label.
def moder(x):
    """Return the single most frequent non-missing value of the Series ``x``.

    Returns ``None`` when there is no unique mode: either several values are
    tied for most frequent, or ``x`` holds no non-missing value at all.
    """
    m = pd.Series.mode(x)
    if len(m) == 1:
        # Positional access: does not depend on the index labels of the mode.
        return m.iloc[0]
    return None

assays = ['nr-ar', 'nr-ar-lbd', 'nr-ahr', 'nr-aromatase', 'nr-er', 'nr-er-lbd', 'nr-ppar-gamma', 'sr-are', 'sr-atad5', 'sr-hse', 'sr-mmp', 'sr-p53']

def duplicate_label(data):
    """Collapse rows that share a ``smiles_parent`` into one row per molecule.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``smiles_parent`` column and one column per entry of
        ``assays``.

    Returns
    -------
    pandas.DataFrame
        Column ``smiles_parent`` followed by one column per assay, with one row
        per distinct ``smiles_parent``. Each label is the unique mode (see
        ``moder``) of that molecule's numeric labels, or missing (None/NaN)
        when there is no unique mode.

    Notes
    -----
    Labels are converted to numbers before voting. The inputs mix label types
    (e.g. strings such as '0' next to floats such as 0.0); without the
    conversion '0' and 0.0 would be counted as two different values, so
    duplicates that actually agree would look like a tie and lose their label.
    Anything that is not a number (for example 'NA') becomes missing.
    """
    dataDF = pd.DataFrame(columns=['smiles_parent'])
    for i in assays:
        # Built positionally (to_numpy) so a duplicated row index in `data`
        # cannot cause misalignment.
        a = pd.DataFrame({
            'smiles_parent': data['smiles_parent'].to_numpy(),
            i: pd.to_numeric(data[i], errors='coerce').to_numpy(),
        })
        res = a.groupby('smiles_parent')[i].apply(moder)
        # reset_index turns the group key back into a regular column to merge on.
        dataDF = dataDF.merge(res.reset_index(), on=['smiles_parent'], how='outer')
    return dataDF

In [12]:
#Stack the cleaned train and evaluation datasets on top of each other
stacked_train_eval = pd.concat([clean_train, clean_eval])
print(stacked_train_eval.shape)
#Collapse duplicated molecules to one row each
df_train_eval = duplicate_label(stacked_train_eval)
#Store any None label as NaN
train_set = df_train_eval.replace(to_replace=[None], value=np.nan)
#Sanity check: True would mean a parent SMILES still occurs more than once
train_duplicate_mask = train_set['smiles_parent'].duplicated()
train_duplicate_mask.any()

(11920, 14)


  warn(f"Unable to sort modes: {err}")


False

In [13]:
#Save the de-duplicated training labels; to_csv also writes the DataFrame index as the first, unnamed column
train_set.to_csv("tox21_train_preproccessed.csv")

In [14]:
#Collapse duplicated molecules in the test set to one row each
testDF = duplicate_label(clean_test)
print(testDF.shape)
#Store any None label as NaN
testing_data = testDF.replace(to_replace=[None], value=np.nan)
#Sanity check: True would mean a parent SMILES still occurs more than once
test_duplicate_mask = testing_data['smiles_parent'].duplicated()
test_duplicate_mask.any()

(628, 13)


False

In [15]:
#Save the de-duplicated testing labels; to_csv also writes the DataFrame index as the first, unnamed column
testing_data.to_csv("tox21_test_preproccessed.csv")