In [1]:
import pandas as pd
import torch
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sys
import os
import pickle

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdmolops

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split



In [2]:
# Folder holding the raw input files. The trailing slash is required because
# later cells build paths by plain string concatenation (data_folder + name).
data_folder = 'raw_data/'
# File name of the Excel workbook that is read into `chems_data`.
data_source = '1 Supplemental Excel data.xlsx'

In [3]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Sheet index 1 of the workbook, with row index 1 used as the column header.
excel_path = data_folder + data_source
chems_data = pd.read_excel(excel_path, sheet_name=1, header=1)

# The remaining sheets are currently not loaded:
# chems_extfp = pd.read_excel(data_folder+data_source, sheet_name = 2, header=0)
# chems_maccs = pd.read_excel(data_folder+data_source, sheet_name = 3, header=0)
# chems_fcfp4 = pd.read_excel(data_folder+data_source, sheet_name = 4, header=0)

In [4]:
# Names of the `chems_data` columns that flag substances excluded in
# robustness check 2; rows where the flag equals 1 are removed from the
# positive/negative lists in a later cell.
robcmr = 'Excluded in robustness check 2 - CMR'
robpbt = 'Excluded in robustness check 2 - PBT/vPvB'
# robed  = 'Excluded in robustness check 0 - ED'
# Name of the column holding the SMILES string, used to match rows across tables.
sm = 'SMILES'

In [5]:
# All list files are semicolon-separated and parsed with the Python engine.
list_csv_options = {'sep': ';', 'engine': 'python'}

# Extended-fingerprint lists (negative / positive).
neg_ext = pd.read_csv(data_folder + '06_ExtendedFingerprint_Negative_List.csv', **list_csv_options)
pos_ext = pd.read_csv(data_folder + '06_ExtendedFingerprint_Positive_List.csv', **list_csv_options)

# MACCS lists (negative / positive).
neg_maccs = pd.read_csv(data_folder + '01_MACCS_Negative_List.csv', **list_csv_options)
pos_maccs = pd.read_csv(data_folder + '01_MACCS_Positive_List.csv', **list_csv_options)

# FCFP4 lists are currently not loaded:
# neg_fcfp = pd.read_csv(data_folder+'15_FCFP4_Negative_List.csv', sep=';', engine='python')
# pos_fcfp = pd.read_csv(data_folder+'15_FCFP4_Positive_List.csv', sep=';', engine='python')

In [6]:
# SMILES of every substance flagged (value 1) in the respective exclusion column.
cmr_excluded = chems_data.loc[chems_data[robcmr] == 1, sm].tolist()
pbt_excluded = chems_data.loc[chems_data[robpbt] == 1, sm].tolist()

# Keep only the rows whose SMILES is not on the matching exclusion list.
neg_ext = neg_ext.loc[~neg_ext[sm].isin(cmr_excluded)]
pos_ext = pos_ext.loc[~pos_ext[sm].isin(cmr_excluded)]

neg_maccs = neg_maccs.loc[~neg_maccs[sm].isin(pbt_excluded)]
pos_maccs = pos_maccs.loc[~pos_maccs[sm].isin(pbt_excluded)]

# FCFP4 / ED filtering is currently disabled:
# neg_fcfp = neg_fcfp.loc[~neg_fcfp[sm].isin(chems_data[sm].loc[chems_data[robed] == 1].tolist())]
# pos_fcfp = pos_fcfp.loc[~pos_fcfp[sm].isin(chems_data[sm].loc[chems_data[robed] == 1].tolist())]

In [7]:
# Columns carried over from the list files in addition to the label column.
fp_cols = ['SMILES']

# Stack positives on top of negatives. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0; like append, it keeps the original row
# index of each part.
chems_ext = pd.concat([pos_ext[fp_cols + ['CMR']], neg_ext[fp_cols + ['CMR']]])

# chems_extC = pd.concat([pos_ext[fp_cols+['C']], neg_ext[fp_cols+['C']]])
# chems_extM = pd.concat([pos_ext[fp_cols+['M']], neg_ext[fp_cols+['M']]])
# chems_extR = pd.concat([pos_ext[fp_cols+['R']], neg_ext[fp_cols+['R']]])

chems_maccs = pd.concat([pos_maccs[fp_cols + ['PBT/vPvB']], neg_maccs[fp_cols + ['PBT/vPvB']]])
# smiles_maccs = pd.concat([pos_maccs['SMILES'], neg_maccs['SMILES']])

# chems_maccsPBT = pd.concat([pos_maccs[fp_cols+['PBT']], neg_maccs[fp_cols+['PBT']]])
# chems_maccsvPvB = pd.concat([pos_maccs[fp_cols+['vPvB']], neg_maccs[fp_cols+['vPvB']]])

# chems_fcfp = pd.concat([pos_fcfp[fp_cols+['ED']], neg_fcfp[fp_cols+['ED']]])

In [8]:
# Shape of the positive (1) and then negative (0) subset of each labelled table.
for labelled_frame, label_column in ((chems_ext, 'CMR'), (chems_maccs, 'PBT/vPvB')):
    for class_value in (1, 0):
        print(labelled_frame.loc[labelled_frame[label_column] == class_value].shape)
# print(chems_fcfp.loc[chems_fcfp['ED']==1].shape)

(271, 2)
(628, 2)
(113, 2)
(725, 2)


In [9]:
def create_representation2(smiles_data, features, dictionary_bonds, tox_label):
    """Convert a table of SMILES strings into bond-labelled adjacency matrices.

    Each row's SMILES string is parsed with RDKit. Every non-zero entry of the
    molecule's adjacency matrix is replaced by the code that
    ``dictionary_bonds`` assigns to the bond type between the two atoms.

    Parameters
    ----------
    smiles_data : pandas.DataFrame
        Must contain a ``'SMILES'`` column and the column named by ``tox_label``.
    features : object
        Not used by this function; kept so existing calls keep working.
    dictionary_bonds : dict
        Maps ``str(bond.GetBondType())`` to the numeric code written into the
        matrix.
    tox_label : str
        Name of the column in ``smiles_data`` that holds the label.

    Returns
    -------
    pandas.DataFrame
        One row per molecule with columns ``'matrix'`` (square integer
        ``numpy.ndarray``), ``'atoms'`` (list of atomic numbers in RDKit atom
        order) and ``'tox'`` (the label value).

    Raises
    ------
    ValueError
        If RDKit cannot parse a SMILES string.
    KeyError
        If a bond type is missing from ``dictionary_bonds``.
    """
    matrices_dataset = []

    for mol_idx in range(smiles_data.shape[0]):
        # get molecule from SMILES; RDKit returns None when parsing fails
        smiles = smiles_data['SMILES'].iloc[mol_idx]
        mcule = Chem.MolFromSmiles(smiles)
        if mcule is None:
            raise ValueError(
                'Could not parse SMILES at row %d: %r' % (mol_idx, smiles)
            )

        # plain 0/1 adjacency matrix and an integer matrix for the bond codes
        adj_matrix = np.asarray(Chem.GetAdjacencyMatrix(mcule))
        adj_mat = np.zeros(adj_matrix.shape, dtype=int)

        # visit every (i, j) with a bond; both directions are present in the
        # adjacency matrix, so the result stays symmetric
        for atom1, atom2 in np.argwhere(adj_matrix != 0):
            bond = mcule.GetBondBetweenAtoms(int(atom1), int(atom2))
            bond_str = str(bond.GetBondType())
            if bond_str not in dictionary_bonds:
                raise KeyError(
                    'Bond type %r is missing from dictionary_bonds' % bond_str
                )
            # use the mapping passed in by the caller, not a notebook global
            adj_mat[atom1, atom2] = dictionary_bonds[bond_str]

        # atomic number of each atom, in RDKit atom order
        atom_labels = [atom.GetAtomicNum() for atom in mcule.GetAtoms()]

        # add adjacency matrix with bond codes, atom labels and label to dataset
        tox_lab = smiles_data[tox_label].iloc[mol_idx]
        matrices_dataset.append([adj_mat, atom_labels, tox_lab])

    return pd.DataFrame(matrices_dataset, columns=['matrix', 'atoms', 'tox'])

In [10]:
def prepare_onehotencoded(rep_dataset):
    """Re-index the atom labels of a dataset to consecutive integers.

    Every distinct value found in the ``'atoms'`` lists is assigned its rank
    in the sorted set of distinct values (0, 1, 2, ...). Despite the name, the
    result is an integer index encoding, not one-hot vectors.

    Parameters
    ----------
    rep_dataset : pandas.DataFrame
        Must contain an ``'atoms'`` column whose cells are iterables of
        hashable, sortable labels (e.g. lists of atomic numbers).

    Returns
    -------
    tuple
        ``(rep_dataset, dict_onehot)``. ``rep_dataset`` is the same frame
        object with its ``'atoms'`` column replaced by freshly built lists of
        indices (the lists originally stored in the column are left
        untouched). ``dict_onehot`` maps each index back to the original
        label. An empty dataset yields an empty dictionary.
    """
    atom_lists = list(rep_dataset['atoms'].values)

    # sorted distinct labels; position in this list is the new index
    atom_set = sorted({atom for atoms in atom_lists for atom in atoms})
    index_of = {atom: idx for idx, atom in enumerate(atom_set)}

    # single pass with a dict lookup; builds new lists so the caller's
    # original lists are not overwritten
    rep_dataset['atoms'] = [[index_of[atom] for atom in atoms] for atoms in atom_lists]

    # reverse lookup: index -> original label
    dict_onehot = dict(enumerate(atom_set))

    return rep_dataset, dict_onehot

In [11]:
def transform_DGL(dataframe):
    """Turn a representation frame into a list of per-molecule dictionaries.

    The ``'matrix'`` and ``'atoms'`` columns of ``dataframe`` are converted to
    torch tensors and written back to ``dataframe`` itself, so the caller's
    frame is modified. One dictionary is then built per row with the keys
    ``'num_atom'`` (length of the atom tensor as ``int``), ``'atom_type'``
    (atom tensor), ``'bond_type'`` (matrix tensor) and ``'label'`` (the value
    of the ``'tox'`` column).

    Returns the list of dictionaries in row order.
    """
    def _as_tensor(value):
        return torch.tensor(value)

    # convert both array-like columns in place on the passed frame
    for column in ('matrix', 'atoms'):
        dataframe[column] = dataframe[column].apply(_as_tensor)

    atom_column = dataframe['atoms']
    bond_column = dataframe['matrix']
    label_column = dataframe['tox']

    # one info dictionary per molecule, addressed by row position
    return [
        {
            'num_atom': int(len(atom_column.iloc[row])),
            'atom_type': atom_column.iloc[row],
            'bond_type': bond_column.iloc[row],
            'label': label_column.iloc[row],
        }
        for row in range(dataframe.shape[0])
    ]