In [1]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator
from rdkit import DataStructs
from mordred import Calculator, descriptors
import mordred

from tqdm import tqdm

In [2]:
endpoint = 'skin-sensitization'
# endpoint = 'eye-irritation'

loc = r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data'
endpoint_loc = os.path.join(loc, endpoint)

In [3]:
filename = 'data.csv'
df = pd.read_csv(os.path.join(endpoint_loc, filename))

In [4]:
df.head()

Unnamed: 0,CASRN,SMILES,Activity
0,1655500-83-6,CC(CC=C(C)C)C1CC1(C)CO,1
1,2082-81-7,CC(=C)C(=O)OCCCCOC(=O)C(C)=C,1
2,75-33-2,CC(C)S,1
3,16958-92-2,CCCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCCCC,1
4,106-26-3,CC(C)=CCCC(C)=CC=O,1


#### MODI Index

For Binary classification

$$MODI = \frac{1}{2} \sum_{i=1}^2 \frac{N_i^{same}}{N_i^{total}}$$

In [5]:
def customTanimoto(a, b):
    return np.sum(a*b)/(np.sum(a**2) + np.sum(b**2) - np.sum(a*b))

def getMODIindex(df, fps, fp = 'rdkit'):
    '''
        df should contain the followinng columns
        SMILES - smiles string
        Activity - respective classification
    '''
    modified_df = df.copy()
    modified_df['fps'] = fps

    Modi = 0
    for activity in modified_df['Activity'].unique():
        temp_df = modified_df[modified_df['Activity'] == activity]
        Ni_same = 0
        Ni_total = 0
        for i in tqdm(range(temp_df.shape[0])):
            sim = []
            if temp_df['fps'].iloc[i] is None:
                continue
            for j in range(modified_df.shape[0]):
                if modified_df['fps'].iloc[j] is None:
                    continue
                if temp_df['SMILES'].iloc[i] != modified_df['SMILES'].iloc[j]:
                    if fp == 'rdkit':
                        sim.append((DataStructs.TanimotoSimilarity(temp_df['fps'].iloc[i], modified_df['fps'].iloc[j]),
                                    temp_df['Activity'].iloc[i] == modified_df['Activity'].iloc[j]))
                    else:
                        sim.append((customTanimoto(temp_df['fps'].iloc[i], modified_df['fps'].iloc[j]), 
                                    temp_df['Activity'].iloc[i] == modified_df['Activity'].iloc[j]))
            
            max_element = max(sim, key = lambda x: x[0])

            if max_element[1]:
                Ni_same += 1
            Ni_total += 1

        Modi += Ni_same/Ni_total

    Modi = Modi/len(modified_df['Activity'].unique())

    return Modi

##### Using MACCS

In [6]:
'''
    Using MACCS Fingerprints
'''
fps = []
for smiles in df['SMILES']:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        fps.append(None)
    else:
        fps.append(MACCSkeys.GenMACCSKeys(mol))

assert len(fps) == df.shape[0]

[13:46:14] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
[13:46:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4


In [7]:
Modi = getMODIindex(df, fps)
print('Modi for MACCS Keys :', Modi)

100%|██████████| 2021/2021 [06:01<00:00,  5.58it/s]
100%|██████████| 1674/1674 [05:25<00:00,  5.14it/s]

Modi for MACCS Keys : 0.6629017653707989





##### Using Morgan

In [8]:
'''
    Using Morgan Fingerprints r = 3 and nbits = 2048
'''
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)
fps = []
for smiles in df['SMILES']:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        fps.append(None)
    else:
        fps.append(fpg.GetFingerprint(mol))

assert len(fps) == df.shape[0]

[14:02:17] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
[14:02:17] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4


In [9]:
Modi = getMODIindex(df, fps)
print('Modi for Morgan fingerprints :', Modi)

100%|██████████| 2021/2021 [06:35<00:00,  5.12it/s]
100%|██████████| 1674/1674 [05:15<00:00,  5.30it/s]

Modi for Morgan fingerprints : 0.657179401442834





##### Using MORDRED

In [10]:
calc = Calculator(descriptors, ignore_3D=True)
mol_list = []
for smiles in df['SMILES']:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        continue
    else:
        mol_list.append(mol)

[14:14:09] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
[14:14:09] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4


In [11]:
df_mordred = calc.pandas(mol_list)

  0%|          | 7/3693 [00:02<10:46,  5.70it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  1%|          | 44/3693 [00:03<02:17, 26.49it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 146/3693 [00:05<01:15, 46.84it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 164/3693 [00:06<01:35, 36.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  6%|▌         | 220/3693 [00:08<02:02, 28.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 15%|█▍        | 548/3693 [00:20<02:52, 18.26it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 17%|█▋        | 630/3693 [00:23<02:21, 21.65it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 20%|█▉        | 726/3693 [00:29<03:10, 15.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 3693/3693 [02:32<00:00, 24.27it/s]


In [12]:
df_mordred.shape

(3693, 1613)

In [13]:
truth_map = df_mordred.applymap(lambda x : not isinstance(x, mordred.error.MissingValueBase))

In [14]:
truth_series = truth_map.all(axis=0)

In [15]:
truth_series

ABC          True
ABCGG        True
nAcid        True
nBase        True
SpAbs_A      True
            ...  
WPol         True
Zagreb1      True
Zagreb2      True
mZagreb1    False
mZagreb2     True
Length: 1613, dtype: bool

In [16]:
truth_series.sum()

1027

In [17]:
'''
    Using Mordred Fingerprints
'''
fps = []
for smiles in tqdm(df['SMILES']):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        fps.append(None)
    else:
        fps.append(np.array(calc(mol))[truth_series.to_numpy()])

assert len(fps) == df.shape[0]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
 27%|██▋       | 992/3695 [01:45<05:41,  7.91it/s][14:18:33] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
 92%|█████████▏| 3381/3695 [06:52<00:37,  8.44it/s][14:23:40] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
100%|██████████| 3695/3695 [07:30<00:00,  8.20it/s]


In [18]:
Modi = getMODIindex(df, fps, fp='custom')
print('Modi for Mordred fingerprints :', Modi)

100%|██████████| 2021/2021 [1:26:30<00:00,  2.57s/it]
100%|██████████| 1674/1674 [1:21:34<00:00,  2.92s/it]

Modi for Mordred fingerprints : 0.5681088102832996





##### Skin Sensitization

Modi index for Maccs keys = 0.6629 \
Modi index for Morgan keys = 0.6571 \
Modi index for Mordred keys = 0.5681

##### Eye irritation

Modi index for Maccs keys = 0.6918 \
Modi index for Morgan keys = 0.7006 \
Modi index for Mordred keys = 