In [1]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator
from rdkit import DataStructs
from mordred import Calculator, descriptors

In [2]:
# Toxicity endpoint to analyse; its data set lives in a sub-folder of the same name.
# endpoint = 'skin-sensitization'
endpoint = 'eye-irritation'

# Root data folder. The original local path stays the default so existing runs are
# unaffected; set the SEMINAR_TOXICITY_DATA environment variable to point the
# notebook at another checkout without editing this cell.
loc = os.environ.get(
    'SEMINAR_TOXICITY_DATA',
    r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data',
)
endpoint_loc = os.path.join(loc, endpoint)

In [3]:
# Read the selected endpoint's CSV file into df.
filename = 'data.csv'
data_path = os.path.join(endpoint_loc, filename)
df = pd.read_csv(data_path)

In [4]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,CASRN,SMILES,Activity
0,51581-32-9,CN(C)C(=O)OC1C=CC=NC=1,1
1,35155-28-3,CN1C=C2CC3N(C)CC(CO)CC3(OC)C3C=CC=C1C=32,1
2,289-95-2,C1N=CC=CN=1,1
3,77-78-1,COS(=O)(=O)OC,1
4,80-73-9,CN1CCN(C)C1=O,1


#### MODI Index

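For binary classification, MODI is the average over the two activity classes of the fraction of compounds whose nearest neighbour (the most similar other compound in the data set) belongs to the same class. $N_i^{total}$ is the number of compounds in class $i$ and $N_i^{same}$ is the number of those whose nearest neighbour is also in class $i$: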

$$MODI = \frac{1}{2} \sum_{i=1}^2 \frac{N_i^{same}}{N_i^{total}}$$

In [5]:
def customTanimoto(a, b):
    '''
        Tanimoto similarity for real-valued vectors:
        sum(a*b) / (sum(a**2) + sum(b**2) - sum(a*b))

        a, b - numeric array-likes of matching (broadcastable) shape

        Returns 1.0 when the denominator is zero. For real inputs that only
        happens when both vectors are all zeros, i.e. identical, and the plain
        formula would otherwise evaluate 0/0 (nan plus a RuntimeWarning).
        NaN entries in either vector still propagate to a NaN result.
    '''
    # Coerce once so lists and object-dtype arrays of plain numbers are handled
    # with ordinary float arithmetic.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    ab = np.sum(a*b)  # computed once, used in numerator and denominator
    denominator = np.sum(a**2) + np.sum(b**2) - ab
    if denominator == 0:
        return 1.0
    return ab/denominator

def getMODIindex(df, fps, fp = 'rdkit', similarity = None):
    '''
        Compute the MODI index: the mean, over activity classes, of the
        fraction of compounds whose most similar *other* compound has the
        same activity.

        df should contain the following columns
        SMILES - smiles string
        Activity - respective classification

        fps - one fingerprint per row of df, in row order; None marks a
              compound without a fingerprint, which is ignored both as a
              query and as a neighbour
        fp - 'rdkit' compares fingerprints with DataStructs.TanimotoSimilarity,
             any other value uses customTanimoto
        similarity - optional callable similarity(fp_a, fp_b) returning a
                     comparable score (higher = more similar); when given it
                     overrides the choice made through fp

        Returns the MODI value averaged over the classes that contain at least
        one compound with a usable nearest neighbour, or nan when no class does.

        Raises ValueError if fps does not have one entry per row of df.
    '''
    # Pull the columns into plain lists once; positional pandas lookups inside
    # the quadratic loop below are very slow.
    smiles = df['SMILES'].tolist()
    activities = df['Activity'].tolist()
    fps = list(fps)
    if len(fps) != len(smiles):
        raise ValueError('fps must contain exactly one entry per row of df')

    if similarity is None:
        similarity = DataStructs.TanimotoSimilarity if fp == 'rdkit' else customTanimoto

    # Rows that actually have a fingerprint.
    valid = [k for k, fingerprint in enumerate(fps) if fingerprint is not None]

    Modi = 0
    n_classes = 0
    for activity in df['Activity'].unique():
        Ni_same = 0
        Ni_total = 0
        for i in valid:
            if activities[i] != activity:
                continue

            best_sim = None
            best_is_same = False
            for j in valid:
                # Rows with an identical SMILES string (the compound itself and
                # exact duplicates) are never candidates for nearest neighbour.
                if smiles[i] == smiles[j]:
                    continue
                sim = similarity(fps[i], fps[j])
                # Strict '>' keeps the first neighbour on ties, like max().
                if best_sim is None or sim > best_sim:
                    best_sim = sim
                    best_is_same = activities[i] == activities[j]

            # No other compound to compare against: skip instead of failing.
            if best_sim is None:
                continue

            if best_is_same:
                Ni_same += 1
            Ni_total += 1

        # A class without any evaluable compound has no defined ratio.
        if Ni_total == 0:
            continue
        Modi += Ni_same/Ni_total
        n_classes += 1

    if n_classes == 0:
        return float('nan')

    return Modi/n_classes

In [6]:
'''
    Using MACCS Fingerprints
'''
# One entry per row of df; None keeps the slot when RDKit returns no molecule.
fps = []
for smiles in df['SMILES']:
    mol = Chem.MolFromSmiles(smiles)
    fps.append(None if mol is None else MACCSkeys.GenMACCSKeys(mol))

assert df.shape[0] == len(fps)

[22:14:25] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[22:14:25] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[22:14:25] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 15 16 17 19 20 21 25 27 28
[22:14:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
[22:14:26] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 18 19 20 21 22 23
[22:14:26] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18


In [7]:
# MODI on the current fingerprints, compared with RDKit's Tanimoto similarity.
Modi = getMODIindex(df, fps, fp='rdkit')
print('Modi for MACCS Keys :', Modi)

Modi for MACCS Keys : 0.6918367346938776


In [8]:
'''
    Using Morgan Fingerprints r = 3 and nbits = 2048
'''
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)
# One entry per row of df; None keeps the slot when RDKit returns no molecule.
fps = []
for smiles in df['SMILES']:
    mol = Chem.MolFromSmiles(smiles)
    fps.append(None if mol is None else fpg.GetFingerprint(mol))

assert df.shape[0] == len(fps)

[22:30:20] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[22:30:20] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[22:30:20] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 15 16 17 19 20 21 25 27 28
[22:30:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
[22:30:20] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 18 19 20 21 22 23
[22:30:20] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18


In [9]:
# MODI on the current fingerprints, compared with RDKit's Tanimoto similarity.
Modi = getMODIindex(df, fps, fp='rdkit')
print('Modi for Morgan fingerprints :', Modi)

Modi for Morgan fingerprints : 0.700597127739985


In [5]:
# Trial run: evaluate the Mordred calculator on the first SMILES only.
calc = Calculator(descriptors, ignore_3D=True)
fps = []
for smiles in df['SMILES'].iloc[:1]:
    mol = Chem.MolFromSmiles(smiles)
    fps.append(None if mol is None else np.array(calc(mol)))

In [9]:
# Print every value computed for the first molecule, one per line.
for descriptor_value in fps[0]:
    print(descriptor_value)

8.773010658942226
8.182020864407647
0
0
14.565903310504035
2.2116102388780656
4.423220477756131
14.565903310504034
1.213825275875336
3.3681069927275558
3.2598284381370677
0.27165236984475566
1.3639961244494339
43.58964762813187
3.632470635677656
3.957141239233924
6
6
22
12
0
0
4
10
0
8
2
2
0
0
0
0
0
0
0
192.0
174.0
231.0
176.0
124.0
112.0
49.0
6.0
0.0
64.0
72.0
105.0
99.0
85.0
62.0
51.0
46.0
20.0
113.80555555555554
81.33333333333334
134.61111111111111
142.16666666666669
148.50000000000003
98.49999999999999
80.66666666666666
63.0
22.0
524.0
558.0
742.0
711.0
531.0
456.0
363.0
189.0
54.0
2068.6017079999992
2115.809715999999
2770.7724470000007
2569.0544050000003
1810.496721
1664.4014809999999
1221.473006
512.5380979999999
102.95308800000001
4618.3825586760195
5354.674182590245
6881.573187271842
6875.079950711334
5241.601742927067
4468.3097988201225
4232.296732795553
2828.7085225638057
1104.3953975497307
174.61547200000007
175.29365600000006
283.87417200000004
295.4708119999999
326.649132


In [None]:
'''
    Using Mordred Fingerprints
'''
calc = Calculator(descriptors, ignore_3D=True)
# One entry per row of df; None keeps the slot when RDKit returns no molecule.
fps = []
for smiles in df['SMILES']:
    mol = Chem.MolFromSmiles(smiles)
    fps.append(None if mol is None else np.array(calc(mol)))

assert df.shape[0] == len(fps)

In [None]:
# Spot check: similarity between the first two entries of fps, shown as the cell output.
customTanimoto(fps[0], fps[1])

In [None]:
# MODI on the current fps; any fp value other than 'rdkit' selects customTanimoto.
Modi = getMODIindex(df, fps, 'custom')
print('Modi for Mordred fingerprints :', Modi)

##### Skin Sensitization

MODI for MACCS keys = 0.6629 \
MODI for Morgan fingerprints = 0.6571 \
MODI for Mordred descriptors = -

##### Eye irritation

MODI for MACCS keys = 0.6918 \
MODI for Morgan fingerprints = 0.7006 \
MODI for Mordred descriptors = - (not computed)