In [1]:
%load_ext autoreload
%autoreload 2

import re
import pickle
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

import rdkit
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem

from utils.features import *

Loaded 97 descriptor functions


In [3]:
df = pd.read_csv("AID_893/AID_893_datatable_all.csv", skiprows=[1,2,3,4,5], header=0, index_col=0)
print((df.PUBCHEM_ACTIVITY_OUTCOME=='Active').sum())
print((df.PUBCHEM_ACTIVITY_OUTCOME=='Inactive').sum())
df

5711
63840


Unnamed: 0_level_0,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Phenotype,Potency,Efficacy,...,Activity at 0.092 uM,Activity at 0.205 uM,Activity at 0.457 uM,Activity at 1.022 uM,Activity at 2.286 uM,Activity at 5.112 uM,Activity at 11.43 uM,Activity at 25.56 uM,Activity at 57.14 uM,Compound QC
PUBCHEM_RESULT_TAG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,846117,648489.0,C1CCN(C1)S(=O)(=O)C2=CC=C(C=C2)S(=O)(=O)N3CCN4...,Inactive,0,,,Inactive,,,...,3.7270,,1.777,,-8.3470,,0.3165,,10.3500,QC'd by DPISMR
2,4253520,266698.0,C1CCC(C1)(C#N)NC2=CC=C(C=C2)C3=CC=C(C=C3)NC4(C...,Inactive,0,,,Inactive,,,...,4.6170,,-1.313,,9.4050,,9.1420,,,QC'd by NCI
3,845570,206650.0,C1CCC2(CC1)C3=C(CCCC3)N=C(S2)N,Inactive,0,,,Inactive,,,...,-3.9390,,-6.395,,-2.9000,,11.3400,,4.2200,QC'd by DPISMR
4,4265391,1261815.0,CC1=CC2=C(C=C1)N=C(C=C2C(=O)N3CCC4(CC3)OCCO4)C...,Inactive,0,,,Inactive,,,...,-0.3424,,-11.830,,6.7500,,-2.0780,,-29.7100,QC'd by DPISMR
5,4250134,3243989.0,CC(C)CCNC(=O)CCCOC1=CC(=O)N(C2=CC=CC=C21)C,Inactive,0,,,Inactive,,,...,5.8420,,1.664,,-3.3650,,-6.7550,,-1.7790,QC'd by DPISMR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75024,3711461,2997645.0,CC1=CC(=C(C=C1)C)C2=NN=C(C3=CC=CC=C32)OCC(=O)N...,Inactive,0,,,Inactive,,,...,4.8250,,-8.253,,-4.6270,,7.4630,,-1.6130,QC'd by DPISMR
75025,861906,663131.0,C1CCC(=CC1)CCN2C(=C(C(=O)NC2=O)C=NCCN3CCNCC3)O,Inactive,0,,,Inactive,,,...,-2.3340,,-6.264,,-4.4200,,3.9570,,10.1900,QC'd by DPISMR
75026,26748798,312915.0,CCN(CC)CCOC(=O)C(CC1CCCO1)CC2=CC=CC3=CC=CC=C32...,Inactive,0,,,Inactive,,,...,-0.4557,,0.514,,0.5472,,0.4700,,-0.4347,QC'd by Microsource
75027,11113905,197584.0,C[C@@H](C1=CC(=C(C=C1)Cl)Cl)NC[C@@H](CP(=O)(CC...,Inactive,0,,,Inactive,,,...,1.5170,-3.631,0.799,-3.478,-3.8690,-2.404,14.2200,5.423,-23.3700,QC'd by Tocris


In [28]:
smiles_ls = df.PUBCHEM_EXT_DATASOURCE_SMILES
print(len(smiles_ls))
to_drop = np.where(smiles_ls.isna())[0] + 1

smiles_ls = smiles_ls.drop(index=to_drop, axis=0).astype(str).to_list()
print(len(smiles_ls))

75028
74988


In [29]:
mol_ls = list(map(lambda x: rdkit.Chem.MolFromSmiles(x), smiles_ls))

In [30]:
activity_ls = df.PUBCHEM_ACTIVITY_OUTCOME
activity_ls = activity_ls.drop(index=to_drop, axis=0).map({'Inactive':0, 'Active':1}).tolist()

ytrue_idxs = np.where(np.array(activity_ls))[0]
ytrue = np.array(smiles_ls)[ytrue_idxs]

In [31]:
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

pains_idxs = []

# initialize filter
params = FilterCatalogParams()
params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
catalog = FilterCatalog(params)


for i, mol in zip(range(len(mol_ls)), mol_ls):
    entry = catalog.GetFirstMatch(mol)  # Get the first matching PAINS
    if entry is not None:
        pains_idxs.append(i)

In [40]:
np.array(activity_ls)

array([0., 0., 0., ..., 0., 0., 0.])

In [32]:
print(f"% PAINS: {len(pains_idxs)/len(activity_ls)}")
print(f"% Hits: {sum(activity_ls)/len(activity_ls)}")

print(f"# PAINs active: {len(pains_idxs) + len(ytrue_idxs) - len(set(pains_idxs+list(ytrue_idxs)))}")
print(f"% PAINs active: {(len(pains_idxs) + len(ytrue_idxs) - len(set(pains_idxs+list(ytrue_idxs)))) / len(ytrue_idxs)}")

% PAINS: 0.039139595668640316
% Hits: nan
# PAINs active: 790
% PAINs active: 0.07068086248546121
