In [1]:
%load_ext autoreload
%autoreload 2

import os
import pickle
import re
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

import rdkit
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem.Scaffolds.MurckoScaffold import GetScaffoldForMol, MakeScaffoldGeneric

from utils.features import *

Loaded 97 descriptor functions


In [37]:
# Read the assay datatable, skipping file rows 1-3 (presumably PubChem's
# per-column metadata rows -- confirm against the export), and keep only the
# rows that carry both a SMILES string and an activity outcome.
assay_csv = "dataset/test/AID_1259354/AID_1259354_datatable_all.csv"
df = pd.read_csv(assay_csv, skiprows=[1, 2, 3], header=0, index_col=0)
df = df.dropna(subset=["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"])

# Class balance: number of active rows, then number of inactive rows.
outcome = df["PUBCHEM_ACTIVITY_OUTCOME"]
print(outcome.eq('Active').sum())
print(outcome.eq('Inactive').sum())
df

1823
54542


Unnamed: 0_level_0,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,% Inhibition
PUBCHEM_RESULT_TAG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,348401744,694901.0,C1=CC=C2C(=C1)C(=O)C(C2=O)C3=CC=NC=C3,Active,,,,99.0
3,348401745,46344002.0,CC(C)C1=CC=C(C=C1)NC(=O)C2=CC3=C(C=C2)N4CCCCC4...,Active,,,,99.0
4,348401746,3772003.0,C1CC(CN(C1)CC2=CC3=C(C=C2)C4=CC=CC=C4C3)CO,Active,,,,98.0
5,348401747,764882.0,CC1=CC=CN2C1=NC(=C(C2=O)C=NC3=CC=C(C=C3)OC)NC,Active,,,,98.0
6,348401748,131841611.0,CC1(CC(CC(N1)(C)C)N2C(=O)OC(C2(C)O)(C)[C@H]3CC...,Active,,,,98.0
...,...,...,...,...,...,...,...,...
77696,348479438,755180.0,C1CN(CCN1C2=CC=CC=C2)C3=NC=NC(=C3)N4C=CC=N4,Inactive,,,,
77697,348479439,705269.0,CCOC(=O)C1=CC=C(C=C1)OCC2=C(ON=C2C)C,Inactive,,,,
77698,348479440,2873589.0,C1=CC=C(C=C1)N2C=C(C(=N2)C3=CC=C(C=C3)Br)C(=O)O,Inactive,,,,
77699,348479441,1084020.0,CCOC1=CC(=CC(=C1OCC)OCC)C2=NC(=NO2)C3=CC=NC=C3,Inactive,,,,


In [38]:
# Parse every SMILES string into an RDKit molecule, in table order.
smiles_ls = df["PUBCHEM_EXT_DATASOURCE_SMILES"].tolist()
mol_ls = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles_ls]

# Morgan fingerprints (radius 2, 1024 bits), one per molecule.
fp_ls = list(
    map(lambda m: AllChem.GetMorganFingerprintAsBitVect(m, radius=2, nBits=1024), mol_ls)
)

# Binary labels: 'Inactive' -> 0, 'Active' -> 1.
outcome_to_label = {'Inactive': 0, 'Active': 1}
activity_ls = df["PUBCHEM_ACTIVITY_OUTCOME"].map(outcome_to_label).tolist()

# Descriptor vectors computed by the project helper from utils.features.
ds_ls = get_descriptors_parallel(mol_ls)



In [39]:
def get_scaffold(mol: rdkit.Chem.rdchem.Mol) -> str:
    """Return the SMILES string of the generic Murcko scaffold of ``mol``.

    The molecule is reduced with ``GetScaffoldForMol``, the result is passed
    through ``MakeScaffoldGeneric``, and the generic scaffold is serialised
    with ``MolToSmiles``.

    Args:
        mol: RDKit molecule to reduce to its scaffold.

    Returns:
        SMILES string of the generic scaffold.
    """
    scaffold = GetScaffoldForMol(mol)
    generic_scaffold = MakeScaffoldGeneric(scaffold)
    # Use the fully qualified ``rdkit.Chem`` (as the other cells do) instead of
    # a bare ``Chem`` name that this notebook never imports explicitly.
    return rdkit.Chem.MolToSmiles(generic_scaffold)

In [40]:
from collections import defaultdict
from rdkit.Chem.Scaffolds.MurckoScaffold import (
    MurckoScaffoldSmiles,
    MakeScaffoldGeneric,
    GetScaffoldForMol,
)

# Map each scaffold SMILES to the positions (in mol_ls) of the molecules
# that share it.
scaffold_dict = defaultdict(list)
for idx, mol in enumerate(mol_ls):
    scaffold_key = get_scaffold(mol)
    scaffold_dict[scaffold_key].append(idx)

In [41]:
# Bundle every per-molecule list plus the scaffold grouping into one dict
# and pickle it next to the source datatable.
data = dict(
    smiles_ls=smiles_ls,
    mol_ls=mol_ls,
    fp_ls=fp_ls,
    activity_ls=activity_ls,
    ds_ls=ds_ls,
    scaffold_dict=scaffold_dict,
)
with open("dataset/test/AID_1259354/data.pkl", "wb") as pkl_handle:
    pickle.dump(data, pkl_handle)

In [42]:
def preprocess_pubchem(path: str, skiprows=(1, 2)) -> None:
    """Featurise a PubChem assay datatable and pickle the result beside it.

    Reads the CSV at ``path``, keeps the rows that have both a SMILES string
    and an activity outcome, parses the SMILES with RDKit, and writes a dict
    to ``data.pkl`` in the same directory as ``path``. The dict has the keys
    ``smiles_ls``, ``mol_ls``, ``fp_ls`` (Morgan fingerprints, radius 2,
    1024 bits), ``activity_ls`` (0 for 'Inactive', 1 for 'Active'),
    ``ds_ls`` (output of ``get_descriptors_parallel``) and ``scaffold_dict``
    (scaffold SMILES -> list of positions in ``mol_ls``).

    Args:
        path: Path to the ``*_datatable_all.csv`` file.
        skiprows: Zero-based file rows to skip when reading the CSV
            (presumably PubChem's per-column metadata rows; the count differs
            between exports -- confirm per assay). Defaults to rows 1 and 2.

    Returns:
        None. The result is only written to disk.

    Warns:
        UserWarning: if some SMILES cannot be parsed by RDKit; those rows are
            dropped so that all output lists stay aligned.
    """
    df = pd.read_csv(path, skiprows=list(skiprows), header=0, index_col=0)
    df = df.dropna(subset=["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"])

    # MolFromSmiles returns None for SMILES it cannot parse; passing None on to
    # the fingerprint call would raise, so drop those rows (with a warning).
    parsed = [rdkit.Chem.MolFromSmiles(smi) for smi in df.PUBCHEM_EXT_DATASOURCE_SMILES]
    keep = np.asarray([mol is not None for mol in parsed], dtype=bool)
    n_bad = int(len(keep) - keep.sum())
    if n_bad:
        warnings.warn(
            f"{path}: dropped {n_bad} row(s) whose SMILES could not be parsed by RDKit"
        )
        df = df.loc[keep]

    smiles_ls = df.PUBCHEM_EXT_DATASOURCE_SMILES.tolist()
    mol_ls = [mol for mol in parsed if mol is not None]
    fp_ls = [
        AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        for mol in mol_ls
    ]
    # NOTE(review): outcomes other than 'Inactive'/'Active' are not in the
    # mapping, so Series.map leaves NaN for them -- confirm this is intended.
    activity_ls = df.PUBCHEM_ACTIVITY_OUTCOME.map({'Inactive': 0, 'Active': 1}).tolist()
    ds_ls = get_descriptors_parallel(mol_ls)

    # Group molecule positions by their generic scaffold SMILES.
    scaffold_dict = defaultdict(list)
    for idx, mol in enumerate(mol_ls):
        scaffold_dict[get_scaffold(mol)].append(idx)

    data = {
        "smiles_ls": smiles_ls,
        "mol_ls": mol_ls,
        "fp_ls": fp_ls,
        "activity_ls": activity_ls,
        "ds_ls": ds_ls,
        "scaffold_dict": scaffold_dict
    }
    # os.path.join keeps a bare filename (empty dirname) in the current
    # directory; plain concatenation with "/data.pkl" would target the
    # filesystem root in that case.
    out_path = os.path.join(os.path.dirname(path), "data.pkl")
    with open(out_path, "wb") as file:
        pickle.dump(data, file)

In [43]:
# Featurise the remaining assay datatables with the same pipeline.
for csv_path in (
    "dataset/test/AID_488969/AID_488969_datatable_all.csv",
    "dataset/test/AID_598/AID_598_datatable_all.csv",
):
    preprocess_pubchem(csv_path)

  df = pd.read_csv(path, skiprows=[1,2,], header=0, index_col=0)
