In [2]:
import dgl
from collections import defaultdict
from dgl.nn.pytorch.glob import AvgPooling
from dgllife.model import load_pretrained
from dgllife.model.model_zoo import *
from dgllife.utils import mol_to_bigraph, PretrainAtomFeaturizer, PretrainBondFeaturizer
import numpy as np
import pandas as pd
import pickle
from rdkit import Chem
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

def collate(gs):
    """Combine a list of DGL graphs into a single batched graph.

    Intended for use as a ``DataLoader`` ``collate_fn``: the samples in
    ``gs`` are handed to ``dgl.batch`` unchanged and its result is returned.
    """
    batched_graph = dgl.batch(gs)
    return batched_graph


In [5]:
# Load the pretrained GIN checkpoint and keep it on the CPU.
# Names listed in the original note: contextpred infomax edgepred masking
# (presumably the other available pretraining variants - confirm in dgllife docs).
model = load_pretrained('gin_supervised_infomax').to('cpu')
# Inference only: put the network into evaluation mode.
model.eval()

# Legacy input kept for reference (no longer executed):
# with open('/tf/notebooks/code_for_pub/smiles_files/smiles_drugcomb_BY_cid_duplicated.pickle','rb') as f:
#     b = pickle.load(f)

# Annotation table; its 'new_smiles' column is what gets featurized later on.
# NOTE(review): this directory is spelled 'pathway-mll' while the output cell
# writes to 'pathway_mll' - confirm which spelling is intended.
annotation_df = pd.read_csv(
    '../pathway-mll/imszcp_data/annotation_manual_fixed.csv',
    encoding="ISO-8859-1",
)

Downloading gin_supervised_infomax_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gin_supervised_infomax.pth...
Pretrained model loaded


In [10]:
# Convert every SMILES string in annotation_df['new_smiles'] into a DGL
# bi-directed graph carrying the atom/bond features the pretrained GIN expects.

# Build the featurizers once and reuse them for every molecule instead of
# constructing two new objects per SMILES.
atom_featurizer = PretrainAtomFeaturizer()
bond_featurizer = PretrainBondFeaturizer()

graphs = []
# Row labels of annotation_df whose SMILES could not be turned into a graph.
# Recorded so that dropped rows are visible instead of vanishing silently;
# `graphs` (and anything computed from it) has no entry for these rows.
skipped_rows = []
b_res = annotation_df['new_smiles']
for row_label, smi in b_res.items():
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit could not parse this SMILES string.
            skipped_rows.append(row_label)
            continue
        g = mol_to_bigraph(mol, add_self_loop=True,
                           node_featurizer=atom_featurizer,
                           edge_featurizer=bond_featurizer,
                           canonical_atom_order=True)
    except Exception:
        # Best effort: a molecule that fails to parse or featurize (e.g. a
        # non-string entry) is skipped. Unlike a bare `except:`, this lets
        # KeyboardInterrupt / SystemExit propagate so the cell can be stopped.
        skipped_rows.append(row_label)
        continue
    graphs.append(g)
del b_res

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} of {len(annotation_df)} SMILES "
          f"that could not be converted to graphs")

In [13]:
# Feed the graphs through the model 256 at a time, keeping their original
# order (shuffle=False) so row i of the result corresponds to graphs[i].
data_loader = DataLoader(graphs, batch_size=256, collate_fn=collate, shuffle=False)

# Readout that reduces node representations to graph-level rows.
readout = AvgPooling()

mol_emb = []
for batched_graph in data_loader:
    batched_graph = batched_graph.to('cpu')
    # pop() removes each feature tensor from the graph as it is collected.
    node_inputs = [batched_graph.ndata.pop(key).to('cpu')
                   for key in ('atomic_number', 'chirality_type')]
    edge_inputs = [batched_graph.edata.pop(key).to('cpu')
                   for key in ('bond_type', 'bond_direction_type')]
    # Inference only: no autograd bookkeeping for the forward pass.
    with torch.no_grad():
        node_repr = model(batched_graph, node_inputs, edge_inputs)
    pooled = readout(batched_graph, node_repr)
    mol_emb.append(pooled)
# Concatenate the per-batch results along the first axis and hand back a numpy array.
mol_emb = torch.cat(mol_emb, dim=0).detach().cpu().numpy()

In [20]:
#http://86.50.253.156:8888/notebooks/notebooks/code_for_pub/_5_make_infomaxFP.ipynb
# Attach the embeddings to the annotation table, one embedding row per
# annotation row.
#
# The embeddings come out in graph order, and SMILES that fail to parse are
# skipped while the graphs are built. Positional alignment therefore only
# holds when nothing was skipped; fail loudly otherwise rather than pairing
# embeddings with the wrong compounds.
if len(mol_emb) != len(annotation_df):
    raise ValueError(
        f"Got {len(mol_emb)} embeddings for {len(annotation_df)} annotation rows; "
        "rows whose SMILES were skipped during graph construction must be "
        "removed from annotation_df before the two can be joined."
    )
# Index by annotation_df's own index so the column-wise concat lines up row
# for row. (This previously used `b.index`, but the only definition of `b`
# is commented out, so a fresh kernel raised NameError here.)
fps_infomax_new = pd.DataFrame(data=mol_emb, index=annotation_df.index)
fps_infomax_new = pd.concat((annotation_df, fps_infomax_new), axis=1)
# drugs_name = '/tf/notebooks/code_for_pub/smiles_files/drugcomb_drugs_export_OCT2020.csv'
# drugs = pd.read_csv(drugs_name, names=['dname','id', 'smiles', 'cid'], header=0) # oct2020 version

# mapping = defaultdict(list) 
# for i in drugs.itertuples(): # map cid to id
#     mapping[i.cid] = i.id
# fps_infomax_new['id'] = fps_infomax_new.index
# fps_infomax_new['id'] = fps_infomax_new['id'].map(mapping)
# fps_infomax_new = fps_infomax_new.set_index('id', drop=True)

In [21]:
# Preview the first rows of the combined annotation + embedding table.
fps_infomax_new.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,RackCode,Plate Location,VialCode,Cat. No.,Compound Name,Synonyms,CAS No.,M.Wt,...,290,291,292,293,294,295,296,297,298,299
0,0,0,HYCPK37909,A2,,HY-N7075,Inulin,,9005-80-5,,...,-0.295861,0.339213,0.030114,0.04125,-0.143652,0.016088,-0.118622,0.3585,0.043871,0.115406
1,1,7,HYCPK37909,A9,,HY-100589,Isepamicin (sulfate),Sch 21420 (sulfate),67814-76-0,,...,-0.17522,0.266567,0.060309,0.055059,0.00967,0.018011,0.007363,0.164561,0.151743,0.202761
2,2,10,HYCPK37910,A3,,HY-B2162,Chondroitin (sulfate),Chondroitin polysulfate,9007-28-7,,...,-0.066393,0.139899,0.008509,-0.00235,-0.024057,0.016829,-0.021586,0.288418,0.295866,0.21582
3,3,11,HYCPK37910,A4,,HY-A0276,Gentamicin (sulfate),,1405-41-0,561.65 (Average),...,-0.046858,0.162902,0.047013,0.12069,-0.013501,0.018784,-0.011576,0.13207,0.321714,0.256947
4,4,14,HYCPK37911,A2,,HY-A0020,Eldecalcitol,"ED-71; 2-(3-hydroxypropoxy)-1,25-dihydroxyvita...",104121-92-8,490.72,...,-0.10107,0.050156,0.024114,-0.038566,0.007401,0.018976,-0.087203,0.000458,0.001519,0.098078


In [23]:
# Persist the combined annotation + embedding table as a pickle.
# NOTE(review): this directory is spelled 'pathway_mll' while the annotation CSV
# is read from 'pathway-mll' - confirm which spelling is intended.
with open('../pathway_mll/imszcp_data/fps_infomax_new.pkl', 'wb') as out_file:
    pickle.dump(fps_infomax_new, out_file)