In [1]:
import torch
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader
from deepdtagen.demo.demo_utils import *
from deepdtagen.demo.model_aff import DeepDTAGen
from deepdtagen.utils import *

# 1. Environments

In [2]:
dataset_name = 'bindingdb'

## Setup device

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

## Paths

In [4]:
filepath_model = os.path.join('models', f'deepdtagen_model_{dataset_name}.pth')
filepath_tokenizer = os.path.join('data', f'{dataset_name}_tokenizer.pkl')

## Load Tokenizer

In [5]:
import sys
import deepdtagen.utils as dutils
sys.modules['utils'] = dutils

with open(filepath_tokenizer, 'rb') as f:
    tokenizer = pickle.load(f)

In [6]:
print(len(tokenizer))

107


## Load Model

In [7]:
model = DeepDTAGen(tokenizer)

In [8]:
model.load_state_dict(torch.load(filepath_model, map_location=device))

<All keys matched successfully>

In [9]:
_ = model.to(device)

In [10]:
_ = model.eval()

## Test Data

In [11]:
# seqs_smi = [
#     "N#CCC1CCN(Cc2cccc(-c3cc4c(c(S(=O)(=O)Nc5ccsc5C(=O)N[C@@H](CCCNC(=N)N)C(=O)O)c3)OCC4)c2)CC1",
#     "CCS(=O)(=O)CCC(=O)CNC(=O)N1Cc2ccccc2Oc2ccc(Cl)cc21",
#     "CCCCCN1C=C(C2=CC=CC=C21)C(=O)C3C(C3(C)C)(C)C",
#     "CC1=C(C2=C3N1[C@@H](COC3=CC=C2)CN4CCOCC4)C(=O)C5=CC=CC6=CC=CC=C65",
#     "CCCCCC1=CC(=C2[C@@H]3C=C(CC[C@H]3C(OC2=C1)(C)C)C)O",
#     "C[C@@H]([C@@H](CC1=CC=C(C=C1)Cl)C2=CC=CC(=C2)C#N)NC(=O)C(C)(C)OC3=NC=C(C=C3)C(F)(F)F",
#     "CC1=C(N(N=C1C(=O)NN2CCCCC2)C3=C(C=C(C=C3)Cl)Cl)C4=CC=C(C=C4)I",
#     "CCC1=C(N(N=C1C(=O)NN2CCCCC2)C3=C(C=C(C=C3)Cl)Cl)C4=CC=C(C=C4)Br",
#     "CC1=C(C2=C(N1CCN3CCOCC3)C=C(C=C2)I)C(=O)C4=CC=C(C=C4)OC",
#     "CC1=CC=C(C=C1)CN2C(=CC(=N2)C(=O)N[C@H]3[C@]4(CC[C@H](C4)C3(C)C)C)C5=CC(=C(C=C5)Cl)C",
#     "CC1=C(N(N=C1C(=O)NC23CC4CC(C2)CC(C4)C3)CCCCCO)C5=CC=CC=C5",
#     "CCCCCC1=CC(=C(C(=C1)O)[C@@H]2C=C(CC[C@H]2C(=C)C)C)O",
# ]

In [12]:
# seqs_prot = [
#     "MKSILDGLADTTFRTITTDLLYVGSNDIQYEDIKGDMASKLGYFPQKFPLTSFRGSPFQEKMTAGDNPQLVPADQVNITEFYNKSLSSFKENEENIQCGENFMDIECFMVLNPSQQLAIAVLSLTLGTFTVLENLLVLCVILHSRSLRCRPSYHFIGSLAVADLLGSVIFVYSFIDFHVFHRKDSRNVFLFKLGGVTASFTASVGSLFLTAIDRYISIHRPLAYKRIVTRPKAVVAFCLMWTIAIVIAVLPLLGWNCEKLQSVCSDIFPHIDETYLMFWIGVTSVLLLFIVYAYMYILWKAHSHAVRMIQRGTQKSIIIHTSEDGKVQVTRPDQARMDIRLAKTLVLILVVLIICWGPLLAIMVYDVFGKMNKLIKTVFAFCSMLCLLNSTVNPIIYALRSKDLRHAFRSMFPSCEGTAQPLDNSMGDSDCLHKHANNAASVHRAAESCIKSTVKIAKVTMSVSTDTSAEAL",
#     "MEECWVTEIANGSKDGLDSNPMKDYMILSGPQKTAVAVLCTLLGLLSALENVAVLYLILSSHQLRRKPSYLFIGSLAGADFLASVVFACSFVNFHVFHGVDSKAVFLLKIGSVTMTFTASVGSLLLTAIDRYLCLRYPPSYKALLTRGRALVTLGIMWVLSALVSYLPLMGWTCCPRPCSELFPLIPNDYLLSWLLFIAFLFSGIIYTYGHVLWKAHQHVASLSGHQDRQVPGMARMRLDVRLAKTLGLVLAVLLICWFPVLALMAHSLATTLSDQVKKAFAFCSMLCLINSMVNPVIYALRSGEIRSSAHHCLAHWKKCVRGLGSEAKEEAPRSSVTETEADGKITPWPDSRDLDLSDC",    
# ]

In [13]:
# df_inputs = pd.DataFrame(
#     [{'SMILES':x, 'TARGET':y} for y in seqs_prot for x in seqs_smi]
# )

In [14]:
# df_inputs

In [15]:
column_name = 'target_smiles'

data_path = 'data/bindingdb_test.csv'

df = pd.read_csv(data_path)

df = df[:1000]

smiles = df[column_name].tolist()

print(smiles)

target = df['target_sequence'].tolist()

print(target)

['N#CCC1CCN(Cc2cccc(-c3cc4c(c(S(=O)(=O)Nc5ccsc5C(=O)N[C@@H](CCCNC(=N)N)C(=O)O)c3)OCC4)c2)CC1', 'CCS(=O)(=O)CCC(=O)CNC(=O)N1Cc2ccccc2Oc2ccc(Cl)cc21', 'CN(C)C[C@@H]1CCn2cc(c3ccccc32)C2=C(C(=O)NC2=O)c2cn(c3ccccc23)CCO1', 'C[C@@H](Oc1cc(-n2cnc3ccc(CN4CCN(C)CC4)cc32)sc1C(N)=O)c1ccccc1C(F)(F)F', 'CN(C)CCCNC(=O)c1cc(NC(=O)CN2CCCCC2)cc(Nc2ccnc3cc(Cl)ccc23)c1', 'Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1', 'CC(C)NCC(O)COc1ccc(CCOCC2CCC2)cc1', 'CCN(C(=O)c1nc(C(=O)NCC(C)(C)O)sc1-c1ccc(S(=O)(=O)N[C@@H](C)C(F)(F)F)c(Cl)c1Cl)C1CC1', 'C=O.CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3cn4c(n3)sc3cc(OCCN5CCOCC5)ccc34)cc2)no1.O', 'O=C(N[C@@H](Cc1cccc2ccccc12)C(=O)Nc1ccncc1)C1CCCCC1', 'COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CNCCNC(C)=O', 'O=C(NOCC1CC1)c1ccc(F)c(F)c1Nc1ccc(I)cc1Cl', 'O=C(c1nc(NS(=O)(=O)c2cc(Br)cc(Cl)c2O)cn1C1CCCC1)N1CCC(C2CCCN2)CC1', 'C=O.Cc1[nH]c(/C=C2\\C(=O)Nc3ccc(S(=O)(=O)Cc4c(Cl)cccc4Cl)cc32)c(C)c1C(=O)N1CCC[C@@H]1CN1CCCC1.O', 'C.CC(C)C[C@H](NP(=O)([O-])O[C@@H]1O[C@@H](C)[C@H](O)[C@@H]

In [16]:
df_inputs = pd.DataFrame(
    [{f'{column_name}': x, 'TARGET': y} for x, y in zip(smiles, target)]
)

In [17]:
df_inputs

Unnamed: 0,target_smiles,TARGET
0,N#CCC1CCN(Cc2cccc(-c3cc4c(c(S(=O)(=O)Nc5ccsc5C...,MERGLPLLCAVLALVLAPAGAFRNDKCGDTIKIESPGYLTSPGYPH...
1,CCS(=O)(=O)CCC(=O)CNC(=O)N1Cc2ccccc2Oc2ccc(Cl)...,MSPCGPLNLSLAGEATTCAAPWVPNTSAVPPSGASPALPIFSMTLG...
2,CN(C)C[C@@H]1CCn2cc(c3ccccc32)C2=C(C(=O)NC2=O)...,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...
3,C[C@@H](Oc1cc(-n2cnc3ccc(CN4CCN(C)CC4)cc32)sc1...,MAAVILESIFLKRSQQKKKTSPLNFKKRLFLLTVHKLSYYEYDFER...
4,CN(C)CCCNC(=O)c1cc(NC(=O)CN2CCCCC2)cc(Nc2ccnc3...,MLLRSKPALPPPLMLLLLGPLGPLSPGALPRPAQAQDVVDLDFFTQ...
...,...,...
995,C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OCCCN...,MADDDVLFEDVYELCEVIGKGPFSVVRRCINRETGQQFAVKIVDVA...
996,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,MDFGSLETVVANSAFIAARGSFDGSSSQPSRDKKYLAKLKLPPLSK...
997,C=O.CCN(CC)c1nc2ccc(C(O)(c3ccc(C(F)(F)F)nc3)c3...,MAHHHHHHAGGAENLYFQGAMDSTPEAPYASLTEIEHLVQSVCKSY...
998,Nc1nc(Nc2ccc(S(N)(=O)=O)cc2)nn1C(=O)c1c(F)cccc1F,MIPLEKPGSGGSSPGATSGSGRAGRGLSGPCRPPPPPQARGLLTEI...


# 2. Binding Affinity Prediction

## Data Loader

In [18]:
def create_dataset(seqs_smi, seqs_prot):
    from deepdtagen.demo.demo_utils import TestbedDataset
    smile_graph = {}
    for smi in seqs_smi:
        if not smi in smile_graph:
            g = smile_to_graph(smi)
            smile_graph[smi] = g
        
    XD = np.asarray(seqs_smi)
    XT = np.asarray([seq_cat(aa) for aa in seqs_prot])
    
    name = 'tmp'
    data = TestbedDataset(
        root='data',
        dataset=name,
        xd=XD,
        xt=XT,
        smile_graph=smile_graph
    )
    return data

In [19]:
test_data = create_dataset(df_inputs['target_smiles'], df_inputs['TARGET'])

Preparing data in Pytorch Format: 1/1000
Preparing data in Pytorch Format: 2/1000
Preparing data in Pytorch Format: 3/1000
Preparing data in Pytorch Format: 4/1000
Preparing data in Pytorch Format: 5/1000
Preparing data in Pytorch Format: 6/1000
Preparing data in Pytorch Format: 7/1000
Preparing data in Pytorch Format: 8/1000
Preparing data in Pytorch Format: 9/1000
Preparing data in Pytorch Format: 10/1000
Preparing data in Pytorch Format: 11/1000
Preparing data in Pytorch Format: 12/1000
Preparing data in Pytorch Format: 13/1000
Preparing data in Pytorch Format: 14/1000
Preparing data in Pytorch Format: 15/1000
Preparing data in Pytorch Format: 16/1000
Preparing data in Pytorch Format: 17/1000
Preparing data in Pytorch Format: 18/1000
Preparing data in Pytorch Format: 19/1000
Preparing data in Pytorch Format: 20/1000
Preparing data in Pytorch Format: 21/1000
Preparing data in Pytorch Format: 22/1000
Preparing data in Pytorch Format: 23/1000
Preparing data in Pytorch Format: 24/1000
P

In [20]:
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=1,
    shuffle=False,
    collate_fn=collate
)

## Evaluate the model

In [21]:
predictions = []

with torch.no_grad():
    for data in tqdm(test_loader, desc='Testing'):
        y = model(data.to(device)).item()
        predictions.append(y)

Testing:   0%|          | 0/1000 [00:00<?, ?it/s]

In [22]:
df_res = df_inputs.copy()
df_res.loc[:,'AFFINITY'] = predictions

In [23]:
df_res

Unnamed: 0,target_smiles,TARGET,AFFINITY
0,N#CCC1CCN(Cc2cccc(-c3cc4c(c(S(=O)(=O)Nc5ccsc5C...,MERGLPLLCAVLALVLAPAGAFRNDKCGDTIKIESPGYLTSPGYPH...,5.737388
1,CCS(=O)(=O)CCC(=O)CNC(=O)N1Cc2ccccc2Oc2ccc(Cl)...,MSPCGPLNLSLAGEATTCAAPWVPNTSAVPPSGASPALPIFSMTLG...,5.752086
2,CN(C)C[C@@H]1CCn2cc(c3ccccc32)C2=C(C(=O)NC2=O)...,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...,5.013233
3,C[C@@H](Oc1cc(-n2cnc3ccc(CN4CCN(C)CC4)cc32)sc1...,MAAVILESIFLKRSQQKKKTSPLNFKKRLFLLTVHKLSYYEYDFER...,5.005891
4,CN(C)CCCNC(=O)c1cc(NC(=O)CN2CCCCC2)cc(Nc2ccnc3...,MLLRSKPALPPPLMLLLLGPLGPLSPGALPRPAQAQDVVDLDFFTQ...,3.339211
...,...,...,...
995,C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OCCCN...,MADDDVLFEDVYELCEVIGKGPFSVVRRCINRETGQQFAVKIVDVA...,5.004563
996,COC(=O)c1ccc2c(c1)NC(=O)/C2=C(\Nc1ccc(N(C)C(=O...,MDFGSLETVVANSAFIAARGSFDGSSSQPSRDKKYLAKLKLPPLSK...,5.646845
997,C=O.CCN(CC)c1nc2ccc(C(O)(c3ccc(C(F)(F)F)nc3)c3...,MAHHHHHHAGGAENLYFQGAMDSTPEAPYASLTEIEHLVQSVCKSY...,7.149166
998,Nc1nc(Nc2ccc(S(N)(=O)=O)cc2)nn1C(=O)c1c(F)cccc1F,MIPLEKPGSGGSSPGATSGSGRAGRGLSGPCRPPPPPQARGLLTEI...,5.436498


# 3. Target-aware Drug Generation

In [24]:
def create_dataset2(seqs_smi, seqs_prot, affinity):
    from deepdtagen.demo.demo_utils import TestbedDataset2
    smile_graph = {}
    for smi in seqs_smi:
        if not smi in smile_graph:
            g = smile_to_graph(smi)
            smile_graph[smi] = g
        
    XD = np.asarray(seqs_smi)
    XT = np.asarray([seq_cat(aa) for aa in seqs_prot])
    Y = np.asarray(affinity)
    
    name = 'tmp'
    data = TestbedDataset2(
        root='data',
        dataset=name,
        xd=XD,
        xt=XT,
        y=Y,
        smile_graph=smile_graph
    )
    return data

In [25]:
test_data = create_dataset2(df_res['target_smiles'], df_res['TARGET'], df_res['AFFINITY'])

Preparing data in Pytorch Format: 1/1000
Preparing data in Pytorch Format: 2/1000
Preparing data in Pytorch Format: 3/1000
Preparing data in Pytorch Format: 4/1000
Preparing data in Pytorch Format: 5/1000
Preparing data in Pytorch Format: 6/1000
Preparing data in Pytorch Format: 7/1000
Preparing data in Pytorch Format: 8/1000
Preparing data in Pytorch Format: 9/1000
Preparing data in Pytorch Format: 10/1000
Preparing data in Pytorch Format: 11/1000
Preparing data in Pytorch Format: 12/1000
Preparing data in Pytorch Format: 13/1000
Preparing data in Pytorch Format: 14/1000
Preparing data in Pytorch Format: 15/1000
Preparing data in Pytorch Format: 16/1000
Preparing data in Pytorch Format: 17/1000
Preparing data in Pytorch Format: 18/1000
Preparing data in Pytorch Format: 19/1000
Preparing data in Pytorch Format: 20/1000
Preparing data in Pytorch Format: 21/1000
Preparing data in Pytorch Format: 22/1000
Preparing data in Pytorch Format: 23/1000
Preparing data in Pytorch Format: 24/1000
P

In [26]:
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=1,
    shuffle=False,
    collate_fn=collate
)

## Evaluate the model

In [27]:
generated = []

with torch.no_grad():
    for data in tqdm(test_loader, desc='Testing'):
        y = tokenizer.get_text(model.generate(data.to(device)))
        generated.append(y[0])

Testing:   0%|          | 0/1000 [00:00<?, ?it/s]

In [28]:
generated

['Nc1ncnc2c1nc(NCc1ccc(Cl)c(S(=O)(=O)NC(=O)N3CCC(C3)CC3)c1)n2Cc1cccc(CN2CCCCC2)c1',
 'CC(C)C[C@@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)OC(C)(C)C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C(=O)O)C(C)C)C(C)C',
 'CO[C@@H]1[C@H](N(C)C(=O)c2ccccc2)C[C@H]2O[C@]1(C)n1c3ccccc3c3c4c(c5c6ccccc6n2c5c31)C(=O)N[C@H]4O',
 'CO[C@@H]1[C@H](N(C)CCOCCOCCOCCNC(=O)CCC2=[N+]3C(=Cc4c(C)cc(C)n4[B-]3(F)F)C=C2)C[C@H]2O[C@]1(C)n1c3ccccc3c3c4c(c5c6ccccc6n2c5c31)C(=O)NC4',
 'CC1CN(C(=O)Cn2cc(CN3CCN(C)CC3)c3ccc(NC(=S)NCCc4c[nH]c5ccccc45)cc32)CCO1',
 'CO[C@@H]1[C@H](N(C)C(=O)c2ccccc2)C[C@H]2O[C@]1(C)n1c3ccccc3c3c4c(c5c6ccccc6n2c5c31)C(=O)N[C@H]4O',
 'COc1ccc(CCNCC(O)COc2ccc(CCOCC3CCC3)cc2)cc1OC',
 'CC(C)C[C@H](NC(=O)c1ccc(-c2sc(C(=O)NCC(C)(C)O)nc2COC(C)(C)C)c(Cl)c1Cl)C(F)(F)F',
 'COc1cc(N(C)CCCO)ccc1Nc1ncc2c(n1)N(C1CCCC1)CCC(=O)N2C',
 'O=C(NCCCOP(=O)(O)O[C@H]1[C@@H](O)[C@@H](O)[C@H](OP(=O)(O)O)[C@@H](OP(=O)(O)O)[C@H]1O)C(=O)NCCc1ccccc1',
 'COc1cc(C(=O)O)ccc1NC(=O)C(CC(=O)O)C(=O)OC2CCCCC2)c1',
 'CO[C@@H]1[C@H](N(C)

In [None]:
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

from deepdtagen.generation_eveluation import evaluate_smiles

In [None]:
reference_df_path = 'data/bindingdb_train.csv'
reference_df = pd.read_csv(reference_df_path)
reference_list = reference_df['target_smiles'].tolist()

results_df = evaluate_smiles(generated, reference_list)

print(results_df)

{'validity_ratio': 0.837, 'uniqueness_ratio': 0.44086021505376344, 'novelty_ratio': 0.7181571815718157}
