In [1]:
import torch
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader
from deepdtagen.demo.demo_utils import *
from deepdtagen.demo.model_aff import DeepDTAGen
from deepdtagen.utils import *

# 1. Environments

In [2]:
# Dataset identifier; the checkpoint and tokenizer file names built in the
# "Paths" section both embed this string.
dataset_name = 'bindingdb'

## Setup device

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# To force CPU execution, use torch.device('cpu') instead of the line above.

## Paths

In [4]:
# Import `os` explicitly rather than relying on the name leaking in through
# the wildcard imports at the top of the notebook.
import os

# Checkpoint and tokenizer locations, relative to the working directory.
filepath_model = os.path.join('models', f'deepdtagen_model_{dataset_name}.pth')
filepath_tokenizer = os.path.join('data', f'{dataset_name}_tokenizer.pkl')

## Load Tokenizer

In [5]:
import sys
import deepdtagen.utils as dutils
# Register deepdtagen.utils under the top-level module name 'utils'.
# NOTE(review): presumably the tokenizer was pickled while its class lived in
# a module importable as `utils`, so pickle resolves the class under that
# name — confirm. This alias must be in place before pickle.load runs.
sys.modules['utils'] = dutils

# SECURITY: pickle.load can execute arbitrary code stored in the file; only
# load tokenizer files that come from a trusted source.
with open(filepath_tokenizer, 'rb') as f:
    tokenizer = pickle.load(f)

In [6]:
# Length of the loaded tokenizer (NOTE(review): presumably its vocabulary size — confirm).
print(len(tokenizer))

107


## Load Model

In [7]:
# Build the network; the constructor receives the tokenizer loaded above.
model = DeepDTAGen(tokenizer)

In [8]:
# Restore the saved weights, remapping stored tensors onto the selected device.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints;
# consider weights_only=True if the installed PyTorch version supports it.
model.load_state_dict(torch.load(filepath_model, map_location=device))

<All keys matched successfully>

In [9]:
# Move the model to the selected device; assigning the result to `_` keeps
# the cell from echoing the module repr as its output.
_ = model.to(device)

In [10]:
# Switch the model to evaluation mode before running inference; `_` again
# suppresses the echoed module repr.
_ = model.eval()

## Test Data

In [11]:
# SMILES column of the CSV to score. The notebook also mentions
# 'canonical_SMILES' as an alternative (not verified here).
column_name = 'isomeric_SMILES'
data_path = 'data/anticancer_IC50.csv'

# Read the table once, then pull the ligand and protein columns out as lists.
df = pd.read_csv(data_path)
smiles, target = (df[col].to_list() for col in (column_name, 'FASTA'))

# Echo both lists in full: ligand strings first, then protein sequences.
print(smiles)
print(target)

['CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC1', 'CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OC1CCOC

In [12]:
# Assemble the model inputs directly from the loaded table instead of zipping
# Python lists row by row: keep the SMILES, protein and measured-value
# columns, expose the protein column as 'TARGET', and use a fresh 0..n-1
# index. Column order matches the previous construction, and the columns are
# preserved even when the table has no rows.
df_inputs = (
    df[[column_name, 'FASTA', 'Standard Value']]
    .rename(columns={'FASTA': 'TARGET'})
    .reset_index(drop=True)
)

In [13]:
# Display the assembled inputs as the cell's rich output.
df_inputs

Unnamed: 0,isomeric_SMILES,TARGET,Standard Value
0,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,6.0
1,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,2.0
2,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,11.0
3,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,15.0
4,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,14.0
...,...,...,...
525,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,38.5
526,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,1230.0
527,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,80.0
528,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,39.0


# 2. Binding Affinity Prediction

## Data Loader

In [14]:
def create_dataset(seqs_smi, seqs_prot):
    """Wrap ligand SMILES and protein sequences in a ``TestbedDataset``.

    Args:
        seqs_smi: Iterable of SMILES strings. Each distinct string is
            converted to a graph once via ``smile_to_graph``.
        seqs_prot: Iterable of protein sequences, each encoded with
            ``seq_cat``. Presumably aligned one-to-one with ``seqs_smi``
            (the pairing itself happens inside ``TestbedDataset``) — confirm.

    Returns:
        The ``TestbedDataset`` created with ``root='data'`` and
        ``dataset='tmp'``.

    NOTE(review): if ``TestbedDataset`` caches processed data on disk keyed
    by ``root``/``dataset``, repeated calls with different inputs could reuse
    stale data — confirm against its implementation.
    """
    from deepdtagen.demo.demo_utils import TestbedDataset

    # Build one graph per distinct SMILES string, in first-seen order.
    graph_by_smiles = {
        smiles: smile_to_graph(smiles) for smiles in dict.fromkeys(seqs_smi)
    }

    ligands = np.asarray(seqs_smi)
    encoded_proteins = np.asarray([seq_cat(sequence) for sequence in seqs_prot])

    return TestbedDataset(
        root='data',
        dataset='tmp',
        xd=ligands,
        xt=encoded_proteins,
        smile_graph=graph_by_smiles,
    )

In [15]:
# Build the dataset from the selected SMILES column and the protein sequences.
test_data = create_dataset(df_inputs[column_name], df_inputs['TARGET'])

Preparing data in Pytorch Format: 1/530
Preparing data in Pytorch Format: 2/530
Preparing data in Pytorch Format: 3/530
Preparing data in Pytorch Format: 4/530
Preparing data in Pytorch Format: 5/530
Preparing data in Pytorch Format: 6/530
Preparing data in Pytorch Format: 7/530
Preparing data in Pytorch Format: 8/530
Preparing data in Pytorch Format: 9/530
Preparing data in Pytorch Format: 10/530
Preparing data in Pytorch Format: 11/530
Preparing data in Pytorch Format: 12/530
Preparing data in Pytorch Format: 13/530
Preparing data in Pytorch Format: 14/530
Preparing data in Pytorch Format: 15/530
Preparing data in Pytorch Format: 16/530
Preparing data in Pytorch Format: 17/530
Preparing data in Pytorch Format: 18/530
Preparing data in Pytorch Format: 19/530
Preparing data in Pytorch Format: 20/530
Preparing data in Pytorch Format: 21/530
Preparing data in Pytorch Format: 22/530
Preparing data in Pytorch Format: 23/530
Preparing data in Pytorch Format: 24/530
Preparing data in Pytorch

In [16]:
# One sample per batch, in dataset order: the evaluation loop calls .item()
# on each model output (which needs a single-element tensor), and the
# predictions are later attached to the input rows by position.
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=1,
    shuffle=False,
    # NOTE(review): `collate` is not defined in this notebook; presumably it
    # comes from one of the wildcard imports — confirm.
    collate_fn=collate
)

## Evaluate the model

In [17]:
# Score every batch with gradient tracking disabled; .item() extracts the
# scalar prediction from each single-sample batch.
with torch.no_grad():
    predictions = [
        model(batch.to(device)).item()
        for batch in tqdm(test_loader, desc='Testing')
    ]

Testing:   0%|          | 0/530 [00:00<?, ?it/s]

In [18]:
# Attach the predictions to a copy of the inputs under
# '<column_name>_AFFINITY'; df_inputs itself is left untouched.
df_res = df_inputs.assign(**{f'{column_name}_AFFINITY': predictions})

In [19]:
# Display the inputs together with the predicted affinities.
df_res

Unnamed: 0,isomeric_SMILES,TARGET,Standard Value,isomeric_SMILES_AFFINITY
0,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,6.0,9.412930
1,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,2.0,8.460589
2,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,11.0,9.412930
3,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,15.0,9.412930
4,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,14.0,8.460589
...,...,...,...,...
525,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,38.5,9.385417
526,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,1230.0,6.372738
527,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,80.0,9.385417
528,CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,39.0,9.385417


In [20]:
# Spot-check the first raw 'Standard Value' entry of the loaded table.
print(df['Standard Value'][0])

6.0


In [21]:
def denormalize(affinity):
    """Convert measured concentrations to the negative-log scale ``9 - log10(x)``.

    The result is stored by this notebook as 'pIC50'. The constant 9
    corresponds to inputs expressed in nanomolar (pX = -log10 of the molar
    concentration). NOTE(review): confirm that 'Standard Value' is in nM.

    Args:
        affinity: Scalar, array-like, or pandas Series/DataFrame of
            concentrations.

    Returns:
        ``9 - log10(affinity)`` with the same container type as the plain
        NumPy expression would give (a pandas object keeps its index).
        Zero, negative, and missing inputs yield NaN: the logarithm is
        undefined there, and NaN avoids propagating ``inf`` values and
        runtime warnings into later statistics.
    """
    if isinstance(affinity, (pd.Series, pd.DataFrame)):
        # .where() blanks out non-positive entries while keeping the labels.
        return 9 - np.log10(affinity.where(affinity > 0))

    values = np.asarray(affinity, dtype=float)
    return 9 - np.log10(np.where(values > 0, values, np.nan))

# Put the measured values on the same negative-log scale as a 'pIC50'
# column, then print the whole table as plain text.
df_res = df_res.assign(pIC50=denormalize(df_res['Standard Value']))
print(df_res)

                                       isomeric_SMILES  \
0    CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...   
1    CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...   
2    CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...   
3    CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...   
4    CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...   
..                                                 ...   
525  CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...   
526  CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...   
527  CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...   
528  CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...   
529  CCOc1cc2ncc(C#N)c(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)...   

                                                TARGET  Standard Value  \
0    MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...             6.0   
1    MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...             2.0   
2    MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...            11.0   
3    MR

In [22]:
# Write the result table to CSV without the index column.
# NOTE(review): the output file name hard-codes "isomeric" although the SMILES
# column is chosen via `column_name`; keep the two in sync when switching columns.
df_res.to_csv('data/anticancer_IC50_isomeric_pred.csv', index=False)