In [1]:
import os
# Make the parent directory the working directory; presumably so the `DTI`
# package imported in the next cell resolves from there -- TODO confirm.
# NOTE: this is a relative move, so the cell is not idempotent: running it a
# second time in the same kernel climbs one more level. Restart the kernel
# before re-running.
os.chdir('../')

In [2]:
import pandas as pd
import numpy as np

import DTI.models as models
from DTI.utils import data_process, convert_y_unit, generate_config

In [4]:
# Load the Kd binding data and keep a small random subset as a toy dataset.
df_Kd = pd.read_csv('../DTBA_data_folder/Kd/data.csv')
# random_state pins the subset so that Restart & Run All reproduces the same
# rows (and therefore the same downstream splits and metrics).
df_Kd = df_Kd.sample(frac = 0.002, replace = False, random_state = 42)# toy dataset

X_drug = df_Kd.SMILES.values
X_target = df_Kd['Target Sequence'].values
# Binary classification labels: 1 when Kd < 30, else 0 (a missing Kd compares
# False and is labelled 0). Kd is presumably in nM, going by the conversion
# call below -- TODO confirm. Kept as a plain list of Python ints.
y = (df_Kd.Kd.values < 30).astype(int).tolist()
# Regression alternative: convert nM to p (log space) instead of thresholding.
#y = convert_y_unit(df_Kd.Kd.values, 'nM', 'p')

# Encoding names are reused by generate_config in the model-setup cell.
drug_encoding = 'MPNN'
target_encoding = 'AAC'
# Encode drugs/targets and randomly split into 70% train / 10% val / 20% test.
train, val, test = data_process(X_drug, X_target, y,
                                drug_encoding, target_encoding,
                                split_method='random',frac=[0.7,0.1,0.2])

in total: 133 drug-target pairs
encoding drug...
unique drugs: 87
drug encoding finished...
encoding protein...
unique target sequence: 110
-- Encoding AAC takes time. Time Reference: 24s for ~100 sequences in a CPU. Calculate your time by the unique target sequence #, instead of the entire dataset.
protein encoding finished...
splitting dataset...
Done.


In [5]:
# Model setup: build a config for the drug/target encodings chosen above, with
# training limited to 3 epochs, then initialize the model from that config.
# Further model parameters can be supplied to generate_config the same way,
# e.g. cls_hidden_dim = [256, 32]
config = generate_config(drug_encoding, target_encoding, train_epoch = 3)
model = models.model_initialize(**config)

In [6]:
# Train on the splits from data_process. As the log below shows, validation
# metrics (AUROC / AUPRC / F1) are reported per epoch and test metrics once
# at the end.
model.train(train, val, test)

--- Data Preparation ---
--- Go for Training ---
Training at Epoch 1 iteration 0 with loss 0.66001576


  'precision', 'predicted', average, warn_for)


Validation at Epoch 1 , AUROC: 0.36363636363636365 , AUPRC: 0.17424242424242425 , F1: 0.0
Training at Epoch 2 iteration 0 with loss 0.46462926
Validation at Epoch 2 , AUROC: 0.6363636363636364 , AUPRC: 0.6 , F1: 0.6666666666666666
Training at Epoch 3 iteration 0 with loss 0.04125151
Validation at Epoch 3 , AUROC: 0.6363636363636364 , AUPRC: 0.6 , F1: 0.6666666666666666
--- Go for Testing ---
Testing AUROC: 0.43636363636363634 , AUPRC: 0.24648577966655769 , F1: 0.0
--- Training Finished ---


In [7]:
# Draw 20 rows from the toy dataset for the repurposing demo: every sampled
# drug is paired with the target of the first sampled row.
# random_state pins the draw so the results shown below are reproducible.
# NOTE(review): this rebinds `test`, replacing the test split returned by
# data_process. Re-running model.train(train, val, test) after this cell would
# pass this DataFrame instead. The name is kept because a later cell reads
# `test` again.
test = df_Kd.sample(n = 20, replace=False, random_state = 42)
target = test['Target Sequence'].iloc[0]
X_repurpose = test.SMILES.values
# Cast to int before str, presumably to drop a float ".0" suffix from the IDs.
# NOTE(review): astype(int) raises if a sampled PubChem_ID is missing.
drug_name = test.PubChem_ID.astype(int).astype(str).values
target_name = test.UniProt_ID.iloc[0]

In [8]:
# Score every sampled drug against the single target chosen above; the
# per-drug predictions are printed below and the return value is kept in `r`.
r = models.repurpose(X_repurpose, target, model, drug_name, target_name)

repurposing...
in total: 20 drug-target pairs
encoding drug...
unique drugs: 18
drug encoding finished...
encoding protein...
unique target sequence: 1
-- Encoding AAC takes time. Time Reference: 24s for ~100 sequences in a CPU. Calculate your time by the unique target sequence #, instead of the entire dataset.
protein encoding finished...
splitting dataset...
Done.
predicting...
---------------
Drug Repurposing Result for P36896
Drug 216239     predicted to NOT have interaction with the target
Drug 44355753   predicted to NOT have interaction with the target
Drug 44243183   predicted to NOT have interaction with the target
Drug 24180719   predicted to NOT have interaction with the target
Drug 216239     predicted to NOT have interaction with the target
Drug 10184653   predicted to NOT have interaction with the target
Drug 176155     predicted to NOT have interaction with the target
Drug 132157535  predicted to NOT have interaction with the target
Drug 5494449    predicted to NOT have 

In [9]:
# Switch from one shared target to one target per sampled row for virtual
# screening.
# NOTE: `target` and `target_name` are rebound here from single values to
# arrays, so re-running the repurposing cell after this one would hand it
# arrays instead of one target.
target = test['Target Sequence'].values
# astype(str) turns a missing UniProt_ID into the literal string "nan", which
# is what shows up as a target name in the screening output below.
target_name = test.UniProt_ID.astype(str).values

In [10]:
# Score the sampled drugs against the per-row targets set up above; results
# are printed below. This overwrites the repurposing result held in `r`.
r = models.virtual_screening(X_repurpose, target, model, drug_name, target_name)

virtual screening...
in total: 20 drug-target pairs
encoding drug...
unique drugs: 18
drug encoding finished...
encoding protein...
unique target sequence: 19
-- Encoding AAC takes time. Time Reference: 24s for ~100 sequences in a CPU. Calculate your time by the unique target sequence #, instead of the entire dataset.
protein encoding finished...
splitting dataset...
Done.
predicting...
---------------
Virtual Screening Result
Drug 216239     predicted to NOT have interaction with the target P36896 
Drug 44355753   predicted to NOT have interaction with the target P00374 
Drug 44243183   predicted to NOT have interaction with the target nan    
Drug 24180719   predicted to NOT have interaction with the target P61075 
Drug 216239     predicted to NOT have interaction with the target P00533 
Drug 10184653   predicted to NOT have interaction with the target O95747 
Drug 176155     predicted to NOT have interaction with the target Q13131 
Drug 132157535  predicted to have interaction with 