In [1]:
import os
# Move the working directory up one level; presumably so that the `DTI` package
# and ./DTBA_data_folder resolve from the repository root — TODO confirm.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir('../')

In [2]:
import pandas as pd
import numpy as np

import DTI.models as models
from DTI.utils import data_process, convert_y_unit, generate_config

In [3]:
# Load the Kd dataset and draw a small toy subset so the demo runs quickly.
SEED = 42  # fixed so the toy subset is identical on every Restart & Run All
df_Kd = pd.read_csv('./DTBA_data_folder/Kd/data.csv')
df_Kd = df_Kd.sample(frac=0.002, replace=False, random_state=SEED)  # toy dataset

X_drug = df_Kd.SMILES.values
X_target = df_Kd['Target Sequence'].values
# Convert Kd from nM to the p scale (log space) to help regression.
y = convert_y_unit(df_Kd.Kd.values, 'nM', 'p')

drug_encoding = 'ECFP4'
target_encoding = 'AAC'
# Encode drugs/targets and split into train/val/test
# (frac is presumably the train/val/test proportions — confirm).
# NOTE(review): the 'random' split may use its own RNG; whether data_process
# accepts a seed is not visible here — confirm to make the split reproducible.
train, val, test = data_process(X_drug, X_target, y,
                                drug_encoding, target_encoding,
                                split_method='random', frac=[0.7, 0.1, 0.2])

in total: 133 drug-target pairs
encoding drug...
unique drugs: 87
drug encoding finished...
encoding protein...
unique target sequence: 113
-- Encoding AAC takes time. Time Reference: 24s for ~100 sequences in a CPU. Calculate your time by the unique target sequence #, instead of the entire dataset.
protein encoding finished...
splitting dataset...
Done.


In [4]:
# Model setup: build the config for the chosen encodings, then initialise the
# model from it. Model parameters can be adjusted when generating the config,
# e.g. cls_hidden_dim = [256, 32].
config = generate_config(
    drug_encoding,
    target_encoding,
)
model = models.model_initialize(
    drug_encoding,
    target_encoding,
    **config,
)

In [5]:
# Train the model; the log below reports a training loss and validation metrics
# (MSE, Pearson correlation, concordance index) for each epoch.
# The test split is passed as well — presumably for a final evaluation; confirm.
model.train(train, val, test)

--- Data Preparation ---
--- Go for Training ---
Training at Epoch 1 iteration 0 with loss 49.618973
Validation at Epoch 1 , MSE: 13.385996113479642 , Pearson Correlation: -0.18776432138026336 with p-value: 0.5390282621855491 , Concordance Index: 0.4722222222222222
Training at Epoch 2 iteration 0 with loss 1.9656172
Validation at Epoch 2 , MSE: 12.268444797253498 , Pearson Correlation: -0.35187780208393715 with p-value: 0.2383721211216671 , Concordance Index: 0.5138888888888888
Training at Epoch 3 iteration 0 with loss 0.000382868
Validation at Epoch 3 , MSE: 9.817124640453986 , Pearson Correlation: -0.1032013980740257 with p-value: 0.7372460543155873 , Concordance Index: 0.5555555555555556
Training at Epoch 4 iteration 0 with loss 0.7580768
Validation at Epoch 4 , MSE: 10.609886467775194 , Pearson Correlation: -0.15560934723718006 with p-value: 0.6117088807588327 , Concordance Index: 0.4861111111111111
Training at Epoch 5 iteration 0 with loss 0.019163823
Validation at Epoch 5 , MSE: 

In [None]:
# NOTE(review): this cell has no execution count (it was never run in the saved
# session) and predict() is called without arguments — confirm the intended
# inputs, or remove the cell, before relying on Restart & Run All.
model.predict()

In [10]:
# Target sequence of the first row of the test split.
# NOTE(review): `target` is not referenced by any later cell visible here.
target = test['Target Sequence'].iloc[0]

In [11]:
# SMILES strings of every row in the test split, as an array.
X_repurpose = test['SMILES'].values

In [13]:
# Encoded target of the first test row, and the encoded drugs of all test rows.
t = test['target_encoding'].iloc[0]
x = test['drug_encoding'].values

In [18]:
# Repeat the target encoding once per drug (assuming `t` is a 1-D vector, this
# yields one row per entry of `x`).
# NOTE: this overwrites `t`, so re-running the cell tiles the already-tiled array.
n_drugs = len(x)
t = np.tile(t, (n_drugs, 1))

In [21]:
import torch

In [30]:
# Score every test drug against the single (tiled) target.
# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use `float` directly (same float64 cast).
drug_features = torch.Tensor(np.vstack(x).astype(float))
target_features = torch.Tensor(t)
# Left as the last expression so the notebook displays the predictions.
models.repurpose((drug_features, target_features), model)

predicting...


tensor([[6.7370],
        [6.8289],
        [6.6317],
        [7.0390],
        [6.4034],
        [6.7295],
        [6.8686],
        [5.8693],
        [6.8866],
        [6.6668],
        [6.4435],
        [6.7394],
        [6.5740],
        [6.7912],
        [6.5648],
        [6.7815],
        [6.8936],
        [6.8324],
        [6.1336],
        [5.9698],
        [6.0565],
        [6.8936],
        [7.1831],
        [6.8331],
        [6.7370],
        [6.4034],
        [7.1325]], grad_fn=<AddmmBackward>)