In [1]:
import os

# Enter the project directory only when we are not already inside it.
# An unconditional os.chdir('./DDE') makes this cell non-idempotent: running
# it a second time in the same kernel tries to descend into ./DDE/DDE and
# raises FileNotFoundError.
if os.path.basename(os.getcwd()) != 'DDE':
    os.chdir('./DDE')

import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from torch import nn 
import copy

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.model_selection import KFold
# Fix the RNG seeds (torch: 2, numpy: 3) so that runs are repeatable.
torch.manual_seed(2)
np.random.seed(3)

# Project-local modules.  They are imported after the os.chdir('./DDE') at the
# top of this cell -- presumably because they live in that directory (TODO confirm).
from dde_config import dde_NN_config
from dde_torch import dde_NN, simple_autoencoder_NN, simple_NN, dde_NN_Large_Predictor
from stream_dde import supData, unsupData

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [2]:
from subword_nmt.apply_bpe import BPE
import codecs

# Folder holding the BPE merge table and the sub-word vocabulary map.
dataFolder = './data'

# Build the BPE tokenizer.  The codes file is opened in a `with` block so the
# handle is closed once the tokenizer exists instead of staying open for the
# lifetime of the kernel (subword-nmt's BPE reads the merge table inside its
# constructor).  `bpe_codes_fin` stays bound afterwards, as before, but closed.
vocab_path = dataFolder + '/codes.txt'
with codecs.open(vocab_path) as bpe_codes_fin:
    bpe = BPE(bpe_codes_fin, merges=-1, separator='')

# Vocabulary table: the position of a sub-word unit in the 'index' column is
# used as its integer id.
vocab_map = pd.read_csv(dataFolder + '/subword_units_map.csv')
idx2word = vocab_map['index'].values
words2idx = {word: position for position, word in enumerate(idx2word)}

In [3]:
def smiles2index(s1, s2):
    """Tokenise two SMILES strings with BPE and map the tokens to vocabulary ids.

    Each string is segmented with the module-level `bpe` tokenizer, split on
    whitespace, and every resulting sub-word unit is looked up in `words2idx`.

    Parameters
    ----------
    s1, s2 : str
        SMILES strings to encode.

    Returns
    -------
    tuple of (list, list)
        The vocabulary ids of the tokens of `s1` and of `s2`, in token order.

    Raises
    ------
    KeyError
        If a token produced by the tokenizer is not a key of `words2idx`.
    """
    # Segment both strings first, then translate tokens to ids.
    pieces_a, pieces_b = [bpe.process_line(smiles).split() for smiles in (s1, s2)]
    ids_a = [words2idx[piece] for piece in pieces_a]
    ids_b = [words2idx[piece] for piece in pieces_b]
    return ids_a, ids_b

def index2multi_hot(i1, i2):
    """Build one multi-hot vector marking every vocabulary id in `i1` or `i2`.

    Parameters
    ----------
    i1, i2 : sequence of int
        Vocabulary ids (positions into `idx2word`).

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(idx2word)`` holding 1 at each position
        that appears in either id list and 0 everywhere else.
    """
    vocab_size = len(idx2word)
    multi_hot = np.zeros(vocab_size)
    # Marking both id lists in the same vector yields their element-wise union.
    for token_ids in (i1, i2):
        multi_hot[token_ids] = 1
    return multi_hot

def smiles2vector(s1, s2):
    """Encode a pair of SMILES strings as a single multi-hot vocabulary vector.

    Composes `smiles2index` (tokenise and look up ids) with
    `index2multi_hot` (mark the ids of both strings in one vector).

    Parameters
    ----------
    s1, s2 : str
        SMILES strings of the two drugs.

    Returns
    -------
    The value returned by `index2multi_hot` for the two id lists.
    """
    return index2multi_hot(*smiles2index(s1, s2))

In [4]:
# NOTE: this rebinds `dataFolder`, which an earlier cell set to './data' for
# the BPE vocabulary files; from here on it points at the dataset folder.
dataFolder = '../../../../../scratch/kh2383/DFI_data'
# Table of drug pairs read from sup_train_val.csv.
df_ddi = pd.read_csv(dataFolder + '/data/SNAP/sup_train_val.csv')

In [5]:
# Preview the first two rows of the drug-pair table.
df_ddi.head(2)

Unnamed: 0,Drug1_ID,Drug1_SMILES,Drug2_ID,Drug2_SMILES,label
0,DB00706,CCOc1ccccc1OCCN[C@H](C)Cc1ccc(OC)c(S(N)(=O)=O)c1,DB01023,CCOC(=O)C1=C(C)NC(C)=C(C(=O)OC)C1c1cccc(Cl)c1Cl,1.0
1,DB01068,O=C1CN=C(c2ccccc2Cl)c2cc([N+](=O)[O-])ccc2N1,DB00928,Nc1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c(=O)n1,0.0


In [6]:
def get_codes(s1, s2, model):
    """Run `model` on the multi-hot encoding of a SMILES pair and return its code.

    Parameters
    ----------
    s1, s2 : str
        SMILES strings; they are encoded together with `smiles2vector`.
    model : callable
        Called with a float tensor holding a single-row batch.  It must
        return a 5-tuple; only the second (`code`) and third (`s`) elements
        are used here.

    Returns
    -------
    tuple
        ``(code, s)`` where `code` has been squeezed, detached, moved to the
        CPU and converted to a NumPy array, and `s` is passed through exactly
        as the model returned it.
    """
    # Add a leading batch axis: the model is called on a batch of one row.
    v_D = np.expand_dims(smiles2vector(s1, s2), axis=0)
    inputs = torch.tensor(v_D).float()

    # Place the input on the same device as the model's weights.  Callers move
    # the model to the GPU before calling this function, and a CPU tensor fed
    # to CUDA weights raises a device-mismatch error.
    params = model.parameters() if hasattr(model, 'parameters') else iter(())
    first_param = next(iter(params), None)
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    # Pure inference: do not record an autograd graph.
    with torch.no_grad():
        _, code, s, _, _ = model(inputs)
    return code.squeeze().detach().cpu().numpy(), s

In [7]:
# SMILES string of the first row whose Drug1_ID is DB01020
# (raises IndexError when no row matches).
s1 = df_ddi.loc[df_ddi['Drug1_ID'] == 'DB01020', 'Drug1_SMILES'].values[0]

In [8]:
# Display the SMILES string selected for DB01020.
s1

'O=[N+]([O-])O[C@@H]1CO[C@@H]2[C@@H](O)CO[C@H]12'

In [9]:
# SMILES string of the first row whose Drug1_ID is DB00203
# (raises IndexError when no row matches).
s2 = df_ddi.loc[df_ddi['Drug1_ID'] == 'DB00203', 'Drug1_SMILES'].values[0]

In [10]:
# Display the SMILES string selected for DB00203.
s2

'CCCc1nn(C)c2c(=O)nc(-c3cc(S(=O)(=O)N4CCN(C)CC4)ccc3OCC)[nH]c12'

In [17]:
# Accumulator for one array of code magnitudes per model checkpoint.
# NOTE: despite its name this is a list, not a dict; the checkpoint loop
# appends to it.
dict_codes = []

In [22]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every seed's codes to `dict_codes`.
dict_codes = []

# Common prefix of the per-seed checkpoint files; the seed and '.pt' are appended.
checkpoint_prefix = ('../../../../../scratch/kh2383/DFI_checkpoint/'
                     'model_train_checkpoint_deepDDI_small_Run2_explainability_seed')

for i in ['5', '10', '15', '20', '25']:
    # NOTE(security): torch.load unpickles a complete model object, which can
    # execute arbitrary code -- only load checkpoints from a trusted source.
    # map_location lets checkpoints that were saved on a GPU load on a
    # CPU-only machine as well.
    model = torch.load(checkpoint_prefix + i + '.pt', map_location=device)
    if isinstance(model, torch.nn.DataParallel):
        # Unwrap the underlying module from its DataParallel container.
        model = model.module
    # Use the device chosen in the setup cell instead of an unconditional
    # .cuda(), which raises when no GPU is present.
    model.to(device)
    model.eval()
    code, score = get_codes(s1, s2, model)
    # Keep only the non-zero code entries, as magnitudes scaled by 100.
    code1 = np.abs(code[np.abs(code)>0])*100
    dict_codes.append(code1)

In [29]:
# Inspect the collected arrays: one array of scaled, non-zero code magnitudes per seed.
dict_codes

[array([ 0.63542426,  0.44687057,  6.33255529,  7.1794467 ,  4.8289566 ,
         7.77798128,  2.47209358,  7.77451897,  7.52848101,  3.0971632 ,
         6.39500332,  5.20142174,  5.87682152,  4.52361298,  1.83678734,
         6.79152107,  2.82770014,  7.7742219 ,  1.89271498,  7.77455568,
         0.4949483 ,  6.2375164 ,  0.934048  ,  0.67924112,  7.77444696], dtype=float32),
 array([ 0.01780428,  0.1331259 ,  3.89042711,  7.06324005,  3.9596386 ,
         6.61894989,  1.32479537,  6.9335289 ,  6.64494944,  6.79534388,
         4.23204994,  4.39468861,  6.81447411,  2.31668234,  4.62658739,
         4.81248665,  5.48766088,  1.57640505,  6.49967289,  7.0633688 ,
         0.71887422,  4.12155199,  0.1638066 ,  0.17555533,  7.06337404], dtype=float32),
 array([ 0.29192531,  0.48657787,  4.27260447,  6.35948324,  4.15854836,
         6.19628334,  4.72457743,  6.28347683,  5.82954979,  6.35947514,
         6.359622  ,  4.14088583,  6.11369514,  3.15054655,  6.35948849,
         5.286348

In [27]:
# Shape check: after the transpose, rows are code entries and columns are seeds.
# NOTE: stacking into a 2-D array relies on every seed yielding the same
# number of non-zero code entries.
np.asarray(dict_codes).T.shape

(25, 5)

In [30]:
# Seed-by-seed correlation of the code magnitudes: each column of the frame is
# one seed, and DataFrame.corr() correlates the columns pairwise.
correlation_matrix = pd.DataFrame(np.transpose(np.asarray(dict_codes))).corr()

In [33]:
# Average of the column means of the seed-by-seed correlation matrix.
# NOTE(review): this includes the diagonal (each seed's correlation with
# itself, i.e. 1.0), so it overstates the average agreement between
# *different* seeds -- confirm that this is intended.
correlation_matrix.mean().mean()

0.7673216786083199

In [None]:
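# The cells below appear to come from the first experiment (note their earlier execution counts); they rely on kernel state from that run and may not reproduce on a fresh Restart & Run All.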

In [16]:
# Positions of the non-zero entries of `code`.
idx = np.argwhere(np.abs(code)>0).flatten()
# NOTE(review): `more` is not defined in any cell visible in this notebook, so
# this line fails on a fresh kernel.  It is presumably a per-entry score
# aligned with `idx` -- TODO restore its definition.
high_idx = idx[more>0.08]
# Sub-word units at the selected positions.
idx2word[high_idx]

array(['N', 'CCC', 'c12', 'c2c(=O)', 'O=[N+]([O-])'], dtype=object)

In [17]:
# Sub-word units at every non-zero code position.
idx2word[idx]

array([')', '4', 'n', 'N', '2', '(', 'c3', 'CCC', 'CO', 'OCC', '[C@@H](O)',
       '[C@H]1', '[nH]', '(C)CC', 'c1n', '[C@@H]2', 'CCN', 'S(=O)(=O)',
       'O[C@@H]1', 'c12', 'c(-c3cc', 'c2c(=O)', '4)cc', 'n(C)',
       'O=[N+]([O-])'], dtype=object)

In [18]:
# BPE segmentation of the first SMILES string, for comparison with the units listed above.
bpe.process_line(s1).split()

['O=[N+]([O-])', 'O[C@@H]1', 'CO', '[C@@H]2', '[C@@H](O)', 'CO', '[C@H]1', '2']

In [19]:
# BPE segmentation of the second SMILES string.
bpe.process_line(s2).split()

['CCC',
 'c1n',
 'n(C)',
 'c2c(=O)',
 'n',
 'c(-c3cc',
 '(',
 'S(=O)(=O)',
 'N',
 '4',
 'CCN',
 '(C)CC',
 '4)cc',
 'c3',
 'OCC',
 ')',
 '[nH]',
 'c12']