In [1]:


import pandas as pd
import numpy as np

import math, wandb, os, random
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from copy import deepcopy

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import networkx as nx

random.seed(42)

###
# Loading in the DDI data preprocessed
###

base_path = "/n/data1/hms/dbmi/zitnik/lab/users/yeh803/DDI/processed_data/simple_baseline_data2"

train_edgelist = np.load(os.path.join(base_path, "train_edgelist.npy"))
test_edgelist = np.load(os.path.join(base_path, "test_edgelist.npy"))
val_edgelist = np.load(os.path.join(base_path, "val_edgelist.npy"))
adjmat = pd.read_csv(os.path.join(base_path, "all_adjlist.csv"), index_col = 0)
ddilookup = pd.read_csv(os.path.join(base_path, "ddilookup.csv"), index_col = 0)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
A = adjmat.values
G = nx.from_numpy_matrix(A)
edgelist = list(G.edges())

B = np.array(A, copy=True) 

where_0 = np.where(A == 0)
where_1 = np.where(A == 1)
B[where_0] = 1
B[where_1] = 0

H = nx.from_numpy_matrix(B)
negative_edgelist = list(H.edges())

print(len(edgelist))
print(len(negative_edgelist))

random.shuffle(negative_edgelist)
neg_arr = np.array(negative_edgelist)

# np.save("data/neg_edgelist.npy",neg_arr)

1116662
4553366


In [3]:
####
# Concatenate the two 
####

train = pd.DataFrame(train_edgelist, columns = ["x", "y"])
train["class"] = 1

neg_train = pd.DataFrame(neg_arr[:len(train_edgelist)], columns = ["x", "y"])
neg_train["class"] = 0


test = pd.DataFrame(test_edgelist, columns = ["x", "y"])
test["class"] = 1

neg_test = pd.DataFrame(neg_arr[len(train_edgelist):len(train_edgelist) + len(test_edgelist)], columns = ["x", "y"])
neg_test["class"] = 0


val = pd.DataFrame(val_edgelist, columns = ["x", "y"])
val["class"] = 1

neg_test = pd.DataFrame(neg_arr[len(train_edgelist) + len(test_edgelist) : len(train_edgelist) + len(test_edgelist) + len(val_edgelist)], columns = ["x", "y"])
neg_test["class"] = 0




train = train.append(neg_train)
test = test.append(neg_test)
val = val.append(neg_test)

train_ddi = train.sample(frac=1)
test_ddi = test.sample(frac=1)
val_ddi = test.sample(frac=1)


  train = train.append(neg_train)
  test = test.append(neg_test)
  val = val.append(neg_test)


# Tanimoto Fingerprints

In [None]:
####
# Grab the list of train drugbank IDs
# NOTE: This cell about two hours to run
######

deepddi_path = "/n/data1/hms/dbmi/zitnik/lab/users/yeh803/DDI/processed_data/simple_baseline_data2/deepDDI"

from tqdm import tqdm

all_ids = np.unique(ddilookup["drug_index"].values)
train_indices = np.unique(train_edgelist)


### Randomly grabbing 2000 drugs to use as basis drugs
shuffled_indices = np.array(train_indices, copy = True)
np.random.shuffle(shuffled_indices)
basis_indices = shuffled_indices[:2000]
### saving the basis drugs in case we need them later
b = pd.DataFrame.from_dict(basis_indices)
b = b.rename(columns = {0 : "ids"})
b.to_csv(os.path.join(deepddi_path, "pca_basis_drugs.csv"))


drug_similarity_info = {}
curr_count = 0


for drug1 in tqdm(all_ids):
    
    drug_similarity_info[drug1] = {}
    
    drug1_smile = ddilookup[ddilookup["drug_index"] == drug1]["canonical_smiles"].iloc[0]
    drug1_mol = Chem.MolFromSmiles(drug1_smile)
    drug1_mol = AllChem.AddHs(drug1_mol)
    drug1_fps  = AllChem.GetMorganFingerprint(drug1_mol, 2)

    for drug2 in basis_indices:
        
        ### calculate the score
        drug2_smile = ddilookup[ddilookup["drug_index"] == drug2]["canonical_smiles"].iloc[0]
        drug2_mol = Chem.MolFromSmiles(drug2_smile)
        drug2_mol = AllChem.AddHs(drug2_mol)
        drug2_fps  = AllChem.GetMorganFingerprint(drug2_mol, 2)
        score = DataStructs.DiceSimilarity(drug1_fps, drug2_fps)
        drug_similarity_info[drug1][drug2] = score
        
df = pd.DataFrame.from_dict(drug_similarity_info)

df.to_csv(os.path.join(deepddi_path, "lookup_table_of_tanimoto.csv"))



# PCA Fitting

In [26]:

### Loading in the PCA stuffs
deepddi_path = "/n/data1/hms/dbmi/zitnik/lab/users/yeh803/DDI/processed_data/simple_baseline_data2/deepDDI"
# path = "./"

PCA_basis_drugs = pd.read_csv(os.path.join(deepddi_path , "pca_basis_drugs.csv"), index_col = 0)
tanimoto_lookup = pd.read_csv(os.path.join(deepddi_path , "lookup_table_of_tanimoto.csv"), index_col = 0)
tanimoto_lookup.columns = tanimoto_lookup.columns.astype("int")

basis_drugs = list(PCA_basis_drugs["ids"])
PCA_fitting = tanimoto_lookup[basis_drugs]


In [27]:
#####
# Fitting the PCA
#####

df = PCA_fitting

### scaling the data for PCA
scaler = StandardScaler()
scaler.fit(df)
scaled_df = scaler.transform(df)

### PCA with 50 components, the same tha they use
pca = PCA(n_components=50)

### fitting the data
pca.fit(scaled_df)

### transforming it 
pca_data = pca.transform(scaled_df)

### saving it as a dataframe
new_df = pd.DataFrame(pca_data, columns=['PC_%d' % (i + 1) for i in range(50)], index=df.index)
new_df

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_41,PC_42,PC_43,PC_44,PC_45,PC_46,PC_47,PC_48,PC_49,PC_50
864,20.935486,31.321493,-4.187045,13.401265,-7.122553,-9.857449,-4.642497,-2.147628,1.893738,-1.746549,...,-0.846768,0.847800,-1.932744,0.349962,-0.453062,0.330167,-1.533284,0.382949,1.252051,0.168315
1035,-16.340037,30.233404,7.329532,10.077706,2.613874,4.758463,3.195721,-6.759980,-0.548303,2.813353,...,-0.393699,-0.055412,0.351112,-0.072190,0.331588,-0.292915,1.171392,-1.288857,-0.421101,-0.443532
662,-16.626961,-24.524393,3.375227,-2.930885,7.479861,2.002405,0.810368,-12.808328,4.420617,-7.582959,...,-0.706086,-0.956522,0.503363,-0.262623,0.510248,0.039964,1.510965,1.192450,0.359553,-0.217761
857,-27.933068,6.404170,-5.020575,3.699839,-4.782935,-5.713406,3.866216,7.758823,1.408755,1.348758,...,1.231884,0.595525,-0.604112,-0.703552,-0.185904,-0.909883,-1.424813,-0.040638,0.686921,0.326869
1431,-19.488833,12.288547,0.607225,22.716345,-0.413641,-1.602109,1.740820,0.895958,-0.659758,2.338504,...,1.549046,0.851516,-0.843385,-0.331186,-0.477552,0.351794,0.324745,-0.031860,-0.283472,0.432719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422,-15.139021,-15.051173,-1.801231,-13.897803,-8.985216,-4.706546,-5.043665,6.097899,1.639828,1.839888,...,0.596371,0.939603,-0.391443,-0.553282,0.699476,-0.773151,-0.631905,-0.442818,-0.157880,-0.008072
260,-4.176896,0.413636,15.577662,10.028561,-2.637791,-8.949966,-10.593999,7.126921,0.920537,-8.040209,...,-0.440092,0.711536,0.042527,-0.443661,-1.002596,-0.145597,0.635377,-1.602745,-0.368993,0.162929
3005,-24.507184,11.855357,3.690505,2.429155,-1.167852,0.726975,0.918055,-5.604161,0.168690,-2.012064,...,-0.603782,-0.742120,-0.155960,0.435592,0.216668,0.580163,-0.830875,-0.019922,1.246964,0.304890
1927,28.373522,25.793734,-15.911296,-9.664032,-6.832196,11.368734,-0.869092,5.419099,5.567093,-4.097584,...,-0.422467,-0.239702,0.344883,0.416874,0.156811,1.022144,0.496162,0.203430,0.747427,-0.356605


In [31]:
####
# Run the PCA on every compound in the lookup
####
df = tanimoto_lookup.T

### scaling the data for PCA
#### https://bitbucket.org/kaistsystemsbiology/deepddi/src/master/deepddi/preprocessing.py 
scaler = StandardScaler()
scaler.fit(df)
scaled_df = scaler.transform(df)

### transforming it 
pca_data = pca.transform(scaled_df)
### saving it as a dataframe
pca_lookup = pd.DataFrame(pca_data, columns=['PC_%d' % (i + 1) for i in range(50)], index=df.index)
pca_lookup = pca_lookup.T

In [18]:
###
# Not Necessary anymore because I reindexed it
###

# a = pca_lookup.T.reset_index().rename(columns = {"index" : "dbids"})

# b = pd.merge(a,ddilookup[["drug_index", "drugbank_id"]], how = "left", left_on = "dbids", right_on = "drugbank_id")

# df = b[b["drug_index"].notnull()]
# df = df.astype({'drug_index': 'int32'})
# df = df.drop_duplicates("drug_index")
# df = df.set_index("drug_index")
# df = df.drop(columns=['dbids', 'drugbank_id'])
# reindexed_pca_lookup = df.T

## Generating the actual Training data

In [34]:
######
# Looping through every single interaction in the train_ddi
# tanimoto_lookup["DB13508"] ### how to look up a dbid
######
from tqdm import tqdm

#### TRAINING
training_set = []
for i in tqdm(range(len(train_ddi))):

    sel = train_ddi.iloc[i]
    left = sel["x"]
    right = sel["y"]
    label = sel["class"]

    a = np.array(pca_lookup[left])
    b = np.array(pca_lookup[right])
    x = np.concatenate((a, b, [label])) ### NOTE THE LAST VARIABLE IS THE CLASS
    training_set.append(x)
    
train_data = np.array(training_set)
# np.save("train_data.npy", train_data)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2679988/2679988 [04:11<00:00, 10651.65it/s]


In [36]:
#### test
test_data = []
for i in tqdm(range(len(test_ddi))):

    sel = test_ddi.iloc[i]
    left = sel["x"]
    right = sel["y"]
    label = sel["class"]

    a = np.array(pca_lookup[left])
    b = np.array(pca_lookup[right])
    x = np.concatenate((a, b, [label])) ### NOTE THE LAST VARIABLE IS THE CLASS
    test_data.append(x)
    
test_data = np.array(test_data)
# np.save("test_data.npy", test_data)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 893330/893330 [01:24<00:00, 10519.62it/s]


In [37]:
#### val 
val_data = []
for i in tqdm(range(len(val_ddi))):

    sel = test_ddi.iloc[i]
    left = sel["x"]
    right = sel["y"]
    label = sel["class"]

    a = np.array(pca_lookup[left])
    b = np.array(pca_lookup[right])
    x = np.concatenate((a, b, [label])) ### NOTE THE LAST VARIABLE IS THE CLASS
    val_data.append(x)
    
val_data = np.array(val_data)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 893330/893330 [01:23<00:00, 10675.32it/s]


In [39]:
path = "/n/data1/hms/dbmi/zitnik/lab/users/yeh803/DDI/processed_data/simple_baseline_data2/deepDDI"

np.save(os.path.join(path, "train_data.npy"), train_data)
np.save(os.path.join(path, "test_data.npy"), test_data)
np.save(os.path.join(path, "val_data.npy"), val_data)


In [75]:
# ####
# # In order to generate negative samples within train, I need to 
# # reindex the train_edgelist with a new index based on the things that appear.
# ####

# train = pd.DataFrame(train_edgelist, columns = ["x", "y"])
# ids = set(list(train.x) + list(train.y))
# train_ids_lookup = pd.DataFrame(ids, columns=["all_ids"]).reset_index().rename(columns = {"index": "train_index"})

# a = pd.merge(train, train_ids_lookup, how="left", left_on="x", right_on="all_ids")
# a = a.rename(columns = {"train_index": "x_train_index"})
# a = pd.merge(a, train_ids_lookup, how="left", left_on="y", right_on="all_ids")
# a = a.rename(columns = {"train_index": "y_train_index"})

# ### reindexed train edgelist
# reindexed_train_edgelist = a[['x_train_index', 'y_train_index']]

# df = pd.crosstab(reindexed_train_edgelist.x_train_index, reindexed_train_edgelist.y_train_index)
# idx = df.columns.union(df.index)
# df2 = df.reindex(index = idx, columns=idx, fill_value=0)

# df2