In [7]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [8]:
root_path = "../data/processed_step1/"
save_path = "../data/processed_step2/"

## PPI

In [9]:
path = root_path + "/ppi_preprocessed.csv"
ppi_df = pd.read_csv(path)
ppi_df = ppi_df.drop(["Unnamed: 0"], axis=1)
print(ppi_df.shape)
ppi_df.head()

(327924, 2)


Unnamed: 0,gene1,gene2
0,9796,56992
1,7918,9240
2,8233,23548
3,4899,11253
4,5297,8601


In [19]:
protein_map = {}
protein_count = 0

pp_list = []

row = list()
col = list()

for idx in ppi_df.index:
    
    g1 = ppi_df["gene1"][idx]
    g2 = ppi_df["gene2"][idx]
    
    if g1 not in protein_map:
        protein_map[g1] = protein_count
        protein_count += 1
    if g2 not in protein_map:
        protein_map[g2] = protein_count
        protein_count += 1

    g1, g2 = protein_map[g1], protein_map[g2]
    row.append(g1)
    col.append(g2)
    
    pp_list.append((g1, g2))

temp_ppi_counts = []
temp_ppi_counts.extend(list(ppi_df['gene1']))
temp_ppi_counts.extend(list(ppi_df['gene2']))

assert(len(set(temp_ppi_counts)) == protein_count)

adj = sp.coo_matrix((np.ones(len(row)), (row, col)), shape=(protein_count, protein_count))
sym_adj = adj + adj.T
sym_adj = sym_adj > 0
# sym_adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
sp.save_npz(save_path + "ppi-sparse-adj.npz", sym_adj)

assert(np.count_nonzero(adj.toarray())==len(ppi_df))

In [27]:
print(len(protein_map))

18505


## Drug - Protein

In [22]:
path = root_path + "/drug-gene.csv"
drug_gene_df = pd.read_csv(path)
drug_gene_df = drug_gene_df.drop(["Unnamed: 0"], axis=1)
print(drug_gene_df.shape)
drug_gene_df.head()

(24649, 2)


Unnamed: 0,ID,entrez_id
0,DB00001,2147.0
1,DB00002,1956.0
2,DB00002,2215.0
3,DB00002,715.0
4,DB00002,712.0


In [23]:
all_drug = set(drug_gene_df["ID"])
print(len(all_drug))
drug_withDB = set(filter(lambda x: x.startswith("DB"), list(drug_gene_df["ID"])))
print(len(drug_withDB))
drug_withoutDB = all_drug - drug_withDB
print(len(drug_withoutDB))

# removing drug without DB IDs
drug_gene_df = drug_gene_df[drug_gene_df["ID"].isin(drug_withDB)]
# removing proteins not in PPI
drug_gene_df = drug_gene_df[drug_gene_df["entrez_id"].isin(protein_map)]
drug_gene_df['entrez_id'] = drug_gene_df['entrez_id'].astype(int)
print(drug_gene_df.shape)
drug_gene_df.head()

6253
6194
59
(22230, 2)


Unnamed: 0,ID,entrez_id
0,DB00001,2147
1,DB00002,1956
2,DB00002,2215
3,DB00002,715
4,DB00002,712


In [25]:
drug_map = {}
drug_count = 0

row = list()
col = list()

for idx in drug_gene_df.index:
    
    drug = drug_gene_df["ID"][idx]
    gene = drug_gene_df["entrez_id"][idx]
    
    # create drug map
    if drug not in drug_map:
        drug_map[drug] = drug_count
        drug_count += 1
    
    drug, gene = drug_map[drug], protein_map[gene]
    row.append(drug)
    col.append(gene)
    
adj = sp.coo_matrix((np.ones(len(row)), (row, col)), shape=(drug_count, protein_count))
sp.save_npz(save_path + "drug-protein-sparse-adj.npz", adj)

In [24]:
drug_list = list(set(list(drug_gene_df['ID'])))
print(len(drug_list))

6158


In [26]:
print(adj.toarray().shape)

(6158, 18505)


## Disease Protein

In [33]:
path = root_path + "/Guney16_preprocessed.csv"
disease_prot_df = pd.read_csv(path)
disease_prot_df = disease_prot_df.drop(["Unnamed: 0"], axis=1)
print(disease_prot_df.shape)
disease_prot_df.head()

(1448, 1023)


Unnamed: 0,MESHID,1,2,3,4,5,6,7,8,9,...,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022
0,D000012,4547,,,,,,,,,...,,,,,,,,,,
1,D000013,374654,5820.0,9289.0,5824.0,4000.0,5828.0,3854.0,3852.0,3853.0,...,,,,,,,,,,
2,D000015,374654,1302.0,8626.0,2657.0,5190.0,2138.0,5192.0,5077.0,5824.0,...,,,,,,,,,,
3,D000052,2261,3643.0,,,,,,,,...,,,,,,,,,,
4,D000130,2261,1280.0,,,,,,,,...,,,,,,,,,,


In [34]:
prot_list = []
for i in disease_prot_df.index:
    for j in range(1,1023):
        value = disease_prot_df[str(j)][i]
        if(pd.isna(value) == False):
            value = int(value)
            if(value not in protein_map):
                disease_prot_df[str(j)][i] = np.nan
            else:
                disease_prot_df[str(j)][i] = int(value)
                prot_list.append(int(value))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_prot_df[str(j)][i] = int(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_prot_df[str(j)][i] = int(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_prot_df[str(j)][i] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_prot_df[str(j)][i] = int(value)
A value is t

In [39]:
prot_set_pdis = set(prot_list)
print(len(prot_set_pdis))
print(list(prot_set_pdis)[:5])
disease_prot_df.head()

3119
[8193, 8195, 8200, 10, 8204]


Unnamed: 0,MESHID,1,2,3,4,5,6,7,8,9,...,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022
0,D000012,4547.0,,,,,,,,,...,,,,,,,,,,
1,D000013,374654.0,,9289.0,5824.0,4000.0,5828.0,3854.0,3852.0,3853.0,...,,,,,,,,,,
2,D000015,374654.0,1302.0,8626.0,2657.0,5190.0,2138.0,5192.0,5077.0,5824.0,...,,,,,,,,,,
3,D000052,2261.0,3643.0,,,,,,,,...,,,,,,,,,,
4,D000130,2261.0,1280.0,,,,,,,,...,,,,,,,,,,


In [40]:
disease_map = {}
disease_count = 0

row = list()
col = list()
X=0

for idx in disease_prot_df.index:
   
    disease = disease_prot_df["MESHID"][idx]
    
    # create drug map
    if disease not in disease_map:
        disease_map[disease] = disease_count
        disease_count += 1

    for j in range(1,1023):
        prot = disease_prot_df[str(j)][idx]
        if(pd.isna(prot) == False):
            disease_id = disease_map[disease]
            prot = protein_map[prot]
            row.append(disease_id)
            col.append(prot)
    X += 1

adj = sp.coo_matrix((np.ones(len(row)), (row, col)), shape=(disease_count, protein_count))
sp.save_npz(save_path + "disease-prot-sparse-adj.npz", adj)

In [41]:
adj.toarray().shape

(1448, 18505)

## Drug-Disease

In [42]:
path = root_path + "/original_disease_drug.tsv"
disease_drug_df = pd.read_csv(path, sep="\t")
print(disease_drug_df.shape)
disease_drug_df.head()

(466657, 2)


Unnamed: 0,# Disease(MESH),Chemical
0,D005923,DB00564
1,D009503,DB01072
2,D016115,DB01759
3,D018476,DB00451
4,C567059,DB00641


In [43]:
# removing drugs not in Drug Set
disease_drug_df = disease_drug_df[disease_drug_df["Chemical"].isin(drug_map)]
# removing disease not in Disease Set
disease_drug_df = disease_drug_df[disease_drug_df["# Disease(MESH)"].isin(disease_map)]
print(disease_drug_df.shape)
disease_drug_df.head()

(195811, 2)


Unnamed: 0,# Disease(MESH),Chemical
0,D005923,DB00564
1,D009503,DB01072
8,D001249,DB00814
12,D001943,DB00300
16,D001281,DB01394


In [44]:
drug_set_dd = set(disease_drug_df["Chemical"])
print(len(drug_set_dd))
disease_set_dd = set(disease_drug_df["# Disease(MESH)"])
print(len(disease_set_dd))

1416
1271


In [45]:
row = list()
col = list()
for idx in disease_drug_df.index:
    
    disease = disease_drug_df["# Disease(MESH)"][idx]
    drug = disease_drug_df["Chemical"][idx]
    disease, drug = disease_map[disease], drug_map[drug]
    row.append(disease)
    col.append(drug)
    
adj = sp.coo_matrix((np.ones(len(row)), (row, col)), 
                    shape=(disease_count, drug_count))
sp.save_npz(save_path + "disease-drug-sparse-adj.npz", adj)

In [46]:
adj.toarray().shape

(1448, 6158)

## Save dictionaries

In [47]:
def save_to_pkl(path, obj):
    with open(path, 'wb') as ff:
        pickle.dump(obj, ff)

In [49]:
save_to_pkl(save_path+"map_drug.pkl", drug_map)
save_to_pkl(save_path+"map_protein.pkl", protein_map)
save_to_pkl(save_path+"map_disease.pkl", disease_map)
save_to_pkl(save_path+"map_counts.pkl", (protein_count, drug_count, disease_count))

In [52]:
## Verify

file = save_path+"map_disease.pkl"
with open(file, 'rb') as f:   # the whole dataset
    map = pickle.load(f)
display(disease_prot_df[disease_prot_df["MESHID"] == "D059327"])
print(map["D059327"])
assert(disease_prot_df[disease_prot_df["MESHID"] == "D059327"].index == map["D059327"])

Unnamed: 0,MESHID,1,2,3,4,5,6,7,8,9,...,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022
1441,D059327,4920.0,3239.0,8200.0,,,,,,,...,,,,,,,,,,


1441


In [69]:
disease_map["D000086382"]

1446

## Form final dictionary

In [70]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.sparse as tsp

import warnings
warnings.filterwarnings("ignore")

In [71]:
path = "../data/processed_step2/"

In [72]:
with open(path + 'map_counts.pkl', 'rb') as f:
        protein_num, drug_num, disease_num = pickle.load(f)

In [73]:
pp_adj = sp.load_npz(path + "ppi-sparse-adj.npz")
print(pp_adj.shape)

dp_adj = sp.load_npz(path + "drug-protein-sparse-adj.npz").tocsr()
dp_adj[np.where(dp_adj.toarray()>1)]=1       
print(dp_adj.shape)

dd_adj = sp.load_npz(path + "disease-drug-sparse-adj.npz").tocsr()
dd_adj[np.where(dd_adj.toarray()>1)]=1
print(dd_adj.shape)

disp_adj = sp.load_npz(path + "disease-prot-sparse-adj.npz").tocsr()
disp_adj[np.where(disp_adj.toarray()>1)]=1
print(disp_adj.shape)

(18505, 18505)
(6158, 18505)
(1448, 6158)
(1448, 18505)


#### Feature Matrices

In [76]:
ind = torch.LongTensor([np.array(range(protein_num), dtype=np.longlong), np.array(range(protein_num), dtype=np.longlong)])
val = torch.FloatTensor([1] * protein_num)
protein_feat = torch.sparse.FloatTensor(ind, val,torch.Size([protein_num, protein_num]))

ind = torch.LongTensor([np.array(range(drug_num), dtype=np.longlong), np.array(range(drug_num), dtype=np.longlong)])
val = torch.FloatTensor([1] * drug_num)
drug_feat = torch.sparse.FloatTensor(ind, val,torch.Size([drug_num, drug_num]))

ind = torch.LongTensor([np.array(range(disease_num), dtype=np.longlong), np.array(range(disease_num), dtype=np.longlong)])
val = torch.FloatTensor([1] * disease_num)
disease_feat = torch.sparse.FloatTensor(ind, val,torch.Size([disease_num, disease_num]))

In [77]:
data = {'n_drug': drug_num,
        'n_prot': protein_num,
        'n_dis': disease_num,
        'd_feat': drug_feat,
        'p_feat': protein_feat,
        'dis_feat': disease_feat,
        'dp_adj': dp_adj.tocoo(),
        'pp_adj': pp_adj.tocoo(),
        'dd_adj': dd_adj.tocoo(),
        'disp_adj': disp_adj.tocoo()}

In [78]:
file = '../data/final_data_dict.pkl'
with open(file, 'wb') as f:   # the whole dataset
    pickle.dump(data, f)

## Embeddings

In [54]:
embed_dis = pd.read_csv("../data/raw/Embed_dis.csv")
print(embed_dis.shape)
embed_dis.head()

(2541, 33)


Unnamed: 0,C0206180,-0.14473225,0.13399248,-0.00448086,-0.00764772,-0.07541482,0.06394061,-0.02279412,0.16622804,0.07047904,...,-0.08408829,-0.06948166,-0.03048278,-0.02493184,0.02463660,-0.11912217,-0.10896748,0.09094054,0.09812895,0.00522631
0,C0153685,-0.209077,0.395467,-0.081615,0.057308,-0.302729,-0.103724,0.175678,0.025079,-0.15348,...,0.23522,0.187109,0.058264,-0.077734,0.123228,0.021371,-0.175696,0.032547,0.026433,-0.075189
1,C0153687,-0.153783,0.302808,-0.024087,0.086008,-0.291659,0.002242,0.070519,-0.012165,-0.21869,...,0.188705,0.138539,0.109696,-0.125395,0.052532,0.076731,-0.093987,0.021757,-0.006971,-0.140305
2,kidney diseases,-0.018979,-0.134309,-0.026794,-0.046509,0.047284,-0.201912,0.059666,-0.099623,-0.028077,...,0.114408,-0.023066,-0.087959,-0.013896,0.187902,0.069262,-0.016006,0.071573,0.00214,0.005891
3,C0043515,-0.193414,0.123853,-0.033909,0.072345,-0.131433,-0.187623,0.031269,0.019136,-0.012869,...,0.054915,0.026992,0.023262,-0.096078,0.227686,-0.039217,-0.115887,-0.005295,-0.038975,-0.06305
4,C0684249,-0.156089,0.213359,0.030418,0.046285,-0.215987,-0.049301,0.118703,0.034011,-0.146477,...,0.153771,0.070236,0.027617,-0.068266,0.056771,-0.011301,-0.106526,-0.029846,0.034271,-0.015952


In [56]:
original_col_names = embed_dis.columns.tolist()
embed_dis.columns = ['Disease'] + list(range(len(embed_dis.columns) - 1))
new_row = pd.DataFrame([original_col_names], columns=embed_dis.columns)
embed_dis = pd.concat([new_row, embed_dis], ignore_index=True)
display(embed_dis.head())
disease_names = embed_dis['Disease'].unique()
print(disease_names[:5])

Unnamed: 0,Disease,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,C0206180,-0.14473225,0.13399248,-0.00448086,-0.00764772,-0.07541482,0.06394061,-0.02279412,0.16622804,0.07047904,...,-0.08408829,-0.06948166,-0.03048278,-0.02493184,0.0246366,-0.11912217,-0.10896748,0.09094054,0.09812895,0.00522631
1,C0153685,-0.209077,0.395467,-0.081615,0.057308,-0.302729,-0.103724,0.175678,0.025079,-0.15348,...,0.23522,0.187109,0.058264,-0.077734,0.123228,0.021371,-0.175696,0.032547,0.026433,-0.075189
2,C0153687,-0.153783,0.302808,-0.024087,0.086008,-0.291659,0.002242,0.070519,-0.012165,-0.21869,...,0.188705,0.138539,0.109696,-0.125395,0.052532,0.076731,-0.093987,0.021757,-0.006971,-0.140305
3,kidney diseases,-0.018979,-0.134309,-0.026794,-0.046509,0.047284,-0.201912,0.059666,-0.099623,-0.028077,...,0.114408,-0.023066,-0.087959,-0.013896,0.187902,0.069262,-0.016006,0.071573,0.00214,0.005891
4,C0043515,-0.193414,0.123853,-0.033909,0.072345,-0.131433,-0.187623,0.031269,0.019136,-0.012869,...,0.054915,0.026992,0.023262,-0.096078,0.227686,-0.039217,-0.115887,-0.005295,-0.038975,-0.06305


['C0206180' 'C0153685' 'C0153687' 'kidney diseases' 'C0043515']


In [55]:
embed_drug = pd.read_csv("../data/raw/Embed_drug.csv")
print(embed_drug.shape)
embed_drug.head()

(1606, 33)


Unnamed: 0,DB05109,-0.19608374,0.23931627,0.05019253,0.05698078,-0.21290046,-0.06830791,0.06562623,-0.04144978,-0.04692309,...,-0.01899907,-0.05770514,-0.00549219,-0.15199748,0.26887849,0.01872207,-0.21672669,0.05964709,-0.00648303,-0.14651969
0,DB01319,0.067627,-0.032969,0.005502,-0.084378,0.052334,-0.089966,-0.072125,-0.198961,-0.065112,...,0.056712,0.026111,0.039578,-0.089997,0.094072,0.07822,-0.007905,0.06275,-0.027364,-0.069908
1,DB01178,-0.185777,-0.156804,-0.128788,0.131473,0.103026,0.027707,-0.209216,0.101132,0.186314,...,-0.293431,-0.111966,-0.060836,-0.100436,0.048699,-0.183541,-0.016802,0.073022,-0.162378,-0.171262
2,DB01179,-0.239227,0.031083,-0.146822,0.308153,-0.160663,0.160768,0.064983,0.530715,0.075338,...,-0.115142,-0.237423,-0.295252,0.165809,0.019919,-0.128804,-0.203718,0.080864,0.004756,-0.10242
3,DB01176,-0.101185,0.053003,-0.043511,0.106435,0.03463,0.070799,-0.195901,0.030923,0.056826,...,-0.18161,0.003004,0.13651,-0.159798,0.016011,-0.042975,-0.031174,0.012761,-0.081461,-0.156632
4,DB01177,-0.094911,0.078449,-0.016063,0.016922,-0.093217,0.009215,0.044167,0.061957,0.029097,...,-0.000527,-0.094811,-0.079612,-0.025263,0.125167,0.006202,-0.154478,0.120364,0.044464,-0.061873
