In [82]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy.sparse as sp

In [83]:
# IPython shell escape: list the contents of ../data to see which input files exist.
!ls ../data

bio-decagon-ppi.csv	       Guney2016.tsv
decagon_et.pkl		       mesh_disease.csv
disease-drug_preprocessed.csv  original_disease_drug.tsv
disease-prot_preprocessed.csv  PPI.csv
drug-gene.csv		       ppi_preprocessed.csv
drug-gene_preprocessed.csv     processed
Guney16_preprocessed.csv       SARSprotein-Humanprotein.csv


In [84]:
# Directory the input CSV files are read from. The load cells append file names
# that start with "/", so the resulting paths contain a double slash
# (e.g. "../data//ppi_preprocessed.csv").
root_path = "../data/"
# Directory the sparse matrices and pickled index maps are written to. Output
# file names are concatenated directly, so the trailing slash is required.
save_path = "../data/processed/"

### PPI

In [85]:
# Load the preprocessed protein-protein interaction edge list and discard the
# index column that was written into the CSV ("Unnamed: 0").
path = root_path + "/ppi_preprocessed.csv"
ppi_df = pd.read_csv(path).drop(columns=["Unnamed: 0"])

# Report the table size, then preview the first rows as the cell output.
print(ppi_df.shape)
ppi_df.head()

(327924, 2)


Unnamed: 0,gene1,gene2
0,9796,56992
1,7918,9240
2,8233,23548
3,4899,11253
4,5297,8601


In [86]:
# State filled by the PPI loop in the next cell:
#   protein_map   - raw gene id -> consecutive protein index
#   protein_count - number of indices handed out so far
#   pp_list       - (index, index) pair for every PPI edge
#   row / col     - COO coordinates of the adjacency matrix being assembled
protein_map, protein_count = dict(), 0

pp_list = []

row, col = [], []

In [87]:
# Build the protein index and the PPI edge list, then save the symmetric
# adjacency matrix.
#
# The edge containers are re-created here so that re-running this cell on its
# own cannot append every edge a second time. ``protein_map`` and
# ``protein_count`` are only ever extended with unseen ids, so keeping them
# across re-runs is safe.
pp_list = []
row = []
col = []

# Walk both columns in lockstep instead of doing a ``ppi_df[col][idx]`` label
# lookup for every cell. ``to_numpy()`` yields NumPy integer scalars, the same
# key type the per-row lookup produced.
for g1, g2 in zip(ppi_df["gene1"].to_numpy(), ppi_df["gene2"].to_numpy()):

    # Give every gene id seen for the first time the next free index.
    if g1 not in protein_map:
        protein_map[g1] = protein_count
        protein_count += 1
    if g2 not in protein_map:
        protein_map[g2] = protein_count
        protein_count += 1

    g1, g2 = protein_map[g1], protein_map[g2]
    row.append(g1)
    col.append(g2)

    pp_list.append((g1, g2))

# One entry of weight 1 per listed edge, in the direction given by the table.
adj = sp.coo_matrix((np.ones(len(row)), (row, col)), shape=(protein_count, protein_count))
# Symmetrise: wherever the transpose holds the larger value, take the
# transpose's value, otherwise keep adj's (element-wise max of adj and adj.T).
sym_adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
sp.save_npz(save_path + "ppi-sparse-adj.npz", sym_adj)

In [88]:
# Count the distinct non-zero cells without densifying: a dense copy of this
# protein x protein matrix (18505 x 18505 float64) would need roughly 2.7 GB.
# Converting to CSR merges repeated coordinates first, so the result matches
# np.count_nonzero(adj.toarray()).
adj.tocsr().count_nonzero()

327924

In [89]:
# The sparse matrix carries its own shape; no dense copy is needed to read it.
adj.shape

(18505, 18505)

---

### Drug - Protein

In [90]:
# Load the preprocessed drug -> target-gene table; the "Unnamed: 0" column is
# the index that was saved into the CSV and is dropped again here.
path = root_path + "/drug-gene_preprocessed.csv"
drug_gene_df = pd.read_csv(path).drop(columns=["Unnamed: 0"])

# Print the table size and show the first rows as the cell output.
print(drug_gene_df.shape)
drug_gene_df.head()

(22230, 2)


Unnamed: 0,ID,entrez_id
0,DB00001,2147.0
1,DB00002,1956.0
2,DB00002,2215.0
3,DB00002,715.0
4,DB00002,712.0


In [91]:
# State filled by the drug-protein loop in the next cell:
#   drug_map   - drug id -> consecutive drug index
#   drug_count - number of drug indices handed out so far
#   row / col  - COO coordinates (drug index, protein index)
drug_map, drug_count = dict(), 0

row, col = [], []

In [92]:
# Build the drug index and the drug x protein adjacency matrix, then save it.
#
# The coordinate lists are re-created here so that re-running this cell on its
# own cannot append every edge a second time; ``drug_map`` / ``drug_count`` are
# only ever extended with unseen ids.
row = []
col = []

# The recorded outputs show 22230 table rows but only 22000 non-zero cells, so
# some (drug, gene) pairs repeat. Repeated COO coordinates are summed when the
# matrix is converted, which would store a weight of 2 in a matrix meant to be
# binary, so each edge is kept once.
seen_edges = set()

# Walk both columns in lockstep instead of a ``df[col][idx]`` lookup per cell.
for drug, gene in zip(drug_gene_df["ID"], drug_gene_df["entrez_id"]):

    # create drug map
    if drug not in drug_map:
        drug_map[drug] = drug_count
        drug_count += 1

    # Proteins that are not in the PPI network were removed during
    # pre-processing, so every gene is expected to be in protein_map; a missing
    # one raises KeyError here.
    edge = (drug_map[drug], protein_map[gene])
    if edge in seen_edges:
        continue
    seen_edges.add(edge)

    row.append(edge[0])
    col.append(edge[1])

adj = sp.coo_matrix((np.ones(len(row)), (row, col)), shape=(drug_count, protein_count))
sp.save_npz(save_path + "drug-protein-sparse-adj.npz", adj)

In [93]:
# Count the distinct non-zero cells on the sparse matrix itself rather than on
# a dense drug x protein copy. Converting to CSR merges repeated coordinates
# first, so the result matches np.count_nonzero(adj.toarray()).
adj.tocsr().count_nonzero()

22000

In [94]:
# Read the shape straight from the sparse matrix; densifying it is unnecessary.
adj.shape

(6158, 18505)

---

### Disease - Protein

In [95]:
# Load the preprocessed disease -> protein table (one row per MESHID, protein
# ids spread over numbered columns) and drop the saved index column.
path = root_path + "/disease-prot_preprocessed.csv"
disease_prot_df = pd.read_csv(path).drop(columns=["Unnamed: 0"])

# Print the table size and show the first rows as the cell output.
print(disease_prot_df.shape)
disease_prot_df.head()

(1448, 1023)


Unnamed: 0,MESHID,1,2,3,4,5,6,7,8,9,...,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022
0,D000012,4547.0,,,,,,,,,...,,,,,,,,,,
1,D000013,374654.0,,9289.0,5824.0,4000.0,5828.0,3854.0,3852.0,3853.0,...,,,,,,,,,,
2,D000015,374654.0,1302.0,8626.0,2657.0,5190.0,2138.0,5192.0,5077.0,5824.0,...,,,,,,,,,,
3,D000052,2261.0,3643.0,,,,,,,,...,,,,,,,,,,
4,D000130,2261.0,1280.0,,,,,,,,...,,,,,,,,,,


In [96]:
# State filled by the disease-protein loop in the next cell:
#   disease_map   - MESHID -> consecutive disease index
#   disease_count - number of disease indices handed out so far
#   row / col     - COO coordinates (disease index, protein index)
#   X             - counter incremented once per processed table row
disease_map, disease_count = dict(), 0

row, col = [], []
X = 0

In [97]:
# Build the disease index and the disease x protein adjacency matrix, then
# save it.
#
# The coordinate lists are re-created here so that re-running this cell on its
# own cannot append every edge a second time; ``disease_map`` /
# ``disease_count`` are only ever extended with unseen ids.
row = []
col = []

# Protein ids live in the numbered columns ("1", "2", ...). Derive them from
# the frame instead of hard-coding the 1..1022 range, and keep numeric order.
protein_columns = sorted(
    (name for name in disease_prot_df.columns if str(name).isdigit()),
    key=int,
)
# Pull the protein block out once; per-cell ``df[col][idx]`` lookups over
# roughly 1448 x 1022 cells are far slower than iterating array rows.
protein_table = disease_prot_df[protein_columns].to_numpy()

for disease, proteins in zip(disease_prot_df["MESHID"], protein_table):

    # create disease map
    if disease not in disease_map:
        disease_map[disease] = disease_count
        disease_count += 1
    disease_id = disease_map[disease]

    for prot in proteins:
        # Rows are padded with NaN after their last protein id.
        if pd.isna(prot):
            continue
        # Resolve the index before appending so a KeyError cannot leave
        # ``row`` and ``col`` with different lengths.
        prot_id = protein_map[int(prot)]
        row.append(disease_id)
        col.append(prot_id)
    # Row counter defined in the previous cell (only read by debug prints).
    X += 1

adj = sp.coo_matrix((np.ones(len(row)), (row, col)),
                    shape=(disease_count, protein_count))
sp.save_npz(save_path + "disease-prot-sparse-adj.npz", adj)

In [98]:
# Count the distinct non-zero cells on the sparse matrix itself rather than on
# a dense disease x protein copy. Converting to CSR merges repeated coordinates
# first, so the result matches np.count_nonzero(adj.toarray()).
adj.tocsr().count_nonzero()

29957

In [99]:
# Read the shape straight from the sparse matrix; densifying it is unnecessary.
adj.shape

(1448, 18505)

---

### Drug - Disease

In [100]:
# Load the preprocessed disease -> drug association table and drop the index
# column that was saved into the CSV.
path = root_path + "/disease-drug_preprocessed.csv"
disease_drug_df = pd.read_csv(path).drop(columns=["Unnamed: 0"])

# Print the table size and show the first rows as the cell output.
print(disease_drug_df.shape)
disease_drug_df.head()

(195811, 2)


Unnamed: 0,# Disease(MESH),Chemical
0,D005923,DB00564
1,D009503,DB01072
2,D001249,DB00814
3,D001943,DB00300
4,D001281,DB01394


In [101]:
# Empty COO coordinate lists for the disease-drug loop in the next cell
# (row = disease index, col = drug index).
row, col = [], []

In [102]:
# Translate every (disease, drug) association into index space using the maps
# built by the earlier cells. Ids missing from either map were removed during
# pre-processing, so a miss surfaces as a KeyError.
association_pairs = zip(disease_drug_df["# Disease(MESH)"], disease_drug_df["Chemical"])
for mesh_id, drugbank_id in association_pairs:
    # Resolve both indices before touching the lists so they stay in step.
    disease_idx, drug_idx = disease_map[mesh_id], drug_map[drugbank_id]
    row.append(disease_idx)
    col.append(drug_idx)

# One unit-weight entry per association, laid out as disease x drug.
edge_weights = np.ones(len(row))
adj = sp.coo_matrix((edge_weights, (row, col)), shape=(disease_count, drug_count))
sp.save_npz(save_path + "disease-drug-sparse-adj.npz", adj)

In [103]:
# Count the distinct non-zero cells on the sparse matrix itself rather than on
# a dense disease x drug copy. Converting to CSR merges repeated coordinates
# first, so the result matches np.count_nonzero(adj.toarray()).
adj.tocsr().count_nonzero()

195811

In [104]:
# Read the shape straight from the sparse matrix; densifying it is unnecessary.
adj.shape

(1448, 6158)

---

In [105]:
def save_to_pkl(path, obj):
    """Pickle ``obj`` to ``path``, creating missing parent directories first.

    Parameters
    ----------
    path : str or os.PathLike
        Destination file. An existing file at this path is overwritten.
    obj : object
        Any picklable object.

    Raises
    ------
    OSError
        If a parent directory cannot be created or the file cannot be opened.
    pickle.PicklingError
        If ``obj`` cannot be pickled.
    """
    # The notebook writes into sub-folders such as "index_map/"; on a fresh
    # checkout that folder may not exist yet and ``open`` would fail with
    # FileNotFoundError. A bare file name has no parent to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'wb') as ff:
        pickle.dump(obj, ff)

In [106]:
# Persist the three id -> index dictionaries, in the same order as before
# (drug, protein, disease), under the index_map/ sub-folder of save_path.
index_map_files = (
    ("index_map/drug-map.pkl", drug_map),
    ("index_map/protein-map.pkl", protein_map),
    ("index_map/disease-map.pkl", disease_map),
)
for relative_path, mapping in index_map_files:
    save_to_pkl(save_path + relative_path, mapping)

# Node counts are stored together as a (protein, drug, disease) tuple.
node_counts = (protein_count, drug_count, disease_count)
save_to_pkl(save_path + "graph_info.pkl", node_counts)

In [47]:
# Reload the pickled disease map that this notebook saved, to spot-check it.
# NOTE(review): the name ``map`` shadows the builtin; it is kept because the
# cells further down read ``map`` directly.
file = save_path + "index_map/disease-map.pkl"
with open(file, mode='rb') as pkl_handle:
    map = pickle.load(pkl_handle)

In [54]:
# Show the disease-protein row(s) whose MESHID is D059327, so its position in
# the table can be compared with the index stored in the disease map.
is_target_disease = disease_prot_df["MESHID"] == "D059327"
disease_prot_df.loc[is_target_disease]

Unnamed: 0,MESHID,1,2,3,4,5,6,7,8,9,...,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022
1441,D059327,4920.0,3239.0,8200.0,,,,,,,...,,,,,,,,,,


In [53]:
# Index stored for MESHID D059327 in the reloaded disease map. ``map`` is the
# dict loaded from disease-map.pkl in an earlier cell, not the builtin.
map["D059327"]

1441

In [38]:
# Check how often gene id 10283 appears among the protein-map keys. The
# original read ``prot_map``, which no cell in this notebook defines (NameError
# on a fresh kernel); ``protein_map`` is the dict built by the PPI loop.
list(protein_map.keys()).count(10283)

1

In [51]:
# Reverse lookup: print every MESHID that the reloaded disease map (``map``,
# the dict loaded from disease-map.pkl) assigns to index 1441.
for mesh_id, assigned_index in map.items():
    if assigned_index == 1441:
        print(mesh_id)

D059327
