In [3]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm
import importlib
from IPython.display import display

In [5]:
# Load the preprocessed STRING network in both of its on-disk forms:
# the igraph graph (GML file) and the NumPy adjacency matrix (.npy file).
graph = Graph.Read_GML("../../data/processed/graph_string")
adj_matrix = np.load("../../data/processed/string_adjacency_matrix.npy")

In [4]:
# Read the Reactome and DisGeNET module tables (each row carries a `process`
# identifier plus its member proteins), then preview two rows of each.
reactome_modules = pd.read_csv(
    "../../data/processed/string_reactome_modules.csv",
    sep=',',
    header=0,
)
disgenet_modules = pd.read_csv(
    "../../data/processed/string_disgenet_modules.csv",
    sep=',',
    header=0,
)

# Only the last expression of a cell is rendered automatically, so the first
# preview has to go through display() explicitly.
display(reactome_modules.head(2))
disgenet_modules.head(2)

Unnamed: 0,process,proteins_ids,protein_index,module_size
0,R-HSA-1031716,"['TRIM10', 'TRIM22', 'IRF9', 'IFI30', 'TRIM38'...","[16047, 5732, 430, 10346, 10992, 4050, 13305, ...",67
1,R-HSA-112379,"['CDK7', 'CDK9', 'SUPT16H', 'LEO1', 'ERCC2', '...","[1962, 5424, 3557, 7708, 2587, 4699, 7715, 115...",52


Unnamed: 0,process,proteins_ids,protein_index,len,main_component,conservative_module,added_nodes,len_sca,len_conservative,len_added_nodes,main_component_ids,conservative_module_ids,added_nodes_ids,increase
0,C0000786,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","[977, 3340, 3243, 6233, 7440, 9899, 4106, 7824...",103,"[977, 3340, 3243, 6233, 7440, 9899, 4106, 7824...","[3073, 4106, 5670, 4647, 5672, 53, 1591, 575, ...","[408, 94, 13110, 12727, 10080, 2430, 3802, 379...",127,103,25,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","['ITGB6', 'CD8A', 'IL12B', 'IL5RA', 'TNFSF10',...","['STAT1', 'CASP3', 'INHBC', 'PLOD1', 'CTSG', '...",0.242718
1,C0000822,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","[977, 3340, 3243, 6233, 7440, 9899, 4106, 7824...",103,"[977, 3340, 3243, 6233, 7440, 9899, 4106, 7824...","[3073, 4106, 5670, 4647, 5672, 53, 1591, 575, ...","[408, 94, 13110, 12727, 10080, 2430, 3802, 379...",127,103,25,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","['ITGB6', 'CD8A', 'IL12B', 'IL5RA', 'TNFSF10',...","['STAT1', 'CASP3', 'INHBC', 'PLOD1', 'CTSG', '...",0.242718


In [6]:
def _parse_list_cell(value):
    """Turn the string form of a list (as stored in the CSV) into a real list.

    A value that is already a list is returned unchanged, which makes this
    cell safe to re-run: without that guard a second run would hand a list to
    ``literal_eval`` and fail with ``ValueError``. Any other non-string value
    (e.g. NaN from a missing CSV field) still goes to ``literal_eval`` and
    raises, exactly as before.
    """
    return value if isinstance(value, list) else literal_eval(value)


# Columns that read_csv loaded as strings such as "[16047, 5732, ...]" or
# "['TRIM10', 'TRIM22', ...]" and that later cells use as Python lists.
reactome_list_columns = ['protein_index', 'proteins_ids']
disgenet_list_columns = [
    'protein_index',
    'proteins_ids',
    'conservative_module',
    'added_nodes',
    'conservative_module_ids',
    'added_nodes_ids',
    'main_component',
    'main_component_ids',
]

for col_name in reactome_list_columns:
    reactome_modules[col_name] = reactome_modules[col_name].apply(_parse_list_cell)

for col_name in disgenet_list_columns:
    disgenet_modules[col_name] = disgenet_modules[col_name].apply(_parse_list_cell)

In [7]:
# Conservative DisGeNET modules: keep the process id plus the conservative
# module's gene names, vertex indices and size, renamed to the generic
# `proteins_ids` / `protein_index` / `n_proteins` column names.
disgenet_conservative_module = disgenet_modules[
    ['process', 'conservative_module_ids', 'conservative_module', 'len_conservative']
].rename(
    columns={
        'conservative_module_ids': 'proteins_ids',
        'conservative_module': 'protein_index',
        'len_conservative': 'n_proteins',
    }
)

In [8]:
# SCA modules: keep only the rows whose `increase` is strictly below 0.4,
# i.e. drop every module with increase >= 0.4. In the previewed rows,
# `increase` equals len_added_nodes / len (25 / 103 = 0.242718) -- presumably
# the fraction of genes added to the module; confirm against the upstream
# notebook that computes it.
has_small_increase = disgenet_modules['increase'] < 0.4
disgenet_sca_module = disgenet_modules.loc[
    has_small_increase,
    ['process', 'main_component_ids', 'main_component', 'len_sca'],
].rename(
    columns={
        'main_component_ids': 'proteins_ids',
        'main_component': 'protein_index',
        'len_sca': 'n_proteins',
    }
)

# 1. Target Process Metrics

## 1.1 Random Walks with Restart

In [10]:
# Re-import the helper module so that edits made to it on disk take effect
# without restarting the kernel.
importlib.reload(metrics_functions)

# Random-walk-with-restart scores for the Reactome modules. The recorded
# output is a 16381 x 231 table with one column per Reactome process id.
process_rwr = metrics_functions.random_walk_restart(graph, reactome_modules)
process_rwr_df = pd.DataFrame.from_dict(process_rwr)

# Relabel the rows with the graph's vertex names, pairing them positionally
# with the current row labels.
# NOTE(review): assumes the row order matches the igraph vertex order -- confirm.
row_label_to_name = {
    row_label: vertex_name
    for row_label, vertex_name in zip(process_rwr_df.index, graph.vs['name'])
}
process_rwr_df = process_rwr_df.rename(index=row_label_to_name)

process_rwr_df.to_csv('../../data/processed/metrics/process_rwr_string.csv')
print(process_rwr_df.shape)
process_rwr_df.head()

  0%|          | 0/231 [00:00<?, ?it/s]

(16381, 231)


Unnamed: 0,R-HSA-1031716,R-HSA-112379,R-HSA-112385,R-HSA-1168640,R-HSA-1234159,R-HSA-141409,R-HSA-141422,R-HSA-141431,R-HSA-141439,R-HSA-141671,...,R-HSA-9633742,R-HSA-9634669,R-HSA-9648114,R-HSA-9660824,R-HSA-983140,R-HSA-983147,R-HSA-983156,R-HSA-983157,R-HSA-983259,R-HSA-983266
GUCY2F,2.1e-05,1.2e-05,1.2e-05,1.4e-05,1.5e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.1e-05,...,1.2e-05,1.1e-05,2e-05,0.000101,1.8e-05,1.8e-05,1.8e-05,1.8e-05,2.7e-05,2.7e-05
PDE1C,3.1e-05,1.6e-05,1.6e-05,1.9e-05,2e-05,2.3e-05,2.3e-05,2.3e-05,2.3e-05,1.2e-05,...,1.2e-05,1.2e-05,2.4e-05,0.00022,2.4e-05,2.5e-05,2.4e-05,2.4e-05,3.1e-05,3.1e-05
GNB3,0.000104,4.3e-05,4.3e-05,7.4e-05,7e-05,6.8e-05,6.8e-05,6.8e-05,6.8e-05,3e-05,...,3e-05,3e-05,7.4e-05,0.002709,7.1e-05,7.3e-05,7.1e-05,7.1e-05,9.3e-05,9.3e-05
PDE1B,3.2e-05,1.6e-05,1.6e-05,1.9e-05,2e-05,2.6e-05,2.6e-05,2.6e-05,2.6e-05,1.3e-05,...,1.3e-05,1.3e-05,2.6e-05,0.000219,2.5e-05,2.6e-05,2.5e-05,2.5e-05,3.2e-05,3.2e-05
GNG13,6.9e-05,3e-05,3e-05,4.9e-05,4.8e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,1.9e-05,...,1.9e-05,1.9e-05,4.8e-05,0.002662,4.6e-05,4.7e-05,4.6e-05,4.6e-05,5.8e-05,5.8e-05


In [11]:
# Re-import the helper module so that edits made to it on disk take effect
# without restarting the kernel.
importlib.reload(metrics_functions)

# Random-walk-with-restart scores for the DisGeNET SCA modules (the subset
# filtered on `increase`). The recorded output is a 16381 x 282 table with
# one column per disease id.
disease_rwr = metrics_functions.random_walk_restart(graph, disgenet_sca_module)
disease_rwr_df = pd.DataFrame.from_dict(disease_rwr)

# Relabel the rows with the graph's vertex names, pairing them positionally
# with the current row labels.
# NOTE(review): assumes the row order matches the igraph vertex order -- confirm.
row_label_to_name = {
    row_label: vertex_name
    for row_label, vertex_name in zip(disease_rwr_df.index, graph.vs['name'])
}
disease_rwr_df = disease_rwr_df.rename(index=row_label_to_name)

disease_rwr_df.to_csv('../../data/processed/metrics/disease_rwr_string.csv')
print(disease_rwr_df.shape)
disease_rwr_df.head()

  0%|          | 0/282 [00:00<?, ?it/s]

(16381, 282)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
GUCY2F,2.9e-05,2.9e-05,3.6e-05,5.7e-05,7.3e-05,4.5e-05,3.5e-05,2.8e-05,3e-05,5.1e-05,...,5.2e-05,5.1e-05,5.2e-05,2.2e-05,4.7e-05,2.9e-05,4.7e-05,3.9e-05,2.8e-05,2.7e-05
PDE1C,3.6e-05,3.6e-05,5.9e-05,0.000106,0.000114,7.5e-05,6.4e-05,4.4e-05,4.5e-05,0.000111,...,0.000109,0.000107,0.000109,3.1e-05,7.8e-05,3.6e-05,7.8e-05,4.6e-05,4.8e-05,4.6e-05
GNB3,0.000139,0.000139,0.00018,0.00017,0.000611,0.000476,0.000183,0.00013,0.000158,0.000252,...,0.002127,0.002075,0.002127,0.000128,0.000269,0.000139,0.000269,0.000121,0.000171,0.000196
PDE1B,3.5e-05,3.5e-05,6e-05,0.000116,0.000115,7.5e-05,6.3e-05,4.3e-05,4.7e-05,0.000123,...,0.000102,0.000101,0.000102,3.1e-05,8.2e-05,3.5e-05,8.2e-05,5.1e-05,4.8e-05,4.7e-05
GNG13,9.8e-05,9.8e-05,0.00013,0.000165,0.000306,0.000272,0.000131,0.000102,0.000111,0.000193,...,0.000317,0.000311,0.000317,8.1e-05,0.00021,9.8e-05,0.00021,7.9e-05,0.000121,0.000143


In [12]:
# Re-import the helper module so that edits made to it on disk take effect
# without restarting the kernel.
importlib.reload(metrics_functions)

# Random-walk-with-restart scores for the conservative DisGeNET modules. The
# recorded output is a 16381 x 298 table with one column per disease id.
disease_rwr_conservative = metrics_functions.random_walk_restart(
    graph, disgenet_conservative_module
)
disease_rwr_conservative_df = pd.DataFrame.from_dict(disease_rwr_conservative)

# Relabel the rows with the graph's vertex names, pairing them positionally
# with the current row labels.
# NOTE(review): assumes the row order matches the igraph vertex order -- confirm.
row_label_to_name = {
    row_label: vertex_name
    for row_label, vertex_name in zip(
        disease_rwr_conservative_df.index, graph.vs['name']
    )
}
disease_rwr_conservative_df = disease_rwr_conservative_df.rename(index=row_label_to_name)

disease_rwr_conservative_df.to_csv('../../data/processed/metrics/disease_rwr_conservative_string.csv')
print(disease_rwr_conservative_df.shape)
disease_rwr_conservative_df.head()

  0%|          | 0/298 [00:00<?, ?it/s]

(16381, 298)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
GUCY2F,2.9e-05,2.9e-05,3.5e-05,5.7e-05,7.7e-05,4.5e-05,3.4e-05,2.6e-05,3e-05,5.6e-05,...,5.4e-05,5.4e-05,5.4e-05,2.2e-05,4.6e-05,2.9e-05,4.6e-05,4e-05,2.9e-05,2.6e-05
PDE1C,3.7e-05,3.7e-05,5.6e-05,0.000108,0.000121,7.4e-05,6.4e-05,4.4e-05,4.5e-05,0.000123,...,0.000115,0.000113,0.000115,3e-05,7.9e-05,3.7e-05,7.9e-05,4.4e-05,4.8e-05,4.1e-05
GNB3,0.000136,0.000136,0.000179,0.000161,0.000666,0.00046,0.000179,0.000127,0.00016,0.000267,...,0.002384,0.002318,0.002384,0.000125,0.00026,0.000136,0.00026,0.000121,0.000181,0.00019
PDE1B,3.5e-05,3.5e-05,5.7e-05,0.000117,0.000122,7.5e-05,6.4e-05,4.3e-05,4.7e-05,0.000137,...,0.000107,0.000106,0.000107,3.1e-05,8.2e-05,3.5e-05,8.2e-05,4.9e-05,4.8e-05,4.2e-05
GNG13,9.6e-05,9.6e-05,0.000128,0.000154,0.000324,0.000256,0.000124,0.000101,0.000111,0.000205,...,0.000342,0.000335,0.000342,8e-05,0.000195,9.6e-05,0.000195,7.9e-05,0.000126,0.000136


## 1.2 Labels Creation

In [14]:
# Binary membership matrix with the same shape as the Reactome RWR table:
# entry [row, col] is 1 when the protein at row index `row` belongs to the
# module at position `col` of `reactome_modules`, and 0 otherwise.
# NOTE(review): relies on the module row order matching the column order of
# process_rwr_df -- confirm against metrics_functions.random_walk_restart.
reactome_labels = np.zeros(process_rwr_df.shape)
for module_col, member_rows in enumerate(reactome_modules['protein_index'].values):
    # Mark all members of this module in one indexed assignment.
    reactome_labels[np.asarray(member_rows, dtype=np.intp), module_col] = 1

np.savetxt(
    "../../data/processed/reactome_labels_string.csv",
    reactome_labels,
    delimiter=",",
)

In [15]:
# Binary membership matrix with the same shape as the DisGeNET SCA RWR table:
# entry [row, col] is 1 when the protein at row index `row` belongs to the
# module at position `col` of `disgenet_sca_module`, and 0 otherwise.
# Columns are addressed by position, so the gaps left in the DataFrame index
# by the `increase` filter do not matter here.
# NOTE(review): relies on the module row order matching the column order of
# disease_rwr_df -- confirm against metrics_functions.random_walk_restart.
disgenet_sca_labels = np.zeros(disease_rwr_df.shape)
for module_col, member_rows in enumerate(disgenet_sca_module['protein_index'].values):
    # Mark all members of this module in one indexed assignment.
    disgenet_sca_labels[np.asarray(member_rows, dtype=np.intp), module_col] = 1

np.savetxt(
    "../../data/processed/disgenet_sca_labels_string.csv",
    disgenet_sca_labels,
    delimiter=",",
)

In [16]:
# Binary membership matrix with the same shape as the conservative DisGeNET
# RWR table: entry [row, col] is 1 when the protein at row index `row` belongs
# to the module at position `col` of `disgenet_conservative_module`, else 0.
# NOTE(review): relies on the module row order matching the column order of
# disease_rwr_conservative_df -- confirm against
# metrics_functions.random_walk_restart.
disgenet_conservative_labels = np.zeros(disease_rwr_conservative_df.shape)
for module_col, member_rows in enumerate(disgenet_conservative_module['protein_index'].values):
    # Mark all members of this module in one indexed assignment.
    disgenet_conservative_labels[np.asarray(member_rows, dtype=np.intp), module_col] = 1

np.savetxt(
    "../../data/processed/disgenet_conservative_labels_string.csv",
    disgenet_conservative_labels,
    delimiter=",",
)