In [1]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm
import importlib
from IPython.display import display

In [2]:
# STRING network inputs: a NumPy array file (named as the adjacency matrix)
# and the graph itself, read from a GML file with igraph.
string_adjacency_path = "../../data/processed/string_adjacency_matrix.npy"
string_graph_path = "../../data/processed/graph_string"
adj_matrix = np.load(string_adjacency_path)
graph = Graph.Read_GML(string_graph_path)

In [3]:
# Module tables (Reactome processes and DisGeNET diseases); both CSV files are
# comma-separated with the column names on the first row.
module_csv_options = dict(sep=',', header=0)
reactome_modules = pd.read_csv("../../data/processed/string_reactome_modules.csv", **module_csv_options)
# Preview the first table explicitly; the second one is shown as the cell's value.
display(reactome_modules.head(2))
disgenet_modules = pd.read_csv("../../data/processed/string_disgenet_modules.csv", **module_csv_options)
disgenet_modules.head(2)

Unnamed: 0,process,proteins_ids,protein_index,module_size
0,R-HSA-1031716,"['TRIM10', 'TRIM22', 'IRF9', 'IFI30', 'TRIM38'...","[13891, 6259, 6308, 6325, 6149, 5598, 16269, 2...",71
1,R-HSA-112379,"['CDK7', 'CDK9', 'SUPT16H', 'LEO1', 'ERCC2', '...","[786, 6625, 681, 996, 4031, 3268, 2755, 6574, ...",52


Unnamed: 0,process,proteins_ids,protein_index,len,main_component,conservative_module,added_nodes,len_sca,len_conservative,len_added_nodes,main_component_ids,conservative_module_ids,added_nodes_ids,increase
0,C0000786,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...",107,"[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...","[11272, 6160, 12816, 6684, 4645, 7208, 7214, 7...",[16513],108,107,1,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","['PRLR', 'CD163', 'SPAG5', 'IGF2', 'TFRC', 'CE...",['ADAM12'],0.009346
1,C0000822,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...",107,"[9810, 10634, 4866, 7455, 7208, 6890, 6332, 59...","[11272, 6160, 12816, 6684, 4645, 7208, 7214, 7...",[16513],108,107,1,"['AGTR1', 'AHR', 'APOE', 'ARNT', 'CEACAM1', 'C...","['PRLR', 'CD163', 'SPAG5', 'IGF2', 'TFRC', 'CE...",['ADAM12'],0.009346


In [4]:
# Columns whose cells were written to CSV as the string form of a Python list
# and therefore come back from `pd.read_csv` as plain strings.
reactome_list_columns = ['protein_index', 'proteins_ids']
disgenet_list_columns = [
    'protein_index',
    'proteins_ids',
    'conservative_module',
    'added_nodes',
    'conservative_module_ids',
    'added_nodes_ids',
    'main_component',
    'main_component_ids',
]


def _parse_list_cell(cell):
    """Return the Python list stored in a module-table cell.

    Strings are parsed with `ast.literal_eval`. A value that is already a
    list is returned unchanged, so re-running this notebook cell is a no-op
    instead of failing with ``ValueError: malformed node or string``.
    Any other value is still handed to `literal_eval` and raises as before.
    """
    if isinstance(cell, list):
        return cell
    return literal_eval(cell)


# Convert every serialized-list column in place, one table at a time.
for list_column in reactome_list_columns:
    reactome_modules[list_column] = reactome_modules[list_column].apply(_parse_list_cell)
for list_column in disgenet_list_columns:
    disgenet_modules[list_column] = disgenet_modules[list_column].apply(_parse_list_cell)

In [5]:
# Conservative DisGeNET modules: select the conservative-module columns and
# rename them to the generic module column names.
conservative_column_names = {
    'conservative_module_ids': 'proteins_ids',
    'conservative_module': 'protein_index',
    'len_conservative': 'n_proteins',
}
disgenet_conservative_module = (
    disgenet_modules[['process', *conservative_column_names]]
    .rename(columns=conservative_column_names)
)

In [6]:
# SCA modules: keep only the rows whose `increase` value is below 0.4
# (i.e. discard modules where added genes reach 40% or more), then rename the
# main-component columns to the generic module column names.
sca_column_names = {
    'main_component_ids': 'proteins_ids',
    'main_component': 'protein_index',
    'len_sca': 'n_proteins',
}
below_increase_cutoff = disgenet_modules['increase'] < 0.4
disgenet_sca_module = (
    disgenet_modules.loc[below_increase_cutoff, ['process', *sca_column_names]]
    .rename(columns=sca_column_names)
)

# 1. Target Process Metrics

## 1.1 Random Walks with Restart

In [7]:
# Reload the local helper module so edits to it are picked up without a kernel restart.
importlib.reload(metrics_functions)

# Random walk with restart on the STRING graph for the Reactome modules.
process_rwr = metrics_functions.random_walk_restart(graph, reactome_modules)
process_rwr_df = pd.DataFrame.from_dict(process_rwr)

# Relabel each row with the graph vertex name found at the same position.
process_row_names = dict(zip(process_rwr_df.index, graph.vs['name']))
process_rwr_df = process_rwr_df.rename(index=process_row_names)

process_rwr_df.to_csv('../../data/processed/metrics/process_rwr_string.csv')
print(process_rwr_df.shape)
process_rwr_df.head()

  0%|          | 0/244 [00:00<?, ?it/s]

(19035, 244)


Unnamed: 0,R-HSA-1031716,R-HSA-112379,R-HSA-112385,R-HSA-1168640,R-HSA-1234159,R-HSA-141409,R-HSA-141422,R-HSA-141431,R-HSA-141439,R-HSA-141671,...,R-HSA-9633742,R-HSA-9634669,R-HSA-9648114,R-HSA-9660824,R-HSA-983140,R-HSA-983147,R-HSA-983156,R-HSA-983157,R-HSA-983259,R-HSA-983266
ARF5,8.1e-05,8.3e-05,8.3e-05,9.8e-05,0.000107,9.4e-05,9.4e-05,9.4e-05,9.4e-05,0.000104,...,0.000102,0.000102,0.000107,9e-05,9e-05,9e-05,9e-05,9e-05,0.000129,0.000129
PDE1C,3.5e-05,3.4e-05,3.4e-05,3.5e-05,3.6e-05,3.8e-05,3.8e-05,3.8e-05,3.8e-05,3.5e-05,...,3.3e-05,3.3e-05,4e-05,7.9e-05,3.8e-05,3.9e-05,3.8e-05,3.8e-05,4.8e-05,4.8e-05
ERCC1,5.1e-05,0.000132,0.000132,5.3e-05,5.9e-05,6.9e-05,6.9e-05,6.9e-05,6.9e-05,5.3e-05,...,5.3e-05,5.3e-05,6.8e-05,5.9e-05,6.3e-05,6e-05,6.3e-05,6.3e-05,5.5e-05,5.5e-05
TLL1,3.2e-05,2.9e-05,2.9e-05,2.9e-05,3e-05,2.9e-05,2.9e-05,2.9e-05,2.9e-05,2.7e-05,...,2.7e-05,2.7e-05,2.9e-05,4.2e-05,3.1e-05,3.2e-05,3.1e-05,3.1e-05,3e-05,3e-05
PRSS22,3.7e-05,2.1e-05,2.1e-05,2.6e-05,2.5e-05,2.3e-05,2.3e-05,2.3e-05,2.3e-05,2.1e-05,...,2.1e-05,2.1e-05,2.3e-05,3.1e-05,2.3e-05,2.4e-05,2.3e-05,2.3e-05,4e-05,4e-05


In [8]:
# Reload the local helper module so edits to it are picked up without a kernel restart.
importlib.reload(metrics_functions)

# Random walk with restart on the STRING graph for the DisGeNET SCA modules.
disease_rwr = metrics_functions.random_walk_restart(graph, disgenet_sca_module)
disease_rwr_df = pd.DataFrame.from_dict(disease_rwr)

# Relabel each row with the graph vertex name found at the same position.
disease_row_names = dict(zip(disease_rwr_df.index, graph.vs['name']))
disease_rwr_df = disease_rwr_df.rename(index=disease_row_names)

disease_rwr_df.to_csv('../../data/processed/metrics/disease_rwr_string.csv')
print(disease_rwr_df.shape)
disease_rwr_df.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
ARF5,8.5e-05,8.5e-05,9.2e-05,0.000109,9e-05,9.4e-05,9.2e-05,0.000109,9e-05,8.3e-05,...,8.7e-05,8.8e-05,8.7e-05,9.7e-05,0.000101,8.5e-05,0.000101,8.9e-05,9.7e-05,8.3e-05
PDE1C,4.1e-05,4.1e-05,4.2e-05,5.4e-05,6.5e-05,4.9e-05,4.9e-05,4.2e-05,4.1e-05,4.8e-05,...,6.6e-05,6.5e-05,6.6e-05,3.9e-05,5e-05,4.1e-05,5e-05,5.5e-05,4.5e-05,4.3e-05
ERCC1,5.3e-05,5.3e-05,7.5e-05,5.2e-05,5.4e-05,5.6e-05,5.7e-05,5.7e-05,6.1e-05,6.2e-05,...,5.7e-05,5.7e-05,5.7e-05,6.3e-05,6e-05,5.3e-05,6e-05,5.3e-05,0.002809,5.5e-05
TLL1,8.4e-05,8.4e-05,4e-05,4.6e-05,4.3e-05,4.3e-05,4.7e-05,3.6e-05,4.8e-05,4.8e-05,...,4.4e-05,4.4e-05,4.4e-05,3.5e-05,3.8e-05,8.4e-05,3.8e-05,9.1e-05,4.1e-05,5.7e-05
PRSS22,4.2e-05,4.2e-05,3.3e-05,3.2e-05,3.1e-05,3.6e-05,3.4e-05,2.6e-05,3e-05,3.5e-05,...,3.2e-05,3.2e-05,3.2e-05,2.9e-05,3e-05,4.2e-05,3e-05,2.7e-05,3.6e-05,3.6e-05


In [9]:
# Reload the local helper module so edits to it are picked up without a kernel restart.
importlib.reload(metrics_functions)

# Random walk with restart on the STRING graph for the conservative DisGeNET modules.
disease_rwr_conservative = metrics_functions.random_walk_restart(graph, disgenet_conservative_module)
disease_rwr_conservative_df = pd.DataFrame.from_dict(disease_rwr_conservative)

# Relabel each row with the graph vertex name found at the same position.
conservative_row_names = dict(zip(disease_rwr_conservative_df.index, graph.vs['name']))
disease_rwr_conservative_df = disease_rwr_conservative_df.rename(index=conservative_row_names)

disease_rwr_conservative_df.to_csv('../../data/processed/metrics/disease_rwr_conservative_string.csv')
print(disease_rwr_conservative_df.shape)
disease_rwr_conservative_df.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
ARF5,8.5e-05,8.5e-05,9.2e-05,0.000109,9e-05,9.4e-05,9.2e-05,0.000109,9e-05,8.3e-05,...,8.7e-05,8.8e-05,8.7e-05,9.7e-05,0.000101,8.5e-05,0.000101,8.9e-05,9.7e-05,8.3e-05
PDE1C,4.1e-05,4.1e-05,4.2e-05,5.4e-05,6.5e-05,4.9e-05,4.9e-05,4.2e-05,4.1e-05,4.8e-05,...,6.6e-05,6.5e-05,6.6e-05,3.9e-05,5e-05,4.1e-05,5e-05,5.5e-05,4.5e-05,4.3e-05
ERCC1,5.3e-05,5.3e-05,7.5e-05,5.2e-05,5.4e-05,5.6e-05,5.7e-05,5.7e-05,6.1e-05,6.2e-05,...,5.7e-05,5.7e-05,5.7e-05,6.3e-05,6e-05,5.3e-05,6e-05,5.3e-05,0.00286,5.5e-05
TLL1,8.2e-05,8.2e-05,4e-05,4.6e-05,4.3e-05,4.3e-05,4.7e-05,3.6e-05,4.8e-05,4.8e-05,...,4.4e-05,4.4e-05,4.4e-05,3.5e-05,3.8e-05,8.2e-05,3.8e-05,9.1e-05,4.1e-05,5.7e-05
PRSS22,4.3e-05,4.3e-05,3.3e-05,3.2e-05,3.1e-05,3.6e-05,3.4e-05,2.6e-05,3e-05,3.5e-05,...,3.2e-05,3.2e-05,3.2e-05,2.9e-05,3e-05,4.3e-05,3e-05,2.7e-05,3.6e-05,3.6e-05


## 1.2 Label Creation

In [10]:
# Membership matrix with the same shape as the Reactome RWR table: the k-th
# column corresponds to the k-th row of `reactome_modules`, and the rows listed
# in that module's `protein_index` are set to 1 (everything else stays 0).
reactome_labels = np.zeros(process_rwr_df.shape)
for module_col, member_rows in enumerate(reactome_modules['protein_index'].values):
    # Integer-array indexing marks all member rows of this column at once.
    reactome_labels[member_rows, module_col] = 1
np.savetxt("../../data/processed/reactome_labels_string.csv", reactome_labels, delimiter=",")

In [11]:
# Membership matrix with the same shape as the DisGeNET SCA RWR table: the k-th
# column corresponds to the k-th row of `disgenet_sca_module`, and the rows
# listed in that module's `protein_index` are set to 1 (everything else stays 0).
disgenet_sca_labels = np.zeros(disease_rwr_df.shape)
for module_col, member_rows in enumerate(disgenet_sca_module['protein_index'].values):
    # Integer-array indexing marks all member rows of this column at once.
    disgenet_sca_labels[member_rows, module_col] = 1
np.savetxt("../../data/processed/disgenet_sca_labels_string.csv", disgenet_sca_labels, delimiter=",")

In [12]:
# Membership matrix with the same shape as the conservative DisGeNET RWR table:
# the k-th column corresponds to the k-th row of `disgenet_conservative_module`,
# and the rows listed in that module's `protein_index` are set to 1 (else 0).
disgenet_conservative_labels = np.zeros(disease_rwr_conservative_df.shape)
for module_col, member_rows in enumerate(disgenet_conservative_module['protein_index'].values):
    # Integer-array indexing marks all member rows of this column at once.
    disgenet_conservative_labels[member_rows, module_col] = 1
np.savetxt("../../data/processed/disgenet_conservative_labels_string.csv", disgenet_conservative_labels, delimiter=",")