In [3]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm
import importlib

In [4]:
reactome_modules = pd.read_csv("../../data/processed/string_reactome_modules.csv", sep=',', header=0)
disgenet_modules = pd.read_csv("../../data/processed/string_disgenet_modules.csv", sep=',', header=0)

adj_matrix = np.load("../../data/processed/string_adjacency_matrix.npy")
graph = Graph.Read_GML("../../data/processed/graph_string")
graph_wgh = Graph.Read_GML("../../data/processed/graph_string_weighted")

In [5]:
reactome_modules['protein_index'] = reactome_modules['protein_index'].apply(literal_eval)
disgenet_modules['protein_index'] = disgenet_modules['protein_index'].apply(literal_eval)
reactome_modules['proteins_ids'] = reactome_modules['proteins_ids'].apply(literal_eval)
disgenet_modules['proteins_ids'] = disgenet_modules['proteins_ids'].apply(literal_eval)
disgenet_modules['conservative_module'] = disgenet_modules['conservative_module'].apply(literal_eval)
disgenet_modules['added_nodes'] = disgenet_modules['added_nodes'].apply(literal_eval)
disgenet_modules['conservative_module_ids'] = disgenet_modules['conservative_module_ids'].apply(literal_eval)
disgenet_modules['added_nodes_ids'] = disgenet_modules['added_nodes_ids'].apply(literal_eval)
disgenet_modules['main_component'] = disgenet_modules['main_component'].apply(literal_eval)
disgenet_modules['main_component_ids'] = disgenet_modules['main_component_ids'].apply(literal_eval)

In [6]:
disgenet_conservative_module = disgenet_modules[['process', 'conservative_module_ids', 'conservative_module', 'len_conservative']]
disgenet_conservative_module.columns = ['process', 'proteins_ids', 'protein_index', 'n_proteins']

In [7]:
disgenet_sca_module = disgenet_modules[disgenet_modules['increase']<0.4][['process', 'main_component_ids', 'main_component', 'len_sca']]
disgenet_sca_module.columns = ['process', 'proteins_ids', 'protein_index', 'n_proteins']

# 1. Target Process Metrics

## 1.1 Random Walks with Restart

In [12]:
importlib.reload(metrics_functions)
process_rwr = metrics_functions.random_walk_restart(graph, reactome_modules)
process_rwr_df = pd.DataFrame.from_dict(process_rwr)
process_rwr_df = process_rwr_df.rename(index=dict(zip(list(process_rwr_df.index),list(graph.vs['name']))))
process_rwr_df.to_csv('../../data/processed/metrics/process_rwr_string.csv')
print(process_rwr_df.shape)
process_rwr_df.head()

  0%|          | 0/244 [00:00<?, ?it/s]

(19035, 244)


Unnamed: 0,R-HSA-1031716,R-HSA-112379,R-HSA-112385,R-HSA-1168640,R-HSA-1234159,R-HSA-141409,R-HSA-141422,R-HSA-141431,R-HSA-141439,R-HSA-141671,...,R-HSA-9633742,R-HSA-9634669,R-HSA-9648114,R-HSA-9660824,R-HSA-983140,R-HSA-983147,R-HSA-983156,R-HSA-983157,R-HSA-983259,R-HSA-983266
A1BG,2.5e-05,1.8e-05,1.8e-05,2.3e-05,2.3e-05,2.1e-05,2.1e-05,2.1e-05,2.1e-05,1.9e-05,...,1.9e-05,1.9e-05,2.2e-05,2.3e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.1e-05,2.1e-05
A1CF,5.4e-05,5.5e-05,5.5e-05,5.3e-05,5.2e-05,5.2e-05,5.2e-05,5.2e-05,5.2e-05,5.6e-05,...,5.6e-05,5.6e-05,5.2e-05,4.7e-05,5.6e-05,5.2e-05,5.6e-05,5.6e-05,5e-05,5e-05
A2M,9.9e-05,5.9e-05,5.9e-05,6.5e-05,6.5e-05,5.8e-05,5.8e-05,5.8e-05,5.8e-05,5.8e-05,...,5.9e-05,5.8e-05,6e-05,9.4e-05,6.4e-05,6.5e-05,6.4e-05,6.4e-05,6.4e-05,6.4e-05
A2ML1,2.8e-05,2.1e-05,2.1e-05,2.3e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.2e-05,2.1e-05,...,2.1e-05,2.1e-05,2.2e-05,2.5e-05,3.1e-05,3.2e-05,3.1e-05,3.1e-05,2.4e-05,2.4e-05
A3GALT2,1.7e-05,1e-05,1e-05,1.3e-05,1.1e-05,9e-06,9e-06,9e-06,9e-06,9e-06,...,1e-05,1e-05,9e-06,1.5e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05


In [13]:
importlib.reload(metrics_functions)
disease_rwr = metrics_functions.random_walk_restart(graph, disgenet_sca_module)
disease_rwr_df = pd.DataFrame.from_dict(disease_rwr)
disease_rwr_df = disease_rwr_df.rename(index=dict(zip(list(disease_rwr_df.index),list(graph.vs['name']))))
disease_rwr_df.to_csv('../../data/processed/metrics/disease_rwr_string.csv')
print(disease_rwr_df.shape)
disease_rwr_df.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
A1BG,4.5e-05,4.5e-05,4.1e-05,4.2e-05,2.7e-05,3e-05,3.6e-05,2.6e-05,3.5e-05,2.9e-05,...,2.6e-05,2.5e-05,2.6e-05,2.9e-05,2.8e-05,4.5e-05,2.8e-05,2.3e-05,2.5e-05,3.5e-05
A1CF,5.8e-05,5.8e-05,5.6e-05,6e-05,5.5e-05,5.8e-05,5.7e-05,7e-05,5.4e-05,4.8e-05,...,5.5e-05,5.5e-05,5.5e-05,6.3e-05,5.6e-05,5.8e-05,5.6e-05,5.1e-05,5.4e-05,6.1e-05
A2M,0.000178,0.000178,0.000121,0.000125,0.0001,0.000128,0.001844,9.9e-05,0.000123,0.00014,...,0.000102,0.000101,0.000102,9.5e-05,0.000109,0.000178,0.000109,0.000146,0.000108,0.000169
A2ML1,4.2e-05,4.2e-05,3.4e-05,3.9e-05,3.1e-05,2.8e-05,3.7e-05,2.5e-05,3.1e-05,3.6e-05,...,2.5e-05,2.5e-05,2.5e-05,2.7e-05,3.3e-05,4.2e-05,3.3e-05,3.1e-05,3e-05,3.2e-05
A3GALT2,1.7e-05,1.7e-05,1.1e-05,1.3e-05,1.2e-05,1.1e-05,1.2e-05,1.1e-05,1.3e-05,1.4e-05,...,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.4e-05,1.7e-05,1.4e-05,1.2e-05,1.1e-05,1.5e-05


In [14]:
importlib.reload(metrics_functions)
disease_rwr_conservative = metrics_functions.random_walk_restart(graph, disgenet_conservative_module)
disease_rwr_conservative_df = pd.DataFrame.from_dict(disease_rwr_conservative)
disease_rwr_conservative_df = disease_rwr_conservative_df.rename(index=dict(zip(list(disease_rwr_conservative_df.index),list(graph.vs['name']))))
disease_rwr_conservative_df.to_csv('../../data/processed/metrics/disease_rwr_conservative_string.csv')
print(disease_rwr_conservative_df.shape)
disease_rwr_conservative_df.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
A1BG,4.5e-05,4.5e-05,4.1e-05,4.2e-05,2.7e-05,3e-05,3.6e-05,2.6e-05,3.5e-05,2.9e-05,...,2.6e-05,2.5e-05,2.6e-05,2.9e-05,2.9e-05,4.5e-05,2.9e-05,2.3e-05,2.5e-05,3.5e-05
A1CF,5.8e-05,5.8e-05,5.6e-05,6e-05,5.5e-05,5.8e-05,5.7e-05,7e-05,5.4e-05,4.8e-05,...,5.5e-05,5.5e-05,5.5e-05,6.3e-05,5.6e-05,5.8e-05,5.6e-05,5.1e-05,5.5e-05,6.1e-05
A2M,0.000177,0.000177,0.000121,0.000125,0.0001,0.000128,0.001844,9.9e-05,0.000123,0.000136,...,0.000102,0.000101,0.000102,9.5e-05,0.000109,0.000177,0.000109,0.000146,0.000108,0.000169
A2ML1,4.2e-05,4.2e-05,3.4e-05,3.9e-05,3.1e-05,2.8e-05,3.7e-05,2.5e-05,3.1e-05,3.2e-05,...,2.5e-05,2.5e-05,2.5e-05,2.7e-05,3.3e-05,4.2e-05,3.3e-05,3.1e-05,3e-05,3.2e-05
A3GALT2,1.7e-05,1.7e-05,1.1e-05,1.3e-05,1.2e-05,1.1e-05,1.2e-05,1.1e-05,1.3e-05,1.5e-05,...,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.4e-05,1.7e-05,1.4e-05,1.2e-05,1.1e-05,1.5e-05


## 1.2 Labels Creation

In [15]:
reactome_labels = np.zeros(process_rwr_df.shape)
column = 0
for indexes in reactome_modules['protein_index'].values:
    for pos in indexes:
        reactome_labels[pos][column] = 1
    column += 1
np.savetxt("../../data/processed/reactome_labels_string.csv", reactome_labels, delimiter=",")

In [16]:
disgenet_sca_labels = np.zeros(disease_rwr_df.shape)
column = 0
for indexes in disgenet_sca_module['protein_index'].values:
    for pos in indexes:
        disgenet_sca_labels[pos][column] = 1
    column += 1
np.savetxt("../../data/processed/disgenet_sca_labels_string.csv", disgenet_sca_labels, delimiter=",")

In [17]:
disgenet_conservative_labels = np.zeros(disease_rwr_conservative_df.shape)
column = 0
for indexes in disgenet_conservative_module['protein_index'].values:
    for pos in indexes:
        disgenet_conservative_labels[pos][column] = 1
    column += 1
np.savetxt("../../data/processed/disgenet_conservative_labels_string.csv", disgenet_conservative_labels, delimiter=",")

# 1.3. GenePANDA

In [1]:
sp = graph_wgh.shortest_paths()
sp_df = pd.DataFrame(sp)
sp_df.to_csv('../../data/processed/metrics/string_sp.csv')

NameError: name 'graph_wgh' is not defined

In [8]:
sp_df = pd.read_csv('../../data/processed/metrics/string_sp.csv', index_col=0)

In [9]:
edge_df = graph_wgh.get_edge_dataframe()
df = pd.crosstab(edge_df.source, edge_df.target, values=edge_df['weight'], aggfunc=np.mean)
idx = df.columns.union(df.index)
df = df.reindex(index = idx, columns=idx)
df.fillna(0, inplace=True)
weight_adj_upper = np.array(df)
weight_adj = weight_adj_upper + weight_adj_upper.T - np.diag(np.diag(weight_adj_upper))
weight_adj[weight_adj == 0] = 1

In [11]:
reactome_labels_df = pd.read_csv("../../data/processed/reactome_labels_string.csv", sep=',', names=reactome_modules['process'].values)
reactome_labels_df['protein_id'] = graph.vs['name']
reactome_labels_df.set_index('protein_id', inplace=True)

In [12]:
disgenet_labels_df = pd.read_csv('../../data/processed/disgenet_sca_labels_string.csv', header = None)
disgenet_labels_df['protein_id'] = graph.vs['name']
disgenet_labels_df.set_index('protein_id', inplace=True)

disgenet_labels_conservative_df = pd.read_csv('../../data/processed/disgenet_conservative_labels_string.csv', header = None)
disgenet_labels_conservative_df['protein_id'] = graph.vs['name']
disgenet_labels_conservative_df.set_index('protein_id', inplace=True)

In [14]:
importlib.reload(metrics_functions)
process_genePANDA = metrics_functions.genePANDA(graph, reactome_labels_df, np.array(sp_df), weight_adj)
process_genePANDA.to_csv('../../data/processed/metrics/process_genePANDA_string.csv')
print(process_genePANDA.shape)
process_genePANDA.head()

Running...


  0%|          | 0/244 [00:00<?, ?it/s]

(19035, 244)


Unnamed: 0,R-HSA-1031716,R-HSA-112379,R-HSA-112385,R-HSA-1168640,R-HSA-1234159,R-HSA-141409,R-HSA-141422,R-HSA-141431,R-HSA-141439,R-HSA-141671,...,R-HSA-9633742,R-HSA-9634669,R-HSA-9648114,R-HSA-9660824,R-HSA-983140,R-HSA-983147,R-HSA-983156,R-HSA-983157,R-HSA-983259,R-HSA-983266
A1BG,0.00066,0.006009,0.006009,0.00399,0.005026,0.001551,0.001551,0.001551,0.001551,0.00674,...,0.006555,0.006613,0.000872,0.002855,0.000854,0.000862,0.000854,0.000854,0.001466,0.001466
A1CF,0.000791,0.016966,0.016966,0.016342,0.021135,0.00182,0.00182,0.00182,0.00182,0.007004,...,0.006769,0.00683,0.00081,0.004436,0.001169,0.000755,0.001169,0.001169,0.005737,0.005737
A2M,0.000758,0.019097,0.019097,0.004077,0.005129,0.005057,0.005057,0.005057,0.005057,0.009324,...,0.008905,0.008934,0.001601,0.002583,0.002339,0.001992,0.002339,0.002339,0.002322,0.002322
A2ML1,0.000659,0.008851,0.008851,0.004503,0.010428,0.002362,0.002362,0.002362,0.002362,0.007902,...,0.007685,0.007724,0.000886,0.003132,0.000847,0.000914,0.000847,0.000847,0.002995,0.002995
A3GALT2,0.000813,0.021622,0.021622,0.004,0.007088,0.001617,0.001617,0.001617,0.001617,0.0234,...,0.022746,0.022733,0.000684,0.002681,0.000657,0.000864,0.000657,0.000657,0.006569,0.006569


In [15]:
importlib.reload(metrics_functions)
disease_genePANDA = metrics_functions.genePANDA(graph, disgenet_labels_df, np.array(sp_df), weight_adj)
disease_genePANDA.to_csv('../../data/processed/metrics/disease_genePANDA_string.csv')
print(disease_genePANDA.shape)
disease_genePANDA.head()

Running...


  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
A1BG,0.000862,0.000862,0.001696,0.001027,0.0017,0.000321,0.000252,0.000799,0.001505,0.000555,...,0.000454,0.000615,0.000454,0.000675,0.002724,0.000862,0.002724,0.000287,0.000732,0.000982
A1CF,0.000923,0.000923,0.001005,0.000796,0.001792,0.000401,0.000245,0.000896,0.000895,0.002772,...,0.000395,0.000603,0.000395,0.003513,0.001666,0.000923,0.001666,0.000284,0.000676,0.001326
A2M,0.005534,0.005534,0.005386,0.002777,0.006484,0.003628,0.003663,0.001538,0.004228,0.003275,...,0.001345,0.00141,0.001345,0.001904,0.006586,0.005534,0.006586,0.002629,0.001351,0.00312
A2ML1,0.001343,0.001343,0.000992,0.000983,0.002603,0.000297,0.000238,0.000599,0.000852,0.000348,...,0.000276,0.000562,0.000276,0.001007,0.002956,0.001343,0.002956,0.000326,0.000682,0.001561
A3GALT2,0.000785,0.000785,0.001926,0.000834,0.000844,0.0,0.0,0.000613,0.000809,0.000441,...,0.000474,0.000998,0.000474,0.0,0.001552,0.000785,0.001552,0.000574,0.000692,0.000987


In [17]:
importlib.reload(metrics_functions)
disease_conservative_genePANDA = metrics_functions.genePANDA(graph, disgenet_labels_conservative_df, np.array(sp_df), weight_adj)
disease_conservative_genePANDA.to_csv('../../data/processed/metrics/disease_conservative_genePANDA_string.csv')
print(disease_conservative_genePANDA.shape)
disease_conservative_genePANDA.head()

Running...


  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
A1BG,0.000782,0.000782,0.001696,0.001027,0.0017,0.000321,0.000252,0.000799,0.001577,0.00055,...,0.000454,0.000615,0.000454,0.000675,0.002609,0.000782,0.002609,0.000287,0.00072,0.000982
A1CF,0.000919,0.000919,0.001005,0.000796,0.001792,0.000401,0.000245,0.000896,0.000891,0.002681,...,0.000395,0.000603,0.000395,0.003513,0.001684,0.000919,0.001684,0.000284,0.000671,0.001326
A2M,0.005328,0.005328,0.005386,0.002777,0.006484,0.003628,0.003663,0.001538,0.004169,0.003138,...,0.001345,0.00141,0.001345,0.001904,0.006478,0.005328,0.006478,0.002629,0.00141,0.00312
A2ML1,0.001342,0.001342,0.000992,0.000983,0.002603,0.000297,0.000238,0.000599,0.00085,0.000604,...,0.000276,0.000562,0.000276,0.001007,0.002865,0.001342,0.002865,0.000326,0.000682,0.001561
A3GALT2,0.000778,0.000778,0.001926,0.000834,0.000844,0.0,0.0,0.000613,0.000793,0.000435,...,0.000474,0.000998,0.000474,0.0,0.001661,0.000778,0.001661,0.000574,0.000688,0.000987


### 1.5 MaxLink

In [18]:
importlib.reload(metrics_functions)
process_maxlink = metrics_functions.MaxLink(reactome_labels_df, adj_matrix)
process_maxlink.to_csv('../../data/processed/metrics/process_maxlink_string.csv')
print(process_maxlink.shape)
process_maxlink.head()

  0%|          | 0/244 [00:00<?, ?it/s]

(19035, 244)


Unnamed: 0,R-HSA-1031716,R-HSA-112379,R-HSA-112385,R-HSA-1168640,R-HSA-1234159,R-HSA-141409,R-HSA-141422,R-HSA-141431,R-HSA-141439,R-HSA-141671,...,R-HSA-9633742,R-HSA-9634669,R-HSA-9648114,R-HSA-9660824,R-HSA-983140,R-HSA-983147,R-HSA-983156,R-HSA-983157,R-HSA-983259,R-HSA-983266
0,1,0,0,1,1,1,1,1,1,1,...,1,1,2,0,4,4,4,4,1,1
1,2,2,2,1,1,3,3,3,3,3,...,3,3,3,0,10,5,10,10,0,0
2,9,1,1,2,2,0,0,0,0,2,...,2,2,1,7,3,3,3,3,1,1
3,1,0,0,1,0,0,0,0,0,1,...,1,1,0,0,6,6,6,6,0,0
4,2,0,0,2,1,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [19]:
importlib.reload(metrics_functions)
disease_maxlink = metrics_functions.MaxLink(disgenet_labels_df, adj_matrix)
disease_maxlink.to_csv('../../data/processed/metrics/disease_maxlink_string.csv')
print(disease_maxlink.shape)
disease_maxlink.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,9,9,18,10,11,5,7,1,13,3,...,4,4,4,6,6,9,6,0,1,8
1,6,6,7,4,10,6,7,9,8,0,...,4,4,4,14,4,6,4,1,1,7
2,49,49,44,25,52,39,50,10,47,31,...,23,23,23,20,28,49,28,23,14,46
3,6,6,7,5,8,1,5,0,4,2,...,0,0,0,3,7,6,7,1,1,2
4,3,3,0,1,1,0,0,0,2,2,...,0,0,0,0,1,3,1,0,0,3


In [20]:
importlib.reload(metrics_functions)
disease_conservative_maxlink = metrics_functions.MaxLink(disgenet_labels_conservative_df, adj_matrix)
disease_conservative_maxlink.to_csv('../../data/processed/metrics/disease_conservative_maxlink_string.csv')
print(disease_conservative_maxlink.shape)
disease_conservative_maxlink.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,9,9,18,10,11,5,7,1,13,3,...,4,4,4,6,6,9,6,0,1,8
1,6,6,7,4,10,6,7,9,8,0,...,4,4,4,14,4,6,4,1,1,7
2,48,48,44,25,52,39,50,10,47,30,...,23,23,23,20,28,48,28,23,14,46
3,6,6,7,5,8,1,5,0,4,1,...,0,0,0,3,7,6,7,1,1,2
4,3,3,0,1,1,0,0,0,2,2,...,0,0,0,0,1,3,1,0,0,3
