In [1]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm

In [2]:
reactome_proteins_indexes_apid_huri_df = pd.read_csv("../../data/processed/reactome_proteins_indexes_apid_huri.csv", sep=',', header=0)
disgenet_proteins_indexes_apid_huri_df = pd.read_csv("../../data/processed/disgenet_prot_index_main_comp.csv", sep=',', header=0)

protein_ids_apid_huri_df = pd.read_csv("../../data/processed/protein_ids_apid_huri.csv", sep=',', header=0)
adj_matrix_apid_huri = np.load("../../data/processed/adjacency_matrix_apid_huri.npy")

In [3]:
reactome_proteins_indexes_apid_huri_df['protein_index'] = reactome_proteins_indexes_apid_huri_df['protein_index'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['protein_index'] = disgenet_proteins_indexes_apid_huri_df['protein_index'].apply(literal_eval)
reactome_proteins_indexes_apid_huri_df['proteins_ids'] = reactome_proteins_indexes_apid_huri_df['proteins_ids'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['proteins_ids'] = disgenet_proteins_indexes_apid_huri_df['proteins_ids'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['conservative_module'] = disgenet_proteins_indexes_apid_huri_df['conservative_module'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['added_nodes'] = disgenet_proteins_indexes_apid_huri_df['added_nodes'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['conservative_module_ids'] = disgenet_proteins_indexes_apid_huri_df['conservative_module_ids'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['added_nodes_ids'] = disgenet_proteins_indexes_apid_huri_df['added_nodes_ids'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['main_component'] = disgenet_proteins_indexes_apid_huri_df['main_component'].apply(literal_eval)
disgenet_proteins_indexes_apid_huri_df['main_component_ids'] = disgenet_proteins_indexes_apid_huri_df['main_component_ids'].apply(literal_eval)

In [4]:
graph_apid_huri = Graph.Read_GML("../../data/processed/graph_apid_huri")

In [5]:
disgenet_prot_index_conservative_module = disgenet_proteins_indexes_apid_huri_df[['process', 'conservative_module_ids', 'conservative_module', 'len_cm']]
disgenet_prot_index_conservative_module.columns = ['process', 'proteins_ids', 'protein_index', 'n_proteins']

In [6]:
disgenet_prot_index_main_comp = disgenet_proteins_indexes_apid_huri_df[disgenet_proteins_indexes_apid_huri_df['increase']<0.4][['process', 'main_component_ids', 'main_component', 'len_main_component']]
disgenet_prot_index_main_comp.columns = ['process', 'proteins_ids', 'protein_index', 'n_proteins']

In [7]:
reactome_labels_df = pd.read_csv("../../data/processed/reactome_labels_apid_huri.csv", sep=',', names=reactome_proteins_indexes_apid_huri_df['process'].values)
reactome_labels_df['protein_id'] = graph_apid_huri.vs['name']
reactome_labels_df.set_index('protein_id', inplace=True)

disgenet_labels_df = pd.read_csv("../../data/processed/disgenet_filtered_labels_apid_huri.csv", sep=',', names=disgenet_prot_index_main_comp['process'].values)
disgenet_labels_df['protein_id'] = graph_apid_huri.vs['name']
disgenet_labels_df.set_index('protein_id', inplace=True)

disgenet_labels_conservative_df = pd.read_csv('../../data/processed/disgenet_conservative_labels_apid_huri.csv', names=disgenet_prot_index_conservative_module['process'].values)
disgenet_labels_conservative_df['protein_id'] = graph_apid_huri.vs['name']
disgenet_labels_conservative_df.set_index('protein_id', inplace=True)

In [9]:
process_rwr_whole = pd.read_csv("../../models/GAP-MINE/process/probability/rwr_whole.csv")
process_rwr_whole['new_proteins'] = process_rwr_whole['new_proteins'].apply(literal_eval)
disease_rwr_whole = pd.read_csv("../../models/GAP-MINE/disease/probability/rwr_whole.csv")
disease_rwr_whole['new_proteins'] = disease_rwr_whole['new_proteins'].apply(literal_eval)
disease_conservative_rwr_whole = pd.read_csv("../../models/GAP-MINE/disease/probability/rwr_conservative_whole.csv")
disease_conservative_rwr_whole['new_proteins'] = disease_conservative_rwr_whole['new_proteins'].apply(literal_eval)

In [186]:
disgenet_prot_index_main_comp.reset_index(inplace=True)

In [187]:
protein_list = set(list(disgenet_labels_df.sum(axis=1).replace(0, np.nan).dropna().index)+list(disgenet_labels_conservative_df.sum(axis=1).replace(0, np.nan).dropna().index)+list(reactome_labels_df.sum(axis=1).replace(0, np.nan).dropna().index))

In [188]:
sp = graph_apid_huri.shortest_paths(graph_apid_huri.vs['name'], protein_list)
sp_df = pd.DataFrame(sp, columns=protein_list)
sp_df.index = graph_apid_huri.vs['name']

In [189]:
import math
from tqdm.notebook import tqdm
tqdm.pandas()
def fp_addition(protein_indexes, sp_df, graph, clf_fp):
    clf_fp_proteins = clf_fp['new_proteins']
    min_sp = sp_df[~sp_df.index.isin(protein_indexes)][protein_indexes].min(axis=1)
    degree_values = graph.degree(sp_df[~sp_df.index.isin(protein_indexes)].index)
    log_degree_values = [math.log10(x) for x in degree_values]
    min_sp = pd.DataFrame(min_sp, columns=['sp'])
    min_sp['degree'] = log_degree_values
    min_sp['weight'] = min_sp['degree']/(10**min_sp['sp'])
    min_sp.loc[min_sp.index.isin(clf_fp_proteins), 'weight'] = 0
    min_sp['normalized_weight'] = min_sp['weight']/sum(min_sp['weight'])
    new_proteins = list(np.random.choice(min_sp.index, int(len(protein_indexes)*0.1), p=min_sp['normalized_weight']))
    new_proteins = new_proteins+protein_indexes
    new_proteins_index = [int(graph.vs.find(name=x)['id']) for x in new_proteins]
    return new_proteins, new_proteins_index

In [190]:
reactome_proteins_indexes_apid_huri_df[['fp_proteins', 'fp_proteins_index']] = reactome_proteins_indexes_apid_huri_df.progress_apply(lambda row: fp_addition(row['proteins_ids'], sp_df, graph_apid_huri, process_rwr_whole.loc[row.name,:]), axis=1, result_type='expand')
disgenet_prot_index_conservative_module[['fp_proteins', 'fp_proteins_index']] = disgenet_prot_index_conservative_module.progress_apply(lambda row: fp_addition(row['proteins_ids'], sp_df, graph_apid_huri, disease_conservative_rwr_whole.loc[row.name,:]), axis=1, result_type='expand')
disgenet_prot_index_main_comp[['fp_proteins', 'fp_proteins_index']] = disgenet_prot_index_main_comp.progress_apply(lambda row: fp_addition(row['proteins_ids'], sp_df, graph_apid_huri, disease_rwr_whole.loc[row.name,:]), axis=1, result_type='expand')

  0%|          | 0/429 [00:00<?, ?it/s]

  0%|          | 0/301 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


  0%|          | 0/203 [00:00<?, ?it/s]

In [191]:
reactome_proteins_indexes_apid_huri_df.to_csv('../../data/processed/reactome_protein_indexes_fp.csv', index=False)
disgenet_prot_index_conservative_module.to_csv('../../data/processed/disgenet_protein_indexes_conservative_fp.csv', index=False)
disgenet_prot_index_main_comp.to_csv('../../data/processed/disgenet_protein_indexes_fp.csv', index=False)

In [4]:
reactome_proteins_indexes_apid_huri_df = pd.read_csv('../../data/processed/reactome_protein_indexes_fp.csv')
disgenet_prot_index_conservative_module = pd.read_csv('../../data/processed/disgenet_protein_indexes_conservative_fp.csv')
disgenet_prot_index_main_comp = pd.read_csv('../../data/processed/disgenet_protein_indexes_fp.csv')

In [5]:
reactome_proteins_indexes_apid_huri_df['fp_proteins'] = reactome_proteins_indexes_apid_huri_df['fp_proteins'].apply(literal_eval)
disgenet_prot_index_conservative_module['fp_proteins'] = disgenet_prot_index_conservative_module['fp_proteins'].apply(literal_eval)
disgenet_prot_index_main_comp['fp_proteins'] = disgenet_prot_index_main_comp['fp_proteins'].apply(literal_eval)

reactome_proteins_indexes_apid_huri_df['fp_proteins_index'] = reactome_proteins_indexes_apid_huri_df['fp_proteins_index'].apply(literal_eval)
disgenet_prot_index_conservative_module['fp_proteins_index'] = disgenet_prot_index_conservative_module['fp_proteins_index'].apply(literal_eval)
disgenet_prot_index_main_comp['fp_proteins_index'] = disgenet_prot_index_main_comp['fp_proteins_index'].apply(literal_eval)

In [6]:
reactome_proteins_indexes_apid_huri_df.drop(columns=['proteins_ids', 'protein_index'], inplace=True)
reactome_proteins_indexes_apid_huri_df = reactome_proteins_indexes_apid_huri_df[['process', 'fp_proteins', 'fp_proteins_index']]
reactome_proteins_indexes_apid_huri_df.columns = ['process', 'proteins_ids', 'protein_index']
reactome_proteins_indexes_apid_huri_df['n_proteins'] = reactome_proteins_indexes_apid_huri_df['proteins_ids'].apply(lambda row: len(row))

In [7]:
disgenet_prot_index_conservative_module.drop(columns=['proteins_ids', 'protein_index'], inplace=True)
disgenet_prot_index_conservative_module = disgenet_prot_index_conservative_module[['process', 'fp_proteins', 'fp_proteins_index']]
disgenet_prot_index_conservative_module.columns = ['process', 'proteins_ids', 'protein_index']
disgenet_prot_index_conservative_module['n_proteins'] = disgenet_prot_index_conservative_module['proteins_ids'].apply(lambda row: len(row))

disgenet_prot_index_main_comp.drop(columns=['proteins_ids', 'protein_index'], inplace=True)
disgenet_prot_index_main_comp = disgenet_prot_index_main_comp[['process', 'fp_proteins', 'fp_proteins_index']]
disgenet_prot_index_main_comp.columns = ['process', 'proteins_ids', 'protein_index']
disgenet_prot_index_main_comp['n_proteins'] = disgenet_prot_index_main_comp['proteins_ids'].apply(lambda row: len(row))

# 1. Target Process Metrics

## 1.6 Random Walks with Restart

In [167]:
importlib.reload(metrics_functions)
process_rwr = metrics_functions.random_walk_restart(graph_apid_huri, reactome_proteins_indexes_apid_huri_df)
process_rwr_df = pd.DataFrame.from_dict(process_rwr)
process_rwr_df = process_rwr_df.rename(index=dict(zip(list(process_rwr_df.index),list(graph_apid_huri.vs['name']))))
process_rwr_df.to_csv('../../data/processed/metrics/process_rwr_fp.csv')
print(process_rwr_df.shape)
process_rwr_df.head()

  0%|          | 0/429 [00:00<?, ?it/s]

(17204, 429)


Unnamed: 0,R-HSA-1031716,R-HSA-112379,R-HSA-112385,R-HSA-1168640,R-HSA-1214188,R-HSA-1234159,R-HSA-141409,R-HSA-141422,R-HSA-141431,R-HSA-141439,...,R-HSA-9670149,R-HSA-9698928,R-HSA-9710490,R-HSA-977224,R-HSA-983140,R-HSA-983147,R-HSA-983156,R-HSA-983157,R-HSA-983259,R-HSA-983266
A1BG,2.5e-05,2.1e-05,2.1e-05,0.000139,2.2e-05,0.000119,2.5e-05,2.4e-05,2.4e-05,2.4e-05,...,2.3e-05,2.3e-05,2.4e-05,2.2e-05,5.8e-05,6.5e-05,5.8e-05,5.8e-05,2.9e-05,2.9e-05
A1CF,3e-05,2.8e-05,2.8e-05,7.2e-05,2.4e-05,6.2e-05,2.8e-05,2.9e-05,3e-05,2.9e-05,...,2.5e-05,2.5e-05,2.5e-05,2.4e-05,3.2e-05,3.1e-05,3.1e-05,3.1e-05,3.2e-05,3.3e-05
A2M,0.000209,0.000247,0.00028,0.000185,0.000218,0.000197,0.000188,0.000184,0.000186,0.000185,...,0.000215,0.000179,0.000221,0.000219,0.000257,0.000273,0.000255,0.000261,0.000244,0.000243
A2ML1,2e-05,1.4e-05,1.3e-05,1.4e-05,1.1e-05,1.4e-05,1.3e-05,1.3e-05,1.3e-05,1.3e-05,...,1.2e-05,1.1e-05,1.2e-05,1.1e-05,1.9e-05,1.6e-05,1.4e-05,1.4e-05,1.3e-05,1.3e-05
A4GALT,5e-06,4e-06,4e-06,5e-06,3e-06,5e-06,1.1e-05,1.1e-05,1.1e-05,1.1e-05,...,3e-06,3e-06,3e-06,3e-06,8e-06,5e-06,4e-06,4e-06,5e-06,5e-06


In [168]:
importlib.reload(metrics_functions)
disease_rwr = metrics_functions.random_walk_restart(graph_apid_huri, disgenet_prot_index_main_comp)
disease_rwr_df = pd.DataFrame.from_dict(disease_rwr)
disease_rwr_df = disease_rwr_df.rename(index=dict(zip(list(disease_rwr_df.index),list(graph_apid_huri.vs['name']))))
disease_rwr_df.to_csv('../../data/processed/metrics/disease_rwr_fp.csv')
print(disease_rwr_df.shape)
disease_rwr_df.head()

  0%|          | 0/203 [00:00<?, ?it/s]

(17204, 203)


Unnamed: 0,C0001418,C0001973,C0002152,C0002395,C0002736,C0003873,C0004153,C0004238,C0004352,C0005684,...,C3714636,C3714758,C4277682,C4277690,C4279912,C4505456,C4552091,C4704862,C4707243,C4721507
A1BG,4.4e-05,2.8e-05,2.6e-05,2.6e-05,2.6e-05,2.8e-05,0.000181,2.6e-05,4.4e-05,3.2e-05,...,2.2e-05,5.2e-05,3.4e-05,3.2e-05,3.5e-05,7.6e-05,2.7e-05,2.7e-05,6.5e-05,2.5e-05
A1CF,3.4e-05,3.2e-05,3e-05,3.7e-05,3.4e-05,4.8e-05,3.5e-05,4.4e-05,3.1e-05,3.1e-05,...,2.9e-05,3.5e-05,3e-05,3.3e-05,3.1e-05,3.4e-05,3.5e-05,3.4e-05,3.4e-05,2.8e-05
A2M,0.000413,0.000798,0.000468,0.001707,0.000231,0.001167,0.000343,0.00022,0.000272,0.000384,...,0.000859,0.00022,0.000438,0.000192,0.000446,0.00033,0.000224,0.000218,0.000311,0.000873
A2ML1,4.6e-05,1.5e-05,1.8e-05,1.9e-05,1.8e-05,2.8e-05,1.4e-05,1.5e-05,1.8e-05,1.7e-05,...,2.9e-05,3.6e-05,3.5e-05,1.3e-05,3.5e-05,1.4e-05,3.2e-05,3.2e-05,1.3e-05,2e-05
A4GALT,5e-06,9e-06,8e-06,7e-06,9e-06,6e-06,6e-06,5e-06,6e-06,5e-06,...,9e-06,5e-06,8e-06,5e-06,8e-06,5e-06,5e-06,6e-06,5e-06,1e-05


In [169]:
importlib.reload(metrics_functions)
disease_rwr_conservative = metrics_functions.random_walk_restart(graph_apid_huri, disgenet_prot_index_conservative_module)
disease_rwr_conservative_df = pd.DataFrame.from_dict(disease_rwr_conservative)
disease_rwr_conservative_df = disease_rwr_conservative_df.rename(index=dict(zip(list(disease_rwr_conservative_df.index),list(graph_apid_huri.vs['name']))))
disease_rwr_conservative_df.to_csv('../../data/processed/metrics/disease_rwr_conservative_fp.csv')
print(disease_rwr_conservative_df.shape)
disease_rwr_conservative_df.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(17204, 301)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507,C4722327
A1BG,2.7e-05,2.7e-05,4.9e-05,4.9e-05,2.6e-05,2.6e-05,2.7e-05,2.6e-05,2.9e-05,2.9e-05,...,2.9e-05,3.1e-05,8.7e-05,2.4e-05,2.7e-05,2.3e-05,4.2e-05,2.6e-05,2.5e-05,2.5e-05
A1CF,6.1e-05,6.2e-05,3.1e-05,3.6e-05,3.2e-05,3e-05,5.1e-05,3.2e-05,4.4e-05,2.9e-05,...,2.9e-05,3e-05,3.6e-05,3.5e-05,6.8e-05,3.6e-05,3.5e-05,3.2e-05,2.7e-05,3.5e-05
A2M,0.000491,0.000493,0.000523,0.000301,0.000371,0.000459,0.002113,0.000233,0.000642,0.000496,...,0.000435,0.000454,0.000357,0.000207,0.000492,0.000207,0.000265,0.000383,0.001062,0.000251
A2ML1,4e-05,4e-05,5e-05,1.6e-05,1.6e-05,1.6e-05,1.8e-05,1.7e-05,2.2e-05,2.3e-05,...,1.6e-05,1.6e-05,1.4e-05,2.6e-05,4.1e-05,2.6e-05,1.3e-05,1.6e-05,2e-05,3.4e-05
A4GALT,5e-06,5e-06,5e-06,5e-06,1e-05,6e-06,6e-06,5e-06,5e-06,6e-06,...,7e-06,7e-06,5e-06,5e-06,5e-06,5e-06,5e-06,6e-06,7e-06,5e-06
