In [1]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm
from scipy.stats import hypergeom
from metrics_functions import MaxLink

In [25]:
# Load the graph stored in GML format; its vertex "name" attribute is used
# further down to label the rows of the label tables and MaxLink results.
graph = Graph.Read_GML(
    "../../data/processed/graph_apid_huri"
)

In [23]:
# Array saved as .npy; handed unchanged to every MaxLink call in this notebook.
adj_matrix_apid_huri = np.load(
    file="../../data/processed/adjacency_matrix_apid_huri.npy"
)

In [4]:
# Index tables for Reactome and DisGeNET. Both files are comma-separated with
# the column names in their first row, which is what read_csv assumes by default.
reactome_proteins_indexes_df = pd.read_csv(
    "../../data/processed/reactome_proteins_indexes_apid_huri.csv"
)
disgenet_proteins_indexes_df = pd.read_csv(
    "../../data/processed/disgenet_prot_index_main_comp.csv"
)

# Filtered view of the DisGeNET table: keep rows whose 'increase' value is
# strictly below 0.4 (NOTE(review): the meaning of this cutoff is not
# documented here -- confirm and consider naming it as a constant).
disgenet_prot_index_main_comp = disgenet_proteins_indexes_df.loc[
    disgenet_proteins_indexes_df['increase'].lt(0.4)
]

In [5]:
# The label files are read without a header row: column names are supplied
# from the 'process' column of the matching index table, and each row is
# labelled with the vertex name at the same position in `graph`.
# NOTE(review): this relies on the CSV row order matching graph.vs order.

# Reactome labels, one column per entry of reactome_proteins_indexes_df.
reactome_labels_df = (
    pd.read_csv(
        "../../data/processed/reactome_labels_apid_huri.csv",
        names=reactome_proteins_indexes_df['process'].values,
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

# DisGeNET labels, columns taken from the filtered index table.
disgenet_labels_df = (
    pd.read_csv(
        "../../data/processed/disgenet_filtered_labels_apid_huri.csv",
        names=disgenet_prot_index_main_comp['process'].values,
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

# DisGeNET "conservative" labels, columns taken from the unfiltered index table.
disgenet_labels_conservative_df = (
    pd.read_csv(
        '../../data/processed/disgenet_conservative_labels_apid_huri.csv',
        names=disgenet_proteins_indexes_df['process'].values,
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

In [7]:
# Run MaxLink on the Reactome label table, label the result rows with the
# graph's vertex names and write it to disk.
# (Assumes MaxLink returns a pandas object -- it is given an index and
# written with to_csv.)
process_maxlink = MaxLink(reactome_labels_df, adj_matrix_apid_huri).set_axis(
    graph.vs['name'], axis=0
)
process_maxlink.to_csv('../../data/processed/metrics/process_maxlink.csv')

  0%|          | 0/429 [00:00<?, ?it/s]

In [34]:
# Run MaxLink on the filtered DisGeNET label table, label the result rows
# with the graph's vertex names and write it to disk.
# (Assumes MaxLink returns a pandas object -- it is given an index and
# written with to_csv.)
disease_maxlink = MaxLink(disgenet_labels_df, adj_matrix_apid_huri).set_axis(
    graph.vs['name'], axis=0
)
disease_maxlink.to_csv('../../data/processed/metrics/disease_maxlink.csv')

  0%|          | 0/203 [00:00<?, ?it/s]

In [8]:
# Run MaxLink on the conservative DisGeNET label table, label the result rows
# with the graph's vertex names and write it to disk.
# (Assumes MaxLink returns a pandas object -- it is given an index and
# written with to_csv.)
disease_conservative_maxlink = MaxLink(
    disgenet_labels_conservative_df, adj_matrix_apid_huri
).set_axis(graph.vs['name'], axis=0)
disease_conservative_maxlink.to_csv(
    '../../data/processed/metrics/disease_conservative_maxlink.csv'
)

  0%|          | 0/301 [00:00<?, ?it/s]

# False Positives

In [15]:
# Load the three false-positive index tables (Reactome, DisGeNET conservative,
# DisGeNET) in one pass; each target name lines up with the path at the same
# position in the tuple below.
(
    reactome_proteins_indexes_fp,
    disgenet_proteins_indexes_conservative_fp,
    disgenet_proteins_indexes_sca_fp,
) = map(
    pd.read_csv,
    (
        '../../data/processed/reactome_protein_indexes_fp.csv',
        '../../data/processed/disgenet_protein_indexes_conservative_fp.csv',
        '../../data/processed/disgenet_protein_indexes_fp.csv',
    ),
)

In [16]:
# The 'fp_proteins' and 'fp_proteins_index' columns come back from CSV as
# strings; parse each cell back into a Python literal with ast.literal_eval.
# Statements are grouped per table; the six conversions are independent.

# Reactome
reactome_proteins_indexes_fp['fp_proteins'] = reactome_proteins_indexes_fp['fp_proteins'].map(literal_eval)
reactome_proteins_indexes_fp['fp_proteins_index'] = reactome_proteins_indexes_fp['fp_proteins_index'].map(literal_eval)

# DisGeNET (conservative)
disgenet_proteins_indexes_conservative_fp['fp_proteins'] = disgenet_proteins_indexes_conservative_fp['fp_proteins'].map(literal_eval)
disgenet_proteins_indexes_conservative_fp['fp_proteins_index'] = disgenet_proteins_indexes_conservative_fp['fp_proteins_index'].map(literal_eval)

# DisGeNET
disgenet_proteins_indexes_sca_fp['fp_proteins'] = disgenet_proteins_indexes_sca_fp['fp_proteins'].map(literal_eval)
disgenet_proteins_indexes_sca_fp['fp_proteins_index'] = disgenet_proteins_indexes_sca_fp['fp_proteins_index'].map(literal_eval)

In [17]:
# Replace the original protein columns with their false-positive
# counterparts so the table has the same layout as the regular index tables:
#   fp_proteins       -> proteins_ids
#   fp_proteins_index -> protein_index
# and add n_proteins = number of entries in each proteins_ids cell.
#
# Built as one non-mutating chain: selecting the three columns already
# discards the old 'proteins_ids' / 'protein_index', so no in-place drop is
# needed, and the new column is added via assign rather than by writing into
# a column-subset frame. If the cell is re-run after the rename, it fails
# with a KeyError on 'fp_proteins' *before* touching the frame, instead of
# first dropping the renamed columns.
reactome_proteins_indexes_fp = (
    reactome_proteins_indexes_fp
    .loc[:, ['process', 'fp_proteins', 'fp_proteins_index']]
    .rename(columns={'fp_proteins': 'proteins_ids', 'fp_proteins_index': 'protein_index'})
    .assign(n_proteins=lambda fp: fp['proteins_ids'].map(len))
)

In [18]:
# Give both DisGeNET false-positive tables the same layout as the regular
# index tables:
#   fp_proteins       -> proteins_ids
#   fp_proteins_index -> protein_index
# plus n_proteins = number of entries in each proteins_ids cell.
#
# Each table is rebuilt with one non-mutating chain: selecting the three
# columns already discards the old 'proteins_ids' / 'protein_index', so no
# in-place drop is needed, and the new column is added via assign rather than
# by writing into a column-subset frame. If the cell is re-run after the
# rename, it fails with a KeyError on 'fp_proteins' *before* touching the
# frames, instead of first dropping the renamed columns.

# DisGeNET (conservative)
disgenet_proteins_indexes_conservative_fp = (
    disgenet_proteins_indexes_conservative_fp
    .loc[:, ['process', 'fp_proteins', 'fp_proteins_index']]
    .rename(columns={'fp_proteins': 'proteins_ids', 'fp_proteins_index': 'protein_index'})
    .assign(n_proteins=lambda fp: fp['proteins_ids'].map(len))
)

# DisGeNET
disgenet_proteins_indexes_sca_fp = (
    disgenet_proteins_indexes_sca_fp
    .loc[:, ['process', 'fp_proteins', 'fp_proteins_index']]
    .rename(columns={'fp_proteins': 'proteins_ids', 'fp_proteins_index': 'protein_index'})
    .assign(n_proteins=lambda fp: fp['proteins_ids'].map(len))
)

In [20]:
# False-positive label tables. The files are read without a header row; the
# column names come from the 'process' column of the matching false-positive
# index table. (Comma is read_csv's default separator.)
reactome_labels_fp_df = pd.read_csv(
    "../../data/processed/reactome_labels_fp.csv",
    names=reactome_proteins_indexes_fp['process'].values,
)
disgenet_labels_fp_df = pd.read_csv(
    "../../data/processed/disgenet_labels_fp.csv",
    names=disgenet_proteins_indexes_sca_fp['process'].values,
)
disgenet_conservative_labels_fp_df = pd.read_csv(
    "../../data/processed/disgenet_conservative_labels_fp.csv",
    names=disgenet_proteins_indexes_conservative_fp['process'].values,
)

In [26]:
# Run MaxLink on the Reactome false-positive label table, label the result
# rows with the graph's vertex names and write it to disk.
# (Assumes MaxLink returns a pandas object -- it is given an index and
# written with to_csv.)
process_maxlink_fp = MaxLink(reactome_labels_fp_df, adj_matrix_apid_huri).set_axis(
    graph.vs['name'], axis=0
)
process_maxlink_fp.to_csv('../../data/processed/metrics/process_maxlink_fp.csv')

  0%|          | 0/429 [00:00<?, ?it/s]

In [27]:
# Run MaxLink on the DisGeNET false-positive label table, label the result
# rows with the graph's vertex names and write it to disk.
# (Assumes MaxLink returns a pandas object -- it is given an index and
# written with to_csv.)
disease_maxlink_fp = MaxLink(disgenet_labels_fp_df, adj_matrix_apid_huri).set_axis(
    graph.vs['name'], axis=0
)
disease_maxlink_fp.to_csv('../../data/processed/metrics/disease_maxlink_fp.csv')

  0%|          | 0/203 [00:00<?, ?it/s]

In [28]:
# Run MaxLink on the conservative DisGeNET false-positive label table, label
# the result rows with the graph's vertex names and write it to disk.
# (Assumes MaxLink returns a pandas object -- it is given an index and
# written with to_csv.)
disease_conservative_maxlink_fp = MaxLink(
    disgenet_conservative_labels_fp_df, adj_matrix_apid_huri
).set_axis(graph.vs['name'], axis=0)
disease_conservative_maxlink_fp.to_csv(
    '../../data/processed/metrics/disease_conservative_maxlink_fp.csv'
)

  0%|          | 0/301 [00:00<?, ?it/s]