In [1]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm
from scipy.stats import hypergeom

In [2]:
# Load the graph stored in GML format. Its vertex "name" attribute is used
# further down as the row identifier (protein_id) of the label and result tables.
graph = Graph.Read_GML("../../data/processed/graph_apid_huri")

In [3]:
# Load the adjacency matrix saved as a NumPy .npy file; it is passed as the
# `adj` argument to every MaxLink call in this notebook.
# NOTE(review): presumably its rows/columns follow the vertex order of `graph` — confirm.
adj_matrix_apid_huri = np.load("../../data/processed/adjacency_matrix_apid_huri.npy")

In [4]:
# Index tables: their 'process' column supplies the column names of the
# label matrices read in the next cell.
reactome_proteins_indexes_df = pd.read_csv(
    "../../data/processed/reactome_proteins_indexes_apid_huri.csv",
    sep=",",
    header=0,
)
disgenet_proteins_indexes_df = pd.read_csv(
    "../../data/processed/disgenet_prot_index_main_comp.csv",
    sep=",",
    header=0,
)
# Keep only the rows whose 'increase' value is strictly below 0.4.
disgenet_prot_index_main_comp = disgenet_proteins_indexes_df.loc[
    lambda table: table["increase"] < 0.4
]

In [5]:
# Each label matrix is read without a header row (column names come from the
# matching index table) and is then indexed by the graph's vertex names.
reactome_labels_df = (
    pd.read_csv(
        "../../data/processed/reactome_labels_apid_huri.csv",
        sep=",",
        names=reactome_proteins_indexes_df["process"].values,
    )
    .assign(protein_id=graph.vs["name"])
    .set_index("protein_id")
)

# Columns restricted to the entries kept by the 'increase' < 0.4 filter.
disgenet_labels_df = (
    pd.read_csv(
        "../../data/processed/disgenet_filtered_labels_apid_huri.csv",
        sep=",",
        names=disgenet_prot_index_main_comp["process"].values,
    )
    .assign(protein_id=graph.vs["name"])
    .set_index("protein_id")
)

# Columns taken from the unfiltered DisGeNET index table.
disgenet_labels_conservative_df = (
    pd.read_csv(
        "../../data/processed/disgenet_conservative_labels_apid_huri.csv",
        names=disgenet_proteins_indexes_df["process"].values,
    )
    .assign(protein_id=graph.vs["name"])
    .set_index("protein_id")
)

In [6]:
def MaxLink(labels, adj):
    """Score every protein against every module by its number of links into the module.

    For each column of ``labels`` the rows whose value equals 1 form a module.
    A protein's score for that module is the sum of its ``adj`` row over the
    module's columns (its number of neighbours inside the module). The score
    is replaced by 0 when the hypergeometric probability mass of observing
    exactly that count is >= 0.5, given the protein's total degree, the
    module size and ``adj.shape[0]`` as population size.

    NOTE(review): the filter uses the point probability (``pmf``), not a tail
    probability — confirm this is the intended significance criterion.

    Parameters
    ----------
    labels : pandas.DataFrame
        One row per protein and one column per module; a value of 1 marks
        membership. Row *positions* are used as column positions of ``adj``,
        so rows must be ordered like ``adj``. The frame is not modified.
    adj : numpy.ndarray
        2-D adjacency matrix (presumably square and symmetric — verify
        against the code that produced it).

    Returns
    -------
    pandas.DataFrame
        One column per module (same names and order as ``labels.columns``)
        and one row per row of ``adj``, with a default integer index.
    """
    # Total degree does not depend on the module, so compute it once.
    protein_degree = np.sum(adj, axis=1)
    population_size = adj.shape[0]

    maxlink = {}
    for module_name in tqdm(labels.columns):
        # Positional indices of the module members, read from the raw values
        # so the caller's index is left untouched.
        module = np.flatnonzero(labels[module_name].values == 1)
        protein_degree_module = np.sum(adj[:, module], axis=1)
        # hypergeom.pmf broadcasts over arrays: one call scores all proteins.
        connectivity = hypergeom.pmf(
            protein_degree_module, population_size, len(module), protein_degree
        )
        maxlink[module_name] = np.where(connectivity >= 0.5, 0, protein_degree_module)
    return pd.DataFrame.from_dict(maxlink)



In [7]:
# MaxLink scores for the Reactome label matrix, relabelled with the graph's
# vertex names and written to disk.
process_maxlink = MaxLink(reactome_labels_df, adj_matrix_apid_huri).set_axis(
    graph.vs["name"], axis=0
)
process_maxlink.to_csv("../../data/processed/metrics/process_maxlink.csv")

  0%|          | 0/429 [00:00<?, ?it/s]

In [34]:
# MaxLink scores for the filtered DisGeNET label matrix, relabelled with the
# graph's vertex names and written to disk.
disease_maxlink = MaxLink(disgenet_labels_df, adj_matrix_apid_huri).set_axis(
    graph.vs["name"], axis=0
)
disease_maxlink.to_csv("../../data/processed/metrics/disease_maxlink.csv")

  0%|          | 0/203 [00:00<?, ?it/s]

In [8]:
# MaxLink scores for the conservative DisGeNET label matrix, relabelled with
# the graph's vertex names and written to disk.
disease_conservative_maxlink = MaxLink(
    disgenet_labels_conservative_df, adj_matrix_apid_huri
).set_axis(graph.vs["name"], axis=0)
disease_conservative_maxlink.to_csv(
    "../../data/processed/metrics/disease_conservative_maxlink.csv"
)

  0%|          | 0/301 [00:00<?, ?it/s]