In [1]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm

In [2]:
# Load the graph from its GML file; later cells read the vertex 'name'
# attribute of this object to label rows with protein identifiers.
graph = Graph.Read_GML("../../data/processed/graph_apid_huri")

In [4]:
# Index tables for the Reactome and DisGeNET label sets; the first CSV row
# supplies the column names (header=0). Their 'process' column is used below
# to name the columns of the label matrices.
reactome_proteins_indexes_df = pd.read_csv(
    "../../data/processed/reactome_proteins_indexes_apid_huri.csv",
    sep=',',
    header=0,
)
disgenet_proteins_indexes_df = pd.read_csv(
    "../../data/processed/disgenet_prot_index_main_comp.csv",
    sep=',',
    header=0,
)

# Keep only the rows whose 'increase' value is strictly below 0.4.
# NOTE(review): the meaning of this threshold is not documented in the
# notebook — confirm and consider promoting it to a named constant.
disgenet_prot_index_main_comp = disgenet_proteins_indexes_df.loc[
    disgenet_proteins_indexes_df['increase'] < 0.4
]

In [5]:
def _load_label_matrix(csv_path, process_names, protein_ids):
    """Read a label CSV and index its rows by protein identifier.

    Parameters
    ----------
    csv_path : str
        Comma-separated file to read. Because column names are supplied via
        ``names`` and no ``header`` is given, the first line of the file is
        read as data, not as a header.
    process_names : sequence
        Column names to assign, one per column of the file.
    protein_ids : sequence
        One identifier per row of the file, in row order; becomes the
        ``protein_id`` index.

    Returns
    -------
    pandas.DataFrame
        The label matrix indexed by ``protein_id``.
    """
    # Built as one expression so no frame is mutated in place; re-running the
    # cell always yields the same result.
    return (
        pd.read_csv(csv_path, sep=',', names=process_names)
        .assign(protein_id=protein_ids)
        .set_index('protein_id')
    )


# Reactome labels: one column per Reactome process.
reactome_labels_df = _load_label_matrix(
    "../../data/processed/reactome_labels_apid_huri.csv",
    reactome_proteins_indexes_df['process'].values,
    graph.vs['name'],
)

# DisGeNET labels restricted to the processes kept by the 'increase' filter.
disgenet_labels_df = _load_label_matrix(
    "../../data/processed/disgenet_filtered_labels_apid_huri.csv",
    disgenet_prot_index_main_comp['process'].values,
    graph.vs['name'],
)

# DisGeNET "conservative" labels: columns named after the unfiltered index table.
disgenet_labels_conservative_df = _load_label_matrix(
    '../../data/processed/disgenet_conservative_labels_apid_huri.csv',
    disgenet_proteins_indexes_df['process'].values,
    graph.vs['name'],
)

In [6]:
# Distance matrix returned by the graph, converted to a NumPy array; it is
# passed as `sp` to genePANDA, which indexes it by vertex position on both axes.
# NOTE(review): newer python-igraph releases deprecate shortest_paths() in
# favour of distances() — confirm the installed version before switching.
sp = np.array(graph.shortest_paths())

In [7]:
def genePANDA(graph, labels, sp):
    """Score every protein against every label column by distance-based ranking.

    Distances are first normalised by the geometric mean of the two endpoints'
    average distances. For each column of ``labels`` (a "module"), a protein's
    weight is its mean normalised distance to all proteins minus its mean
    normalised distance to the module members. Proteins are ranked by
    decreasing weight and the score of a protein is the fraction of module
    members among the proteins ranked at or above it.

    Parameters
    ----------
    graph : object
        Only ``graph.vs['name']`` is read; it supplies the protein identifiers
        and their order.
    labels : pandas.DataFrame
        One row per protein, in the same order as ``graph.vs['name']``, and one
        column per module; a value of 1 marks membership. The frame is NOT
        modified (its index is ignored, rows are matched by position).
    sp : array-like, shape (n_proteins, n_proteins)
        Pairwise distance matrix in the same protein order.
        NOTE(review): entries are assumed finite (connected graph) — confirm.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``graph.vs['name']``, one column per module, holding the
        scores described above. A module without any member gets 0 everywhere.

    Raises
    ------
    ValueError
        If ``labels`` does not have exactly one row per protein.
    """
    proteins = graph.vs['name']
    n_proteins = len(proteins)
    if len(labels) != n_proteins:
        raise ValueError(
            "labels has %d rows but the graph has %d proteins"
            % (len(labels), n_proteins)
        )

    sp = np.asarray(sp, dtype=float)
    average_distance = np.sum(sp, axis=1) / n_proteins
    # Geometric mean of the two endpoints' average distances, for every pair.
    average_distance_sqrt = np.sqrt(average_distance[:, None] * average_distance[None, :])
    raw_distance = np.divide(sp, average_distance_sqrt)

    # Loop-invariant pieces: mean normalised distance to all proteins, and the
    # 1-based rank positions used as the denominator of the score.
    background = np.sum(raw_distance, axis=1) / n_proteins
    ranks = np.arange(1, n_proteins + 1)

    genePANDA_proba = {}
    for module_name in tqdm(labels.columns):
        label_values = labels[module_name].values
        # Positional indices of the members, independent of the frame's index,
        # so the caller's DataFrame never has to be re-indexed.
        module = np.flatnonzero(label_values == 1)
        if module.size == 0:
            # No members: every cumulative hit count is 0, so every score is 0.
            # Short-circuit to avoid a 0/0 division and NaN weights.
            genePANDA_proba[module_name] = np.zeros(n_proteins)
            continue

        weights = background - np.sum(raw_distance[:, module], axis=1) / module.size
        ranking = pd.DataFrame({'label': label_values, 'weight': weights})
        ranking = ranking.sort_values(by='weight', ascending=False)

        # Hits among the top-k divided by k, for k = 1..n in ranked order.
        precision = ranking['label'].cumsum().values / ranks

        # Scatter the scores back to the original protein order.
        probability = np.empty(n_proteins, dtype=float)
        probability[ranking.index.values] = precision
        genePANDA_proba[module_name] = probability

    # Passing the index up front also works when labels has no columns.
    return pd.DataFrame(genePANDA_proba, index=proteins)

In [8]:
# Score every protein against each Reactome label column, then persist the
# resulting table (rows: proteins, columns: label columns) as CSV.
process_genePANDA = genePANDA(graph, reactome_labels_df, sp)
process_genePANDA.to_csv('../../data/processed/metrics/process_genepanda.csv')

  0%|          | 0/429 [00:00<?, ?it/s]

In [12]:
# Same scoring for the filtered DisGeNET label columns; result written to CSV.
disease_genePANDA = genePANDA(graph, disgenet_labels_df, sp)
disease_genePANDA.to_csv('../../data/processed/metrics/disease_genepanda.csv')

  0%|          | 0/203 [00:00<?, ?it/s]

In [11]:
# Same scoring for the "conservative" DisGeNET label columns; result written to CSV.
disease_conservative_genePANDA = genePANDA(graph, disgenet_labels_conservative_df, sp)
disease_conservative_genePANDA.to_csv('../../data/processed/metrics/disease_genepanda_conservative.csv')

  0%|          | 0/301 [00:00<?, ?it/s]