# Setup

La idea de este notebook es tratar de hacer una evaluación nodo a nodo del modelo, en constraste a la evaluación "global" que hace el AUC y BCE durante el entrenamiento, que es sobre una muestra de enlaces.

In [1]:
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import torch
from sklearn.metrics import (average_precision_score, precision_score,
                             recall_score, roc_auc_score)
from torch_geometric import seed_everything
from tqdm import tqdm

sys.path.append("../../..")
from src.models import final_model, prediction_utils, training_utils

data_folder = "../../../data/processed/graph_data_nohubs/merged_types/split_dataset/"
feature_folder = "../../../data/processed/feature_data/"
reports_folder = "../../../reports/explore_predictions/"
models_folder = "../../../models/final_model/"

seed = 4
seed_everything(4)

Load data and model weights

In [2]:
#load data
dataset, node_map = training_utils.load_data(data_folder+f"seed_{seed}/",load_test=True)
test_data = dataset[2]
node_df = pd.read_csv(data_folder+f"seed_{seed}/tensor_df.csv",index_col=0).set_index("node_index",drop=True)

with open(f"{models_folder}training_parameters.pickle", 'rb') as handle:
    params = pickle.load(handle)

#initialize features in test data
test_data  = training_utils.initialize_features(test_data,params["feature_type"],params["feature_dim"],feature_folder)

#load model
weights_path = models_folder+f"seeds/final_model_{seed}.pth"
weights = torch.load(weights_path)
model = final_model.Model(test_data.metadata(),[("gene_protein","gda","disease")])
model.load_state_dict(weights)

<All keys matched successfully>

In [3]:
encodings_dict = training_utils.get_encodings(model,test_data)
mapped_dataset = prediction_utils.MappedDataset(test_data,node_map,("gene_protein","gda","disease"))
mapped_df = mapped_dataset.dataframe
mapped_df = mapped_df.rename(columns={x: x.replace("_source","").replace("_target","") for x in mapped_df.columns})

predictor = prediction_utils.Predictor(node_df,encodings_dict)

diseases = node_df[node_df.node_type == "disease"].index.values
genes = node_df[node_df.node_type == "gene_protein"].index.values

Por ahí esto es más correcto hacerlo como "otras asociaciones" como hice en neighborhood eval, porque no estoy removiendo de las listas otras asoc positivas que el modelo no vio

In [4]:
seen_edges = mapped_df[(mapped_df.edge_type == "message_passing")][["gene_protein","disease"]]
new_edges = mapped_df[(mapped_df.edge_type == "supervision") & (mapped_df.label == 1) ][["gene_protein","disease"]]

In [5]:
disgenet_edges = mapped_df[(mapped_df.edge_type == "message_passing") | (mapped_df.label == 1)]
mapped_dd = prediction_utils.MappedDataset(test_data,node_map,("disease","disease_disease","disease"))
mapped_dd_df = mapped_dd.dataframe

In [6]:
mapped_df.loc[(mapped_df.disease == 28435)&(mapped_df.label == 1),"gene_protein"].values

array([], dtype=int64)

# Modo 1

Tomo todos los genes no asociados como etiquetas negativas, en lugar de solo usar las etiquetas negativas que había generado antes.

In [7]:
num_positive_labels = []
first_positive_rank = []
ap_at_50 = []
ap_at_10 = []
ap_at_100 = []

recall_at_50 = []
recall_at_10 = []
recall_at_100 = []

total_auc = []
total_ap = []

auc_50 = []
auc_100 = []
auc_500 = []
auc_700 = []
auc_1000 = []
auc_1500 = []

all_results = []
k_vals = [10,50,100]

for disease in tqdm(diseases):
    positive_edges = new_edges.loc[(new_edges.disease == disease),"gene_protein"].values
    num_positive_labels.append(len(positive_edges))
    if len(positive_edges) > 0:
        if disease in seen_edges.disease.values:
            to_remove = seen_edges[seen_edges.disease == disease].gene_protein.values
            targets = node_df[node_df.node_type == "gene_protein"].drop(to_remove).index.values
            ranked_index, ranked_scores = predictor.prioritize_one_vs_all(disease,targets)
        else:
            ranked_index, ranked_scores = predictor.prioritize_one_vs_all(disease)

        label_array = np.in1d(ranked_index, positive_edges)
        first_positive_rank.append(np.where(label_array)[0][0])
        ap_at_50.append(average_precision_score(label_array[:50],ranked_scores[:50]))
        ap_at_10.append(average_precision_score(label_array[:10],ranked_scores[:10]))
        ap_at_100.append(average_precision_score(label_array[:100],ranked_scores[:100]))
        
        recall_at_10.append(recall_score(label_array[:10],ranked_scores.round()[:10]))
        recall_at_50.append(recall_score(label_array[:50],ranked_scores.round()[:50]))
        recall_at_100.append(recall_score(label_array[:100],ranked_scores.round()[:100]))

        total_auc.append(roc_auc_score(label_array,ranked_scores))
        total_ap.append(average_precision_score(label_array,ranked_scores.round()))

        auc_50.append(roc_auc_score(label_array[:50],ranked_scores[:50]))
        auc_100.append(roc_auc_score(label_array[:100],ranked_scores[:100]))
        auc_500.append(roc_auc_score(label_array[:500],ranked_scores[:500]))
        auc_700.append(roc_auc_score(label_array[:700],ranked_scores[:700]))
        auc_1000.append(roc_auc_score(label_array[:1000],ranked_scores[:1000]))
        auc_1500.append(roc_auc_score(label_array[:1500],ranked_scores[:1500]))

    else:
        first_positive_rank.append(-1)
        ap_at_50.append(-1)
        ap_at_10.append(-1)
        ap_at_100.append(-1)
        recall_at_10.append(-1)
        recall_at_50.append(-1)
        recall_at_100.append(-1)
        total_auc.append(-1)
        total_ap.append(-1)

disease_total_ap_scores = pd.DataFrame(np.array([num_positive_labels,first_positive_rank,ap_at_10,ap_at_50,ap_at_100,recall_at_10,recall_at_50,recall_at_100,total_auc,total_ap,auc_50,auc_100,auc_500,auc_700,auc_1000,auc_1500]).T ,index=diseases,columns=["num_labels","first_positive_rank","ap_at_10","ap_at_50","ap_at_100","recall_at_10","recall_at_50","recall_at_100","total_auc","total_ap","auc_50","auc_100","auc_500","auc_700","auc_1000","auc_1500"])
disease_total_ap_scores = disease_total_ap_scores.merge(node_df[["node_name","degree_gda","degree_dd"]], left_index=True,right_index=True)
disease_total_ap_scores

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Unnamed: 0,num_labels,first_positive_rank,ap_at_10,ap_at_50,ap_at_100,recall_at_10,recall_at_50,recall_at_100,total_auc,total_ap,node_name,degree_gda,degree_dd
19599,10.0,142.0,-0.000000,-0.000000,-0.000000,0.0,0.0,0.0,0.624332,0.000589,Hepatomegaly,54.0,0.0
20754,82.0,1.0,0.416667,0.340278,0.204552,1.0,1.0,1.0,0.526661,0.005425,Schizophrenia,883.0,1.0
24165,56.0,0.0,1.000000,0.927128,0.880945,1.0,1.0,1.0,0.810591,0.003882,Malignant neoplasm of prostate,616.0,6.0
18773,104.0,0.0,1.000000,0.941947,0.863880,1.0,1.0,1.0,0.585277,0.006688,Malignant neoplasm of breast,1074.0,7.0
18732,13.0,2.0,0.333333,0.333333,0.333333,1.0,1.0,1.0,0.806252,0.001065,Bladder Neoplasm,140.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30820,0.0,-1.0,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,"AMYLOID CARDIOMYOPATHY, TRANSTHYRETIN-RELATED",0.0,1.0
30959,0.0,-1.0,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,"BERNARD-SOULIER SYNDROME, TYPE A1",0.0,1.0
33955,0.0,-1.0,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,Triple-Negative Breast Carcinoma,0.0,1.0
32968,0.0,-1.0,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.000000,DOCK8 Deficiency,0.0,1.0


Cuantas enfermedades me quedaron con etiquetas positivas en test para evaluar?

In [8]:
pd.Series(disease_total_ap_scores.num_labels != 0).value_counts()

False    13120
True      2959
Name: num_labels, dtype: int64

In [11]:
disease_total_ap_scores[disease_total_ap_scores.num_labels != 0].describe().round(3)[["ap_at_10","ap_at_100","recall_at_10","recall_at_100","total_auc","total_ap"]].drop("count")

Unnamed: 0,ap_at_10,ap_at_100,recall_at_10,recall_at_100,total_auc,total_ap
mean,0.102,0.091,0.208,0.465,0.831,0.002
std,0.254,0.205,0.406,0.499,0.248,0.003
min,-0.0,-0.0,0.0,0.0,0.006,0.0
25%,-0.0,-0.0,0.0,0.0,0.735,0.0
50%,0.0,0.0,0.0,0.0,0.974,0.001
75%,-0.0,0.062,0.0,1.0,0.996,0.002
max,1.0,1.0,1.0,1.0,1.0,0.034


## Analizando las "one shot en test"

Es posible que algunas de estas sean "one shots" en el sentido de que el único enlace que tenían fue a parar a test. En otras palabras, el modelo no vió nunca un enlace de esta enfermedad y ahora le estamos pidiendo que haga predicciones. En estos casos habría que tener esa consideración ...

Dentro de estas hay como "dos niveles" de one shotting, el peor escenario es que esa enfermedad tenía grado 1, y ese único enlace el modelo nunca lo vió, osea que el nodo quedó desconectado del grafo en todo momento, nunca tuvo propagación! (ni en train ni val ni test, nunca!). La otra es que el modelo tenía un solo enlace GDA y quedó en test, pero tenía enlaces dd que pudo usar para propagar. 

Este último caso es lo que llaman "leave one out" en la bibliografía!

Hay que dividirlo en bins según grado total, o según grado msg passing por ahi

Cuantas son "one shots"

In [12]:
gda_prop_df = pd.DataFrame(mapped_df[mapped_df.edge_type == "message_passing"].disease.value_counts()).rename(columns={"disease":"gda_propagation_degree"})
gda_prop_df = pd.merge(gda_prop_df, disease_total_ap_scores, left_index=True,right_index=True,how="right").fillna(0)
oneshots_index = gda_prop_df[(gda_prop_df.num_labels > 0) & (gda_prop_df.gda_propagation_degree == 0)].index.values
rest_index = gda_prop_df[(gda_prop_df.num_labels > 0) & (gda_prop_df.gda_propagation_degree != 0)].index.values

In [13]:
oneshot_df = disease_total_ap_scores.loc[oneshots_index]
print(f"{len(oneshot_df)} enfermedades son \'one shot\' en test. Esto es {round((len(oneshot_df)*100)/2959)}% de los nodos en test")
oneshot_df = oneshot_df.sort_values(by="first_positive_rank")
oneshot_df["reciprocal_rank"] = 1/(oneshot_df["first_positive_rank"] +1)
oneshot_df[:20]

699 enfermedades son 'one shot' en test. Esto es 24% de los nodos en test


Unnamed: 0,num_labels,first_positive_rank,ap_at_10,ap_at_50,ap_at_100,recall_at_10,recall_at_50,recall_at_100,total_auc,total_ap,node_name,degree_gda,degree_dd,reciprocal_rank
31669,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000571,Oculocutaneous albinism type 6,1.0,1.0,1.0
31316,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000703,Atrial Septal Defect with Atrioventricular Con...,1.0,1.0,1.0
29347,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000167,"BRACHYDACTYLY, TYPE B2 (disorder)",1.0,1.0,1.0
30777,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000321,"LIPODYSTROPHY, FAMILIAL PARTIAL, TYPE 4",1.0,1.0,1.0
31523,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000977,PEROXISOME BIOGENESIS DISORDER 12A (ZELLWEGER),1.0,1.0,1.0
25941,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000297,Pituitary Dwarfism Type 3,1.0,1.0,1.0
33935,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000659,"MUSCULAR DYSTROPHY, LIMB-GIRDLE, AUTOSOMAL DOM...",1.0,1.0,1.0
31126,1.0,1.0,0.5,0.5,0.5,1.0,1.0,1.0,0.999944,0.000527,"EPILEPSY, FAMILIAL TEMPORAL LOBE, 5",1.0,1.0,0.5
23830,1.0,1.0,0.5,0.5,0.5,1.0,1.0,1.0,0.999944,0.001541,Hypoparathyroidism - X-linked,1.0,1.0,0.5
33128,1.0,1.0,0.5,0.5,0.5,1.0,1.0,1.0,0.999944,0.000207,JOUBERT SYNDROME 27,1.0,1.0,0.5


Osea, el MRR daría bajo pero.. que a una enfermedad que no vio nunca, le ponga su único enlace en el puesto 10 ENTRE 16000 en realidad está muy bien!! (para mi . . .) como muestro esto??

Creo que había una métrica donde podía ir sumando las cosas que aparecieron. La cuestión es que.. una métrica que me da 0.5 cuando puse en el puesto 2 a un nodo entre 16mil........................... NOSE SI ES MUY REPRESENTATIVA 

Acá estoy queriendo ver si está correlacionada la buena performance con el grado disease-disease, no me salió

In [77]:
fig_df = oneshot_df.copy()
fig_df["has_dd_edges"] = fig_df.degree_dd > 0
fig = px.violin(fig_df, x="has_dd_edges", y="first_positive_rank")
fig.show()

In [21]:
oneshot_df[(oneshot_df.num_labels != 0) & (oneshot_df.degree_dd != 0)].describe().round(3)[["ap_at_10","ap_at_100","recall_at_10","recall_at_100","total_auc","total_ap"]]

Unnamed: 0,ap_at_10,ap_at_100,recall_at_10,recall_at_100,total_auc,total_ap
count,504.0,504.0,504.0,504.0,504.0,504.0
mean,0.028,0.034,0.073,0.254,0.741,0.0
std,0.132,0.131,0.261,0.436,0.295,0.001
min,-0.0,-0.0,0.0,0.0,0.024,0.0
25%,-0.0,-0.0,0.0,0.0,0.475,0.0
50%,-0.0,-0.0,0.0,0.0,0.915,0.0
75%,0.0,0.011,0.0,1.0,0.994,0.0
max,1.0,1.0,1.0,1.0,1.0,0.01


### Viendo un ejemplo particular de una enfermedad "one shot en test" que tuvo buena performance

"Diamond-blackfan anemia 13" es "one shot en test" y logró rankear a su único enlace GDA en el puesto 3 de ~16000. Me llamó la atención porque es una de esas enfermedades "Enfermedad N" que aparecen en DisGeNET. Esta enfermedad tiene grado dd 1, para entender un poco su entorno voy a mirar la comunidad infomap a la que pertenece. Elegí ver la infomap porque es más chica y me interesa el entorno más cercano.

In [19]:
diamond_bf_index = 32292
diamond_bf_comu = node_df.loc[diamond_bf_index,"comunidades_infomap"]
node_df[node_df.comunidades_infomap == diamond_bf_comu]

Unnamed: 0_level_0,node_id,node_name,node_type,node_source,tensor_index,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd,total_degree
node_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
30422,C2931850,Aase Smith syndrome 2,disease,disgenet,2673,816.0,12.0,15.0,0.0,1.0,16.0
26126,C1260899,"Anemia, Diamond-Blackfan",disease,disgenet,2674,816.0,12.0,30.0,0.0,1.0,31.0
1731,11639_10493,Diamond-Blackfan anemia with mandibulofacial d...,disease,primekg,3386,816.0,12.0,0.0,0.0,3.0,3.0
32840,C4225411,DIAMOND-BLACKFAN ANEMIA 15 WITH MANDIBULOFACIA...,disease,disgenet,3387,816.0,12.0,1.0,0.0,1.0,2.0
32851,C4225422,DIAMOND-BLACKFAN ANEMIA 14 WITH MANDIBULOFACIA...,disease,disgenet,3388,816.0,12.0,1.0,0.0,1.0,2.0
2828,14394_13217_32668_12529_32669_44309_12937_1321...,Diamond-Blackfan anemia,disease,primekg,3389,816.0,12.0,0.0,0.0,22.0,22.0
29647,C2675512,Diamond-Blackfan Anemia 7,disease,disgenet,6039,816.0,12.0,1.0,0.0,1.0,2.0
33803,C4693556,DIAMOND-BLACKFAN ANEMIA-LIKE,disease,disgenet,6040,816.0,12.0,1.0,0.0,1.0,2.0
29646,C2675511,Diamond-Blackfan Anemia 8,disease,disgenet,6041,816.0,12.0,1.0,0.0,1.0,2.0
32292,C4014641,DIAMOND-BLACKFAN ANEMIA 13,disease,disgenet,6042,816.0,12.0,1.0,0.0,1.0,2.0


Se ve clarito que es una comunidad que agrupa a varios tipos de "diamond-blackfan anemia", con dos grupos bert (te das cuenta porque la fuente no es disgenet sino primekg) que las conectan.

se me ocurrió una forma de validar, que es usar los datos de la parte minada de Disgenet. Esto sirve?

Estoy viendo si las enfermedades de esa comunidad iban todas al mismo gen y por eso fue "fácil" predecirlo, pero veo que son como 30 distintos.

In [20]:
comu_index = node_df[node_df.comunidades_infomap == 816].index
disease_idx = disgenet_edges.set_index("disease").index
len(disgenet_edges.set_index("disease").loc[comu_index.intersection(disease_idx)].gene_protein.unique())

30

Tendría que ver las "diamond black fan N", que tienen grado 1, si no van al mismo:

In [21]:
comu_index = node_df[(node_df.comunidades_infomap == 816)&(node_df.degree_gda == 1)].index
disease_idx = disgenet_edges.set_index("disease").index
len(disgenet_edges.set_index("disease").loc[comu_index.intersection(disease_idx)].gene_protein.unique())

15

No es el mismo gen y, de hecho, me parece que en general las enfermedades que en disgenet se llaman "Enfermedad N" y hay muchas "N", son enfermedades que tienen una presentación clínica parecida pero están asociadas a genes diferentes (interesante). Por ahí le propuso esos 30 genes que tienen asociadas sus compañeras de comunidad y a uno de esos le pegó. Pero bueno, justamente así funciona la GNN.

In [22]:
comu_index = node_df[(node_df.comunidades_infomap == 816)].index
disease_idx = disgenet_edges.set_index("disease").index
diamond_bf_edge = disgenet_edges.set_index("disease").loc[diamond_bf_index].gene_protein
node_df.loc[disgenet_edges.set_index("gene_protein").loc[diamond_bf_edge].disease.values]

Unnamed: 0_level_0,node_id,node_name,node_type,node_source,tensor_index,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd,total_degree
node_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
32292,C4014641,DIAMOND-BLACKFAN ANEMIA 13,disease,disgenet,6042,816.0,12.0,1.0,0.0,1.0,2.0
32866,C4228778,Abnormality of radial ray,disease,disgenet,8307,,,33.0,0.0,0.0,33.0
30422,C2931850,Aase Smith syndrome 2,disease,disgenet,2673,816.0,12.0,15.0,0.0,1.0,16.0
26126,C1260899,"Anemia, Diamond-Blackfan",disease,disgenet,2674,816.0,12.0,30.0,0.0,1.0,31.0


Ahí se ve que en la comunidad había otras enfermedades con enlaces a ese mismo gen. Igual, tenian 33, 15 y 30 enlaces diferentes. El modelo ubicó el único que tenia esta enfermedad en el puesto 3!. Puede ser que esté priorizando los enlaces que tienen enfermedades cercanas (y esto estaría bien), pero por ahí también hay más info codificada porque tenía 30 genes para elegir si tomamos solo la comunidad infomap.

Hay otro nivel más de "one shot" que es que no tenga el enlace dd que está en el dataset, porque en el split no apareció. Esto lo podemos saber mirando el dataset de enlaces d-d:

In [27]:
mapped_dd_df.set_index("disease_target").loc[diamond_bf_index]

disease_source                           2828
torch_disease_index_source               3389
torch_disease_index_target               6042
label                                     NaN
edge_type                     message_passing
Name: 32292, dtype: object

Esto es el mismo gráfico de más arriba pero corregido por el tema de que los enlaces dd también entran en el split. Igual da muy parecido, la idea es la misma.

In [35]:
fig_df[fig_df.has_dd_edges == True].describe()

Unnamed: 0,num_labels,first_positive_rank,ap_at_10,ap_at_50,ap_at_100,recall_at_10,recall_at_50,recall_at_100,degree_gda,degree_dd,reciprocal_rank,dd_propagation_degree
count,468.0,468.0,468.0,468.0,468.0,468.0,468.0,468.0,468.0,468.0,468.0,468.0
mean,1.025641,4443.856838,0.029672,0.034989,0.03601,0.076923,0.190171,0.260684,1.025641,1.40812,0.037048,1.34188
std,0.17123,5202.40969,0.136215,0.136068,0.135857,0.266755,0.392856,0.439477,0.17123,1.171621,0.13552,1.027772
min,1.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,1.0,1.0,5.8e-05,1.0
25%,1.0,83.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,1.0,1.0,0.000111,1.0
50%,1.0,1113.5,-0.0,-0.0,-0.0,0.0,0.0,0.0,1.0,1.0,0.000897,1.0
75%,1.0,9040.0,0.0,0.0,0.011905,0.0,0.0,1.0,1.0,1.0,0.011905,1.0
max,3.0,17310.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,13.0,1.0,11.0


In [30]:
dd_prop_df = pd.DataFrame(mapped_dd_df[mapped_dd_df.edge_type == "message_passing"].disease_source.value_counts()).rename(columns={"disease_source":"dd_propagation_degree"})

fig_df = oneshot_df.copy()
fig_df = fig_df.merge(dd_prop_df,left_index=True,right_index=True, how="left").fillna(0)
fig_df["has_dd_edges"] = fig_df.dd_propagation_degree > 0
fig = px.violin(fig_df, x="has_dd_edges", y="first_positive_rank")
fig.show()

## Analizando el resto

In [14]:
def group_by_range(data_df,group_column,ranges,inplace=True):
    if not inplace:
        df = data_df.copy()
        bins = np.digitize(df[group_column].values, ranges)
        df["bins"] = bins
        return df
    else:
        df = data_df
        bins = np.digitize(df[group_column].values, ranges)
        df["bins"] = bins

Si filtro las que tenían al menos N de propagación

In [16]:
over_K_prop = gda_prop_df[(gda_prop_df.gda_propagation_degree >= 10)&(gda_prop_df.num_labels > 0)].sort_values(by="first_positive_rank")
over_K_prop.describe()

Unnamed: 0,gda_propagation_degree,num_labels,first_positive_rank,ap_at_10,ap_at_50,ap_at_100,recall_at_10,recall_at_50,recall_at_100,total_auc,total_ap,degree_gda,degree_dd
count,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0,1094.0
mean,50.372943,5.658135,684.316271,0.188896,0.166021,0.150337,0.348263,0.57404,0.679159,0.887256,0.002871,56.031079,2.651737
std,76.252937,8.604141,2284.085869,0.326736,0.267615,0.243099,0.476638,0.494714,0.467013,0.174097,0.004291,84.556596,5.689778
min,10.0,1.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.027577,5.6e-05,11.0,0.0
25%,15.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.845296,0.000437,17.0,0.0
50%,26.0,3.0,30.5,-0.0,0.035483,0.033853,0.0,1.0,1.0,0.978583,0.001259,29.0,1.0
75%,55.75,6.0,158.0,0.25,0.2,0.166667,1.0,1.0,1.0,0.994944,0.00327,61.0,3.0
max,970.0,104.0,17243.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.033531,1074.0,66.0


In [3]:
(2284*100)/16000

14.275

In [18]:
rest_df = disease_total_ap_scores.loc[rest_index].sort_values(by="first_positive_rank")
rest_df

Unnamed: 0,num_labels,first_positive_rank,ap_at_10,ap_at_50,ap_at_100,recall_at_10,recall_at_50,recall_at_100,node_name,degree_gda,degree_dd
24097,2.0,0.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,Other specified cardiac arrhythmias,11.0,1.0
19272,1.0,0.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,Myoclonic Epilepsy,9.0,5.0
20522,7.0,0.0,0.642857,0.395604,0.395604,1.0,1.0,1.0,Lobar Pneumonia,54.0,0.0
19742,12.0,0.0,1.000000,0.413876,0.271159,1.0,1.0,1.0,Profound Mental Retardation,139.0,0.0
32593,1.0,0.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,C3 glomerulopathy,9.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
20117,1.0,16731.0,-0.000000,-0.000000,-0.000000,0.0,0.0,0.0,Mitral Valve Prolapse Syndrome,3.0,3.0
31349,1.0,16871.0,-0.000000,-0.000000,-0.000000,0.0,0.0,0.0,Triple Negative Breast Neoplasms,4.0,1.0
26637,1.0,17243.0,-0.000000,-0.000000,-0.000000,0.0,0.0,0.0,Pancreatic Ductal Adenocarcinoma,11.0,3.0
29149,1.0,17432.0,-0.000000,-0.000000,-0.000000,0.0,0.0,0.0,"Pseudoxanthoma Elasticum, Incomplete",3.0,0.0


In [133]:
aver = group_by_range(rest_df,"first_positive_rank",np.logspace(0,5),False)
fig = px.histogram(aver, "bins")
fig.show()

In [161]:
rest_df

Unnamed: 0,num_labels,first_positive_rank,ap_at_10,ap_at_50,ap_at_100,precision_at_10,precision_at_50,precision_at_100,node_name,degree_gda,degree_dd
24097,2.0,0.0,1.000000,1.000000,1.000000,0.1,0.02,0.01,Other specified cardiac arrhythmias,11.0,1.0
19272,1.0,0.0,1.000000,1.000000,1.000000,0.1,0.02,0.01,Myoclonic Epilepsy,9.0,5.0
20522,7.0,0.0,0.642857,0.395604,0.395604,0.2,0.08,0.04,Lobar Pneumonia,54.0,0.0
19742,12.0,0.0,1.000000,0.413876,0.271159,0.1,0.06,0.05,Profound Mental Retardation,139.0,0.0
32593,1.0,0.0,1.000000,1.000000,1.000000,0.1,0.02,0.01,C3 glomerulopathy,9.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
20117,1.0,16731.0,-0.000000,-0.000000,-0.000000,0.0,0.00,0.00,Mitral Valve Prolapse Syndrome,3.0,3.0
31349,1.0,16871.0,-0.000000,-0.000000,-0.000000,0.0,0.00,0.00,Triple Negative Breast Neoplasms,4.0,1.0
26637,1.0,17243.0,-0.000000,-0.000000,-0.000000,0.0,0.00,0.00,Pancreatic Ductal Adenocarcinoma,11.0,3.0
29149,1.0,17432.0,-0.000000,-0.000000,-0.000000,0.0,0.00,0.00,"Pseudoxanthoma Elasticum, Incomplete",3.0,0.0


In [169]:
rest_df[rest_df.num_labels > 50].ap_at_100.mean()

0.5334211511494492

# Modo 2

Calcular AP solo con los rankings que me generan los enlaces que quedaron en test, osea, la misma muestra sobre la que evalué AUC, en lugar de todas las combinaciones posibles.

In [145]:
pred = predictor.predict_supervision_edges(test_data,("gene_protein","gda","disease"))

disease_index = []
ap_at_10 = []
ap_at_5 = []
k = []

for disease in pred.torch_disease_index.unique():
    labels = pred[pred.torch_disease_index == disease].sort_values(by="score", ascending=False)["label"].values
    scores = pred[pred.torch_disease_index == disease].sort_values(by="score", ascending=False)["score"].values
    k.append(len(labels))

    if len(labels) >= 10:
        labels = labels[:10]
        scores = scores[:10]
        ap_at_10.append(average_precision_score(labels,scores))  
    else:
        ap_at_10.append(average_precision_score(labels,scores))
    
    if len(labels) >= 5:
        labels = labels[:5]
        scores = scores[:5]
        ap_at_5.append(average_precision_score(labels,scores))
    else:
        ap_at_5.append(average_precision_score(labels,scores))

    disease_index.append(disease)

ap_df = pd.DataFrame({"disease_index":disease_index,"ap_at_10":ap_at_10,"ap_at_5":ap_at_5, "k":k})
ap_df


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all thresholds.


No positive class found in y_true, recall is set to one for all

Unnamed: 0,disease_index,ap_at_10,ap_at_5,k
0,1614,1.000000,1.00,6
1,20,0.884354,0.95,16
2,1363,1.000000,1.00,2
3,3,1.000000,1.00,146
4,916,1.000000,1.00,1
...,...,...,...,...
5262,3755,-0.000000,-0.00,1
5263,2654,-0.000000,-0.00,1
5264,4584,-0.000000,-0.00,1
5265,2558,-0.000000,-0.00,1


In [146]:
ap_df[ap_df.k < 5]

Unnamed: 0,disease_index,ap_at_10,ap_at_5,k
2,1363,1.0,1.0,2
4,916,1.0,1.0,1
7,522,1.0,1.0,4
8,6331,1.0,1.0,4
14,8080,1.0,1.0,4
...,...,...,...,...
5262,3755,-0.0,-0.0,1
5263,2654,-0.0,-0.0,1
5264,4584,-0.0,-0.0,1
5265,2558,-0.0,-0.0,1


Estos son los "one shot"

In [155]:
ap_df[ap_df.k == 1].ap_at_5.mean().round(2)

0.36

El resto

In [170]:
ap_df[ap_df.k >= 5].ap_at_5.mean().round(2)

0.91

In [148]:
ap_df[ap_df.k >= 10].ap_at_10.mean().round(2)

0.92

In [149]:
ap_df[(ap_df.k <= 10)&(ap_df.k >= 5)].ap_at_5.mean().round(2)

0.89

In [150]:
ap_df[(ap_df.k <= 10)&(ap_df.k >= 5)]

Unnamed: 0,disease_index,ap_at_10,ap_at_5,k
0,1614,1.000000,1.000000,6
19,895,0.754592,0.679167,10
25,1249,1.000000,1.000000,6
26,1894,0.975000,1.000000,10
31,1377,0.915079,0.950000,8
...,...,...,...,...
3362,10502,-0.000000,-0.000000,5
3408,1908,-0.000000,-0.000000,5
3432,7709,-0.000000,-0.000000,5
3870,5688,-0.000000,-0.000000,5


Esto está balanceado?

In [14]:
pred = predictor.predict_supervision_edges(test_data,("gene_protein","gda","disease"),False)
labels = test_data.edge_label_dict[("gene_protein","gda","disease")]
average_precision_score(labels,pred)

0.9063889136629331

In [253]:
pred

Unnamed: 0,torch_gene_protein_index,torch_disease_index,score,label
0,6951,1614,0.909069,1.0
1,7772,20,0.873451,1.0
2,599,1363,0.663127,1.0
3,10254,3,0.587420,1.0
4,2986,916,0.980938,1.0
...,...,...,...,...
16797,6817,672,0.008400,0.0
16798,1224,352,0.001227,0.0
16799,14330,903,0.817325,0.0
16800,13487,2014,0.000022,0.0


In [271]:
pred[["torch_disease_index","label"]].value_counts()

torch_disease_index  label
3                    1.0      104
1                    1.0       82
48                   1.0       68
379                  1.0       67
36                   1.0       64
                             ... 
4742                 0.0        1
4740                 0.0        1
4732                 0.0        1
4722                 0.0        1
13991                0.0        1
Length: 7032, dtype: int64

In [255]:
pred_balance = pd.DataFrame(index=pred.torch_disease_index.unique())
pred_balance["num_samples"] = pred.torch_disease_index.value_counts()
pred_balance["num_positive"] 

Unnamed: 0,num_samples
1614,6
20,16
1363,2
3,146
916,1
...,...
3755,1
2654,1
4584,1
2558,1


# Hits

In [179]:
predictions = predictor.prioritize_one_vs_all(16899)

In [180]:
predictions

[tensor([34065, 32276, 33350,  ..., 21850, 21436, 24257]),
 tensor([1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 2.8140e-19, 2.7781e-19,
         3.1687e-21])]

In [184]:
new_edges[new_edges["disease"] == 16899]

Unnamed: 0,gene_protein,disease


In [187]:
def hits_at_k(node_index,seen_edges_df,new_edges_df):
    k_list = [5,10,50,100]
    predictions = predictor.prioritize_one_vs_all(node_index)

    node_type = node_df.loc[node_index,"node_type"]
    y_type = "disease" if node_type == "gene_protein" else "gene_protein"

    node_new_edges = set(new_edges_df[new_edges_df[node_type] == node_index][y_type].values)
    node_seen_edges = set(seen_edges_df[seen_edges_df[node_type] == node_index][y_type].values)

    results = {"seen_edges":len(node_seen_edges),"new_edges":len(node_new_edges)}

    for k in k_list:
        predicted_top = set(predictions[0][:k].numpy())

        seen_hits = len(node_seen_edges&predicted_top)
        new_hits = len(node_new_edges&predicted_top)

        results[f"{k}_seen"] = seen_hits
        results[f"{k}_new"] = new_hits

    return results

TODO: pasar esto al script de prediction utils

In [188]:
disease_evals = {}
for disease in tqdm(node_df[node_df.node_type == "disease"].index.values):
    predictions = predictor.prioritize_one_vs_all(disease)
    disease_evals[disease] = hits_at_k(disease,seen_edges,new_edges)

100%|██████████| 16079/16079 [01:04<00:00, 248.61it/s]


In [191]:
gene_evals = {}
for gene_protein in tqdm(node_df[node_df.node_type == "gene_protein"].index.values):
    predictions = predictor.prioritize_one_vs_all(gene_protein)
    gene_evals[gene_protein] = hits_at_k(gene_protein,seen_edges,new_edges)

100%|██████████| 17743/17743 [01:02<00:00, 282.48it/s]


In [192]:
df = node_df

total_disease_evals = pd.DataFrame(disease_evals).T
cols = total_disease_evals.columns
total_disease_evals = total_disease_evals.merge(df["node_name"],left_index=True,right_index=True)[["node_name",*cols]]

total_gene_evals = pd.DataFrame(gene_evals).T
cols = total_gene_evals.columns
total_gene_evals = total_gene_evals.merge(df["node_name"],left_index=True,right_index=True)[["node_name",*cols]]

k_list = [5,10,50,100]
summary_disease_evals = pd.DataFrame()
for k in k_list:
    summary_disease_evals[f"hits_{k}"] = total_disease_evals[f"{k}_new"] + total_disease_evals[f"{k}_seen"]

cols = summary_disease_evals.columns
add_cols=["node_name","degree_gda","degree_dd","comunidades_infomap","comunidades_louvain"]
summary_disease_evals = pd.merge(summary_disease_evals,df[add_cols],left_index=True,right_index=True)[[*add_cols,*cols]]

summary_gene_evals = pd.DataFrame()
for k in k_list:
    summary_gene_evals[f"hits_{k}"] = total_gene_evals[f"{k}_new"] + total_gene_evals[f"{k}_seen"]

cols = summary_gene_evals.columns
add_cols = ["node_name","degree_gda","degree_pp"]
summary_gene_evals = pd.merge(summary_gene_evals,df[add_cols],left_index=True,right_index=True)[[*add_cols,*cols]]

total_disease_evals = total_disease_evals.merge(summary_disease_evals["degree_gda"],left_index=True,right_index=True)
total_gene_evals = total_gene_evals.merge(summary_gene_evals["degree_gda"],left_index=True,right_index=True)

In [196]:
def save_hits_df(model_name,desc,disease_total,disease_summary,gene_total,gene_summary,reports_folder=reports_folder+"ranking_analysis/hits"):
    disease_total.to_csv(reports_folder+model_name+"_total_disease.csv")
    disease_summary.to_csv(reports_folder+model_name+"_summary_disease.csv")
    gene_summary.to_csv(reports_folder+model_name+"_summary_gene.csv")
    gene_total.to_csv(reports_folder+model_name+"_total_gene.csv")

    with open(reports_folder+model_name+"_desc.txt", "w") as f:
        f.write(desc)

def load_hits_df(model_name,reports_folder=reports_folder+"ranking_analysis/hits"):
    fnames = ["_total_disease.csv","_summary_disease.csv","_summary_gene.csv","_total_gene.csv"]
    dfs = []
    for fname in fnames:
        dfs.append(pd.read_csv(reports_folder+model_name+fname,index_col=0))
    return dfs

def group_by_range(data_df,group_column,ranges,inplace=True):
    if not inplace:
        df = data_df.copy()
        bins = np.digitize(df[group_column].values, ranges)
        df["bins"] = bins
        return df
    else:
        df = data_df
        bins = np.digitize(df[group_column].values, ranges)
        df["bins"] = bins

def plot_box(data_df,value_cols,title,range_text,y_top):
    melted_df = data_df[["bins",*value_cols]].melt("bins").rename(columns={"value":"hits"})
    melted_df["Nivel de Evidencia"] = melted_df.bins.apply(lambda x: range_text[x])
    fig = px.box(melted_df.sort_values(by="bins"),y="hits",x="Nivel de Evidencia",color="variable",title=title,width=900,height=450,labels={"hits":"Hits"})
    fig.update_yaxes(range=[-0.5, y_top])
    fig.show()

In [194]:
model_name = f"sage_lsa"
desc = "final_model"

save_hits_df(model_name,desc,total_disease_evals,summary_disease_evals,total_gene_evals,summary_gene_evals)

In [201]:
# model_name = "sage_ones_no_sampling"
# model_name = "sage_ones_first_negatives_exp_04_07_23__12_07"
model_name = "sage_lsa"
hits_df = load_hits_df(model_name)

disease_ranges = np.array([10,50,100,hits_df[0].degree_gda.max()+1]).astype(int)
gene_ranges = np.array([5,20,50,100,hits_df[3].degree_gda.max()+1]).astype(int)
range_text = ["< 10","10-50","50-100","100 +"]

for i,data in enumerate(hits_df):
    if i<2:
        data = group_by_range(data[data.degree_gda != 0],"degree_gda",disease_ranges,inplace=False)
        hits_df[i] = data
    else:
        data = group_by_range(data[data.degree_gda != 0],"degree_gda",gene_ranges,inplace=False)
        hits_df[i] = data       

value_pairs = [["5_seen","5_new"],["10_seen","10_new"],["50_seen","50_new"]]
for pair in value_pairs:
    plot_box(hits_df[0],pair,"Evaluación Enfermedades",range_text,51)

# One fold con M muestras

En vez de comparar vs todos los nodos posibles, generar muestras de M nodos y hacer ahí el ranking

In [189]:
positive_sup_edges = mapped_df[mapped_df.label == 1][["gene_protein","disease"]]

## MRR y hits @ k para enfermedades y genes

In [109]:
test_size = len(positive_sup_edges)
M = 500
k_values = [5,20,30]
hits_list = [0,0,0]
reciprocal_ranks = []

for edge in positive_sup_edges.values:
    gene, disease = edge

    #saco los enlaces que ya vio
    to_remove = seen_edges[seen_edges.disease == disease].gene_protein.values
    targets = node_df[node_df.node_type == "gene_protein"].drop(to_remove).index.values
    target_sample = np.append(np.random.choice(targets,M), gene)
    ranked_index, ranked_scores = predictor.prioritize_one_vs_all(disease,target_sample)
    
    rank = np.where(ranked_index == gene)[0][0] +1

    # hits@k
    for i,k in enumerate(k_values):
        if rank <= k:
            hits_list[i] += 1
    #MRR
    reciprocal_ranks.append(1/rank)

hits_k_results = np.array(hits_list)/test_size
mean_reciprocal_rank = np.sum(reciprocal_ranks)/test_size

print(hits_k_results, mean_reciprocal_rank)


[0.44423283 0.63801928 0.67563385] 0.3192508488016502


In [114]:
test_size = len(positive_sup_edges)
M = 500
k_values = [5,20,30]
hits_list = [0,0,0]
reciprocal_ranks = []

for edge in positive_sup_edges.values:
    gene, disease = edge

    #saco los enlaces que ya vio
    to_remove = seen_edges[seen_edges.gene_protein == gene].disease.values
    targets = node_df[node_df.node_type == "disease"].drop(to_remove).index.values
    target_sample = np.append(np.random.choice(targets,M), disease)
    ranked_index, ranked_scores = predictor.prioritize_one_vs_all(gene,target_sample)
    
    rank = np.where(ranked_index == disease)[0][0] +1

    # hits@k
    for i,k in enumerate(k_values):
        if rank <= k:
            hits_list[i] += 1
    #MRR
    reciprocal_ranks.append(1/rank)

hits_k_results = np.array(hits_list)/test_size
mean_reciprocal_rank = np.sum(reciprocal_ranks)/test_size

print(hits_k_results, mean_reciprocal_rank)

[0.41578384 0.61326033 0.66861088] 0.31632365307824345


## Lo mismo pero quiero ver quien es cada uno

In [205]:
test_size = len(positive_sup_edges)
M = 500
k_values = [5,20,30]
hits_list = [[],[],[]]
reciprocal_ranks = []

for edge in positive_sup_edges.values:
    gene, disease = edge

    #saco los enlaces que ya vio
    to_remove = seen_edges[seen_edges.disease == disease].gene_protein.values
    targets = node_df[node_df.node_type == "gene_protein"].drop(to_remove).index.values
    target_sample = np.append(np.random.choice(targets,M), gene)
    ranked_index, ranked_scores = predictor.prioritize_one_vs_all(disease,target_sample)
    
    rank = np.where(ranked_index == gene)[0][0] +1

    # hits@k
    for i,k in enumerate(k_values):
        if rank <= k:
            hits_list[i].append(1)
        else:
            np.array(hits_list).T
            hits_list[i].append(0)
    #MRR
    reciprocal_ranks.append(1/rank)

# hits_k_results = np.array(hits_list)/test_size
# mean_reciprocal_rank = np.sum(reciprocal_ranks)/test_size

# print(hits_k_results, mean_reciprocal_rank)


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nes

In [206]:
d_rank_eval_results = pd.DataFrame(np.array(hits_list).T, columns=["hits_5","hits_20","hits_30"])
d_rank_eval_results["reciprocal_rank"] = reciprocal_ranks
d_rank_eval_results["disease_node_index"] = positive_sup_edges.disease.values
d_rank_eval_results

Unnamed: 0,hits_5,hits_20,hits_30,reciprocal_rank,disease_node_index
0,1,1,1,0.250000,28433
1,1,1,1,0.250000,21061
2,0,1,1,0.062500,19538
3,0,0,0,0.003040,18773
4,1,1,1,1.000000,28696
...,...,...,...,...,...
8396,0,1,1,0.058824,19702
8397,1,1,1,0.250000,25003
8398,0,1,1,0.125000,21217
8399,0,1,1,0.111111,26819


In [171]:
d_rank_eval_mean = d_rank_eval_results.groupby(["disease_node_index"]).mean()
d_rank_eval_mean["count"] = d_rank_eval_results.groupby(["disease_node_index"]).count()["hits_5"]
d_rank_eval_mean

Unnamed: 0_level_0,hits_5,hits_20,hits_30,reciprocal_rank,count
disease_node_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18432,0.000000,0.000000,0.000000,0.003774,1
18434,0.000000,0.000000,0.000000,0.009970,2
18435,0.333333,0.777778,0.777778,0.174169,9
18438,0.333333,0.500000,0.500000,0.242165,12
18446,0.000000,0.000000,0.000000,0.007463,1
...,...,...,...,...,...
34158,0.000000,0.000000,0.000000,0.010101,1
34160,0.000000,0.000000,0.000000,0.003401,1
34164,0.000000,0.000000,0.000000,0.002105,1
34169,0.000000,0.000000,0.000000,0.006494,1


In [172]:
d_rank_eval_mean = d_rank_eval_mean.merge(node_df[["degree_gda","degree_dd"]],left_on="disease_node_index", right_index=True,how="left")
d_rank_eval_mean.sort_values(by="count",ascending=False)

Unnamed: 0_level_0,hits_5,hits_20,hits_30,reciprocal_rank,count,degree_gda,degree_dd
disease_node_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18773,0.192308,0.230769,0.250000,0.167883,104,1074.0,7.0
20754,0.085366,0.134146,0.134146,0.064830,82,883.0,1.0
19957,0.000000,0.014706,0.014706,0.006070,68,774.0,0.0
18978,0.029851,0.089552,0.089552,0.039496,67,702.0,5.0
20574,0.218750,0.453125,0.484375,0.205724,64,616.0,5.0
...,...,...,...,...,...,...,...
23771,0.000000,1.000000,1.000000,0.071429,1,5.0,2.0
23769,1.000000,1.000000,1.000000,1.000000,1,1.0,1.0
23767,0.000000,0.000000,1.000000,0.047619,1,17.0,2.0
23765,0.000000,0.000000,0.000000,0.003125,1,5.0,2.0


In [179]:
d_rank_eval_mean.sort_values(by="count",ascending=False)[100:110]

Unnamed: 0_level_0,hits_5,hits_20,hits_30,reciprocal_rank,count,degree_gda,degree_dd
disease_node_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25966,0.461538,0.846154,0.846154,0.26069,13,139.0,0.0
21432,0.75,1.0,1.0,0.684921,12,101.0,0.0
19742,0.416667,0.75,0.75,0.298751,12,139.0,0.0
26075,0.583333,0.75,0.833333,0.35312,12,137.0,5.0
20905,1.0,1.0,1.0,0.727778,12,115.0,0.0
22472,0.25,0.416667,0.583333,0.199108,12,105.0,0.0
24466,1.0,1.0,1.0,0.805556,12,101.0,0.0
20177,0.666667,0.916667,1.0,0.467617,12,125.0,1.0
18816,0.25,0.583333,0.666667,0.105599,12,159.0,7.0
33376,0.916667,1.0,1.0,0.697222,12,101.0,0.0
