# Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import networkx as nx
import igraph as ig 
from matplotlib_venn import venn3, venn2
import random
import plotly.io as pio

In [2]:
# Seed both the stdlib and the NumPy global RNGs so stochastic steps are repeatable.
seed = 16
random.seed(seed)
np.random.seed(seed)

# Force matplotlib to draw white backgrounds even when the notebook theme is dark.
# NOTE(review): the sns.set_style("darkgrid") call below runs afterwards and
# presumably overrides axes.facecolor with seaborn's own value — confirm this is intended.
plt.rcParams.update({
    'axes.facecolor': 'white',
    'figure.facecolor': 'white',
})
cmap = plt.get_cmap("tab10")
pio.templates.default = "seaborn"

sns.set_style("darkgrid", rc={'xtick.bottom': True})

In [3]:
# Data directories, relative to this notebook's location.
data_processed = "../../../data/processed/"
data_interim = "../../../data/interim/"
data_external = "../../../data/external/"

# Node and edge tables of the graph, plus the per-disease attribute table.
graph_node_data = pd.read_csv(data_processed + "graph_data/grafo_alternativo_CG_nodos.csv")
graph_edge_data = pd.read_csv(data_processed + "graph_data/grafo_alternativo_CG_edges.csv")

disease_attributes = pd.read_csv(data_interim + "grafo_alternativo_disease_attributes.csv")

# Number of nodes per node_type, with a grand total appended.
node_counts = dict(graph_node_data["node_type"].value_counts())
node_counts["total"] = sum(node_counts.values())

# Number of edges per edge_type; the raw row counts are halved
# (presumably each edge is stored once per direction — verify against the edge file).
edge_counts = dict(graph_edge_data["edge_type"].value_counts() / 2)
edge_counts["total"] = sum(edge_counts.values())

# Rows whose node_type is either "disease" or "bert_group".
enfermedades = graph_node_data[graph_node_data["node_type"].isin(["disease", "bert_group"])]

display(
    pd.DataFrame.from_dict({"Número de nodos": node_counts}, orient="columns"),
    pd.DataFrame.from_dict({"Número de enlaces": edge_counts}, orient="columns"),
)

Unnamed: 0,Número de nodos
bert_group,1043
complex,421
disease,15066
gene_protein,17322
pathway,2017
total,35869


Unnamed: 0,Número de enlaces
disease_disease,17432.0
form_complex,1885.0
gda,84024.0
pathway_protein,42643.0
ppi,110051.0
total,256035.0


In [4]:
def get_node_dict(G):
    """Return a dict mapping every node of ``G`` to its attribute data.

    Keys and values are the ``(node, data)`` pairs yielded by ``G.nodes(data=True)``.
    """
    node_to_attrs = {}
    for node_id, attrs in G.nodes(data=True):
        node_to_attrs[node_id] = attrs
    return node_to_attrs

def node_labels_to_numeric(G):
    """Relabel every node of ``G`` in place, converting each label with ``int()``.

    Nothing is returned; ``G`` itself is modified (``copy=False``).
    Raises whatever ``int()`` raises if a label cannot be converted.
    """
    as_int = {}
    for label in list(G.nodes):
        as_int[label] = int(label)
    nx.relabel_nodes(G, as_int, copy=False)

In [5]:
# Load the full-graph GML file, then convert its node labels to int in place.
G = nx.read_gml(f"{data_processed}graph_data/full_graph_giant_component.gml")
node_labels_to_numeric(G)

In [6]:
# Load the disease-layer GML file, then convert its node labels to int in place.
DD = nx.read_gml(f"{data_processed}graph_data/grafo_alternativo_CG_disease_layer.gml")
node_labels_to_numeric(DD)

In [7]:
# Load the GDA network GML file, then convert its node labels to int in place.
GDA = nx.read_gml(f"{data_processed}graph_data/gda_network.gml")
node_labels_to_numeric(GDA)

# Hubs

In [8]:
def gc_size(G):
    """Return the number of nodes in the largest connected component of ``G``.

    Returns 0 when ``G`` has no connected components (an empty graph) instead
    of raising ``ValueError`` from ``max()`` on an empty sequence.
    """
    # Only the sizes are needed, so avoid keeping the component sets around.
    return max((len(component) for component in nx.connected_components(G)), default=0)

In [58]:
# Disease nodes ordered from highest to lowest disease-disease degree.
disease_hubs = (
    graph_node_data
    .loc[graph_node_data["node_type"] == "disease"]
    .sort_values(by="degree_dd", ascending=False)
)
# node_index values of the ten top-ranked disease nodes.
highest_ranking = disease_hubs.iloc[:10]["node_index"].values

Estos son los 10 nodos enfermedad con más enlaces enfermedad-enfermedad:

In [59]:
# First ten rows of the degree-sorted disease table.
disease_hubs.head(10)

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
21584,21799,C0158538,Other congenital anomalies of nervous system,disease,disgenet,3.0,17.0,0.0,0.0,304.0
24360,24761,C0497552,Congenital neurologic anomalies,disease,disgenet,3.0,17.0,10.0,0.0,304.0
24081,24462,C0410787,Hereditary Connective Tissue Disorder,disease,disgenet,75.0,11.0,0.0,0.0,170.0
20662,20820,C0037277,"Skin Diseases, Genetic",disease,disgenet,206.0,16.0,0.0,0.0,89.0
25804,26284,C1290884,Inflammatory disorder,disease,disgenet,514.0,9.0,0.0,0.0,78.0
19976,20104,C0025521,Inborn Errors of Metabolism,disease,disgenet,300.0,33.0,17.0,0.0,67.0
24372,24773,C0520572,Enzymopathy,disease,disgenet,382.0,36.0,11.0,0.0,66.0
19415,19529,C0018799,Heart Diseases,disease,disgenet,65.0,17.0,46.0,0.0,52.0
18404,18471,C0001418,Adenocarcinoma,disease,disgenet,60.0,2.0,116.0,0.0,51.0
31491,32197,C3898144,Neurovascular Disorder,disease,disgenet,247.0,8.0,0.0,0.0,51.0


In [27]:
def get_edges_from(node_index):
    """Return the rows of ``graph_edge_data`` whose ``x_index`` equals ``node_index``."""
    starts_here = graph_edge_data["x_index"] == node_index
    return graph_edge_data.loc[starts_here]

def get_neighbors_data(node_index):
    """Return the ``graph_node_data`` rows of the neighbours of ``node_index``.

    Neighbours are the ``y_index`` values of the edges returned by
    ``get_edges_from(node_index)``. The result is indexed by ``node_index``
    and follows the order of those edges.
    """
    neighbor_ids = get_edges_from(node_index)["y_index"].values
    nodes_by_index = graph_node_data.set_index("node_index")
    return nodes_by_index.loc[neighbor_ids]

In [56]:
# Remove the top-ranked disease hubs one at a time from a copy of the full graph,
# recording the giant-component size after each removal.
G2 = G.copy()
sizes = [gc_size(G2)]  # sizes[0] is the baseline, before any removal
for hub in highest_ranking:
    G2.remove_node(hub)
    sizes.append(gc_size(G2))

# size_difference[k] is the drop in giant-component size caused by removing hub k+1.
size_difference = [before - after for before, after in zip(sizes, sizes[1:])]

fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

# x starts at 1 so that the loss for hub k sits at the same x as in the figure above
# (where x=0 is the baseline); the title now describes what is actually plotted.
fig2 = px.scatter(x=np.arange(1, len(sizes)),y=size_difference,width=800, height=400, title="Nodos perdidos de la componente gigante al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Nodos perdidos")
fig2.show()

print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la componente gigante \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

Al remover el hub más importante pierdo 1 nodos de la componente gigante 
Al remover los primeros 10, en total, pierdo 116


In [57]:
# Remove the top-ranked disease hubs one at a time from a copy of the
# disease-disease layer, recording the largest-component size after each removal.
DD2 = DD.copy()
sizes = [gc_size(DD2)]  # sizes[0] is the baseline, before any removal
for hub in highest_ranking:
    DD2.remove_node(hub)
    sizes.append(gc_size(DD2))

# size_difference[k] is the drop in largest-component size caused by removing hub k+1.
size_difference = [before - after for before, after in zip(sizes, sizes[1:])]

fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de capa disease-disease al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

# x starts at 1 so that the loss for hub k sits at the same x as in the figure above
# (where x=0 is the baseline); the title now describes what is actually plotted.
fig2 = px.scatter(x=np.arange(1, len(sizes)),y=size_difference,width=800, height=400, title="Nodos perdidos de la capa disease-disease al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Nodos perdidos")
fig2.show()

print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la red disease-disease \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

Al remover el hub más importante pierdo 1 nodos de la red disease-disease 
Al remover los primeros 10, en total, pierdo 635


In [53]:
# Remove the top-ranked disease hubs one at a time from a copy of the GDA layer.
# Hubs that are not nodes of this layer are reported and skipped.
GDA2 = GDA.copy()
sizes = [gc_size(GDA2)]  # sizes[0] is the baseline, before any removal
removed_ranks = []       # 1-based rank (position in highest_ranking) of each hub actually removed
for rank, hub in enumerate(highest_ranking, start=1):
    # Explicit membership test instead of a bare `except:`, so that unrelated
    # errors (e.g. inside gc_size) are no longer silently swallowed.
    if hub not in GDA2:
        print(f"node {hub} not in GDA layer, trying next node")
        continue
    GDA2.remove_node(hub)
    sizes.append(gc_size(GDA2))
    removed_ranks.append(rank)

if len(sizes) == 1:
    print(f"\nNo se consideraron hubs que participen de la capa GDA")
else:
    # size_difference[k] is the drop caused by the k-th hub that was actually removed.
    size_difference = [before - after for before, after in zip(sizes, sizes[1:])]

    # x=0 is the baseline; every other point sits at the true rank of the removed hub,
    # so skipped hubs leave gaps instead of shifting later ranks.
    fig = px.scatter(x=[0] + removed_ranks,y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 10 primeros hubs enfermedad - Capa GDA").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
    fig.show()

    fig2 = px.scatter(x=removed_ranks,y=size_difference,width=800, height=400, title="Nodos perdidos de la componente gigante al remover los 10 primeros hubs enfermedad - Capa GDA").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Nodos perdidos")
    fig2.show()
    
    print(f"Hubs en capa GDA: {len(sizes)-1}")
    # Report the number of hubs actually removed rather than a hard-coded 10.
    print(f"Al remover el primer hub presente en la capa pierdo {sizes[0]-sizes[1]} nodos de la capa GDA \nAl remover los {len(sizes)-1} hubs presentes, en total, pierdo {sizes[0] - sizes[-1]}")

node 21799 not in GDA layer, trying next node
node 24462 not in GDA layer, trying next node
node 20820 not in GDA layer, trying next node
node 26284 not in GDA layer, trying next node
node 32197 not in GDA layer, trying next node


Hubs en capa GDA: 5
Al remover el hub más importante pierdo 1 nodos de la capa GDA 
Al remover los primeros 10, en total, pierdo 8


### Veo las comunidades a las que pertenecen estos hubs

In [61]:
def get_cluster_nodes(cluster_id,algoritmo):
    """Return the ``node_index`` values of the nodes assigned to a cluster.

    ``algoritmo`` is the name of the ``graph_node_data`` column holding the
    cluster assignment; rows where that column equals ``cluster_id`` are selected.
    """
    in_cluster = graph_node_data[algoritmo] == cluster_id
    return graph_node_data.loc[in_cluster, "node_index"].values

In [62]:
# First ten rows of the degree-sorted disease table, shown again to read off their communities.
disease_hubs.head(10)

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
21584,21799,C0158538,Other congenital anomalies of nervous system,disease,disgenet,3.0,17.0,0.0,0.0,304.0
24360,24761,C0497552,Congenital neurologic anomalies,disease,disgenet,3.0,17.0,10.0,0.0,304.0
24081,24462,C0410787,Hereditary Connective Tissue Disorder,disease,disgenet,75.0,11.0,0.0,0.0,170.0
20662,20820,C0037277,"Skin Diseases, Genetic",disease,disgenet,206.0,16.0,0.0,0.0,89.0
25804,26284,C1290884,Inflammatory disorder,disease,disgenet,514.0,9.0,0.0,0.0,78.0
19976,20104,C0025521,Inborn Errors of Metabolism,disease,disgenet,300.0,33.0,17.0,0.0,67.0
24372,24773,C0520572,Enzymopathy,disease,disgenet,382.0,36.0,11.0,0.0,66.0
19415,19529,C0018799,Heart Diseases,disease,disgenet,65.0,17.0,46.0,0.0,52.0
18404,18471,C0001418,Adenocarcinoma,disease,disgenet,60.0,2.0,116.0,0.0,51.0
31491,32197,C3898144,Neurovascular Disorder,disease,disgenet,247.0,8.0,0.0,0.0,51.0


In [65]:
# Infomap community 3 is the value listed for the two top-ranked hubs in the table above.
graph_node_data.loc[graph_node_data["comunidades_infomap"] == 3]

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
7350,7385,32832_44319,intellectual developmental disorder with dysmo...,bert_group,primekg,3.0,17.0,0.0,0.0,3.0
14864,14921,7877_7876,laryngeal adductor paralysis,bert_group,primekg,3.0,17.0,0.0,0.0,3.0
18499,18568,C0002902,Anencephaly,disease,disgenet,3.0,17.0,4.0,0.0,3.0
18628,18701,C0004779,Basal Cell Nevus Syndrome,disease,disgenet,3.0,17.0,5.0,0.0,4.0
19164,19268,C0014544,Epilepsy,disease,disgenet,3.0,17.0,109.0,0.0,26.0
...,...,...,...,...,...,...,...,...,...,...
32903,33653,C4551775,"Knobloch Syndrome, Type I",disease,disgenet,3.0,17.0,1.0,0.0,3.0
32989,33739,C4552097,Nevus Sebaceus of Jadassohn,disease,disgenet,3.0,17.0,3.0,0.0,3.0
33398,34161,C4749920,15q overgrowth syndrome,disease,disgenet,3.0,17.0,0.0,0.0,3.0
33401,34164,C4750849,XYLT1-CDG - xylosyltransferase 1 congenital di...,disease,disgenet,3.0,17.0,1.0,0.0,2.0


Infomap:

¡Los dos primeros hubs pertenecen al mismo cluster! Además, es el cluster más grande de Infomap y tiene métricas de TF-IDF atípicas.

In [66]:
# Louvain community 17 is the value listed for the two top-ranked hubs in the table above.
graph_node_data.loc[graph_node_data["comunidades_louvain"] == 17]

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
28,28,100103_100102_100104_100101_8824,fetal akinesia deformation sequence,bert_group,primekg,367.0,17.0,0.0,0.0,2.0
530,532,10283_10460_30908_10427_10407_15601_10417_1066...,X-linked intellectual disability,bert_group,primekg,192.0,17.0,0.0,0.0,35.0
545,547,10297_10318_10590_2010_10316_10366,FG syndrome,bert_group,primekg,342.0,17.0,0.0,0.0,7.0
612,614,10383_23178_23179_23180,fragile X syndrome,bert_group,primekg,187.0,17.0,0.0,0.0,5.0
696,698,10465_7843_16512,Kabuki syndrome,bert_group,primekg,344.0,17.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...
33398,34161,C4749920,15q overgrowth syndrome,disease,disgenet,3.0,17.0,0.0,0.0,3.0
33401,34164,C4750849,XYLT1-CDG - xylosyltransferase 1 congenital di...,disease,disgenet,3.0,17.0,1.0,0.0,2.0
33403,34166,C4750855,Severe neonatal lactic acidosis due to NFS1-IS...,disease,disgenet,778.0,17.0,2.0,0.0,1.0
33424,34188,C4755278,FASTKD2-related infantile mitochondrial enceph...,disease,disgenet,3.0,17.0,1.0,0.0,2.0


También están en el mismo cluster de Louvain!

A ver el tercer hub

In [70]:
# Louvain community 11 is the value listed for the third-ranked hub in the table above.
graph_node_data.loc[graph_node_data["comunidades_louvain"] == 11]

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
278,280,10075_32724_19675_18254,spondyloepimetaphyseal dysplasia with joint la...,bert_group,primekg,270.0,11.0,0.0,0.0,4.0
334,336,10117_13039_13627,3M syndrome,bert_group,primekg,271.0,11.0,0.0,0.0,5.0
490,492,10248_10737,X-linked spondyloepimetaphyseal dysplasia,bert_group,primekg,238.0,11.0,0.0,0.0,6.0
847,849,10619_10358_20720,hereditary hypophosphatemic rickets,bert_group,primekg,279.0,11.0,0.0,0.0,6.0
926,928,10704_10571_19027,otopalatodigital syndrome,bert_group,primekg,248.0,11.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...
33080,33831,C4693799,"SPONDYLOEPIMETAPHYSEAL DYSPLASIA, DI ROCCO TYPE",disease,disgenet,288.0,11.0,1.0,0.0,1.0
33093,33844,C4693870,"EHLERS-DANLOS SYNDROME, CLASSIC-LIKE, 2",disease,disgenet,74.0,11.0,1.0,0.0,2.0
33175,33933,C4721845,"Marfan Syndrome, Type I",disease,disgenet,214.0,11.0,9.0,0.0,3.0
33231,33990,C4747922,PELGER-HUET ANOMALY WITH MILD SKELETAL ANOMALIES,disease,disgenet,287.0,11.0,1.0,0.0,1.0
