# Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import networkx as nx
import igraph as ig 
from matplotlib_venn import venn3, venn2
import random
import plotly.io as pio

In [2]:
seed = 16
random.seed(seed)
np.random.seed(seed)

#esto es para forzar a plt a poner fondos blancos en las figuras aunque el tema del notebook sea oscuro
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
cmap = plt.get_cmap("tab10")
pio.templates.default = "seaborn"

sns.set_style("darkgrid", rc={'xtick.bottom': True})

In [3]:
data_processed = "../../../data/processed/"
data_interim = "../../../data/interim/"
data_external = "../../../data/external/"

graph_node_data = pd.read_csv(data_processed+"graph_data/grafo_alternativo_CG_nodos.csv")
graph_edge_data = pd.read_csv(data_processed+"graph_data/grafo_alternativo_CG_edges.csv")

disease_attributes = pd.read_csv(data_interim+"grafo_alternativo_disease_attributes.csv")

node_counts = dict(graph_node_data.node_type.value_counts())
node_counts["total"] = sum(node_counts.values())

edge_counts = dict(graph_edge_data.edge_type.value_counts()/2)
edge_counts["total"] = sum(edge_counts.values())

enfermedades = graph_node_data[(graph_node_data.node_type == "disease")|(graph_node_data.node_type == "bert_group")]

display(pd.DataFrame.from_dict({"Número de nodos":node_counts}, orient="columns"))
display(pd.DataFrame.from_dict({"Número de enlaces":edge_counts}, orient="columns"))

Unnamed: 0,Número de nodos
bert_group,1043
complex,421
disease,15066
gene_protein,17322
pathway,2017
total,35869


Unnamed: 0,Número de enlaces
disease_disease,17432.0
form_complex,1885.0
gda,84024.0
pathway_protein,42643.0
ppi,110051.0
total,256035.0


In [4]:
def get_node_dict(G):
    return {node:data for (node,data) in list(G.nodes(data=True))}

def node_labels_to_numeric(G):
    new_labels = {old_label:int(old_label) for old_label in list(G.nodes)}
    nx.relabel_nodes(G,new_labels,False)

In [7]:
G = nx.read_gml(data_processed+"graph_data/full_graph_giant_component.gml")
node_labels_to_numeric(G)

In [20]:
DD = nx.read_gml(data_processed+"graph_data/grafo_alternativo_CG_disease_layer.gml")
node_labels_to_numeric(DD)

In [23]:
GDA = nx.read_gml(data_processed+"graph_data/gda_network.gml")
node_labels_to_numeric(GDA)

# Hubs

In [9]:
def gc_size(G):
    """Calcula el tamaño de la componente más grande del grafo"""
    return len(max(nx.connected_components(G), key=len))

In [34]:
disease_hubs = graph_node_data[(graph_node_data.node_type == "disease")].sort_values(by="degree_dd", ascending=False)
highest_ranking = disease_hubs[0:10].node_index.values

Estos son los 10 nodos enfermedad con más enlaces enfermedad-enfermedad:

In [35]:
disease_hubs[0:10]

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
21584,21799,C0158538,Other congenital anomalies of nervous system,disease,disgenet,3.0,17.0,0.0,0.0,304.0
24360,24761,C0497552,Congenital neurologic anomalies,disease,disgenet,3.0,17.0,10.0,0.0,304.0
24081,24462,C0410787,Hereditary Connective Tissue Disorder,disease,disgenet,75.0,11.0,0.0,0.0,170.0
20662,20820,C0037277,"Skin Diseases, Genetic",disease,disgenet,206.0,16.0,0.0,0.0,89.0
25804,26284,C1290884,Inflammatory disorder,disease,disgenet,514.0,9.0,0.0,0.0,78.0
19976,20104,C0025521,Inborn Errors of Metabolism,disease,disgenet,300.0,33.0,17.0,0.0,67.0
24372,24773,C0520572,Enzymopathy,disease,disgenet,382.0,36.0,11.0,0.0,66.0
19415,19529,C0018799,Heart Diseases,disease,disgenet,65.0,17.0,46.0,0.0,52.0
18404,18471,C0001418,Adenocarcinoma,disease,disgenet,60.0,2.0,116.0,0.0,51.0
31491,32197,C3898144,Neurovascular Disorder,disease,disgenet,247.0,8.0,0.0,0.0,51.0


In [36]:
G2 = G.copy()
sizes = [gc_size(G2)]
for hub in highest_ranking:
    G2.remove_node(hub)
    sizes.append(gc_size(G2))

fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la componente gigante \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

Al remover el hub más importante pierdo 1 nodos de la componente gigante 
Al remover los primeros 10, en total, pierdo 50


In [37]:
DD2 = DD.copy()
sizes = [gc_size(DD2)]
for hub in highest_ranking:
    DD2.remove_node(hub)
    sizes.append(gc_size(DD2))

fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 10 primeros hubs enfermedad - Capa Disease").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la componente gigante \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

Al remover el hub más importante pierdo 1 nodos de la componente gigante 
Al remover los primeros 10, en total, pierdo 372


In [38]:
GDA2 = GDA.copy()
sizes = [gc_size(GDA2)]
for hub in highest_ranking:
    try: 
        DD2.remove_node(hub)
        sizes.append(gc_size(GDA2))
    except:
        print(f"node {hub} not in GDA layer, skipping it")

if len(sizes) == 1:
    print(f"\nNo se consideraron hubs que participen de la capa GDA")
else:
    fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 10 primeros hubs enfermedad - Capa GDA").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
    fig.show()
    print(f"Hubs en capa GDA: {len(sizes)-1}")
    print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la componente gigante \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

node 21799 not in GDA layer, skipping it
node 24761 not in GDA layer, skipping it
node 24462 not in GDA layer, skipping it
node 20820 not in GDA layer, skipping it
node 26284 not in GDA layer, skipping it
node 20104 not in GDA layer, skipping it
node 24773 not in GDA layer, skipping it
node 19529 not in GDA layer, skipping it
node 18471 not in GDA layer, skipping it
node 32197 not in GDA layer, skipping it

No se consideraron hubs que participen de la capa GDA


In [27]:
sizes

[19902]