# Setup

In [2]:
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import seaborn as sns

In [3]:
# Seed both random number generators used in the notebook for reproducibility.
seed = 16
np.random.seed(seed)
random.seed(seed)

# Force matplotlib to draw figures on a white background even when the
# notebook theme is dark.
# NOTE(review): sns.set_style below runs after these settings and may override
# some of them — confirm the resulting facecolors are the intended ones.
plt.rcParams.update({'axes.facecolor': 'white', 'figure.facecolor': 'white'})
cmap = plt.get_cmap("tab10")
pio.templates.default = "seaborn"

sns.set_style("darkgrid", rc={'xtick.bottom': True})

In [25]:
# Data folders, relative to this notebook.
data_processed = "../../data/processed/"
data_interim = "../../data/interim/"
data_external = "../../data/external/"

# Node table, edge table (its "relation" column is exposed as "edge_type")
# and the per-disease attribute table.
graph_node_data = pd.read_csv(f"{data_interim}grafo_alternativo_nodos.csv")
graph_edge_data = pd.read_csv(f"{data_interim}grafo_alternativo_enlaces.csv")
graph_edge_data = graph_edge_data.rename(columns={"relation":"edge_type"})
disease_attributes = pd.read_csv(f"{data_interim}grafo_alternativo_disease_attributes.csv")

# Number of nodes per node type, plus the grand total.
node_counts = dict(graph_node_data["node_type"].value_counts())
node_counts["total"] = sum(node_counts.values())

# Number of edges per edge type, plus the grand total. Row counts are halved —
# presumably every undirected edge is stored once per direction; confirm
# against the edge file.
edge_counts = dict(graph_edge_data["edge_type"].value_counts() / 2)
edge_counts["total"] = sum(edge_counts.values())

# Rows describing diseases, either individual ones or BERT groups.
enfermedades = graph_node_data[graph_node_data.node_type.isin(["disease", "bert_group"])]

display(pd.DataFrame.from_dict({"Número de nodos":node_counts}, orient="columns"))
display(pd.DataFrame.from_dict({"Número de enlaces":edge_counts}, orient="columns"))

Unnamed: 0,Número de nodos
bert_group,1067
complex,422
disease,15766
gene_protein,17363
pathway,2020
total,36638


Unnamed: 0,Número de enlaces
disease_disease,17488.0
form_complex,1888.0
gda,84038.0
pathway_protein,42646.0
ppi,110062.0
total,256122.0


# Componente Gigante

Me quedo solo con la componente gigante del grafo completo. 

In [9]:
def attributes_from_pd(G:nx.Graph,df:pd.DataFrame,attributes:dict,indexcol):
    """Copy columns of ``df`` onto the nodes of ``G`` as node attributes.

    ``attributes`` has the form ``{column_name: attribute_name}``: each listed
    column of ``df`` is stored on the nodes under the given attribute name.
    ``indexcol`` is the column of ``df`` holding the node identifiers used as
    keys when assigning the values. ``G`` is modified in place and nothing is
    returned.
    """
    for column, attr_name in attributes.items():
        # {node identifier: value} for this column.
        values_by_node = df.set_index(indexcol)[column].to_dict()
        nx.set_node_attributes(G, values_by_node, attr_name)

def get_node_dict(G):
    """Return a ``{node: attribute_dict}`` mapping built from ``G.nodes(data=True)``."""
    node_dict = {}
    for node_id, attrs in G.nodes(data=True):
        node_dict[node_id] = attrs
    return node_dict

def get_edge_dict(G):
    """Index the edges of ``G`` by their ``edge_index`` attribute.

    Returns ``{edge_index: {"edge_type": ..., "nodes": (source, target)}}``.
    Every edge must carry ``edge_index`` and ``edge_type`` attributes,
    otherwise a ``KeyError`` is raised.
    """
    return {
        attrs["edge_index"]: {"edge_type": attrs["edge_type"], "nodes": (source, target)}
        for source, target, attrs in G.edges(data=True)
    }

def get_edges_from(node_index):
    """Return the rows of the global ``graph_edge_data`` whose ``x_index`` equals ``node_index``."""
    is_source = graph_edge_data.x_index == node_index
    return graph_edge_data[is_source]

In [26]:
# Build the full graph from the edge table; each edge keeps its type and index.
G_full = nx.from_pandas_edgelist(graph_edge_data,source="x_index",target="y_index", edge_attr=["edge_type","edge_index"])
G_attributes = {"node_type":"node_type","node_name":"node_name","node_id":"node_id","node_source":"node_source"}
attributes_from_pd(G_full,graph_node_data,G_attributes,"node_index")

# Keep only the giant (largest) connected component.
Gcc = sorted(nx.connected_components(G_full), key=len, reverse=True)
G = G_full.subgraph(Gcc[0]).copy()

# Drop from the dataframes the rows that fell outside the giant component.
nodos_en_cg = list(G.nodes())
# Select nodes through the node_index column. A plain `.loc[nodos_en_cg]`
# looks rows up by row label, which only matches node_index while the frame
# still has its original index; after the reset_index below (e.g. when this
# cell is re-run) it would fail or pick the wrong rows.
graph_node_data = graph_node_data[graph_node_data.node_index.isin(nodos_en_cg)].sort_values(by="node_index").reset_index(drop=True)

# Keep an edge only when both of its endpoints are in the giant component.
graph_edge_data = graph_edge_data.set_index("x_index").loc[nodos_en_cg].reset_index()
graph_edge_data = graph_edge_data.set_index("y_index").loc[nodos_en_cg].reset_index()

# Recompute the per-type counts on the filtered tables.
node_counts = dict(graph_node_data.node_type.value_counts())
node_counts["total"] = sum(node_counts.values())

edge_counts = dict(graph_edge_data.edge_type.value_counts()/2)
edge_counts["total"] = sum(edge_counts.values())

# graph_node_data.to_csv(data_interim+"grafo_alternativo_CG_nodos.csv",index=False)
# graph_edge_data.to_csv(data_interim+"grafo_alternativo_CG_edges.csv",index=False)

display(pd.DataFrame.from_dict({"Número de nodos":node_counts}, orient="columns"))
display(pd.DataFrame.from_dict({"Número de enlaces":edge_counts}, orient="columns"))

Unnamed: 0,Número de nodos
bert_group,1043
complex,421
disease,15066
gene_protein,17322
pathway,2017
total,35869


Unnamed: 0,Número de enlaces
disease_disease,17432.0
form_complex,1885.0
gda,84024.0
pathway_protein,42643.0
ppi,110051.0
total,256035.0


# Armo subredes

Armo subgrafo de enfermedades

In [10]:
# Disease-disease edges only.
edges_enfermedades = graph_edge_data.loc[graph_edge_data.edge_type.eq("disease_disease")]

# Disease-like nodes: individual diseases, BERT groups, and both together.
nodos_enfermedades = graph_node_data[graph_node_data.node_type.isin(["disease", "bert_group"])]
nodos_disease = graph_node_data.loc[graph_node_data.node_type.eq("disease")]
nodos_bert = graph_node_data.loc[graph_node_data.node_type.eq("bert_group")]

# Tag every row of the attribute table as a disease and append the bert_group
# node rows so both node kinds can be looked up in the same table.
# NOTE: this rebinds disease_attributes, so re-running the cell appends the
# bert_group rows again.
disease_attributes["node_type"] = "disease"
disease_attributes = pd.concat([disease_attributes, nodos_bert])

# Disease layer graph, with its node attributes taken from the table above.
DD = nx.from_pandas_edgelist(edges_enfermedades, source="x_index", target="y_index")
DD_attributes = {
    column: column
    for column in (
        "node_name",
        "node_type",
        "node_id",
        "node_source",
        "disgenet_type",
        "diseaseClassMSH",
        "diseaseClassNameMSH",
    )
}
attributes_from_pd(DD, disease_attributes, DD_attributes, "node_index")

Armo subgrafos de proteínas y de asociaciones GDA

In [11]:
# Protein layer: ppi, complex-formation and pathway-protein edges, together
# with the gene/protein, complex and pathway nodes.
edges_proteinas = graph_edge_data[graph_edge_data.edge_type.isin(["ppi", "form_complex", "pathway_protein"])]
nodos_proteinas = graph_node_data[graph_node_data.node_type.isin(["gene_protein", "complex", "pathway"])]
PPI = nx.from_pandas_edgelist(edges_proteinas, source="x_index", target="y_index")
PPI_attributes = {column: column for column in ("node_type", "node_name", "node_id", "node_source")}

# GDA layer: gene-disease association edges and the nodes sourced from disgenet.
edges_gda = graph_edge_data.loc[graph_edge_data.edge_type.eq("gda")]
nodos_gda = graph_node_data.loc[graph_node_data.node_source.eq("disgenet")]

GDA = nx.from_pandas_edgelist(edges_gda, source="x_index", target="y_index")
GDA_attributes = {column: column for column in ("node_type", "node_name", "node_id", "node_source")}

# Attach the node attributes to both layers.
for network, attr in ((PPI, PPI_attributes), (GDA, GDA_attributes)):
    attributes_from_pd(network, graph_node_data, attr, "node_index")

In [29]:
# Node and edge counts of the full graph and of each layer, one column per graph.
graphs = {
    "Grafo completo": G,
    "Disease layer": DD,
    "Protein layer": PPI,
    "GDA": GDA,
}
stats = {
    layer_name: {
        "Número de nodos": layer.number_of_nodes(),
        "Número de enlaces": layer.number_of_edges(),
    }
    for layer_name, layer in graphs.items()
}
display(pd.DataFrame(stats))

Unnamed: 0,Grafo completo,Disease layer,Protein layer,GDA
Número de nodos,35869,12509,18525,20856
Número de enlaces,256035,17432,154579,84024


# Remuevo Hubs Enfermedad

In [51]:
def get_degree_series(G,name):
    """Return the degree of every node of ``G`` as a Series named ``name``, indexed by node."""
    return pd.Series(dict(G.degree()), name=name)

def gc_size(G):
    """Return the number of nodes in the largest connected component of ``G``."""
    return max(map(len, nx.connected_components(G)))

In [60]:
# Degree of every node inside the disease-disease layer.
dd_degree = get_degree_series(DD,"degree_dd")
# Look the degree up by node_index; nodes that are absent from the layer get 0.
# Only the new column is filled, so missing values in other columns are left
# untouched, and re-running the cell overwrites degree_dd instead of creating
# suffixed duplicate columns as a repeated merge would.
graph_node_data = graph_node_data.assign(degree_dd=graph_node_data.node_index.map(dd_degree).fillna(0))

In [63]:
# Degree of every node inside the GDA layer.
gda_degree = get_degree_series(GDA,"degree_gda")
# Look the degree up by node_index; nodes that are absent from the layer get 0.
# Only the new column is filled, so missing values in other columns are left
# untouched, and re-running the cell overwrites degree_gda instead of creating
# suffixed duplicate columns as a repeated merge would.
graph_node_data = graph_node_data.assign(degree_gda=graph_node_data.node_index.map(gda_degree).fillna(0))

In [64]:
# Disease nodes ranked by their degree in the disease-disease layer, highest first.
disease_hubs = graph_node_data.loc[graph_node_data.node_type.eq("disease")].sort_values("degree_dd", ascending=False)
# node_index of the ten top-ranked diseases.
highest_ranking = disease_hubs.head(10)["node_index"].values

Estos son los 10 nodos enfermedad con más enlaces enfermedad-enfermedad:

In [65]:
disease_hubs.head(10)

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,degree_dd,degree_gda
21584,21799,C0158538,Other congenital anomalies of nervous system,disease,disgenet,304.0,0.0
24360,24761,C0497552,Congenital neurologic anomalies,disease,disgenet,304.0,10.0
24081,24462,C0410787,Hereditary Connective Tissue Disorder,disease,disgenet,170.0,0.0
20662,20820,C0037277,"Skin Diseases, Genetic",disease,disgenet,89.0,0.0
25804,26284,C1290884,Inflammatory disorder,disease,disgenet,78.0,0.0
19976,20104,C0025521,Inborn Errors of Metabolism,disease,disgenet,67.0,17.0
24372,24773,C0520572,Enzymopathy,disease,disgenet,66.0,11.0
19415,19529,C0018799,Heart Diseases,disease,disgenet,52.0,46.0
18404,18471,C0001418,Adenocarcinoma,disease,disgenet,51.0,116.0
31491,32197,C3898144,Neurovascular Disorder,disease,disgenet,51.0,0.0


In [66]:
top_10 = disease_hubs.iloc[:10]
# Hubs to remove: the top-10 diseases without any GDA edge, plus node 24761
# ("Congenital neurologic anomalies" in the table above), which does have GDA
# edges but is added by hand as an outlier.
remove_list = [*top_10.loc[top_10.degree_gda.eq(0), "node_index"].values, 24761]

In [67]:
top_10.loc[top_10.degree_gda.eq(0)]

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,degree_dd,degree_gda
21584,21799,C0158538,Other congenital anomalies of nervous system,disease,disgenet,304.0,0.0
24081,24462,C0410787,Hereditary Connective Tissue Disorder,disease,disgenet,170.0,0.0
20662,20820,C0037277,"Skin Diseases, Genetic",disease,disgenet,89.0,0.0
25804,26284,C1290884,Inflammatory disorder,disease,disgenet,78.0,0.0
31491,32197,C3898144,Neurovascular Disorder,disease,disgenet,51.0,0.0


In [68]:
def get_edges_from(node_index):
    """Return the rows of the global ``graph_edge_data`` whose ``x_index`` equals ``node_index``.

    NOTE: a function with this same name is also defined in the helper cell of
    the "Componente Gigante" section; this later definition shadows it.
    """
    return graph_edge_data.loc[graph_edge_data["x_index"].eq(node_index)]

def get_neighbors_data(node_index):
    """Return the node-table rows of the targets of every edge leaving ``node_index``.

    Rows come from the global ``graph_node_data`` and are indexed by
    ``node_index``, one row per outgoing edge.
    """
    neighbor_ids = get_edges_from(node_index).y_index.values
    nodes_by_index = graph_node_data.set_index("node_index")
    return nodes_by_index.loc[neighbor_ids]

In [69]:
# Remove the ten top-ranked disease hubs one at a time from a copy of the full
# graph, recording the size of the largest component after each removal.
G2 = G.copy()
sizes = [gc_size(G2)]
for hub in highest_ranking:
    G2.remove_node(hub)
    sizes.append(gc_size(G2))

# Nodes lost at each step: size before the removal minus size after it.
size_difference = [before - after for before, after in zip(sizes, sizes[1:])]

fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

# This figure shows the nodes lost per removal, so its title says so (it used
# to repeat the title of the size plot above).
fig2 = px.scatter(x=np.arange(len(sizes)-1),y=size_difference,width=800, height=400, title="Nodos perdidos de la componente gigante al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Nodos perdidos")
fig2.show()

print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la componente gigante \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

Al remover el hub más importante pierdo 1 nodos de la componente gigante 
Al remover los primeros 10, en total, pierdo 50


In [70]:
# Same experiment on the disease-disease layer: remove the ten top-ranked
# disease hubs one at a time and record the largest-component size each time.
DD2 = DD.copy()
sizes = [gc_size(DD2)]
for hub in highest_ranking:
    DD2.remove_node(hub)
    sizes.append(gc_size(DD2))

# Nodes lost at each step: size before the removal minus size after it.
size_difference = [before - after for before, after in zip(sizes, sizes[1:])]

fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de capa disease-disease al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

# This figure shows the nodes lost per removal in the disease-disease layer,
# so its title says so (it used to carry the title of the full-graph size plot).
fig2 = px.scatter(x=np.arange(len(sizes)-1),y=size_difference,width=800, height=400, title="Nodos perdidos de la capa disease-disease al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Nodos perdidos")
fig2.show()

print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la red disease-disease \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

Al remover el hub más importante pierdo 1 nodos de la red disease-disease 
Al remover los primeros 10, en total, pierdo 372


In [301]:
# Same experiment on the GDA layer. Hubs without GDA edges are not nodes of
# this layer, so they are reported and skipped.
GDA2 = GDA.copy()
sizes = [gc_size(GDA2)]
for hub in highest_ranking:
    # Explicit membership test instead of a bare `except:`, so that any other
    # error raised while removing the node or measuring the graph is not hidden.
    if hub not in GDA2:
        print(f"node {hub} not in GDA layer, trying next node")
        continue
    GDA2.remove_node(hub)
    sizes.append(gc_size(GDA2))

if len(sizes) == 1:
    print(f"\nNo se consideraron hubs que participen de la capa GDA")
else:
    # Nodes lost at each step: size before the removal minus size after it.
    size_difference = [before - after for before, after in zip(sizes, sizes[1:])]

    fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 10 primeros hubs enfermedad - Capa GDA").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
    fig.show()

    # This figure shows the nodes lost per removal in the GDA layer, so its
    # title says so (it used to carry the title of the full-graph size plot).
    fig2 = px.scatter(x=np.arange(len(sizes)-1),y=size_difference,width=800, height=400, title="Nodos perdidos de la capa GDA al remover los 10 primeros hubs enfermedad").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Nodos perdidos")
    fig2.show()

    print(f"Hubs en capa GDA: {len(sizes)-1}")
    print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la capa GDA \nAl remover los primeros 10, en total, pierdo {sizes[0] - sizes[-1]}")

node 21799 not in GDA layer, trying next node
node 24462 not in GDA layer, trying next node
node 20820 not in GDA layer, trying next node
node 26284 not in GDA layer, trying next node
node 32197 not in GDA layer, trying next node


Hubs en capa GDA: 5
Al remover el hub más importante pierdo 1 nodos de la capa GDA 
Al remover los primeros 10, en total, pierdo 8


# De los top 10, saco los hubs sin evidencia GDA (+ un outlier que sí tiene evidencia: el nodo 24761, "Congenital neurologic anomalies")

In [72]:
# Remove the hubs in remove_list one at a time from a copy of the full graph,
# recording the size of the largest component after each removal.
G2 = G.copy()
sizes = [gc_size(G2)]
for hub in remove_list:
    G2.remove_node(hub)
    sizes += [gc_size(G2)]

fig = px.scatter(
    x=np.arange(len(sizes)),
    y=sizes,
    width=800,
    height=400,
    title="Tamaño de componente gigante al remover los 6 hubs enfermedad",
)
fig.update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

print(
    f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la componente gigante \n"
    f"Al remover toda la lista, en total, pierdo {sizes[0] - sizes[-1]}"
)

Al remover el hub más importante pierdo 1 nodos de la componente gigante 
Al remover toda la lista, en total, pierdo 30


In [303]:
# Remove the hubs in remove_list one at a time from a copy of the
# disease-disease layer, recording the largest-component size each time.
DD2 = DD.copy()
sizes = [gc_size(DD2)]
for hub in remove_list:
    DD2.remove_node(hub)
    sizes += [gc_size(DD2)]

fig = px.scatter(
    x=np.arange(len(sizes)),
    y=sizes,
    width=800,
    height=400,
    title="Tamaño de capa disease-disease al remover los 6 hubs enfermedad",
)
fig.update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
fig.show()

print(
    f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la red disease-disease \n"
    f"Al remover toda la lista, en total, pierdo {sizes[0] - sizes[-1]}"
)

Al remover el hub más importante pierdo 1 nodos de la red disease-disease 
Al remover toda la lista, en total, pierdo 275


In [304]:
# Remove the hubs in remove_list from a copy of the GDA layer. Hubs without
# GDA edges are not nodes of this layer, so they are reported and skipped.
GDA2 = GDA.copy()
sizes = [gc_size(GDA2)]
for hub in remove_list:
    # Explicit membership test instead of a bare `except:`, so that any other
    # error raised while removing the node or measuring the graph is not hidden.
    if hub not in GDA2:
        print(f"node {hub} not in GDA layer, trying next node")
        continue
    GDA2.remove_node(hub)
    sizes.append(gc_size(GDA2))

if len(sizes) == 1:
    print(f"\nNo se consideraron hubs que participen de la capa GDA")
else:

    fig = px.scatter(x=np.arange(len(sizes)),y=sizes,width=800, height=400, title="Tamaño de componente gigante al remover los 6 hubs enfermedad - Capa GDA").update_layout(xaxis_title="Ranking de nodo", yaxis_title="Tamaño CG")
    fig.show()

    print(f"Hubs en capa GDA: {len(sizes)-1}")
    # The hubs removed here come from remove_list, not from the top-10 ranking,
    # so the summary says "toda la lista" like the two sibling cells do.
    print(f"Al remover el hub más importante pierdo {sizes[0]-sizes[1]} nodos de la capa GDA \nAl remover toda la lista, en total, pierdo {sizes[0] - sizes[-1]}")

node 21799 not in GDA layer, trying next node
node 24462 not in GDA layer, trying next node
node 20820 not in GDA layer, trying next node
node 26284 not in GDA layer, trying next node
node 32197 not in GDA layer, trying next node


Hubs en capa GDA: 1
Al remover el hub más importante pierdo 1 nodos de la capa GDA 
Al remover los primeros 10, en total, pierdo 1


# Saco los nodos y me guardo los datos del nuevo grafo

In [75]:
# Copy of the graph without the selected hubs.
G_sin_hubs = G.copy()
G_sin_hubs.remove_nodes_from(remove_list)

# Keep the largest connected component of what is left.
Gcc = sorted(nx.connected_components(G_sin_hubs), key=len, reverse=True)
H = G_sin_hubs.subgraph(Gcc[0]).copy()

# Restrict the node and edge tables to the nodes of the new giant component;
# an edge is kept only when both of its endpoints are still present.
nodos_en_cg = list(H)
new_graph_node_data = (
    graph_node_data
    .set_index("node_index")
    .loc[nodos_en_cg]
    .reset_index()
)

new_graph_edge_data = (
    graph_edge_data
    .set_index("x_index").loc[nodos_en_cg].reset_index()
    .set_index("y_index").loc[nodos_en_cg].reset_index()
)

# Per-type node and edge counts of the hub-free graph, plus totals.
new_node_counts = dict(new_graph_node_data["node_type"].value_counts())
new_node_counts["total"] = sum(new_node_counts.values())

new_edge_counts = dict(new_graph_edge_data["edge_type"].value_counts() / 2)
new_edge_counts["total"] = sum(new_edge_counts.values())

display(pd.DataFrame.from_dict({"Número de nodos":new_node_counts}, orient="columns"))
display(pd.DataFrame.from_dict({"Número de enlaces":new_edge_counts}, orient="columns"))

Unnamed: 0,Número de nodos
bert_group,1040
complex,421
disease,15039
gene_protein,17322
pathway,2017
total,35839


Unnamed: 0,Número de enlaces
disease_disease,16433.0
form_complex,1885.0
gda,84014.0
pathway_protein,42643.0
ppi,110051.0
total,255026.0


In [306]:
# Per-type counts of the graph before removing hubs, recomputed so they can be
# compared with the hub-free counts.
node_counts = dict(graph_node_data["node_type"].value_counts())
node_counts["total"] = sum(node_counts.values())

edge_counts = dict(graph_edge_data["edge_type"].value_counts() / 2)
edge_counts["total"] = sum(edge_counts.values())

display(pd.DataFrame.from_dict({"Número de nodos":node_counts}, orient="columns"))
display(pd.DataFrame.from_dict({"Número de enlaces":edge_counts}, orient="columns"))

Unnamed: 0,Número de nodos
bert_group,1043
complex,421
disease,15066
gene_protein,17322
pathway,2017
total,35869


Unnamed: 0,Número de enlaces
disease_disease,17432.0
form_complex,1885.0
gda,84024.0
pathway_protein,42643.0
ppi,110051.0
total,256035.0


In [307]:
# Nodes lost per node type: counts before hub removal minus counts after it.
nodos_perdidos = pd.DataFrame({"Número de nodos": node_counts}).sub(
    pd.DataFrame({"Número de nodos": new_node_counts})
)
nodos_perdidos

Unnamed: 0,Número de nodos
bert_group,3
complex,0
disease,27
gene_protein,0
pathway,0
total,30


In [78]:
# Edges lost per edge type: counts before hub removal minus counts after it.
enlaces_perdidos = pd.DataFrame({"Número de enlaces": edge_counts}).sub(
    pd.DataFrame({"Número de enlaces": new_edge_counts})
)
enlaces_perdidos

Unnamed: 0,Número de enlaces
disease_disease,999.0
form_complex,0.0
gda,10.0
pathway_protein,0.0
ppi,0.0
total,1009.0


Guardo los datos del grafo procesado

In [93]:
# The degree columns were only needed to pick the hubs; drop them before export.
# errors="ignore" keeps the cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
new_graph_node_data = new_graph_node_data.drop(columns=["degree_gda","degree_dd"], errors="ignore")

# new_graph_node_data.to_csv(data_processed+"graph_data_nohubs/nohub_graph_nodes.csv",index=False)
# new_graph_edge_data.to_csv(data_processed+"graph_data_nohubs/nohub_graph_edge_data.csv",index=False)
# nx.write_gml(H,data_processed+"graph_data_nohubs/nohub_fullgraph.gml")

Actualizo los atributos

In [313]:
# Disabled export (kept commented out): re-reads the original disease attribute
# table, keeps only the rows of the disease nodes present in
# new_graph_node_data, and writes them to the processed-data folder.
# old_disease_attributes = pd.read_csv(data_interim+"grafo_alternativo_disease_attributes.csv")
# disease_node_index = new_graph_node_data.loc[new_graph_node_data.node_type == "disease","node_index"].values
# disease_attributes = old_disease_attributes.set_index("node_index").loc[disease_node_index].reset_index()
# disease_attributes.to_csv(data_processed+"graph_data_nohubs/nohub_disease_attributes.csv",index=False)