# Setup

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import networkx as nx
import igraph as ig 
from matplotlib_venn import venn3, venn2
import random
import plotly.io as pio

In [11]:
# A single seed shared by Python's and NumPy's global random number generators.
seed = 16
np.random.seed(seed)
random.seed(seed)

# Force matplotlib to draw white backgrounds even when the notebook theme is dark.
# NOTE(review): sns.set_style("darkgrid") below also writes style rcParams and may
# override these facecolor values — confirm the intended final look.
plt.rcParams.update({"axes.facecolor": "white", "figure.facecolor": "white"})
cmap = plt.get_cmap("tab10")
pio.templates.default = "seaborn"

sns.set_style(style="darkgrid", rc={"xtick.bottom": True})

In [12]:
# Data folders, given relative to the notebook's working directory.
data_processed = "../../data/processed/"
data_interim = "../../data/interim/"
data_external = "../../data/external/"

# Node table and edge table of the graph, plus the disease attribute table.
graph_node_data = pd.read_csv(f"{data_processed}grafo_alternativo_CG_nodos.csv")
graph_edge_data = pd.read_csv(f"{data_processed}grafo_alternativo_CG_edges.csv")
disease_attributes = pd.read_csv(f"{data_interim}grafo_alternativo_disease_attributes.csv")

# Number of nodes per node_type, plus the grand total.
node_counts = dict(graph_node_data["node_type"].value_counts())
node_counts["total"] = sum(node_counts.values())

# Number of edge rows per edge_type, halved, plus the grand total.
# NOTE(review): the halving presumably compensates for every edge being stored
# once per direction in the edge table — confirm against the file.
edge_counts = dict(graph_edge_data["edge_type"].value_counts() / 2)
edge_counts["total"] = sum(edge_counts.values())

# Show both summaries as single-column tables.
display(pd.DataFrame({"Número de nodos": node_counts}))
display(pd.DataFrame({"Número de enlaces": edge_counts}))

Unnamed: 0,Número de nodos
bert_group,1043
complex,421
disease,15066
gene_protein,17322
pathway,2017
total,35869


Unnamed: 0,Número de enlaces
disease_disease,17432.0
form_complex,1885.0
gda,84024.0
pathway_protein,42643.0
ppi,110051.0
total,256035.0


In [13]:
# Append the "bert_group" nodes (identifier columns only) to the disease attribute table.
nodos_bert = (
    graph_node_data
    .loc[
        graph_node_data["node_type"] == "bert_group",
        ["node_index", "node_id", "node_name", "node_source"],
    ]
    .copy()
)
disease_attributes = pd.concat([disease_attributes, nodos_bert])

# Keep only the rows whose node has a non-zero degree_dd, in the order given by graph_node_data.
# Note that .loc with a list of labels raises KeyError if any of them is missing from the table.
enfermedades_en_dd = graph_node_data.loc[graph_node_data["degree_dd"] != 0, "node_index"].values
disease_attributes = (
    disease_attributes
    .set_index("node_index")
    .loc[enfermedades_en_dd]
    .reset_index()
)

# Bring in the community labels of each node; how="right" keeps every row of disease_attributes.
disease_attributes = pd.merge(
    graph_node_data[["node_index", "comunidades_infomap", "comunidades_louvain"]],
    disease_attributes,
    on="node_index",
    how="right",
)

# Community sizes: number of rows of disease_attributes assigned to each community id.
tamaños_louvain = disease_attributes.comunidades_louvain.value_counts()
tamaños_infomap = disease_attributes.comunidades_infomap.value_counts()

# One row per community: its id ("comunidad", cast to int) and its size ("tamaño").
# The two columns are named explicitly through rename_axis / reset_index(name=...)
# rather than by renaming "index" and "comunidades_*": the labels that value_counts()
# puts on its result differ between pandas versions (pandas >= 2.0 names the values
# "count" and names the index after the counted column), which made the rename-based
# version mislabel the columns and fail in astype on newer pandas.
infomap_clusters = (
    tamaños_infomap
    .rename_axis("comunidad")
    .reset_index(name="tamaño")
    .astype({"comunidad": "int"})
)
louvain_clusters = (
    tamaños_louvain
    .rename_axis("comunidad")
    .reset_index(name="tamaño")
    .astype({"comunidad": "int"})
)

In [17]:
def attributes_from_pd(G:nx.Graph,df:pd.DataFrame,attributes:dict,indexcol):
    """Copy columns of a DataFrame onto the nodes of a graph as node attributes.

    Parameters
    ----------
    G : nx.Graph
        Graph whose nodes receive the attributes; it is modified in place.
    df : pd.DataFrame
        Table with one row per node. It must contain ``indexcol`` and every
        column named as a key of ``attributes``.
    attributes : dict
        Mapping ``{column_name: attribute_name}``: each listed column of ``df``
        is stored on the nodes under the given attribute name.
    indexcol
        Column of ``df`` holding the node identifiers; its values are used as
        the keys when assigning attribute values to nodes.

    Returns
    -------
    None
    """
    if not attributes:
        return

    # Re-index once: set_index builds a new frame, so doing it per attribute
    # (as before) repeated the same work on every loop iteration.
    indexed = df.set_index(indexcol)
    for column, attribute_name in attributes.items():
        # {node identifier: value} for this column, handed to networkx in one call.
        nx.set_node_attributes(G, indexed[column].to_dict(), attribute_name)

def get_node_dict(G):
    """Return a dict mapping every node of ``G`` to its attribute data."""
    node_dict = {}
    for node, data in G.nodes(data=True):
        node_dict[node] = data
    return node_dict

def get_edge_dict(G):
    """Index the edges of ``G`` by their "edge_index" attribute.

    Every edge must carry the attributes "edge_index" and "edge_type";
    a missing one raises KeyError.

    Returns a dict ``{edge_index: {"edge_type": ..., "nodes": (source, target)}}``.
    If two edges share an edge_index, the one visited last wins.
    """
    return {
        attrs["edge_index"]: {"edge_type": attrs["edge_type"], "nodes": (source, target)}
        for source, target, attrs in G.edges(data=True)
    }

def get_edges_from(node_index):
    """Return the rows of the module-level ``graph_edge_data`` whose ``x_index`` equals ``node_index``."""
    starts_at_node = graph_edge_data["x_index"] == node_index
    return graph_edge_data.loc[starts_at_node]

In [32]:
# Keep only the "gda" edges and the nodes whose source is "disgenet".
edges_gda = graph_edge_data.loc[graph_edge_data["edge_type"] == "gda"]
nodos_gda = graph_node_data.loc[graph_node_data["node_source"] == "disgenet"]

# Graph over the gda edges; nodes are labelled by the x_index / y_index values.
GDA = nx.from_pandas_edgelist(edges_gda, source="x_index", target="y_index")

# {column of graph_node_data: node attribute name in GDA}
GDA_attributes = {
    "node_type": "node_type",
    "node_name": "node_name",
    "node_id": "node_id",
    "node_source": "node_source",
    "comunidades_infomap": "id_infomap",
    "comunidades_louvain": "id_louvain",
}

attributes_from_pd(GDA, graph_node_data, GDA_attributes, "node_index")

In [29]:
# Subset of nodos_gda whose node_type is "gene_protein".
genes_gda = nodos_gda.loc[nodos_gda["node_type"] == "gene_protein"]

In [41]:
# Lookup table: node_index of each row of disease_attributes with its Infomap and Louvain community labels.
disease_cluster_map = disease_attributes[["node_index","comunidades_infomap","comunidades_louvain"]]

In [42]:
# Inspect disease_cluster_map (node_index plus both community columns).
disease_cluster_map

Unnamed: 0,node_index,comunidades_infomap,comunidades_louvain
0,7,220.0,42.0
1,17,463.0,36.0
2,19,467.0,36.0
3,28,367.0,17.0
4,88,594.0,54.0
...,...,...,...
12504,34190,348.0,30.0
12505,34191,568.0,9.0
12506,34192,236.0,42.0
12507,34194,224.0,42.0


In [44]:
# gda edge rows whose x endpoint is of type "disease".
left_disease = edges_gda.loc[edges_gda["x_type"] == "disease"]

In [52]:
# Attach the community labels of each edge's x endpoint, then drop the columns listed below.
# how="left" keeps every row of left_disease; rows whose x_index is absent from
# disease_cluster_map end up with NaN in both community columns.
gen_cluster_edges = (
    left_disease
    .merge(disease_cluster_map, left_on="x_index", right_on="node_index", how="left")
    .drop(columns=[
        "node_index",
        "y_type",
        "x_index",
        "x_type",
        "x_id",
        "edge_type",
        "edge_source",
        "edge_source_index",
        "edge_index",
    ])
)

In [61]:
# Graph built from the rows of gen_cluster_edges, linking each "y_index" value to the
# "comunidades_infomap" value found on the same row.
# NOTE(review): presumably y_index holds integer node indices while community ids are
# floats; a y_index equal to a community id (e.g. 522 and 522.0) would collapse into a
# single node, and NaN community ids would also become nodes — confirm this is intended.
gen_infomap = nx.from_pandas_edgelist(
    gen_cluster_edges,
    source="y_index",
    target="comunidades_infomap",
)
# gen_infomap_attributes = {"y_index":"gene_index","y_id":"gene_id","comunidades_infomap":"infomap_id"}

# for attribute,name in gen_infomap_attributes.items():
#     nx.set_node_attributes(gen_infomap,pd.Series(gen_cluster_edges[attribute]).to_dict(),name)

In [66]:
# Peek at one node (position 3 in iteration order) together with its attribute dict.
list(gen_infomap.nodes(data=True))[3]

(522.0, {})