# Setup

In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import random
import plotly.io as pio
import plotly.express as px

In [6]:
seed = 16
random.seed(seed)
np.random.seed(seed)

#esto es para forzar a plt a poner fondos blancos en las figuras aunque el tema del notebook sea oscuro
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
cmap = plt.get_cmap("tab10")
pio.templates.default = "seaborn"

sns.set_style("darkgrid", rc={'xtick.bottom': True})

In [135]:
graph_edge_data = pd.read_csv(graph_data+"nohub_graph_edge_data.csv")

In [137]:
graph_node_data

Unnamed: 0,node_index,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
0,0,1,A1BG,gene_protein,disgenet,,,2.0,12.0,0.0
1,19599,C0019209,Hepatomegaly,disease,disgenet,,,54.0,0.0,0.0
2,34251,R-HSA-114608,Platelet degranulation,pathway,primekg_REACTOME,,,0.0,129.0,0.0
3,653,10422,UBAC1,gene_protein,hippie,,,0.0,36.0,0.0
4,13168,63891,RNF123,gene_protein,disgenet,,,2.0,24.0,0.0
...,...,...,...,...,...,...,...,...,...,...
35834,30820,C3151471,"AMYLOID CARDIOMYOPATHY, TRANSTHYRETIN-RELATED",disease,disgenet,766.0,22.0,0.0,0.0,1.0
35835,30959,C3278148,"BERNARD-SOULIER SYNDROME, TYPE A1",disease,disgenet,1106.0,252.0,0.0,0.0,1.0
35836,33955,C4722518,Triple-Negative Breast Carcinoma,disease,disgenet,17.0,2.0,0.0,0.0,1.0
35837,32968,C4289709,DOCK8 Deficiency,disease,disgenet,127.0,17.0,0.0,0.0,1.0


In [136]:
graph_edge_data

Unnamed: 0,y_index,x_index,edge_index,x_id,y_id,x_type,y_type,edge_type,edge_source,edge_source_index
0,0,19599,376057,C0019209,1,disease,gene_protein,gda,disgenet,0.0
1,0,34251,468555,R-HSA-114608,1,pathway,gene_protein,pathway_protein,primekg,
2,0,653,10692,10422,1,gene_protein,gene_protein,ppi,hippie,357083.0
3,0,13168,248192,63891,1,gene_protein,gene_protein,ppi,hippie,358639.0
4,0,6638,110237,284403,1,gene_protein,gene_protein,ppi,hippie,297494.0
...,...,...,...,...,...,...,...,...,...,...
510047,30820,32925,461737,C4275067,C3151471,disease,disease,disease_disease,primekg,
510048,30959,30948,454819,C3277076,C3278148,disease,disease,disease_disease,primekg,
510049,33955,31617,456990,C3642347,C4722518,disease,disease,disease_disease,primekg,
510050,32968,32090,459693,C3887645,C4289709,disease,disease,disease_disease,primekg,


In [7]:
data_processed = "../../../data/processed/"
data_interim = "../../../data/interim/"
data_external = "../../../data/external/"
graph_data = data_processed + "graph_data_nohubs/"
reports_comunidades = "../../../reports/reports_nohubs/analisis_comunidades/"

graph_node_data = pd.read_csv(graph_data+"nohub_graph_node_data.csv")


disease_attributes = pd.read_csv(graph_data+"nohub_disease_attributes.csv")

node_counts = dict(graph_node_data.node_type.value_counts())
node_counts["total"] = sum(node_counts.values())

display(pd.DataFrame.from_dict({"Número de nodos":node_counts}, orient="columns"))

G = nx.read_gml(graph_data+"nohub_gda_network.gml", destringizer=int)

Unnamed: 0,Número de nodos
bert_group,1040
complex,421
disease,15039
gene_protein,17322
pathway,2017
total,35839


In [37]:
gda_dd_diseases = set(graph_node_data[(graph_node_data.degree_dd != 0)&(graph_node_data.degree_gda != 0)].node_index.values)
gda_genes = graph_node_data[(graph_node_data.degree_gda != 0 ) & (graph_node_data.node_type == "gene_protein")].node_index.values

## Participation coefficient

$P_i = 1 - \sum_{s=1}^{N_M} (\frac{k_{i_s}}{k_i})^2$

Donde $k_{i_s}$ es el número de enlaces del nodo i a nodos del módulo s (sumo sobre todos los módulos) y ${k_i}$ es el grado **total** del nodo i.

In [8]:
def get_participation_coefficient(G,node,particion,count_only_dd_nodes=True,graph_node_data=graph_node_data,set_dd_gda=gda_dd_diseases):
    vecinos = list(G.neighbors(node))
    if count_only_dd_nodes:
        vecinos = list(set(vecinos)&set_dd_gda)

    node_total_degree = len(vecinos)
    col = "comunidades_"+particion
    k_is = graph_node_data.set_index("node_index").loc[vecinos].dropna()[col].astype(int).value_counts().values

    valores_s = (k_is/node_total_degree)**2
    sumatoria = sum(valores_s)
    P_i = round(1 - sumatoria,2)

    return P_i

In [9]:
def count_neighbors_in_dd(G,node,set_dd_gda=gda_dd_diseases):
    vecinos = list(G.neighbors(node))
    in_dd = list(set(vecinos)&set_dd_gda)
    count = len(in_dd)
    ratio = count/len(vecinos)
    return count,ratio

In [55]:
infomap_participation_dict = {node:get_participation_coefficient(G,node,"infomap") for node in gda_genes}
infomap_participation_series = pd.Series(infomap_participation_dict, name="participation_infomap")

louvain_participation_dict = {node:get_participation_coefficient(G,node,"louvain") for node in gda_genes}
louvain_participation_series = pd.Series(louvain_participation_dict, name="participation_louvain")

dd_gene_degree_ratio_series = pd.Series({node:round(count_neighbors_in_dd(G,node)[1],2) for node in gda_genes}, name="dd_gda_degree_ratio")
dd_gene_degree_series = pd.Series({node:count_neighbors_in_dd(G,node)[0] for node in gda_genes}, name="dd_degree")

gda_degree_series = graph_node_data.set_index("node_index").loc[gda_genes,"degree_gda"]

results = pd.concat([infomap_participation_series,louvain_participation_series,dd_gene_degree_ratio_series,dd_gene_degree_series,gda_degree_series], axis=1)

In [113]:
results = results[results.dd_degree != 0]

In [124]:
mu = results.dd_degree.mean()
sigma = results.dd_degree.std()

results["z_score"] = round((results.dd_degree - mu)/sigma,2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [125]:
results

Unnamed: 0,participation_infomap,participation_louvain,dd_gda_degree_ratio,dd_degree,degree_gda,z_score
0,0.00,0.00,0.50,1,2.0,-0.51
13168,0.00,0.00,1.00,2,2.0,-0.40
6638,0.79,0.79,0.64,9,14.0,0.37
569,0.00,0.00,0.20,1,5.0,-0.51
18356,0.00,0.00,0.25,1,4.0,-0.51
...,...,...,...,...,...,...
18373,0.50,0.00,1.00,2,2.0,-0.40
18375,0.00,0.00,0.17,1,6.0,-0.51
18384,0.00,0.00,0.33,1,3.0,-0.51
18395,0.00,0.00,1.00,1,1.0,-0.51


In [127]:
fig = px.scatter(results, x="dd_degree", y="participation_infomap",text=results.index, width=1000, height=600, marginal_x="histogram",marginal_y="histogram", title="Participation vs degree dd - Infomap")
# fig.update_layout(xaxis_title="Participation Coefficient - P", yaxis_title="Within-module degree - Z")

fig.show()

In [128]:
fig = px.scatter(results, x="dd_degree", y="participation_louvain",text=results.index, width=1000, height=600, marginal_x="histogram",marginal_y="histogram", title="Participation vs degree dd - Louvain")
# fig.update_layout(xaxis_title="Participation Coefficient - P", yaxis_title="Within-module degree - Z")

fig.show()

In [140]:
node = 14317
vecinos = list(G.neighbors(node))
aver = graph_node_data.set_index("node_index").loc[vecinos]
aver[aver.degree_dd != 0]

Unnamed: 0_level_0,node_id,node_name,node_type,node_source,comunidades_infomap,comunidades_louvain,degree_gda,degree_pp,degree_dd
node_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20754,C0036341,Schizophrenia,disease,disgenet,0.0,0.0,883.0,0.0,1.0
29460,C2239176,Liver carcinoma,disease,disgenet,58.0,3.0,507.0,0.0,1.0
24165,C0376358,Malignant neoplasm of prostate,disease,disgenet,3.0,2.0,616.0,0.0,6.0
18773,C0006142,Malignant neoplasm of breast,disease,disgenet,11.0,3.0,1074.0,0.0,7.0
18732,C0005695,Bladder Neoplasm,disease,disgenet,28.0,3.0,140.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...
22049,C0206681,"Adenocarcinoma, Clear Cell",disease,disgenet,61.0,3.0,2.0,0.0,7.0
28744,C1859972,"ADRENOCORTICAL CARCINOMA, HEREDITARY",disease,disgenet,581.0,3.0,1.0,0.0,4.0
24507,C0431109,Choroid Plexus Carcinoma,disease,disgenet,8.0,3.0,1.0,0.0,3.0
18696,C0004698,Balkan Nephropathy,disease,disgenet,803.0,60.0,1.0,0.0,3.0


In [141]:
graph_node_data.set_index("node_index").loc[node]

node_id                        7157
node_name                      TP53
node_type              gene_protein
node_source                disgenet
comunidades_infomap             NaN
comunidades_louvain             NaN
degree_gda                    232.0
degree_pp                     523.0
degree_dd                       0.0
Name: 14317, dtype: object