In [1]:
import itertools

import numpy as np
import pandas as pd
import networkx as nx
import obnb.data
from obnb.dataset import OpenBiomedNetBench
from tqdm import tqdm

# Raise the pandas display row limit to 500 so the summary tables built in
# this notebook are rendered without truncation.
pd.set_option("display.max_rows", 500)

In [2]:
# Format string handed to tqdm via `bar_format=` for every progress bar in
# this notebook. NOTE(review): `{bar:40}` presumably pins the bar to 40
# characters and `{bar:-40b}` pads the remainder with blanks -- confirm
# against the tqdm documentation.
BARFMT = "{l_bar}{bar:40}{r_bar}{bar:-40b}"
# When True, the summary cells print the table's LaTeX source instead of
# displaying the DataFrame as the cell output.
PRINT_LATEX_TABLE = False

In [3]:
# Names of the network classes to look up on `obnb.data`; the order here is
# the row order of the resulting summary table.
networks = [
    "HumanBaseTopGlobal",
    "HuMAP",
    "STRING",
    "ConsensusPathDB",
    "FunCoup",
    "PCNet",
    "BioGRID",
    "HumanNet",
    "ComPPIHumanInt",
    "HIPPIE",
    "BioPlex",
    "HuRI",
    "OmniPath",
    "ProteomeHD",
    "SIGNOR",
]

# One summary record (a plain dict) is accumulated per network.
network_stats_list = []
for network in tqdm(networks, bar_format=BARFMT):
    # Resolve the network class by name, then instantiate it against the
    # shared dataset directory.
    network_cls = getattr(obnb.data, network)
    g = network_cls("../datasets", log_level="WARNING")

    # Edge count is the total number of entries across all per-node edge
    # containers in `g.edge_data`.
    num_nodes = g.size
    num_edges = sum(len(node_edges) for node_edges in g.edge_data)
    # Edge count divided by the number of ordered node pairs, n * (n - 1).
    density = num_edges / num_nodes / (num_nodes - 1)

    # Counts get thousands separators; density is fixed to six decimals.
    row = {
        "Network": network,
        "Weighted": g.weighted,
        "Num. nodes": f"{num_nodes:,}",
        "Num. edges": f"{num_edges:,}",
        "Density": f"{density:.6f}",
    }
    network_stats_list.append(row)

network_stats_df = pd.DataFrame(network_stats_list)
network_stats_df = network_stats_df.set_index("Network")
# Kept as a single conditional *expression* so that, when LaTeX printing is
# off, the DataFrame is the cell's value and is rendered by the notebook.
print(network_stats_df.style.to_latex()) if PRINT_LATEX_TABLE else network_stats_df

100%|████████████████████████████████████████| 15/15 [03:05<00:00, 12.35s/it]                                                                                               


Unnamed: 0_level_0,Weighted,Num. nodes,Num. edges,Density
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HumanBaseTopGlobal,True,25689,77807094,0.117908
HuMAP,True,15433,35052604,0.14718
STRING,True,18480,11019492,0.032269
ConsensusPathDB,True,17735,10611416,0.033739
FunCoup,True,17892,10037478,0.031357
PCNet,False,18544,5365116,0.015603
BioGRID,False,19765,1554790,0.00398
HumanNet,True,18591,2250780,0.006513
ComPPIHumanInt,True,17015,699620,0.002417
HIPPIE,True,19338,1542044,0.004124


In [4]:
# Label collection names; each one is paired with every entry of `networks`
# to build one benchmark dataset.
labels = [
    "DISEASES",
    "DisGeNET",
    "GOBP",
    "GOCC",
    "GOMF",
]

# One summary record (a plain dict) is accumulated per (network, label) pair.
dataset_stats_list = []
for network, label in tqdm(list(itertools.product(networks, labels)), bar_format=BARFMT):
    try:
        dataset = OpenBiomedNetBench(
            "../datasets",
            network,
            label,
            version="current",
            log_level="WARNING",
        )
    except ValueError:
        if network == "ProteomeHD" and label == "GOMF":
            # This is the one combination singled out as an expected failure.
            # Skip the iteration entirely: falling through would reuse the
            # `dataset` left over from the previous pair and record its
            # statistics under the wrong (network, label).
            continue
        # Any other failure is unexpected; surface it instead of silently
        # emitting a row built from stale data.
        raise

    # Number of labelsets (tasks) and the size of each one.
    label_ids = dataset.label.label_ids
    num_tasks = len(label_ids)
    task_sizes = np.array([len(dataset.label.get_labelset(i)) for i in label_ids])

    dataset_stats_list.append(
        {
            "Network": network,
            "Label": label,
            "Num. tasks": num_tasks,
            "Num. pos. avg.": f"{task_sizes.mean():.1f}",
            "Num. pos. std.": f"{task_sizes.std():.1f}",
            "Num. pos. med.": f"{np.median(task_sizes):.1f}",
        }
    )

dataset_stats_df = pd.DataFrame(dataset_stats_list).set_index(["Label", "Network"]).sort_index()
# Print the LaTeX of *this* table (the original printed network_stats_df here).
# Kept as a single conditional expression so the DataFrame is rendered as the
# cell output when LaTeX printing is off.
print(dataset_stats_df.style.to_latex()) if PRINT_LATEX_TABLE else dataset_stats_df

100%|████████████████████████████████████████| 75/75 [26:36<00:00, 21.28s/it]                                                                                               


Unnamed: 0_level_0,Unnamed: 1_level_0,Num. tasks,Num. pos. avg.,Num. pos. std.,Num. pos. med.
Label,Network,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DISEASES,BioGRID,145,178.1,137.4,127.0
DISEASES,BioPlex,72,123.8,64.4,101.5
DISEASES,ComPPIHumanInt,145,174.6,134.5,125.0
DISEASES,ConsensusPathDB,144,177.4,137.5,126.0
DISEASES,FunCoup,145,177.1,135.1,127.0
DISEASES,HIPPIE,143,178.1,137.6,127.0
DISEASES,HuMAP,123,168.0,119.2,120.0
DISEASES,HuRI,50,130.3,56.7,112.5
DISEASES,HumanBaseTopGlobal,149,178.5,137.7,129.0
DISEASES,HumanNet,142,179.0,136.9,127.0
