In [6]:
import torch_geometric.datasets as datasets
from pathlib import Path

def get_dataset_from_name(name: str):
    """
    Get dataset from name.
    :param name: name of dataset
    :return: dataset
    """
    data_dir = Path("data") / "downloaded_raw_data"

    dataset_dict = {
        "karate": datasets.KarateClub(),  # nodes: 34,  edges: 156,  avg(degree): 9.18, https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.KarateClub.html#torch_geometric.datasets.KarateClub
        "airports": datasets.Airports(
            root=data_dir, name="Europe"
        ),  # nodes: 1190,  edges: 13599,  avg(degree): 22.86, https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.Airports.html#torch_geometric.datasets.Airports
        "wiki": datasets.AttributedGraphDataset(
            root=data_dir, name="Wiki"
        ),  # nodes: 2405,  edges: 17981,  avg(degree): 13.74, https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.AttributedGraphDataset.html#torch_geometric.datasets.AttributedGraphDataset
        "facebook": datasets.AttributedGraphDataset(
            root=data_dir, name="Facebook"
        ),  # nodes: 4039,  edges: 88234,  avg(degree): 43.69, https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.AttributedGraphDataset.html#torch_geometric.datasets.AttributedGraphDataset
        "actor": datasets.Actor(
            root=data_dir / "actor"
        ),  # nodes: 7600,  edges: 30019,  avg(degree): 07.90, https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.Actor.html#torch_geometric.datasets.Actor
        "github": datasets.GitHub(
            root=data_dir / "github"
        ),  # nodes: 37700, edges: 578006, avg(degree): 30.66, https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.GitHub.html#torch_geometric.datasets.GitHub
    }

    if name.lower() not in dataset_dict:
        raise ValueError(f"Dataset {name} not found.")
    else:
        return dataset_dict[name.lower()]

In [7]:
import networkx as nx
from torch_geometric.utils.convert import to_networkx
import numpy as np
import pandas as pd

In [25]:
df = pd.DataFrame(columns=["Dataset","Nodes","Edges","Density","Avg. Degree","Avg. Clustering Coefficient","Avg. Shortest Path Length","Diameter","Avg. Degree Centrality"])
for dataset_name in ["karate","airports","wiki","facebook","actor","github"]:
    print(f"Dataset: {dataset}")
    dataset = get_dataset_from_name(dataset_name)
    for i in range(len(dataset)):
        # transform to networkx graph
        graph = to_networkx(
            dataset[i], to_undirected=False, remove_self_loops=True
        ).to_undirected()
        # use biggest connected component
        max_cc = max(nx.connected_components(graph), key=len)
        graph = graph.subgraph(max_cc).copy()
        # append metrics to dataframe
        pd.concat([df,pd.DataFrame({
            "Dataset":dataset_name,
            "Nodes":graph.number_of_nodes(),
            "Edges":graph.number_of_edges(),
            "Density":nx.density(graph),
            "Avg. Degree": graph.number_of_edges()/graph.number_of_nodes(),
            "Avg. Clustering Coefficient":nx.average_clustering(graph),
            "Avg. Shortest Path Length":nx.average_shortest_path_length(graph, method="unweighted"),
            "Diameter":nx.diameter(graph),
            "Avg. Degree Centrality":np.mean(list(nx.degree_centrality(graph).values()))
            }, index=[i])
        ], ignore_index=True)
        

Dataset: KarateClub()
Dataset: KarateClub()
Dataset: EuropeAirports()
Dataset: Wiki()
Dataset: Facebook()
Dataset: Actor()
