# Descriptive Statistics

This script follows the Morgan & Copeland (no date). It stores in a dataframe many descriptive statistics for the sixty graphs used.

In [1]:
import pandas as pd
import os
import numpy as np
import pickle
from tqdm import tqdm
import networkx as nx
import statistics
from glob import glob

### 1. Get all graphs

In [2]:
# 23 Minute runtime
files = glob('/mnt/disk2/Data/Graphs/*.graphml')

graphs = []  # List to store the loaded graphs

for i, file in tqdm(enumerate(files)):
    graph = nx.read_graphml(file)
    graph_name = file[file.find('graph_'):file.find('.graphml')]
    graph_name = graph_name.replace('-','_')
    graphs.append(graph_name)
    globals()[graph_name] = graph
    
del graph

61it [23:40, 23.28s/it]


### 2. Basic Measures

Number fo Nodes (V), Edges (E) and Dyads in each Graph

In [4]:
description = pd.DataFrame() # Dataframe with al the descriptive stattistics for export
# 4.1 Number of nodes (graph size)
n_nodes = np.zeros(61)
# 4.2 Number of edges
n_edges = np.zeros(61)
# 4.3 Number of dyads
n_dyads = np.zeros(61)

In [5]:
# Calculate number of nodes, edges and dyads and store them in the description DataFrame
for idx, graph_name in tqdm(enumerate(graphs)):
    graph = globals()[graph_name]
    n_nodes[idx] = graph.number_of_nodes()
    n_edges[idx] = graph.number_of_edges()
    n_dyads[idx] = n_edges[idx]/2

description['graph'] = graphs
description['n_nodes'] = n_nodes
description['n_edges'] = n_edges
description['n_dyads'] = n_dyads
# del n_nodes
# del n_edges
# del n_dyads
description.head()

61it [00:01, 39.17it/s]


Unnamed: 0,graph,n_nodes,n_edges,n_dyads
0,graph_11_06_2021,37308.0,221709.0,110854.5
1,graph_01_06_2021,37308.0,293177.0,146588.5
2,graph_11_05_2021,37308.0,254221.0,127110.5
3,graph_10_05_2021,37308.0,675726.0,337863.0
4,graph_06_05_2021,37308.0,351881.0,175940.5


### 3. System Level Measures

3.1. $Density_i = \frac{2E_i}{V_i(V_i - 1)}$

In [6]:
description['density'] = (2*description['n_edges']) / (description['n_nodes'] * (description['n_nodes'] - 1))

3.2 Degree distribution

In [7]:
# 5.2 Degree Distribution
mean_degree = np.zeros(61)
median_degree = np.zeros(61)

for idx, graph_name in tqdm(enumerate(graphs)):
    graph = globals()[graph_name]
    degree = list(dict(graph.degree()).values())
    mean_degree[idx] = sum(degree) / description['n_nodes'][idx]
    median_degree[idx] = statistics.median(degree)

description['mean_degree'] = mean_degree
description['median_degree'] = median_degree
del mean_degree
del median_degree
description.head()

61it [00:02, 28.82it/s]


Unnamed: 0,graph,n_nodes,n_edges,n_dyads,density,mean_degree,median_degree
0,graph_11_06_2021,37308.0,221709.0,110854.5,0.000319,11.885333,2.0
1,graph_01_06_2021,37308.0,293177.0,146588.5,0.000421,15.716576,2.0
2,graph_11_05_2021,37308.0,254221.0,127110.5,0.000365,13.62823,2.0
3,graph_10_05_2021,37308.0,675726.0,337863.0,0.000971,36.224188,8.0
4,graph_06_05_2021,37308.0,351881.0,175940.5,0.000506,18.863568,3.0


In [8]:
geodesics = []
mean_geopath = np.zeros(61)

for idx, graph_name in tqdm(enumerate(graphs)):
    graph = globals()[graph_name]
    connected_components = nx.connected_components(graph)

    for idy, component in enumerate(connected_components):
        subgraph = graph.subgraph(component)
        geodesics.append(nx.average_shortest_path_length(subgraph))
    
    geodesics = np.array(geodesics)
    mean_geopath[idx] = np.mean(geodesics.append)
    geodesics = geodesics.tolist()

description.head()

0it [00:00, ?it/s]