# Graph Stats

This is a development notebook for collecting metadata on problem instances. The final code can be found in `data/generate_metadata.py`.

In [None]:
import numpy as np
from pathlib import Path
import networkx as nx

from concurrent.futures import ProcessPoolExecutor

import yaml
# Teach PyYAML how to serialise the NumPy objects produced by the statistics
# below. The values are converted to built-in Python objects first:
#   * PyYAML's represent_float formats the number with repr(); NumPy >= 2
#     reprs scalars as "np.float64(1.5)", which would end up verbatim in the
#     YAML output, so pass a plain float instead.
#   * ndarray.tolist() yields nested lists of built-in scalars, so arrays of
#     any dtype and dimensionality serialise as ordinary YAML sequences.
yaml.add_representer(np.float64, lambda dumper, data: dumper.represent_float(float(data)))
yaml.add_representer(np.ndarray, lambda dumper, data: dumper.represent_list(data.tolist()))

from utils import *

In [None]:
# Load every METIS file of the "bio" data set. Path.glob yields paths in an
# arbitrary, filesystem-dependent order, so sort them to make instances[0]
# and the order of the generated metadata reproducible across machines.
instances = [
    read_metis_graph(metis_path)
    for metis_path in sorted(Path("../data/bio").glob("*.metis"))
]

In [None]:
# Keep the first loaded instance at hand for interactive experiments.
# Each instance unpacks into a graph G and an array S; instance_stats below
# summarises the strict upper triangle of S as the instance's costs.
G, S = instances[0]

In [None]:
def cost_stats(costs):
    """Summarise a collection of cost values with descriptive statistics.

    Parameters
    ----------
    costs : array_like
        Numeric cost values. They are converted once to a ``float64`` array
        so every statistic has the same type regardless of the input dtype.

    Returns
    -------
    dict
        ``mean``, ``std``, ``min``, ``max`` and ``median`` as built-in
        ``float`` values, plus ``quantiles``: a ``float64`` array with the
        eleven evenly spaced quantiles 0.0, 0.1, ..., 1.0.

    Raises
    ------
    ValueError
        If ``costs`` is empty, because the minimum of an empty array is
        undefined.
    """
    values = np.asarray(costs, dtype=np.float64)

    # Built-in floats (rather than NumPy scalars such as np.int64 or
    # np.float32) serialise cleanly without any dtype-specific representer.
    return {
        'mean': float(np.mean(values)),
        'std': float(np.std(values)),
        'min': float(np.min(values)),
        'max': float(np.max(values)),
        'median': float(np.quantile(values, 0.5)),
        'quantiles': np.quantile(values, np.linspace(0, 1, 11)),
    }


def graph_stats(G):
    """Collect structural statistics of a graph and its connected components.

    Parameters
    ----------
    G : networkx graph
        Graph to analyse. It is passed to ``nx.connected_components``.

    Returns
    -------
    dict
        * ``number_of_vertices`` / ``number_of_edges``: ``n`` and ``m``.
        * ``complexity``: the product ``n * m``.
        * ``density``: ``2 * m / (n * (n - 1))``, or ``0.0`` when the graph
          has fewer than two vertices.
        * ``connected_components``: one dict per component holding its vertex
          and edge counts, diameter, radius, and the sizes of its center and
          periphery.
        * ``number_of_connected_components``: length of the list above.
        * ``average_clustering_coefficient``: ``nx.average_clustering(G)``,
          or ``0.0`` for a graph without vertices.
    """
    stats = dict()

    n, m = G.number_of_nodes(), G.number_of_edges()

    stats['number_of_vertices'] = n
    stats['number_of_edges'] = m
    stats['complexity'] = n * m
    # n * (n - 1) is zero for n < 2, so report a density of 0.0 there
    # instead of dividing by zero.
    stats['density'] = 2 * m / (n * (n - 1)) if n > 1 else 0.0

    stats['connected_components'] = []
    for component in nx.connected_components(G):
        # The distance-based measures are only defined on connected graphs,
        # so they are computed per component on the induced subgraph.
        G_hat = G.subgraph(component)

        component_stats = dict()
        component_stats['number_of_vertices'] = G_hat.number_of_nodes()
        component_stats['number_of_edges'] = G_hat.number_of_edges()
        component_stats['diameter'] = nx.diameter(G_hat, usebounds=True)
        component_stats['radius'] = nx.radius(G_hat, usebounds=True)
        component_stats['center_size'] = len(nx.center(G_hat, usebounds=True))
        component_stats['periphery_size'] = len(nx.periphery(G_hat, usebounds=True))

        stats['connected_components'].append(component_stats)

    stats['number_of_connected_components'] = len(stats['connected_components'])

    # The average over an empty vertex set is undefined (networkx divides by
    # the number of vertices), so fall back to 0.0 for an empty graph.
    stats['average_clustering_coefficient'] = nx.average_clustering(G) if n > 0 else 0.0

    return stats


def instance_stats(instance):
    """Build the metadata record for a single problem instance.

    Parameters
    ----------
    instance : tuple
        A ``(G, S)`` pair: a graph exposing ``name`` and a square array whose
        strict upper triangle holds the values summarised as costs.

    Returns
    -------
    dict
        ``name`` (the graph's name), ``graph`` (result of ``graph_stats``)
        and ``costs`` (result of ``cost_stats``).
    """
    graph, cost_matrix = instance

    name = graph.name
    graph_summary = graph_stats(graph)

    # Offset 1 skips the diagonal, so only entries above it are summarised
    # and each unordered pair is counted once.
    upper_triangle = np.triu_indices(cost_matrix.shape[0], 1)
    cost_summary = cost_stats(cost_matrix[upper_triangle])

    return {'name': name, 'graph': graph_summary, 'costs': cost_summary}


# Compute the statistics of the instances in parallel worker processes.
# Only instances whose graph has fewer than 10 vertices are processed.
with ProcessPoolExecutor() as executor:
    stats = [
        *executor.map(
            instance_stats,
            [
                (graph, costs)
                for graph, costs in instances
                if graph.number_of_nodes() < 10
            ],
        )
    ]

In [None]:
# Number of instances that passed the size filter and were summarised.
len(stats)

In [None]:
# Persist the collected statistics next to the instances, in block-style YAML.
with open(Path('../data/bio/metadata.yaml'), mode='w') as file:
    yaml.dump(stats, stream=file, default_flow_style=False)

In [None]:
# Alternative view, kept for reference: only the vertex counts of each
# component of the disconnected instances.
#[[cc['number_of_vertices'] for cc in stat['graph']['connected_components']] for stat in stats if stat['graph']['number_of_connected_components'] > 1]

# Show the full record of every instance whose graph is disconnected.
[
    record
    for record in stats
    if record['graph']['number_of_connected_components'] > 1
]