# Graph Stats

This is a development notebook for collecting metadata on problem instances. The final code can be found in `data/generate_metadata.py`.

In [119]:
import numpy as np
from pathlib import Path
import networkx as nx

from concurrent.futures import ProcessPoolExecutor

import yaml
# Teach PyYAML to write NumPy values as plain YAML floats / sequences.
# The values are converted to built-in Python types first: PyYAML's
# represent_float() formats its argument via repr(), and NumPy >= 2 reprs a
# scalar as "np.float64(1.5)", which would otherwise be written verbatim.
yaml.add_representer(np.float64, lambda dumper, data: dumper.represent_float(float(data)))
# tolist() produces (nested) lists of built-in scalars, so arrays of any
# numeric dtype are emitted as ordinary YAML sequences.
yaml.add_representer(np.ndarray, lambda dumper, data: dumper.represent_list(data.tolist()))

from utils import *

In [3]:
# Load every METIS file in the bio data set; each entry is whatever
# read_metis_graph returns for that file (unpacked below as a (G, S) pair).
instances = [read_metis_graph(metis_path) for metis_path in Path("../data/bio").glob("*.metis")]

In [5]:
# Unpack the first loaded instance for interactive inspection: G is the graph,
# S the accompanying matrix (presumably pairwise costs -- confirm against
# read_metis_graph in utils).
G, S = instances[0]

In [122]:
def cost_stats(costs):
    """Return descriptive statistics for a collection of cost values.

    Parameters
    ----------
    costs : array_like
        The values to summarise; passed unchanged to the NumPy reductions.

    Returns
    -------
    dict
        Keys ``'mean'``, ``'std'``, ``'min'`` and ``'max'`` hold the results
        of the NumPy functions of the same name. ``'median'`` is the 0.5
        quantile converted to a built-in ``float``. ``'quantiles'`` is an
        array with the 11 quantiles at 0.0, 0.1, ..., 1.0.
    """
    # The dict literal is evaluated top to bottom, i.e. in the same order in
    # which the individual statistics are listed.
    return {
        'mean': np.mean(costs),
        'std': np.std(costs),
        'min': np.min(costs),
        'max': np.max(costs),
        'median': float(np.quantile(costs, 0.5)),
        'quantiles': np.quantile(costs, np.linspace(0, 1, 11)),
    }


def graph_stats(G):
    """Collect structural statistics of a graph and of each of its components.

    Parameters
    ----------
    G : networkx graph
        The graph to analyse. It is handed to ``nx.connected_components``
        and ``nx.average_clustering``.

    Returns
    -------
    dict
        ``'number_of_vertices'`` / ``'number_of_edges'``: sizes of ``G``.
        ``'complexity'``: number of vertices times number of edges.
        ``'density'``: number of edges divided by the squared number of
        vertices (note: this is not the normalisation used by
        ``nx.density``).
        ``'connected_components'``: one dict per connected component with
        its vertex and edge counts, diameter, radius and the sizes of its
        center and periphery.
        ``'number_of_connected_components'``: length of that list.
        ``'average_clustering_coefficient'``: ``nx.average_clustering(G)``.
    """
    vertex_count = G.number_of_nodes()
    edge_count = G.number_of_edges()

    def describe_component(vertices):
        # The distance-based measures are evaluated on the subgraph induced
        # by a single connected component.
        component = G.subgraph(vertices)
        return {
            'number_of_vertices': component.number_of_nodes(),
            'number_of_edges': component.number_of_edges(),
            'diameter': nx.diameter(component, usebounds=True),
            'radius': nx.radius(component, usebounds=True),
            'center_size': len(nx.center(component, usebounds=True)),
            'periphery_size': len(nx.periphery(component, usebounds=True)),
        }

    return {
        'number_of_vertices': vertex_count,
        'number_of_edges': edge_count,
        'complexity': vertex_count * edge_count,
        'density': edge_count / vertex_count**2,
        'connected_components': (
            components := [describe_component(c) for c in nx.connected_components(G)]
        ),
        'number_of_connected_components': len(components),
        'average_clustering_coefficient': nx.average_clustering(G),
    }


def instance_stats(instance):
    """Build the metadata record for one problem instance.

    Parameters
    ----------
    instance : tuple
        A ``(G, S)`` pair. ``G`` is a graph with a ``name`` attribute that
        ``graph_stats`` accepts; ``S`` is a 2-D array indexed by
        ``S.shape[0]`` (presumably the pairwise cost matrix -- confirm
        against the loader).

    Returns
    -------
    dict
        ``'name'`` (the graph's name), ``'graph'`` (result of
        ``graph_stats``) and ``'costs'`` (result of ``cost_stats`` on the
        entries of ``S`` strictly above the main diagonal).
    """
    graph, matrix = instance

    summary = {'name': graph.name, 'graph': graph_stats(graph)}

    # k=1 skips the main diagonal, so only entries (i, j) with i < j are used.
    strict_upper = np.triu_indices(matrix.shape[0], k=1)
    summary['costs'] = cost_stats(matrix[strict_upper])

    return summary


# Compute the metadata records in parallel worker processes, restricted to
# instances whose graph has fewer than 10 vertices.
with ProcessPoolExecutor() as executor:
    stats = list(
        executor.map(
            instance_stats,
            ((G, S) for G, S in instances if G.number_of_nodes() < 10),
        )
    )

In [123]:
# Number of metadata records, i.e. of instances with fewer than 10 vertices.
len(stats)

1962

In [115]:
# Write all collected records to the metadata file in block (non-flow) style.
with open(Path('../data/bio/metadata.yaml'), mode='w') as file:
    yaml.dump(stats, stream=file, default_flow_style=False)

In [124]:
# Show the full record of every instance whose graph is disconnected.
# Alternative view that lists only the component sizes of those instances:
#[[cc['number_of_vertices'] for cc in stat['graph']['connected_components']] for stat in stats if stat['graph']['number_of_connected_components'] > 1]
[record for record in stats if record['graph']['number_of_connected_components'] >= 2]

[{'name': 'bio-nr-1320-size-3.metis',
  'graph': {'number_of_vertices': 3,
   'number_of_edges': 1,
   'complexity': 3,
   'density': 0.1111111111111111,
   'connected_components': [{'number_of_vertices': 2,
     'number_of_edges': 1,
     'diameter': 1,
     'radius': 1,
     'center_size': 2,
     'periphery_size': 2},
    {'number_of_vertices': 1,
     'number_of_edges': 0,
     'diameter': 0,
     'radius': 0,
     'center_size': 1,
     'periphery_size': 1}],
   'number_of_connected_components': 2,
   'average_clustering_coefficient': 0.0},
  'costs': {'mean': 99.802978515625,
   'std': 141.5608771229029,
   'min': -0.591064453125,
   'max': 300.0,
   'median': 0.0,
   'quantiles': array([-5.91064453e-01, -4.72851563e-01, -3.54638672e-01, -2.36425781e-01,
          -1.18212891e-01,  0.00000000e+00,  6.00000000e+01,  1.20000000e+02,
           1.80000000e+02,  2.40000000e+02,  3.00000000e+02])}},
 {'name': 'bio-nr-1415-size-3.metis',
  'graph': {'number_of_vertices': 3,
   'number_