In [1]:
import sys
sys.path.insert(0, '../src/')

import os
import pickle
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import scipy.sparse as sp
from scipy.sparse import save_npz, load_npz, csr_matrix

from matplotlib import pyplot as plt
%matplotlib inline

import pandas as pd

import utils
import graph_statistics

# Statistics

In [4]:
class Evaluation(object):
    """Evaluate graphs sampled during repeated experiments and aggregate the results.

    Layout read below ``experiment_root``::

        Experiment_<idx>/sampled_graphs/graph_<step>.npz
        Experiment_<idx>/sampled_graphs/{overlap,ROC-AUC,avg_prec,timing}.pickle

    ``<idx>`` is zero-padded to ``len(str(num_experiments))`` digits and
    ``<step>`` to ``len(str(max_step))`` digits.

    Call order matters: ``compute_statistics`` discovers the experiments and
    fills ``self.statistics``; ``aggregate_statistics`` then bins those values
    by edge overlap.

    Args:
        experiment_root: Folder containing the ``Experiment_*`` entries.
        statistic_fns: Mapping from a display name to a callable that takes a
            loaded sparse graph and returns a scalar.
    """

    def __init__(self, experiment_root, statistic_fns):
        self.experiment_root = experiment_root
        self.statistic_fns = statistic_fns

    def _load_timings(self):
        """Return the per-experiment ``timing`` dictionaries."""
        return self._load('timing')

    def _load_overlaps(self):
        """Return the per-experiment ``overlap`` dictionaries."""
        return self._load('overlap')

    def _load_roc_aucs(self):
        """Return the per-experiment ``ROC-AUC`` dictionaries."""
        return self._load('ROC-AUC')

    def _load_avg_precs(self):
        """Return the per-experiment ``avg_prec`` dictionaries."""
        return self._load('avg_prec')

    def _load(self, name):
        """Load ``<name>.pickle`` of every experiment through ``utils.load_dict``.

        Relies on ``self.num_experiments`` and ``self.str_exp_len``, which are
        set at the start of ``compute_statistics``.

        Returns:
            A list with one loaded object per experiment, ordered by index.
            ``compute_statistics`` indexes each of them by step.
        """

        def get_filename(idx):
            return os.path.join(self.experiment_root,
                                f'Experiment_{idx:0{self.str_exp_len}d}',
                                'sampled_graphs',
                                f'{name}.pickle')

        return [utils.load_dict(get_filename(idx)) for idx in range(self.num_experiments)]

    def compute_statistics(self):
        """Evaluate every statistic on every sampled graph of every experiment.

        Populates ``self.statistics`` (name -> array of shape
        ``[num_experiments, num_steps]``), ``self.steps`` (largest step) and
        ``self.invoke_every`` (``steps // num_steps``). Besides the entries of
        ``statistic_fns`` the result contains 'Edge Overlap (%)',
        'ROC-AUC Score', 'Average Precision' and 'Time (s)', copied from the
        pickled dictionaries.

        Raises:
            FileNotFoundError: If ``experiment_root`` has no ``Experiment_*`` entry.
        """
        # parse experiment root folder
        experiment_keys = [key for key in os.listdir(self.experiment_root)
                           if key.startswith('Experiment_')]
        if not experiment_keys:
            raise FileNotFoundError(
                f"No 'Experiment_*' entries found in {self.experiment_root!r}")
        self.num_experiments = len(experiment_keys)
        # NOTE(review): the padding width comes from the experiment count, not
        # from the largest index (10 runs -> 2 digits); confirm that this
        # matches the code that writes the folders.
        self.str_exp_len = len(str(self.num_experiments))

        # load overlaps and timings
        overlaps = self._load_overlaps()
        roc_aucs = self._load_roc_aucs()
        avg_precs = self._load_avg_precs()
        timings = self._load_timings()

        # The first experiment's timing keys define the step grid.
        steps = max(timings[0].keys())
        step_keys = sorted(timings[0].keys())
        step_len = len(str(steps))
        step_idxs = len(step_keys)
        invoke_every = steps // step_idxs

        shape = [self.num_experiments, step_idxs]
        statistics = {name: np.zeros(shape) for name in self.statistic_fns.keys()}
        statistics['Edge Overlap (%)'] = np.zeros(shape)
        statistics['ROC-AUC Score'] = np.zeros(shape)
        statistics['Average Precision'] = np.zeros(shape)
        statistics['Time (s)'] = np.zeros(shape)

        # Iterate the recorded steps themselves instead of rebuilding them as
        # range(invoke_every, steps + invoke_every, invoke_every): for evenly
        # spaced keys both sequences are identical, and unevenly spaced keys no
        # longer lead to KeyError / IndexError.
        for step_idx, step in enumerate(step_keys):
            for experiment in range(self.num_experiments):
                # load sparse graph
                graph_name = f'graph_{step:0{step_len}d}.npz'
                graph_path = os.path.join(self.experiment_root,
                                          f'Experiment_{experiment:0{self.str_exp_len}d}',
                                          'sampled_graphs',
                                          graph_name)
                graph = load_npz(graph_path)
                # compute statistics
                statistics['Edge Overlap (%)'][experiment, step_idx] = overlaps[experiment][step]
                statistics['ROC-AUC Score'][experiment, step_idx] = roc_aucs[experiment][step]
                statistics['Average Precision'][experiment, step_idx] = avg_precs[experiment][step]
                statistics['Time (s)'][experiment, step_idx] = timings[experiment][step]
                for name, statistic_fn in self.statistic_fns.items():
                    statistics[name][experiment, step_idx] = statistic_fn(graph)

        self.statistics = statistics
        self.steps = steps
        self.invoke_every = invoke_every

    def aggregate_statistics(self, num_bins, start=0, end=1):
        """Average all statistics inside equal-width edge-overlap bins.

        The interval ``[start, end]`` is split into ``num_bins`` bins that are
        open on the left and closed on the right, so an overlap exactly equal
        to ``start`` falls into no bin. Every (experiment, step) entry whose
        edge overlap lies in a bin contributes to that bin's mean and standard
        deviation. Bins without any entry are filled with NaN.

        Sets ``self.statistics_mean``, ``self.statistics_std`` and
        ``self.mean_std`` (the pair of both dictionaries).

        Args:
            num_bins: Number of bins.
            start: Lower end of the binned overlap range.
            end: Upper end of the binned overlap range.
        """
        # binning
        overlaps = self.statistics['Edge Overlap (%)']
        bin_edges = np.linspace(start, end, num_bins+1)
        statistics_mean = {name: np.zeros(num_bins) for name in self.statistics.keys()}
        statistics_std = {name: np.zeros(num_bins) for name in self.statistics.keys()}
        # Distinct loop names keep the `start` / `end` arguments intact.
        for idx, (lower, upper) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
            in_bin = np.logical_and(lower < overlaps, overlaps <= upper)
            if not in_bin.any():
                # Write NaN explicitly instead of relying on NumPy's
                # "mean of empty slice" RuntimeWarning path.
                for name in self.statistics.keys():
                    statistics_mean[name][idx] = np.nan
                    statistics_std[name][idx] = np.nan
                continue
            for name, statistic in self.statistics.items():
                selected = statistic[in_bin]
                statistics_mean[name][idx] = selected.mean()
                statistics_std[name][idx] = selected.std()

        self.statistics_mean = statistics_mean
        self.statistics_std = statistics_std
        self.mean_std = (statistics_mean, statistics_std)

    def export_statistics(self):
        """Placeholder; not implemented yet."""
        pass

    def plot_statistics(self):
        """Placeholder; not implemented yet."""
        pass

## Ours

In [7]:
# Evaluation of the runs stored under ../logs/CORA-ML/Ours/. Only the claw,
# square and wedge counts are enabled; the other statistics are commented out.
# NOTE(review): this statistic_fns dict is repeated verbatim for eval_fa and
# eval_ft further down; a single shared definition would avoid drift.
eval_ours = Evaluation(experiment_root='../logs/CORA-ML/Ours/',
                       statistic_fns={#'Assortativity':graph_statistics.assortativity,
                                      #'Average Degree':graph_statistics.average_degree,
                                      'Claw Count':graph_statistics.claw_count,
                                      #'Clustering Coefficient':graph_statistics.clustering_coefficient,
                                      #'Characteristic Path Length':graph_statistics.compute_cpl,
                                      #'Edge Distribution Entropy':graph_statistics.edge_distribution_entropy,
                                      #'Gini':graph_statistics.gini,
                                      #'LCC Size':graph_statistics.LCC,
                                      #'Max Degree':graph_statistics.max_degree,
                                      #'Min Degree':graph_statistics.min_degree,
                                      #'Num Connected Components':graph_statistics.num_connected_components,
                                      #'Power Law α':graph_statistics.power_law_alpha,
                                      #'Spectral Gap':graph_statistics.spectral_gap,
                                      'Square Count':graph_statistics.square_count,
                                      #'Triangle Count':graph_statistics.triangle_count,
                                      'Wedge Count':graph_statistics.wedge_count,
                                     })

In [8]:
# Load the pickled per-step results and evaluate the enabled statistics on
# every sampled graph of every experiment (fills eval_ours.statistics).
eval_ours.compute_statistics()

In [9]:
# Mean/std of every statistic within 10 equal-width edge-overlap bins over
# the default range 0..1 (fills eval_ours.mean_std).
eval_ours.aggregate_statistics(num_bins=10)

In [11]:
# Edge overlap per recorded step for the last experiment (last row).
eval_ours.statistics['Edge Overlap (%)'][-1]

array([0.00442282, 0.09479581, 0.34542238, 0.45717234, 0.55889724,
       0.61831048, 0.66637181, 0.71384343, 0.73979065, 0.76146248,
       0.79006339, 0.80967124, 0.81866431, 0.83384933, 0.84357954,
       0.85330974, 0.86171311, 0.8708536 , 0.8799941 , 0.88235294])

## Forge Adjacency

In [72]:
# Evaluation of the runs stored under ../logs/CORA-ML/baseline_FA/, using the
# same enabled statistics (claw, square and wedge counts) as eval_ours.
eval_fa = Evaluation(experiment_root='../logs/CORA-ML/baseline_FA/',
                     statistic_fns={#'Assortativity':graph_statistics.assortativity,
                                    #'Average Degree':graph_statistics.average_degree,
                                    'Claw Count':graph_statistics.claw_count,
                                    #'Clustering Coefficient':graph_statistics.clustering_coefficient,
                                    #'Characteristic Path Length':graph_statistics.compute_cpl,
                                    #'Edge Distribution Entropy':graph_statistics.edge_distribution_entropy,
                                    #'Gini':graph_statistics.gini,
                                    #'LCC Size':graph_statistics.LCC,
                                    #'Max Degree':graph_statistics.max_degree,
                                    #'Min Degree':graph_statistics.min_degree,
                                    #'Num Connected Components':graph_statistics.num_connected_components,
                                    #'Power Law α':graph_statistics.power_law_alpha,
                                    #'Spectral Gap':graph_statistics.spectral_gap,
                                    'Square Count':graph_statistics.square_count,
                                    #'Triangle Count':graph_statistics.triangle_count,
                                    'Wedge Count':graph_statistics.wedge_count,
                                    })

In [73]:
# Load the pickled per-step results and evaluate the enabled statistics on
# every sampled graph of the baseline_FA experiments.
eval_fa.compute_statistics()

In [74]:
# Mean/std of every statistic within 10 equal-width edge-overlap bins.
eval_fa.aggregate_statistics(num_bins=10)

In [75]:
# Inspect the raw per-experiment statistics.
# NOTE(review): this displays every array in full; a summary would be shorter.
eval_fa.statistics

{'Claw Count': array([[481830.],
        [593270.],
        [332699.],
        [422081.],
        [550518.],
        [500513.],
        [446634.],
        [495019.],
        [525607.],
        [515139.],
        [658976.],
        [407958.],
        [495019.],
        [516567.],
        [444974.],
        [510862.],
        [468463.],
        [559511.],
        [447798.],
        [469087.]]), 'Square Count': array([[1039.],
        [1188.],
        [ 973.],
        [ 965.],
        [1162.],
        [1229.],
        [1075.],
        [1029.],
        [ 944.],
        [1217.],
        [1158.],
        [ 948.],
        [1127.],
        [1042.],
        [1012.],
        [1273.],
        [1121.],
        [1174.],
        [1094.],
        [1158.]]), 'Wedge Count': array([[47801.],
        [50226.],
        [45017.],
        [47001.],
        [49019.],
        [48823.],
        [47206.],
        [47950.],
        [48139.],
        [49023.],
        [50598.],
        [46539.],
        [48327.],

## Forge Transition

In [81]:
# Evaluation of the runs stored under ../logs/CORA-ML/baseline_FT/, using the
# same enabled statistics (claw, square and wedge counts) as eval_ours.
eval_ft = Evaluation(experiment_root='../logs/CORA-ML/baseline_FT/',
                     statistic_fns={#'Assortativity':graph_statistics.assortativity,
                                    #'Average Degree':graph_statistics.average_degree,
                                    'Claw Count':graph_statistics.claw_count,
                                    #'Clustering Coefficient':graph_statistics.clustering_coefficient,
                                    #'Characteristic Path Length':graph_statistics.compute_cpl,
                                    #'Edge Distribution Entropy':graph_statistics.edge_distribution_entropy,
                                    #'Gini':graph_statistics.gini,
                                    #'LCC Size':graph_statistics.LCC,
                                    #'Max Degree':graph_statistics.max_degree,
                                    #'Min Degree':graph_statistics.min_degree,
                                    #'Num Connected Components':graph_statistics.num_connected_components,
                                    #'Power Law α':graph_statistics.power_law_alpha,
                                    #'Spectral Gap':graph_statistics.spectral_gap,
                                    'Square Count':graph_statistics.square_count,
                                    #'Triangle Count':graph_statistics.triangle_count,
                                    'Wedge Count':graph_statistics.wedge_count,
                                    })

In [82]:
# Load the pickled per-step results and evaluate the enabled statistics on
# every sampled graph of the baseline_FT experiments.
eval_ft.compute_statistics()

In [85]:
# Mean/std of every statistic within 10 equal-width edge-overlap bins.
eval_ft.aggregate_statistics(num_bins=10)

In [None]:
# NOTE(review): stray, never-executed cell. `e` is not defined anywhere in the
# visible notebook, so this raises NameError under Restart & Run All; delete it.
e

# Summary

In [76]:
def tabular_from_statistics(EO_criterion, statistics):
    """Pick, per model, the first overlap bin whose mean edge overlap exceeds a threshold.

    Args:
        EO_criterion: Edge-overlap threshold; the first bin whose mean edge
            overlap is strictly greater than this value is selected.
        statistics: Mapping ``model_name -> (statistics_mean, statistics_std)``
            where both entries map a statistic name to a per-bin array and
            contain the key 'Edge Overlap (%)' (the shape of
            ``Evaluation.mean_std``).

    Returns:
        A pair ``(tabular_mean, tabular_std)``; each maps
        ``model_name -> {statistic_name: value at the selected bin}``.

    Raises:
        Exception: If no bin of a model exceeds ``EO_criterion``. The message
            reports the largest mean overlap available, or -1 when every bin
            is NaN (empty).
    """
    tabular_mean = {}
    tabular_std = {}
    for model_name, (statistics_mean, statistics_std) in statistics.items():
        tabular_mean[model_name] = {}
        tabular_std[model_name] = {}
        # find matching EO
        overlap = np.asarray(statistics_mean['Edge Overlap (%)'])
        # NaN bins (no samples) compare as False and are skipped.
        with np.errstate(invalid='ignore'):
            matches = np.flatnonzero(overlap > EO_criterion)
        # Explicit emptiness check instead of a bare `except:` so unrelated
        # errors are no longer masked by the message below.
        if matches.size == 0:
            # `nan` must be passed by keyword: the second positional
            # parameter of np.nan_to_num is `copy`, not the replacement value.
            max_overlap = np.nan_to_num(overlap, nan=-1).max()
            raise Exception(f'Max Edge Overlap of {model_name} is {max_overlap:.3f}')
        arg = matches[0]
        for statistic_name in statistics_mean.keys():
            tabular_mean[model_name][statistic_name] = statistics_mean[statistic_name][arg]
            tabular_std[model_name][statistic_name] = statistics_std[statistic_name][arg]
    return (tabular_mean, tabular_std)

In [52]:
# Statistics of our model at the first bin whose mean edge overlap exceeds 0.5.
tabular = tabular_from_statistics(EO_criterion=0.5,
                                  statistics={'ours': eval_ours.mean_std})

In [53]:
# Display the (mean, std) dictionaries.
tabular

({'ours': {'Claw Count': 1738945.6,
   'Square Count': 6755.95,
   'Wedge Count': 81647.35,
   'Edge Overlap (%)': 0.550656051894442,
   'ROC-AUC Score': 0.9250405933379817,
   'Average Precision': 0.9334074743225752,
   'Time (s)': 14.25318933725357}},
 {'ours': {'Claw Count': 238717.88288907052,
   'Square Count': 505.7745026194974,
   'Wedge Count': 2806.073667511243,
   'Edge Overlap (%)': 0.008070070940608833,
   'ROC-AUC Score': 0.004968024675141033,
   'Average Precision': 0.00499826370837697,
   'Time (s)': 0.17529627872599637}})

In [87]:
# Same selection (first bin with mean edge overlap > 0.5) for all three
# evaluations. NOTE: rebinds `tabular` from the single-model cell above.
tabular = tabular_from_statistics(0.5, {
    'fa':eval_fa.mean_std, 
    'ft':eval_ft.mean_std,
    'ours': eval_ours.mean_std,
                                       })

In [91]:
def df_from_tabular(tabular, keys=None):
    """Render a ``(means, stds)`` pair as a DataFrame of 'mean ± std' strings.

    Args:
        tabular: Pair ``(mean_dicts, std_dicts)``; both map
            ``model -> {statistic: value}``.
        keys: Optional column selection applied to the result via ``df[keys]``.

    Returns:
        DataFrame with one row per model and one column per statistic, each
        cell formatted with three decimals.
    """
    mean_dicts, std_dicts = tabular
    rows = {}
    for model_key in mean_dicts:
        stds = std_dicts[model_key]
        rows[model_key] = {
            statistic_key: f'{mean:.3f} \u00B1 {stds[statistic_key]:.3f}'
            for statistic_key, mean in mean_dicts[model_key].items()
        }
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()))
    return df if keys is None else df[keys]

In [92]:
# Format the selected statistics as 'mean ± std' strings, one row per model.
df = df_from_tabular(tabular)

In [93]:
# Display the comparison table.
df

Unnamed: 0,Claw Count,Square Count,Wedge Count,Edge Overlap (%),ROC-AUC Score,Average Precision,Time (s)
fa,492126.250 ± 68398.655,1096.400 ± 96.770,48230.300 ± 1264.029,0.525 ± 0.006,0.565 ± 0.000,0.660 ± 0.000,2.075 ± 0.042
ft,725638.550 ± 79992.302,1564.600 ± 80.837,57510.000 ± 1158.362,0.561 ± 0.005,0.706 ± 0.000,0.793 ± 0.000,1.940 ± 0.044
ours,1738945.600 ± 238717.883,6755.950 ± 505.775,81647.350 ± 2806.074,0.551 ± 0.008,0.925 ± 0.005,0.933 ± 0.005,14.253 ± 0.175


In [12]:
# Scratch: descending array 9..0.
a = np.arange(10)[::-1]

In [14]:
# Scratch: in-place sort (mutates `a`, so re-running the cell is a no-op).
a.sort()

In [50]:
# Scratch: check how an embedded newline is printed.
print('a\nb')

a
b


In [38]:
# Scratch: check how a literal percent sign is printed.
print('%')

%


In [25]:
# Scratch: look up the load_npz docstring (leftover from development).
help(load_npz)

Help on function load_npz in module scipy.sparse._matrix_io:

load_npz(file)
    Load a sparse matrix from a file using ``.npz`` format.
    
    Parameters
    ----------
    file : str or file-like object
        Either the file name (string) or an open file (file-like object)
        where the data will be loaded.
    
    Returns
    -------
    result : csc_matrix, csr_matrix, bsr_matrix, dia_matrix or coo_matrix
        A sparse matrix containing the loaded data.
    
    Raises
    ------
    IOError
        If the input file does not exist or cannot be read.
    
    See Also
    --------
    scipy.sparse.save_npz: Save a sparse matrix to a file using ``.npz`` format.
    numpy.load: Load several arrays from a ``.npz`` archive.
    
    Examples
    --------
    Store sparse matrix to disk, and load it again:
    
    >>> import scipy.sparse
    >>> sparse_matrix = scipy.sparse.csc_matrix(np.array([[0, 0, 3], [4, 0, 0]]))
    >>> sparse_matrix
    <2x3 sparse matrix of type '<c

In [19]:
# Scratch: small dict with integer keys, used to try max() over the keys.
dct = {1:'a', 2:'b', 5:'e'}

In [21]:
# Scratch: largest key; the same idiom yields `steps` in compute_statistics.
max(dct.keys())

5

In [45]:
# Scratch: element-wise comparison of an array with a scalar.
np.ones([3,3]) > 0#np.zeros([3]) 

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [5]:
# Scratch: the strict lower bound used for binning excludes exact zeros.
# NOTE: rebinds `a`, which an earlier scratch cell used for a 1-D array.
a = np.zeros([3,3])
0<a

array([[False, False, False],
       [False, False, False],
       [False, False, False]])

In [6]:
# Scratch: the inclusive upper bound used for binning.
a<=1

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [7]:
# Scratch: combined mask for the half-open interval (0, 1].
np.logical_and(0<a, a<=1)

array([[False, False, False],
       [False, False, False],
       [False, False, False]])

In [8]:
# Scratch: the argwhere-based selection from aggregate_statistics returns an
# empty array when no element falls into the interval.
a[np.argwhere(np.logical_and(0<a, a<=1))[:,0],np.argwhere(np.logical_and(0<a, a<=1))[:,1]]

array([], dtype=float64)

In [32]:
# Scratch: list the files of one experiment's sampled_graphs folder.
# NOTE(review): this path (../logs/experiments_CORA-ML/) differs from the
# experiment roots used above (../logs/CORA-ML/...); confirm it still exists.
os.listdir('../logs/experiments_CORA-ML/Experiment_0/sampled_graphs/')

['graph_090.npz',
 'graph_080.npz',
 'graph_100.npz',
 'overlap.pickle',
 'graph_060.npz',
 'graph_015.npz',
 'graph_075.npz',
 'graph_005.npz',
 'graph_020.npz',
 'graph_070.npz',
 'graph_065.npz',
 'graph_010.npz',
 'graph_040.npz',
 'graph_095.npz',
 'graph_050.npz',
 'graph_035.npz',
 'timing.pickle',
 'graph_025.npz',
 'graph_085.npz',
 'graph_055.npz',
 'graph_045.npz',
 'graph_030.npz']

In [2]:
def timing(f):
    """Decorator that records the duration of the most recent call.

    After each completed call the elapsed time in seconds is stored on the
    returned wrapper as ``last_time``. The attribute does not exist before the
    first call and is not updated when ``f`` raises.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper with the same call signature and return value as ``f``.
    """
    # Imported here because the notebook's import cell does not provide
    # `time`, so the original raised NameError on a fresh kernel.
    import functools
    import time

    @functools.wraps(f)  # keep the wrapped function's name and docstring
    def g(*args, **kwargs):
        # perf_counter is monotonic, unlike the wall clock time.time().
        start = time.perf_counter()
        y = f(*args, **kwargs)
        g.last_time = time.perf_counter() - start
        return y
    return g

In [5]:
@timing
def loop(iters):
    """Return 0 + 1 + ... + (iters - 1); a small workload for trying out ``timing``."""
    return sum(range(iters))

In [9]:
# Scratch: run the decorated function once, then read the duration that the
# `timing` wrapper stored on it.
loop(100)
loop.last_time

1.3828277587890625e-05

In [12]:
def f(*args, **kwargs):
    """Add up all positional arguments, starting from 0; keyword arguments are ignored."""
    total = 0
    for position in range(len(args)):
        total += args[position]
    return total

In [16]:
# Scratch: unpack a list into positional arguments.
lst = [1,2,3,4,5,6]
f(*lst)

21

In [None]:
# Scratch: pass the first two list elements explicitly.
f(lst[0], lst[1])