In [None]:
#default_exp reachability_evaluation

In [None]:
#hide_output
# Enable IPython's autoreload extension in mode 2, so that edited modules
# are re-imported before each cell runs instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

# Reachability evaluation
> Experimental evaluation of how much various reachability indices speed up reachability queries.

Imports

In [None]:
#export

In [None]:
import numpy as np
import networkx as nx
import pandas as pd

from math import sqrt

## Load example commit graph, compute reachability queries, save results

In [None]:
from git_commit_graph_ext.commit_graph import _commit_graph_name, commit_graph
from git_commit_graph_ext.checkpoint import compute_cached_graph, compute_cached_reachability_labels

In [None]:
# parameters
# repo_url:  URL of the git repository; passed to commit_graph() further down
# repo_name: short repository name; passed to commit_graph() and to
#            _commit_graph_name() to derive the name of the graph
repo_url  = 'https://github.com/git/git.git' 
repo_name = 'git'

In [None]:
# Name of the commit graph derived from the repository name; it is printed
# below and handed to the compute_cached_*() helpers.
graph_name = _commit_graph_name(repo_name)

In [None]:
# Report which repository is about to be processed.
print('loading the commit graph of a git repository')
for template, value in (('- url:   {}', repo_url),
                        ('- name:  {}', repo_name),
                        ('- graph: {}', graph_name)):
    print(template.format(value))

loading the commit graph of a git repository
- url:   https://github.com/git/git.git
- name:  git
- graph: git-commit_graph


In [None]:
# checkpoint 1
# The lambda defers the commit_graph() call: a callable is handed to
# compute_cached_graph() together with the graph name, nothing is built here.
# NOTE(review): presumably the callable is only invoked when no cached copy
# exists under graph_name -- confirm in git_commit_graph_ext.checkpoint.
graph = compute_cached_graph(lambda: commit_graph(repo_url, repo_name), graph_name)

In [None]:
# Size of the commit graph, followed by a header for the attribute listing.
summary_lines = [
    'commit graph of {} repository has:'.format(repo_name),
    '- nodes: {}'.format(graph.number_of_nodes()),
    '- edges: {}'.format(graph.number_of_edges()),
    '\npublic attributes of retrieved/computed graph',
]
print('\n'.join(summary_lines))

# List instance attributes that are public (no leading underscore)
# and whose value is not itself a class.
for attr, val in vars(graph).items():
    if attr.startswith('_') or isinstance(val, type):
        continue
    print('- {:s} ({})'.format(attr, type(val)))

commit graph of git repository has:
- nodes: 63829
- edges: 79664

public attributes of retrieved/computed graph
- graph (<class 'dict'>)
- df_edgelist (<class 'pandas.core.frame.DataFrame'>)


In [None]:
# Rebind `graph` to the result of compute_cached_reachability_labels(); the
# next cell reads the df_nodedata, lvl and mpi_ext attributes from it.
# NOTE(review): presumably labels are loaded from a cache keyed by graph_name
# when available and computed otherwise -- confirm in the checkpoint module.
graph = compute_cached_reachability_labels(graph, graph_name)

In [None]:
# Per-node data frame attached to the graph.
df = graph.df_nodedata

print('public attributes of retrieved/computed graph (with reachability labels):')
for attr, val in vars(graph).items():
    if attr.startswith('_') or isinstance(val, type):
        continue
    print('- {:s} ({})'.format(attr, type(val)))

# Take a small peek at each of the labelling structures.
first_nodes = list(graph.nodes)[0:5]
first_lvls = {k: graph.lvl[k] for k in list(graph.lvl)[:5]}
first_mpi_ext = {k: graph.mpi_ext[k] for k in list(graph.mpi_ext)[:2]}

print('')
print('node data dataframe properties:')
print('- columns: {}'.format(df.columns.tolist()))
print('- rows:    {}...'.format(first_nodes))
print('- lvls:    {}...'.format(first_lvls))
print('- mpi_ext: {}...'.format(first_mpi_ext))

df.head()

public attributes of retrieved/computed graph (with reachability labels):
- graph (<class 'dict'>)
- df_edgelist (<class 'pandas.core.frame.DataFrame'>)
- df_nodedata (<class 'pandas.core.frame.DataFrame'>)
- lvl (<class 'dict'>)
- mpi_ext (<class 'dict'>)
- nodes (<class 'networkx.classes.reportviews.NodeView'>)

node data dataframe properties:
- columns: ['f_min', 'min', 'post', 'level', 'in degree', 'out degree', 'degree']
- rows:    ['836aadd78', 'a93475d10', '55fce44a3', 'df525e622', '6d9d59c31']...
- lvls:    {'e83c51633': 0, '8bc9a0c76': 1, 'e497ea2a9': 2, 'bf0c6e839': 3, '19b2860cb': 4}...
- mpi_ext: {'e83c51633': {'f_min': 1, 'min': 1, 'post': 1}, '8bc9a0c76': {'f_min': 1, 'min': 1, 'post': 2}}...


Unnamed: 0_level_0,f_min,min,post,level,in degree,out degree,degree
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
e83c51633,1,1,1,0,1,0,1
8bc9a0c76,1,1,2,1,1,1,2
e497ea2a9,1,1,3,2,1,1,2
bf0c6e839,1,1,4,3,1,1,2
19b2860cb,1,1,5,4,1,1,2


## N^2 connectivity on random sample of commits / nodes

In [None]:
n_nodes = graph.number_of_nodes()
print('graph "{}" has {:d} nodes'.format(graph_name, n_nodes))

# Fixed seed: without it every run draws a different sample, so none of the
# numbers reported further down could be reproduced after a kernel restart.
sample_seed = 42
rng = np.random.default_rng(sample_seed)

n_pairs = 10000
print('- selecting {:d} pairs of nodes out of {}'.format(n_pairs, n_nodes*n_nodes))
# Both ends are drawn independently and with replacement, so a pair may
# repeat and u may be equal to v.
choice_u = rng.choice(n_nodes, n_pairs)
choice_v = rng.choice(n_nodes, n_pairs)
# .tolist() turns NumPy integers into plain Python ints, which keeps the
# printed pairs readable regardless of the NumPy version.
conn_choice = list(zip(choice_u.tolist(), choice_v.tolist()))
print('- choice: {}...'.format(conn_choice[:5]))

print('- creating a mapping from numbers to node names')
# position in the graph's iteration order -> node name
nodes_dict = dict(enumerate(graph))
conn_nodes = [(nodes_dict[u], nodes_dict[v]) for (u, v) in conn_choice]

print('- %4d sample size' % len(conn_nodes))
print('- node pairs: {}...'.format(conn_nodes[:5]))

graph "git-commit_graph" has 63829 nodes
- selecting 10000 pairs of nodes out of 4074141241
- choice: [(54306, 8405), (63370, 27936), (40309, 45188), (33813, 2815), (32667, 23583)]...
- creating a mapping from numbers to node names
- 10000 sample size
- node pairs: [('c7f34c180', '23c204455'), ('9dc527adb', '53ec551c8'), ('5e3ce663b', '9affecbc8'), ('6440fdbab', 'c8c35f6a0'), ('f1a7082f2', 'caac7a3ab')]...


In [None]:
%%time

conn_sample = []
print('sample of {:d} node pairs in {} graph'.format(len(conn_nodes), graph_name))
for (u,v) in conn_nodes:
  #print('%r -> %r: %r' % (u, v, nx.has_path(linux_graph_full, u, v)))
  conn_sample.append({'u': u, 'v': v,
                      'l_u': graph.lvl[u],
                      'l_v': graph.lvl[v],
                      'u->v': nx.has_path(graph, u, v)})

conn_sample[:5]

sample of 10000 node pairs in git-commit_graph graph
Wall time: 3min 26s


[{'u': 'c7f34c180',
  'v': '23c204455',
  'l_u': 5579,
  'l_v': 19392,
  'u->v': False},
 {'u': '9dc527adb', 'v': '53ec551c8', 'l_u': 433, 'l_v': 12473, 'u->v': False},
 {'u': '5e3ce663b', 'v': '9affecbc8', 'l_u': 11072, 'l_v': 9538, 'u->v': True},
 {'u': '6440fdbab',
  'v': 'c8c35f6a0',
  'l_u': 12391,
  'l_v': 20919,
  'u->v': False},
 {'u': 'f1a7082f2',
  'v': 'caac7a3ab',
  'l_u': 12813,
  'l_v': 15628,
  'u->v': False}]

Wall time: 3min 5s

In [None]:
# One row per sampled (u, v) pair; the columns come from the keys of the
# record dicts collected in conn_sample.
conn_sample_df = pd.DataFrame.from_records(conn_sample)
conn_sample_df

Unnamed: 0,u,v,l_u,l_v,u->v
0,c7f34c180,23c204455,5579,19392,False
1,9dc527adb,53ec551c8,433,12473,False
2,5e3ce663b,9affecbc8,11072,9538,True
3,6440fdbab,c8c35f6a0,12391,20919,False
4,f1a7082f2,caac7a3ab,12813,15628,False
...,...,...,...,...,...
9995,9be24a30d,26b59b481,1769,11394,False
9996,2a7453241,80a14665b,11757,13769,False
9997,577ed5c20,a75ef3ff9,4435,17803,False
9998,6f5c77a11,81811a74b,17947,288,True


In [None]:
%%time

conn_sample_df['v->u'] = False
mask = ~conn_sample_df['u->v']
conn_sample_df.loc[mask, 'v->u'] = \
    conn_sample_df[mask].apply(lambda row: nx.has_path(graph, row['v'], row['u']), axis='columns')
conn_sample_df

Wall time: 40.7 s


Unnamed: 0,u,v,l_u,l_v,u->v,v->u
0,c7f34c180,23c204455,5579,19392,False,True
1,9dc527adb,53ec551c8,433,12473,False,True
2,5e3ce663b,9affecbc8,11072,9538,True,False
3,6440fdbab,c8c35f6a0,12391,20919,False,True
4,f1a7082f2,caac7a3ab,12813,15628,False,True
...,...,...,...,...,...,...
9995,9be24a30d,26b59b481,1769,11394,False,False
9996,2a7453241,80a14665b,11757,13769,False,True
9997,577ed5c20,a75ef3ff9,4435,17803,False,True
9998,6f5c77a11,81811a74b,17947,288,True,False


Wall time 41.7 s (for computing v->u)

In [None]:
sample_size = conn_sample_df['u->v'].count()

# A pair counts as positive when a path exists in at least one direction,
# and as negative when there is no path in either direction.
either_way = conn_sample_df['u->v'] | conn_sample_df['v->u']
ppos_size = conn_sample_df.loc[either_way, 'u->v'].count()
nneg_size = conn_sample_df.loc[~either_way, 'u->v'].count()

neither_pct = 100.0*nneg_size/sample_size
either_pct = 100.0*ppos_size/sample_size
print('there were {:4d} out of {:d} ({:5.2f} %) nodes for which have neither u->v nor v->u'
      .format(nneg_size, sample_size, neither_pct))
print('there were {:4d} out of {:d} ({:5.2f} %) nodes for which have  either u->v  or v->u'
      .format(ppos_size, sample_size, either_pct))
# sanity check: the two groups together must cover the whole sample
print('together {:4d} + {:4d} = {:d} vs {:d}'
      .format(nneg_size, ppos_size, nneg_size + ppos_size, sample_size))

there were  918 out of 10000 ( 9.18 %) nodes for which have neither u->v nor v->u
there were 9082 out of 10000 (90.82 %) nodes for which have  either u->v  or v->u
together  918 + 9082 = 10000 vs 10000


### False positives for backward topological levels (negative cut)

In [None]:
# Fraction of sampled pairs that are connected (u->v), and how the levels of
# the two nodes compare. Each fraction is the mean of a boolean column and is
# reported with its standard error (std / sqrt(sample size)).
sample_size = conn_sample_df['u->v'].count()
print('connected:        %d of %d (%g +/- %g)' %
      (conn_sample_df['u->v'].sum(),
       conn_sample_df['u->v'].count(),
       conn_sample_df['u->v'].mean(),
       conn_sample_df['u->v'].std()/sqrt(sample_size)))

# Column names spell out the comparison they store; 'l_v<l_u' is reused by the
# false-positive computation in the next cell.
conn_sample_df['l_v<l_u'] = conn_sample_df['l_v'] < conn_sample_df['l_u']
conn_sample_df['l_v>l_u'] = conn_sample_df['l_v'] > conn_sample_df['l_u']

# Each label is paired with the column expressing the same relation:
# l_u < l_v is the same as l_v > l_u, and l_u > l_v the same as l_v < l_u.
# (The labels used to be attached to the opposite columns.)
for label, column in (('l_u < l_v', 'l_v>l_u'),
                      ('l_u > l_v', 'l_v<l_u')):
    flags = conn_sample_df[column]
    print('levels %s: %d (%g +/- %g)' %
          (label, flags.sum(), flags.mean(), flags.std()/sqrt(sample_size)))

connected:        4515 of 10000 (0.4515 +/- 0.00497667)
levels l_u < l_v: 4960 (0.496 +/- 0.00500009)
levels l_u > l_v: 5038 (0.5038 +/- 0.00500011)


In [None]:
# Negative queries: pairs for which there is no path from u to v.
conn_sample_df['!u->v'] = ~conn_sample_df['u->v']
# A false positive of the level check is a negative query that nevertheless
# has l_v < l_u.
conn_sample_df['fp_levels'] = conn_sample_df['l_v<l_u'] & conn_sample_df['!u->v']

n_false_pos = conn_sample_df['fp_levels'].sum()
n_negative = conn_sample_df['!u->v'].sum()
n_total = conn_sample_df['!u->v'].count()
print('level: false positives %d out of %d negative queries (%g %%), out of %d total' %
      (n_false_pos, n_negative, 100.0*n_false_pos/n_negative, n_total))
conn_sample_df['fp_levels'].describe()

level: false positives 445 out of 5485 negative queries (8.11304 %), out of 10000 total


count     10000
unique        2
top       False
freq       9555
Name: fp_levels, dtype: object

----

In [None]:
#hide
# this should be the last cell of the notebook
# notebook2script() is provided by nbdev; the recorded output of this cell
# lists the project notebooks it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 01_tools.ipynb.
Converted 02_related.ipynb.
Converted 03_example_graphs.ipynb.
Converted 05_reachability_index.ipynb.
Converted 06_levels.ipynb.
Converted 07_interval_labels.ipynb.
Converted 08_reach.ipynb.
Converted 09_git.ipynb.
Converted 10_checkpoint.ipynb.
Converted 11_datasets.ipynb.
Converted 12_repos.ipynb.
Converted 14_evaluation.ipynb.
Converted A.09_git_explore.ipynb.
Converted index.ipynb.
