In [None]:
#default_exp checkpoint

In [None]:
#hide_output
%load_ext autoreload
%autoreload 2

# Checkpointing
> Saving and restoring graphs, reachability labels, evaluation results, etc.

In [None]:
#export
# creating graphs in Python
import networkx as nx
# checking for existence of paths, and manipulating paths
from pathlib import Path, PurePath
# data analysis and manipulation
import pandas as pd
# reachability labels
from git_commit_graph_ext.labelling.levels import find_levels
from git_commit_graph_ext.labelling.dfs_intervals import find_dfs_intervals, find_dfs_intervals_extra

## Functions for computing reachability labels, and saving them and the graph to a file, etc.

In [None]:
# imports for example graphs
import git_commit_graph_ext.example_graphs as graphs

In [None]:
# before tests (test fixture)
# NOTE: the cells below with tests reuse `example_graph` and `example_graph_name`

# example graph
example_graph = graphs.commit_graph()
# the name is kept in a separate variable; the tests below pass it
# as `graph_name` when saving and loading
example_graph_name = 'commit_graph'

# check that it worked
print("the example commit graph from Stolee blog posts '{}' has {:d} nodes and {:d} edges".
      format(example_graph_name, example_graph.number_of_nodes(), example_graph.number_of_edges()))

the example commit graph from Stolee blog posts 'commit_graph' has 23 nodes and 31 edges


### Functions for saving graph structure as `DataFrame` (storing edge list) to a file, and restoring it

Helper function to get basename for saving graph and/or graph data to a file (as a dataset).

In [None]:
#export
def _savefile_name(graph_name, out_dir='datasets', kind='df_edgelist', file_format='csv.gz'):
    """Create filename for storing graph structure and other graph data

    This is a helper function used, among others, in ...

    Examples:
    ---------
    >>>> _savefile_name('example_graph')
    Path('datasets/example_graph.df_edgelist.csv.gz')

    Parameters
    ----------
    graph_name : str
        Name of the graph (`<graph>.name` can be used).

    out_dir : str
        Directory where saved commit graph data would be stored.
        Defaults to "datasets".

    kind : str
        What type of data is stored in a file, and in what representation.
        The default value is 'df_edgelist', used to store graph structure in
        the edge list format in a `pandas.DataFrame`.

    file_format : str
        Format of a file, for example how the `DataFrame` is saved.
        Defaults to 'csv.gz' (gzip-compressed Comma Separated Values).

    Returns
    -------
    Path
        Path to the file storing the graph structure or graph data in
        the appropriate representation and appropriate file format.
    """
    # The `out_dir` should not be None
    if out_dir is None:
        out_dir = "."

    # compose the basename of the pathname
    filename = graph_name
    # TODO: there would special case for saving to HDF5 files, which can
    # store multiple data in a single file, so there would be no need
    # to add <kind> to basename of such output file
    if kind is not None and kind != '':
        filename += '.' + kind
    if file_format is not None and file_format != '':
        filename += '.' + file_format
    # generate the name of the output file, as `pathlib.Path` object
    return Path(out_dir) / filename


def _out_basename(graph_name, out_dir='datasets'):
    """Return the path ``<out_dir>/<graph_name>``, without any extensions

    Thin wrapper around `_savefile_name()` which turns off both the
    `<kind>` and the `<file_format>` components of the file name.
    """
    no_extensions = dict(kind=None, file_format=None)
    return _savefile_name(graph_name, out_dir=out_dir, **no_extensions)



<u>Test</u> that the examples from the docstring work:

In [None]:
# defaults: 'datasets' directory, 'df_edgelist' kind, 'csv.gz' file format
assert _savefile_name('example_graph') == Path('datasets/example_graph.df_edgelist.csv.gz')
# explicit kind and file format
assert _savefile_name('example_graph', kind='adjlist', file_format='txt') == Path('datasets/example_graph.adjlist.txt')
# explicit output directory
assert _savefile_name('example_graph', out_dir='data', kind='adjlist', file_format='txt') == Path('data/example_graph.adjlist.txt')

Save graph structure to a `DataFrame` (containing edge list information), and restore it.

In [None]:
#export
def graph_to_dataframe(graph):
    """Convert graph structure to a `DataFrame` holding its edge list

    Only edges are stored, which means that nodes that are not
    connected to anything are absent from the result.

    Parameters
    ----------
    graph : networkx graph
        Graph whose edges are to be stored.

    Returns
    -------
    pandas.DataFrame
        Edge list of the `graph`, as created by `nx.to_pandas_edgelist()`.
    """
    edgelist_df = nx.to_pandas_edgelist(graph)
    return edgelist_df


def dataframe_to_graph(df):
    """Restore a directed graph from a `DataFrame` holding its edge list

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list, for example the result of `graph_to_dataframe()`.

    Returns
    -------
    networkx.DiGraph
        Directed graph built by `nx.from_pandas_edgelist()` from `df`.
    """
    restored = nx.from_pandas_edgelist(df, create_using=nx.DiGraph)
    return restored



<u>Test</u> that saving to the `DataFrame` and restoring from it works correctly... up to nodes that are not connected: those cannot be stored using only edge list data, though with the addition of a node list it should be possible to restore the graph exactly:

In [None]:
# graph -> DataFrame: there should be one row per edge
example_graph_df = graph_to_dataframe(example_graph)
assert len(example_graph_df.index) == example_graph.number_of_edges()
print('ok - number of rows in dataframe with edgelist {} matches number of edges {}'.
      format(len(example_graph_df.index), example_graph.number_of_edges()))
# DataFrame -> graph: the same edges should come back
restored_graph = dataframe_to_graph(example_graph_df)
assert restored_graph.number_of_edges() == example_graph.number_of_edges()
print('ok - number of edges {} in restored graph matches number of edges {} in the original'.
      format(restored_graph.number_of_edges(), example_graph.number_of_edges()))
assert restored_graph.edges == example_graph.edges
print('ok - all edges from the original graph got restored')
# only a subset check for nodes: not connected nodes are not in the edge list
assert set(restored_graph).issubset(example_graph)
print('ok - all restored nodes are in the original graph')

ok - number of rows in dataframe with edgelist 31 matches number of edges 31
ok - number of edges 31 in restored graph matches number of edges 31 in the original
ok - all edges from the original graph got restored
ok - all restored nodes are in the original graph


Save graph to a file (via `DataFrame` containing edgelist information), and restore it

In [None]:
#export
def save_graph_df(df, graph_name, datasets_dir='datasets', output_format='csv.gz', overwrite=False):
    """Save the `DataFrame` with the edge list of a graph to a file

    The name of the file is generated with `_savefile_name()`, using
    'df_edgelist' as the kind of data, and it is printed before saving.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list of the graph, e.g. the result of `graph_to_dataframe()`.

    graph_name : str
        Name of the graph, used as the first part of the file name.

    datasets_dir : str
        Directory to save the file in; it is created if it does not exist.
        Defaults to "datasets".

    output_format : str
        Format of the output file; only 'csv' and 'csv.gz' are supported.
        Defaults to 'csv.gz'.

    overwrite : bool
        Whether to overwrite an already existing file.  With the default
        value of `False`, an existing file is left as it is, and the
        function returns without even checking the `output_format`.

    Raises
    ------
    NotImplementedError
        If the file is to be written and `output_format` is not supported.
    """
    filename = _savefile_name(graph_name, out_dir=datasets_dir,
                              kind='df_edgelist', file_format=output_format)
    print('-> filename:', filename)
    # `_savefile_name()` already returns a `Path`, there is no need to wrap it
    if not overwrite and filename.is_file():
        return
    if output_format in ('csv', 'csv.gz'):
        # `to_csv()` does not create missing directories; without this
        # the first save on a fresh checkout (no 'datasets/' yet) fails
        filename.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filename)
    else:
        raise NotImplementedError("Writing to '{}' format is not supported".format(output_format))


def save_graph(graph, graph_name=None, datasets_dir='datasets', output_format='csv.gz', overwrite=False):
    """Save the structure of a graph to a file, as an edge list `DataFrame`

    Parameters
    ----------
    graph : networkx graph
        Graph to save; it is converted with `graph_to_dataframe()`.

    graph_name : str, optional
        Name of the graph, used to generate the name of the file.
        If not given, the `name` attribute of the `graph` is used.

    datasets_dir : str
        Directory to save the file in, passed to `save_graph_df()`.

    output_format : str
        Format of the output file, passed to `save_graph_df()`.

    overwrite : bool
        Whether to overwrite an existing file, passed to `save_graph_df()`.

    Raises
    ------
    RuntimeError
        If `graph_name` is not given, and the `graph` either has no `name`
        attribute, or its `name` is empty.
    """
    # if `graph_name` is not given, fall back to the `name` attribute of the `graph`
    if graph_name is None:
        # NOTE: "'name' in graph" checks if there is node named 'name' in the graph.
        # Checking with `hasattr()` is not enough either: the attribute may exist
        # but be empty, and then the file would get saved under the hidden name
        # '.df_edgelist.<format>'
        graph_name = getattr(graph, 'name', None)
        if not graph_name:
            raise RuntimeError("Neither 'graph_name' parameter given, nor 'graph' has non-empty 'name' attribute")

    print('-> graph_name:', graph_name)
    # convert only after the name is known, to fail before doing any work
    df = graph_to_dataframe(graph)
    save_graph_df(df, graph_name,
                  datasets_dir=datasets_dir, output_format=output_format, overwrite=overwrite)


def guess_format(filename):
    """Guess the format of a file from the extension(s) in its name

    The format is the last extension, without the leading dot.  If the
    last extension is '.gz' or '.txt', then the extension before it
    (if there is one) is included, giving for example 'csv.gz'
    or 'adjlist.txt'.

    Examples
    --------
    >>> guess_format('datasets/example_graph.df_edgelist.csv.gz')
    'csv.gz'
    >>> guess_format('datasets/example_graph.adjlist.txt')
    'adjlist.txt'
    >>> guess_format('notes.txt')
    'txt'

    Parameters
    ----------
    filename : str or path-like
        Name of the file, or a path to the file.

    Returns
    -------
    str
        The guessed format, or an empty string if the file name
        has no extension at all.
    """
    suffixes = PurePath(filename).suffixes
    # no extension, nothing to guess from (and `suffixes[-1]` would fail)
    if not suffixes:
        return ''
    file_format = suffixes[-1]
    # '.gz' and '.txt' only qualify the extension before them; there may
    # be no such extension though, e.g. for 'notes.txt'
    if file_format in ('.gz', '.txt') and len(suffixes) > 1:
        file_format = suffixes[-2] + file_format
    # drop the leading dot
    return file_format[1:]


def load_graph_df_from_file(filename, input_format='csv.gz'):
    """Read the `DataFrame` with the edge list of a graph from a file

    Parameters
    ----------
    filename : str or path-like
        The file to read from.

    input_format : str or None
        Format of the file; only 'csv' and 'csv.gz' are supported.
        Defaults to 'csv.gz'.  If `None`, the format is guessed from
        the `filename` with `guess_format()`.

    Returns
    -------
    pandas.DataFrame
        Data read from the file, with the first column used as the index.

    Raises
    ------
    NotImplementedError
        If the format of the file is not supported.
    """
    fmt = guess_format(filename) if input_format is None else input_format
    # bail out early on formats that cannot be read
    if fmt not in ('csv', 'csv.gz'):
        raise NotImplementedError("Reading from '{}' format is not supported".format(fmt))
    return pd.read_csv(filename, index_col=0)


def load_graph_df(graph_name, datasets_dir='datasets', input_format='csv.gz'):
    """Read the edge list `DataFrame` of the named graph from its file

    The name of the file is generated with `_savefile_name()`, using
    'df_edgelist' as the kind of data, and it is printed before reading.

    Parameters
    ----------
    graph_name : str
        Name of the graph, used as the first part of the file name.

    datasets_dir : str
        Directory to look for the file in.  Defaults to "datasets".

    input_format : str
        Format of the file; it is used both as the extension in the file
        name and for reading the file.  Defaults to 'csv.gz'.

    Returns
    -------
    pandas.DataFrame
        The result of `load_graph_df_from_file()` for the generated file name.
    """
    naming = dict(out_dir=datasets_dir, kind='df_edgelist', file_format=input_format)
    path = _savefile_name(graph_name, **naming)
    print('<- filename:', path)
    return load_graph_df_from_file(path, input_format=input_format)



<u>Test</u> guessing file format from file name

In [None]:
# '.gz' is combined with the extension before it
assert guess_format('datasets/hellogitworld-commit_graph.df_edgelist.csv.gz') == 'csv.gz'
# other extensions are used alone
assert guess_format('datasets/hellogitworld-commit_graph.df_edgelist.csv') == 'csv'
# '.txt' is combined with the extension before it, like '.gz'
assert guess_format('datasets/hellogitworld-commit_graph.adjlist.txt') == 'adjlist.txt'

<u>Test</u> saving graph structure (via `DataFrame`) to a file, and restoring / reading such `DataFrame`.

In [None]:
print('graph.name = {}'.format(example_graph.name))
print('testing save_graph()')
# the name is passed explicitly: the recorded output of this cell shows that
# `example_graph.name` is empty, in which case the graph would be saved as
# a hidden '.df_edgelist.csv.gz' file, which the glob below does not match
save_graph(example_graph, graph_name=example_graph_name)
print('testing save_graph_df()')
save_graph_df(example_graph_df, graph_name=example_graph_name)
# check for the file, instead of relying only on eyeballing the list below
assert _savefile_name(example_graph_name).is_file()
print('there should be appropriately named file in the list below:')
["{name:<50} {size:>7}".format(name=p.name,size=p.stat().st_size) for p in Path("datasets").glob(example_graph_name+"*")]

graph.name = 
testing save_graph()
-> graph_name: 
-> filename: datasets\.df_edgelist.csv.gz
testing save_graph_df()
-> filename: datasets\commit_graph.df_edgelist.csv.gz
there should be appropriately named file in the list below:


['commit_graph.df_edgelist.csv.gz                        224']

In [None]:
# read back the file written by the save test above
print('restoring graph named "{}"'.format(example_graph_name))
df = load_graph_df(example_graph_name)
# the edge list should survive the round trip through the file unchanged
assert example_graph_df.equals(df)
print('ok - dataframe and restored dataframe are equal')

restoring graph named "commit_graph"
<- filename: datasets\commit_graph.df_edgelist.csv.gz
ok - dataframe and restored dataframe are equal


### Functions for computing reachability labels, saving them to a file (as `DataFrame`), and restoring them

Compute levels and min-post intervals for a graph, and store them as attributes of the graph object

In [None]:
#export
def compute_reachability_labels(graph, recompute=False):
    """Compute reachability labels and store them as attributes of the graph

    The result of `find_levels()` is stored in the `lvl` attribute, and
    the result of `find_dfs_intervals_extra()` in the `mpi_ext` attribute.
    An attribute that is already present is left alone, unless
    `recompute` is set.

    Parameters
    ----------
    graph : networkx graph
        Graph to compute the labels for; it is modified in place.

    recompute : bool
        Whether to compute the labels even if the graph already has them.
        Defaults to `False`.

    Returns
    -------
    graph
        The same `graph` object that was passed in.
    """
    have_levels = (not recompute) and hasattr(graph, 'lvl')
    if not have_levels:
        graph.lvl = find_levels(graph)

    have_intervals = (not recompute) and hasattr(graph, 'mpi_ext')
    if not have_intervals:
        graph.mpi_ext = find_dfs_intervals_extra(graph)

    return graph



<u>Test</u> computing reachability labels and saving them as attributes of the graph object

In [None]:
print('compute reachability labels for {}'.format(example_graph_name))
compute_reachability_labels(example_graph)

# list the public attributes of the graph object that hold a dict
print('we should see {} and {} among dict-values public attributes'.format('lvl', 'mpi_ext'))
for (attr, val) in example_graph.__dict__.items():
    if isinstance(val, dict) and not attr.startswith('_'):
        print('- {:s}'.format(attr))
assert hasattr(example_graph, 'lvl')
assert hasattr(example_graph, 'mpi_ext')
print('ok - graph has both "{}" and "{}" attributes'.format('lvl', 'mpi_ext'))

# both labellings are keyed by node, and should cover every node of the graph
print('reachability labels should be computed for all nodes')
assert set(example_graph.lvl.keys()) == set(example_graph.nodes)
assert set(example_graph.mpi_ext.keys()) == set(example_graph.nodes)
print('ok - both lvl and mpi_ext keys are all {} graph nodes'.format(len(example_graph.nodes)))

compute reachability labels for commit_graph
we should see lvl and mpi_ext among dict-values public attributes
- graph
- pos
- lvl
- mpi_ext
ok - graph has both "lvl" and "mpi_ext" attributes
reachability labels should be computed for all nodes
ok - both lvl and mpi_ext keys are all 23 graph nodes


Store reachability labels and per-node information in a `DataFrame`

In [None]:
#export
def graph_data_to_dataframe(graph, append_to=None):
    """Gather reachability labels and other per-node data in a `DataFrame`

    Reachability labels are computed with `compute_reachability_labels()`,
    if the `graph` does not have them yet.

    Parameters
    ----------
    graph : networkx graph
        Graph to gather the per-node data for.

    append_to : pandas.DataFrame, optional
        If given, the columns with the gathered data are added to the right
        of the columns of this `DataFrame`; only the rows (nodes) present
        in both are kept.

    Returns
    -------
    pandas.DataFrame
        `DataFrame` indexed by node (the index is named 'node'), with
        columns 'f_min', 'min', 'post' (taken from `graph.mpi_ext`),
        'level' (taken from `graph.lvl`), 'in degree', 'out degree',
        and 'degree' (the sum of the previous two), after the columns
        of `append_to`, if any.
    """
    compute_reachability_labels(graph)

    # create the DataFrame and name its index
    df = pd.DataFrame.from_dict(graph.mpi_ext, orient='index', columns=['f_min', 'min', 'post'])
    df.index.name = 'node'
    # add other reachability labels
    df['level'] = pd.Series(graph.lvl)
    # add and compute other data
    df['in degree'] = pd.Series(dict(graph.in_degree()))
    df['out degree'] = pd.Series(dict(graph.out_degree()))
    df['degree'] = df['in degree'] + df['out degree']

    # append if needed
    # NOTE: this must be compared with None, because the truth value of
    # a `DataFrame` is ambiguous and `if append_to:` raises ValueError
    if append_to is not None:
        df = pd.concat([append_to, df], axis=1, join='inner')

    return df



<u>Test</u> computing all per-node data for a graph and storing them in `DataFrame`

In [None]:
# NOTE: this reuses the name `df`, which held the edge list in an earlier cell
df = graph_data_to_dataframe(example_graph)

# sets are compared, because the order of columns does not matter here
print('check that the dataframe has all the columns')
assert set(df.columns) == set(['level', 'f_min', 'min', 'post', 'in degree', 'out degree', 'degree'])
print('- columns: {}'.format(df.columns.tolist()))

# there should be one row for each node of the graph
print('check that the dataframe has all the rows')
assert set(df.index) == set(example_graph.nodes)
print('- rows:    {}...'.format(list(example_graph.nodes)[0:5]))

# display the first few rows as the output of the cell
df.head()

check that the dataframe has all the columns
- columns: ['f_min', 'min', 'post', 'level', 'in degree', 'out degree', 'degree']
check that the dataframe has all the rows
- rows:    ['A', 'a7', 'a5', 'a4', 'a3']...


Unnamed: 0_level_0,f_min,min,post,level,in degree,out degree,degree
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
b0,1,1,1,0,3,0,3
a1,1,1,2,1,2,1,3
a2,1,1,3,2,1,1,2
a3,1,1,4,3,2,1,3
a4,1,1,5,4,1,1,2


Compute and save graph structure, its reachability labels and other per-node info to a file, and restore it.  
Don't redo calculations that can be retrieved from a file.

**TODO**

----

In [None]:
#hide
# this should be the last cell of the notebook
# it converts the notebooks of the project, see the 'Converted ...' lines
# in the output of this cell
from nbdev.export import notebook2script
notebook2script()

Converted 01_tools.ipynb.
Converted 02_related.ipynb.
Converted 03_example_graphs.ipynb.
Converted 05_reachability_index.ipynb.
Converted 06_levels.ipynb.
Converted 07_interval_labels.ipynb.
Converted 08_reach.ipynb.
Converted 09_git.ipynb.
Converted 10_checkpoint.ipynb.
Converted A.09_git_explore.ipynb.
Converted index.ipynb.
