In [None]:
#default_exp checkpoint

In [None]:
#hide_output
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Checkpointing
> Saving and restoring graphs, reachability labels, evaluation results, etc.

In [None]:
#export
# creating graphs in Python
import networkx as nx
# filesystem paths (names of files to save to and load from)
from pathlib import Path
# data analysis and manipulation
import pandas as pd
# reachability labels
from git_commit_graph_ext.labelling.levels import find_levels
from git_commit_graph_ext.labelling.dfs_intervals import find_dfs_intervals, find_dfs_intervals_extra

### Functions for computing reachability labels, and saving them and graph to a file (TODO)
> DOING: move those to `10_checkpoint.ipynb` (currently not existing)

In [None]:
#temporary imports
from git_commit_graph_ext.commit_graph import commit_graph

In [None]:
# test fixture: a small public repository, shared by the test cells below
repo_url = "https://github.com/githubtraining/hellogitworld"
repo_name = "hellogitworld.git"
repo_path = "repos/hellogitworld.git"
#rmtree(repo_path, ignore_errors=True)

# get commit graph of remote repository, without re-cloning or re-computing
repo_graph = commit_graph(repo_url, repo_name)

# check that it worked: report the size of the graph
fixture_node_count = repo_graph.number_of_nodes()
fixture_edge_count = repo_graph.number_of_edges()
print("the commit graph of {} repository has {:d} nodes and {:d} edges".
      format(repo_name, fixture_node_count, fixture_edge_count))

the commit graph of hellogitworld.git repository has 55 nodes and 53 edges


Helper function to get basename for saving graph and/or graph data to a file (as a dataset).

In [None]:
#export
def _repo_basename(repo_path):
    """Create name of repository out of its pathname

    This is a helper function used, among others, in ...

    Examples:
    ---------
    >>>> _repo_basename('hellogitworld')
    'hellogitworld'
    >>>> _repo_basename('repos/hellogitworld.git')
    'hellogitworld'

    Parameters
    ----------
    repo_path : str
        Path to the Git repository, for example 'hellogitworld',
        or 'hellogitworld.git', or 'repos/hellogitworld.git', or 'repos/hellogitworld/.git'
        for Git repository cloned from <https://github.com/githubtraining/hellogitworld>
    
    Returns
    -------
    str
        The name of repository, to be later used as a base for commit graph name,
        and the pathname of a file where to save information about thw history
        structure of the repository.
    """
    # generate the name of the output file, as `pathlib.Path` object
    # removing the '*.git' extension / suffix from `repo_path`, if needed
    out_pathname = Path(repo_name).stem

    # convert it to string
    return str(out_pathname)


def _commit_graph_name(repo_name):
    """Create the name of a commit graph out of repository name.
    
    This is a helper function used, among others, in ...

    Examples:
    ---------
    >>>> _commit_graph_name('hellogitworld')
    'hellogitworld-commit_graph'
    
    Parameters
    ----------
    repo_name : str
        The name of the repository, for example the result of calling the
        `_repo_basename()` function.
    
    Returns
    -------
    str
        The name of the commit graph of the repository, to be used as a base
        for the pathname of a file where to save information about the history
        structure of the repository.
    """
    return repo_name + '-commit_graph'


def _repo_graph_name(repo_path):
    """Derive the commit graph name directly from the path to a repository.

    Combines `_repo_basename()` (path -> repository name) with
    `_commit_graph_name()` (repository name -> graph name).
    Used by `_repo_graph_savefile()`.

    Parameters
    ----------
    repo_path : str
        Path to the Git repository (`<graph>.name` can be used), for example
        'hellogitworld', or 'hellogitworld.git', or 'repos/hellogitworld.git',
        or 'repos/hellogitworld/.git' for Git repository cloned from
        <https://github.com/githubtraining/hellogitworld>

    Returns
    -------
    str
        The name of the commit graph of the repository, to be used as a base
        for the pathname of a file where to save information about the history
        structure of the repository.

    Examples
    --------
    >>> _repo_graph_name('repos/hellogitworld.git')
    'hellogitworld-commit_graph'
    """
    basename = _repo_basename(repo_path)
    return _commit_graph_name(basename)



<u>Test</u> that the examples from the docstrings work:

In [None]:
# every supported spelling of the repository path should give the same name
for path_variant in ('hellogitworld',
                     'hellogitworld.git',
                     'repos/hellogitworld.git',
                     'repos/hellogitworld/.git'):
    assert _repo_basename(path_variant) == 'hellogitworld'

In [None]:
# the graph name is the repository name plus the '-commit_graph' suffix
expected_graph_name = 'hellogitworld-commit_graph'
assert _commit_graph_name('hellogitworld') == expected_graph_name

In [None]:
# going from the repository path straight to the name of its commit graph
expected_repo_graph_name = 'hellogitworld-commit_graph'
assert _repo_graph_name('repos/hellogitworld.git') == expected_repo_graph_name

In [None]:
#export
def _savefile_name(graph_name, out_dir='datasets', kind='df_edgelist', file_format='csv.gz'):
    """Create filename for storing graph structure and other graph data
    
    This is a helper function used, among others, in ...
    
    Examples:
    ---------
    >>>> _savefile_name('example_graph')
    Path('datasets/example_graph.df_edgelist.csv.gz')
    
    Parameters
    ----------
    graph_name : str
        Name of the graph (`<graph>.name` can be used).
        
    out_dir : str
        Directory where saved commit graph data would be stored.
        Defaults to "datasets".
        
    kind : str
        What type of data is stored in a file, and in what representation.
        The default value is 'df_edgelist', used to store graph structure in
        the edge list format in a `pandas.DataFrame`.
        
    file_format : str
        Format of a file, for example how the `DataFrame` is saved.
        Defaults to 'csv.gz' (gzip-compressed Comma Separated Values).
        
    Returns
    -------
    Path
        Path to the file storing the graph structure or graph data in
        the appropriate representation and appropriate file format.
    """
    # The `out_dir` should not be None
    if out_dir is None:
        out_dir = "."

    # compose the basename of the pathname
    filename = graph_name
    # TODO: there would special case for saving to HDF5 files, which can
    # store multiple data in a single file, so there would be no need
    # to add <kind> to basename of such output file
    if kind is not None and kind != '':
        filename += '.' + kind
    if file_format is not None and file_format != '':
        filename += '.' + file_format
    # generate the name of the output file, as `pathlib.Path` object
    return Path(out_dir) / filename


def _out_basename(graph_name, out_dir='datasets'):
    """Create the path `<out_dir>/<graph_name>`, without kind or file format

    Delegates to `_savefile_name()` with both the `kind` and the `file_format`
    components turned off.

    Parameters
    ----------
    graph_name : str
        Name of the graph.

    out_dir : str
        Directory where saved data would be stored.  Defaults to "datasets".

    Returns
    -------
    Path
        The common base of pathnames of files with data for given graph.
    """
    no_component = None
    return _savefile_name(graph_name, out_dir=out_dir,
                          kind=no_component, file_format=no_component)



In [None]:
#export
def _repo_graph_savefile(repo_path, out_dir='datasets'):
    """Compose the adjacency list file name for the commit graph of a repository

    The graph name comes from `_repo_graph_name()`, and the path is built by
    `_savefile_name()` using 'adjlist' as kind and 'txt' as file format.

    Parameters
    ----------
    repo_path : str
        Path to the Git repository, for example 'hellogitworld',
        or 'hellogitworld.git', or 'repos/hellogitworld.git', or 'repos/hellogitworld/.git'
        for Git repository cloned from <https://github.com/githubtraining/hellogitworld>

    out_dir : str
        Directory where extracted commit graph data would be stored.
        Defaults to "datasets".

    Returns
    -------
    Path
        Path to the file storing the commit graph in the adjacency list
        file format.

        see: https://networkx.org/documentation/stable/reference/readwrite/adjlist.html

    Examples
    --------
    >>> _repo_graph_savefile('repos/hellogitworld.git').as_posix()
    'datasets/hellogitworld-commit_graph.adjlist.txt'
    >>> _repo_graph_savefile('repos/hellogitworld.git', out_dir='data').as_posix()
    'data/hellogitworld-commit_graph.adjlist.txt'
    """
    return _savefile_name(_repo_graph_name(repo_path), out_dir=out_dir,
                          kind='adjlist', file_format='txt')

<u>Test</u> that the examples from the docstrings work:

In [None]:
# pairs of (computed path, expected path written with forward slashes)
savefile_examples = [
    (_savefile_name('example_graph'),
     'datasets/example_graph.df_edgelist.csv.gz'),
    (_repo_graph_savefile('repos/hellogitworld.git'),
     'datasets/hellogitworld-commit_graph.adjlist.txt'),
    (_repo_graph_savefile('repos/hellogitworld.git', out_dir='data'),
     'data/hellogitworld-commit_graph.adjlist.txt'),
]
for computed_path, expected_path in savefile_examples:
    assert computed_path == Path(expected_path)

Convert the graph to a `DataFrame` (containing edge list information), and restore the graph from it.

In [None]:
#export
def graph_to_dataframe(graph):
    """Convert the structure of the graph to an edge list `DataFrame`

    Thin wrapper around `networkx.to_pandas_edgelist()`, called with
    its default settings.

    NOTE: only edges are stored; nodes that are not an endpoint of any edge
    are not represented in the result.

    Parameters
    ----------
    graph : nx.Graph or nx.DiGraph
        The graph to convert.

    Returns
    -------
    pd.DataFrame
        Edge list representation of the graph, one row per edge.
    """
    edgelist_df = nx.to_pandas_edgelist(graph)
    return edgelist_df


def dataframe_to_graph(df):
    """Restore a directed graph from its edge list `DataFrame`

    Thin wrapper around `networkx.from_pandas_edgelist()`; the result
    is always created as `nx.DiGraph`.

    Parameters
    ----------
    df : pd.DataFrame
        Edge list representation of the graph, for example the result
        of calling `graph_to_dataframe()`.

    Returns
    -------
    nx.DiGraph
        Directed graph built from the edges listed in `df`.
    """
    restored = nx.from_pandas_edgelist(df, create_using=nx.DiGraph)
    return restored



<u>Test</u> that saving to the `DataFrame` and restoring from it works correctly, up to nodes that are not connected: those cannot be stored using only edge list data, though with the addition of a node list it should be possible to restore the graph exactly:

In [None]:
# graph -> DataFrame: one row per edge
repo_graph_df = graph_to_dataframe(repo_graph)
df_row_count = len(repo_graph_df.index)
original_edge_count = repo_graph.number_of_edges()
assert df_row_count == original_edge_count
print('ok - number of rows in dataframe with edgelist {} matches number of edges {}'.
      format(df_row_count, original_edge_count))

# DataFrame -> graph: the same edges, and no nodes that were not there before
restored_graph = dataframe_to_graph(repo_graph_df)
restored_edge_count = restored_graph.number_of_edges()
assert restored_edge_count == original_edge_count
print('ok - number of edges {} in restored graph matches number of edges {} in the original'.
      format(restored_edge_count, original_edge_count))
assert restored_graph.edges == repo_graph.edges
print('ok - all edges from the original graph got restored')
assert set(restored_graph).issubset(repo_graph)
print('ok - all restored nodes are in the original graph')

ok - number of rows in dataframe with edgelist 53 matches number of edges 53
ok - number of edges 53 in restored graph matches number of edges 53 in the original
ok - all edges from the original graph got restored
ok - all restored nodes are in the original graph


Save graph to a file (via `DataFrame` containing edgelist information), and restore it

In [None]:
#export
def save_graph_df(df, graph_name, datasets_dir='datasets', output_format='csv.gz', overwrite=False):
    """Save the edge list `DataFrame` of a graph to a file

    The name of the file is composed by `_savefile_name()` out of `graph_name`,
    `datasets_dir`, the 'df_edgelist' kind, and `output_format`.  The name of
    the file is printed, whether the file gets written or not.

    Parameters
    ----------
    df : pd.DataFrame
        Data to save, for example the result of `graph_to_dataframe()`.

    graph_name : str
        Name of the graph, used as a base of the file name.

    datasets_dir : str
        Directory to save the file in; it is created if it does not exist.
        Defaults to "datasets".

    output_format : str
        Format of the output file; only 'csv' and 'csv.gz' are supported.
        NOTE(review): gzip compression for 'csv.gz' relies on `DataFrame.to_csv()`
        inferring it from the file name extension (pandas default) - confirm.

    overwrite : bool
        If false (the default), an already existing file is left untouched.

    Raises
    ------
    NotImplementedError
        If the file needs to be written and `output_format` is not supported.
    """
    filename = _savefile_name(graph_name, out_dir=datasets_dir,
                              kind='df_edgelist', file_format=output_format)
    print('-> filename:', filename)
    # do not redo the work, unless explicitly asked to
    if not overwrite and Path(filename).is_file():
        return
    if output_format == 'csv' or output_format == 'csv.gz':
        # `DataFrame.to_csv()` does not create missing directories, and
        # would fail on the first save to a not yet existing `datasets_dir`
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filename)
    else:
        raise NotImplementedError("Writing to '{}' format is not supported".format(output_format))


def save_graph(graph, graph_name=None, datasets_dir='datasets', output_format='csv.gz', overwrite=False):
    """Save the structure of the graph to a file, via an edge list `DataFrame`

    The graph is converted with `graph_to_dataframe()`, and the result is
    written with `save_graph_df()`.  The name of the graph is printed.

    Parameters
    ----------
    graph : nx.Graph or nx.DiGraph
        The graph to save.

    graph_name : str, optional
        Name of the graph, used as a base of the file name.  If not given,
        the `name` attribute of the `graph` is used.

    datasets_dir : str
        Directory to save the file in.  Defaults to "datasets".

    output_format : str
        Format of the output file, see `save_graph_df()`.

    overwrite : bool
        If false (the default), an already existing file is left untouched.

    Raises
    ------
    RuntimeError
        If `graph_name` is not given, and the `graph` has no `name` attribute
        or its name is empty.
    """
    # if `graph_name` is not given, check the `name` attribute of the `graph`
    if graph_name is None:
        # NOTE: "'name' in graph" checks if there is node named 'name' in the graph
        # NOTE: networkx graphs always have the `name` attribute, which is
        # an empty string if the name was not set; an empty name would result
        # in a file name that starts with a dot, so treat it as missing, too
        graph_name = getattr(graph, 'name', None)
        if not graph_name:
            raise RuntimeError("Neither 'graph_name' parameter given, nor 'graph' has 'name' attribute")

    print('-> graph_name:', graph_name)
    # convert only after the name is known to be usable
    df = graph_to_dataframe(graph)
    save_graph_df(df, graph_name,
                  datasets_dir=datasets_dir, output_format=output_format, overwrite=overwrite)


def load_graph_df_from_file(filename, input_format='csv.gz'):
    """Read a `DataFrame` back from the given file

    Parameters
    ----------
    filename : str or Path
        The file to read from.

    input_format : str
        Format of the file; only 'csv' and 'csv.gz' are supported.
        Defaults to 'csv.gz'.

    Returns
    -------
    pd.DataFrame
        Contents of the file; the first column of the file is used
        as the index of the `DataFrame`.

    Raises
    ------
    NotImplementedError
        If `input_format` is not one of the supported formats.
    """
    # reject anything that is not a (possibly compressed) CSV up front
    if input_format != 'csv' and input_format != 'csv.gz':
        raise NotImplementedError("Reading from '{}' format is not supported".format(input_format))

    return pd.read_csv(filename, index_col=0)


def load_graph_df(graph_name, datasets_dir='datasets', input_format='csv.gz'):
    """Read the edge list `DataFrame` of the named graph from its file

    The name of the file is composed by `_savefile_name()`, the same way as
    in `save_graph_df()`, and is printed before reading.

    Parameters
    ----------
    graph_name : str
        Name of the graph, used as a base of the file name.

    datasets_dir : str
        Directory where the file is stored.  Defaults to "datasets".

    input_format : str
        Format of the file, see `load_graph_df_from_file()`.

    Returns
    -------
    pd.DataFrame
        The result of `load_graph_df_from_file()` for the composed file name.
    """
    source_file = _savefile_name(graph_name, out_dir=datasets_dir,
                                 kind='df_edgelist', file_format=input_format)
    print('<- filename:', source_file)
    loaded_df = load_graph_df_from_file(source_file, input_format=input_format)
    return loaded_df



<u>Test</u> saving graph structure (via `DataFrame`) to a file, and restoring / reading such `DataFrame`.

In [None]:
# show the names involved in composing the names of output files
for info_template, info_value in [
    ('repository path: {}', repo_path),
    ('saving commit graph of: {}', repo_name),
    ('graph.name = {}', repo_graph.name),
    ('commit graph name: {}', _repo_graph_name(repo_path)),
]:
    print(info_template.format(info_value))

# save via the graph object (named after `graph.name`) ...
print('testing save_graph()')
save_graph(repo_graph)
# ... and via its DataFrame (named after the repository path)
print('testing save_graph_df()')
save_graph_df(repo_graph_df, graph_name=_repo_graph_name(repo_path))

print('there should be appropriately named file in the list below:')
[
    "{name:<50} {size:>7}".format(name=saved_file.name, size=saved_file.stat().st_size)
    for saved_file in Path("datasets").glob("hellogitworld*")
]

repository path: repos/hellogitworld.git
saving commit graph of: hellogitworld.git
graph.name = hellogitworld.git
commit graph name: hellogitworld-commit_graph
testing save_graph()
-> graph_name: hellogitworld.git
-> filename: datasets\hellogitworld.git.df_edgelist.csv.gz
testing save_graph_df()
-> filename: datasets\hellogitworld-commit_graph.df_edgelist.csv.gz
there should be appropriately named file in the list below:


['hellogitworld-commit_graph.adjlist.txt                 868',
 'hellogitworld-commit_graph.df_edgelist.csv.gz          523',
 'hellogitworld.git.df_edgelist.csv.gz                   514']

In [None]:
# read back the file written by `save_graph(repo_graph)`, and compare
print('restoring commit graph of: {}'.format(repo_name))
restored_graph_df = load_graph_df(repo_name)
assert repo_graph_df.equals(restored_graph_df)
print('ok - dataset and restored dataset are equal')

restoring commit graph of: hellogitworld.git
<- filename: datasets\hellogitworld.git.df_edgelist.csv.gz
ok - dataset and restored dataset are equal


Compute levels and min-post intervals for a graph, and store them as attributes of the graph object

In [None]:
#export
def compute_reachability_labels(graph, recompute=False):
    """Compute reachability labels, and store them as attributes of the graph

    The result of `find_levels()` is stored as `graph.lvl`, and the result of
    `find_dfs_intervals_extra()` is stored as `graph.mpi_ext`.  Attributes
    that are already present are left as they are, unless `recompute` is set.

    Parameters
    ----------
    graph : nx.DiGraph
        The graph to compute the labels for; it is modified in place.

    recompute : bool
        Whether to compute the labels even if the graph already has them.
        Defaults to false.

    Returns
    -------
    nx.DiGraph
        The same `graph` object, with the `lvl` and `mpi_ext` attributes set.
    """
    levels_needed = recompute or not hasattr(graph, 'lvl')
    if levels_needed:
        graph.lvl = find_levels(graph)

    intervals_needed = recompute or not hasattr(graph, 'mpi_ext')
    if intervals_needed:
        graph.mpi_ext = find_dfs_intervals_extra(graph)

    return graph



<u>Test</u> computing reachability labels and saving them as attributes of the graph object

In [None]:
print('compute reachability labels for {} commit graph'.format(repo_name))
compute_reachability_labels(repo_graph)

# the labels are stored as dict-valued attributes of the graph object
label_attrs = ('lvl', 'mpi_ext')
print('we should see {} and {} among dict-values attributes'.format(*label_attrs))
public_dict_attrs = [
    attr_name for attr_name, attr_value in vars(repo_graph).items()
    if isinstance(attr_value, dict) and not attr_name.startswith('_')
]
for attr_name in public_dict_attrs:
    print('- {:s}'.format(attr_name))
for label_attr in label_attrs:
    assert hasattr(repo_graph, label_attr)
print('ok - graph has both "{}" and "{}" attributes'.format(*label_attrs))

# each kind of label should have an entry for every node of the graph
print('reachability labels should be computed for all nodes')
all_graph_nodes = set(repo_graph.nodes)
assert set(repo_graph.lvl.keys()) == all_graph_nodes
assert set(repo_graph.mpi_ext.keys()) == all_graph_nodes
print('ok - both lvl and mpi_ext keys are all {} graph nodes'.format(len(repo_graph.nodes)))

compute reachability labels for hellogitworld.git commit graph
we should see lvl and mpi_ext among dict-values attributes
- graph
- lvl
- mpi_ext
ok - graph has both "lvl" and "mpi_ext" attributes
reachability labels should be computed for all nodes
ok - both lvl and mpi_ext keys are all 55 graph nodes


Store reachability labels and per-node information in a `DataFrame`

In [None]:
#export
def graph_data_to_dataframe(graph, append_to=None):
    """Gather reachability labels and other per-node data in a `DataFrame`

    Reachability labels are computed with `compute_reachability_labels()`,
    if the graph does not have them yet.

    Parameters
    ----------
    graph : nx.DiGraph
        The graph to gather the data for; reachability labels are stored
        as its attributes as a side effect.

    append_to : pd.DataFrame, optional
        Per-node data to extend; if given, the columns of the new data are
        added after its columns, keeping only the nodes (index values)
        present in both.

    Returns
    -------
    pd.DataFrame
        Data indexed by node, with 'f_min', 'min', 'post' columns taken from
        `graph.mpi_ext`, 'level' from `graph.lvl`, and 'in degree',
        'out degree', 'degree' computed from the graph; preceded by
        the columns of `append_to`, if it was given.
    """
    compute_reachability_labels(graph)

    # create the DataFrame and name its index
    # NOTE(review): assumes that `graph.mpi_ext` maps each node to 3 values,
    # in the order of the column names below - confirm
    df = pd.DataFrame.from_dict(graph.mpi_ext, orient='index', columns=['f_min', 'min', 'post'])
    df.index.name = 'node'
    # add other reachability labels
    df['level'] = pd.Series(graph.lvl)
    # add and compute other data
    df['in degree'] = pd.Series(dict(graph.in_degree()))
    df['out degree'] = pd.Series(dict(graph.out_degree()))
    df['degree'] = df['in degree'] + df['out degree']

    # append if needed
    # NOTE: the truth value of a `DataFrame` is ambiguous, and checking it
    # raises ValueError, therefore explicitly compare against `None`
    if append_to is not None:
        df = pd.concat([append_to, df], axis=1, join='inner')

    return df



<u>Test</u> computing all per-node data for a graph and storing them in `DataFrame`

In [None]:
node_data_df = graph_data_to_dataframe(repo_graph)

# one column per reachability label component and per degree kind
print('check that the dataframe has all the columns')
expected_columns = {'level', 'f_min', 'min', 'post', 'in degree', 'out degree', 'degree'}
assert set(node_data_df.columns) == expected_columns
print('- columns: {}'.format(node_data_df.columns.tolist()))

# one row per node of the graph
print('check that the dataframe has all the rows')
assert set(node_data_df.index) == set(repo_graph.nodes)
print('- rows:    {}...'.format(list(repo_graph.nodes)[0:5]))

node_data_df.head()

check that the dataframe has all the columns
- columns: ['f_min', 'min', 'post', 'level', 'in degree', 'out degree', 'degree']
check that the dataframe has all the rows
- rows:    ['ef7bebf', 'ebbbf77', '45a30ea', '9805760', '435ffce']...


Unnamed: 0_level_0,f_min,min,post,level,in degree,out degree,degree
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
755fd57,1,1,1,0,1,0,1
32c2737,1,1,2,1,1,1,2
dc64fe4,1,1,3,2,2,1,3
9cdb160,1,1,4,3,1,1,2
f5ed019,1,1,5,4,1,1,2


Compute and save graph structure, its reachability labels and other per-node info to a file, and restore it.  
Don't redo calculations that can be retrieved from a file.

**TODO**

----

In [None]:
#hide
# this should be the last cell of the notebook
# Runs nbdev's notebook-to-script conversion; judging by the output below
# ("Converted <notebook>.ipynb."), it processes all notebooks of the project.
from nbdev.export import notebook2script
notebook2script()

Converted 01_tools.ipynb.
Converted 02_related.ipynb.
Converted 03_example_graphs.ipynb.
Converted 05_reachability_index.ipynb.
Converted 06_levels.ipynb.
Converted 07_interval_labels.ipynb.
Converted 08_reach.ipynb.
Converted 09_git.ipynb.
Converted 10_checkpoint.ipynb.
Converted A.09_git_explore.ipynb.
Converted index.ipynb.
