In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src import paths
from src.utils import save_json, load_json

In [3]:
cp  = paths['catalog_path']

In [4]:
import doctest

## Dataset DAG Spec

A transformer function takes in `input_datasets` and produces `output_datasets`.
Edges can be thought of as directed, indicating a dependency. e.g. `output_datasets` depend on `input_datasets`


```
{
    "hyperedge_1": {
        "input_datasets":[],
        "output_datasets":[],
        transformations: [
            (function_1, kwargs_dict_1 ),
            (function_2, kwargs_dict_2 ),
            ...
        ],
        "suppress_output": False,  # defaults to True
    },
    "sink_edge": {
        "datasource_name": "ds_name",
        "datasource_opts": {},
        "output_dataset: "ds_name",
    }
}
```



## Questions

* Are Sink edges special?
  Right now they define a 1-1 map between a datasource and a dataset. (i.e. these are not hyperedges).

* How to the list of transformers work with many in many out. Take/emit a dict of all of them?

* How to implement traversal. (Apply_transforms should take a dataset name - optionally, and give me an ordered list of what to run)

* add force flag to add_transformer, in case you are overwriting a key

* should it be input_dataset or input_datasets? The most common use case is probably the former
```


In [5]:
def normalize_to_list(str_or_iterable):
    """Convert strings to lists. Pass lists (or None) unchanged.
    """
    if isinstance(str_or_iterable, str):
        return [str_or_iterable]
    if str_or_iterable is None:
        return []
    return str_or_iterable

def get_transformer_dag(
        dag_path=None,
        dag_file=None,
        include_filename=False,
    ):
    """Get the list of transformation pipelines

    Returns
    -------
    If include_filename is True:
        A tuple: (transformer_dag, dag_file_fq)
    else:
        transformer_dag

    Parameters
    ----------
    include_filename: boolean
        if True, returns a tuple: (list, filename)
    dag_path: path. (default: MODULE_DIR/data)
        Location of `dag_file`
    dag_file: string, default 'transformer_dag.json'
        Name of json file that contains the transformer pipeline
    """
    if dag_path is None:
        dag_path = paths['catalog_path']
    else:
        dag_path = pathlib.Path(dag_path)
    if dag_file is None:
        dag_file = 'transformer_dag.json'

    dag_file_fq = dag_path / dag_file
    try:
        transformer_dag = load_json(dag_file_fq)
    except FileNotFoundError:
        transformer_dag = {}
        
    if not isinstance(transformer_dag, dict):
        raise Exception("Obsolete file format: transformer_dag must be a dict")

    if include_filename:
        return transformer_dag, dag_file_fq
    return transformer_dag


def add_transformer(
    name=None,
    datasource_name=None,
    datasource_opts=None,
    input_datasets=None,
    suppress_output=False,
    output_datasets=None,
    transformations=None,
    dag_path=None,
    dag_file=None,
    write_to_catalog=True,
    ):
    """Create and add a dataset transformation pipeline to the workflow.

    Transformer pipelines apply a sequence of transformer functions to a Dataset (or DataSource),
    to produce new Dataset objects.

    Parameters
    ----------
    name: string
        Name for this transformer instance (must be unique).
        By default, one will be created from the input and output dataset names; e.g.
        _input_ds1_input_ds2_to_output_ds1
    input_datasets: string or iterable
        Upstream data dependencies. These must be present
    output_datasets: string or Iterable
        These datasets will be generated
    datasource_name: string
        Name of a DataSource to use to generate the output
        Setting this option will create a source node in the dataset flow graph
        (or a sink node in the data dependency graph).
        Transformers of this type must specify at most one entry in `output_datasets`
    datasource_opts: dict
        Options to use when generating a Dataset from this DataSource
    suppress_output: boolean
        If True, the terminal dataset object is not written to disk.
        This is useful when one of the intervening tranformers handles the writing; e.g. train/test split.
    transformations: list of tuples
        Squence of transformer functions to apply. tuples consist of:
        (transformer_name, transformer_opts)
    dag_path: path. (default: paths['catalog_path'])
        Location of `dag_file`
    dag_file: string, default 'transformer_dag.json'
        Name of json file that contains the transformer pipeline
    write_to_catalog: Boolean, Default True
        If False, don't actually write this entry to the catalog.
    Examples
    --------
    
    If you only have one input or output, it may be specified simply as a string;
    i.e. these are identical
    >>> add_transformer(input_datasets='other', output_datasets='p_other', write_to_catalog=False)
    {'_p_other': {'input_datasets': ['other'], 'output_datasets': ['p_other']}}
    >>> add_transformer(input_datasets=['other'], output_datasets='p_other', write_to_catalog=False)
    {'_p_other': {'input_datasets': ['other'], 'output_datasets': ['p_other']}}
    
    >>> add_transformer(input_datasets=['cc-by', 'cc-by-nc'], output_datasets='cc', write_to_catalog=False)
    {'_cc': {'input_datasets': ['cc-by', 'cc-by-nc'], 'output_datasets': ['cc']}}
    >>> add_transformer(input_datasets=['cc-by', 'cc-by-nc'], output_datasets='cc', write_to_catalog=False)
    {'_cc': {'input_datasets': ['cc-by', 'cc-by-nc'], 'output_datasets': ['cc']}}
    
    Names can be given explicitly:
    
    >>> add_transformer(input_datasets=['cc'], output_datasets=['cc_train','cc_test'], write_to_catalog=False)
    {'_cc_train_cc_test': {'input_datasets': ['cc'], 'output_datasets': ['cc_train', 'cc_test']}}
    >>> add_transformer(input_datasets=['cc'], output_datasets=['cc_train','cc_test'], name='tts', write_to_catalog=False)
    {'tts': {'input_datasets': ['cc'], 'output_datasets': ['cc_train', 'cc_test']}}
    
    
    Invalid use cases:
    
    >>> add_transformer(datasource_name="foo", output_datasets=['bar', 'baz'])
    Traceback (most recent call last):
    ...
    Exception: Edges from data sources must have only one output_dataset.

    >>> add_transformer(datasource_name="foo", input_datasets='bar')
    Traceback (most recent call last):
    ...
    Exception: Cannot set both `datasource_name` and `input_datasets`

    >>> add_transformer(datasource_opts={'foo':'bar'})
    Traceback (most recent call last):
    ...
    Exception: Must specify `datasource_name` when using `datasource_opts`
        
    >>> add_transformer(output_datasets="foo")
    Traceback (most recent call last):
    ...
    Exception: Must specify one of from `datasource_name` or `input_datasets`
    
    >>> add_transformer(input_datasets="foo")
    Traceback (most recent call last):
    ...
    Exception: Must specify `output_dataset` (or use `suppress_output`)
    """
    input_datasets = normalize_to_list(input_datasets)
    output_datasets = normalize_to_list(output_datasets)

    if datasource_name is not None:
        if input_datasets:
            raise Exception('Cannot set both `datasource_name` and `input_datasets`')
        if output_datasets is not None and len(output_datasets) > 1:
            raise Exception("Edges from data sources must have only one output_dataset.")
    if datasource_name is None and datasource_opts is not None:
        raise Exception('Must specify `datasource_name` when using `datasource_opts`')

    if write_to_catalog:
        ds_dag, ds_dag_fq = get_transformer_dag(dag_path=dag_path,
                                                dag_file=dag_file,
                                                include_filename=True)
    transformer = {}
    if datasource_name:
        transformer['datasource_name'] = datasource_name
        if not output_datasets and not suppress_output:
            output_datasets = [datasource_name]
    elif input_datasets:
        transformer['input_datasets'] = input_datasets
    else:
        raise Exception("Must specify one of from `datasource_name` or `input_datasets`")

    if datasource_opts:
        transformer['datasource_opts'] = datasource_opts

    if transformations:
        transformer['transformations'] = transformations

    if not suppress_output:
        if not output_datasets:
            raise Exception("Must specify `output_dataset` (or use `suppress_output`)")
        else:
            transformer['output_datasets'] = output_datasets

    if name is None:
        name = f"_{'_'.join([ids for ids in output_datasets])}"
        
    if write_to_catalog:
        ds_dag[name] = transformer
        save_json(ds_dag_fq, ds_dag)
    return {name:transformer}


In [6]:
doctest.testmod()

TestResults(failed=0, attempted=11)

In [188]:
for source in ['cc-by', 'cc-by-nc', 'other']:
    add_transformer(datasource_name=source)

In [189]:
add_transformer(input_datasets=['cc-by', 'cc-by-nc'], output_datasets=['cc'], transformations=[('merge', {})])

{'_cc': {'input_datasets': ['cc-by', 'cc-by-nc'],
  'transformations': [('merge', {})],
  'output_datasets': ['cc']}}

In [190]:
add_transformer(input_datasets='cc', output_datasets='p_cc', transformations=[('process',{})])

{'_p_cc': {'input_datasets': ['cc'],
  'transformations': [('process', {})],
  'output_datasets': ['p_cc']}}

In [191]:
add_transformer(input_datasets='p_cc', output_datasets=['p_cc_train', 'p_cc_test'], transformations=[('ttsplit',{})])

{'_p_cc_train_p_cc_test': {'input_datasets': ['p_cc'],
  'transformations': [('ttsplit', {})],
  'output_datasets': ['p_cc_train', 'p_cc_test']}}

In [192]:
add_transformer(input_datasets=['cc', 'other'], output_datasets="p_all", transformations=[('merge', {})])

{'_p_all': {'input_datasets': ['cc', 'other'],
  'transformations': [('merge', {})],
  'output_datasets': ['p_all']}}

In [193]:
add_transformer(input_datasets='p_all', output_datasets=['p_all_train', 'p_all_test'], transformations=[('ttsplit',{})])

{'_p_all_train_p_all_test': {'input_datasets': ['p_all'],
  'transformations': [('ttsplit', {})],
  'output_datasets': ['p_all_train', 'p_all_test']}}

In [194]:
dag = get_transformer_dag()

In [195]:
for he_name, he in dag.items():
    for out in he['output_datasets']:
        print(out)

cc
cc-by
cc-by-nc
other
p_all
p_all_train
p_all_test
p_cc
p_cc_train
p_cc_test


In [196]:
from collections import Counter

In [211]:
class HDAG:
    def __init__(self, **kwargs):
        self._dag = get_transformer_dag(**kwargs)
        self.out_degrees = Counter()
        self.in_degrees = Counter()
        for n in self.nodes:
            self.in_degrees[n] = 0
            self.out_degrees[n] = 0
        for he_name, he in self._dag.items():
            for node in he.get('input_datasets', []):
                self.out_degrees[node] += 1
            if he.get('datasource_name', False):
                self.in_degrees[node] = 0
            else:
                for node in he['output_datasets']:
                    self.in_degrees[node] += 1

        
    @property
    def nodes(self):
        ret = set()
        for he_name, he in self._dag.items():
            for node in he['output_datasets']:
                ret.add(node)
        return ret
    @property
    def sources(self):
        ret = set()
        for he_name, he in self._dag.items():
            if he.get('datasource_name', []):
                for node in he['output_datasets']:
                    ret.add(node)
        return ret
    
    @property
    def sinks(self):
        n = self.nodes
        return n.difference(set(self.in_degrees.keys()))
        

In [212]:
dd = HDAG()

In [213]:
dd.nodes

{'cc',
 'cc-by',
 'cc-by-nc',
 'other',
 'p_all',
 'p_all_test',
 'p_all_train',
 'p_cc',
 'p_cc_test',
 'p_cc_train'}

In [214]:
dd.in_degrees

Counter({'other': 1,
         'p_cc': 1,
         'cc-by-nc': 1,
         'p_all_train': 0,
         'p_all_test': 0,
         'p_cc_test': 0,
         'cc-by': 1,
         'cc': 2,
         'p_cc_train': 0,
         'p_all': 1})

In [215]:
dd.out_degrees

Counter({'other': 0,
         'p_cc': 1,
         'cc-by-nc': 0,
         'p_all_train': 1,
         'p_all_test': 1,
         'p_cc_test': 1,
         'cc-by': 0,
         'cc': 0,
         'p_cc_train': 1,
         'p_all': 1})

In [216]:
dd.sinks

set()

In [217]:
dd.sources

{'cc-by', 'cc-by-nc', 'other'}

In [204]:
!cat $cp/transformer_dag.json

{
  "_cc": {
    "input_datasets": [
      "cc-by",
      "cc-by-nc"
    ],
    "output_datasets": [
      "cc"
    ],
    "transformations": [
      [
        "merge",
        {}
      ]
    ]
  },
  "_cc-by": {
    "datasource_name": "cc-by",
    "output_datasets": [
      "cc-by"
    ]
  },
  "_cc-by-nc": {
    "datasource_name": "cc-by-nc",
    "output_datasets": [
      "cc-by-nc"
    ]
  },
  "_other": {
    "datasource_name": "other",
    "output_datasets": [
      "other"
    ]
  },
  "_p_all": {
    "input_datasets": [
      "cc",
      "other"
    ],
    "output_datasets": [
      "p_all"
    ],
    "transformations": [
      [
        "merge",
        {}
      ]
    ]
  },
  "_p_all_train_p_all_test": {
    "input_datasets": [
      "p_all"
    ],
    "output_datasets": [
      "p_all_train",
      "p_all_test"
    ],
    "transformations": [
      [
        "ttsplit",
        {}
      ]
    ]
  },
  "_p_cc": {