In [39]:
#|hide
#|eval: false
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [40]:
#|hide
#|eval: false
# `os` is imported explicitly so this cell does not depend on a star import to provide it.
import os
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # Outside of test runs (IN_TEST unset) make sure we really are in a local notebook.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    # Named `virtual_display` so that IPython's built-in `display` function is not shadowed.
    virtual_display = Display(visible=0, size=(400, 300))
    virtual_display.start()

In [41]:
#|default_exp data.block

In [42]:
#|export
# Python native modules
import os
import inspect
from typing import Any,Callable,Generator
from inspect import isfunction,ismethod
import pickle
# Third party libs
from fastcore.all import *
from torchdata.dataloader2.dataloader2 import DataLoader2
from torchdata.dataloader2.graph import find_dps,traverse,DataPipe,IterDataPipe,MapDataPipe
from fastai.torch_core import *
from fastai.data.transforms import *
import torchdata.datapipes as dp
from collections import deque
from fastai.imports import *
# Local modules
from fastrl.pipes.core import *
from fastrl.core import *
from fastrl.data.dataloader2 import *

# Data Block
> High level API to quickly get your data into `DataLoader`s

## Transform Block
> Loosely similar to the fastai==2.* `TransformBlock`, only this time, just like the fastrl **Agent** and **Learner**, it is simply a *DataPipe* construction
function with augmentation capabilities.

In [43]:
#|exports
# What a `TransformBlock` may return: either a `DataPipe` or a `DataLoader2`.
DataPipeOrDataLoader = Union[DataPipe,DataLoader2]
# A `TransformBlock` is a callable that receives an iterable or a `DataPipe` and returns a `DataPipeOrDataLoader`.
TransformBlock = Callable[[Union[Iterable,DataPipe]],DataPipeOrDataLoader]

`DataBlock` as defined below expects single or tuples of `TransformBlock` callables. These functions 
need to have the above signatures. 

Note that a `TransformBlock` **must** take params `source` and `as_dataloader` at minimum. 
Additional params are up to the developer / user.


The simplest example would be:

In [44]:
def TestTransformBlock(
    # Value added to every element coming out of `source`
    a:int=1,
    # Suffix appended once the elements have been converted to `str`
    b:str='_test',
    # Additional pipelines to insert, replace, remove
    dp_augmentation_fns:Tuple[DataPipeAugmentationFn]=None
) -> TransformBlock:
    "Builds a demo `TransformBlock`; the returned function produces either a `DataPipe` or a `DataLoader2`."
    def _TestTransformBlock(
        # Iterable that is fed into the pipeline when the block is executed.
        source:Any,
        # Worker count for the reading service; only read when `as_dataloader=True`.
        num_workers:int=0,
        # This param must exist: `DataBlock` passes it to request a dataloader.
        as_dataloader:bool=False
    ) -> DataPipeOrDataLoader:
        "Executed by `DataBlock`: wraps `source` in the template pipeline."
        # Template pipeline: wrap `source` -> add `a` -> convert to str -> append `b`.
        template = (
            dp.iter.IterableWrapper(source)
            .map(lambda item:item+a)
            .map(lambda item:str(item))
            .map(lambda text:text+b)
        )
        # Let the caller insert / replace / remove pipes on top of the template.
        augmented = apply_dp_augmentation_fns(template,ifnone(dp_augmentation_fns,()))
        if not as_dataloader: return augmented
        # A multiprocessing reading service is only built when workers were requested.
        reading_service = None
        if num_workers>0:
            reading_service = PrototypeMultiProcessingReadingService(
                num_workers = num_workers,
                protocol_client_type = InputItemIterDataPipeQueueProtocolClient,
                protocol_server_type = InputItemIterDataPipeQueueProtocolServer,
                pipe_type = item_input_pipe_type,
                eventloop = SpawnProcessForDataPipeline
            )
        return DataLoader2(datapipe=augmented,reading_service=reading_service)
    return _TestTransformBlock

Check that we can return a `DataPipe` and that an iteration through it is what we 
expect...

In [45]:
# Default block: adds 1 to each element, then appends '_test'.
tfm_block = TestTransformBlock()
# `as_dataloader` is left at its default (False), so a plain `DataPipe` comes back.
pipe = tfm_block([1,2,3])
test_eq(type(pipe),dp.iter.Mapper)
test_eq(list(pipe),['2_test', '3_test', '4_test'])

Check that we can return a `DataLoader2` and that an iteration through it is what we expect...

In [46]:
tfm_block = TestTransformBlock()
# `num_workers` stays at 0, so the `DataLoader2` is created with `reading_service=None`.
pipe = tfm_block([1,2,3],as_dataloader=True)
test_eq(type(pipe),DataLoader2)
test_eq(list(pipe),['2_test', '3_test', '4_test'])

In [47]:
#|export
class InvalidTransformBlock(Exception):
    "Raised by `validate_transform_block` when a block's signature does not meet the `TransformBlock` requirements."

def validate_transform_block(block:TransformBlock):
    "Checks that `block` takes `source` and `as_dataloader` and is annotated to return `DataPipeOrDataLoader`; raises `InvalidTransformBlock` otherwise."
    signature = inspect.signature(block)
    params = dict(signature.parameters)
    # The header always describes what was inspected; it is only shown when a check fails.
    report = [
        f"Checked {block}:",
        f'Given kwargs: {params}',
        f'Given return: {signature.return_annotation}'
    ]
    problems = []
    if 'source' not in params:
        problems.append('`source:Any` is missing from the arguments')
    if 'as_dataloader' not in params:
        problems.append('`as_dataloader:bool=False` is missing from the arguments')
    if signature.return_annotation != DataPipeOrDataLoader:
        problems.append('`DataPipeOrDataLoader` missing from return signature')
    # All problems are reported together in a single exception.
    if problems: raise InvalidTransformBlock('\n'.join(report+problems))

Check that `TestTransformBlock` is in fact valid...

In [48]:
# No exception here means the block passed every check; a failure raises `InvalidTransformBlock`.
validate_transform_block(tfm_block)

And check that invalid `TransformBlock`s get caught...

In [49]:
def invalid_transform_block():
    "Returns an inner function with no arguments and no return annotation, i.e. everything `validate_transform_block` checks for is missing."
    def _invalid_transform_block():pass
    return _invalid_transform_block

invalid_tfm_block = invalid_transform_block()
# The `ExceptionExpected` context manager expects `InvalidTransformBlock` to be raised in its body.
with ExceptionExpected(InvalidTransformBlock):
    try: validate_transform_block(invalid_tfm_block)
    except InvalidTransformBlock as e:
        # Show the validation report, then re-raise so the surrounding context manager still sees the exception.
        print(str(e))
        raise

Checked <function invalid_transform_block.<locals>._invalid_transform_block at 0x7f7afb523c20>:
Given kwargs: {}
Given return: <class 'inspect._empty'>
`source:Any` is missing from the arguments
`as_dataloader:bool=False` is missing from the arguments
`DataPipeOrDataLoader` missing from return signature


In [50]:
#|export
def DataPipeWrapperTransformBlock(
    dp_cls:DataPipe, # The `DataPipe` to wrap into a `TransformBlock`,
    **dp_kwargs
) -> TransformBlock:
    "Turns a `DataPipe` class into a `TransformBlock`; `dp_kwargs` are forwarded to `dp_cls` when the block runs."
    def _DataPipeWrapperTransformBlock(
        # Passed as the first positional argument to `dp_cls`.
        source:Any,
        # Worker count for the reading service; only read when `as_dataloader=True`.
        num_workers:int=0,
        # If True, returns a `DataLoader2` instead of `DataPipe`
        as_dataloader:bool=False
    ) -> DataPipeOrDataLoader:
        "Instantiates `dp_cls` on `source` and optionally wraps the result in a `DataLoader2`."
        wrapped = dp_cls(source,**dp_kwargs)
        if not as_dataloader: return wrapped
        # A multiprocessing reading service is only built when workers were requested.
        reading_service = None
        if num_workers>0:
            reading_service = PrototypeMultiProcessingReadingService(
                num_workers = num_workers,
                protocol_client_type = InputItemIterDataPipeQueueProtocolClient,
                protocol_server_type = InputItemIterDataPipeQueueProtocolServer,
                pipe_type = item_input_pipe_type,
                eventloop = SpawnProcessForDataPipeline
            )
        return DataLoader2(datapipe=wrapped,reading_service=reading_service)
    return _DataPipeWrapperTransformBlock

Check that we can return a `DataPipe` and that an iteration through it is what we 
expect...

In [51]:
# Exercise `DataPipeWrapperTransformBlock` itself (this cell previously re-tested `TestTransformBlock`).
# `Cycler` is wrapped so that the source is repeated `count=2` times.
tfm_block = DataPipeWrapperTransformBlock(dp.iter.Cycler,count=2)
validate_transform_block(tfm_block)
pipe = tfm_block(dp.iter.IterableWrapper([1,2,3]))
test_eq(type(pipe),dp.iter.Cycler)
test_eq(list(pipe),[1,2,3,1,2,3])
# The same block must also be able to hand back a `DataLoader2`.
pipe = tfm_block(dp.iter.IterableWrapper([1,2,3]),as_dataloader=True)
test_eq(type(pipe),DataLoader2)
test_eq(list(pipe),[1,2,3,1,2,3])

Check that we can return a `DataLoader2` and that an iteration through it is what we expect...

In [52]:
#|export
_DataBlock_msg = """Interpreting `blocks` input as %s, resulting in %s dataloaders"""

class DataBlock(object):
    def __init__(
        self,
        # One entry per desired `DataPipe` / `DataLoader2`. Each entry is either a single
        # `TransformBlock` or a tuple of `TransformBlock`s that are chained together.
        *blocks:Union[Tuple[TransformBlock,...],TransformBlock],
        # Debug mode for verbose output
        debug:bool=False
    ):
        self.blocks = blocks 

        if debug:
            # Grouped entries are chained into one pipe, so every entry yields exactly one dataloader.
            block_types = ['datapipe_group' if isinstance(b,(tuple,list)) else 'datapipe' for b in blocks]
            print(_DataBlock_msg%(block_types, len(blocks)))

    def blocks2dp_or_dl(self,
            # Passed into the `blocks`, likely as an iterable.
            source:Any,
            # Single `TransformBlock` or tuples of `TransformBlock`s that are
            # executed and chained together into a single `DataPipe`.
            blocks:Union[TransformBlock,Tuple[TransformBlock]],
            # If True, a `DataLoader2` instance is returned instead of a `DataPipe`  
            as_dataloader:bool=False,
            # Number of workers to use for the dataloader.
            # Requires `as_dataloader=True`
            num_workers:int=0
        ) -> DataPipeOrDataLoader:
        # Normalise to a sequence so single blocks and groups share one code path.
        if not isinstance(blocks,(tuple,list)): blocks = (blocks,)
        if len(blocks)==0:
            raise ValueError('`blocks` must contain at least one `TransformBlock`.')
        # Validate everything up front so no pipe is built from a partially valid chain.
        for b in blocks: validate_transform_block(b)
        *leading,final = blocks
        pipe = source
        # Every block except the last only receives the output of the previous one.
        for b in leading: pipe = b(pipe)
        # Only the last block in the chain decides between `DataPipe` and `DataLoader2`.
        return final(pipe,as_dataloader=as_dataloader,num_workers=num_workers)


    def datapipes(self,source:Any):
        return tuple(self.blocks2dp_or_dl(source,b) for b in self.blocks)

    def dataloaders(self,source:Any,num_workers=0):
        return tuple(
            self.blocks2dp_or_dl(source,b,as_dataloader=True,num_workers=num_workers) 
            for b in self.blocks
        )


add_docs(DataBlock,
"""`DataBlock` is a single object for constructing datapipes and dataloaders from `blocks`.
Below are examples on how `blocks` eventually get converted to dataloaders.

Example 1: Simplest
blocks = (
    TestTransformBlock,
    TestTransformBlock
) -> (
    DataLoader2(TestTransformBlock(as_dataloader=True)),
    DataLoader2(TestTransformBlock(as_dataloader=True))
)

Example 2: Nested Blocks
blocks = (
    (TestTransformBlock,TestTransformBlock2),
    TestTransformBlock
) -> (
    DataLoader2(TestTransformBlock -> TestTransformBlock2(as_dataloader=True)),
    DataLoader2(TestTransformBlock)
)

In example 2, we can nest the blocks, thus chaining them together. The last
one in the chain is used to create the dataloader that is required.
""",
# wrap_dps="Wrap any `DataPipe`s in `DataPipeWrapperTransformBlock` in `self.blocks`",
blocks2dp_or_dl="""Passes `source` into single `TransformBlock`s or passes `source`
to chained `TransformBlock` outputs. In either case, it results in a single `DataPipe`. 

If `as_dataloader` is True, then a `DataLoader2` instance is returned instead.
""",
datapipes="""Returns a tuple with one `DataPipe` per entry in `self.blocks`, each built by 
passing `source` through that entry's `TransformBlock`(s).
""",
dataloaders="Returns a dataloader for each respective combination of blocks." 
)

In the example below we want 2 dataloaders, so `len(blocks)` will be 2. However,
for the second dataloader we want to change the output, and also cycle twice. We can easily do this
by using a tuple instead of a single `TestTransformBlock`.

In [53]:
# First entry: a single block. Second entry: a tuple, so its two blocks are chained into one pipe.
block = DataBlock(
    TestTransformBlock(),
    (TestTransformBlock(b='_test2'),DataPipeWrapperTransformBlock(dp.iter.Cycler,count=2)),
    debug=True
)

Interpreting `blocks` input as ['datapipe', 'datapipe_group'], resulting in 2 dataloaders


The resulting datapipes are in the format that we expect...

In [54]:
pipes = block.datapipes([1,2,3])
# NOTE(review): the result of `traverse` is discarded; presumably this only checks that the pipe can be traversed — confirm.
traverse(pipes[0])
test_eq(type(pipes[0]),dp.iter.Mapper)
test_eq(list(pipes[0]),['2_test', '3_test', '4_test'])
# Second pipe has _test2 as a postfix and cycles the dataset twice
test_eq(type(pipes[1]),dp.iter.Cycler)
test_eq(list(pipes[1]),['2_test2', '3_test2', '4_test2', '2_test2', '3_test2', '4_test2'])

We can easily do the same for the dataloaders...

In [57]:
DataLoader2??

[0;31mInit signature:[0m [0mDataLoader2[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Abstract base class for generic types.

A generic type is typically declared by inheriting from
this class parameterized with one or more type variables.
For example, a generic mapping type might be defined as::

  class Mapping(Generic[KT, VT]):
      def __getitem__(self, key: KT) -> VT:
          ...
      # Etc.

This class can then be used as follows::

  def lookup_name(mapping: Mapping[KT, VT], key: KT, default: VT) -> VT:
      try:
          return mapping[key]
      except KeyError:
          return default
[0;31mSource:[0m        
[0;32mclass[0m [0mDataLoader2[0m[0;34m([0m[0mGeneric[0m[0;34m[[0m[0mT_co[0m[0;34m][0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0;34m[0m
[0;34m[0m        [0mself[0m[0;34m,[0m[0;34m[0m
[0;

In [55]:
pipes = block.dataloaders([1,2,3])
test_eq(type(pipes[0]),DataLoader2)
test_eq(list(pipes[0]),['2_test', '3_test', '4_test'])
# Second pipe has _test2 as a postfix and cycles the dataset twice
test_eq(type(pipes[1]),DataLoader2)
test_eq(list(pipes[1]),['2_test2', '3_test2', '4_test2', '2_test2', '3_test2', '4_test2'])
# torchdata dataloaders are not traverseable once started, so `traverse` is expected to raise here.
with ExceptionExpected(TypeError):
    traverse(dp.iter.IterableWrapper(pipes))

# TODO: Kind of what I was afraid of for the transform blocks. In reality,
# I think they should have their inner functions already returned before any
# pickling happens, so this technically shouldn't be happening.
# There are other issues with the dataloader itself though that can only be fixed 
# in torch data.
# Try to pickle each attribute of the dataloader separately to see which ones fail.
for k,v in vars(pipes[0]).items():
    try:
        print(k)
        pickle.dumps(v)
    except Exception as e:
        print('Got pickle error: ',str(e),' for key ',k)

datapipe
Got pickle error:  Can't pickle local object 'TestTransformBlock.<locals>._TestTransformBlock.<locals>.<lambda>'  for key  datapipe
_adapted
_datapipe_iter
Got pickle error:  can't pickle generator objects  for key  _datapipe_iter
_reset_iter
datapipe_adapter_fns
reading_service
reading_service_state
_terminated
valid_iterator_id
_datapipe_before_reading_service_adapt
Got pickle error:  Can't pickle local object 'TestTransformBlock.<locals>._TestTransformBlock.<locals>.<lambda>'  for key  _datapipe_before_reading_service_adapt


In [56]:
#|hide
#|eval: false
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev import nbdev_export
    # Run nbdev's export step; it is skipped on colab, where nbdev is never imported.
    nbdev_export()