In [1]:
#|hide
#|eval: false
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [2]:
#|hide
#|eval: false
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virutual display is needed for colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

In [3]:
#|default_exp data.block

In [4]:
#|export
# Python native modules
import os
from typing import Any,Callable,Generator
from inspect import isfunction,ismethod
# Third party libs
from fastcore.all import *
from torchdata.dataloader2.dataloader2 import DataLoader2
from torchdata.dataloader2.graph import find_dps,traverse,DataPipe
from fastai.torch_core import *
from fastai.data.transforms import *
import torchdata.datapipes as dp
from collections import deque
from fastai.imports import *
# Local modules
from fastrl.pipes.core import *
from fastrl.core import *
from fastrl.data.dataloader2 import *

# Data Block
> High level API to quickly get your data in a `DataLoader`s

## Transform Block
> Loosely similar to the fastai==2.* `TransformBlock`, only this time, just like the fastrl **Agent** and **Learner**, is simply a *DataPipe* construction
function with augmentation capabilities.

In [5]:
#|hide
# class TransformBlock():
#     "A basic wrapper that links defaults transforms for the data block API"
#     def __init__(self, 
#         # A function that initializes a datapipeline and returns a datapipe.
#         # Minimum must support:
#         #
#         #     `pipe_fn(source, bs, n)`
#         #
#         # Where:
#         #   - `source` is the data to be input into the datapipes
#         #   - `bs` is the batch size of the returned data
#         #   - `n` is the number of iterations to make through the datapipes per epoch                 
#         pipe_fn:Callable[[Any,int,int],_DataPipeMeta]=None, 
#         # One or more `Transform`s for converting types. These will be re-called if workers!=0 for the dataloader.
#         type_tfms:list=None, 
#         item_tfms:list=None, # `ItemTransform`s, applied per peice of data (not batch)
#         batch_tfms:list=None, # `Transform`s applied over a batch of data
#         # `Callback`s for use in dataloaders. These usually augment a preexisting pipeline in some way
#         cbs:list=None,
#         pipe_fn_kwargs:dict=None, # Additional arguments to be passed to `pipe_fn`
#         dl_type:DataLoader2=None, # Task specific `TfmdDL`, defaults to `TfmdDL`
#         dls_kwargs:dict=None, # Additional arguments to be passed to `DataLoaders`
#     ):
#         self.type_tfms                   = L(type_tfms)
#         self.item_tfms                   = L(item_tfms)
#         self.batch_tfms                  = L(batch_tfms)
#         self.pipe_fn,self.pipe_fn_kwargs = pipe_fn,ifnone(pipe_fn_kwargs,{})
#         self.cbs                         = L(cbs)
#         self.dl_type,self.dls_kwargs     = dl_type,ifnone(dls_kwargs,{})

In [6]:
#|exports
DataPipeOrDataLoader = Union[DataPipe,DataLoader2]
TransformBlock = Callable[[Union[Iterable,DataPipe]],DataPipeOrDataLoader]

`DataBlock` as defined below expects single or tuples of `TransformBlock` callables. These functions need to have the above signature. The simplest example would be:

In [7]:
def TestTransformBlock(
    a:int=1,
    b:str='test',
    dp_augmentation_fns:List[Callable]=None
) -> Callable[[Any],DataPipeOrDataLoader]:
    def _TestTransformBlock(
        source,
        num_workers:int=0,
        as_dataloader:bool=False
    ) -> DataPipeOrDataLoader:
        pipe = dp.iter.IterableWrapper(range(10))
        pipe = pipe.map(lambda o:o+a)
        pipe = pipe.map(lambda o:str(o))
        pipe = pipe.map(lambda o:o+b)

        for fn in ifnone(dp_augmentation_fns,[]):
            result = fn(pipe)
            if result is not None: pipe = result
        
        if as_dataloader:
            pipe = DataLoader2(
                datapipe=pipe,
                reading_service=PrototypeMultiProcessingReadingService(
                    num_workers = num_workers,
                    protocol_client_type = InputItemIterDataPipeQueueProtocolClient,
                    protocol_server_type = InputItemIterDataPipeQueueProtocolServer,
                    pipe_type = item_input_pipe_type,
                    eventloop = SpawnProcessForDataPipeline
                ) if num_workers>0 else None
            )
        return pipe 
    return _TestTransformBlock

In [8]:
#|export
class DataBlock(object):
    def __init__(
        self,
        # `TransformBlock`s where a single transform block is treated as a single dataloader / datapipe
        # and a Tuple[TransformBlock] is also treated as a single dataloader of combined pipes.
        blocks:List[TransformBlock]=None, 
    ):
        store_attr(but='blocks')
        self.blocks = L(blocks)

    def datapipes(
        self,
        source:Any,
        bs=1,
        n=None,
        return_blocks:bool=False
    ) -> Generator[Union[Tuple[_DataPipeMeta,TransformBlock],_DataPipeMeta],None,None]:
        for b in self.blocks:
            pipe = b.pipe_fn(source,bs=bs,n=n,**b.pipe_fn_kwargs)
            yield (pipe,b) if return_blocks else pipe
        
    def dataloaders(
        self,
        source:Any,
        bs=1,
        n=None,
        num_workers=0,
        **kwargs
    ) -> Generator[DataLoader2,None,None]:
        for pipe,block in self.datapipes(source,bs=bs,n=n,return_blocks=True,**kwargs):
            yield block.dl_type(pipe,num_workers=num_workers,**merge(kwargs,block.dls_kwargs))

add_docs(DataBlock,
"""`DataBlock` is a single object for constructing datapipes and dataloaders from `TransformBlock`s.""",
datapipes="""Combines `self.blocks` with `source` where `bs` can be defined. `n=None` means that the datapipes are
infinite / lengthless. If `n` is an integer then the datapipes will have an expected max len.
""",
dataloaders="Returns a dataloader for each respoctive combination of blocks." 
)

NameError: name '_DataPipeMeta' is not defined

In [None]:
#|hide
#|eval: false
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev import nbdev_export
    nbdev_export()