In [1]:
#|hide
#|eval: false
# Shell one-liner, short-circuited with `&&`: it only proceeds when the path `/content` exists
# (presumably a Colab runtime -- confirm). It then quietly installs/upgrades fastrl with its `dev`
# extra plus pyvirtualdisplay, and finally installs xvfb and python-opengl with all apt output discarded.
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: If you see version errors, it is safe to ignore them. Colab is behind on some of the package versions.

In [2]:
#|hide
# `os` is imported explicitly so this cell does not depend on the nbdev star-imports
# below to provide it on a fresh kernel.
import os
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # Sanity-check the environment flags unless the IN_TEST environment variable is set.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    # Deliberately NOT named `display`: that would shadow IPython's `display()` helper
    # for every later cell in the notebook.
    virtual_display = Display(visible=0, size=(400, 300))
    virtual_display.start()

In [3]:
#|default_exp fastai.data.block

In [4]:
#|export
# Python native modules
import os
from typing import Any,Callable,Generator
from inspect import isfunction,ismethod
# Third party libs
from fastcore.all import *
from torch.utils.data.dataloader_experimental import DataLoader2
from torch.utils.data.datapipes._typing import _DataPipeMeta, _IterDataPipeMeta
from fastai.torch_core import *
from fastai.data.transforms import *
import torchdata.datapipes as dp
from collections import deque
from fastai.imports import *
# Local modules
from fastrl.pipes.core import *
from fastrl.core import *

# Data Block
> High-level API to quickly get your data into `DataLoader`s

## Transform Block

> Note: We will first validate the lower-level API on a DQN before making the data block. This is going to be a naive implementation.

In [5]:
#|export
class TransformBlock():
    "Plain container bundling a datapipe factory with its transforms, callbacks and dataloader settings"
    def __init__(self, 
        # Factory that builds a datapipeline and returns the resulting datapipe.
        # At minimum it must be callable as:
        #
        #     `pipe_fn(source, bs, n)`
        #
        # with:
        #   - `source`: the data fed into the datapipes
        #   - `bs`: the batch size of the returned data
        #   - `n`: the number of iterations to make through the datapipes per epoch
        pipe_fn:Callable[[Any,int,int],_DataPipeMeta]=None, 
        # One or more `Transform`s for converting types. These will be re-called if workers!=0 for the dataloader.
        type_tfms:list=None, 
        item_tfms:list=None, # `ItemTransform`s, applied to each piece of data (not to a batch)
        batch_tfms:list=None, # `Transform`s applied over a whole batch of data
        # `Callback`s for use in dataloaders. These usually augment a preexisting pipeline in some way
        cbs:list=None,
        pipe_fn_kwargs:dict=None, # Extra keyword arguments for `pipe_fn`; stored as `{}` when omitted
        dl_type:DataLoader2=None, # Dataloader class for this block; stored as given (`None` when omitted)
        dls_kwargs:dict=None, # Extra keyword arguments for the dataloader; stored as `{}` when omitted
    ):
        # Every transform collection is normalised to a fastcore `L`.
        self.type_tfms,self.item_tfms,self.batch_tfms = L(type_tfms),L(item_tfms),L(batch_tfms)
        # The datapipe factory is kept as passed; its kwargs fall back to a fresh dict.
        self.pipe_fn = pipe_fn
        self.pipe_fn_kwargs = {} if pipe_fn_kwargs is None else pipe_fn_kwargs
        self.cbs = L(cbs)
        # Same treatment for the dataloader class and its kwargs.
        self.dl_type = dl_type
        self.dls_kwargs = {} if dls_kwargs is None else dls_kwargs

I have some thoughts on `TransformBlock`. I'm wondering if it would be so bad to have each `TransformBlock` serve as a guaranteed input and a guaranteed output, as opposed to https://github.com/fastai/fastai/blob/5b6786a3cf4f98b86dcfed8b30738455ede2c640/fastai/data/block.py#L102 where `n_inp` could change based on `get_x,get_y`.  

I think the main issue for me is that there isn't always a `y` in what I'm doing, so the usage of `DataBlock` can feel awkward. I also find the nesting of functions difficult to debug, which is an issue if the idea is that new users will try to plug in their own stuff.

I also felt it got more confusing when the blocks are merged together.

I think it would be better if a `TransformBlock` were 1:1 with a dataloader. So if you have 2 transform blocks,
you have 2 dataloaders that do inputs and outputs. In RL I imagine having 2 separate environments running at the same time and 
collecting information from both of them. The required transforms might be different, so having separate `TransformBlock`s would
simplify things.

In [6]:
#|export
class DataBlock(object):
    "Builds one datapipe, and from it one dataloader, for every `TransformBlock` in `blocks`"
    def __init__(
        self,
        # Each transform block will have its own dataloader. 
        blocks:List[TransformBlock]=None, 
    ):
        # `blocks` is the only parameter and is excluded here, so the
        # actual storage happens on the next line, normalised to an `L`.
        store_attr(but='blocks')
        self.blocks = L(blocks)

    def datapipes(
        self,
        source:Any, # Passed as the first positional argument to every block's `pipe_fn`
        bs=1, # Forwarded to `pipe_fn` as the keyword `bs`
        n=1, # Forwarded to `pipe_fn` as the keyword `n`
        return_blocks:bool=False # If True, yield `(pipe, block)` tuples instead of bare pipes
    ) -> Generator[Union[Tuple[_DataPipeMeta,TransformBlock],_DataPipeMeta],None,None]:
        "Lazily yield one datapipe per block, built by calling the block's `pipe_fn`"
        for b in self.blocks:
            if b.pipe_fn is None:
                # Same exception type as calling `None`, but with a message that names the cause.
                raise TypeError(f'{type(b).__name__} has no `pipe_fn`; a datapipe cannot be created for it.')
            pipe = b.pipe_fn(source,bs=bs,n=n,**b.pipe_fn_kwargs)
            yield (pipe,b) if return_blocks else pipe
        
    def dataloaders(
        self,
        source:Any, # Forwarded to `datapipes`
        bs=1, # Forwarded to `datapipes`
        n=1, # Forwarded to `datapipes`
        # NOTE(review): accepted for interface compatibility but not forwarded anywhere;
        # pass loader-specific worker options through `kwargs` or the block's `dls_kwargs`.
        n_workers=0,
        **kwargs # Keyword arguments for every dataloader; a block's `dls_kwargs` are merged over them
    ) -> Generator[DataLoader2,None,None]:
        "Lazily yield one dataloader per block, wrapping the datapipe created for that block"
        # `kwargs` are dataloader options. `datapipes` takes no extra keywords, so they
        # must not be forwarded to it (doing so raised a TypeError for any non-empty kwargs).
        for pipe,block in self.datapipes(source,bs=bs,n=n,return_blocks=True):
            # Blocks created without an explicit `dl_type` fall back to `DataLoader2`.
            dl_type = ifnone(block.dl_type,DataLoader2)
            yield dl_type(pipe,**merge(kwargs,block.dls_kwargs))

In [7]:
#|hide
from fastcore.imports import in_colab

# Colab still pins tornado<6, so nbdev is only imported (and the export only run)
# when this notebook is executing somewhere other than Colab.
running_in_colab = in_colab()
if not running_in_colab:
    from nbdev import nbdev_export as export_notebooks
    export_notebooks()