In [1]:
#hide
#skip
# Completer settings for the interactive session.
%config Completer.use_jedi = False
%config IPCompleter.greedy=True
# upgrade fastrl on colab
# The install only runs when /content exists; apt-get output is discarded.
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [2]:
# hide
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # NOTE(review): `os` is not imported in this cell; presumably one of the star imports above provides it - confirm.
    # When IN_TEST is unset or empty, check that this is a local IPython notebook session and not Colab.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

In [3]:
# default_exp fastai.learner

In [4]:
# export
# Python native modules
import os
from typing import *
# Third party libs
from fastcore.all import *
from torch.utils.data.dataloader_experimental import DataLoader2
import torchdata.datapipes as dp
from torch.nn import *
from torch.optim import *
from fastai.torch_basics import *
from fastai.torch_core import *
from torch.utils.data.graph import traverse

# Local modules
from fastrl.fastai.data.block import *
from fastrl.fastai.data.pipes.core import *

# Learner
> A revised fastai learner that uses DataPipe shimming

In [5]:
# export
class XYSplit(dp.iter.IterDataPipe):
    "Turn each dict batch into an `(x, y)` pair: `x` is the `x_fld` entry, `y` is the whole dict."
    def __init__(self,source_datapipe,learn,x_fld):
        self.source_datapipe,self.learn,self.x_fld = source_datapipe,learn,x_fld

    def __iter__(self):
        for batch in self.source_datapipe:
            for value in batch.values():
                # Drop a leading axis of size 1 in place, unless the value is exactly 2-d.
                leading_is_one = value.shape[0]==1
                if leading_is_one and len(value.shape)!=2:
                    value.squeeze_(0)
            yield (batch[self.x_fld],batch)

In [6]:
# export
class ModelPredict(dp.iter.IterDataPipe):
    "Run `learn.model` on the `x` half of each `(x, y)` pair, storing `xb`, `yb` and `preds` on `learn`."
    def __init__(self,source_datapipe,learn):
        self.source_datapipe,self.learn = source_datapipe,learn

    def __iter__(self):
        for inputs,targets in self.source_datapipe:
            learner = self.learn
            # Later pipes read these attributes from the learner rather than from the yielded item.
            learner.xb,learner.yb = inputs,targets
            learner.preds = learner.model(inputs)
            yield learner.preds

In [7]:
# export
class LossCalc(dp.iter.IterDataPipe):
    """Compute the loss for the learner's current predictions and targets.

    The items coming from `source_datapipe` only drive the iteration; the inputs to
    `learn.loss_func` are read from `learn.preds` and `learn.yb`. When `y_target` is
    given, `learn.yb[y_target]` is used as the targets instead of the whole `learn.yb`.
    Each step stores the raw loss in `learn.loss_grad`, a clone of it in `learn.loss`,
    and yields `learn.loss`.
    """
    def __init__(self,source_datapipe,learn,y_target=None):
        self.source_datapipe = source_datapipe
        self.learn = learn
        self.y_target = y_target

    def __iter__(self):
        for batch in self.source_datapipe:
            targets = self.learn.yb if self.y_target is None else self.learn.yb[self.y_target]
            # Always store the result on the learner: the clone below and any later
            # backward step read `learn.loss_grad`, not an attribute of this pipe.
            self.learn.loss_grad = self.learn.loss_func(self.learn.preds, *targets)
            self.learn.loss = self.learn.loss_grad.clone()
            yield self.learn.loss

In [8]:
# export
def default_train_loop(
    dls:List[DataLoader2],
    cbs:Optional[List[Callback]]=None,
):
    """Wrap every loader in `dls` in an `IterableWrapper` and attach `cbs` to the result.

    NOTE(review): `add_cbs` is called on the `L` returned by `map`; confirm that the
    project provides `add_cbs` for that object.
    """
    train_valid = L(dls).map(dp.iter.IterableWrapper).add_cbs(cbs)
    # Return the pipes that were just built (the old name `train_vals` was never defined).
    return train_valid

In [9]:
# export
def only_train_loop(
    dls:List[DataLoader2],
    cbs:Optional[List[Callback]]=None,
):
    "Build a single training pipe: wrap `dls` in one `IterableWrapper` (no deep copy) and attach `cbs`."
    wrapped = dp.iter.IterableWrapper(dls,deepcopy=False)
    return wrapped.add_cbs(cbs)

In [10]:
# export
class Learner(dp.iter.IterDataPipe):
    """Bundle a model, data loaders, optimizer and loss function, and train through callback-built pipes.

    Every callback added gets a `learn` back-reference, has `init_pipes()` called on it,
    is exposed as the attribute `cb.name`, and is appended to `self.cbs`.
    """
    def __init__(self,model,dls,opt,loss_func,cbs,train_loop=None):
        store_attr('model,dls,opt,loss_func')
        self.cbs = L()
        self.add_cbs(cbs)
        # Fall back to `default_train_loop` when no loop factory is supplied.
        self.train_loop = ifnone(train_loop,default_train_loop)

    def fit(self,epochs):
        """Iterate the training pipe built from the first loader in `dls`, printing each result.

        NOTE(review): `epochs` and `self.train_loop` are not used here; the pipe is
        always built with `only_train_loop`.
        """
        self.it = iter(self.dls[0])
        train_pipe = only_train_loop(L(self.it),self.cbs) # Do not pass tuple, otherwise traverse will try to read the dl datapipes
        for res in train_pipe:
            print(res)

    def add_cbs(self, cbs):
        "Add every callback in `cbs`; returns `self` for chaining."
        L(cbs).map(self.add_cb)
        return self

    def remove_cbs(self, cbs):
        "Remove every callback in `cbs`; returns `self` for chaining."
        L(cbs).map(self.remove_cb)
        return self

    def add_cb(self, cb):
        "Attach one callback (a class is instantiated with no arguments first); returns `self`."
        if isinstance(cb, type): cb = cb()
        cb.learn = self
        cb.init_pipes()
        setattr(self, cb.name, cb)
        self.cbs.append(cb)
        return self

    def remove_cb(self, cb):
        """Detach one callback, undoing `add_cb`; returns `self`.

        `cb` may be a callback instance, or a callback class, in which case every
        attached instance of that class is removed.
        """
        if isinstance(cb, type):
            # Snapshot the matches first so `self.cbs` is not mutated while it is being scanned.
            self.remove_cbs([o for o in self.cbs if isinstance(o, cb)])
        else:
            cb.learn = None
            if hasattr(self, cb.name): delattr(self, cb.name)
            if cb in self.cbs: self.cbs.remove(cb)
        return self

In [11]:
# export
class DQN(Module):
    """Small fully connected Q-network: `state_sz` inputs -> `hidden` units (ReLU) -> `action_sz` outputs.

    `forward` returns one value per action for each input row.
    """
    def __init__(self,state_sz:int,action_sz:int,hidden:int=512):
        # Run the base initialiser explicitly so sub-module registration works whether
        # `Module` is plain `torch.nn.Module` or a wrapper that already initialises itself.
        super().__init__()
        self.layers=Sequential(
            Linear(state_sz,hidden),
            ReLU(),
            Linear(hidden,action_sz),
        )
    def forward(self,x): return self.layers(x)

In [12]:
# export
class QCalc(dp.iter.IterDataPipe):
    """Build the Q-value regression target for the current step and store it as `learn.yb`.

    For every upstream item this reads the step dict held in `learn.yb` (keys 'done',
    'next_state', 'reward', 'state', 'action'), writes `done_mask`, `next_q`, `targets`
    and `pred` onto `learn`, and finally replaces `learn.yb` with a 1-tuple holding the
    target tensor. The upstream item itself is yielded unchanged.
    """
    def __init__(self,source_datapipe,learn,discount,nsteps):
        self.source_datapipe,self.learn = source_datapipe,learn
        self.discount,self.nsteps = discount,nsteps

    def __iter__(self):
        for batch in self.source_datapipe:
            learn,step = self.learn,self.learn.yb

            done_mask = step['done'].reshape(-1,)
            learn.done_mask = done_mask

            # Highest next-state value per row, kept as a column.
            # NOTE(review): computed with autograd enabled - no detach/no_grad is applied here.
            next_q = learn.model(step['next_state']).max(dim=1).values.reshape(-1,1)
            # Rows selected by the mask get no bootstrap value (assumes 'done' is a boolean tensor - TODO confirm).
            next_q[done_mask] = 0 #xb[done_mask]['reward']
            learn.next_q = next_q

            gamma = self.discount**self.nsteps
            learn.targets = step['reward']+next_q*gamma
            learn.pred = learn.model(step['state'])

            # Start from the current predictions and overwrite only the taken action's column.
            target_q = learn.pred.clone()
            target_q.scatter_(1,step['action'],learn.targets)
            # finalize the xb and yb
            learn.yb = (target_q,)
            yield batch
            
class ModelLearnCalc(dp.iter.IterDataPipe):
    "Backpropagate `learn.loss_grad`, step and zero `learn.opt`, then yield `learn.loss`, once per upstream item."
    def __init__(self,source_datapipe,learn):
        self.source_datapipe,self.learn = source_datapipe,learn

    def __iter__(self):
        # The upstream item only paces the loop; everything needed lives on the learner.
        for _ in self.source_datapipe:
            learner = self.learn
            learner.loss_grad.backward()
            optimizer = learner.opt
            optimizer.step()
            optimizer.zero_grad()
            yield learner.loss

In [13]:
class Dict2TensorBatch(dp.iter.IterDataPipe):
    "Convert every value of each incoming dict batch with `tensor`; non-dict batches raise `TypeError`."
    def __init__(self,source_datapipe,learn):
        self.source_datapipe,self.learn = source_datapipe,learn

    def __iter__(self):
        for batch in self.source_datapipe:
            if isinstance(batch,dict):
                yield dict((key,tensor(val)) for key,val in batch.items())
            else:
                raise TypeError(f'Batch should be a dict, not {type(batch)}')

In [14]:
class ExperienceReplay(dp.iter.IterDataPipe):
    """Store incoming experience dicts in a bounded buffer and yield a random sample after each one.

    `memory` maps each key to a numpy array whose first axis indexes stored rows.
    At most `maxsz` rows are kept per key (oldest rows are dropped first; `maxsz` is
    expected to be a positive int). `_sz_tracker` always equals the number of rows
    that can be sampled.

    Set the class attribute `debug` to True to print every item as it is added.
    """
    debug=False
    def __init__(self,source_datapipe,learn,bs=1,maxsz=100):
        self.memory = {}
        self.source_datapipe = source_datapipe
        self.learn = learn
        # Expose the buffer on the learner (skipped when the pipe is used stand-alone).
        if learn is not None:
            self.learn.experience_replay = self
        self.bs = bs
        self.maxsz = maxsz
        self._sz_tracker = 0

    def sample(self,bs=None):
        """Return `bs` (default `self.bs`) distinct random rows as a dict of arrays.

        Sampling is without replacement, so `np.random.choice` raises `ValueError`
        when more rows are requested than are stored.
        """
        idxs = np.random.choice(range(self._sz_tracker),size=(ifnone(bs,self.bs),),replace=False)
        return {k:v[idxs] for k,v in self.memory.items()}

    @staticmethod
    def _squeeze_leading(d):
        "Drop, in place, a leading axis of size 1 from every tensor value that is not exactly 2-d."
        for v in d.values():
            if not isinstance(v,Tensor): continue
            if v.shape[0]==1 and len(v.shape)!=2: v.squeeze_(0)

    def __iter__(self):
        for b in self.source_datapipe:

            if self.debug: print('Experience Replay Adding: ',b)
            # Check the container type first so a list/tuple of dicts is handled element by element.
            if isinstance(b,dict):
                self._squeeze_leading(b)
                self.add(b)
            elif isinstance(b,(list,tuple)):
                for element in b:
                    self._squeeze_leading(element)
                    self.add(element)
            else:
                raise TypeError(f'Batch should be a dict or a list/tuple of dicts, not {type(b)}')
            yield self.sample()

    def add(self,d:Dict):
        "Append the rows in `d` to the buffer, then trim every key to the newest `maxsz` rows."
        d = {k:to_np(v) if isinstance(v,Tensor) else np.array(v) for k,v in d.items()}
        if len(self.memory)==0:
            # Stack even the first item so its layout matches what later `vstack` calls produce.
            self.memory = {k:np.vstack((v,)) for k,v in d.items()}
        else:
            self.memory = {k:np.vstack((v,d[k])) for k,v in self.memory.items()}
        # Keep only the newest rows so the buffer cannot grow past `maxsz`.
        self.memory = {k:v[-self.maxsz:] for k,v in self.memory.items()}
        # Track real row counts (not the number of `add` calls) so `sample` never
        # draws an index beyond the stored data.
        self._sz_tracker = min((len(v) for v in self.memory.values()),default=0)

In [15]:
class ReinforcementLearningSimpleCallback(Callback):
    """Callback that declares the pipe chain for a simple Q-learning loop.

    `init_pipes` fills `self.pipes` with one factory per stage, in order:
    replay buffer -> tensor conversion -> x/y split -> model prediction ->
    Q-target calculation -> loss -> backward/optimizer step.
    """
    call_on=L(dp.iter.IterableWrapper)
    # Hyper-parameters forwarded to `QCalc`. `init_pipes` runs as soon as the callback is
    # added, so override these on a subclass, or on an instance before adding it.
    discount=0.99
    nsteps=3

    def init_pipes(self):
        "Build the ordered list of pipe factories, each bound to `self.learn`."
        self.pipes=L(
            partial(ExperienceReplay,learn=self.learn),
            partial(Dict2TensorBatch,learn=self.learn),
            partial(XYSplit,learn=self.learn,x_fld='state'),
            partial(ModelPredict,learn=self.learn),
            partial(QCalc,learn=self.learn,discount=self.discount,nsteps=self.nsteps),
            partial(LossCalc,learn=self.learn),
            partial(ModelLearnCalc,learn=self.learn)
        )


In [16]:
# Stand-alone check: ten single-key dicts feed an ExperienceReplay that is not attached to a learner.
pipe = dp.iter.IterableWrapper([dict(test=np.array([i])) for i in range(10)])
exp = ExperienceReplay(pipe,None)

In [18]:
# NOTE(review): Agent, the agent pipes and the Gym transforms are not imported in any visible cell;
# presumably they come from the fastrl star imports - confirm.
# Set up the core NN (state size 4, 2 actions)
model = DQN(4,2)
# Set up the agent: each stage wraps the previous one and shares `agent_base`
agent_base = Agent(model,[])
agent = RawOutOfStep(agent_base,agent_base,'state')
agent = DiscreteEpsilonRandomSelect(agent,agent_base,2,min_epsilon=0)
agent = ArgmaxOfStep(agent,agent_base)
agent = ToDiscrete(agent,agent_base)
# Set up the data block
GymTransformBlock = TransformBlock(
    type_tfms  = GymTypeTransform,
    item_tfms  = (GymStepTransform(agent),DictToTensor),
    batch_tfms = DictCollate,
    cbs = NStepCallback(nsteps=3)
)
# Init with supported loader loop
block = DataBlock(
    blocks=GymTransformBlock,
    loader_loop=simple_iter_loader_loop
)
# Init the loader(s): three 'CartPole-v1' sources
dls = block.dataloaders(['CartPole-v1']*3,n=20,n_workers=0,bs=3)

In [19]:
# Build the learner: AdamW over the DQN parameters, MSE loss, and the simple RL callback
# (passed as a class; `Learner.add_cb` instantiates it).
learn = Learner(model, dls, opt=AdamW(model.parameters(),lr=0.01), 
                loss_func=MSELoss(),train_loop=only_train_loop,
               cbs=ReinforcementLearningSimpleCallback)

In [20]:
# Show the graph of datapipes reachable from the learner.
traverse(learn)

{Learner: {_IterDataPipeSerializationWrapper: {BatcherIterDataPipe: {BatchTransformLoop: {BatcherIterDataPipe: {Flattener: {NStepPipe: {NSkipPipe: {ItemTransformLoop: {ToDiscrete: {ArgmaxOfStep: {DiscreteEpsilonRandomSelect: {RawOutOfStep: {Agent: {}},
             Agent: {RawOutOfStep: {}}},
            Agent: {DiscreteEpsilonRandomSelect: {RawOutOfStep: {}}}},
           Agent: {DiscreteEpsilonRandomSelect: {RawOutOfStep: {}}}},
          CyclerIterDataPipe: {ShardingFilterIterDataPipe: {MapToIterConverterIterDataPipe: {InMemoryCacheHolderMapDataPipe: {TypeTransformLoop: {SequenceWrapperMapDataPipe: {}}}}}}}}}}}}}}}}

In [21]:
# Class-level switch: set to True to print every item the replay buffer receives.
ExperienceReplay.debug=False

In [22]:
# Run training; each printed tensor below is one loss value yielded by the pipe chain.
learn.fit(4)

tensor(0.4057, grad_fn=<CloneBackward0>)
tensor(0.2460, grad_fn=<CloneBackward0>)
tensor(0.1467, grad_fn=<CloneBackward0>)
tensor(0.0826, grad_fn=<CloneBackward0>)
tensor(9.0700, grad_fn=<CloneBackward0>)
tensor(6.4437, grad_fn=<CloneBackward0>)
tensor(0.0281, grad_fn=<CloneBackward0>)
tensor(0.6099, grad_fn=<CloneBackward0>)
tensor(0.3778, grad_fn=<CloneBackward0>)
tensor(0.6248, grad_fn=<CloneBackward0>)
tensor(1.6429, grad_fn=<CloneBackward0>)
tensor(1.4480, grad_fn=<CloneBackward0>)
tensor(0.7320, grad_fn=<CloneBackward0>)
tensor(0.5917, grad_fn=<CloneBackward0>)
tensor(0.2401, grad_fn=<CloneBackward0>)
tensor(0.8641, grad_fn=<CloneBackward0>)
tensor(0.3782, grad_fn=<CloneBackward0>)
tensor(0.3719, grad_fn=<CloneBackward0>)
tensor(0.3894, grad_fn=<CloneBackward0>)
tensor(0.3869, grad_fn=<CloneBackward0>)
tensor(0.3677, grad_fn=<CloneBackward0>)
tensor(0.9694, grad_fn=<CloneBackward0>)
tensor(0.4739, grad_fn=<CloneBackward0>)
tensor(0.2703, grad_fn=<CloneBackward0>)
tensor(0.2282, g

In [23]:
# hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.export import *
    from nbdev.export2html import *
    from nbdev.cli import *
    # Regenerate the README, then run nbdev's notebook-to-script export quietly.
    make_readme()
    notebook2script(silent=True)

converting /home/fastrl_user/fastrl/nbs/index.ipynb to README.md
