In [1]:
#|hide
#|eval: false
# The install commands only run when the path /content exists (presumably the Colab
# working directory -- confirm); everywhere else the `[ -e /content ]` test short-circuits
# the `&&` chain. Output of the apt-get step is discarded.
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [2]:
#|hide
# Imported here explicitly: the `os.environ` check below previously relied on `os`
# leaking in through a star import, since the notebook's own `import os` lives in a later cell.
import os
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # The environment sanity checks are skipped when the IN_TEST variable is set.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

In [3]:
#|default_exp memory.experience_replay

In [4]:
#|export
# Python native modules
import os
from typing import *
from warnings import warn
# Third party libs
from fastcore.all import *
from fastai.learner import *
from fastai.torch_basics import *
from fastai.torch_core import *
import torchdata.datapipes as dp
# Local modules
from fastrl.core import *

# Experience Replay
> Experience Replay is likely the simplest form of memory used by RL agents. 

In [7]:
#|export
class ExperienceReplay(dp.iter.IterDataPipe):
    # The public docstrings (class, `sample`, `add`) are attached by the `add_docs(...)`
    # call that follows this class, so only `#` comments are used inside the body.
    debug=False  # when True, `__iter__` prints every item pulled from `source_datapipe`
    def __init__(self,source_datapipe,learn=None,bs=1,max_sz=100,
                 return_idxs=False):
        # Fixed-length object array; slots that have not been written yet hold None.
        self.memory = np.array([None]*max_sz)
        self.source_datapipe = source_datapipe
        self.learn = learn
        # When a learner is supplied, give it a back-reference to this buffer.
        if learn is not None:
            self.learn.experience_replay = self
        self.bs = bs
        self.max_sz = max_sz
        self._sz_tracker = 0     # number of filled slots, never exceeds `max_sz`
        self._idx_tracker = 0    # slot the next write goes to (== `max_sz` just before wrapping)
        self._cycle_tracker = 0  # how many times the write index has wrapped back to 0
        self.return_idxs = return_idxs
    
    def sample(self,bs=None):  
        # Draw `bs` (default: `self.bs`) distinct indices from the filled part of `memory`.
        # Because `replace=False`, numpy raises ValueError if more samples are requested
        # than there are stored steps.
        idxs = np.random.choice(range(self._sz_tracker),size=(ifnone(bs,self.bs),),replace=False)
        if self.return_idxs: return self.memory[idxs],idxs
        return self.memory[idxs]
    
    def __repr__(self):
        # Show every attribute, but summarise `memory` by its fill count instead of dumping it.
        return str({k:v if k!='memory' else f'{len(self)} elements' for k,v in self.__dict__.items()})

    def __len__(self): return self._sz_tracker
    
    def __iter__(self):
        # For every item from the source: store it (or each of its elements), then yield
        # one sampled batch once at least `bs` steps are stored.
        for b in self.source_datapipe:
            if self.debug: print('Experience Replay Adding: ',b)
            
            # The message formats `b`; it previously referenced the not-yet-defined name
            # `step`, which turned this intended error into a NameError.
            if not issubclass(b.__class__,(StepType,list,tuple)):
                raise Exception(f'Expected typing.NamedTuple,list,tuple object got {type(b)}\n{b}')
            
            # A single step is stored directly; anything else that passed the guard above
            # is a list/tuple whose elements are stored one by one.
            if issubclass(b.__class__,StepType):   self.add(b)
            else:
                for step in b: self.add(step)
        
            # Nothing is yielded until the buffer can supply a full batch.
            if self._sz_tracker<self.bs: continue
            yield self.sample()

    def add(self,step:StepType): 
        if self._sz_tracker==0: 
            # First write into an empty buffer.
            self.memory[self._idx_tracker] = step
            self._sz_tracker += 1
            self._idx_tracker = 1
        elif 0<self._sz_tracker<self.max_sz:
            # Still filling: append at the write index and grow the size.
            self.memory[self._idx_tracker] = step
            self._sz_tracker += 1
            self._idx_tracker += 1
        elif self._sz_tracker>=self.max_sz:
            # Buffer is full: wrap the write index to 0 when it runs off the end, then
            # overwrite in place; the size no longer changes.
            if self._idx_tracker>=self.max_sz:
                self._idx_tracker = 0
                self._cycle_tracker += 1
            self.memory[self._idx_tracker] = step
            self._idx_tracker += 1
        else:
            # Only reachable if `_sz_tracker` has been corrupted (e.g. set negative).
            raise Exception(f'This should not have occured: {self.__dict__}')
            
# Docstrings for `ExperienceReplay` and its public methods are supplied here rather than inline.
add_docs(
    ExperienceReplay,
    """Simplest form of memory. Takes steps from `source_datapipe` and stores them in `memory`. 
       It outputs `bs` steps.""",
    sample="Returns `bs` steps sampled uniformly, without replacement, from the filled part of `memory`.",
    add="Adds `step` to `memory`. Once `memory` holds `max_sz` steps, new steps overwrite the oldest ones."
)

Let's generate some batches to test with...

In [6]:
from fastrl.pipes.core import *
from fastrl.fastai.data.block import *
from fastrl.envs.gym import *

def baseline_test(envs,total_steps,seed=0):
    "Run the stepping pipeline over `envs` without any replay memory; return the first `total_steps` outputs and the pipe."
    # Same stages as before, composed inside-out:
    # Mapper -> TypeTransformLoop -> MapToIterConverter -> InMemoryCacheHolder -> cycle -> GymStepper
    stepper = GymStepper(
        dp.iter.InMemoryCacheHolder(
            dp.iter.MapToIterConverter(
                TypeTransformLoop(dp.map.Mapper(envs),[GymTypeTransform])
            )
        ).cycle(),
        seed=seed
    )

    # `range` is listed first so that exactly `total_steps` items are pulled from the pipe.
    collected = []
    for _,step in zip(range(total_steps),stepper):
        collected.append(step)
    return collected, stepper

@delegates(ExperienceReplay)
def exp_replay_test(envs,total_steps,seed=0,**kwargs):
    "Run the stepping pipeline over `envs` through an `ExperienceReplay` built with `kwargs`; return its first `total_steps` outputs and the pipe."
    # Stepping stages, composed inside-out:
    # Mapper -> TypeTransformLoop -> MapToIterConverter -> InMemoryCacheHolder -> cycle -> GymStepper
    stepper = GymStepper(
        dp.iter.InMemoryCacheHolder(
            dp.iter.MapToIterConverter(
                TypeTransformLoop(dp.map.Mapper(envs),[GymTypeTransform])
            )
        ).cycle(),
        seed=seed
    )
    replay = ExperienceReplay(stepper,**kwargs)

    # `range` is listed first so that exactly `total_steps` items are pulled from the pipe.
    collected = []
    for _,step in zip(range(total_steps),replay):
        collected.append(step)
    return collected, replay

In [7]:
# Zero requested steps: nothing is pulled through the pipe, so the buffer must report length 0.
steps, experience_replay = exp_replay_test(envs=['CartPole-v1'],total_steps=0,bs=1)
test_eq(len(experience_replay),0)

**What if we fill up ER?**
Let's add the batches; this process will happen in place...

In [8]:
# Ten steps into a 20-slot buffer: size and write index both reach 10, no wrap-around yet.
steps, experience_replay = exp_replay_test(envs=['CartPole-v1'],total_steps=10,max_sz=20)
for attr,expected in (('_sz_tracker',10),('_idx_tracker',10),('_cycle_tracker',0)):
    test_eq(getattr(experience_replay,attr),expected)
test_len(experience_replay,10)

If we run 10 more times, the total size should be 20...

In [9]:
# Pull ten more batches from the same pipe. `steps` ends up holding the range counters;
# the sampled batches themselves are discarded, only the side effect on the buffer matters.
steps = [count for count,_ in zip(range(10),experience_replay)]
for attr,expected in (('_sz_tracker',20),('_idx_tracker',20),('_cycle_tracker',0)):
    test_eq(getattr(experience_replay,attr),expected)
test_len(experience_replay,20)

The `experience_replay` memory should contain the same steps we would get by running the pipeline without it...

In [10]:
# Re-run the plain pipeline for 20 steps and compare it slot by slot with the buffer.
steps, pipe = baseline_test(envs=['CartPole-v1'],total_steps=20)

for baseline_step,memory_step in zip(steps,experience_replay.memory):
    for field in ('state','next_state'):
        test_eq(getattr(baseline_step,field),getattr(memory_step,field))

Since `max_sz` is 20 and we have run a total of 20 steps so far, running another 10 steps
should set `_cycle_tracker` to 1 (since this is a new cycle) and `_idx_tracker` to 10, since it should 
have reset and stopped halfway through the memory. The `_sz_tracker` should still be 20.

In [11]:
# Ten more pulls on a full buffer: the write index wraps once and stops at 10, size stays 20.
# `steps` only collects the range counters; the sampled batches are discarded.
steps = [count for count,_ in zip(range(10),experience_replay)]
for attr,expected in (('_sz_tracker',20),('_idx_tracker',10),('_cycle_tracker',1)):
    test_eq(getattr(experience_replay,attr),expected)
test_len(experience_replay,20)

...and if we run the baseline, the last 10 steps of the baseline should match the first 10 steps in memory,
since it is in the middle of rewriting the memory due to being at max size.

In [12]:
# Baseline steps 20..29 are compared with the ten slots that have just been overwritten.
steps, pipe = baseline_test(envs=['CartPole-v1'],total_steps=30)

for baseline_step,memory_step in zip(steps[20:],experience_replay.memory[:10]):
    for field in ('state','next_state'):
        test_eq(getattr(baseline_step,field),getattr(memory_step,field))

Finally we want to finish writing over the memory in its entirety. 

In [13]:
# Ten more pulls finish the overwrite pass: the write index is back at 20, still in cycle 1.
# `steps` only collects the range counters; the sampled batches are discarded.
steps = [count for count,_ in zip(range(10),experience_replay)]
for attr,expected in (('_sz_tracker',20),('_idx_tracker',20),('_cycle_tracker',1)):
    test_eq(getattr(experience_replay,attr),expected)
test_len(experience_replay,20)

In [14]:
# After 40 steps in total, the whole buffer should hold baseline steps 20..39 in order.
steps, pipe = baseline_test(envs=['CartPole-v1'],total_steps=40)

for baseline_step,memory_step in zip(steps[20:],experience_replay.memory):
    for field in ('state','next_state'):
        test_eq(getattr(baseline_step,field),getattr(memory_step,field))

Let's verify that the steps are what we expect...

**What if we sample the experience?**

In [15]:
# Walk the sampled batches and check that each step differs from the one seen just before it.
# NOTE(review): the comparison also spans batch boundaries (last step of one batch vs. first
# step of the next), where the same slot could be drawn again by chance -- confirm that this
# is intended, since sampling is unseeded.
steps, experience_replay = exp_replay_test(envs=['CartPole-v1'],total_steps=1000,bs=300,max_sz=1000)
memory = None
for batch_no,sample in enumerate(experience_replay):
    for s in sample:
        if memory is not None:
            test_ne(s,memory)
        memory = copy(s)
    # Stop once batch index 101 has been processed.
    if batch_no>=101:
        break

We should be able to sample enough times that we have sampled **everything**. 
So we test this by sampling repeatedly, recording which memory slots have been hit, and checking that every slot was hit at least once.

In [16]:
steps, experience_replay = exp_replay_test(['CartPole-v1'],1000,bs=1,max_sz=30,return_idxs=True)
# Sampling is unseeded, so this check is statistical. With 150 single draws over 30 slots
# the chance of missing at least one slot is roughly 30*(29/30)**150, about 17%, which made
# the test flaky. With 1000 draws that bound drops to roughly 30*(29/30)**1000, about 6e-14.
n_draws = 1000
# One flag per buffer slot, sized from the buffer itself rather than a repeated literal.
memory_hits = [False]*experience_replay.max_sz
for i in range(n_draws):
    res,idxs = experience_replay.sample()
    for idx in idxs: memory_hits[idx] = True
test_eq(all(memory_hits),True)

In [8]:
#|hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    # Outside Colab only: run nbdev's export step (presumably writes the `#|export` cells
    # to the module named by `#|default_exp` -- confirm against the nbdev docs).
    from nbdev import nbdev_export
    nbdev_export()