In [None]:
#hide
#skip
# Turn off the jedi backend of IPython's tab completer.
%config Completer.use_jedi = False
# upgrade fastrl on colab
# The `[ -e /content ]` test gates the whole `&&` chain, so nothing is installed unless /content
# exists (presumably only the case on Colab). Only the apt-get step has its output redirected.
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [None]:
# hide
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # NOTE(review): `os` is not imported in this cell; presumably one of the star imports above provides it -- confirm.
    # When IN_TEST is unset or empty, sanity-check that this is a local notebook / IPython session.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    # NOTE(review): this binds the name `display` in the notebook namespace, which would shadow any existing `display` helper.
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

In [None]:
# default_exp agents.dqn.core

In [None]:
# export
# Python native modules
import os
from collections import deque
# Third party libs
import torch
from torch.nn import *
from fastcore.all import *
from fastai.learner import *
from fastai.torch_basics import *
from fastai.torch_core import *
from fastai.callback.all import *
# Local modules
from fastrl.data.block import *
from fastrl.agent import *
from fastrl.core import *

# DQN Core
> Contains the minimum DQN API.

In [None]:
# export
class DQN(Module):
    "Small fully connected network: `state_sz` inputs -> `hidden` units -> `action_sz` outputs."
    def __init__(self,state_sz:int,action_sz:int,hidden=512):
        # The input-side layer is built first so parameters are initialised in the same order as before.
        in_proj=Linear(state_sz,hidden)
        out_proj=Linear(hidden,action_sz)
        self.layers=Sequential(in_proj,ReLU(),out_proj)

    def forward(self,x):
        "Run `x` through the layer stack and return its output."
        return self.layers(x)

In [None]:
# Smoke test: a DQN with 4 state features and 2 actions, fed a random batch of 10 states.
dqn=DQN(4,2)
dqn(torch.randn((10,4)))

tensor([[ 0.0043,  0.0131],
        [-0.2317,  0.2354],
        [-0.1468,  0.1337],
        [-0.0957,  0.1546],
        [-0.1007,  0.1002],
        [-0.3004,  0.1560],
        [-0.2387,  0.1041],
        [-0.1160,  0.3274],
        [-0.1999,  0.2108],
        [-0.3804,  0.0979]], grad_fn=<AddmmBackward>)

In [None]:
# export
class ArgMaxFeed(AgentCallback):
    "Feeds the current state through the agent's model and selects the highest-scoring output index."
    def before_action(self):
        "Record the raw output shape and store the per-row argmax as a single-column tensor of actions."
        state=self.experience['state'].to(default_device())
        scores=self.agent.model(state)
        self.agent.raw_action_shape=scores.shape
        greedy=scores.argmax(dim=1)
        self.agent.action=greedy.reshape(-1,1)
        
class DiscreteEpsilonRandomSelect(AgentCallback):
    "Epsilon-greedy exploration: each chosen action is swapped for a random one with probability `epsilon`."

    def __init__(self,epsilon=0.5,idx=0,min_epsilon=0.2,max_epsilon=1,max_steps=5000):
        # epsilon:     probability that a row's action is replaced by a random action
        # idx:         step counter, incremented on every `before_noise` call made in training mode
        # min_epsilon: floor of the decay schedule
        # max_epsilon: starting point of the decay schedule
        # max_steps:   divisor of `idx` in the schedule (epsilon drops by 1/max_steps per training call)
        store_attr()

    def before_noise(self):
        "Randomise a subset of `agent.action`, record what was done in `experience`, then decay `epsilon`."
        n_rows=self.agent.action.shape[0]
        # Uniform samples in [0,1): `u<epsilon` is true with probability `epsilon`.
        # (A standard-normal sample would not do this: P(z<0.5) is roughly 0.69.)
        self.mask=torch.rand(size=(n_rows,))<self.epsilon
        self.experience['randomly_selected']=self.mask.reshape(-1,1)
        self.experience['epsilon']=torch.full(self.agent.action.shape,self.epsilon)
        # Key spelling ('orignal') is kept as-is: it is a runtime key that other code may read.
        self.experience['orignal_actions']=self.agent.action.detach().clone()
        # `random_(0,n)` draws integers from 0 to n-1, n being the width of the model's raw output.
        self.agent.action[self.mask]=self.agent.action[self.mask].random_(0,self.agent.raw_action_shape[1])
        self.agent.action=self.agent.action.detach().cpu().numpy()

        # Linear decay clamped at `min_epsilon`; only advanced while the model is in training mode.
        if self.agent.model.training:
            self.idx+=1
            self.epsilon=max(self.min_epsilon,self.max_epsilon-self.idx/self.max_steps)

In [None]:
# Wrap the network in an agent that takes the argmax action and then randomises some of the picks,
# and ask it for actions on a batch of 10 random states.
agent=Agent(dqn,cbs=[ArgMaxFeed,DiscreteEpsilonRandomSelect(max_epsilon=0.5)]).to(default_device())
agent.do_action(state=torch.randn((10,4)).to(default_device()))


(array([[0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1]]),
 {'state': tensor([[ 0.6655,  0.1478, -1.0057,  0.0766],
          [ 0.5459,  1.1897, -0.0463, -0.3098],
          [-0.6644, -1.0284,  0.6910, -0.5376],
          [ 0.0307, -0.4979,  0.4100,  0.4971],
          [ 2.3816,  0.3332, -0.5500, -1.5564],
          [-1.0884,  1.2270, -0.6541,  0.1153],
          [ 1.1137,  1.8792, -0.5866,  1.6380],
          [-0.5739,  0.3266,  1.4212, -1.5118],
          [-0.3505,  1.8465,  1.8109, -0.7341],
          [-0.7187,  0.7667,  0.9460, -0.6117]], device='cuda:0'),
  'randomly_selected': tensor([[ True],
          [ True],
          [ True],
          [ True],
          [ True],
          [ True],
          [ True],
          [ True],
          [ True],
          [False]]),
  'epsilon': tensor([[0.5000],
          [0.5000],
          [0.5000],
          [0.5000],
          [0.5000],
          [0.5000],
          [0.500

In [None]:
# Print the agent's action loop and the callbacks attached to each event.
agent.show_loop()

Start Action
   - before_preprocess: []
   - after_preprocess: []
   - before_action  : [ArgMaxFeed]
   - after_action   : []
   - before_noise   : [DiscreteEpsilonRandomSelect]
   - after_noise    : []
End Action


In [None]:
# export
@patch
def _do_epoch_validate(self:Learner,*args,**kwargs):
    "Patched onto `Learner`: ignores its arguments, does no work and returns 0."
    return 0

In [None]:
# export        
class Epsilon(Metric):
    "Metric that reports the `epsilon` of the `DiscreteEpsilonRandomSelect` callback found in `learn.model.cbs`."
    order=30
    epsilon=0

    @property
    def value(self):
        "Most recently recorded epsilon (0 if none was recorded since the last reset)."
        return self.epsilon
    def reset(self):
        "Clear the recorded epsilon."
        self.epsilon=0
    def accumulate(self,learn):
        "Copy `epsilon` from every matching callback in `learn.model.cbs`; the last match wins."
        for cb in learn.model.cbs:
            # isinstance rather than an exact type comparison, so subclasses of the callback are reported too.
            if isinstance(cb,DiscreteEpsilonRandomSelect):
                self.epsilon=cb.epsilon

In [None]:
# export
class ExperienceReplay(Callback):
    "Keeps detached copies of incoming batches in a bounded memory and replaces `learn.yb` with a random sample of it."
    def __init__(self,*args,bs=16,max_sz=200,warmup_sz=100,**kwargs):
        # bs:        number of rows sampled from memory on every step
        # max_sz:    row count at which memory stops growing and starts overwriting old rows
        # warmup_sz: minimum number of stored rows before a batch is allowed through
        # `*args`/`**kwargs` are accepted but not used in this block.
        store_attr()
        self.memory=None
        self.pointer=0

    def after_pred(self):
        "Store the current batch, swap `learn.yb` for a sample from memory, and cancel the batch during warm-up."
        yb=BD(self.learn.yb[0]).mapv(to_detach)
        if self.memory is None:            self.memory=yb
        elif self.memory.bs()<self.max_sz: self.memory+=yb
        else:
            # Overwrite the window starting at `pointer` with the new rows.
            self.memory=self.memory[:self.pointer]+yb+self.memory[self.pointer+yb.bs():]
            self.pointer+=yb.bs()
            # Wrap as soon as the pointer reaches `max_sz`. With `>` the next write happened at index
            # `max_sz`, which appended rows and let the memory grow past its limit.
            # NOTE(review): a multi-row batch that straddles the end of the buffer is still not split -- confirm acceptable.
            if self.pointer>=self.max_sz: self.pointer=0
        with torch.no_grad():
            # Sample `bs` row indices uniformly, with replacement, from everything stored so far.
            idxs=np.random.randint(0,self.memory.bs(),self.bs).tolist()
            self.learn.yb=(self.memory[idxs].mapv(to_device),)

        # Not enough experience yet: skip the rest of this batch.
        if self.memory.bs()<self.warmup_sz: raise CancelBatchException

In [None]:
# export
@patch
def after_create(self:Callback):
    "Patched onto `Callback`: turn on `train_metrics` for every learner callback that has that attribute."
    for callback in self.learn.cbs:
        if not hasattr(callback,'train_metrics'): continue
        callback.train_metrics=True


class DQNTrainer(Callback):
    """Performs traditional training on `next_q`. Requires a callback such as `RegularNextQ`

    In the code visible here, `after_pred` itself computes `next_q` (max over the model's outputs
    for `next_state`), builds `targets = reward + next_q * discount**n_steps`, recomputes `pred`
    for `state`, and replaces `learn.yb` with a copy of `pred` whose taken-action column holds the targets.
    `before_backward` then puts a saved copy of the original batch back into `learn.yb`.
    """
    def __init__(self,discount=0.99,n_steps=1):
        # discount: per-step discount factor; n_steps: exponent applied to it when building targets.
        store_attr()
        # Clone of the incoming batch, filled in by `after_pred` and restored in `before_backward`.
        self._yb=None  
    
    def after_pred(self): 
        # Unwrap the 1-tuple so that the `self.yb[...]` lookups below index the batch itself.
        # NOTE(review): relies on `self.yb` resolving to `self.learn.yb` (presumably fastai's Callback attribute delegation) -- confirm.
        self.learn.yb=self.yb[0]
        # Keep an independent copy of every entry of the batch.
        self._yb=({k:v.clone() for k,v in self.yb.items()},)
        # Flatten the done flags to 1-D so they can be used to index rows.
        self.learn.done_mask=self.yb['done'].reshape(-1,)
        # Maximum model output along dim 1 for the next states, reshaped to a column.
        # NOTE(review): no `torch.no_grad()`/`detach` is applied in this block, so the targets may carry gradients -- confirm this is intended.
        self.learn.next_q=self.learn.model.model(self.yb['next_state']).max(dim=1).values.reshape(-1,1)
        # Rows flagged as done get a next-state value of zero.
        self.learn.next_q[self.done_mask]=0 #yb[done_mask]['reward']
        self.learn.targets=self.yb['reward']+self.learn.next_q*(self.discount**self.n_steps)
        # Replace the prediction with the model's output for the current states.
        self.learn.pred=self.learn.model.model(self.yb['state'])
        # Copy the prediction and write the targets into the column of the action taken,
        # so every other column is equal to the prediction.
        t_q=self.pred.clone()
        t_q.scatter_(1,self.yb['action'],self.targets)
        self.learn.yb=(t_q,)
        
    # Restore the saved copy of the original batch into `learn.yb`.
    def before_backward(self): self.learn.yb=self._yb

In [None]:
# slow
# Fresh network and agent for the training run (rebinds `dqn` and `agent` from the earlier demo cells).
dqn=DQN(4,2)
agent=Agent(dqn,cbs=[ArgMaxFeed,DiscreteEpsilonRandomSelect])
# Source built on 'CartPole-v1' with a single environment and a fixed seed.
source=Src('CartPole-v1',agent,steps_count=1,n_envs=1,seed=0,
           steps_delta=1,cbs=[GymSrc,FirstLast])

# Dataloaders over that source: n=1000, batch size 1, no worker processes.
dls=SourceDataBlock(
    blocks=SourceBlock(source)
).dataloaders([source],n=1000,bs=1,num_workers=0)

# Replay memory: samples 32 rows per step, overwrites once 100000 rows are stored,
# and cancels batches until at least 32 rows are stored.
exp_replay=ExperienceReplay(bs=32,max_sz=100000,warmup_sz=32)

# The agent is passed as the Learner's model; loss is MSE between `pred` and the targets set by DQNTrainer.
learn=Learner(dls,agent,loss_func=MSELoss(),
              cbs=[exp_replay,DQNTrainer],
              metrics=[Reward,Epsilon])

> Note: This should work without an experience replay, although it will perform poorly.

In [None]:
# Train for 5 epochs with learning rate 1e-4 and weight decay set to 0.
learn.fit(5,lr=0.0001,wd=0)

epoch,train_loss,train_reward,train_epsilon,valid_loss,valid_reward,valid_epsilon,time
0,0.476229,21.67,0.8,00:28,,,
1,0.461504,23.09,0.6,00:29,,,
2,0.452432,23.6,0.4,00:28,,,
3,0.446137,22.62,0.2,00:30,,,
4,0.443432,24.74,0.2,00:31,,,


In [None]:
# hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.export import *
    from nbdev.export2html import *
    from nbdev.cli import make_readme
    # Regenerate the README, then convert the notebooks to library modules and to HTML docs.
    make_readme()
    notebook2script()
    notebook2html()

converting /home/fastrl_user/fastrl/nbs/index.ipynb to README.md
Converted 00_core.ipynb.
Converted 00_nbdev_extension.ipynb.
Converted 03_callback.core.ipynb.
Converted 04_agent.ipynb.
Converted 05_data.block.ipynb.
Converted 05_data.test_async.ipynb.
Converted 10a_agents.dqn.core.ipynb.
Converted 10b_agents.dqn.targets.ipynb.
Converted 10c_agents.dqn.double.ipynb.
Converted 10d_agents.dqn.dueling.ipynb.
Converted 10e_agents.dqn.categorical.ipynb.
Converted 20_test_utils.ipynb.
Converted index.ipynb.
Converted nbdev_template.ipynb.
converting: /home/fastrl_user/fastrl/nbs/10a_agents.dqn.core.ipynb
converting: /home/fastrl_user/fastrl/nbs/10e_agents.dqn.categorical.ipynb
