In [1]:
#hide
#skip
# Turn off jedi-based tab completion for this IPython session.
%config Completer.use_jedi = False
# upgrade fastrl on colab
# The shell test `[ -e /content ]` gates everything after it: the pip/apt installs only run
# when the /content path exists (used here as the "running on Colab" check).
# apt-get output (stdout and stderr) is discarded.
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [2]:
# hide
from fastcore.imports import in_colab
if in_colab():
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()
else:
    # Since colab still requires tornado<6, nbdev is only imported when we are NOT on colab
    from nbverbose.showdoc import *
    from nbdev.imports import *
    # Outside of the test harness (IN_TEST unset/empty), check that this really is a
    # local notebook session before continuing.
    if not os.environ.get("IN_TEST"):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON

In [3]:
# default_exp agents.dqn.targets

In [4]:
# export
# Python native modules
import os
from collections import deque
from typing import *
# Third party libs
import torch
from torch.nn import *
from torch import optim
from fastcore.all import *
from fastai.learner import *
from fastai.torch_basics import *
from fastai.torch_core import *
from fastai.optimizer import OptimWrapper
from fastai.callback.all import *
# Local modules
from fastrl.data.block import *
from fastrl.data.gym import *
from fastrl.agent import *
from fastrl.core import *
from fastrl.agents.dqn.core import *
from fastrl.memory.experience_replay import *

# DQN Targets + N-Step
> A Bare-Bones DQN is usually extremely unstable. Target models can alleviate this. We also support First-Last N-step transitions better.

In [5]:
GAMMA=0.99
def calc_target(net, local_reward, next_state, d, gamma=None):
    """Return the one-step Q-learning target for a single transition.

    Args:
        net: Callable mapping a float32 batch of states to a 2-D tensor of
            Q-values (one row per state); only its row-wise maximum is used.
        local_reward: Reward observed for the transition.
        next_state: Observation that followed the action, or `None` when
            there is no next state.
        d: Truthy when the transition ended the episode.
        gamma: Discount applied to the bootstrapped value. `None` (the
            default) uses the module-level `GAMMA` as read at call time.

    Returns:
        `local_reward` unchanged for terminal transitions (`next_state is
        None` or `d` truthy), otherwise `local_reward + gamma * max_a Q(next_state, a)`.
    """
    if next_state is None or d:
        return local_reward
    if gamma is None:
        gamma = GAMMA
    # Build a batch of one. `as_tensor(...).unsqueeze(0)` yields the same
    # (1, ...) shape as wrapping the state in a list, without the slow
    # list-of-ndarrays conversion path.
    state_v = torch.as_tensor(next_state, dtype=torch.float32).unsqueeze(0).to(default_device())
    # Only the scalar maximum is needed, so do not record an autograd graph.
    with torch.no_grad():
        next_q_v = net(state_v)
    best_q = next_q_v.max(dim=1)[0].item()
    return local_reward + gamma * best_q



In [6]:
# # for  i in range(1000):

# # learn.opt.zero_grad()

# learn.state_action_values = learn.model.model(learn.xb['state']).gather(1, learn.xb['action']).squeeze(-1)
# learn.next_state_values = learn.target_model(learn.xb['next_state']).max(1)[0]
# learn.next_state_values[learn.xb['done'].squeeze(-1)] = 0.0

# learn.expected_state_action_values = learn.next_state_values.detach() * (0.99**3) + learn.xb['reward'].squeeze(-1)
# learn.loss= nn.MSELoss()(learn.state_action_values,learn.next_state_values)

# # learn.loss.backward()
# # learn.opt.step()

# print(learn.loss)

In [7]:
# learn.state_action_values

In [8]:
# learn.xb['reward'].squeeze(-1)

In [9]:
# learn.next_state_values.detach()

In [10]:
# export
class DQNTargetTrainer(Callback):
    """Callback that runs the DQN update itself, bootstrapping from a target network.

    The target network is a deep copy of `learn.model.model` taken in
    `before_fit`; its weights are refreshed from that same module every
    `target_sync` batches.

    Args:
        n_batch: Stored by `store_attr`, but always reset to 0 in `before_fit`.
        target_sync: Number of batches between target-network weight syncs.
        discount: Discount factor; the bootstrapped value is scaled by
            `discount**n_steps`.
        n_steps: Number of environment steps each transition spans.
    """

    def __init__(self,n_batch=0,target_sync=300,discount=0.99,n_steps=1):
        store_attr()
        # Holds the batch targets while `learn.yb` is temporarily emptied in `after_pred`.
        self._xb=None

    def before_fit(self):
        "Snapshot the online network as the target network and reset the batch counter."
        self.learn.target_model=deepcopy(self.learn.model.model)
        self.n_batch=0

    def after_pred(self):
        "Compute the TD loss against the target network and take one optimizer step."
        # Stash the targets and empty `learn.yb`; `before_backward` restores them.
        # NOTE(review): presumably this makes the Learner skip its own
        # loss/backward/step for the batch - confirm against fastai's `_do_one_batch`.
        self._xb=self.yb
        self.learn.yb=[]

        self.learn.opt.zero_grad()
        batch=self.learn.xb
        s,a=batch['state'],batch['action']
        ns,r,d=batch['next_state'],batch['reward'],batch['done']

        # Q(s,a) from the online network for the actions that were actually taken.
        self.learn.state_action_values=self.learn.model.model(s).gather(1,a).squeeze(-1)
        with torch.no_grad():
            # max_a' Q_target(s',a'), zeroed where the `done` mask is set.
            self.learn.next_state_values=self.target_model(ns).max(1)[0]
            self.learn.next_state_values[d.squeeze(-1)]=0.0
            self.learn.expected_state_action_values=(
                self.learn.next_state_values*(self.discount**self.n_steps)+r.squeeze(-1)
            )
        self.learn.loss=nn.MSELoss()(self.learn.state_action_values,
                                     self.learn.expected_state_action_values)

        # Diagnostic output on the first batch after each target sync.
        if (self.n_batch-1)%self.target_sync==0:
            print('The loss should be practically zero: ',self.loss)
            print(self.learn.state_action_values-self.learn.expected_state_action_values)

        self.learn.loss.backward()
        self.learn.opt.step()

        # Per-sample squared TD error as a CPU column vector.
        with torch.no_grad():
            self.learn.td_error=(self.learn.state_action_values.cpu()
                                 -self.learn.expected_state_action_values.cpu()).reshape(-1,1)**2

    def before_backward(self):
        "Restore the targets stashed by `after_pred`."
        self.learn.yb=self._xb

    def after_batch(self):
        "Refresh the target network every `target_sync` batches, then count the batch."
        if self.n_batch%self.target_sync==0:
            # Sync from the same module that was deep-copied in `before_fit`
            # (`learn.model.model`), so the state-dict keys always line up.
            self.target_model.load_state_dict(self.learn.model.model.state_dict())
        self.n_batch+=1

In [17]:
# Q-network and the agent that wraps it (arg-max action selection with epsilon-random exploration).
dqn=DQN(4,2)
agent=Agent(
    dqn,
    cbs=[ArgMaxFeed,DiscreteEpsilonRandomSelect(min_epsilon=0.02)],
)

# Experience source: the agent steps through CartPole one step at a time.
source=Source(
    cbs=[
        GymLoop('CartPole-v1',agent,steps_count=1,seed=None,steps_delta=1),
        FirstLast,
    ]
)
dls=SourceDataBlock().dataloaders([source],n=1000,bs=1,num_workers=0)

# Tensorboard logging for the experience replay.
er_tb=ExperienceReplayTensorboard(comment='_dqn_target',every_epoch=1)

# Learner wiring: replay buffer -> target-network trainer -> tensorboard logger.
learn=Learner(
    dls,
    agent,
    loss_func=MSELoss(),
    opt_func=partial(OptimWrapper,dqn.parameters(),optim.Adam),
    cbs=[
        ExperienceReplayCallback(bs=32,max_sz=1000,warmup_sz=32,freeze_at_max=True),
        DQNTargetTrainer(n_steps=1,target_sync=5000),
        er_tb,
    ],
    metrics=[Reward,Epsilon,NEpisodes],
)

Could not do one pass in your dataloader, there is something wrong in it


In [18]:
# hide
SHOW_TENSOR_BOARD=True
# Launch tensorboard only when it is enabled above and we are not under the test harness (IN_TEST unset/empty).
if SHOW_TENSOR_BOARD and not os.environ.get("IN_TEST"):
    run_tensorboard(samples_per_plugin='images=2000')

A few things: the loss should be <1. If it is not, there is something majorly wrong with the training.

It is alright for the actual values we are comparing to be much greater than 1, but the loss itself is not expected to be high.

In [19]:
slow=True
# 47 epochs for the full (slow) run, 3 epochs otherwise.
learn.fit(47 if slow else 3,lr=0.0001,wd=None)

epoch,train_loss,train_reward,train_epsilon,train_n_episodes,valid_loss,valid_reward,valid_epsilon,valid_n_episodes,time
0,0.002783,17.960784,0.6,51,00:35,,,,
1,0.001929,19.73,0.2,102,00:21,,,,
2,0.001884,20.88,0.02,145,00:23,,,,
3,0.001725,20.52,0.02,191,00:23,,,,
4,0.001655,21.57,0.02,268,00:22,,,,
5,0.040002,20.93,0.02,331,00:22,,,,
6,0.036043,19.92,0.02,378,00:22,,,,
7,0.036505,19.94,0.02,429,00:21,,,,
8,0.037018,19.19,0.02,481,00:23,,,,
9,0.034429,18.44,0.02,529,00:22,,,,


  warn('image is missing from the experience replay. Image section of the replay will not be logged.')


The loss should be practically zero:  TensorBatch(0.9907, device='cuda:0', grad_fn=<AliasBackward>)
TensorBatch([-1.0440, -1.0236, -1.0498, -0.0246, -1.0358, -1.0149, -1.0102, -0.9955,
        -1.0295, -1.0671, -1.0393, -1.0625, -1.0461, -1.0243, -1.0161, -0.0478,
        -1.0099, -1.0370, -1.0111, -1.0322, -0.9760, -1.0351, -1.0100, -1.0388,
        -0.9724, -1.0122, -1.0463, -1.0477, -1.0482, -1.0032, -1.0263, -1.0659],
       device='cuda:0', grad_fn=<AliasBackward>)
The loss should be practically zero:  TensorBatch(0.9941, device='cuda:0', grad_fn=<AliasBackward>)
TensorBatch([-0.7403, -1.0462, -1.0342, -1.0266, -1.0569, -1.0326, -1.0578, -1.0104,
        -0.8361, -1.0350, -1.0262, -0.9537, -1.0112, -1.0029, -1.0225, -1.0659,
        -1.0202, -0.7528, -0.9697, -1.0647, -1.0191, -0.8861, -0.9843, -0.9945,
        -1.1034, -0.9715, -0.9514, -0.9955, -1.0290, -1.0312, -1.0427, -1.0244],
       device='cuda:0', grad_fn=<AliasBackward>)
The loss should be practically zero:  TensorBatch(

In [None]:
# hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    # The three functions called below are not defined in this notebook; they come from
    # these star imports.
    from nbdev.export import *
    from nbdev.export2html import *
    from nbverbose.cli import *
    # NOTE(review): presumably these regenerate the README, export the `# export` cells to
    # the library module, and render the HTML docs - confirm against the nbdev docs.
    make_readme()
    notebook2script()
    notebook2html()