In [None]:
# default_exp qlearning.dqn

In [None]:
#export
import torch.nn.utils as nn_utils
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.basics import *
from dataclasses import field,asdict
from typing import List,Any,Dict,Callable
from collections import deque
import gym
import torch.multiprocessing as mp
from copy import deepcopy
from torch.optim import *

from fastrl.data import *
from fastrl.async_data import *
from fastrl.basic_agents import *
from fastrl.learner import *
from fastrl.metrics import *
from fastrl.ptan_extension import *

if IN_NOTEBOOK:
    from IPython import display
    import PIL.Image

  return torch._C._cuda_getDeviceCount() > 0


# DQN

In [None]:
# export
class LinearDQN(nn.Module):
    "Fully connected Q-network: `input_shape[0]` inputs -> 512 hidden units (ReLU) -> `n_actions` outputs."
    def __init__(self, input_shape, n_actions):
        super().__init__()
        hidden_units = 512
        # Layers are created in forward order so parameter initialisation order is unchanged.
        layers = [
            nn.Linear(input_shape[0], hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        self.policy = nn.Sequential(*layers)

    def forward(self,x):
        "Cast `x` to float and run it through `self.policy`."
        return self.policy(x.float())
    
class ExperienceReplay(Callback):
    """Callback that keeps a bounded FIFO buffer of `ExperienceFirstLast` items.

    `sz` is the buffer capacity, `bs` the number of items sampled per batch and
    `starting_els` the number of items collected before training starts.
    `max_steps` is stored on the instance but is not read by this class.

    Before fitting, the buffer is pre-filled from `self.dls.train` while
    `learn.agent.warming_up` is set. Before every batch, the incoming batch is
    appended and `bs` items are drawn uniformly (with replacement) into
    `learn.sample_yb`.

    NOTE(review): in the code visible here nothing reads `learn.sample_yb`
    (`DQNTrainer` builds its targets from `learn.xb + learn.yb`) - confirm
    whether the sampled batch is meant to drive training.
    """
    def __init__(self,sz=100,bs=128,starting_els=1,max_steps=1):
        store_attr()
        self.queue=deque(maxlen=int(sz))

    @staticmethod
    def _to_experiences(fields):
        "Turn 7 parallel per-field sequences into one `ExperienceFirstLast` per sample."
        return [ExperienceFirstLast(state=fields[0][i],action=fields[1][i],reward=fields[2][i],
                                    last_state=fields[3][i],done=fields[4][i],
                                    episode_reward=fields[5][i],steps=fields[6][i])
                for i in range(len(fields[0]))]

    def before_fit(self):
        "Pre-fill the buffer from the training loader until it holds `starting_els` items."
        # The deque can never hold more than `maxlen` items, so cap the goal to avoid
        # looping forever when `starting_els` exceeds the buffer capacity.
        goal=min(self.starting_els,self.queue.maxlen)
        self.learn.agent.warming_up=True
        try:
            while len(self.queue)<goal:
                for o in self.dls.train:
                    self.queue.extend(self._to_experiences(o))
                    # Same threshold as the `while` test, so no extra batch is pulled
                    # once exactly `goal` items have been collected.
                    if len(self.queue)>=goal: break
        finally:
            # Always leave warm-up mode, even if collecting experience raised.
            self.learn.agent.warming_up=False

    def before_batch(self):
        "Append the current batch to the buffer, then sample `bs` buffered items into `learn.sample_yb`."
        self.queue.extend(self._to_experiences(list(self.learn.xb)+list(self.learn.yb)))
        idxs=np.random.randint(0,len(self.queue), self.bs)
        # Copies keep later edits to the sampled items from touching the buffer.
        self.learn.sample_yb=[deepcopy(self.queue[i]) for i in idxs]

In [None]:
# export
class EpsilonTracker(Callback):
    "Linearly lower the agent's action-selector epsilon from `e_start` to a floor of `e_stop`."
    def __init__(self,e_stop=0.2,e_start=1.0,e_steps=5000,current_step=0):
        store_attr()

    def before_fit(self):
        "Start fitting with epsilon at `e_start`."
        start=self.e_start
        self.learn.agent.a_selector.epsilon=start

    def after_step(self):
        "Set epsilon from the current step count, then advance the counter by one."
        decayed=self.e_start-self.current_step/self.e_steps
        # Never drop below the configured floor.
        self.learn.agent.a_selector.epsilon=decayed if decayed>self.e_stop else self.e_stop
        self.current_step+=1

In [None]:
# export
def calc_target(net, local_reward,next_state,done,discount):
    """One-step Q-learning target for a single transition.

    Returns `local_reward` unchanged when `done` is truthy. Otherwise returns
    `local_reward + discount * max(net(next_state))`, where `next_state` is cast
    to float and given a leading batch dimension before being passed to `net`.
    """
    if done: return local_reward
    # The bootstrap value is reduced to a plain Python number below, so there is
    # no reason to record an autograd graph for this forward pass.
    with torch.no_grad():
        next_q_v = net(next_state.float().unsqueeze(0))
    best_q = next_q_v.max(dim=1)[0].item()
    return local_reward + discount * best_q

class DQNTrainer(Callback):
    """Callback that swaps in Q-learning predictions and targets for the loss.

    After the model's forward pass, the batch (`learn.xb + learn.yb`) is split
    into per-sample `ExperienceFirstLast` items and `calc_target` is called once
    per sample. `learn.pred` is replaced by the Q-values of the batch states and
    `learn.yb` by a copy of those Q-values in which the entry of the taken
    action holds the computed target. The original `learn.yb` is kept in
    `learn._yb` and restored after the loss has been computed.
    """
    def after_pred(self):
        "Build the Q-value prediction and its target for the current batch."
        batch=self.learn.xb+self.learn.yb
        # Only the states and the taken actions are needed as whole-batch tensors.
        s,a=batch[0],batch[1]
        exps=[ExperienceFirstLast(*o) for o in zip(*batch)]
        batch_targets=[calc_target(self.learn.model, exp.reward, exp.last_state,exp.done,self.learn.discount)
                       for exp in exps]

        q_v = self.learn.model(s.float())
        # Build the target with torch, on the same device and dtype as the prediction.
        # A NumPy round trip only works for CPU tensors and returns a CPU target.
        target_q_v = q_v.detach().clone()
        rows = torch.arange(len(exps), device=target_q_v.device)
        cols = a.long().to(target_q_v.device)
        # `calc_target` may return Python numbers or single-element tensors.
        targets = torch.as_tensor([float(t) for t in batch_targets],
                                  dtype=target_q_v.dtype, device=target_q_v.device)
        target_q_v[rows, cols] = targets
        self.learn._yb=self.learn.yb
        self.learn.yb=(target_q_v,)
        self.learn.pred=q_v

    def after_loss(self):
        "Put back the `yb` that `after_pred` replaced."
        self.learn.yb=self.learn._yb

In [None]:
# export
class DQNLearner(AgentLearner):
    "`AgentLearner` with an `nn.MSELoss` loss; keeps `discount` on the instance, where `DQNTrainer` reads it."
    def __init__(self,dls,discount=0.99,**kwargs):
        # Keep `store_attr()` first: it records the named arguments on `self`.
        store_attr()
        self.target_q_v=list()
        mse_loss=nn.MSELoss()
        super().__init__(dls,loss_func=mse_loss,**kwargs)

In [None]:
# Demo: train the linear DQN on CartPole and report the average episode reward.
env='CartPole-v1'
model=LinearDQN((4,),2)
agent=DiscreteAgent(
    model=model.to(default_device()),
    device=default_device(),
    a_selector=EpsilonGreedyActionSelector(),
)

dls_kwargs=dict(bs=8,num_workers=0,verbose=False,indexed=True,shuffle_train=False)
block=FirstLastExperienceBlock(agent=agent,seed=0,n_steps=2,dls_kwargs=dls_kwargs)
blk=IterableDataBlock(
    blocks=(block),  # NOTE(review): plain parentheses, so this passes `block` itself, not a 1-tuple
    splitter=FuncSplitter(lambda x:False),
)
dls=blk.dataloaders([env],n=8*1000,device=default_device())

# The episode step limit is read from a throwaway environment instance.
max_episode_steps=gym.make(env)._max_episode_steps
replay=ExperienceReplay(sz=50000,bs=8,starting_els=8,max_steps=max_episode_steps)
learner=DQNLearner(
    dls,
    agent=agent,
    cbs=[EpsilonTracker,replay,DQNTrainer],
    metrics=[AvgEpisodeRewardMetric(experience_cls=ExperienceFirstLast)],
)
learner.fit(15,lr=0.01,wd=0)

epoch,train_loss,train_avg_episode_r,valid_loss,valid_avg_episode_r,time
0,24.613981,29.54,,29.54,00:12
1,32.930523,51.57,,51.57,00:12
2,21.004852,82.09,,82.09,00:12
3,3.406,121.94,,121.94,00:12
4,8.700549,157.43,,157.43,00:12
5,25.562292,185.62,,185.62,00:12
6,72.588402,192.84,,192.84,00:12
7,4.033764,207.89,,207.89,00:12
8,39.098164,186.12,,186.12,00:13
9,18.933603,222.09,,222.09,00:13


In [None]:
# hide
# nbdev housekeeping cell: it converts the project's notebooks, as the
# "Converted ..." / "converting: ..." lines printed by this cell show.
from nbdev.export import *
from nbdev.export2html import *
notebook2script()  # presumably writes the `# export` cells out as library modules - confirm against nbdev docs
notebook2html()  # presumably renders the notebooks to HTML documentation - confirm against nbdev docs

Converted 00_core.ipynb.
Converted 01_wrappers.ipynb.
Converted 03_basic_agents.ipynb.
Converted 04_learner.ipynb.
Converted 05a_ptan_extend.ipynb.
Converted 05b_async_data.ipynb.
Converted 05c_data.ipynb.
Converted 13_metrics.ipynb.
Converted 14a_actorcritic.sac.ipynb.
Converted 14b_actorcritic.diayn.ipynb.
Converted 15_actorcritic.a3c_data.ipynb.
Converted 16_actorcritic.a2c.ipynb.
Converted 17_actorcritc.v1.dads.ipynb.
Converted 18_policy_gradient.ppo.ipynb.
Converted 19_policy_gradient.trpo.ipynb.
Converted 20a_qlearning.dqn.ipynb.
Converted 20b_qlearning.dqn_n_step.ipynb.
Converted 20c_qlearning.dqn_target.ipynb.
Converted 20d_qlearning.dqn_double.ipynb.
Converted 20e_qlearning.dqn_noisy.ipynb.
Converted index.ipynb.
Converted notes.ipynb.


converting: /opt/project/fastrl/nbs/20a_qlearning.dqn.ipynb
converting: /opt/project/fastrl/nbs/20c_qlearning.dqn_target.ipynb
