In [1]:
# default_exp actorcritic.a3c_data

In [2]:
#export
import torch.nn.utils as nn_utils
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.basics import *
from dataclasses import field,asdict
from typing import List,Any,Dict,Callable
from collections import deque
import gym
import torch.multiprocessing as mp
from torch.optim import *

from fastrl.data import *
from fastrl.async_data import *
from fastrl.basic_agents import *
from fastrl.learner import *
from fastrl.metrics import *

if IN_NOTEBOOK:
    from IPython import display
    import PIL.Image

In [3]:
# hide
from nbdev.showdoc import *
from nbdev.imports import *
# Sanity guard, skipped when the IN_TEST environment variable is set to a
# non-empty value. The three flags are not defined in this notebook (they come
# from the star imports above); judging by their names this presumably requires
# a local (non-Colab) IPython/Jupyter session -- TODO confirm.
if not os.environ.get("IN_TEST", None):
    assert IN_NOTEBOOK
    assert not IN_COLAB
    assert IN_IPYTHON

# A3C Datawise

## A3C Model

In [4]:
# export
class LinearA2C(nn.Module):
    """Actor-critic network with two independent MLP heads.

    Both heads are `Linear -> ReLU -> Linear` stacks fed with the same input:

    * `policy` outputs `n_actions` raw (unnormalised) logits.
    * `value` outputs a single scalar per sample.

    Parameters
    ----------
    input_shape : sequence
        Only `input_shape[0]` is used; it is the number of input features.
    n_actions : int
        Width of the policy head output.
    hidden : int, default 512
        Width of the hidden layer of both heads. The default reproduces the
        previously hard-coded size.
    """
    def __init__(self, input_shape, n_actions, hidden=512):
        super().__init__()
        n_in = input_shape[0]

        # NB: `policy` is built before `value` so that parameter creation order
        # (and therefore seeded initialisation) matches the original layout.
        self.policy = nn.Sequential(
            nn.Linear(n_in, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions)
        )

        self.value = nn.Sequential(
            nn.Linear(n_in, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1)
        )

    def forward(self,x):
        "Cast `x` to float and return the tuple `(policy_logits, state_value)`."
        fx=x.float()
        return self.policy(fx),self.value(fx)

## A3C Learner

In [5]:
# batch=[
#  Experience(s=tensor([[-0.0285,  0.1640, -0.0033, -0.3421]]),sp=tensor([[-0.0285,  0.1640, -0.0033, -0.3421]]),
#             a=tensor([1]),r=tensor([1.]),d=tensor([0.])),
#  Experience(s=tensor([[-0.0252, -0.0311, -0.0101, -0.0504]]),sp=tensor([[-0.0252, -0.0311, -0.0101, -0.0504]]),
#             a=tensor([0]),r=tensor([1.]),d=tensor([0.])),
#  Experience(s=tensor([[-0.0258, -0.2261, -0.0111,  0.2391]]),sp=tensor([[-0.0258, -0.2261, -0.0111,  0.2391]]),
#             a=tensor([0]),r=tensor([1.]),d=tensor([0.])),
#  Experience(s=tensor([[-0.0517, -0.2260,  0.0195,  0.2377]]),sp=tensor([[-0.0517, -0.2260,  0.0195,  0.2377]]),
#             a=tensor([1]),r=tensor([1.]),d=tensor([0.])),
#  Experience(s=tensor([[-0.0562, -0.4214,  0.0242,  0.5365]]),sp=tensor([[-0.0562, -0.4214,  0.0242,  0.5365]]),
#             a=tensor([0]),r=tensor([1.]),d=tensor([0.])),
#  Experience(s=tensor([[-0.0647, -0.6169,  0.0349,  0.8367]]),sp=tensor([[-0.0647, -0.6169,  0.0349,  0.8367]]),
#             a=tensor([0]),r=tensor([1.]),d=tensor([1.]))
# ]

In [6]:
# export
def unbatch(batch, net, last_val_gamma, device='cpu'):
    """Convert a list of experiences into training tensors.

    Parameters
    ----------
    batch : iterable
        Experiences exposing `.state`, `.action`, `.reward`, `.last_state` and
        `.done`. `state` / `last_state` must be tensors; their first row
        (`[0]`) is used as the observation.
    net : callable
        Called as `net(last_states)`; element `[1]` of its result is read as a
        `(n, 1)` value estimate.
    last_val_gamma : float
        Factor applied to the value estimate of `last_state` before it is added
        to the reward of every experience whose `done` is falsy.
    device : str or torch.device, default 'cpu'
        Device the returned tensors (and the `net` input) are placed on.

    Returns
    -------
    (states_v, actions_t, ref_vals_v)
        float32 states, int64 actions and float32 reference values
        (`reward + last_val_gamma * V(last_state)` for non-terminal items,
        plain `reward` otherwise).
    """
    states, actions, rewards = [], [], []
    not_done_idx, last_states = [], []

    for idx, exp in enumerate(batch):
        # detach()/cpu() make `.numpy()` safe for grad-tracking or non-CPU tensors.
        states.append(exp.state.detach().cpu().numpy()[0])
        actions.append(int(exp.action))
        rewards.append(float(exp.reward))
        if not exp.done:
            not_done_idx.append(idx)
            last_states.append(exp.last_state.detach().cpu().numpy()[0])

    # Pack into a single ndarray first: building a tensor straight from a list
    # of ndarrays goes through a slow element-wise conversion path.
    states_v = torch.as_tensor(np.array(states, dtype=np.float32), device=device)
    actions_t = torch.as_tensor(actions, dtype=torch.long, device=device)

    # handle rewards
    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        last_states_v = torch.as_tensor(np.array(last_states, dtype=np.float32), device=device)
        # The bootstrap value is a constant target; no autograd graph is needed.
        with torch.no_grad():
            last_vals_np = net(last_states_v)[1].detach().cpu().numpy()[:, 0]
        rewards_np[not_done_idx] += last_val_gamma * last_vals_np

    ref_vals_v = torch.as_tensor(rewards_np, device=device)
    return states_v, actions_t, ref_vals_v

In [7]:
# model=LinearA2C((4,),2)
# unbatch(batch,model,2)

In [8]:
# export
def loss_func(pred,a,r,sp,d,episode_rewards,learn=None):
    """Actor-critic loss (value + policy-gradient + entropy terms) for one batch.

    Parameters
    ----------
    pred : ignored
        The model is re-evaluated on the states rebuilt by `unbatch`.
    a, r, sp, d : indexable
        Per-sample action, reward, last state and done flag; indexed with the
        same positions as `learn.xb[0]`.
    episode_rewards : ignored
    learn : learner
        Must provide `xb`, `model`, `opt`, `discount`, `reward_steps` and
        `entropy_beta`. Required despite the `None` default (it is bound with
        `partial` by `A3CLearner`).

    Returns
    -------
    Scalar tensor `entropy_loss + value_loss + policy_loss`.

    Raises
    ------
    ValueError
        If `learn` is not supplied.
    """
    if learn is None:
        raise ValueError("loss_func requires `learn` (bind it with functools.partial)")

    states=learn.xb[0]
    yb=[ExperienceFirstLast(state=states[i],action=a[i],reward=r[i],last_state=sp[i],
                            done=d[i],episode_reward=0)
        for i in range(len(states))]

    # NOTE(review): `unbatch` is called with its default device ('cpu'); a model
    # living on another device would need the device forwarded here.
    s_t,a_t,r_est=unbatch(yb,learn.model,learn.discount**learn.reward_steps)

    # Gradients are cleared before the second forward pass below.
    learn.opt.zero_grad()
    logits_v,value_v=learn.model(s_t)

    # Flatten (B,1) -> (B,) once so every term below is per-sample.
    value_flat=value_v.squeeze(-1)
    loss_value_v=F.mse_loss(value_flat,r_est)

    log_prob_v=F.log_softmax(logits_v,dim=1)
    # Fix: subtracting the un-squeezed (B,1) values from the (B,) targets
    # broadcast to a (B,B) matrix pairing every target with every value.
    adv_v=r_est-value_flat.detach()

    # Fix: select, for each row, the log-prob of the action taken in that row.
    # The previous index was sized by the first sample, not by the batch.
    chosen_log_prob_v=log_prob_v.gather(1,a_t.unsqueeze(1)).squeeze(1)
    loss_policy_v=-(adv_v*chosen_log_prob_v).mean()

    prob_v=F.softmax(logits_v,dim=1)
    # sum(p*log p) is the negative entropy, so minimising it favours exploration.
    entropy_loss_v=learn.entropy_beta*(prob_v*log_prob_v).sum(dim=1).mean()

    loss_v=entropy_loss_v+loss_value_v+loss_policy_v
    return loss_v

class A3CLearner(AgentLearner):
    """`AgentLearner` preconfigured for A3C.

    Binds `loss_func` to this learner, installs an `AdamW` optimiser
    (`eps=1e-3`) wrapped in `OptimWrapper`, calls `share_memory()` on the model
    and stores the hyper-parameters that `loss_func` and `A3CTrainer` read back
    (`discount`, `entropy_beta`, `reward_steps`, `clip_grad`).
    """
    def __init__(self,dls,discount=0.99,entropy_beta=0.01,clip_grad=0.1,reward_steps=1,**kwargs):
        # The loss needs access to the learner itself, hence the partial binding.
        bound_loss=partial(loss_func,learn=self)
        super().__init__(dls,loss_func=bound_loss,**kwargs)

        optimiser=AdamW(self.model.parameters(),eps=1e-3)
        self.opt=OptimWrapper(optimiser)
        self.model.share_memory()

        self.discount,self.entropy_beta=discount,entropy_beta
        self.reward_steps,self.clip_grad=reward_steps,clip_grad
        
#     def _split(self, b):
#         if len(b)==1 and type(b[0])==tuple:b=b[0]
#         super()._split(b)

In [9]:
# export
class A3CTrainer(Callback):
    "Callback that clips the gradient norm of the learner's model to `learn.clip_grad`."

    def after_backward(self):
        "Clip gradients in place; the hook name suggests it runs after the backward pass."
        owner=self.learn
        nn_utils.clip_grad_norm_(owner.model.parameters(),owner.clip_grad)

In [10]:
def data_fit(queue:mp.JoinableQueue=None,items:L=None,agent:BaseAgent=None,learner_cls:Learner=None,experience_block:ExperienceBlock=None,
             cancel:mp.Event=None):
    """Producer loop: stream batches into `queue` until `cancel` is set.

    Builds dataloaders (on 'cpu') from `items` using `experience_block(agent=agent)`,
    then iterates the first loader over and over, putting every batch on `queue`.
    The `cancel` event is only checked after a batch has been queued; once it is
    set, a `None` sentinel is queued and the function returns `None`.
    `learner_cls` is accepted but not used.
    """
    source_block=experience_block(agent=agent)
    data_block=IterableDataBlock(blocks=(source_block),
                                 splitter=FuncSplitter(lambda x:False))
    loaders=data_block.dataloaders(items,device='cpu')
    while True:
        for batch in loaders[0]:
            queue.put(batch)
            if not cancel.is_set():
                continue
            # Cancellation requested: signal end-of-stream to the consumer.
            queue.put(None)
            return None

In [11]:
def temp(x):
    "Pair the first element of `x` with `x` itself: returns `(x[0], x)`."
    head=x[0]
    return head,x

In [12]:
# Smoke run: fit an A3CLearner on CartPole with one data-gathering process.
# NOTE: the output recorded for this cell ends in a KeyboardInterrupt raised
# inside the DataFitProcess worker, i.e. the run was interrupted manually.
env='CartPole-v1'
# LinearA2C(input_shape, n_actions): 4 input features, 2 actions.
model=LinearA2C((4,),2)

# Asynchronous experience source; `data_fit` (defined above) is the worker target.
# The meaning of `n` and of the FirstLastExperienceBlock arguments is defined in
# fastrl, not in this notebook -- TODO confirm before tuning them.
block=AsyncExperienceBlock(
    experience_block=partial(FirstLastExperienceBlock,a=0,seed=0,n_steps=4,dls_kwargs={'bs':1,'num_workers':0,
                                                                             'verbose':False,'indexed':True,'shuffle_train':False}),
    n_processes=1,
    n=128*100,
    data_fit=data_fit,
    agent=ActorCriticAgent(model)
)
# The splitter function returns False for every item.
blk=IterableDataBlock(blocks=(block),
                      splitter=FuncSplitter(lambda x:False)#,
#                       batch_tfms=temp,
                     )
dls=blk.dataloaders([env]*1,bs=1)

# This agent wraps the same `model` object as the one handed to the block above.
agent=ActorCriticAgent(model=model)
# reward_steps=4 mirrors n_steps=4 above: loss_func scales the bootstrapped
# value by discount**reward_steps.
learner=A3CLearner(dls,agent=agent,cbs=[A3CTrainer],reward_steps=4,metrics=[AvgEpisodeRewardMetric()])
learner.fit(10,lr=0.001,wd=0)

Process DataFitProcess-1:
Traceback (most recent call last):
  File "/opt/conda/envs/fastrl/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/envs/fastrl/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-10-4e9b1d45a02a>", line 6, in data_fit
    dls=blk.dataloaders(items,device='cpu')
  File "/opt/conda/envs/fastrl/lib/python3.7/site-packages/fastai/data/block.py", line 113, in dataloaders
    dsets = self.datasets(source)
  File "/opt/project/fastrl/fastrl/data.py", line 80, in datasets
    tls=L([self.tls_type(items, t,verbose=verbose) for t in L(ifnone(self._combine_type_tfms(),[None]))])
  File "/opt/project/fastrl/fastrl/data.py", line 80, in <listcomp>
    tls=L([self.tls_type(items, t,verbose=verbose) for t in L(ifnone(self._combine_type_tfms(),[None]))])
  File "/opt/conda/envs/fastrl/lib/python3.7/site-packages/fastcore/foundation.py", line 47, in _

KeyboardInterrupt: 

In [None]:

# import gym
# import ptan
# import numpy as np
# import argparse
# import collections

# import torch
# import torch.nn.utils as nn_utils
# import torch.nn.functional as F
# import torch.optim as optim
# import torch.multiprocessing as mp

# # from lib import common

# GAMMA = 0.99
# LEARNING_RATE = 0.001
# ENTROPY_BETA = 0.01
# BATCH_SIZE = 128

# REWARD_STEPS = 4
# CLIP_GRAD = 0.1

# PROCESSES_COUNT = 4
# NUM_ENVS = 15

# def unbatch(batch, net, last_val_gamma, device='cpu'):
#     """
#     Convert batch into training tensors
#     :param batch:
#     :param net:
#     :return: states variable, actions tensor, reference values variable
#     """
#     states = []
#     actions = []
#     rewards = []
#     not_done_idx = []
#     last_states = []
#     for idx, exp in enumerate(batch):
# #         print(exp.state.numpy()[0].shape,int(exp.action),float(exp.reward),exp.last_state.numpy()[0].shape if not bool(exp.done) else None,exp.done)
#         states.append(np.array(exp.state.numpy()[0], copy=False))
#         actions.append(int(exp.action))
#         rewards.append(exp.reward)
#         if not exp.done: #exp.last_state is not None:
#             not_done_idx.append(idx)
#             # if exp.last_state is None:print(exp)
#             last_states.append(np.array(exp.last_state.numpy()[0], copy=False))
#         # else:
#         #     print(exp,'is done, so skipping')
#     states_v = torch.FloatTensor(states).to(device)
#     actions_t = torch.LongTensor(actions).to(device)

#     # handle rewards
#     rewards_np = np.array(rewards, dtype=np.float32)
#     if not_done_idx:
#         # print(last_states)
#         last_states_v = torch.FloatTensor(last_states).to(device)
#         last_vals_v = net(last_states_v)[1]
#         last_vals_np = last_vals_v.data.cpu().numpy()[:, 0]
#         # print(last_vals_v.data.cpu().numpy().mean())
#         rewards_np[not_done_idx] += last_val_gamma * last_vals_np

#     # print(len(not_done_idx),len(rewards_np),len(last_states))
#     ref_vals_v = torch.FloatTensor(rewards_np).to(device)
#     return states_v, actions_t, ref_vals_v



# device = "cpu" #"cuda"
# net = LinearA2C((4,),2).to(device)
# net.share_memory()


# env='CartPole-v1'
# block=AsyncExperienceBlock(
#     experience_block=partial(FirstLastExperienceBlock,a=0,seed=0,n_steps=4,dls_kwargs={'bs':1,'num_workers':0,
#                                                                              'verbose':False,'indexed':True,'shuffle_train':False}),
#     n_processes=4,
#     n=128*1000,
#     data_fit=data_fit,
#     agent=ActorCriticAgent(net)
# )
# blk=IterableDataBlock(blocks=(block),
#                       splitter=FuncSplitter(lambda x:False)#,
# #                       batch_tfms=temp,
#                      )
# dls=blk.dataloaders([env]*15,bs=128)





# optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

# batch = []
# step_idx = 0
# batch_num = 0

# rs_rolling=deque([],maxlen=100)

# for j,b in enumerate(dls[0]):
#     batch=[]
#     for i in range(len(b[0])):
# #         for j in range(len(b)):
#     #         print(learn.xb[0][i],a[i],r[i],sp[i],d[i])
#     #         print(a[i])
#     #         print(b[0])
# #             print(b[i][j],b[i][j],b[i][j],b[i][j],b[i][j])
#         batch.append(ExperienceFirstLast(state=b[0][i],action=b[1][i],reward=b[2][i],last_state=b[3][i],done=b[4][i],episode_reward=b[5][i]))
    
    
    
#     rs=[o.episode_reward for o in batch if o.done and int(o.episode_reward)!=0]
#     if rs: 
#         rs_rolling.append(np.mean(rs))
#         print(j,np.mean(rs_rolling))
    
    
#     batch_num+=1
#     states_v, actions_t, vals_ref_v = \
#         unbatch(batch, net, last_val_gamma=GAMMA**REWARD_STEPS, device=device)

#     optimizer.zero_grad()
#     logits_v, value_v = net(states_v)
#     loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)
#     log_prob_v = F.log_softmax(logits_v, dim=1)
#     adv_v = vals_ref_v - value_v.detach()
#     log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
#     loss_policy_v = -log_prob_actions_v.mean()
#     prob_v = F.softmax(logits_v, dim=1)
#     entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
#     loss_v = entropy_loss_v + loss_value_v + loss_policy_v

#     # print(entropy_loss_v,loss_policy_v,loss_value_v)
#     loss_v.backward()
#     # getBack(loss_v.grad_fn)
#     nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
#     # print(step_idx)
#     optimizer.step()



In [None]:
# nbdev housekeeping cell: run the notebook-to-module export and the
# notebook-to-HTML conversion (see the nbdev docs for which notebooks/cells
# each call processes).
from nbdev.export import *
from nbdev.export2html import *
notebook2script()
notebook2html()