#### 분산 강화학습으로 CartPole을 DQN을 이용하여 구현해보겠습니다. <br>앞선 ReplayBuffer의 경우에 추가적으로 다음을 고려해야합니다. ReplayBuffer에서 설명했던 변수들은 설명을 생략하였습니다. <br>  
    1. ReplayBuffer에서 Learner가 어떤 주기로 weight update를 할지
    2. 각 Actor의 network parameter를 어떤 식으로 Learner로 부터 copy해 올지 --> 비동기? vs 동기? 

In [1]:
import ray 
import gym
import time 
import numpy as np 
import matplotlib.pyplot as plt
from IPython.display import clear_output

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
ray.init() 

2021-01-23 08:13:42,097	INFO services.py:1173 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.0.61',
 'raylet_ip_address': '192.168.0.61',
 'redis_address': '192.168.0.61:6379',
 'object_store_address': '/tmp/ray/session_2021-01-23_08-13-41_603694_109332/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-01-23_08-13-41_603694_109332/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-01-23_08-13-41_603694_109332',
 'metrics_export_port': 57577,
 'node_id': 'b573591ed7e6015cb640574f8896baacbafba292'}

In [3]:
# Buffer를 정의합니다.
@ray.remote
class ReplayBuffer:
    def __init__(self, 
                 buffer_size: ('int: Buffer_size'), 
                 state_dim: ('tuple: State dim')):
        
        # 1차원 state라할지라도 tuple로 입력받도록 tuple 타입을 강제하였습니다. 밑에 줄의 self.buffer_dim을 구하기 위해서 이렇게 한 것인데요, 사실 빼도 상관없고 얼마든지 다르게 구현해도 무방합니다.
        assert type(state_dim) == tuple 
        
        self.buffer_dim = (buffer_size, ) + state_dim
        self.buffer_size = buffer_size
        self.state_buffer = np.zeros(self.buffer_dim)
        self.action_buffer = np.zeros(buffer_size)
        self.reward_buffer = np.zeros(buffer_size)
        self.next_state_buffer = np.zeros(self.buffer_dim)
        self.done_buffer = np.zeros(buffer_size)
        self.act_idx_buffer = np.zeros(buffer_size)
        
        self.store_idx = 0
        self.current_size = 0

    def store(self, state, action, next_state, reward, done, actor_idx):
        self.state_buffer[self.store_idx] = state
        self.action_buffer[self.store_idx] = action
        self.reward_buffer[self.store_idx] = reward
        self.next_state_buffer[self.store_idx] = next_state
        self.done_buffer[self.store_idx] = done
        self.act_idx_buffer[self.store_idx] = actor_idx
        
        self.store_idx = (self.store_idx + 1) % self.buffer_size
        self.current_size = min(self.current_size+1, self.buffer_size)
    
    def batch_load(self, batch_size): 
        indices = np.random.randint(self.store_idx, size=batch_size)  
        return dict( 
                states=self.state_buffer[indices], 
                actions=self.action_buffer[indices], 
                rewards=self.reward_buffer[indices], 
                next_states=self.next_state_buffer[indices], 
                dones=self.done_buffer[indices],
                actindices=self.act_idx_buffer[indices])  

# buffer_size = 1000
# batch_size = 16
# state_dim = (4, )
# temp_buffer = ReplayBuffer.remote(buffer_size, state_dim)
# for i in range(50):
#     temp_buffer.store.remote(np.array(state_dim), 1, np.array(state_dim), 1, 1, 1)
# batch = temp_buffer.batch_load.remote(batch_size)
# ray.get(batch)['actindices'].shape

In [4]:
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden=32):
        super(QNetwork, self).__init__()

        state_size = state_size[0]
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# state_size = (4, ) 
# action_size = 2 
# temp_net = QNetwork(state_size, action_size, 32) 
# test = torch.randn(size=(4,)) 
# temp_net(test), temp_net(test).shape 

In [5]:
# actor의 역할은 각각 env에서 경험한 것을 buffer에 넘겨주는 역할을 합니다.
@ray.remote
class Actor:
    def __init__(self, 
                 learner: ("class: Learner class"),
                 env_name: ("str: Environment name"), 
                 actor_idx: ("int: The index of an actor"), 
                 actor_update_freq: ("int: Update frequency of an actor"), 
                 epsilon: ("int: starting epsilon value for e-greedy update"), 
                 eps_decay: ("int: epsilon decay rate"), 
                 eps_min: ("int: minimum epsilon value"), 
                 hidden: ("int: Update frequency of learner's q_behave network"), 
                 device: ("int: Cuda device number")):
        
        self.env = gym.make(env_name)
        self.learner = learner # ray를 통해 공유하는 learner class입니다.
        self.actor_idx = actor_idx # 어떤 actor에서 온 데이터인지 보기 위한 변수입니다.
        self.actor_update_freq = actor_update_freq # actor의 network weight를 얼마나 자주 업데이트 할 것인지 
        self.device = device
        
        # DQN hyperparameters
        self.epsilon = epsilon
        self.eps_decay = eps_decay
        self.eps_min = eps_min

        # Network parameters
        self.state_dim = (self.env.observation_space.shape[0], )
        self.action_dim = self.env.action_space.n
        self.q_behave = QNetwork(self.state_dim, self.action_dim, hidden).to(self.device)

    def select_action(self, state): 
        # e-greedy로 action을 선택 
        if np.random.random() < self.epsilon: 
            return np.zeros(self.action_dim), self.env.action_space.sample() 
        else: 
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0) 
            Qs = self.q_behave(state) 
            action = Qs.argmax() 
            return Qs.detach().cpu().numpy(), action.detach().item() 
        
    def explore(self):
        score = 0
        episodes_cnt = 0
        # actor는 멈추지 않고 무한 loop로 exploration하도록 설정
        state = self.env.reset()
        while 1:
            Qs, action = self.select_action(state)
            next_state, reward, done, _ = self.env.step(action) 
            self.learner.store.remote(state, action, next_state, reward, done, self.actor_idx) 
            print("store executed in actor idx", self.actor_idx)

            state = next_state
            score += reward
            self.epsilon = max(self.epsilon-self.eps_decay, self.eps_min)
            if done:
                print(f"Done in actor index: {self.actor_idx}")
                state = self.env.reset() 
                score = 0
                episodes_cnt = (episodes_cnt+1) % self.actor_update_freq
                if episodes_cnt==0: 
                    print("GET_WEIGHT")
                    self.get_weights()
                    print("GET_WEIGHT")

    def get_weights(self):
        weight_copy = self.learner.return_weights.remote()
        weight_copy = ray.get(weight_copy)
        print(type(weight_copy), self.actor_idx)
        self.q_behave.load_state_dict(weight_copy)

In [6]:
# 공유 Buffer를 통해 학습을 진행하는 Learner를 정의합니다. 
# Learner는 buffer에 있는 샘플을 이용하여 network parameter를 업데이트를 하며, agent에게 network weight을 전달합니다.

@ray.remote
class Learner:
    def __init__(self, 
                 env_name: ("str: Environment name"),
                 gamma: ("float: Discount rate"),
                 buffer_size: ("int: Buffer size"), 
                 batch_size: ("int: Batch size"), 
                 update_buf_start: ("int: Update starting buffer size"), 
                 update_freq: ("int: Update frequency of learner's q_behave network"), 
                 update_target_freq: ("int: Update frequency of learner's q_target network"), 
                 hidden: ("int: Update frequency of learner's q_behave network"), 
                 learning_rate: ("float: Learning rate for updating the q_behave network"),
                 device: ("int: Cuda device number")):
        
        self.env = gym.make(env_name)
        self.gamma = gamma
        # Discrete action과 Box state인 경우
        self.state_dim = (self.env.observation_space.shape[0], )
        self.action_dim = self.env.action_space.n
        
        self.memory = ReplayBuffer.remote(buffer_size, state_dim)
        
        self.batch_size = batch_size
        self.update_cnt = 0 # q_behave 업데이트 횟수
        self.update_freq = update_freq # q_behave 업데이트 주기
        self.update_buf_start = update_buf_start # 업데이트 시작 buffer size
        self.update_target_freq = update_target_freq # q_target 업데이트 주기
        self.device = device
        self.total_steps = 0
        self.scores = []
        self.losses = []

        self.q_behave = QNetwork(self.state_dim, self.action_dim, hidden).to(self.device)
        self.q_target = QNetwork(self.state_dim, self.action_dim, hidden).to(self.device)
        self.q_target.load_state_dict(self.q_behave.state_dict())
        self.q_target.eval()

        self.optimizer = optim.Adam(self.q_behave.parameters(), lr=learning_rate) 

    # __init__에서 정의된 replay buffer에 저장합니다. 이 메소드는 각 actor마다 실행됩니다. 
    def store(self, state, action, next_state, reward, done, actor_idx):
        print("self.total_steps_1", self.total_steps)
        self.total_steps += 1
        self.memory.store.remote(state, action, next_state, reward, done, actor_idx)

    # 저장된 buffer에서 데이터를 로딩한 후 q_network을 업데이트합니다.
    def update_q_network(self):
        # update_cnt를 q_behave를 업데이트 할 때마다 1씩 상승 (self.update_target_freq 만큼 q_behave를 업데이트를 할 때마다 q_target을 업데이트 하기 위함)
        self.update_cnt = (self.update_cnt+1) % self.target_update_freq

        batch = self.memory.batch_load.remote(self.batch_size)
        batch = ray.get(batch)
        loss = self._compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.losses.append(loss.item()) 

    def target_hard_update(self):  
        # Hard update 방식
        self.q_target.load_state_dict(self.q_behave.state_dict()) 

    def return_weights(self):
        return self.q_behave.state_dict() # target network을 return할지 or behave network을 할지는 선택사항인 것 같습니다.

    def select_action(self, state): 
        # e-greedy로 action을 선택 
        test_epsilon = 0.05
        if np.random.random() < test_epsilon: 
            return np.zeros(self.action_dim), self.env.action_space.sample() 
        else: 
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0) 
            Qs = self.q_behave(state) 
            action = Qs.argmax() 
            return Qs.detach().cpu().numpy(), action.detach().item() 

    def train(self):
        # 여기서는 training의 종료시점을 정하지 않았습니다.
        print("training start..")
        loop_cnt = 0
        while 1:
            loop_cnt += 1
            score = 0
            state = self.env.reset()
            # buffer에 어느 정도 sample이 쌓인 후에, 그리고 update_freq 마다 learner의 q_behave를 업데이트 합니다.
            time.sleep(1)
            print("SELF.MEMORY.store_idx", self.memory.store_idx)
            if (self.memory.store_idx > self.update_buf_start) and ((self.memory.store_idx%self.update_freq) == 0):
                self.update_q_network()
                Qs, action = select_action(state)
                next_state, reward, done = self.env.step(action) 
                state = next_state
                score += reward
                
                if done:
                    self.scores.append(score)
                    self._plot_status()
                    score=0
                # 만일 target_update_freq의 횟수 만큼 q_behave를 업데이트 했다면, target_network을 복사해옵니다.
                if self.update_cnt==0: self.target_hard_update()

    def _compute_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        states = torch.FloatTensor(batch['states']).to(self.device)
        next_states = torch.FloatTensor(batch['next_states']).to(self.device)
        actions = torch.LongTensor(batch['actions'].reshape(-1, 1)).to(self.device)
        rewards = torch.FloatTensor(batch['rewards'].reshape(-1, 1)).to(self.device)
        dones = torch.FloatTensor(batch['dones'].reshape(-1, 1)).to(self.device)

        current_q = self.q_behave(states).gather(1, actions)
        next_q = self.q_target(next_states).max(dim=1, keepdim=True)[0].detach()
        mask = 1 - dones
        target = (rewards + (mask * self.gamma * next_q)).to(self.device)

        loss = F.smooth_l1_loss(current_q, target)
        return loss

    def _plot_status(self):
        clear_output(True) 
        plt.figure(figsize=(20, 5), facecolor='w') 
        plt.subplot(121)  
        plt.title(f'Score w.r.t. Total number of steps {self.total_steps}.')
        plt.plot(self.scores) 
        plt.subplot(122) 
        plt.title('loss') 
        plt.plot(self.losses) 


In [7]:
env_lists = ['CartPole-v0']
env_name = env_lists[0]
gamma = 0.99

buffer_size = 5000 # Replay Buffer 사이즈
batch_size = 16    # Replay Buffer에서 가지고 올 샘플 개수
update_buf_start = 100
update_freq = 25
update_target_freq = 100

hidden = 32
learning_rate = 0.001
# device = "cuda:1" if torch.cuda.is_available() else "cpu"
device = "cpu"

learner = Learner.remote(env_name, gamma, buffer_size, batch_size, update_buf_start, update_freq, update_target_freq, hidden, learning_rate, device) 

2021-01-23 08:13:58,194	ERROR worker.py:980 -- Possible unhandled error from worker: [36mray::Learner.__init__()[39m (pid=109483, ip=192.168.0.61)
  File "python/ray/_raylet.pyx", line 463, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 415, in ray._raylet.execute_task.function_executor
  File "<ipython-input-6-8f267e0cb323>", line 18, in __init__
NameError: name 'state_dim' is not defined


In [None]:
# num_actors 개수만큼 선언하고, explore 실행. actor라는 변수가 계속 중복이 되지만 실행은 잘 된다.
num_actors = 3 # actor의 개수
actor_update_freq = 5 # 몇 episode 만에 actor의 weight를 업데이트하는가
epsilon = 1.0
eps_decay = 0.0005
eps_min = 0.1

for actor_idx in range(num_actors):
    actor = Actor.remote(learner, env_name, actor_idx, actor_update_freq, epsilon, eps_decay, eps_min, hidden, device)
    actor.explore.remote()
    
learner.train.remote()

[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m Done in actor index: 1
[2m[36m(pid=107485)[0m GET_WEIGHT
[2m[36m(pid=107496)[0m self.total_steps_1 18704
[2m[36m(pid=107496)[0m self.total_steps_1 18705
[2m[36m(pid=107496)[0m self.total_steps_1 18706
[2m[36m(pid=107496)[0m self.total_steps_1 18707
[2m[36m(pid=107496)[0m self.total_steps_1 18708
[2m[36m(pid=107496)[0m self.total_steps_1 18709
[2m[36m(pid=107496)[0m self.total_steps_1 18710
[2m[36m(pid=107493)[0m <class 'collections.OrderedDict'> 2
[2m[36m(pid=107493)[0m GET_WEIGHT
[2m[36m(pid=107496)[0m self.total_steps_1 18711
[2m[36m(pid=107496)[0m self.total_steps_1 18712
[2m[36m(pid=107496)[0m self.total_steps_1 18713
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107496)[0m



[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m Done in actor index: 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_steps_1 18809
[2m[36m(pid=107496)[0m self.total_steps_1 18810
[2m[36m(pid=107496)[0m self.total_steps_1 18811
[2m[36m(pid=107496)[0m self.total_steps_1 18812
[2m[36m(pid=107496)[0m self.total_steps_1 18813
[2m[36m(pid=107496)[0m self.total_steps_1 18814
[2m[36m(pid=107496)[0m self.total_steps_1 18815
[2m[36m(pid=107496)[0m self.total_steps_1 18816
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_steps_1 18817
[2m[36m(pid=107496)[0m self.total_steps_1

[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 
[2m[36m(pid=107521)[0m 0
[2m[36m(pid=107496)[0m self.total_steps_1 18931
[2m[36m(pid=107496)[0m self.total_steps_1 18932
[2m[36m(pid=107496)[0m self.total_steps_1 18933
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m Done in actor index: 0
[2m[36m(pid=107496)[0m self.total_steps_1 18934
[2m[36m(pid=107496)[0m self.total_steps_1 18935
[2m[36m(pid=107496)[0m 
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m self.total_steps_1 18936
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m self.

[2m[36m(pid=107496)[0m self.total_steps_1 19067
[2m[36m(pid=107496)[0m self.total_steps_1 19068
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m Done in actor index: 2
[2m[36m(pid=107496)[0m self.total_steps_1 19069
[2m[36m(pid=107496)[0m self.total_steps_1 19070
[2m[36m(pid=107496)[0m self.total_steps_1 19071
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 
[2m[36m(pid=107496)[0m self.total_steps_1 19072
[2m[36m(pid=107496)[0m self.total_steps_1 19073
[2m[36m(pid=107493)[0m 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107496)[0m self.total_steps_1 19074
[2m[36m(pi

[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107496)[0m self.total_steps_1 19192
[2m[36m(pid=107496)[0m self.total_steps_1 19193
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107496)[0m self.total_steps_1 19194
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107496)[0m self.total_steps_1 19195
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m Done in actor index: 2
[2m[36m(pid=107496)[0m self.total_steps_1 19196
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0

[2m[36m(pid=107485)[0m <class 'collections.OrderedDict'> 1
[2m[36m(pid=107485)[0m GET_WEIGHT
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m Done in actor index: 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pi

[2m[36m(pid=107521)[0m <class 'collections.OrderedDict'> 0
[2m[36m(pid=107521)[0m GET_WEIGHT
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m self.total_steps_1 19393
[2m[36m(pid=107496)[0m self.total_steps_1 19394
[2m[36m(pid=107496)[0m self.total_steps_1 19395
[2m[36m(pid=107496)[0m self.total_steps_1 19396
[2m[36m(pid=107496)[0m self.total_steps_1 19397
[2m[36m(pid=107496)[0m self

[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_steps_1 19496
[2m[36m(pid=107496)[0m self.total_steps_1 19497
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_steps_1 19498
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_steps_1 19499
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m Done in actor index: 1
[2m[36m(pid=107485)[0m GET_WEIGHT
[2m[36m(pid=107496)[0m self.total_steps_1 19500
[2m[36m(pid=107496)[0m self.total_steps_

[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_steps_1 19646
[2m[36m(pid=107496)[0m self.total_steps_1 19647
[2m[36m(pid=107496)[0m self.total_steps_1 19648
[2m[36m(pid=107496)[0m self.total_steps_1 19649
[2m[36m(pid=107496)[0m self.total_steps_1 19650
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m Done in actor index: 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_steps_1 19651
[2m[36m(pid=107496)[0m self.total_steps_1 19652
[2m[36m(pid=107496)[0m self.total_steps_1 19653
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107485)[0m store executed in actor idx 1
[2m[36m(pid=107496)[0m self.total_st

[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m self.total_steps_1 19756
[2m[36m(pid=107496)[0m self.total_steps_1 19757
[2m[36m(pid=107496)[0m self.total_steps_1 19758
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m Done in actor index: 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m self.total_steps_1 19759
[2m[36m(pid=107496)[0m self.total_steps_1 19760
[2m[36m(pid=107496)[0m self.total_steps_1 19761
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m sel

[2m[36m(pid=107496)[0m self.total_steps_1 19910
[2m[36m(pid=107496)[0m self.total_steps_1 19911
[2m[36m(pid=107496)[0m self.total_steps_1 19912
[2m[36m(pid=107496)[0m self.total_steps_1
[2m[36m(pid=107521)[0m <class 'collections.OrderedDict'> 0
[2m[36m(pid=107521)[0m GET_WEIGHT
[2m[36m(pid=107496)[0m  19913
[2m[36m(pid=107496)[0m self.total_steps_1 19914
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m self.total_steps_1 19915
[2m[36m(pid=107496)[0m self.total_steps_1 19916
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107496)[0m self.total_steps_1 19917
[2m[36m(pid=107496)[0m self.total_steps_1 19918
[2m[36m(pid=107496)[0m self.total_steps_1 19919
[2m[36m(pid=107496)[0m self.total_steps_1 19920
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store e

[2m[36m(pid=107496)[0m self.total_steps_1 20047
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107496)[0m self.total_steps_1 20048
[2m[36m(pid=107496)[0m self.total_steps_1 20049
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107496)[0m self.total_steps_1 20050
[2m[36m(pid=107496)[0m self.total_steps_1 20051
[2m[36m(pid=107496)[0m self.total_steps_1 20052
[2m[36m(pid=107496)[0m self.total_steps_1 20053
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m store executed in actor idx 2
[2m[36m(pid=107493)[0m Done in actor index: 2
[2m[36m(pid=107496)[0m self.tot

[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m Done in actor index: 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0

[2m[36m(pid=107521)[0m <class 'collections.OrderedDict'> 0
[2m[36m(pid=107521)[0m GET_WEIGHT
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m[36m(pid=107521)[0m store executed in actor idx 0
[2m