#### 분산 강화학습을 DQN을 이용하여 구현해보겠습니다. <br>기본적인 방식은 다음과 같습니다. <br>  
    1. Replay Buffer: Actor로부터 data를 받고, Learner에게 data를 전달하는 역할
    2. Parameter Server: Learner로부터 parameter를 받고, Actor에게 parameter를 전달하는 역할.
    3. Learner: Replay Buffer로 부터 데이터를 받아 학습을 진행하고, Parameter Server로 Learner 모델의 parameter를 전달하는 역할.
    4. Actor: Environment와 상호작용하며 data를 Replay Buffer에 전달하고, Parameter Server로부터 Learner 모델의 parameter를 받아 자신의 모델 parameter를 update.

#### 5번 노트북과는 달리 "with restricted update steps" 이라는 단어를 뺐습니다.<br>

    이번 DQN은 actor와 learner가 모델의 parameter를 주고 받는 타이밍, actor가 environment와 상호작용하며 step을 진행하는 속도 등을 
    인위적으로 조절하지 않은 모델입니다. 
    
    5번 이외에 이 노트북 파일을 둔 이유는, 인위적인 조절없이 학습하는 것이 제가(그리고 A3C나 Ape-X등의 논문에서도) 본래 구현하고자 했던 것이기 때문인데요.
    
    다만, hyper parameter 설정을 잘 해주어야 한다는 단점이 있습니다.
<br>

#### 그렇다면 5번과의 코드상의 차이점은?<br>

    Actor와 Learner의 메소드에 들어 있던 `while 1:`과 `if ray.get(self.memory.return_batch_update_status.remote())`가 들어가 있는 loop 등, 몇 가지 loop와 if문이 없습니다. 
    이외에는 동일합니다.

In [1]:
import sys
# True when the "google.colab" module has been loaded, i.e. the notebook runs on Colab.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # IPython shell escapes: install the extra packages this notebook imports.
    !pip install ray
    !pip install wandb

In [2]:
import ray 
import gym
import time 
import numpy as np 
from copy import deepcopy
import matplotlib.pyplot as plt
from collections import defaultdict
from IPython.display import clear_output

import wandb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
from adabelief_pytorch import AdaBelief

In [4]:
# Start Ray with default settings; the dict printed below is the value returned by ray.init().
ray.init() 

2021-02-03 13:44:48,101	INFO services.py:1173 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.0.61',
 'raylet_ip_address': '192.168.0.61',
 'redis_address': '192.168.0.61:6379',
 'object_store_address': '/tmp/ray/session_2021-02-03_13-44-47_598460_37133/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-02-03_13-44-47_598460_37133/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-02-03_13-44-47_598460_37133',
 'metrics_export_port': 60760,
 'node_id': 'a6f36f9c034869415b793757d0132a42cb9f47d8'}

In [5]:
# Shared replay buffer: actors push transitions into it, the learner samples batches from it.
@ray.remote
class ReplayBuffer:
    """Fixed-capacity circular buffer of transitions, hosted as a Ray actor.

    Every field of a transition lives in its own pre-allocated numpy array.
    Once ``buffer_size`` transitions have been stored, the oldest entries are
    overwritten.
    """

    def __init__(self,
                 buffer_size: ('int: Buffer_size'),
                 state_dim: ('tuple: State dim')):
        """Allocate the storage arrays.

        Args:
            buffer_size: maximum number of transitions kept at once.
            state_dim: shape of a single state; must be a tuple even for
                1-D states (e.g. ``(4,)``) so it can be concatenated with
                the buffer length below.
        """
        # NOTE: per the original author's remark, when this class runs as a
        # Ray actor a failing assertion here is not reported when the actor is
        # created; it only shows up once one of its methods is executed.
        assert isinstance(state_dim, tuple), "state_dim must be a tuple, e.g. (4,)"

        self.buffer_dim = (buffer_size, ) + state_dim
        self.buffer_size = buffer_size
        self.batch_update_status = True

        self.state_buffer = np.zeros(self.buffer_dim)
        self.action_buffer = np.zeros(buffer_size)
        self.reward_buffer = np.zeros(buffer_size)
        self.next_state_buffer = np.zeros(self.buffer_dim)
        self.done_buffer = np.zeros(buffer_size)
        self.act_idx_buffer = np.zeros(buffer_size)

        self.store_idx = 0          # next slot to write
        self.current_size = 0       # number of valid entries (<= buffer_size)
        self.total_store_count = 0  # transitions stored over the whole run

    def store(self, state, action, next_state, reward, done, actor_idx):
        """Write one transition into the current slot and advance the cursor."""
        self.state_buffer[self.store_idx] = state
        self.action_buffer[self.store_idx] = action
        self.reward_buffer[self.store_idx] = reward
        self.next_state_buffer[self.store_idx] = next_state
        self.done_buffer[self.store_idx] = done

        # actor_idx is not used for learning; it is kept only to check that
        # data from several actors is being stored correctly.
        self.act_idx_buffer[self.store_idx] = actor_idx

        self.total_store_count += 1
        self.store_idx = (self.store_idx + 1) % self.buffer_size  # wrap around
        self.current_size = min(self.current_size+1, self.buffer_size)

    def batch_load(self, batch_size):
        """Return ``batch_size`` transitions sampled uniformly with replacement.

        Returns:
            dict with keys ``states``, ``actions``, ``rewards``,
            ``next_states``, ``dones`` and ``actindices``.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        if self.current_size == 0:
            # np.random.randint(0, ...) would raise a ValueError too, but with
            # a message that does not point at the real cause.
            raise ValueError("Cannot sample a batch from an empty replay buffer")

        indices = np.random.randint(self.current_size, size=batch_size)
        return dict(
                states=self.state_buffer[indices],
                actions=self.action_buffer[indices],
                rewards=self.reward_buffer[indices],
                next_states=self.next_state_buffer[indices],
                dones=self.done_buffer[indices],
                actindices=self.act_idx_buffer[indices])

    # The accessors below exist so that other objects can read these
    # attributes through Ray remote calls.
    def return_current_size(self):
        """Return the number of valid transitions currently stored."""
        return self.current_size

    def return_store_idx(self):
        """Return the index of the slot the next ``store`` call will write."""
        return self.store_idx

    def return_total_store_count(self):
        """Return how many transitions have been stored in total."""
        return self.total_store_count

    def batch_update_on(self):
        """Set the batch-update flag to True."""
        self.batch_update_status = True

    def batch_update_off(self):
        """Set the batch-update flag to False."""
        self.batch_update_status = False

    def return_batch_update_status(self):
        """Return the current value of the batch-update flag."""
        return self.batch_update_status
    
# # test
# buffer_size = 1000
# batch_size = 16
# state_dim = (4, )
# temp_buffer = ReplayBuffer.remote(buffer_size, state_dim)

# for i in range(50):
#     temp_buffer.store.remote(np.array(state_dim), 1, np.array(state_dim), 1, 1, 1)

# batch = temp_buffer.batch_load.remote(batch_size)
# print("Batch Size:", ray.get(batch)['actindices'].shape) 

# current_size = temp_buffer.return_current_size.remote()
# print("Current Size: ", ray.get(current_size))

# return_store_idx = temp_buffer.return_store_idx.remote()
# print("Store Index: ", ray.get(return_store_idx))

In [6]:
class QNetwork(nn.Module):
    """Two-layer fully connected Q-network: Linear -> ReLU -> Linear.

    Args:
        state_size: indexable whose first element is the number of input
            features (e.g. ``(4,)``).
        action_size: number of outputs, one Q-value per action.
        hidden: width of the hidden layer.
    """

    def __init__(self, state_size, action_size, hidden=32):
        super().__init__()

        in_features = state_size[0]
        # Attribute names are part of the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(in_features, hidden)
        self.fc2 = nn.Linear(hidden, action_size)

    def forward(self, x):
        """Return the Q-values for ``x``."""
        hidden_activation = F.relu(self.fc1(x))
        return self.fc2(hidden_activation)

# state_size = (4, ) 
# action_size = 2 
# temp_net = QNetwork(state_size, action_size, 32) 
# test = torch.randn(size=(4,)) 
# temp_net(test), temp_net(test).shape 

In [7]:
@ray.remote
class Network_parameter_server:
    """Ray actor that holds the most recent parameters pushed by the learner.

    The learner calls ``update_parameters`` and the actors call
    ``return_parameters`` to fetch what was stored.
    """

    def __init__(self):
        # Initialise both attributes so the getters below never raise
        # AttributeError when called before the first push.
        self.learner_params = None
        # NOTE(review): the original never assigned is_saved; it is treated
        # here as "at least one parameter set has been stored" - confirm.
        self.is_saved = False

    def update_parameters(self, learner_params):
        """Store ``learner_params`` as the current parameters."""
        self.learner_params = learner_params
        self.is_saved = True

    def return_parameters(self):
        """Return the stored parameters, or None if nothing was pushed yet."""
        return self.learner_params

    def return_saving_status(self):
        """Return True once parameters have been stored at least once."""
        return self.is_saved

In [8]:
# Score collector used for plotting inside the Jupyter notebook.
@ray.remote
class Plot_inline:
    """Ray actor that accumulates episode scores per actor and for the learner."""

    def __init__(self):
        # A defaultdict creates the list for a new key on first use, so any
        # number of actors can report without being registered beforehand.
        self.score_dict = defaultdict(list)

    def store_actor_data(self, actor_idx, score):
        """Append ``score`` to the history kept under ``actor_idx``."""
        history = self.score_dict[actor_idx]
        history.append(score)

    def store_learner_data(self, score):
        """Append ``score`` to the history kept under the 'learner' key."""
        history = self.score_dict['learner']
        history.append(score)

    def get_status(self):
        """Return the whole score dictionary."""
        return self.score_dict

In [9]:
# Each actor hands the experience it gathers in its own environment over to the shared buffer.
@ray.remote
class Actor:
    """Environment worker of the distributed DQN, hosted as a Ray actor.

    The actor picks actions epsilon-greedily with its own copy of the
    Q-network, sends every transition to the shared replay buffer and
    periodically reloads its network weights from the parameter server.
    """

    def __init__(self,
                 params_server: ("class(ray decorated): Network parameter server"),
                 memory: ("class(ray decorated): Replay Buffer"),
                 env_name: ("str: Environment name"),
                 actor_idx: ("int: The index of an actor"),
                 actor_update_freq: ("int: Frequency of updating actor's network. (unit: steps)"),
                 update_buf_start: ("int: Update starting buffer size"),
                 epsilon: ("float: starting epsilon value for e-greedy update"),
                 eps_decay: ("float: amount subtracted from epsilon at every step"),
                 eps_min: ("float: minimum epsilon value"),
                 hidden: ("int: Hidden layer width of the Q-network"),
                 device: ("str: torch device of the actor's network"),
                 plot_mode: ("str: whether to plot in wandb or inline in jupyter or none-plotting(False)"),
                 plot_util: ("class(ray decorated): Plotting tool for visualizing in jupyter"),
                 num_actors: ("int: The number of actors"),
                 WANDB_GROUP_NAME: ("str: Wandb's group name for all actors"),
                 WANDB_CONFIG: ("dict: Wandb configuration dictionary")
                ):

        # Open one wandb run per actor, grouped under WANDB_GROUP_NAME.
        if plot_mode=='wandb':
            entity = 'rl_flip_school_team'
            project_name = 'Distributed_DQN'
            wandb.init(
                    group=WANDB_GROUP_NAME,
                    project=project_name,
                    entity=entity,
                    config=WANDB_CONFIG,
                    name=f'{actor_idx}_Distributed_DQN'
                    )

        self.env = gym.make(env_name)
        self.params_server = params_server
        self.memory = memory        # ReplayBuffer shared through Ray
        self.actor_idx = actor_idx  # tags stored data with the actor it came from
        self.actor_update_freq = actor_update_freq
        self.update_buf_start = update_buf_start
        self.plot_mode = plot_mode
        self.plot_util = plot_util
        self.num_actors = num_actors
        self.device = device

        # DQN hyperparameters
        self.epsilon = epsilon
        self.eps_decay = eps_decay
        self.eps_min = eps_min

        # Network parameters
        self.state_dim = (self.env.observation_space.shape[0], )
        try:
            self.action_dim = self.env.action_space.n  # discrete action space
        except AttributeError:
            # Action spaces without ``n`` (continuous): use the first shape entry.
            # The original read a bare ``env`` here, which is not defined
            # inside the actor.
            self.action_dim = self.env.action_space.shape[0]
        self.q_behave = QNetwork(self.state_dim, self.action_dim, hidden).to(self.device)

    def select_action(self, state):
        """Choose an action epsilon-greedily.

        Returns:
            ``(Qs, action)``. On a random draw ``Qs`` is an all-zero array of
            length ``action_dim``; otherwise it is the network output as a
            numpy array and ``action`` is its argmax.
        """
        if np.random.random() < self.epsilon:
            return np.zeros(self.action_dim), self.env.action_space.sample()

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
            Qs = self.q_behave(state)
            action = Qs.argmax()
        return Qs.detach().cpu().numpy(), action.detach().item()

    def make_buffer_ready(self):
        """Fill the shared buffer until learning can start.

        Transitions are stored at every step. The stop condition (total store
        count above ``update_buf_start``) is only checked when an episode ends.

        Returns:
            A completion message naming this actor.
        """
        state = self.env.reset()
        while 1:
            _, action = self.select_action(state)
            next_state, reward, done, _ = self.env.step(action)

            self.memory.store.remote(state, action, next_state, reward, done, self.actor_idx)

            state = next_state
            if done:
                state = self.env.reset()
                if ray.get(self.memory.return_total_store_count.remote()) > self.update_buf_start: break

        done_signal = f"Done in Actor-#{self.actor_idx}"
        return done_signal

    def explore(self):
        """Interact with the environment forever, feeding the shared buffer.

        At every step epsilon is decreased (down to ``eps_min``); every
        ``actor_update_freq`` steps the weights are pulled from the parameter
        server; at the end of an episode the score is reported.
        """
        score = 0
        update_freq = 0
        state = self.env.reset()

        # The actor never stops: exploration runs in an infinite loop.
        while 1:
            _, action = self.select_action(state)
            next_state, reward, done, _ = self.env.step(action)

            # Store the transition in the shared ReplayBuffer.
            self.memory.store.remote(state, action, next_state, reward, done, self.actor_idx)

            score += reward
            state = next_state
            self.epsilon = max(self.epsilon-self.eps_decay, self.eps_min)

            update_freq = (update_freq + 1) % self.actor_update_freq
            time.sleep(0.013)
            if update_freq==0: self._pull_parameters()

            if done:
                state = self.env.reset()
                self._plot_status(score)
                self.plot_util.store_actor_data.remote(self.actor_idx, score)
                score = 0

    def _pull_parameters(self):
        """Load the parameters currently held by the parameter server."""
        updated_params = ray.get(self.params_server.return_parameters.remote())
        self.q_behave.load_state_dict(updated_params)

    def _plot_status(self, score):
        """Log the episode score and current epsilon to wandb (wandb mode only)."""
        if self.plot_mode=='wandb':
            wandb.log({'Score': score,
                       f"Epsilon_{self.actor_idx}": self.epsilon,
                       f'Score_{self.actor_idx}': score})

In [10]:
# The learner updates the network with samples from the buffer and sends the weights to the parameter server.
# It is deliberately not declared as a Ray actor: per the original author it only does network updates
# (CUDA work) and no CPU-parallel work, and cannot be declared through Ray.
class Learner:
    """Trains the Q-network from the shared replay buffer.

    The learner keeps a behaviour network and a target network. It samples
    batches from the Ray-hosted buffer, does one optimisation step per loop
    iteration and, every ``update_target_freq`` updates, syncs the target
    network and pushes the behaviour weights to the parameter server.
    """

    def __init__(self,
                 env_name: ("str: Environment name"),
                 params_server: ("Class: Network parameter server"),
                 memory: ("class: ReplayBuffer"),
                 gamma: ("float: Discount rate"),
                 epsilon: ("float: starting epsilon value for e-greedy update"),
                 eps_decay: ("float: amount subtracted from epsilon at every step"),
                 eps_min: ("float: minimum epsilon value"),
                 update_freq: ("int: Frequency of updating learner's q_behave network"),
                 update_target_freq: ("int: Frequency of updating learner's q_target network"),
                 update_push_freq: ("int: Frequency of sending learner's paratemers to parameter-server"),
                 hidden: ("int: Hidden layer width of the Q-networks"),
                 batch_size: ("int: Batch size for updating network"),
                 learning_rate: ("float: Learning rate for updating the q_behave network"),
                 device: ("str: torch device of the learner's networks"),
                 plot_mode: ("str: whether to plot in wandb or inline in jupyter"),
                 plot_util: ("class(ray decorated): Plotting tool for visualizing in jupyter"),
                 WANDB_GROUP_NAME: ("str: Wandb's group name for all actors"),
                 WANDB_CONFIG: ("dict: Wandb configuration dictionary")
                ):

        if plot_mode=='wandb':
            entity = 'rl_flip_school_team'
            project_name = 'Distributed_DQN'
            wandb.init(
                    group=WANDB_GROUP_NAME,
                    project=project_name,
                    entity=entity,
                    config=WANDB_CONFIG,
                    name='Learner_Distributed_DQN'
                    )

        self.env = gym.make(env_name)
        self.params_server = params_server
        self.memory = memory
        self.gamma = gamma
        self.plot_mode = plot_mode
        self.plot_util = plot_util

        # DQN hyperparameters
        self.epsilon = epsilon
        self.eps_decay = eps_decay
        self.eps_min = eps_min

        self.state_dim = (self.env.observation_space.shape[0], )
        try:
            self.action_dim = self.env.action_space.n  # discrete action space
        except AttributeError:
            # Action spaces without ``n`` (continuous): use the first shape entry.
            # The original read a bare ``env`` here and so depended on a
            # notebook-level global.
            self.action_dim = self.env.action_space.shape[0]

        self.batch_size = batch_size
        self.update_cnt = 0                           # number of q_behave updates so far
        self.update_freq = update_freq                # q_behave update period
        self.update_target_freq = update_target_freq  # q_target update period
        self.update_push_freq = update_push_freq      # period for pushing to the parameter server
        self.device = device
        self.total_steps = 0
        self.scores = []
        self.losses = [0]

        self.q_behave = QNetwork(self.state_dim, self.action_dim, hidden).to(self.device)
        self.q_target = QNetwork(self.state_dim, self.action_dim, hidden).to(self.device)
        self.q_target.load_state_dict(self.q_behave.state_dict())
        self.q_target.eval()
        # Publish the initial weights so parameters are available on the server.
        self.push_parameters()

        self.optimizer = optim.Adam(self.q_behave.parameters(), lr=learning_rate)

    def update_q_network(self):
        """Sample one batch from the buffer and take one optimisation step."""
        # update_cnt counts q_behave updates; train() uses it to decide when
        # q_target has to be refreshed.
        self.update_cnt += 1
        batch = ray.get(self.memory.batch_load.remote(self.batch_size))
        loss = self._compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.losses.append(loss.item())  # kept for plotting
        self.memory.batch_update_on.remote()

    def target_hard_update(self):
        """Copy the behaviour network's weights into the target network (hard update)."""
        self.q_target.load_state_dict(self.q_behave.state_dict())

    def eval_select_action(self, state):
        """Return ``(Qs, action)`` for the greedy action under q_behave."""
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
            Qs = self.q_behave(state)
            action = Qs.argmax()
        return Qs.detach().cpu().numpy(), action.detach().item()

    def push_parameters(self):
        """Send a CPU copy of q_behave's state_dict to the parameter server."""
        copied_model = deepcopy(self.q_behave).cpu()
        self.params_server.update_parameters.remote(copied_model.state_dict())

    def train(self):
        """Run the training loop; it has no termination condition."""
        print("training start..")

        # The learner does not need to interact with an environment; it does so
        # here only so that its own score can be plotted as well.
        score = 0
        state = self.env.reset()
        while 1:
            _, action = self.select_action(state)
            next_state, reward, done, _ = self.env.step(action)
            score += reward
            state = next_state
            self.epsilon = max(self.epsilon-self.eps_decay, self.eps_min)
            time.sleep(0.007)
            self.update_q_network()

            # update_push_freq is currently not acted on: parameters are pushed
            # together with the target-network update below.

            # After every update_target_freq updates of q_behave, refresh the
            # target network and publish the new weights.
            if (self.update_cnt%self.update_target_freq)==0:
                self.target_hard_update()
                self.push_parameters()

            if done:
                state = self.env.reset()
                self.plot_util.store_learner_data.remote(score)
                self._plot_status(score)
                score = 0

    def select_action(self, state):
        """Choose an action epsilon-greedily.

        Returns:
            ``(Qs, action)``. On a random draw ``Qs`` is an all-zero array of
            length ``action_dim``; otherwise see ``eval_select_action``.
        """
        if np.random.random() < self.epsilon:
            return np.zeros(self.action_dim), self.env.action_space.sample()
        return self.eval_select_action(state)

    def _compute_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        """Return the smooth-L1 loss between Q(s, a) and the one-step TD target.

        The target is ``r + (1 - done) * gamma * max_a' Q_target(s', a')``.
        """
        states = torch.FloatTensor(batch['states']).to(self.device)
        next_states = torch.FloatTensor(batch['next_states']).to(self.device)
        actions = torch.LongTensor(batch['actions'].reshape(-1, 1)).to(self.device)
        rewards = torch.FloatTensor(batch['rewards'].reshape(-1, 1)).to(self.device)
        dones = torch.FloatTensor(batch['dones'].reshape(-1, 1)).to(self.device)

        current_q = self.q_behave(states).gather(1, actions)
        # No gradient flows through the target network.
        next_q = self.q_target(next_states).max(dim=1, keepdim=True)[0].detach()
        mask = 1 - dones  # zero out the bootstrap term on terminal transitions
        target = rewards + (mask * self.gamma * next_q)
        # Arguments in (input, target) order; the loss value is symmetric.
        loss = F.smooth_l1_loss(current_q, target)

        return loss

    @staticmethod
    def _grid_position(slot, n_cols=5):
        """Return the (row, col) cell of the ``slot``-th panel in a row-major grid."""
        return divmod(slot, n_cols)

    def _plot_status(self, score):
        """Report training status to wandb or draw the score curves inline."""
        if self.plot_mode=='wandb':
            wandb.log({"Learner Score": score,
                       "Learner Epsilon": self.epsilon,
                       "loss(10 frames avg)": np.mean(self.losses[-10:]),
                       "Number of frames": ray.get(self.memory.return_total_store_count.remote())
                      })

        elif self.plot_mode=='inline':
            # Plotting in the Jupyter notebook: one panel per actor that has
            # reported a score, followed by one panel for the learner.
            score_dict = ray.get(self.plot_util.get_status.remote())
            clear_output(True)
            plt.figure(facecolor='w', figsize=(25,25))

            # Use the keys that are actually present instead of assuming
            # 1..N: actors finish their first episode in arbitrary order, and
            # there may be none yet.
            actor_keys = sorted(key for key in score_dict if key != 'learner')
            n_panels = len(actor_keys) + 1
            # Keep the 5x5 layout and add rows only when it would overflow.
            grid = (max(5, -(-n_panels // 5)), 5)

            for slot, actor_idx in enumerate(actor_keys):
                plt.subplot2grid(grid, self._grid_position(slot))
                plt.plot(score_dict[actor_idx])
                plt.title(f"Score of Actor {actor_idx}")

            plt.subplot2grid(grid, self._grid_position(len(actor_keys)))
            plt.plot(score_dict['learner'])
            plt.title("Score of Learner")
            plt.show()

In [11]:
# Hyperparameter settings
env_lists = ['CartPole-v0', 'LunarLander-v2']
num_actors = 8 # number of actors
actor_device = "cpu"
actor_update_freq = 100

# # lunarlander
env_name = env_lists[1]
buffer_size = 150000 
update_buf_start = 100
learning_rate = 0.0002
hidden = 256 
epsilon = 1.
gamma = 0.995
batch_size = 1024
update_freq = 1
update_target_freq = 150
update_push_freq = 1
# Each actor gets its own epsilon schedule: one decay step and one floor per actor.
eps_decay_list = np.linspace(1/130000, 1/50000, num_actors)
eps_min_list = np.linspace(0.125, 0.02, num_actors)
config_dict = { 
    "learner_eps_decay" : eps_decay_list[num_actors//2],
    "learner_eps_min" : 0.0, # make the learner deterministic when reaching the minimum epsilon
    "eps_decay" : eps_decay_list,
    "eps_min" : eps_min_list
} 

# # CartPole
# env_name = env_lists[0]
# buffer_size = 4000
# update_buf_start = 100
# learning_rate = 0.001
# hidden = 128
# epsilon = 1.
# gamma = 0.99
# batch_size = 32   
# update_freq = 1
# update_target_freq = 150
# update_push_freq = 1
# eps_decay_list = np.linspace(1/1000, 1/4000, num_actors)
# eps_min_list = np.linspace(0.125, 0.02, num_actors)
# config_dict = { 
#     "learner_eps_decay" : eps_decay_list[num_actors//2],
#     "learner_eps_min" : 0.0, # make the learner deterministic when reaching the minimum epsilon
#     "eps_decay" : eps_decay_list,
#     "eps_min" : eps_min_list
# } 

# A throwaway environment instance, used here only to read the state dimension.
env = gym.make(env_name)
state_dim = (env.observation_space.shape[0], ) 

# The learner is pinned to the second GPU when there is one; on a single-GPU
# machine "cuda:1" would be an invalid device, so fall back to "cuda:0".
if torch.cuda.is_available():
    learner_device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    learner_device = "cpu"
plot_mode = 'wandb' # plot options: 'wandb' or 'inline' or False
# A random numeric prefix plus the first four characters of the environment name.
WANDB_GROUP_NAME = str(np.random.randint(10000))+ '_Dist_DQN_' + env_name[:4]
print("WANDB_GROUP_NAME", WANDB_GROUP_NAME)

WANDB_GROUP_NAME 337_Dist_DQN_Luna


In [12]:
# Shared Ray actors: parameter server, replay buffer and score collector.
params_server = Network_parameter_server.remote()
memory = ReplayBuffer.remote(buffer_size, state_dim)
plot_util = Plot_inline.remote()

# The learner has its own epsilon schedule, taken from the shared config.
learner_eps_decay = config_dict["learner_eps_decay"]
learner_eps_min = config_dict["learner_eps_min"]

# Run configuration recorded with the learner's wandb run.
WANDB_CONFIG_learner = {
    "env_name": env_name,
    "gamma": gamma,
    "num_actors": num_actors,
    "buffer_size": buffer_size,
    "update_start_buffer_size": update_buf_start,
    "batch_size": batch_size,
    "learning_rate": learning_rate,
    "hidden": hidden,
    "target_update_freq (unit:step)": update_target_freq,
    "behave_update_freq (unit:step)": update_freq,
    "push_to_params_server_frew (unit:step)": update_push_freq,
    "eps_max": epsilon,
    "eps_min": learner_eps_min,
    "eps_decay": learner_eps_decay,
}

# The learner is a plain local object, created in the notebook process.
learner = Learner(
    env_name=env_name,
    params_server=params_server,
    memory=memory,
    gamma=gamma,
    epsilon=epsilon,
    eps_decay=learner_eps_decay,
    eps_min=learner_eps_min,
    update_freq=update_freq,
    update_target_freq=update_target_freq,
    update_push_freq=update_push_freq,
    hidden=hidden,
    batch_size=batch_size,
    learning_rate=learning_rate,
    device=learner_device,
    plot_mode=plot_mode,
    plot_util=plot_util,
    WANDB_GROUP_NAME=WANDB_GROUP_NAME,
    WANDB_CONFIG=WANDB_CONFIG_learner,
)

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Wandb version 0.10.17 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [13]:
# Each actor is bound to its own dynamically named global (actor_1, actor_2, ...).
# This is not required: Ray works fine without a distinct variable per actor,
# and a single variable named `actor` would also do.
# The actors first fill the replay buffer until the learner is able to update from it.
buffer_ready_done = []
for actor_idx in range(1, num_actors+1):

    # Per-actor epsilon schedule (the lists are 0-based, actor indices start at 1).
    eps_min = config_dict["eps_min"][actor_idx-1]
    eps_decay = config_dict["eps_decay"][actor_idx-1]
    WANDB_CONFIG = dict([
        ("env_name", env_name),
        ("actor_index", actor_idx),
        ("gamma", gamma),
        ("buffer_size", buffer_size),
        ("update_start_buffer_size", update_buf_start),
        ("batch_size", batch_size),
        ("learning_rate", learning_rate),
        ("hidden", hidden),
        ("target_update_freq (unit:step)", update_target_freq),
        ("behave_update_freq (unit:step)", update_freq),
        ("push_to_params_server_frew (unit:step)", update_push_freq),
        ("eps_max", epsilon),
        ("eps_min", eps_min),
        ("eps_decay", eps_decay),
    ])
    actor_args = (params_server, memory, env_name, actor_idx, actor_update_freq, update_buf_start,
                  epsilon, eps_decay, eps_min, hidden, actor_device, plot_mode, plot_util,
                  num_actors, WANDB_GROUP_NAME, WANDB_CONFIG)
    globals()[f"actor_{actor_idx}"] = Actor.remote(*actor_args)
    # Start the warm-up right away; the returned futures are collected for later.
    buffer_ready_done.append(globals()[f"actor_{actor_idx}"].make_buffer_ready.remote())

In [14]:
# Wait until every actor has finished its warm-up, then show the completion messages.
ready_messages = ray.get(buffer_ready_done)
print(ready_messages)

[2m[36m(pid=37286)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37255)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37242)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37237)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37236)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37256)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37222)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37239)[0m wandb: Tracking run with wandb version 0.9.4
[2m[36m(pid=37286)[0m wandb: Wandb version 0.10.17 is available!  To upgrade, please run:
[2m[36m(pid=37286)[0m wandb:  $ pip install wandb --upgrade
[2m[36m(pid=37286)[0m wandb: Run data is saved locally in wandb/run-20210203_134457-2mqvre69
[2m[36m(pid=37255)[0m wandb: Wandb version 0.10.17 is available!  To upgrade, please run:
[2m[36m(pid=37255)[0m wandb:  $ pip install wandb --upgrade
[2m[36m(pid=37255)[0m wandb: 

['Done in Actor-#1', 'Done in Actor-#2', 'Done in Actor-#3', 'Done in Actor-#4', 'Done in Actor-#5', 'Done in Actor-#6', 'Done in Actor-#7', 'Done in Actor-#8']


In [15]:
# Start training: launch the exploration loop of every actor (the calls return immediately).
actor_idx = 1
while actor_idx <= num_actors:
    globals()[f"actor_{actor_idx}"].explore.remote()
    if actor_idx == num_actors:
        break  # leave actor_idx at its last value, as a for-loop would
    actor_idx += 1

In [None]:
# Runs in the notebook process; train() loops forever, so interrupt the kernel to stop it.
learner.train()   

training start..


Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Wandb version 0.10.17 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
requests_with_retry encountered retryable exception: 500 Server Error: Internal Server Error for url: https://api.wandb.ai/files/rl_flip_school_team/Distributed_DQN/biepuahx/file_stream. args: ('https://api.wandb.ai/files/rl_flip_school_team/Distributed_DQN/biepuahx/file_stream',), kwargs: {'json': {'files': {'wandb-history.jsonl': {'offset': 396, 'content': ['{"Learner Score": 206.3206860501575, "Learner Epsilon": 0.14133750000064968, "loss(10 frames avg)": 2.898272681236267, "Number of frames": 592722, "_runtime": 7998.136448144913, "_timestamp": 1612335485.2165499, "_step": 396}\n']}}}}
[2m[36m(pid=37222)[0m wandb: Network error (ReadTimeout), entering retry loop. See /home/sonic/Blanksheet/git_repo/JungKH/MacaronRL/Ray_tutorial/wandb/debug.lo

[2m[36m(pid=37255)[0m wandb: Network error resolved after 0:01:02.438985, resuming normal operation.
[2m[36m(pid=37239)[0m wandb: Network error resolved after 0:01:01.160661, resuming normal operation.
[2m[36m(pid=37286)[0m wandb: Network error resolved after 0:01:28.054334, resuming normal operation.
[2m[36m(pid=37236)[0m wandb: Network error resolved after 0:01:24.015044, resuming normal operation.
[2m[36m(pid=37237)[0m wandb: Network error resolved after 0:01:27.067823, resuming normal operation.
[2m[36m(pid=37256)[0m wandb: Network error resolved after 0:01:25.382677, resuming normal operation.
[2m[36m(pid=37242)[0m wandb: Network error resolved after 0:01:25.320196, resuming normal operation.
[2m[36m(pid=37255)[0m wandb: Network error (ReadTimeout), entering retry loop. See /home/sonic/Blanksheet/git_repo/JungKH/MacaronRL/Ray_tutorial/wandb/debug.log for full traceback.
[2m[36m(pid=37256)[0m wandb: Network error (ReadTimeout), entering retry loop. See /h

requests_with_retry encountered retryable exception: 500 Server Error: Internal Server Error for url: https://api.wandb.ai/files/rl_flip_school_team/Distributed_DQN/biepuahx/file_stream. args: ('https://api.wandb.ai/files/rl_flip_school_team/Distributed_DQN/biepuahx/file_stream',), kwargs: {'json': {'files': {'wandb-events.jsonl': {'offset': 2429, 'content': ['{"system.gpu.0.gpu": 7.0, "system.gpu.0.memory": 7.2, "system.gpu.0.memoryAllocated": 30.65, "system.gpu.0.temp": 48.73, "system.gpu.process.0.gpu": 7.0, "system.gpu.process.0.memory": 7.2, "system.gpu.process.0.memoryAllocated": 30.65, "system.gpu.process.0.temp": 48.73, "system.gpu.0.powerWatts": 84.99, "system.gpu.0.powerPercent": 32.69, "system.gpu.process.0.powerWatts": 84.99, "system.gpu.process.0.powerPercent": 32.69, "system.gpu.1.gpu": 1.07, "system.gpu.1.memory": 0.0, "system.gpu.1.memoryAllocated": 31.51, "system.gpu.1.temp": 53.0, "system.gpu.process.1.gpu": 1.07, "system.gpu.process.1.memory": 0.0, "system.gpu.proces