
Commit

Version 0.4.2. For the next minor version 0.5.0 and beyond, the buffer API will become incompatible with the current design.
iffiX committed Jun 17, 2021
1 parent 5289511 commit 2da92eb
Showing 29 changed files with 350 additions and 403 deletions.
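The pattern is the same in every single-agent example below: instead of pushing each transition into the buffer with a per-step store_transition call, the examples now collect the per-step transition dicts in a temporary list and hand the whole episode to store_episode once the episode ends. A minimal sketch of the new pattern follows; agent stands in for any of the framework objects used below (ddpg, dqn, sac, td3, ...), and the environment-stepping code is elided.

# Sketch only: `agent`, `env`, `observe_dim`, and `max_steps` stand in for the
# objects each example sets up; action selection and env.step(...) are elided.
tmp_observations = []  # one transition dict per step, kept for this episode
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
terminal, step = False, 0
while not terminal and step <= max_steps:
    step += 1
    old_state = state
    # ... choose `action`, step the environment, update `state`, `reward`, `terminal` ...
    tmp_observations.append(
        {
            "state": {"state": old_state},
            "action": {"action": action},
            "next_state": {"state": state},
            "reward": float(reward),
            "terminal": terminal or step == max_steps,
        }
    )
# one call per episode replaces the per-step store_transition calls
agent.store_episode(tmp_observations)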
4 changes: 3 additions & 1 deletion examples/framework_examples/ddpg.py
@@ -69,6 +69,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -82,7 +83,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

ddpg.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -92,6 +93,7 @@ def forward(self, state, action):
}
)

ddpg.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
8 changes: 6 additions & 2 deletions examples/framework_examples/ddpg_apex.py
@@ -100,6 +100,8 @@ def main(rank):

# manually pull the newest parameters
ddpg_apex.manual_sync()
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
with t.no_grad():
@@ -112,7 +114,7 @@ def main(rank):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

ddpg_apex.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -122,9 +124,11 @@ def main(rank):
}
)

ddpg_apex.store_episode(tmp_observations)
smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
logger.info(
f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
f"Process {rank} Episode {episode} "
f"total reward={smoothed_total_reward:.2f}"
)

if smoothed_total_reward > solved_reward:
4 changes: 3 additions & 1 deletion examples/framework_examples/ddpg_per.py
@@ -69,6 +69,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -82,7 +83,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

ddpg_per.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -92,6 +93,7 @@ def forward(self, state, action):
}
)

ddpg_per.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
4 changes: 3 additions & 1 deletion examples/framework_examples/dqn.py
@@ -44,6 +44,7 @@ def forward(self, state):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -55,7 +56,7 @@ def forward(self, state):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward

dqn.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -65,6 +66,7 @@ def forward(self, state):
}
)

dqn.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
5 changes: 4 additions & 1 deletion examples/framework_examples/dqn_apex.py
@@ -70,6 +70,8 @@ def main(rank):

# manually pull the newest parameters
dqn_apex.manual_sync()
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
with t.no_grad():
@@ -80,7 +82,7 @@ def main(rank):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward

dqn_apex.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -90,6 +92,7 @@ def main(rank):
}
)

dqn_apex.store_episode(tmp_observations)
smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
logger.info(
f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
4 changes: 3 additions & 1 deletion examples/framework_examples/dqn_per.py
@@ -44,6 +44,7 @@ def forward(self, state):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -55,7 +56,7 @@ def forward(self, state):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward

dqn_per.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -65,6 +66,7 @@ def forward(self, state):
}
)

dqn_per.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
4 changes: 3 additions & 1 deletion examples/framework_examples/hddpg.py
@@ -69,6 +69,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -82,7 +83,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

hddpg.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -92,6 +93,7 @@ def forward(self, state, action):
}
)

hddpg.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
39 changes: 23 additions & 16 deletions examples/framework_examples/maddpg.py
@@ -91,9 +91,9 @@ def forward(self, state, action):
[deepcopy(actor) for _ in range(agent_num)],
[deepcopy(critic) for _ in range(agent_num)],
[deepcopy(critic) for _ in range(agent_num)],
[list(range(agent_num))] * agent_num,
t.optim.Adam,
nn.MSELoss(reduction="sum"),
critic_visible_actors=[list(range(agent_num))] * agent_num,
)

episode, step, reward_fulfilled = 0, 0, 0
@@ -107,6 +107,7 @@ def forward(self, state, action):
states = [
t.tensor(st, dtype=t.float32).view(1, observe_dim) for st in env.reset()
]
tmp_observations_list = [[] for _ in range(agent_num)]

while not terminal and step <= max_steps:
step += 1
@@ -125,21 +126,27 @@ def forward(self, state, action):
]
total_reward += float(sum(rewards)) / agent_num

maddpg.store_transitions(
[
{
"state": {"state": ost},
"action": {"action": act},
"next_state": {"state": st},
"reward": float(rew),
"terminal": term or step == max_steps,
}
for ost, act, st, rew, term in zip(
old_states, action_probs, states, rewards, terminals
)
]
)

for tmp_observations, ost, act, st, rew, term in zip(
tmp_observations_list,
old_states,
action_probs,
states,
rewards,
terminals,
):
tmp_observations.append(
[
{
"state": {"state": ost},
"action": {"action": act},
"next_state": {"state": st},
"reward": float(rew),
"terminal": term or step == max_steps,
}
]
)

maddpg.store_episodes(tmp_observations_list)
# total reward is divided by steps here, since:
# "Agents are rewarded based on minimum agent distance
# to each landmark, penalized for collisions"
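The multi-agent example makes the same switch, but groups observations per agent: one temporary list per agent is filled during the episode and the whole collection goes to store_episodes at the end. A condensed sketch of the layout, inferred only from the diff above, with the per-step bookkeeping elided:

# Sketch only: tmp_observations_list[i] collects agent i's per-step
# observations for the current episode (see the loop in the diff above).
tmp_observations_list = [[] for _ in range(agent_num)]
# ... inside the episode loop, each agent's observation is appended to its own list ...
maddpg.store_episodes(tmp_observations_list)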
4 changes: 3 additions & 1 deletion examples/framework_examples/sac.py
@@ -116,6 +116,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -127,7 +128,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

sac.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -137,6 +138,7 @@ def forward(self, state, action):
}
)

sac.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
4 changes: 3 additions & 1 deletion examples/framework_examples/td3.py
@@ -78,6 +78,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -91,7 +92,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

td3.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -101,6 +102,7 @@ def forward(self, state, action):
}
)

td3.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
7 changes: 6 additions & 1 deletion machin/auto/config.py
@@ -79,7 +79,12 @@ def generate_env_config(environment: str, config: Union[Dict[str, Any], Config]
config = deepcopy(config) or {}
if hasattr(envs, environment):
e_module = getattr(envs, environment)
if hasattr(e_module, "launch") and hasattr(e_module, "generate_env_config"):
if (
hasattr(e_module, "launch")
and isinstance(e_module.launch, callable)
and hasattr(e_module, "generate_env_config")
and isinstance(e_module.generate_env_config, callable)
):
return e_module.generate_env_config(config)
raise ValueError(
f"Invalid environment: {environment}, "
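One note on the new check above: callable is a plain builtin function, not a type, so isinstance(e_module.launch, callable) raises a TypeError when evaluated. A hedged sketch of an equivalent capability check written with the callable() builtin (an alternative formulation, not the committed code):

# Sketch of an equivalent check using the callable() builtin; isinstance()
# requires a type (or tuple of types) as its second argument.
if (
    hasattr(e_module, "launch")
    and callable(e_module.launch)
    and hasattr(e_module, "generate_env_config")
    and callable(e_module.generate_env_config)
):
    return e_module.generate_env_config(config)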
