
Commit

Version 0.4.2. For the next minor version 0.5.0 and beyond, the buffer API will become incompatible with the current design.
iffiX committed Jun 17, 2021
1 parent 5289511 commit 2da92eb
Showing 29 changed files with 350 additions and 403 deletions.
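The pattern is the same in every single-agent example below: instead of pushing each transition into the buffer with a per-step store_transition call, the examples now collect the per-step transition dicts in a temporary list and hand the whole episode to store_episode once the episode ends. A minimal sketch of the new pattern follows; agent stands in for any of the framework objects used below (ddpg, dqn, sac, td3, ...), and the environment-stepping code is elided.

# Sketch only: `agent`, `env`, `observe_dim`, and `max_steps` stand in for the
# objects each example sets up; action selection and env.step(...) are elided.
tmp_observations = []  # one transition dict per step, kept for this episode
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
terminal, step = False, 0
while not terminal and step <= max_steps:
    step += 1
    old_state = state
    # ... choose `action`, step the environment, update `state`, `reward`, `terminal` ...
    tmp_observations.append(
        {
            "state": {"state": old_state},
            "action": {"action": action},
            "next_state": {"state": state},
            "reward": float(reward),
            "terminal": terminal or step == max_steps,
        }
    )
# one call per episode replaces the per-step store_transition calls
agent.store_episode(tmp_observations)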
4 changes: 3 additions & 1 deletion examples/framework_examples/ddpg.py
@@ -69,6 +69,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -82,7 +83,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

ddpg.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -92,6 +93,7 @@ def forward(self, state, action):
}
)

ddpg.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
8 changes: 6 additions & 2 deletions examples/framework_examples/ddpg_apex.py
@@ -100,6 +100,8 @@ def main(rank):

# manually pull the newest parameters
ddpg_apex.manual_sync()
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
with t.no_grad():
@@ -112,7 +114,7 @@ def main(rank):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

ddpg_apex.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -122,9 +124,11 @@ def main(rank):
}
)

ddpg_apex.store_episode(tmp_observations)
smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
logger.info(
f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
f"Process {rank} Episode {episode} "
f"total reward={smoothed_total_reward:.2f}"
)

if smoothed_total_reward > solved_reward:
4 changes: 3 additions & 1 deletion examples/framework_examples/ddpg_per.py
@@ -69,6 +69,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -82,7 +83,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

ddpg_per.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -92,6 +93,7 @@ def forward(self, state, action):
}
)

ddpg_per.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
4 changes: 3 additions & 1 deletion examples/framework_examples/dqn.py
@@ -44,6 +44,7 @@ def forward(self, state):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -55,7 +56,7 @@ def forward(self, state):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward

dqn.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -65,6 +66,7 @@ def forward(self, state):
}
)

dqn.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
5 changes: 4 additions & 1 deletion examples/framework_examples/dqn_apex.py
@@ -70,6 +70,8 @@ def main(rank):

# manually pull the newest parameters
dqn_apex.manual_sync()
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
with t.no_grad():
@@ -80,7 +82,7 @@ def main(rank):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward

dqn_apex.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -90,6 +92,7 @@ def main(rank):
}
)

dqn_apex.store_episode(tmp_observations)
smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
logger.info(
f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
4 changes: 3 additions & 1 deletion examples/framework_examples/dqn_per.py
@@ -44,6 +44,7 @@ def forward(self, state):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -55,7 +56,7 @@ def forward(self, state):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward

dqn_per.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -65,6 +66,7 @@ def forward(self, state):
}
)

dqn_per.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
4 changes: 3 additions & 1 deletion examples/framework_examples/hddpg.py
@@ -69,6 +69,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -82,7 +83,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

hddpg.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -92,6 +93,7 @@ def forward(self, state, action):
}
)

hddpg.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
39 changes: 23 additions & 16 deletions examples/framework_examples/maddpg.py
@@ -91,9 +91,9 @@ def forward(self, state, action):
[deepcopy(actor) for _ in range(agent_num)],
[deepcopy(critic) for _ in range(agent_num)],
[deepcopy(critic) for _ in range(agent_num)],
[list(range(agent_num))] * agent_num,
t.optim.Adam,
nn.MSELoss(reduction="sum"),
critic_visible_actors=[list(range(agent_num))] * agent_num,
)

episode, step, reward_fulfilled = 0, 0, 0
@@ -107,6 +107,7 @@ def forward(self, state, action):
states = [
t.tensor(st, dtype=t.float32).view(1, observe_dim) for st in env.reset()
]
tmp_observations_list = [[] for _ in range(agent_num)]

while not terminal and step <= max_steps:
step += 1
@@ -125,21 +126,27 @@ def forward(self, state, action):
]
total_reward += float(sum(rewards)) / agent_num

maddpg.store_transitions(
[
{
"state": {"state": ost},
"action": {"action": act},
"next_state": {"state": st},
"reward": float(rew),
"terminal": term or step == max_steps,
}
for ost, act, st, rew, term in zip(
old_states, action_probs, states, rewards, terminals
)
]
)

for tmp_observations, ost, act, st, rew, term in zip(
tmp_observations_list,
old_states,
action_probs,
states,
rewards,
terminals,
):
tmp_observations.append(
[
{
"state": {"state": ost},
"action": {"action": act},
"next_state": {"state": st},
"reward": float(rew),
"terminal": term or step == max_steps,
}
]
)

maddpg.store_episodes(tmp_observations_list)
# total reward is divided by steps here, since:
# "Agents are rewarded based on minimum agent distance
# to each landmark, penalized for collisions"
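The multi-agent example makes the same switch, but groups observations per agent: one temporary list per agent is filled during the episode and the whole collection goes to store_episodes at the end. A condensed sketch of the layout, inferred only from the diff above, with the per-step bookkeeping elided:

# Sketch only: tmp_observations_list[i] collects agent i's per-step
# observations for the current episode (see the loop in the diff above).
tmp_observations_list = [[] for _ in range(agent_num)]
# ... inside the episode loop, each agent's observation is appended to its own list ...
maddpg.store_episodes(tmp_observations_list)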
4 changes: 3 additions & 1 deletion examples/framework_examples/sac.py
@@ -116,6 +116,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -127,7 +128,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

sac.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -137,6 +138,7 @@ def forward(self, state, action):
}
)

sac.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
4 changes: 3 additions & 1 deletion examples/framework_examples/td3.py
@@ -78,6 +78,7 @@ def forward(self, state, action):
terminal = False
step = 0
state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
tmp_observations = []

while not terminal and step <= max_steps:
step += 1
@@ -91,7 +92,7 @@ def forward(self, state, action):
state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
total_reward += reward[0]

td3.store_transition(
tmp_observations.append(
{
"state": {"state": old_state},
"action": {"action": action},
@@ -101,6 +102,7 @@ def forward(self, state, action):
}
)

td3.store_episode(tmp_observations)
# update, update more if episode is longer, else less
if episode > 100:
for _ in range(step):
7 changes: 6 additions & 1 deletion machin/auto/config.py
@@ -79,7 +79,12 @@ def generate_env_config(environment: str, config: Union[Dict[str, Any], Config]
config = deepcopy(config) or {}
if hasattr(envs, environment):
e_module = getattr(envs, environment)
if hasattr(e_module, "launch") and hasattr(e_module, "generate_env_config"):
if (
hasattr(e_module, "launch")
and isinstance(e_module.launch, callable)
and hasattr(e_module, "generate_env_config")
and isinstance(e_module.generate_env_config, callable)
):
return e_module.generate_env_config(config)
raise ValueError(
f"Invalid environment: {environment}, "
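One note on the new check above: callable is a plain builtin function, not a type, so isinstance(e_module.launch, callable) raises a TypeError when evaluated. A hedged sketch of an equivalent capability check written with the callable() builtin (an alternative formulation, not the committed code):

# Sketch of an equivalent check using the callable() builtin; isinstance()
# requires a type (or tuple of types) as its second argument.
if (
    hasattr(e_module, "launch")
    and callable(e_module.launch)
    and hasattr(e_module, "generate_env_config")
    and callable(e_module.generate_env_config)
):
    return e_module.generate_env_config(config)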
