/
halfcheetah_forward_backward.py
112 lines (88 loc) · 4.04 KB
/
halfcheetah_forward_backward.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
import gym
import numpy as np
from gym.envs.mujoco.mujoco_env import MujocoEnv
from learn2learn.gym.envs.meta_env import MetaEnv
class HalfCheetahForwardBackwardEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle):
"""
[[Source]](https://github.com/learnables/learn2learn/blob/master/learn2learn/gym/envs/mujoco/halfcheetah_forward_backward.py)
**Description**
This environment requires the half-cheetah to learn to run forward or backward.
At each time step the half-cheetah receives a signal composed of a
control cost and a reward equal to its average velocity in the direction
of the plane. The tasks are Bernoulli samples on {-1, 1} with probability 0.5, where -1 indicates the half-cheetah should
move backward and +1 indicates the half-cheetah should move forward.
The velocity is calculated as the distance (in the target direction) of the half-cheetah's torso
position before and after taking the specified action divided by a small value dt.
**Credit**
Adapted from Jonas Rothfuss' implementation.
**References**
1. Finn et al. 2017. "Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks." arXiv [cs.LG].
2. Rothfuss et al. 2018. "ProMP: Proximal Meta-Policy Search." arXiv [cs.LG].
"""
def __init__(self, task=None):
MetaEnv.__init__(self, task)
MujocoEnv.__init__(self, 'half_cheetah.xml', 5)
gym.utils.EzPickle.__init__(self)
# -------- MetaEnv Methods --------
def set_task(self, task):
MetaEnv.set_task(self, task)
self.goal_direction = task['direction']
def sample_tasks(self, num_tasks):
directions = np.random.choice((-1.0, 1.0), (num_tasks,))
tasks = [{'direction': direction} for direction in directions]
return tasks
# -------- Mujoco Methods --------
def _get_obs(self):
return np.concatenate([
self.sim.data.qpos.flat[1:],
self.sim.data.qvel.flat,
self.get_body_com("torso").flat,
]).astype(np.float32).flatten()
def viewer_setup(self):
camera_id = self.model.camera_name2id('track')
self.viewer.cam.type = 2
self.viewer.cam.fixedcamid = camera_id
self.viewer.cam.distance = self.model.stat.extent * 0.5
# Hide the overlay
self.viewer._hide_overlay = True
def reset_model(self):
qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
self.set_state(qpos, qvel)
return self._get_obs()
# -------- Gym Methods --------
def step(self, action):
xposbefore = self.sim.data.qpos[0]
self.do_simulation(action, self.frame_skip)
xposafter = self.sim.data.qpos[0]
forward_vel = (xposafter - xposbefore) / self.dt
forward_reward = self.goal_direction * forward_vel
ctrl_cost = 0.5 * 1e-1 * np.sum(np.square(action))
observation = self._get_obs()
reward = forward_reward - ctrl_cost
done = False
infos = dict(reward_forward=forward_reward,
reward_ctrl=-ctrl_cost, task=self._task)
return (observation, reward, done, infos)
def reset(self, *args, **kwargs):
MujocoEnv.reset(self, *args, **kwargs)
return self._get_obs()
def render(self, mode='human'):
if mode == 'rgb_array':
self._get_viewer(mode).render()
# window size used for old mujoco-py:
width, height = 500, 500
data = self._get_viewer(mode).read_pixels(width,
height,
depth=False)
return data
elif mode == 'human':
self._get_viewer(mode).render()
if __name__ == '__main__':
env = HalfCheetahForwardBackwardEnv()
for task in [env.get_task(), env.sample_tasks(1)[0]]:
env.set_task(task)
env.reset()
action = env.action_space.sample()
env.step(action)