In [26]:
import torch
from stable_baselines3 import PPO
import gymnasium as gym
import numpy as np
import imageio
import subprocess
import shutil
import time
import os

In [27]:
class CustomRewardWrapper(gym.Wrapper):
    def __init__(self, env):
        super(CustomRewardWrapper, self).__init__(env)

    def step(self, action):
        obs, reward, done, truncated, info = self.env.step(action)
        # Access relevant state and action variables
        
        #state
        z = obs[0]                  # Height of the robot
        a = obs[1]                  # Angle of the robot
        a_hip = obs[2]              # Angle of the hip
        a_knee = obs[3]             # Angle of the knee
        a_ankle = obs[4]            # Angle of the ankle
        v_x = obs[5]                # Velocity in x direction
        v_z = obs[6]                # Velocity in z direction
        a_d = obs[7]                # Angular velocity 
        a_hip_d = obs[8]            # Angular velocity of the hip
        a_knee_d = obs[9]           # Angular velocity of the knee
        a_ankle_d = obs[10]         # Angular velocity of the ankle
        
        #action
        torque_hip = action[0]      # Torque applied to the hip
        torque_knee = action[1]     # Torque applied to the knee
        torque_ankle = action[2]    # Torque applied to the ankle

        #different criteria for reward
        energy_used = np.sum(np.square(action))  # Simplistic energy calculation

        # Custom reward logic
        custom_reward = z * np.exp(-a**2/(np.pi)) * 6               # Reward for starting jump
        custom_reward += z                                          # Reward for height
        custom_reward += a_d * np.exp(-(a-np.pi)**2/4*np.pi) * 1    # Reward for angular velocity
        custom_reward += 4*np.exp(-(a-(2*np.pi))**2/(np.pi))        # Reward for 2pi angle
        custom_reward += np.tanh(a)                                 # Penalty for <0 angle
        #custom_reward -= energy_used * 0.01                        # Penalize energy consumption

        # used by openai
        # backroll = -obs[7]
        # height = obs[0]
        # vel_act = action[0] * obs[8] + action[1] * obs[9] + a[2] * obs[10]
        # backslide = -obs[5]
        # reward = backroll * (1.0 + .3 * height + .1 * vel_act + .05 * backslide)

        if done:
            custom_reward -= 10  # Heavy penalty for falling

        return obs, custom_reward, done, truncated, info

In [28]:
healthy_reward = 0.8
healthy_z_range = (0.2, float("inf"))
healthy_angle_range = (-np.pi/4, 2*np.pi+np.pi/4)
reset_noise_scale = 5e-3
exclude_current_positions_from_observation = True

env = gym.make('Hopper-v4', render_mode='rgb_array', healthy_reward=healthy_reward, healthy_z_range=healthy_z_range, healthy_angle_range=healthy_angle_range, reset_noise_scale=reset_noise_scale, exclude_current_positions_from_observation=exclude_current_positions_from_observation)
env = CustomRewardWrapper(env)

model = PPO("MlpPolicy", env, verbose=1)

# Train the model
n_learning_steps = 100000
model.learn(total_timesteps=n_learning_steps)

# Save the model
model.save("hopper")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 41.3     |
|    ep_rew_mean     | 314      |
| time/              |          |
|    fps             | 1037     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 49.3        |
|    ep_rew_mean          | 370         |
| time/                   |             |
|    fps                  | 598         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009576555 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.2         |
|    entropy_loss   

In [29]:
vec_env = model.get_env()
obs = vec_env.reset()

writer = imageio.get_writer('hopper-flip.mp4', fps=100)

N_step = 100000
for i in range(N_step):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    print(obs[0][1])
    writer.append_data(vec_env.render("rgb_array"))
    #VecEnv resets automatically
    if done:
      print("Episode finished after {} timesteps".format(i+1))
      break
      obs = vec_env.reset()

writer.close()

# d = time.strftime("%Y-%m-%d-%H-%M-%S")
# current_py_file = os.getcwd() + '/hopper.ipynb'
# new_py_file = os.getcwd() + '/Run/hopper%s.ipynb'%(d)

# shutil.copyfile(current_py_file, new_py_file)
# shutil.copyfile('hopper-flip.mp4', os.getcwd()+'/Render/hopper-flip%s.mp4'%(d))

0.0011022820125121899
0.0055418588787869575
0.00780128211424761
0.00879094441893703
0.009166328996638922
0.009280786554898168
0.009316108723978506
0.009358128894187719
0.010055983907488441
0.011614361698354789
0.013758390389411849
0.016353808641497224
0.019332061688851553
0.02265204037624813
0.026282712103252334
0.030195129700486985
0.03433296849385307
0.03860129338130815
0.04291755550962702
0.04724122611281982
0.05154354980120861
0.055800347017192935
0.05999022192791768
0.06409406431372579
0.06809483749442585
0.07197740056919695
0.07572830551898556
0.07933555695040269
0.0827883451177526
0.08607676658944524
0.08919154934217871
0.09212380448122023
0.09486481981606905
0.0974059079388082
0.09973831655984947
0.10185320201407998
0.1037416635744702
0.10539483000333226
0.1068039878743404
0.10796073904528809
0.10885717310400905
0.10948603936308562
0.10984090477041282
0.10993266346895465
0.11031478584355309
0.1113735075383734
0.11316181827889901
0.1155241608957194
0.11829096659876298
0.12132446

In [30]:
np.pi/2

1.5707963267948966