In [50]:
import torch
from stable_baselines3 import PPO
import gymnasium as gym
import numpy as np
import imageio
import subprocess
import shutil
import time
import os
import tqdm

In [47]:
class CustomRewardWrapper(gym.Wrapper):
    """Replace the wrapped environment's reward with a backflip-shaping reward.

    The reward returned by the wrapped environment is discarded. Instead,
    ``step`` computes a reward from the observation and action that favours
    backward rotation, and subtracts a fixed penalty on the step where the
    episode terminates.

    Observation indices used (assumes the 11-element Hopper observation with
    the x position excluded -- confirm if the environment config changes):
        obs[0]   height of the robot
        obs[5]   velocity in the x direction
        obs[7]   angular velocity of the robot
        obs[8]   angular velocity of the hip
        obs[9]   angular velocity of the knee
        obs[10]  angular velocity of the ankle

    Action indices used:
        action[0]  torque applied to the hip
        action[1]  torque applied to the knee
        action[2]  torque applied to the ankle

    Args:
        env: The environment to wrap.
        fall_penalty: Amount subtracted from the reward on the step where the
            environment reports termination. Defaults to 10.
    """

    def __init__(self, env, fall_penalty=10):
        super().__init__(env)
        self.fall_penalty = fall_penalty

    def step(self, action):
        """Step the wrapped environment and substitute the custom reward.

        Args:
            action: Three joint torques (hip, knee, ankle).

        Returns:
            The ``(obs, reward, terminated, truncated, info)`` tuple from the
            wrapped environment, with ``reward`` replaced by the custom value.
        """
        # The environment's own reward is intentionally ignored.
        obs, _env_reward, terminated, truncated, info = self.env.step(action)

        # Reward shaping noted in the original code as "used by openai".
        backroll = -obs[7]    # negated angular velocity of the robot
        height = obs[0]
        # Sum over the three joints of (applied torque * joint angular velocity).
        vel_act = action[0] * obs[8] + action[1] * obs[9] + action[2] * obs[10]
        backslide = -obs[5]   # negated x velocity

        # backroll is the base term; the other terms only scale it, so no
        # reward is earned without rotation.
        custom_reward = backroll * (1.0 + .3 * height + .1 * vel_act + .05 * backslide)

        # Heavy penalty for falling. Only termination is penalised, not
        # time-limit truncation.
        if terminated:
            custom_reward -= self.fall_penalty

        return obs, custom_reward, terminated, truncated, info

In [48]:
# Seed for PPO so that a Restart & Run All reproduces the same training run.
training_seed = 0

# Hopper environment settings.
healthy_reward = 1
# Only the lower bound on height is kept, and the angle range is unbounded,
# presumably so that rotating through a flip does not end the episode.
healthy_z_range = (0.2, float("inf"))
healthy_angle_range = (-float("inf"), float("inf"))
reset_noise_scale = 5e-3
# Keeps obs[0] as the height, which is the layout CustomRewardWrapper indexes into.
exclude_current_positions_from_observation = True

env = gym.make(
    'Hopper-v4',
    render_mode='rgb_array',
    healthy_reward=healthy_reward,
    healthy_z_range=healthy_z_range,
    healthy_angle_range=healthy_angle_range,
    reset_noise_scale=reset_noise_scale,
    exclude_current_positions_from_observation=exclude_current_positions_from_observation,
)
# Swap the built-in reward for the custom backflip reward.
env = CustomRewardWrapper(env)

model = PPO("MlpPolicy", env, verbose=1, seed=training_seed)

# Train the model
n_learning_steps = 10_000_000
model.learn(total_timesteps=n_learning_steps)

# Save the model (written as "hopper_model.zip")
model.save("hopper_model")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 144      |
|    ep_rew_mean     | 380      |
| time/              |          |
|    fps             | 1103     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 169         |
|    ep_rew_mean          | 406         |
| time/                   |             |
|    fps                  | 652         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009120392 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss   

In [54]:
# Roll out the trained policy for one episode and record it to an MP4 file.
vec_env = model.get_env()
obs = vec_env.reset()

# Upper bound on recorded steps; the loop stops earlier when the episode ends.
N_step = 100000

# The context manager closes (and finalises) the video file even if stepping
# or rendering raises part-way through the rollout.
with imageio.get_writer('hopper-flip.mp4', fps=50) as writer:
    for i in tqdm.tqdm(range(N_step)):
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)
        # `done` is an array with one entry per environment in the VecEnv.
        if done.any():
            # VecEnv resets automatically, so rendering now would show the
            # first frame of the next episode; stop without appending it.
            print("Episode finished after {} timesteps".format(i+1))
            break
        writer.append_data(vec_env.render("rgb_array"))

  0%|          | 386/100000 [00:05<24:17, 68.36it/s]


Episode finished after 387 timesteps


In [None]:
# Archive this run: copy the notebook, the rendered video and the saved model
# into Run/, Render/ and Model/ under the current working directory, with the
# run tag appended to each file name.
d = "backflip_1"  # run tag appended to the archived file names
cwd = os.getcwd()

current_py_file = os.path.join(cwd, 'hopper.ipynb')
new_py_file = os.path.join(cwd, 'Run', 'hopper%s.ipynb' % d)
video_copy = os.path.join(cwd, 'Render', 'hopper-flip%s.mp4' % d)
model_copy = os.path.join(cwd, 'Model', 'hopper_model%s.zip' % d)

# shutil.copyfile does not create missing parent directories, so make sure
# each destination folder exists first.
for destination in (new_py_file, video_copy, model_copy):
    os.makedirs(os.path.dirname(destination), exist_ok=True)

shutil.copyfile(current_py_file, new_py_file)
shutil.copyfile('hopper-flip.mp4', video_copy)
shutil.copyfile('hopper_model.zip', model_copy)