In [290]:
import os
import shutil
import subprocess
import time

import gymnasium as gym
import imageio
import numpy as np
import matplotlib.pyplot as plt
import torch
import tqdm
from stable_baselines3 import PPO

from utils import gauss, inv_hea, hea

In [291]:
class CustomRewardWrapper(gym.Wrapper):
    def __init__(self, env):
        super(CustomRewardWrapper, self).__init__(env)

    def step(self, action):
        obs, reward, done, truncated, info = self.env.step(action)
        # Access relevant state and action variables
        
        #state
        mapping = {"z": 0, "a": 1, "a_hip": 2, "a_knee": 3, "a_ankle": 4, "v_x": 5, "v_z": 6, "a_d": 7, "a_hip_d": 8, "a_knee_d": 9, "a_ankle_d": 10}
        z = obs[0]                  # Height of the robot
        a = obs[1]                  # Angle of the robot
        a_hip = obs[2]              # Angle of the hip
        a_knee = obs[3]             # Angle of the knee
        a_ankle = obs[4]            # Angle of the ankle
        v_x = obs[5]                # Velocity in x direction
        v_z = obs[6]                # Velocity in z direction
        a_d = obs[7]                # Angular velocity 
        a_hip_d = obs[8]            # Angular velocity of the hip
        a_knee_d = obs[9]           # Angular velocity of the knee
        a_ankle_d = obs[10]         # Angular velocity of the ankle
        
        #action
        torque_hip = action[0]      # Torque applied to the hip
        torque_knee = action[1]     # Torque applied to the knee
        torque_ankle = action[2]    # Torque applied to the ankle

        #vel_act = action[0] * obs[8] + obs[9] * action[1] + action[2] * obs[10]

        #different criteria for reward
        energy_used = np.sum(np.square(action))  # Simplistic energy calculation

        # Custom reward logic
        get_straight = 1/np.sum(np.abs([a_hip, a_knee, a_ankle]))  # Reward for getting straight

        height = obs[0]
        vel_act = - 2 * torque_hip * a_hip + torque_knee * a_knee
        backslide = -obs[5]
        if a<2*np.pis: 
            spin = a_d 
        else: 
            spin = 0  
        custom_reward = - 2 * a_d * (1.0 + 18 * z + .6 * vel_act + .8 * backslide) - 28 * abs(a_hip)
        
        custom_reward += 1000 * np.exp((height)**4) * (inv_hea(a, -2*np.pi) + hea(a, -2*np.pi) * gauss(a, -2*np.pi, 0.5*2*np.pi)) * gauss(a_hip, 0, 0.5*2*np.pi) * gauss(a_knee, 0, 0.5*2*np.pi) * gauss(a_ankle, 0, 0.5*2*np.pi)

        # custom_reward += inv_hea(a, -0.9*2*np.pi) * obs[0] * 10
        
        if done:
            custom_reward -= 20  # Heavy penalty for falling

        return obs, custom_reward, done, truncated, info

In [292]:
d = time.strftime("%Y-%m-%d_%H-%M-%S")

healthy_reward = 1
healthy_z_range = (0.2, float("inf"))
healthy_angle_range = (-float("inf"), float("inf"))
reset_noise_scale = 5e-3
exclude_current_positions_from_observation = True

env = gym.make('Hopper-v4', render_mode='rgb_array', healthy_reward=healthy_reward, healthy_z_range=healthy_z_range, healthy_angle_range=healthy_angle_range, reset_noise_scale=reset_noise_scale, exclude_current_positions_from_observation=exclude_current_positions_from_observation)
env = CustomRewardWrapper(env)

model = PPO("MlpPolicy", env, verbose=1)

# Train the model
n_learning_steps = 5_000_000
model.learn(total_timesteps=n_learning_steps)

# Save the model
model.save("hopper_model")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 148      |
|    ep_rew_mean     | 8.08e+03 |
| time/              |          |
|    fps             | 1095     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 141          |
|    ep_rew_mean          | 8.32e+03     |
| time/                   |              |
|    fps                  | 571          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0065062377 |
|    clip_fraction        | 0.0543       |
|    clip_range           | 0.2          |
|    en

  obs, self.buf_rews[env_idx], terminated, truncated, self.buf_infos[env_idx] = self.envs[env_idx].step(


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 99           |
|    ep_rew_mean          | -2.68e+83    |
| time/                   |              |
|    fps                  | 347          |
|    iterations           | 15           |
|    time_elapsed         | 88           |
|    total_timesteps      | 30720        |
| train/                  |              |
|    approx_kl            | 0.0028959415 |
|    clip_fraction        | 0.0114       |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.24        |
|    explained_variance   | 0.0551       |
|    learning_rate        | 0.0003       |
|    loss                 | 2.73e+06     |
|    n_updates            | 140          |
|    policy_gradient_loss | -0.00361     |
|    std                  | 0.992        |
|    value_loss           | 5.89e+06     |
------------------------------------------


  last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam


ValueError: Expected parameter loc (Tensor of shape (64, 3)) of distribution Normal(loc: torch.Size([64, 3]), scale: torch.Size([64, 3])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan],
        [nan, nan, nan]], grad_fn=<AddmmBackward0>)

In [None]:
model.save("hopper_model")
vec_env = model.get_env()
obs = vec_env.reset()

writer = imageio.get_writer('hopper-flip.mp4', fps=50)


N_step = 1000
s_a = np.zeros((N_step, 14))

for i in range(N_step):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    writer.append_data(vec_env.render("rgb_array"))
    s_a[i, :] = np.concatenate((obs[0], action[0]))
    #VecEnv resets automatically
    if done:
      N_stop = i
      print("Episode finished after {} timesteps".format(i+1))
      break
      obs = vec_env.reset()

writer.close()

#truncate the array
s_a = s_a[:N_stop, :]

def plot(map: str="z"):
  plt.figure(map)
  plt.plot(s_a[:, mapping[map]], label=map)
  plt.legend()
  plt.show()
  
plot("z")
plot("a")
plot("a_hip")
plot("a_knee")
plot("a_ankle")
plot("v_x")
plot("v_z")
plot("a_d")
plot("a_hip_d")
plot("a_knee_d")
plot("a_ankle_d")

ValueError: Expected parameter loc (Tensor of shape (1, 3)) of distribution Normal(loc: torch.Size([1, 3]), scale: torch.Size([1, 3])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan]])

In [None]:
name = d

folder = 'c:/Users/killi/Documents/code/Hopper-4-flip'

shutil.copyfile('hopper.ipynb', folder + '/Run/hopper_%s.ipynb'%(name))
shutil.copyfile('hopper-flip.mp4', folder + '/Render/hopper-flip_%s.mp4'%(name))
shutil.copyfile('hopper_model.zip', folder + '/Model/hopper_model_%s.zip'%(name))

'c:/Users/killi/Documents/code/Hopper-4-flip/Model/hopper_model_2024-08-12_09-14-37.zip'