In [30]:
import torch
from stable_baselines3 import PPO
import gymnasium as gym
import numpy as np
from imageio import mimsave

In [31]:
class CustomRewardWrapper(gym.Wrapper):
    """Replace the environment's reward with a hand-shaped locomotion reward.

    The shaped reward is::

        velocity_weight * forward_velocity
        + height_weight * height
        - energy_weight * sum(action ** 2)
        - fall_penalty            (only on the step where the episode terminates)

    The reward returned by the wrapped environment is discarded.

    Args:
        env: Environment to wrap. Its observation must be indexable by integer.
        velocity_index: Position of the forward (x) velocity in the observation.
            Defaults to 5, which is the torso x-velocity in the Gymnasium Hopper
            observation layout (with the default
            ``exclude_current_positions_from_observation=True``) -- confirm
            against the environment docs when wrapping a different env.
        height_index: Position of the torso height in the observation.
            Defaults to 0 (torso z-coordinate for Hopper, same caveat as above).
        velocity_weight: Multiplier for the forward-velocity term.
        height_weight: Multiplier for the height term.
        energy_weight: Multiplier for the squared-action (energy) penalty.
        fall_penalty: Amount subtracted when the episode terminates.
    """

    def __init__(
        self,
        env,
        velocity_index=5,
        height_index=0,
        velocity_weight=1.0,
        height_weight=0.5,
        energy_weight=0.1,
        fall_penalty=10.0,
    ):
        super().__init__(env)
        self.velocity_index = velocity_index
        self.height_index = height_index
        self.velocity_weight = velocity_weight
        self.height_weight = height_weight
        self.energy_weight = energy_weight
        self.fall_penalty = fall_penalty

    def step(self, action):
        """Step the wrapped env and return the same 5-tuple with the shaped reward.

        Returns:
            ``(obs, custom_reward, terminated, truncated, info)`` where everything
            except ``custom_reward`` is passed through unchanged.
        """
        obs, _env_reward, terminated, truncated, info = self.env.step(action)

        forward_velocity = obs[self.velocity_index]
        height = obs[self.height_index]
        # Simplistic energy proxy: squared magnitude of the control signal.
        energy_used = np.sum(np.square(action))

        custom_reward = (
            self.velocity_weight * forward_velocity   # reward moving forward
            + self.height_weight * height             # encourage staying high
            - self.energy_weight * energy_used        # penalize energy consumption
        )

        # Only `terminated` is penalized: `truncated` (e.g. a time limit) is not
        # a failure of the agent.
        if terminated:
            custom_reward -= self.fall_penalty

        # Return a plain Python float rather than a NumPy scalar.
        return obs, float(custom_reward), terminated, truncated, info

In [None]:
# Train PPO on Hopper with the custom reward wrapper and save the result.

# Tunable settings for this cell.
SEED = 0               # fixed seed so the training run is reproducible
# NOTE(review): PPO collects a full rollout (n_steps, 2048 by default) before it
# checks the budget, so a value this small still runs one whole rollout and one
# update -- confirm against the stable-baselines3 docs and raise it for real
# training.
TOTAL_TIMESTEPS = 10

# Use the wrapper with the Hopper environment
env = gym.make('Hopper-v4', render_mode='rgb_array')
env = CustomRewardWrapper(env)

model = PPO("MlpPolicy", env, verbose=1, seed=SEED)

# Train the model
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Save the model (written as "hopper.zip" in the working directory)
model.save("hopper")

In [29]:
# Roll out the trained policy for a few steps and save the frames as a GIF.
# NOTE(review): this cell needs `model` from the training cell -- run that first.
vec_env = model.get_env()
obs = vec_env.reset()

render = []

N_step = 10
for i in range(N_step):
    # Grab the frame *before* stepping. The VecEnv resets a sub-environment
    # automatically as soon as it is done, so rendering after the step would
    # record the freshly reset state instead of the rollout.
    render.append(vec_env.render("rgb_array"))

    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)

    # `done` holds one flag per sub-environment; stop at the first finished episode.
    if done.any():
        break

# NOTE(review): the unit of `duration` (seconds vs. milliseconds per frame)
# depends on the installed imageio version -- confirm the playback speed.
mimsave('hopper_render.gif', render, duration=1)

(480, 480, 3)
