In [1]:
import numpy as np
from gym_unbalanced_disk import UnbalancedDisk
import gymnasium as gym
from gymnasium import spaces

class AC_UnbalancedDisk(UnbalancedDisk):
    def __init__(self, umax=3., dt = 0.025, render_mode='human'):
        super().__init__(umax=umax, dt=dt, render_mode=render_mode)

        #self.actions_values = np.linspace(-self.umax, self.umax, self.num_actions)
        #self.action_space = spaces.Discrete(self.num_actions)
        self.target = np.pi
        low = [-np.pi,-40] 
        high = [np.pi,40]
        self.observation_space = spaces.Box(low=np.array(low,dtype=np.float32),high=np.array(high,dtype=np.float32),shape=(2,))
    
    def step(self,action):
        #discrete_action = self.actions_values[np.abs(action - self.actions_values).argmin()]
        obs, reward, terminated, truncated, info = super().step(action)

        ### setup our reward
        th = obs[0]
        omega = obs[1]
        norm_theta = (th)%(2*np.pi) #- np.pi
        error = abs(self.target-abs(norm_theta))
        if abs(norm_theta)< np.pi/2:
           reward = np.min([-0.5, -5 + abs(omega)])
        else:
           reward = abs(norm_theta)**3/(1 + abs(omega)*2)
        #             if abs(error)> np.pi/4:
        #    reward = 0.1*np.sign(error)*omega
        #else:
        # reward = - norm_theta**2 -0.1*(omega / (1 + 10*abs(norm_theta)))**2 - 0.01*action**2# try np.exp 
        ### discritize obs if we use Q learning
        return obs, reward, terminated, truncated, info
    
    def reset(self, seed=None, options=None):
        obs, info = super().reset()

        return obs, info

In [2]:
from gymnasium.wrappers import TimeLimit
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes

In [None]:
env = AC_UnbalancedDisk()
env = TimeLimit(env, max_episode_steps=500)
env = Monitor(env)
stop_cb = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

model = PPO(
    policy='MlpPolicy',
    env=env,
    learning_rate=1e-3,
    verbose=1,
)

model.learn(
    total_timesteps=1_000_000,  
    callback=stop_cb,
)

Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 2.38e+04 |
| time/              |          |
|    fps             | 905      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 500         |
|    ep_rew_mean          | 2.68e+04    |
| time/                   |             |
|    fps                  | 665         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005776522 |
|    clip_fraction        | 0.0653      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.41       |
|    explained_varia

In [43]:
env = AC_UnbalancedDisk()
obs, _ = env.reset()
for i in range(5000):
    action, _states = model.predict(obs)  # policy
    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    t = (obs[0] + np.pi)%(2*np.pi) 
    print( f'theta = {t: .4f}, omega: {obs[1]: .4f}')
    if terminated or truncated:
        obs, _ = env.reset()
    
env.close()

theta =  3.1412, omega: -0.2649
theta =  3.1340, omega: -0.1672
theta =  3.1311, omega: -0.0987
theta =  3.1323, omega:  0.1480
theta =  3.1309, omega: -0.1320
theta =  3.1274, omega: -0.1519
theta =  3.1237, omega: -0.1695
theta =  3.1189, omega: -0.2659
theta =  3.1163, omega:  0.0334
theta =  3.1179, omega: -0.0200
theta =  3.1158, omega:  0.0026
theta =  3.1159, omega: -0.1299
theta =  3.1095, omega: -0.3568
theta =  3.1006, omega: -0.1856
theta =  3.0981, omega: -0.0624
theta =  3.0938, omega: -0.3239
theta =  3.0850, omega: -0.2973
theta =  3.0816, omega: -0.0265
theta =  3.0812, omega:  0.0103
theta =  3.0785, omega: -0.2069
theta =  3.0756, omega: -0.1076
theta =  3.0713, omega: -0.0345
theta =  3.0734, omega:  0.1144
theta =  3.0764, omega: -0.0002
theta =  3.0751, omega:  0.0003
theta =  3.0750, omega: -0.0012
theta =  3.0782, omega:  0.2012
theta =  3.0792, omega: -0.0047
theta =  3.0789, omega: -0.0430
theta =  3.0783, omega:  0.0001
theta =  3.0780, omega:  0.0297
theta = 