In [55]:
import numpy as np
from gym_unbalanced_disk import UnbalancedDisk
import gymnasium as gym
from gymnasium import spaces
class AC_UnbalancedDisk1(UnbalancedDisk):
    def __init__(self, umax=3., dt=0.025, render_mode='human'):
        super().__init__(umax=umax, dt=dt, render_mode=render_mode)

        self.target = np.pi
        low = [-np.pi, -40]
        high = [np.pi, 40]
        self.observation_space = spaces.Box(
            low=np.array(low, dtype=np.float32),
            high=np.array(high, dtype=np.float32),
            shape=(2,)
        )

        self.recent_omegas = []

    def step(self, action):
        obs, reward, terminated, truncated, info = super().step(action)

        th = obs[0]
        omega = obs[1]

        # Normalize angle so π maps to 0
        theta = ((th - np.pi) % (2 * np.pi)) - np.pi

        # Update buffer of recent omega values
        self.recent_omegas.append(omega)
        if len(self.recent_omegas) > 10:
            self.recent_omegas.pop(0)

        # Base reward structure
        if abs(theta) < np.pi / 2:
            reward = min(-0.5, -5 + abs(omega))
        elif abs(theta) > np.pi / 2 and abs(theta) < 3 * np.pi / 4:
            reward = abs(theta)**2 / (1 + abs(omega))**1
        elif abs(theta) > 3 * np.pi / 4 and abs(theta) < 11 * np.pi / 12:
            reward = abs(theta)**4 / (1 + abs(omega))**2

            # Add anti-stall penalty
            # Stall detection: angular velocity near zero for several steps
            if all(abs(w) < 0.005 for w in self.recent_omegas):
                reward = 1 / (1 + abs(omega))
        else:
            reward = abs(theta)**4 / (1 + abs(omega))**2
             

        return obs, reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        obs, info = super().reset()
        self.recent_omegas = []  # Reset history
        return obs, info


In [None]:


class AC_UnbalancedDisk2(UnbalancedDisk):
    def __init__(self, umax=3., dt = 0.025, render_mode='human'):
        super().__init__(umax=umax, dt=dt, render_mode=render_mode)

        #self.actions_values = np.linspace(-self.umax, self.umax, self.num_actions)
        #self.action_space = spaces.Discrete(self.num_actions)
        self.target = np.pi
        low = [-np.pi,-40] 
        high = [np.pi,40]
        self.observation_space = spaces.Box(low=np.array(low,dtype=np.float32),high=np.array(high,dtype=np.float32),shape=(2,))
    
    def step(self, action):
        obs, reward, terminated, truncated, info = super().step(action)

        th = obs[0]
        omega = obs[1]

        # Normalize angle so π maps to 0
        th_norm = abs(((th - np.pi) % (2 * np.pi)) - np.pi)
        omaga_norm = abs(omega)
        reward = th_norm - np.cos(th_norm) / (1 + omaga_norm)


        #             if abs(error)> np.pi/4:
        #    reward = 0.1*np.sign(error)*omega
        #else:
        # reward = - norm_theta**2 -0.1*(omega / (1 + 10*abs(norm_theta)))**2 - 0.01*action**2# try np.exp 
        ### discritize obs if we use Q learning
        return obs, reward, terminated, truncated, info
    
    def reset(self, seed=None, options=None):
        obs, info = super().reset()

        return obs, info

In [17]:
import numpy as np
from gym_unbalanced_disk import UnbalancedDisk
import gymnasium as gym
from gymnasium import spaces

class AC_UnbalancedDisk3(UnbalancedDisk):
    def __init__(self, umax=3., dt = 0.025, render_mode='human'):
        super().__init__(umax=umax, dt=dt, render_mode=render_mode)

        #self.actions_values = np.linspace(-self.umax, self.umax, self.num_actions)
        #self.action_space = spaces.Discrete(self.num_actions)
        self.target = np.pi
        low = [-np.pi,-40] 
        high = [np.pi,40]
        self.observation_space = spaces.Box(low=np.array(low,dtype=np.float32),high=np.array(high,dtype=np.float32),shape=(2,))
    
    def step(self, action):
        obs, reward, terminated, truncated, info = super().step(action)

        th = obs[0]
        omega = obs[1]


        # Normalize angle so π maps to 0
        th_norm = abs(((th - np.pi) % (2 * np.pi)) - np.pi)
        omaga_norm = abs(omega)


        if th_norm < 0.5*np.pi:
            reward = min(-0.5, -7.5 + omaga_norm)
        elif th_norm > 0.5*np.pi and th_norm < 0.75 * np.pi:
            reward = th_norm - np.cos(th_norm) /(1 + omaga_norm)
        else:
            reward = th_norm**2 - np.cos(th_norm) / (1 + omaga_norm)**2

 
        #             if abs(error)> np.pi/4:
        #    reward = 0.1*np.sign(error)*omega
        #else:
        # reward = - norm_theta**2 -0.1*(omega / (1 + 10*abs(norm_theta)))**2 - 0.01*action**2# try np.exp 
        ### discritize obs if we use Q learning
        return obs, reward, terminated, truncated, info
    
    def reset(self, seed=None, options=None):
        obs, info = super().reset()

        return obs, info

In [42]:
from gymnasium.wrappers import TimeLimit
from stable_baselines3 import PPO, SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes

In [61]:
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnMaxEpisodes
from stable_baselines3.common.env_util import make_vec_env
from gymnasium.wrappers import TimeLimit

# Training environment
env = AC_UnbalancedDisk1()
env = TimeLimit(env, max_episode_steps=500)
env = Monitor(env)

# Separate eval environment for unbiased performance tracking
eval_env = AC_UnbalancedDisk1()
eval_env = TimeLimit(eval_env, max_episode_steps=500)
eval_env = Monitor(eval_env)

# Stop after 100 episodes
stop_cb = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

# Save best model based on mean reward
eval_cb = EvalCallback(
    eval_env,
    best_model_save_path="./best_sac_model",
    log_path="./logs",
    eval_freq=5000,
    deterministic=True,
    render=False
)

# Chain both callbacks
from stable_baselines3.common.callbacks import CallbackList
callback = CallbackList([stop_cb, eval_cb])

# Model
model_sac1 = SAC(
    policy='MlpPolicy',
    env=env,
    learning_rate=1e-3,
    verbose=1,
    ent_coef=1e-2,
)

# Train
model_sac1.learn(
    total_timesteps=1_000_000,
    callback=callback,
)


Using cuda device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -574     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 225      |
|    time_elapsed    | 8        |
|    total_timesteps | 2000     |
| train/             |          |
|    actor_loss      | 11.9     |
|    critic_loss     | 1.92     |
|    ent_coef        | 0.01     |
|    learning_rate   | 0.001    |
|    n_updates       | 1899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -758     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 216      |
|    time_elapsed    | 18       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | 14.8     |
|    critic_loss     | 9.21  

<stable_baselines3.sac.sac.SAC at 0x7fb54e543520>

In [63]:
# To use the best-performing model
from stable_baselines3 import SAC
model_sac1 = SAC.load("./best_sac_model/best_model.zip")


env = AC_UnbalancedDisk1()
obs, _ = env.reset()
for i in range(5000):
    action, _states = model_sac1.predict(obs)  # policy
    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    t = obs[0]
    print( f'theta = {t: .4f}, omega: {obs[1]: .4f}')
    if terminated or truncated:
        obs, _ = env.reset()
    
env.close()

theta =  0.0228, omega:  1.9035
theta =  0.0927, omega:  3.5962
theta =  0.1999, omega:  4.9519
theta =  0.3361, omega:  5.8830
theta =  0.4910, omega:  6.3474
theta =  0.6507, omega:  6.3596
theta =  0.8045, omega:  5.9737
theta =  0.9469, omega:  5.2750
theta =  1.0651, omega:  4.3542
theta =  1.1631, omega:  3.2974
theta =  1.1801, omega: -1.8024
theta =  1.0747, omega: -6.5376
theta =  0.8566, omega: -10.8625
theta =  0.5376, omega: -14.4713
theta =  0.1425, omega: -16.9825
theta = -0.2992, omega: -18.1089
theta = -0.7525, omega: -17.8661
theta = -1.1854, omega: -16.6162
theta = -1.5784, omega: -14.9024
theta = -1.9279, omega: -13.2116
theta = -2.2322, omega: -10.9666
theta = -2.4600, omega: -7.4735
theta = -2.6301, omega: -6.1460
theta = -2.7518, omega: -3.7968
theta = -2.8559, omega: -4.5403
theta = -2.9353, omega: -1.8701
theta = -2.9984, omega: -3.1365
theta = -3.0523, omega: -1.2779
theta = -3.0840, omega: -1.3745
theta = -3.1040, omega: -0.2771
theta = -3.1068, omega:  0.0301

In [35]:
env = AC_UnbalancedDisk2()
env = TimeLimit(env, max_episode_steps=500)
env = Monitor(env)
stop_cb = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

model_sac2 = SAC(
    policy='MlpPolicy',
    env=env,
    learning_rate=1e-3,
    verbose=1,
)

model_sac2.learn(
    total_timesteps=1_000_000,  
    callback=stop_cb,
)

Using cuda device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 424      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 219      |
|    time_elapsed    | 9        |
|    total_timesteps | 2000     |
| train/             |          |
|    actor_loss      | -296     |
|    critic_loss     | 4.43     |
|    ent_coef        | 0.231    |
|    ent_coef_loss   | -0.421   |
|    learning_rate   | 0.001    |
|    n_updates       | 1899     |
---------------------------------


KeyboardInterrupt: 

In [None]:
env = AC_UnbalancedDisk2()
obs, _ = env.reset()
for i in range(5000):
    action, _states = model_sac2.predict(obs)  # policy
    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    t = obs[0]
    print( f'theta = {t: .4f}, omega: {obs[1]: .4f}')
    if terminated or truncated:
        obs, _ = env.reset()
    
env.close()

theta =  0.0224, omega:  1.6995
theta =  0.0852, omega:  3.3604
theta =  0.1850, omega:  4.7154
theta =  0.3173, omega:  5.6453
theta =  0.4637, omega:  6.0301
theta =  0.6140, omega:  6.1004
theta =  0.7659, omega:  5.7709
theta =  0.9025, omega:  5.1099
theta =  1.0164, omega:  4.0338
theta =  1.1049, omega:  3.0502
theta =  1.1196, omega: -1.8312
theta =  1.0167, omega: -6.3278
theta =  0.8033, omega: -10.5396
theta =  0.4995, omega: -13.6730
theta =  0.1245, omega: -16.1026
theta = -0.2939, omega: -17.1227
theta = -0.7208, omega: -16.9262
theta = -1.1328, omega: -15.7441
theta = -1.5024, omega: -13.9472
theta = -1.8280, omega: -12.2727
theta = -2.1162, omega: -10.7332
theta = -2.3566, omega: -8.6161
theta = -2.5666, omega: -8.2060
theta = -2.7724, omega: -8.3129
theta = -2.9854, omega: -8.9274
theta = -3.2149, omega: -9.4522
theta = -3.4715, omega: -11.3057
theta = -3.7785, omega: -13.3034
theta = -4.1501, omega: -16.7499
theta = -4.6234, omega: -20.8037
theta = -5.1933, omega: -24

In [19]:
env = AC_UnbalancedDisk3()
env = TimeLimit(env, max_episode_steps=500)
env = Monitor(env)
stop_cb = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

model_sac3 = SAC(
    policy='MlpPolicy',
    env=env,
    learning_rate=1e-3,
    verbose=1,
)

model_sac3.learn(
    total_timesteps=1_000_000,  
    callback=stop_cb,
)

Using cuda device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -90.4    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 219      |
|    time_elapsed    | 9        |
|    total_timesteps | 2000     |
| train/             |          |
|    actor_loss      | -355     |
|    critic_loss     | 9.26     |
|    ent_coef        | 0.359    |
|    ent_coef_loss   | 0.319    |
|    learning_rate   | 0.001    |
|    n_updates       | 1899     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 501      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 217      |
|    time_elapsed    | 18       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | -695  

<stable_baselines3.sac.sac.SAC at 0x7fb56d7058a0>

In [20]:
env = AC_UnbalancedDisk3()
obs, _ = env.reset()
for i in range(5000):
    action, _states = model_sac3.predict(obs)  # policy
    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    t = obs[0]
    print( f'theta = {t: .4f}, omega: {obs[1]: .4f}')
    if terminated or truncated:
        obs, _ = env.reset()
    
env.close()

theta = -0.0242, omega: -1.8812
theta = -0.0921, omega: -3.5407
theta = -0.1969, omega: -4.8572
theta = -0.3312, omega: -5.7147
theta = -0.4789, omega: -5.9764
theta = -0.6289, omega: -5.9880
theta = -0.7746, omega: -5.5471
theta = -0.8543, omega: -0.8385
theta = -0.8190, omega:  3.5135
theta = -0.6819, omega:  7.4565
theta = -0.4525, omega:  10.8259
theta = -0.1494, omega:  13.0650
theta =  0.1973, omega:  14.4913
theta =  0.5655, omega:  14.7520
theta =  0.9235, omega:  13.8823
theta =  1.2523, omega:  12.2693
theta =  1.5387, omega:  10.6017
theta =  1.7808, omega:  8.7660
theta =  1.9810, omega:  7.3901
theta =  2.1504, omega:  6.1903
theta =  2.2884, omega:  4.9188
theta =  2.4071, omega:  4.4268
theta =  2.5105, omega:  3.9894
theta =  2.6100, omega:  4.0258
theta =  2.7139, omega:  4.2789
theta =  2.8223, omega:  4.6279
theta =  2.9481, omega:  5.3426
theta =  3.0686, omega:  4.4651
theta =  3.1842, omega:  4.8324
theta =  3.2933, omega:  3.9498
theta =  3.4097, omega:  5.5820
t