# Фреймворк для RL и новые методы

Рассмотрим работу методов DDPG, TD3 и SAC, реализованных в библиотеке `stable_baselines3`.

In [1]:
# Import the library and show its version string as the cell output.
import stable_baselines3
stable_baselines3.__version__

'2.5.0'

Сравним работу алгоритмов DDPG и TD3 на примере среды `Pendulum`.

In [10]:
import os
import gymnasium as gym
from stable_baselines3 import DDPG, TD3, SAC

from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [12]:
# Both agents log to the same TensorBoard directory.
log_path = os.path.join('logs', 'SB3_DDPG_TD3_Pendulum')

# Separate archive path for each trained agent.
save_path_DDPG = os.path.join('savedModels', 'SB3_DDPG_Pendulum')
save_path_TD3 = os.path.join('savedModels', 'SB3_TD3_Pendulum')

# A single environment instance is handed to both agents.
env = gym.make("Pendulum-v1")

# Same constructor arguments for both algorithms; DDPG is built first, then TD3.
model_DDPG, model_TD3 = (
    algo_cls(policy='MlpPolicy', env=env, tensorboard_log=log_path)
    for algo_cls in (DDPG, TD3)
)

In [14]:
# Print a ready-to-paste TensorBoard command that points at the log
# directory resolved against the current working directory.
full_logs_path = os.path.join(os.getcwd(), log_path)
print(f'tensorboard --logdir="{full_logs_path}"')

tensorboard --logdir="C:\Users\AlexK\Documents\Python Scripts\RL\logs\SB3_DDPG_TD3_Pendulum"


In [16]:
# Train the DDPG agent for 30000 timesteps with a progress bar.
model_DDPG.learn(total_timesteps=30000, progress_bar=True)

Output()

<stable_baselines3.ddpg.ddpg.DDPG at 0x1f98c0bd9a0>

In [28]:
# Store the DDPG agent at the path prepared for it.
model_DDPG.save(save_path_DDPG)

In [17]:
# Train the TD3 agent for 30000 timesteps with a progress bar (same budget as DDPG).
model_TD3.learn(total_timesteps=30000, progress_bar=True)

Output()

<stable_baselines3.td3.td3.TD3 at 0x1f9868d3fb0>

In [30]:
# Store the TD3 agent at the path prepared for it.
model_TD3.save(save_path_TD3)

In [34]:
# Restore both agents from their saved archives and attach the current
# environment to each of them.
model_DDPG = DDPG.load(path=save_path_DDPG, env=env)
model_TD3 = TD3.load(path=save_path_TD3, env=env)

In [24]:
# Evaluate the DDPG agent over 100 episodes without rendering.
mean_reward, std_reward = evaluate_policy(
    model=model_DDPG,
    env=env,
    n_eval_episodes=100,
    render=False,
)

# Report the mean and the standard deviation returned by the evaluation.
for label, value in (
    ("Средний доход: ", mean_reward),
    ("Стандартное отклонение: ", std_reward),
):
    print(label, value)

Средний доход:  -139.8770171607705
Стандартное отклонение:  70.22451910749129


In [25]:
# Evaluate the TD3 agent over 100 episodes without rendering.
mean_reward, std_reward = evaluate_policy(
    model=model_TD3,
    env=env,
    n_eval_episodes=100,
    render=False,
)

# Report the mean and the standard deviation returned by the evaluation.
for label, value in (
    ("Средний доход: ", mean_reward),
    ("Стандартное отклонение: ", std_reward),
):
    print(label, value)

Средний доход:  -141.01297798662404
Стандартное отклонение:  70.56964080995978


### Пример

Рассмотрим среду `Humanoid-v5`, которая моделирует шагающего человека. См. подробности по ссылке:

https://gymnasium.farama.org/environments/mujoco/humanoid/

Цель в том, чтобы научить человечка двигаться как можно дольше. Состояние — это вектор размерности 348, действие — это 17-мерный вектор.

Запустим симуляцию со случайными действями.

In [40]:
import os
import gymnasium as gym
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [42]:
# Run one episode of Humanoid-v5 with randomly sampled actions and print the
# episode return collected by the RecordEpisodeStatistics wrapper.
env = gym.make('Humanoid-v5', render_mode='human')
# The second argument limits the statistics buffer to the latest episode.
env = gym.wrappers.RecordEpisodeStatistics(env, 1)

try:
    env.reset()
    episode_over = False
    while not episode_over:
        # The wrapper accumulates the return itself, so only the
        # end-of-episode flags are read from each step.
        _, _, terminated, truncated, _ = env.step(env.action_space.sample())
        episode_over = terminated or truncated

    print("Средний доход: ", np.mean(env.return_queue))
finally:
    # Release the environment and its render window even when the episode
    # is interrupted (e.g. by a kernel interrupt).
    env.close()

Средний доход:  141.45276010607796


In [54]:
import os
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Output locations for the SAC experiment.
save_path = os.path.join('savedModels', 'SB3_SAC_Humanoid')
# Directory passed to EvalCallback as `best_model_save_path`; a later cell
# loads `best_model` from it.
save_path_best = os.path.join('savedModels', 'SB3_SAC_Humanoid')
log_path = os.path.join('logs', 'SB3_SAC_Humanoid')


# Parallel training environments.
vec_env = make_vec_env("Humanoid-v5", n_envs=4)

# Dedicated evaluation environment. The previously used `env` was created
# with render_mode='human' and has already been closed, so it must not be
# reused for periodic evaluation.
eval_env = make_vec_env("Humanoid-v5", n_envs=1)

# Callbacks: stop training once the evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=4000, verbose=1)
# `eval_freq` counts callback calls, and each call advances every parallel
# environment by one step; dividing by the number of environments keeps the
# evaluation period at 20000 timesteps.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=max(20000 // vec_env.num_envs, 1),
                             best_model_save_path=save_path_best,
                             verbose=1)

In [56]:
# Four parallel Humanoid-v5 environments feed one SAC agent with an MLP
# policy; training metrics go to the TensorBoard log directory.
vec_env = make_vec_env(env_id="Humanoid-v5", n_envs=4)
model = SAC(policy='MlpPolicy', env=vec_env, tensorboard_log=log_path)

In [58]:
from torchsummary import summary

# Print a layer-by-layer summary of the policy network. The input width is
# taken from the model's observation space, so the summary stays correct if
# the environment (and therefore the observation size) changes.
summary(model.policy, input_size=(1, *model.observation_space.shape))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 348]               0
  FlattenExtractor-2                  [-1, 348]               0
            Linear-3                  [-1, 256]          89,344
              ReLU-4                  [-1, 256]               0
            Linear-5                  [-1, 256]          65,792
              ReLU-6                  [-1, 256]               0
            Linear-7                   [-1, 17]           4,369
            Linear-8                   [-1, 17]           4,369
             Actor-9                   [-1, 17]               0
Total params: 163,874
Trainable params: 163,874
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.63
Estimated Total Size (MB): 0.64
-------------------------------------------

In [60]:
# Print a ready-to-paste TensorBoard command that points at the log
# directory resolved against the current working directory.
full_logs_path = os.path.join(os.getcwd(), log_path)
print(f'tensorboard --logdir="{full_logs_path}"')

tensorboard --logdir="C:\Users\AlexK\Documents\Python Scripts\RL\logs\SB3_SAC_Humanoid"


In [62]:
# Train the SAC agent for 1500 timesteps with the evaluation callback attached and a progress bar.
model.learn(total_timesteps=1500, callback=eval_callback, progress_bar=True)

Output()

<stable_baselines3.sac.sac.SAC at 0x1f998e5c380>

In [64]:
# Store the SAC agent at the path prepared for it.
model.save(save_path)

In [13]:
# Load the `best_model` archive from the best-model directory and bind it to
# a fresh Humanoid-v5 environment.
load_path = os.path.join(save_path_best, 'best_model')
env = gym.make('Humanoid-v5')
model = SAC.load(path=load_path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Evaluate the loaded SAC agent over 100 episodes without rendering and
# print the mean reward.
env = gym.make('Humanoid-v5')
try:
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, render=False)
    print(mean_reward)
finally:
    # Close the environment even if the evaluation fails or is interrupted.
    env.close()

In [66]:
# Play a single rendered episode with the loaded SAC agent and print its
# reward.
env = gym.make('Humanoid-v5', render_mode='human')
try:
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=1, render=True)
    print(mean_reward)
finally:
    # Close the render window even if the episode fails or is interrupted.
    env.close()