In [1]:
from pathlib import Path
import datetime

import gymnasium as gym
import torch

from policy_based import PolicyNetFunc, PolicyNetTrainer, PolicyNetTester
from deep_q import Discrete1ContinuousAction
from env import Env
from utils import clear_target_path, show_gif_on_jupyternb, to_gif

### Lab1 Policy-Based Train

使用 CartPole-v1 环境，测试 Policy-Based REINFORCE 算法

In [None]:
# --- Environment -------------------------------------------------------------
# Build the Gymnasium environment and hand it to the project's Env wrapper.
GYM_ENV_NAME = 'CartPole-v1'
_train_gym_env = gym.make(GYM_ENV_NAME)
env = Env(_train_gym_env)

# Print the action count and the state (observation) space for inspection.
action_nums = _train_gym_env.action_space.n
state_space = _train_gym_env.observation_space
print(f'action num: {action_nums}, space: {state_space}')

# --- Hyperparameters ---------------------------------------------------------
# TRAIN_EPOCH is consumed by the training cell; the others are passed to the
# policy network / trainer below (GAMMA presumably being the discount factor).
TRAIN_EPOCH = 1000
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
GAMMA = 0.99

# --- Output locations --------------------------------------------------------
LOG_PATH = Path('./run/logs/cartpoleV1/REINFORCE')
MODEL_PATH = Path('./run/model/cartpoleV1/REINFORCE.pth')
TEST_OUTPUT_PATH = Path('./run/test_result/cartpoleV1_REINFORCE')

# --- Device ------------------------------------------------------------------
# Manual switch: change the literal True to False to skip CUDA even when a GPU
# is available.
_USE_CUDA = True and torch.cuda.is_available()
# Without CUDA the device is left as None, so PolicyNetFunc applies its own
# default placement.
_device = torch.device('cuda') if _USE_CUDA else None

# --- Policy network and trainer ----------------------------------------------
# The network input width is the first dimension of the observation space and
# its output width is the number of discrete actions.
policy_func = PolicyNetFunc(
    state_space.shape[0],
    action_nums,
    hidden_dim=HIDDEN_DIM,
    device=_device,
)

policy_func_trainer = PolicyNetTrainer(
    policy_func=policy_func,
    env=env,
    learning_rate=LEARNING_RATE,
    gamma=GAMMA,
    logger_folder=LOG_PATH,
)

#### 训练模型

In [None]:
# Reset the log folder and the checkpoint path before a fresh run
# (clear_target_path is a project helper; presumably it removes whatever
# already exists at the given path).
for _stale_path in (LOG_PATH, MODEL_PATH):
    clear_target_path(_stale_path)

# Timestamps are printed before and after training so the wall-clock duration
# can be read from the cell output.
print(f'start training, now datetime: {datetime.datetime.now()}')
policy_func_trainer.train(train_epoch=TRAIN_EPOCH)
print(f'end training, saving model to: {MODEL_PATH}, now datetime: {datetime.datetime.now()}')

# Persist the trained policy so the test cell can reload it from MODEL_PATH.
policy_func.save(MODEL_PATH)

#### 开始测试

In [None]:
# Reload the policy saved by the training cell.
test_policy_func = PolicyNetFunc.from_file(MODEL_PATH)

# A separate environment is created for evaluation, using Gymnasium's
# 'rgb_array_list' render mode so rendered frames are available for the GIF.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

# The tester receives the policy after it has been moved to the CPU.
policy_func_tester = PolicyNetTester(policy_fun=test_policy_func.to('cpu'), env=_render_env)

# Clear any previous GIF, run the test, then export the recorded frames.
RESULT_GIF = TEST_OUTPUT_PATH / 'result.gif'
clear_target_path(RESULT_GIF)
# NOTE(review): the meaning of 1000 (steps vs. episodes) is defined by
# PolicyNetTester.test, which is not visible here.
policy_func_tester.test(1000)
# The GIF is built from the wrapper's underlying gym env; 1/30 is presumably
# the per-frame interval in seconds (i.e. 30 fps) - confirm against to_gif.
to_gif(_render_env._gym_env, RESULT_GIF, 1/30)

show_gif_on_jupyternb(RESULT_GIF)

### 使用Pendulum-v1 环境，测试Policy-Based REINFORCE 算法

In [4]:
# --- Environment -------------------------------------------------------------
GYM_ENV_NAME = 'Pendulum-v1'
RESULT_DIR_NAME = 'pendulumV1'

_train_gym_env = gym.make(GYM_ENV_NAME)
env = Env(_train_gym_env)

# --- Output locations (all keyed by RESULT_DIR_NAME) -------------------------
LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/policy_based')
MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/policy_model.pth')
TEST_OUTPUT_PATH = Path(f'./run/test_result/{RESULT_DIR_NAME}_policy')

# Print the action space and the state (observation) space for inspection.
action_space = _train_gym_env.action_space
state_space = _train_gym_env.observation_space
print(f'action: {action_space}, space: {state_space}')

# --- Hyperparameters ---------------------------------------------------------
# Granularity of the action discretisation: the continuous action range is
# replaced by BINS discrete choices, which is also the policy's output width.
BINS = 100

TRAIN_EPOCH = 10000
HIDDEN_DIM = 512
LEARNING_RATE = 1e-3
GAMMA = 0.99

# --- Device ------------------------------------------------------------------
# Manual switch: change the literal True to False to skip CUDA even when a GPU
# is available.
_USE_CUDA = True and torch.cuda.is_available()
# Without CUDA the device is left as None, so PolicyNetFunc applies its own
# default placement.
_device = torch.device('cuda') if _USE_CUDA else None

# --- Policy network and trainer ----------------------------------------------
policy_func = PolicyNetFunc(
    state_space.shape[0],
    action_nums=BINS,
    hidden_dim=HIDDEN_DIM,
    device=_device,
)

# The converter is built from the action space's low/high bounds and BINS; the
# trainer uses it to translate between the policy's discrete outputs and the
# environment's continuous action.
policy_func_trainer = PolicyNetTrainer(
    policy_func=policy_func,
    env=env,
    learning_rate=LEARNING_RATE,
    gamma=GAMMA,
    logger_folder=LOG_PATH,
    action_converter=Discrete1ContinuousAction(action_space.low, action_space.high, BINS),
)

action: Box(-2.0, 2.0, (1,), float32), space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)


#### 训练模型

In [5]:
# Reset the log folder and the checkpoint path before a fresh run
# (clear_target_path is a project helper; presumably it removes whatever
# already exists at the given path).
for _stale_path in (LOG_PATH, MODEL_PATH):
    clear_target_path(_stale_path)

# NOTE(review): the output saved with this cell is a ValueError from
# torch.distributions.Categorical - the tensor passed as `probs` contains
# negative values and so is not a valid probability simplex. The policy output
# probably needs a softmax (or should be passed as `logits`); that code lives
# in the policy_based module and is not visible from this notebook.
print(f'start training, now datetime: {datetime.datetime.now()}')
policy_func_trainer.train(train_epoch=TRAIN_EPOCH)
print(f'end training, saving model to: {MODEL_PATH}, now datetime: {datetime.datetime.now()}')

# Persist the trained policy so the test cell can reload it from MODEL_PATH.
policy_func.save(MODEL_PATH)

start training, now datetime: 2024-06-23 19:03:50.556291


  0%|          | 0/10000 [00:00<?, ?it/s]


ValueError: Expected parameter probs (Tensor of shape (1, 100)) of distribution Categorical(probs: torch.Size([1, 100])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[-5.3898,  2.3251,  0.6364,  1.5145, -7.3999,  1.2428, -4.6396, -3.7236,
          3.8088,  0.8192,  0.1155, -4.4931,  2.4001,  1.2866,  2.9567,  5.5142,
          5.5776,  4.4540,  0.8457, -0.0687,  3.7172, -7.5980, -4.9409, -3.8109,
         -2.1625, -0.0636, -4.1293,  1.0067,  2.0914, -0.0776,  0.4372, -4.9335,
          2.4915, -0.1473, -3.7097, -0.0342,  0.4190, -0.5307, -0.1253,  5.7843,
         -2.7124,  3.8801,  1.0071, -1.6388, -3.2328,  3.0431, -4.4419, -0.1577,
          1.2578, -1.8795,  2.2531,  1.3166,  5.1002,  5.4249, -1.1794,  1.3867,
         -1.8710,  6.1590, -0.5850,  0.8934, -3.3684,  0.0735, -2.1041, -4.4007,
         -0.0197,  1.8454,  6.8711, -1.5800,  2.2488,  7.7269,  3.7603, -0.8571,
          3.3215,  5.7406,  2.3631, -3.0928,  2.6089,  0.4041, -4.0942,  0.4256,
         -3.3377, -7.4192,  0.7133,  0.7641, -0.8953, -5.0373, -3.0433,  0.6991,
         -2.4090, -2.6097,  0.3464,  5.2790, -0.4483, -5.9656,  2.4049,  2.2071,
         -1.3429,  0.8588, -0.1596, -2.9672]], grad_fn=<DivBackward0>)

#### 开始测试

In [None]:
# Reload the policy saved by the training cell.
test_policy_func = PolicyNetFunc.from_file(MODEL_PATH)

# A separate environment is created for evaluation, using Gymnasium's
# 'rgb_array_list' render mode so rendered frames are available for the GIF.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

# The tester gets the CPU copy of the policy plus an action converter built
# from the same bounds and BINS as the one given to the trainer.
policy_func_tester = PolicyNetTester(
    policy_fun=test_policy_func.to('cpu'),
    env=_render_env,
    action_converter=Discrete1ContinuousAction(action_space.low, action_space.high, BINS),
)

# Clear any previous GIF, run the test, then export the recorded frames.
RESULT_GIF = TEST_OUTPUT_PATH / 'result.gif'
clear_target_path(RESULT_GIF)
# NOTE(review): the meaning of 1000 (steps vs. episodes) is defined by
# PolicyNetTester.test, which is not visible here.
policy_func_tester.test(1000)
# The GIF is built from the wrapper's underlying gym env; 1/30 is presumably
# the per-frame interval in seconds (i.e. 30 fps) - confirm against to_gif.
to_gif(_render_env._gym_env, RESULT_GIF, 1/30)

show_gif_on_jupyternb(RESULT_GIF)

### 使用Pendulum-v1 环境，测试Policy-Based AC 算法

In [2]:
# Actor-critic specific classes; imported here so this section can run right
# after the notebook's first (imports) cell.
from policy_based import PolicyValueNetTrainer, ValueNetFunc

# --- Environment -------------------------------------------------------------
GYM_ENV_NAME = 'Pendulum-v1'
RESULT_DIR_NAME = 'pendulumV1'

_train_gym_env = gym.make(GYM_ENV_NAME)
env = Env(_train_gym_env)

# --- Output locations (all keyed by RESULT_DIR_NAME) -------------------------
LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/AC')
MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/AC.pth')
TEST_OUTPUT_PATH = Path(f'./run/test_result/{RESULT_DIR_NAME}_AC')

# Print the action space and the state (observation) space for inspection.
action_space = _train_gym_env.action_space
state_space = _train_gym_env.observation_space
print(f'action: {action_space}, space: {state_space}')

# --- Hyperparameters ---------------------------------------------------------
# Granularity of the action discretisation: the continuous action range is
# replaced by BINS discrete choices, which is also the output width requested
# from both networks below.
BINS = 100

TRAIN_EPOCH = 10000
HIDDEN_DIM = 512
LEARNING_RATE = 1e-3
GAMMA = 0.99

# --- Device ------------------------------------------------------------------
# Manual switch: change the literal True to False to skip CUDA even when a GPU
# is available.
_USE_CUDA = True and torch.cuda.is_available()
# Without CUDA the device is left as None, so each network applies its own
# default placement.
_device = torch.device('cuda') if _USE_CUDA else None

# --- Networks ----------------------------------------------------------------
# Policy and value networks are configured identically: same input width,
# same action_nums, same hidden size, same device.
policy_func = PolicyNetFunc(
    state_space.shape[0],
    action_nums=BINS,
    hidden_dim=HIDDEN_DIM,
    device=_device,
)

value_func = ValueNetFunc(
    state_space.shape[0],
    action_nums=BINS,
    hidden_dim=HIDDEN_DIM,
    device=_device,
)

# --- Trainer -----------------------------------------------------------------
# The converter is built from the action space's low/high bounds and BINS; the
# trainer uses it to translate between the policy's discrete outputs and the
# environment's continuous action.
policy_func_trainer = PolicyValueNetTrainer(
    policy_func=policy_func,
    value_func=value_func,
    env=env,
    learning_rate=LEARNING_RATE,
    gamma=GAMMA,
    logger_folder=LOG_PATH,
    action_converter=Discrete1ContinuousAction(action_space.low, action_space.high, BINS),
)

action: Box(-2.0, 2.0, (1,), float32), space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)


#### 开始训练

In [3]:
# Reset the log folder and the checkpoint path before a fresh run
# (clear_target_path is a project helper; presumably it removes whatever
# already exists at the given path).
for _stale_path in (LOG_PATH, MODEL_PATH):
    clear_target_path(_stale_path)

# NOTE(review): the output saved with this cell is a ValueError from
# torch.distributions.Categorical - the tensor passed as `probs` contains
# negative values and so is not a valid probability simplex. The policy output
# probably needs a softmax (or should be passed as `logits`); that code lives
# in the policy_based module and is not visible from this notebook.
print(f'start training, now datetime: {datetime.datetime.now()}')
policy_func_trainer.train(train_epoch=TRAIN_EPOCH)
print(f'end training, saving model to: {MODEL_PATH}, now datetime: {datetime.datetime.now()}')

# Only the policy network is written to MODEL_PATH; the value network is not
# saved by this cell.
policy_func.save(MODEL_PATH)

start training, now datetime: 2024-06-23 18:57:12.444726


  0%|          | 0/10000 [00:00<?, ?it/s]


ValueError: Expected parameter probs (Tensor of shape (1, 100)) of distribution Categorical(probs: torch.Size([1, 100])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[ 0.2406,  0.0961,  0.1337,  0.0782, -0.1154, -0.1222,  0.1802,  0.0244,
          0.0349,  0.0377,  0.0848,  0.1287, -0.0695, -0.1559,  0.1158,  0.1033,
          0.0769, -0.0256, -0.1743, -0.0521, -0.1410,  0.0686, -0.1318,  0.0248,
          0.0215,  0.1120,  0.0047,  0.0177, -0.0195,  0.0919,  0.1800, -0.2526,
         -0.0118, -0.0618,  0.0461,  0.0384, -0.1582, -0.1590,  0.0863, -0.1202,
          0.1166, -0.1542,  0.1339,  0.0280, -0.0096, -0.0305,  0.2425,  0.0803,
          0.1145, -0.0994,  0.0407,  0.0984, -0.0208, -0.1882,  0.0378,  0.1055,
          0.0508,  0.3263, -0.1178,  0.1769,  0.0296,  0.0965, -0.0600,  0.0566,
         -0.0968,  0.1186,  0.0347, -0.0248,  0.0909, -0.2014, -0.0301,  0.1220,
          0.0784,  0.0249,  0.0017, -0.2301, -0.1467,  0.0487, -0.1127,  0.0840,
          0.0855,  0.0126, -0.1214,  0.2169, -0.1897, -0.0806,  0.1497,  0.2023,
          0.0372, -0.1580,  0.1418,  0.0940, -0.0623, -0.0199,  0.0118, -0.0523,
         -0.2351,  0.0395, -0.1983,  0.0548]], grad_fn=<DivBackward0>)