In [1]:
%load_ext autoreload
%autoreload 2

In [22]:
from pathlib import Path
import datetime

import gymnasium as gym
import torch

from policy_based import PolicyNetFunc, PolicyNetTrainer, PolicyNetTester, PolicyNetTrainerWithBase, ValueNetFunc
from deep_q import Discrete1ContinuousAction
from env import Env, get_action_discreter
from utils import clear_target_path, show_gif_on_jupyternb, to_gif
from train_test_util import start_test, start_train, StandarTestProcess, StandarTrainProcess

### 使用CarPole-V1 环境，测试Policy-Based REINFORCE 算法

In [None]:
GYM_ENV_NAME = 'CartPole-v1'
env = Env.from_env_name(GYM_ENV_NAME)
RESULT_DIR_NAME='cartpoleV1'

LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/REINFORCE')
MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/REINFORCE.pth')
TEST_OUTPUT_PATH = Path(f'./run/test_result/{RESULT_DIR_NAME}/REINFORCE')

# 打印查看环境的动作空间和状态空间 
env.print_state_action_dims()


TRAIN_EPOCH = 1000
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
GAMMA = 0.99

_USE_CUDA = True and torch.cuda.is_available()
# _USE_CUDA = False and torch.cuda.is_available()

policy_func = PolicyNetFunc(env.get_state_dim()[0], 
                   env.get_action_dim()[0], 
                   hidden_dim=HIDDEN_DIM, 
                   device=torch.device('cuda') if _USE_CUDA else None)


policy_func_trainer = PolicyNetTrainer(policy_func=policy_func,
                                  env=env,
                                  learning_rate=LEARNING_RATE,
                                  gamma=GAMMA,
                                  logger_folder=LOG_PATH)

#### 训练模型

In [None]:
start_train(StandarTrainProcess(
    trainer=policy_func_trainer,
    model=policy_func,
    train_epoch=TRAIN_EPOCH,
    log_path=LOG_PATH,
    model_path=MODEL_PATH
))

#### 开始测试

In [None]:
test_policy_func = PolicyNetFunc.from_file(MODEL_PATH)
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

policy_func_tester = PolicyNetTester(
    policy_fun=test_policy_func,
    env=_render_env
)


start_test(
    StandarTestProcess(
        model=test_policy_func,
        tester=policy_func_tester,
        env=_render_env,
        test_output_path=TEST_OUTPUT_PATH,
        test_epoch=1000,
    )
)

### 使用CarPole-V1 环境，测试Policy-Based + 基线的 REINFORCE 算法

In [None]:
GYM_ENV_NAME = 'CartPole-v1'
RESULT_DIR_NAME='cartpoleV1'
env = Env.from_env_name(GYM_ENV_NAME)

LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/REINFORCE_With_BASE')
MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/REINFORCE_With_BASE.pth')
TEST_OUTPUT_PATH = Path(f'./run/test_result/{RESULT_DIR_NAME}/REINFORCE_With_BASE')

# 打印查看环境的动作空间和状态空间 
env.print_state_action_dims()


TRAIN_EPOCH = 1000
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
VALUE_LEARNING_RATE = 1e-3
GAMMA = 0.99

_USE_CUDA = True and torch.cuda.is_available()
# _USE_CUDA = False and torch.cuda.is_available()

value_func = ValueNetFunc(
                env.get_state_dim()[0], 
                hidden_dim=HIDDEN_DIM, 
                device=torch.device('cuda') if _USE_CUDA else None)

policy_func = PolicyNetFunc(env.get_state_dim()[0], 
                   env.get_action_dim()[0], 
                   hidden_dim=HIDDEN_DIM, 
                   device=torch.device('cuda') if _USE_CUDA else None)


policy_func_trainer = PolicyNetTrainerWithBase(
                                  policy_func=policy_func,
                                  value_func=value_func,
                                  value_learning_rate=LEARNING_RATE,
                                  env=env,
                                  learning_rate=VALUE_LEARNING_RATE,
                                  gamma=GAMMA,
                                  logger_folder=LOG_PATH)

#### 训练模型

In [None]:
start_train(StandarTrainProcess(
    trainer=policy_func_trainer,
    model=policy_func,
    train_epoch=TRAIN_EPOCH,
    log_path=LOG_PATH,
    model_path=MODEL_PATH
))

#### 测试模型

In [None]:
test_policy_func = PolicyNetFunc.from_file(MODEL_PATH)
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

policy_func_tester = PolicyNetTester(
    policy_fun=test_policy_func,
    env=_render_env
)


start_test(
    StandarTestProcess(
        model=test_policy_func,
        tester=policy_func_tester,
        env=_render_env,
        test_output_path=TEST_OUTPUT_PATH,
        test_epoch=1000,
    )
)

### 使用Pendulum-v1 环境，测试Policy-Based REINFORCE 算法

In [None]:
GYM_ENV_NAME = 'Pendulum-v1'
RESULT_DIR_NAME = 'pendulumV1'

env = Env.from_env_name(GYM_ENV_NAME)

LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/policy_based')
MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/policy_model.pth')
TEST_OUTPUT_PATH = Path(f'./run/test_result/{RESULT_DIR_NAME}_policy')

# 打印查看环境的动作空间和状态空间 
env.print_state_action_dims()

# 动作空间离散化程度（用11个区间来替代连续动作空间）
BINS = 11

TRAIN_EPOCH = 1000
HIDDEN_DIM = 512
LEARNING_RATE = 1e-3
GAMMA = 0.99

_USE_CUDA = True and torch.cuda.is_available()
# _USE_CUDA = False and torch.cuda.is_available()

policy_func = PolicyNetFunc(env.get_state_dim()[0], 
                   action_nums=BINS, 
                   hidden_dim=HIDDEN_DIM, 
                   device=torch.device('cuda') if _USE_CUDA else None)


policy_func_trainer = PolicyNetTrainer(policy_func=policy_func,
                                  env=env,
                                  learning_rate=LEARNING_RATE,
                                  gamma=GAMMA,
                                  logger_folder=LOG_PATH,
                                  action_converter=get_action_discreter(env, BINS))

#### 训练模型

In [None]:
start_train(StandarTrainProcess(
    trainer=policy_func_trainer,
    model=policy_func,
    train_epoch=TRAIN_EPOCH,
    log_path=LOG_PATH,
    model_path=MODEL_PATH
))

#### 开始测试

In [None]:
test_policy_func = PolicyNetFunc.from_file(MODEL_PATH)
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

policy_func_tester = PolicyNetTester(
    policy_fun=test_policy_func,
    env=_render_env,
    action_converter=get_action_discreter(env, BINS)
)


start_test(
    StandarTestProcess(
        model=test_policy_func,
        tester=policy_func_tester,
        env=_render_env,
        test_output_path=TEST_OUTPUT_PATH,
        test_epoch=1000,
    )
)

### 使用Pendulum-v1 环境，测试Policy-Based + 基线的 REINFORCE 算法

In [None]:
GYM_ENV_NAME = 'Pendulum-v1'
RESULT_DIR_NAME = 'pendulumV1'

env = Env.from_env_name(GYM_ENV_NAME)

LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/REINFORCE_With_BASE')
MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/REINFORCE_With_BASE.pth')
TEST_OUTPUT_PATH = Path(f'./run/test_result/{RESULT_DIR_NAME}/REINFORCE_With_BASE')

# 打印查看环境的动作空间和状态空间 
env.print_state_action_dims()

# 动作空间离散化程度（用11个区间来替代连续动作空间）
BINS = 11

TRAIN_EPOCH = 3000
HIDDEN_DIM = 512
LEARNING_RATE = 5e-4
VALUE_LEARNING_RATE = 5e-3
GAMMA = 0.999

_USE_CUDA = True and torch.cuda.is_available()
# _USE_CUDA = False and torch.cuda.is_available()

value_func = ValueNetFunc(
                env.get_state_dim()[0], 
                hidden_dim=HIDDEN_DIM, 
                device=torch.device('cuda') if _USE_CUDA else None)

policy_func = PolicyNetFunc(env.get_state_dim()[0], 
                   action_nums=BINS, 
                   hidden_dim=HIDDEN_DIM, 
                   device=torch.device('cuda') if _USE_CUDA else None)


policy_func_trainer = PolicyNetTrainerWithBase(
                                  policy_func=policy_func,
                                  value_func=value_func,
                                  value_learning_rate=VALUE_LEARNING_RATE,
                                  env=env,
                                  learning_rate=LEARNING_RATE,
                                  gamma=GAMMA,
                                  logger_folder=LOG_PATH,
                                  action_converter=get_action_discreter(env, BINS))

#### 训练模型

In [None]:
start_train(StandarTrainProcess(
    trainer=policy_func_trainer,
    model=policy_func,
    train_epoch=TRAIN_EPOCH,
    log_path=LOG_PATH,
    model_path=MODEL_PATH
))

#### 测试模型

In [None]:
test_policy_func = PolicyNetFunc.from_file(MODEL_PATH)
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

policy_func_tester = PolicyNetTester(
    policy_fun=test_policy_func,
    env=_render_env,
    action_converter=get_action_discreter(_render_env, BINS),
    stochastic=True
)


start_test(
    StandarTestProcess(
        model=test_policy_func,
        tester=policy_func_tester,
        env=_render_env,
        test_output_path=TEST_OUTPUT_PATH,
        test_epoch=1000,
    )
)

### 使用CarPole-V1 环境，测试Policy-Based AC 算法

In [7]:
from policy_based import PolicyValueNetTrainer, ActionStateValueNetFunc

GYM_ENV_NAME = 'CartPole-v1'
RESULT_DIR_NAME='cartpoleV1'
env = Env.from_env_name(GYM_ENV_NAME)

LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/AC')
MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/AC/AC.pth')
TEST_OUTPUT_PATH = Path(f'./run/test_result/{RESULT_DIR_NAME}/AC')

# 打印查看环境的动作空间和状态空间 
env.print_state_action_dims()

TRAIN_EPOCH = 3000
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
VLEARNING_RATE = 5e-3
GAMMA = 0.99

_USE_CUDA = True and torch.cuda.is_available()
# _USE_CUDA = False and torch.cuda.is_available()

policy_func = PolicyNetFunc(env.get_state_dim()[0], 
                   action_nums=env.get_action_dim()[0], 
                   hidden_dim=HIDDEN_DIM * 2, 
                   device=torch.device('cuda') if _USE_CUDA else None)

value_func = ActionStateValueNetFunc(env.get_state_dim()[0],
                          action_nums=env.get_action_dim()[0],
                          hidden_dim=HIDDEN_DIM,
                          device=torch.device('cuda') if _USE_CUDA else None)


policy_func_trainer = PolicyValueNetTrainer(
                                  policy_func=policy_func,
                                  value_func=value_func,
                                  vlearning_rate=VLEARNING_RATE,
                                  env=env,
                                  learning_rate=LEARNING_RATE,
                                  gamma=GAMMA,
                                  logger_folder=LOG_PATH
                                  )

[1mINFO    [0m | [36menv[0m: - [1maction: 2, space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)[0m; [32m2024-06-27 07:42:13[0m [36mprint_state_action_dims[0m:[36m68[0m


#### 开始训练

In [8]:
start_train(
    StandarTrainProcess(
        trainer=policy_func_trainer,
        model=policy_func,
        train_epoch=TRAIN_EPOCH,
        log_path=LOG_PATH,
        model_path=MODEL_PATH
    )
)

[1mINFO    [0m | [36mtrain_test_util[0m: - [1mstart training, now datetime: 2024-06-27 07:42:18.131471[0m; [32m2024-06-27 07:42:18[0m [36mstart_train[0m:[36m49[0m
[1mINFO    [0m | [36mtrain_test_util[0m: - [1mFirst, clean log path: run\logs\cartpoleV1\AC, and clean model path: run\model\cartpoleV1\AC\AC.pth[0m; [32m2024-06-27 07:42:18[0m [36mstart_train[0m:[36m50[0m
[1mINFO    [0m | [36mutils[0m: - [1mclear_target_path: run\logs\cartpoleV1\AC dose not exist[0m; [32m2024-06-27 07:42:18[0m [36mclear_target_path[0m:[36m38[0m
[1mINFO    [0m | [36mtrain_test_util[0m: - [1mtrain started[0m; [32m2024-06-27 07:42:18[0m [36mstart_train[0m:[36m55[0m


100%|██████████| 3000/3000 [01:06<00:00, 45.35it/s, reward=62.00, step=62]  

[1mINFO    [0m | [36mtrain_test_util[0m: - [1mend training, now datetime: 2024-06-27 07:43:24.283603[0m; [32m2024-06-27 07:43:24[0m [36mstart_train[0m:[36m57[0m
[1mINFO    [0m | [36mtrain_test_util[0m: - [1msaving model to: run\model\cartpoleV1\AC\AC.pth,[0m; [32m2024-06-27 07:43:24[0m [36mstart_train[0m:[36m63[0m





In [23]:
test_policy_func = PolicyNetFunc.from_file(MODEL_PATH)
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

policy_func_tester = PolicyNetTester(
    policy_fun=test_policy_func,
    env=_render_env
)


avg_reward = start_test(
    StandarTestProcess(
        model=test_policy_func,
        tester=policy_func_tester,
        env=_render_env,
        test_output_path=TEST_OUTPUT_PATH,
        test_epoch=100,
        show_result=False
    )
)
print(f'avg reward: {avg_reward}')

[1mINFO    [0m | [36mtrain_test_util[0m: - [1mstart testing, now datetime: 2024-06-27 08:04:45.300102, test_epoch: 100[0m; [32m2024-06-27 08:04:45[0m [36mstart_test[0m:[36m42[0m


100%|██████████| 100/100 [00:46<00:00,  2.14epoch/s]

[1mINFO    [0m | [36mtrain_test_util[0m: - [1mend testing, now datetime: 2024-06-27 08:05:32.049740[0m; [32m2024-06-27 08:05:32[0m [36mstart_test[0m:[36m46[0m
avg reward: 272.83





### 参数搜索： 使用CarPole-V1 环境，测试Policy-Based AC 算法

In [28]:
import optuna
from policy_based import PolicyValueNetTrainer, ActionStateValueNetFunc, EpochEndCallback
GYM_ENV_NAME = 'CartPole-v1'
RESULT_DIR_NAME='cartpoleV1'

_USE_CUDA = True and torch.cuda.is_available()
# _USE_CUDA = False and torch.cuda.is_available()

def objective(trial: optuna.Trial):
    t_number = trial.number
    env = Env.from_env_name(GYM_ENV_NAME)

    LOG_PATH = Path(f'./run/logs/{RESULT_DIR_NAME}/{t_number}/AC')
    MODEL_PATH = Path(f'./run/model/{RESULT_DIR_NAME}/AC_{t_number}/AC.pth')

    
    TRAIN_EPOCH = trial.suggest_int(name='train_epoch', low=1000, high=5000, step=1000)
    HIDDEN_DIM_POLICY = trial.suggest_categorical(name='HIDDEN_DIM_POLICY', choices=[128, 256, 512])
    HIDDEN_DIM_VALUE = trial.suggest_categorical(name='HIDDEN_DIM_VALUE', choices=[128, 256, 512])
    
    LEARNING_RATE = trial.suggest_float('p_learn_rate', low=5e-5, high=1e-1, log=True)
    VLEARNING_RATE = trial.suggest_float('v_learn_rate', low=5e-5, high=1e-1, log=True)
    GAMMA = trial.suggest_categorical(name='gamma', choices=[0.8, 0.9, 0.95, 0.99])


    policy_func = PolicyNetFunc(env.get_state_dim()[0], 
                       action_nums=env.get_action_dim()[0], 
                       hidden_dim=HIDDEN_DIM_POLICY,
                       device=torch.device('cuda') if _USE_CUDA else None)

    value_func = ActionStateValueNetFunc(env.get_state_dim()[0],
                              action_nums=env.get_action_dim()[0],
                              hidden_dim=HIDDEN_DIM_VALUE,
                              device=torch.device('cuda') if _USE_CUDA else None)

    def epoch_end_callback(epoch, avg_reward, policy):
        trial.report(avg_reward, epoch)
        if epoch > 1000 and avg_reward < 200:
            raise optuna.TrialPruned()

    policy_func_trainer = PolicyValueNetTrainer(
                                      policy_func=policy_func,
                                      value_func=value_func,
                                      vlearning_rate=VLEARNING_RATE,
                                      env=env,
                                      learning_rate=LEARNING_RATE,
                                      gamma=GAMMA,
                                      logger_folder=LOG_PATH,
                                      epoch_end_callback=epoch_end_callback
                                      )
    
    start_train(
        StandarTrainProcess(
            trainer=policy_func_trainer,
            model=policy_func,
            train_epoch=TRAIN_EPOCH,
            log_path=LOG_PATH,
            model_path=MODEL_PATH
        )
    )

    policy_func_tester = PolicyNetTester(
        policy_fun=policy_func,
        env=env
    )
    
    avg_reward = start_test(
        StandarTestProcess(
            model=test_policy_func,
            tester=policy_func_tester,
            env=env,
            test_output_path=TEST_OUTPUT_PATH,
            test_epoch=100,
            show_result=False
        )
    )

    return avg_reward


In [29]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-06-27 09:49:40,952] A new study created in memory with name: no-name-bca75f0d-469d-4261-bef3-db8017c8bebe


[1mINFO    [0m | [36mtrain_test_util[0m: - [1mstart training, now datetime: 2024-06-27 09:49:40.958028[0m; [32m2024-06-27 09:49:40[0m [36mstart_train[0m:[36m62[0m
[1mINFO    [0m | [36mtrain_test_util[0m: - [1mFirst, clean log path: run\logs\cartpoleV1\0\AC, and clean model path: run\model\cartpoleV1\AC_0\AC.pth[0m; [32m2024-06-27 09:49:40[0m [36mstart_train[0m:[36m63[0m
[1mINFO    [0m | [36mtrain_test_util[0m: - [1mtrain started[0m; [32m2024-06-27 09:49:40[0m [36mstart_train[0m:[36m68[0m


 50%|█████     | 1001/2000 [00:13<00:13, 73.51it/s, reward=9.00, step=9] 

[31m[1mERROR   [0m | [36mtrain_test_util[0m: - [31m[1merror occured: [0m; [32m2024-06-27 09:49:54[0m [36mstart_train[0m:[36m74[0m
[33m[1mTraceback (most recent call last):[0m

  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "f:\conda\envs\quant\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
    │   └ <bound method Application.launch_instance of <class 'ipykernel.kernelapp.IPKernelApp'>>
    └ <module 'ipykernel.kernelapp' from 'f:\\conda\\envs\\quant\\Lib\\site-packages\\ipykernel\\kernelapp.py'>
  File "f:\conda\envs\quant\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
    │   └ <function IPKernelApp.start at 0x0000028BC44C2A20>
    └ <ipykernel.kernelapp.IPKernelApp object at 0x0000028BBF22E450>
  File "f:\conda\envs\quant\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()



  0%|          | 0/100 [00:00<?, ?epoch/s]
[W 2024-06-27 09:49:54,589] Trial 0 failed with parameters: {'train_epoch': 2000, 'HIDDEN_DIM_POLICY': 128, 'HIDDEN_DIM_VALUE': 256, 'p_learn_rate': 0.0046705415200356416, 'v_learn_rate': 9.441997456665225e-05, 'gamma': 0.95} because of the following error: RuntimeError('Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)').
Traceback (most recent call last):
  File "f:\conda\envs\quant\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\29000\AppData\Local\Temp\ipykernel_1248\2355348299.py", line 67, in objective
    avg_reward = start_test(
                 ^^^^^^^^^^^
  File "f:\ws\rf_learning\train_test_util.py", line 44, in start_test
    reward, _ = test_process.tester.test(1000)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)