In [1]:
import shutil
from pathlib import Path

import gymnasium as gym
import torch

from deep_q import (
    DeepQFunc,
    DeepQFuncTester,
    DeepQFuncTrainer,
    Discrete1ContinuousAction,
    DoubleQFuncTrainer,
    ReplayBuffer,
)
from env import Env


#### 使用CartPole-v1 环境，测试简单的Deep Q 如何处理连续的State空间和离散的Action空间

In [None]:
# ---- Environment ----
GYM_ENV_NAME = 'CartPole-v1'
_train_gym_env = gym.make(GYM_ENV_NAME)

# Print the number of discrete actions and the observation space; both are
# used below to size the Q-network.
action_nums = _train_gym_env.action_space.n
state_space = _train_gym_env.observation_space
print(f'action num: {action_nums}, space: {state_space}')

# ---- Hyper-parameters ----
TRAIN_EPOCH = 300
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
GAMMA = 0.99

# Epsilon-greedy schedule: geometric decay from START_EPSILON, floored at
# END_EPSILON, with one value for each index in range(TRAIN_EPOCH).
START_EPSILON = 0.5
END_EPSILON = 0.05
DECAY_RATE = 0.99
EPSILON_LIST = [
    max(START_EPSILON * (DECAY_RATE ** epoch), END_EPSILON)
    for epoch in range(TRAIN_EPOCH)
]

# ---- Logging ----
# Delete any directory already present at the log path before training.
log_path = Path('./logs/run2')
if log_path.exists():
    shutil.rmtree(log_path)

# ---- Device ----
# The leading literal short-circuits the expression, so this is always False
# as written; change it to True to use CUDA when torch reports it available.
_USE_CUDA = False and torch.cuda.is_available()

# ---- Q-network, environment wrapper, replay buffer, trainer ----
q_func = DeepQFunc(
    state_space.shape[0],
    action_nums,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

env = Env(_train_gym_env)

replay_buffer = ReplayBuffer(10_000)
q_func_trainer = DeepQFuncTrainer(
    q_func=q_func,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
)


In [None]:
# Train the CartPole agent. The minimal replay size is ten times the
# batch_size of 64 that was passed to the trainer.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_replay_size_to_train=10 * 64,
    target_q_update_freq=10,
)

#### 开始测试

In [None]:
# A second environment instance created with render_mode='human', used only
# for testing.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='human'))

# The tester receives the trained network after calling .to('cpu') on it.
q_func_tester = DeepQFuncTester(q_func=q_func.to('cpu'), env=_render_env)

In [None]:
# Run the tester built in the previous cell on the rendering environment.
# NOTE(review): 2000 is presumably a step limit -- confirm against
# DeepQFuncTester.test.
q_func_tester.test(2000)

In [None]:
# Save the trained CartPole network under ./model.
# mkdir(parents=True, exist_ok=True) creates the directory together with any
# missing parents and does nothing when the directory already exists, so no
# separate exists() check (and its check-then-create race) is needed.
p = Path('./model')
p.mkdir(parents=True, exist_ok=True)

# Build the checkpoint path from the directory so the two cannot drift apart.
q_func.save(p / 'trained_for_cartpole.pth')

In [None]:
# Build a fresh network with the same constructor arguments used for the
# CartPole model, then load the checkpoint file written by the save cell.
# NOTE: state_space and HIDDEN_DIM are read from the notebook namespace and
# are reassigned by the Pendulum section further down, so run this cell
# before that section.
q_func_from_checkpoint = DeepQFunc(
    state_space.shape[0],
    action_nums,
    hidden_dim=HIDDEN_DIM,
)
q_func_from_checkpoint.load(Path('./model/trained_for_cartpole.pth'))

In [None]:
# Rebind the tester to the network loaded from the checkpoint, reusing the
# rendering environment created earlier in the notebook.
q_func_tester = DeepQFuncTester(q_func=q_func_from_checkpoint.to('cpu'), env=_render_env)

In [None]:
# Run the tester that now wraps the checkpoint-loaded network.
# NOTE(review): 2000 is presumably a step limit -- confirm against
# DeepQFuncTester.test.
q_func_tester.test(2000)

#### 使用Pendulum-v1 测试Double Q Learning对Q值系统性高估的处理能力

In [2]:
# ---- Environment ----
GYM_ENV_NAME = 'Pendulum-v1'
_train_gym_env = gym.make(GYM_ENV_NAME)

# Print the action space and the observation space for inspection.
action_space = _train_gym_env.action_space
state_space = _train_gym_env.observation_space
print(f'action: {action_space}, space: {state_space}')

# Number of discrete bins; used both as the Q-network's second positional
# argument and as the bin count of the action converter below.
BINS = 11

# ---- Hyper-parameters ----
TRAIN_EPOCH = 1000
HIDDEN_DIM = 128
LEARNING_RATE = 2e-3
GAMMA = 0.99

# Epsilon-greedy schedule: geometric decay from START_EPSILON, floored at
# END_EPSILON, with one value for each index in range(TRAIN_EPOCH).
START_EPSILON = 0.5
END_EPSILON = 0.05
DECAY_RATE = 0.99
EPSILON_LIST = [
    max(START_EPSILON * (DECAY_RATE ** epoch), END_EPSILON)
    for epoch in range(TRAIN_EPOCH)
]

# ---- Logging ----
# Delete any directory already present at the log path before training.
log_path = Path('./logs/pendulum/run_dqn')
if log_path.exists():
    shutil.rmtree(log_path)

# ---- Device ----
# The leading literal short-circuits the expression, so this is always False
# as written; change it to True to use CUDA when torch reports it available.
_USE_CUDA = False and torch.cuda.is_available()

# ---- Q-network, environment wrapper, replay buffer, trainer ----
q_func = DeepQFunc(
    state_space.shape[0],
    BINS,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

env = Env(_train_gym_env)

replay_buffer = ReplayBuffer(10_000)
q_func_trainer = DeepQFuncTrainer(
    q_func=q_func,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
    # Converter built from the Box bounds of the action space and BINS.
    action_converter=Discrete1ContinuousAction(
        action_space.low,
        action_space.high,
        BINS,
    ),
)

action: Box(-2.0, 2.0, (1,), float32), space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)


In [None]:
# Train the Pendulum DQN agent. The minimal replay size is ten times the
# batch_size of 64 that was passed to the trainer.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_replay_size_to_train=10 * 64,
    target_q_update_freq=10,
)

In [None]:
# A second environment instance created with render_mode='human', used only
# for testing.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='human'))

# The tester gets a converter built from the same bounds and bin count as the
# one handed to the trainer.
q_func_tester = DeepQFuncTester(
    q_func=q_func.to('cpu'),
    env=_render_env,
    action_converter=Discrete1ContinuousAction(
        action_space.low,
        action_space.high,
        BINS,
    ),
)

In [None]:
# Run the tester that wraps the Pendulum DQN network.
# NOTE(review): 1000 is presumably a step limit -- confirm against
# DeepQFuncTester.test.
q_func_tester.test(1000)

#### 使用Double DQN 进行训练

In [3]:
# Double DQN run on Pendulum-v1.
# DoubleQFuncTrainer is imported with the other deep_q names in the notebook's
# first cell, so every dependency is declared in one place.
# This cell reuses env, state_space, action_space, BINS, _USE_CUDA and the
# hyper-parameters defined by the Pendulum DQN setup cell; run that cell first.

# Separate log directory for this run; delete any directory already there.
log_path = Path('./logs/pendulum/run_double_dqn')
if log_path.exists():
    shutil.rmtree(log_path)

# A new network (q_func2), constructed with the same arguments as the DQN
# run's q_func, so the earlier network is left untouched.
q_func2 = DeepQFunc(
    state_space.shape[0],
    BINS,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

# A new replay buffer for this run; the names replay_buffer and
# q_func_trainer are rebound here and now refer to the Double DQN objects.
replay_buffer = ReplayBuffer(10000)
q_func_trainer = DoubleQFuncTrainer(
    q_func=q_func2,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
    action_converter=Discrete1ContinuousAction(action_space.low, action_space.high, BINS),
)

In [4]:
# Train the Double DQN agent with the same settings as the DQN run. The
# minimal replay size is ten times the batch_size of 64 passed to the trainer.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_replay_size_to_train=10 * 64,
    target_q_update_freq=10,
)

  return func(*args, **kwargs)
  loss = torch.nn.functional.mse_loss(q_values_now_value, target_q_values)
100%|██████████| 1000/1000 [05:08<00:00,  3.24it/s, reward=-1.52e+3, step=200]


In [5]:
# A new environment instance created with render_mode='human' for testing the
# Double DQN network.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='human'))

# The tester wraps q_func2 and gets a converter built from the same bounds and
# bin count as the one handed to the trainer.
q_func_tester = DeepQFuncTester(
    q_func=q_func2.to('cpu'),
    env=_render_env,
    action_converter=Discrete1ContinuousAction(
        action_space.low,
        action_space.high,
        BINS,
    ),
)

In [6]:
# Run the tester that wraps the Double DQN network; the recorded output below
# shows a "Test reward" total followed by a "Step Rewards" list.
# NOTE(review): 1000 is presumably a step limit -- confirm against
# DeepQFuncTester.test.
q_func_tester.test(1000)

Test reward: -1587.9141391611133
Step Rewards: [-8.200818814300813, -8.293559493080187, -8.3735609796617, -8.438001719024937, -8.484637731720484, -8.511874468543352, -8.51881284377955, -8.505269923449484, -8.47177560906836, -8.419547058659036, -8.350442645728188, -8.266897147406864, -8.171839761819966, -8.0685966432735, -7.960780009415742, -7.852166518544274, -7.746568431183363, -7.647701865637723, -7.559057012450378, -7.483775317189709, -7.4245383196688675, -7.383472134708622, -7.362070667419733, -7.361139800983642, -7.380764148733194, -7.42029757495848, -7.478378468161167, -7.552970495953539, -7.641429050047052, -7.740592632343593, -7.846897009927621, -7.956508225852184, -8.065468794622518, -8.169850007276391, -8.26590254862307, -8.350197767901646, -8.419752918766507, -8.472135278945116, -8.505541941388092, -8.518853890496437, -8.511664468894622, -8.484283359311213, -8.437717742151285, -8.37363243792946, -8.294290766497317, -8.202477738884687, -8.10140721598634, -7.994614942493837, -