In [1]:
import datetime
import shutil
from pathlib import Path

import gymnasium as gym
import torch

from deep_q import (
    DeepQFunc,
    DeepQFuncTester,
    DeepQFuncTrainer,
    Discrete1ContinuousAction,
    DoubleQFuncTrainer,
    ReplayBuffer,
)
from env import Env
from utils import clear_target_path, show_gif_on_jupyternb


#### 使用CartPole-v1 环境，测试简单的Deep Q 如何处理连续的State空间和离散的Action空间

In [2]:
# ---- CartPole-v1: environment, hyper-parameters, Q-network and trainer ----
GYM_ENV_NAME = 'CartPole-v1'
_train_gym_env = gym.make(GYM_ENV_NAME)
env = Env(_train_gym_env)

# Inspect the environment: number of discrete actions and the observation space.
action_nums = _train_gym_env.action_space.n
state_space = _train_gym_env.observation_space
print(f'action num: {action_nums}, space: {state_space}')

# Training hyper-parameters.
TRAIN_EPOCH = 800
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
GAMMA = 0.99

# Exponentially decaying epsilon-greedy schedule, one value per training epoch,
# floored at END_EPSILON.
START_EPSILON = 0.5
END_EPSILON = 0.05
DECAY_RATE = 0.99
EPSILON_LIST = [
    max(START_EPSILON * (DECAY_RATE ** epoch_idx), END_EPSILON)
    for epoch_idx in range(TRAIN_EPOCH)
]

# Output locations for training logs, the saved model and test artifacts.
LOG_PATH = Path('./run/logs/cartpoleV1/run_normal')
MODEL_PATH = Path('./run/model/cartpoleV1/model.pth')
TEST_OUTPUT_PATH = Path('./run/test_result/cartpoleV1')

# Manual GPU switch: the leading `False` short-circuits, so CUDA is never used.
# Change it to `True` to use CUDA whenever torch reports it as available.
_USE_CUDA = False and torch.cuda.is_available()

# Q-network: input size is the observation dimension, output size is the number of actions.
q_func = DeepQFunc(
    state_space.shape[0],
    action_nums,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

# Replay buffer constructed with 10000 (presumably its capacity - confirm in deep_q)
# and the trainer that ties everything together.
replay_buffer = ReplayBuffer(10000)
q_func_trainer = DeepQFuncTrainer(
    q_func=q_func,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=LOG_PATH,
)


action num: 2, space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [None]:
# Reset the log folder and model file targets before a fresh run
# (clear_target_path comes from utils; presumably it removes stale content - confirm).
clear_target_path(LOG_PATH)
clear_target_path(MODEL_PATH)

# Make sure the checkpoint directory exists *before* the long training run, so a
# missing folder cannot cost us the trained weights at save time. Whether
# clear_target_path / q_func.save create it is not visible from this notebook;
# mkdir with exist_ok=True is a no-op when the directory is already there.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

print(f'start training, now datetime: {datetime.datetime.now()}')
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    # 64 * 10 = ten times the trainer's batch_size of 64.
    minimal_replay_size_to_train=64 * 10,
    target_q_update_freq=10,
)
print(f'end training, saving model to: {MODEL_PATH}, now datetime: {datetime.datetime.now()}')

# Persist the trained Q-network so the test section can reload it from disk.
q_func.save(MODEL_PATH)

#### 开始测试

In [3]:
# Reload the Q-network from the file written by the training cell, so this test
# exercises the persisted checkpoint rather than whatever is in kernel memory.
# This raises FileNotFoundError if the training cell has not been run yet.
q_test_func = DeepQFunc.from_file(MODEL_PATH)

# Separate env instance for evaluation, created with render_mode='rgb_array_list'.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

q_func_tester = DeepQFuncTester(
    # Fix: evaluate the model loaded from disk. Previously the in-memory
    # `q_func` was passed here and `q_test_func` was never used.
    q_func=q_test_func.to('cpu'),
    env=_render_env
)

FileNotFoundError: [Errno 2] No such file or directory: 'run\\model\\cartpoleV1\\model.pth'

In [None]:
# Location reserved for the rendered episode; reset it before testing.
# NOTE(review): RESULT_GIF is not passed to anything in this cell - presumably the
# GIF is meant to be written/shown after the test (show_gif_on_jupyternb is
# imported but unused) - confirm.
RESULT_GIF = TEST_OUTPUT_PATH.joinpath('result.gif')
clear_target_path(RESULT_GIF)

# NOTE(review): 2000 is presumably the step cap for the rollout - confirm against
# DeepQFuncTester.test.
q_func_tester.test(2000)

In [None]:
# Build a network with the same input/output sizes and hidden width as the
# CartPole model above, then restore its weights from the checkpoint file at
# ./model/trained_for_cartpole.pth.
q_func_from_checkpoint = DeepQFunc(
    state_space.shape[0],
    action_nums,
    hidden_dim=HIDDEN_DIM,
)
q_func_from_checkpoint.load(Path('./model/trained_for_cartpole.pth'))

In [None]:
# Build a dedicated rendering env here instead of reusing `_render_env` from the
# saved-model test cell: that cell aborts on a missing model file before it ever
# creates the env, which would leave this cell failing with a NameError.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='rgb_array_list'))

# Tester driven by the checkpoint-restored network, moved to CPU for evaluation.
q_func_tester = DeepQFuncTester(
    q_func=q_func_from_checkpoint.to('cpu'),
    env=_render_env
)

In [None]:
# Run the checkpoint-restored network through the tester.
# NOTE(review): 2000 is presumably the step cap for the rollout - confirm against
# DeepQFuncTester.test.
q_func_tester.test(2000)

#### 使用Pendulum-v1 测试Double Q Learning对Q值系统高估的处理能力

In [None]:
# ---- Pendulum-v1: environment, hyper-parameters, Q-network and DQN trainer ----
GYM_ENV_NAME = 'Pendulum-v1'
_train_gym_env = gym.make(GYM_ENV_NAME)

# Inspect the environment's action space and observation space.
action_space = _train_gym_env.action_space
state_space = _train_gym_env.observation_space
print(f'action: {action_space}, space: {state_space}')

# Number of discrete action bins; used both as the Q-network output size and
# as the bin count handed to Discrete1ContinuousAction.
BINS = 11

# Training hyper-parameters.
TRAIN_EPOCH = 1000
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
GAMMA = 0.99

# Exponentially decaying epsilon-greedy schedule, one value per training epoch,
# floored at END_EPSILON.
START_EPSILON = 0.5
END_EPSILON = 0.05
DECAY_RATE = 0.99
EPSILON_LIST = [
    max(START_EPSILON * (DECAY_RATE ** epoch_idx), END_EPSILON)
    for epoch_idx in range(TRAIN_EPOCH)
]

# Wipe logs from any earlier plain-DQN run so this run starts clean.
log_path = Path('./logs/pendulum/run_dqn')
if log_path.exists():
    shutil.rmtree(log_path)

# Manual GPU switch: the leading `False` short-circuits, so CUDA is never used.
# Change it to `True` to use CUDA whenever torch reports it as available.
_USE_CUDA = False and torch.cuda.is_available()

q_func = DeepQFunc(
    state_space.shape[0],
    BINS,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

env = Env(_train_gym_env)

# Replay buffer constructed with 10000 (presumably its capacity - confirm in deep_q).
replay_buffer = ReplayBuffer(10000)
q_func_trainer = DeepQFuncTrainer(
    q_func=q_func,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
    # Built from the action space's low/high bounds and the bin count.
    action_converter=Discrete1ContinuousAction(
        action_space.low, action_space.high, BINS
    ),
)

In [None]:
# Train the plain DQN agent on Pendulum.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    # 64 * 10 = ten times the trainer's batch_size of 64.
    minimal_replay_size_to_train=64 * 10,
    target_q_update_freq=10,
)

In [None]:
# Evaluation env created with render_mode='human', paired with the DQN network
# (moved to CPU) and an action converter built from the same bounds and bin count
# as the one used for training.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='human'))
q_func_tester = DeepQFuncTester(
    q_func=q_func.to('cpu'),
    env=_render_env,
    action_converter=Discrete1ContinuousAction(
        action_space.low, action_space.high, BINS
    ),
)

In [None]:
# Run the trained DQN network through the tester.
# NOTE(review): 1000 is presumably the step cap for the rollout - confirm against
# DeepQFuncTester.test.
q_func_tester.test(1000)

#### 使用Double DQN 进行训练

In [None]:
# ---- Pendulum-v1 with Double DQN ----
# Reuses env, BINS, the hyper-parameters and the epsilon schedule defined in the
# Pendulum DQN setup cell; only the network instance, replay buffer, log folder
# and trainer class differ.

# Wipe logs from any earlier Double-DQN run so this run starts clean.
log_path = Path('./logs/pendulum/run_double_dqn3')
if log_path.exists():
    shutil.rmtree(log_path)

# Fresh network (same sizes as the plain-DQN one) so the two runs do not share weights.
q_func2 = DeepQFunc(
    state_space.shape[0],
    BINS,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

# New, empty replay buffer for this run.
replay_buffer = ReplayBuffer(10000)
q_func_trainer = DoubleQFuncTrainer(
    q_func=q_func2,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
    # Built from the action space's low/high bounds and the bin count.
    action_converter=Discrete1ContinuousAction(
        action_space.low, action_space.high, BINS
    ),
)

In [None]:
# Train the Double DQN agent with the same schedule as the plain DQN run.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    # 64 * 10 = ten times the trainer's batch_size of 64.
    minimal_replay_size_to_train=64 * 10,
    target_q_update_freq=10,
)

In [None]:
# Evaluation env created with render_mode='human', paired with the Double-DQN
# network (moved to CPU) and an action converter built from the same bounds and
# bin count as the one used for training.
_render_env = Env(gym.make(GYM_ENV_NAME, render_mode='human'))
q_func_tester = DeepQFuncTester(
    q_func=q_func2.to('cpu'),
    env=_render_env,
    action_converter=Discrete1ContinuousAction(
        action_space.low, action_space.high, BINS
    ),
)

In [None]:
# Run the trained Double-DQN network through the tester.
# NOTE(review): 1000 is presumably the step cap for the rollout - confirm against
# DeepQFuncTester.test.
q_func_tester.test(1000)