In [5]:
import shutil
from pathlib import Path

import gymnasium as gym
import torch

from deep_q import (
    DeepQFunc,
    DeepQFuncTester,
    DeepQFuncTrainer,
    Discrete1ContinuousAction,
    DoubleQFuncTrainer,
    ReplayBuffer,
)
from env import Env


#### 使用CartPole-v1 环境，测试简单的Deep Q 如何处理连续的State空间和离散的Action空间

In [6]:
# CartPole-v1 experiment: build the environment, the Q-function and its trainer.
GYM_ENV_NAME = 'CartPole-v1'
_train_gym_env = gym.make(GYM_ENV_NAME)

# Print the number of actions and the observation space for inspection; both
# are used below to size the Q-function.
action_nums = _train_gym_env.action_space.n
state_space = _train_gym_env.observation_space
print(f'action num: {action_nums}, space: {state_space}')

# Training hyperparameters.
TRAIN_EPOCH = 300
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
GAMMA = 0.99

# Exponentially decaying epsilon-greedy schedule: one entry per epoch index,
# starting at START_EPSILON and clamped from below at END_EPSILON.
START_EPSILON = 0.5
END_EPSILON = 0.05
DECAY_RATE = 0.99
EPSILON_LIST = [
    max(START_EPSILON * DECAY_RATE ** epoch, END_EPSILON)
    for epoch in range(TRAIN_EPOCH)
]

# Wipe logs left over from a previous run so the log folder starts empty.
log_path = Path('./logs/run2')
if log_path.exists():
    shutil.rmtree(log_path)

# The leading literal is the CUDA switch. With False the `and` short-circuits,
# so `device=None` is passed to DeepQFunc below; change it to True to request
# a GPU whenever torch reports one.
_USE_CUDA = False and torch.cuda.is_available()

q_func = DeepQFunc(
    state_space.shape[0],
    action_nums,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

env = Env(_train_gym_env)

replay_buffer = ReplayBuffer(10000)
q_func_trainer = DeepQFuncTrainer(
    q_func=q_func,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
)


action num: 2, space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [7]:
# Train the CartPole Q-function.
# 64 * 10 is ten times the batch_size of 64 given to the trainer; presumably
# updates only begin once the replay buffer holds that many transitions —
# confirm against DeepQFuncTrainer.train.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_replay_size_to_train=64 * 10,
    target_q_update_freq=10,
)

 26%|██▋       | 79/300 [00:00<00:02, 84.39it/s, reward=21, step=21] 


KeyboardInterrupt: 

#### 开始测试

In [None]:
# Separate environment created with render_mode='human' for the evaluation run.
_render_env = Env(
    gym.make(GYM_ENV_NAME, render_mode='human'),
)
# NOTE(review): `.to('cpu')` presumably mirrors torch's `.to` and returns the
# Q-function placed on the CPU — confirm in deep_q.
q_func_tester = DeepQFuncTester(q_func=q_func.to('cpu'), env=_render_env)

In [None]:
# Run the CartPole tester built in the previous cell.
# NOTE(review): 2000 is presumably a step budget — confirm against DeepQFuncTester.test.
q_func_tester.test(2000)

In [None]:
# Save the CartPole Q-function to ./model/trained_for_cartpole.pth.
# mkdir(parents=True, exist_ok=True) is idempotent: it avoids the
# check-then-create race of `if not exists: mkdir()` and also works when a
# parent directory is missing.
model_dir = Path('./model')
model_dir.mkdir(parents=True, exist_ok=True)
q_func.save(model_dir / 'trained_for_cartpole.pth')

In [None]:
# Build a fresh Q-function with the same constructor sizes as the CartPole
# model (no device argument is passed here), then load the saved checkpoint.
q_func_from_checkpoint = DeepQFunc(
    state_space.shape[0],
    action_nums,
    hidden_dim=HIDDEN_DIM,
)
q_func_from_checkpoint.load(Path('./model/trained_for_cartpole.pth'))

In [None]:
# Point the tester at the checkpoint-restored Q-function, reusing the
# rendering environment created earlier.
q_func_tester = DeepQFuncTester(q_func=q_func_from_checkpoint.to('cpu'), env=_render_env)

In [None]:
# Run the tester that wraps the Q-function restored from the checkpoint.
# NOTE(review): 2000 is presumably a step budget — confirm against DeepQFuncTester.test.
q_func_tester.test(2000)

#### 使用Pendulum-v1 测试Double Q Learning对Q值系统性高估的处理能力

In [None]:
# Pendulum-v1 experiment with the plain DQN trainer. This cell rebinds the
# names used by the CartPole section (GYM_ENV_NAME, q_func, env, ...).
GYM_ENV_NAME = 'Pendulum-v1'
_train_gym_env = gym.make(GYM_ENV_NAME)

# Print the environment's action space and observation space for inspection.
action_space = _train_gym_env.action_space
state_space = _train_gym_env.observation_space
print(f'action: {action_space}, space: {state_space}')

# Number of outputs of the Q-function. The same value is handed to
# Discrete1ContinuousAction together with the action bounds, presumably so
# each discrete index maps onto the continuous range — confirm in deep_q.
BINS = 11

# Training hyperparameters.
TRAIN_EPOCH = 1000
HIDDEN_DIM = 256
LEARNING_RATE = 2e-3
GAMMA = 0.99

# Exponentially decaying epsilon-greedy schedule: one entry per epoch index,
# starting at START_EPSILON and clamped from below at END_EPSILON.
START_EPSILON = 0.5
END_EPSILON = 0.05
DECAY_RATE = 0.99
EPSILON_LIST = [
    max(START_EPSILON * DECAY_RATE ** epoch, END_EPSILON)
    for epoch in range(TRAIN_EPOCH)
]

# Wipe logs left over from a previous run so the log folder starts empty.
log_path = Path('./logs/pendulum/run_dqn')
if log_path.exists():
    shutil.rmtree(log_path)

# The leading literal is the CUDA switch. With False the `and` short-circuits,
# so `device=None` is passed to DeepQFunc below; change it to True to request
# a GPU whenever torch reports one.
_USE_CUDA = False and torch.cuda.is_available()

q_func = DeepQFunc(
    state_space.shape[0],
    BINS,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

env = Env(_train_gym_env)

replay_buffer = ReplayBuffer(10000)
q_func_trainer = DeepQFuncTrainer(
    q_func=q_func,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
    action_converter=Discrete1ContinuousAction(
        action_space.low,
        action_space.high,
        BINS,
    ),
)

action: Box(-2.0, 2.0, (1,), float32), space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)


In [None]:
# Train the plain-DQN Q-function on Pendulum.
# 64 * 10 is ten times the batch_size of 64 given to the trainer; presumably
# updates only begin once the replay buffer holds that many transitions —
# confirm against DeepQFuncTrainer.train.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_replay_size_to_train=64 * 10,
    target_q_update_freq=10,
)

In [None]:
# Rendering environment and tester for the plain-DQN Pendulum Q-function.
# The tester receives a converter built from the same action bounds and BINS
# that were used for training.
_render_env = Env(
    gym.make(GYM_ENV_NAME, render_mode='human'),
)
q_func_tester = DeepQFuncTester(
    q_func=q_func.to('cpu'),
    env=_render_env,
    action_converter=Discrete1ContinuousAction(
        action_space.low,
        action_space.high,
        BINS,
    ),
)

In [None]:
# Run the tester that wraps the plain-DQN Pendulum Q-function.
# NOTE(review): 1000 is presumably a step budget — confirm against DeepQFuncTester.test.
q_func_tester.test(1000)

#### 使用Double DQN 进行训练

In [None]:
# Double DQN run on Pendulum. A new Q-function, replay buffer and log folder
# are created for this run; `env`, BINS, `_USE_CUDA` and the hyperparameters
# are reused from the Pendulum setup cell, so that cell must have been run.
# `replay_buffer` and `q_func_trainer` are rebound here.
# DoubleQFuncTrainer is imported in the notebook's import cell.
log_path = Path('./logs/pendulum/run_double_dqn3')
if log_path.exists():
    # Wipe logs left over from a previous run so the log folder starts empty.
    shutil.rmtree(log_path)

q_func2 = DeepQFunc(
    state_space.shape[0],
    BINS,
    hidden_dim=HIDDEN_DIM,
    device=torch.device('cuda') if _USE_CUDA else None,
)

replay_buffer = ReplayBuffer(10000)
q_func_trainer = DoubleQFuncTrainer(
    q_func=q_func2,
    env=env,
    replay_buffer=replay_buffer,
    learning_rate=LEARNING_RATE,
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path,
    action_converter=Discrete1ContinuousAction(
        action_space.low,
        action_space.high,
        BINS,
    ),
)

In [None]:
# Train the Double DQN Q-function on Pendulum with the same settings as the
# plain-DQN run.
# NOTE(review): the recorded output of this cell prints q_values_now_value as
# [64, 1] but target_q_values as [64, 64], on the line that calls
# torch.nn.functional.mse_loss. Mismatched shapes are broadcast by mse_loss
# rather than compared element-wise, so the target computed inside
# DoubleQFuncTrainer is likely mis-shaped — TODO confirm in deep_q. The
# per-update shape prints also appear to come from the trainer and flood the
# cell output.
q_func_trainer.train(
    train_epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_replay_size_to_train=64 * 10,
    target_q_update_freq=10,
)

  return func(*args, **kwargs)
  loss = torch.nn.functional.mse_loss(q_values_now_value, target_q_values.unsqueeze(1))


q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  0%|          | 4/1000 [00:00<02:15,  7.36it/s, reward=-1.26e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  0%|          | 4/1000 [00:00<02:15,  7.36it/s, reward=-1.63e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 6/1000 [00:01<04:21,  3.81it/s, reward=-1.54e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 6/1000 [00:01<04:21,  3.81it/s, reward=-1.49e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 7/1000 [00:01<05:05,  3.25it/s, reward=-1.49e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 8/1000 [00:02<05:22,  3.07it/s, reward=-1.07e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 9/1000 [00:02<05:36,  2.95it/s, reward=-1.5e+3, step=200] 

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 10/1000 [00:02<05:46,  2.86it/s, reward=-1.59e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 11/1000 [00:03<06:02,  2.73it/s, reward=-1.38e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|          | 12/1000 [00:03<06:08,  2.68it/s, reward=-1.2e+3, step=200] 

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|▏         | 13/1000 [00:04<06:10,  2.67it/s, reward=-1.3e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  1%|▏         | 14/1000 [00:04<06:12,  2.64it/s, reward=-1.17e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 15/1000 [00:04<06:16,  2.61it/s, reward=-1.73e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 16/1000 [00:05<06:15,  2.62it/s, reward=-1.71e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 16/1000 [00:05<06:15,  2.62it/s, reward=-1.26e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 17/1000 [00:05<06:14,  2.62it/s, reward=-1.26e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 18/1000 [00:06<06:25,  2.55it/s, reward=-1.06e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 19/1000 [00:06<06:24,  2.55it/s, reward=-1.28e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 20/1000 [00:06<06:18,  2.59it/s, reward=-1.18e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 21/1000 [00:07<06:21,  2.56it/s, reward=-1.08e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 22/1000 [00:07<06:31,  2.50it/s, reward=-955, step=200]    

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 23/1000 [00:08<06:42,  2.43it/s, reward=-1.1e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 24/1000 [00:08<06:34,  2.47it/s, reward=-1.47e+3, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▏         | 24/1000 [00:08<06:34,  2.47it/s, reward=-902, step=200]    

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▎         | 25/1000 [00:08<06:32,  2.48it/s, reward=-902, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])

  2%|▎         | 25/1000 [00:09<05:54,  2.75it/s, reward=-902, step=200]

q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1]), target_q_values dim: torch.Size([64, 64])
q_values_now_value dim: torch.Size([64, 1])




KeyboardInterrupt: 

In [None]:
# Rendering environment and tester for the Double DQN Q-function (q_func2).
# The tester receives a converter built from the same action bounds and BINS
# that were used for training.
_render_env = Env(
    gym.make(GYM_ENV_NAME, render_mode='human'),
)
q_func_tester = DeepQFuncTester(
    q_func=q_func2.to('cpu'),
    env=_render_env,
    action_converter=Discrete1ContinuousAction(
        action_space.low,
        action_space.high,
        BINS,
    ),
)

In [None]:
# Run the tester that wraps the Double DQN Q-function (q_func2).
# NOTE(review): 1000 is presumably a step budget — confirm against DeepQFuncTester.test.
q_func_tester.test(1000)