#### 考虑连续State空间、离散Action 空间的Q函数

In [2]:
from typing import Callable, List, Tuple, Optional
from pathlib import Path

import numpy as np
from tqdm import tqdm
import gymnasium as gym

from tensorboardX import SummaryWriter
import torch
import torch.optim as optim
from collections import deque
import random



State = int
# 表示状态特征向量的维度
StateDim = int
StateVec = List[float]

Action = int
Reward = float
ActionProbDistribution = List[float]

class AbstractQFunc():
    def get_value(self, state: State, action: Action) -> float:
        raise NotImplementedError()
    
    def get_action_distribute(self, state: State) -> ActionProbDistribution:
        raise NotImplementedError()

    def get_actions_count(self) -> int:
        raise NotImplementedError()
    
    def set_value(self, state: State, action: Action, value: float) -> None:
        raise NotImplementedError()

class DeepQFunc(AbstractQFunc, torch.nn.Module):
    def __init__(self, state_dim: int, action_nums: int, hidden_dim: int = 128) -> None:
        # here use full-connect layer to represent Q function
        super().__init__() 
        self._state_dims = state_dim 
        self._action_nums = action_nums
        
        self._fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self._fc2 = torch.nn.Linear(hidden_dim, action_nums)

    def forward(self, x): 
        x = torch.nn.functional.relu(self._fc1(x))
        return self._fc2(x)
        
    def get_action_distribute(self, state: State) -> ActionProbDistribution:
        out = self.forward(torch.tensor([state]))
        return torch.nn.functional.softmax(out, dim=0).detach().numpy()

    def get_optimal_action(self, state: State) -> Action:    
        out = self.forward(torch.tensor([state]))
        return torch.argmax(out).item()

    def get_actions_count(self) -> int:
        return self._action_nums

In [3]:
class Env:
    def __init__(self, gym_env: gym.Env):
        self._gym_env = gym_env

    def step(self, action: Action) ->  Tuple[Reward, Optional[State]]: 
        next_state, reward, is_terminated, is_truncated, _ = self._gym_env.step(action)
        if is_terminated or is_truncated:
            return reward, None
        else:
            return reward, next_state

    def reset(self) -> State:
        init_state, _ = self._gym_env.reset()
        return init_state

In [4]:
# 策略函数
# todo: change the right type
ActionProbDistribution = List[float]
Strategy = Callable[[State], ActionProbDistribution]


def to_strategy(f: DeepQFunc) -> Strategy:
    def _strategy(s: State) -> ActionProbDistribution:
        x = f.get_action_distribute(s).detach().numpy()
        return x
    return _strategy

def to_strategy_epsilon_greedy(f: DeepQFunc, epsilon: float) -> Strategy:
    def _strategy(s: State) -> ActionProbDistribution:
        # e-greedy 策略
        if np.random.uniform(0, 1) > epsilon:
            # 这里选择最优动作（没有随机性）
            optimal_action = f.get_optimal_action(s)
            # 创建一个one-hot编码的动作分布
            action_distribution = np.zeros(f.get_actions_count(), dtype=np.float32)
            action_distribution[optimal_action] = 1.0
            return action_distribution
        else:
            # 随机选择动作 
            return np.ones(f.get_actions_count(), dtype=np.float32) / f.get_actions_count()
    return _strategy

In [5]:

class ReplayBuffer:
    def __init__(self, capacity: int) -> None:
        self.buffer = deque(maxlen=capacity)
    
    def add(self, state, action, reward, next_state, weight):
        self.buffer.append((state, action, reward, next_state, weight))

    def sample(self, batch_size):
        transitions = random.sample(self.buffer, batch_size)
        return zip(*transitions)        

class DQFuncTrainer():
    def __init__(self, q_func: DeepQFunc, 
                 env: Env, 
                 replay_buffer: ReplayBuffer,
                 optimizer: optim.Optimizer,
                 batch_size: int,
                 gamma: float,
                 epsilon_list: List[float],
                 logger_folder: Optional[Path] = None,
                 ) -> None:
        self._q_func = q_func
        self._env = env
        self._replay_buffer = replay_buffer
        self._gamma = gamma
        self._optimizer = optimizer
        self._batch_size = batch_size
        self._epsilon_list = epsilon_list

        self._logger_folder = logger_folder if logger_folder is not None else Path('./logs')

    def train(self, epoch: int, max_steps: int, minimal_train_size: int):
        writer = SummaryWriter(self._logger_folder) 
        for epoch in tqdm(range(epoch)):
            init_state = self._env.reset()
            current_state = init_state
            acc_reward = 0
            step_cnt = 0
            
            for _s in range(max_steps):
                step_cnt += 1
                # 获取此时DeepQFunc的策略 
                e_greedy_s = to_strategy_epsilon_greedy(self._q_func, self._epsilon_list[epoch])
                # 使用该策略进行决策 
                action_dist = e_greedy_s(current_state)
                action = np.random.choice(self._q_func.get_actions_count(), p=action_dist)
                # 执行这个动作，获取下一个状态      
                reward, next_state = self._env.step(action)
                acc_reward += reward
                
                if next_state is not None:
                    self._replay_buffer.add(current_state, action, reward, next_state, 1)
                else:
                    self._replay_buffer.add(current_state, action, reward, current_state, 0)
                current_state = next_state
                if len(self._replay_buffer.buffer) > minimal_train_size:
                    self.update_q_func()

                if current_state is None: 
                    break
            writer.add_scalar('reward', acc_reward, epoch)
            writer.add_scalar('step', step_cnt, epoch)
    
    def update_q_func(self):
        state, action, reward, next_state, weight = self._replay_buffer.sample(self._batch_size)
        state = torch.tensor(state, dtype=torch.float32)
        action = torch.tensor(action, dtype=torch.int64)
        reward = torch.tensor(reward, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        weight = torch.tensor(weight, dtype=torch.int)
        
        q_values = self._q_func(state)
        next_q_values = self._q_func(next_state)
        target_q_values = reward + self._gamma * torch.max(next_q_values, dim=1).values * weight
        target_q_values = target_q_values.detach()
        loss = torch.nn.functional.mse_loss(q_values.gather(1, action.unsqueeze(1)), target_q_values.unsqueeze(1))
        
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()

class DoubleQ_DQFuncTrainer():
    def __init__(self, 
                 q_func: DeepQFunc,
                 target_q_func: DeepQFunc, 
                 env: Env, 
                 replay_buffer: ReplayBuffer,
                 optimizer: optim.Optimizer,
                 batch_size: int,
                 gamma: float,
                 epsilon_list: List[float],
                 target_update_freq: int,
                 logger_folder: Optional[Path] = None,
                 ) -> None:
        self._q_func = q_func
        self._target_q_func = target_q_func
        self._env = env
        self._replay_buffer = replay_buffer
        self._gamma = gamma
        self._optimizer = optimizer
        self._batch_size = batch_size
        self._epsilon_list = epsilon_list
        self._target_update_freq = target_update_freq

        self._logger_folder = logger_folder if logger_folder is not None else Path('./logs')


        self.__update_count = 0
    def train(self, epoch: int, max_steps: int, minimal_train_size: int):
        writer = SummaryWriter(self._logger_folder) 
        for epoch in tqdm(range(epoch)):
            init_state = self._env.reset()
            current_state = init_state
            acc_reward = 0
            step_cnt = 0
            
            for _s in range(max_steps):
                step_cnt += 1
                # 获取此时DeepQFunc的策略 
                e_greedy_s = to_strategy_epsilon_greedy(self._q_func, self._epsilon_list[epoch])
                # 使用该策略进行决策 
                action_dist = e_greedy_s(current_state)
                action = np.random.choice(self._q_func.get_actions_count(), p=action_dist)
                # 执行这个动作，获取下一个状态      
                reward, next_state = self._env.step(action)
                acc_reward += reward
                
                if next_state is not None:
                    self._replay_buffer.add(current_state, action, reward, next_state, 1)
                else:
                    self._replay_buffer.add(current_state, action, reward, current_state, 0)

                current_state = next_state
                if len(self._replay_buffer.buffer) > minimal_train_size:
                    self.update_q_func()

                if current_state is None: 
                    break
            writer.add_scalar('reward', acc_reward, epoch)
            writer.add_scalar('step', step_cnt, epoch)
    
    def update_q_func(self):
        self.__update_count += 1

        state, action, reward, next_state, weight = self._replay_buffer.sample(self._batch_size)
        state = torch.tensor(state, dtype=torch.float32)
        action = torch.tensor(action, dtype=torch.int64)
        reward = torch.tensor(reward, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        weight = torch.tensor(weight, dtype=int)

        q_values = self._q_func(state)
        q_values = q_values.gather(1, action.unsqueeze(1))

        next_q_values = self._target_q_func(next_state)
        target_q_values = reward + self._gamma * torch.max(next_q_values, dim=1).values * weight
        
        target_q_values = target_q_values.detach()
        loss = torch.nn.functional.mse_loss(q_values, target_q_values.unsqueeze(1))
        
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()

        if self.__update_count % self._target_update_freq == 0:
            self._target_q_func.load_state_dict(self._q_func.state_dict())
        

In [6]:
class DQFuncTester():
    def __init__(self, q_func: DeepQFunc, env: Env) -> None:
        self._q_func = q_func
        self._env = env
    
    def test(self, max_step: int):
        init_state = self._env.reset()
        current_state = init_state
        acc_reward = 0
        reward_list = []
        # greedy_strategy = to_strategy(self._q_func)

        for _ in range(max_step):
            # action_dist = greedy_strategy(current_state)
            # action = np.argmax(action_dist)
            action = self._q_func.get_optimal_action(current_state)
            
            reward, next_state = self._env.step(action)
            acc_reward += reward
            reward_list.append(reward)
            current_state = next_state
            if current_state is None:
                break
    
        print(f'Test reward: {acc_reward}')
        print(f'Step Rewards: {reward_list}')

In [75]:
GYM_ENV_NAME = 'CartPole-v1'
_gym_env = gym.make(GYM_ENV_NAME)

action_nums, state_space = _gym_env.action_space.n, _gym_env.observation_space
print(f'action num: {action_nums}, space: {state_space}')

TRAIN_EPOCH = 1000
HIDDEN_DIM = 128
LEARNING_RATE = 2e-3
GAMMA = 0.98

START_EPSILON = 0.2
END_EPSILON = 0.05
DECAY_RATE = 0.999
EPSILON_LIST = [max(START_EPSILON * (DECAY_RATE ** i), END_EPSILON) for i in range(TRAIN_EPOCH)]


log_path = Path('./logs/run5_use_weight_todonestate5')
import shutil
if log_path.exists():
    shutil.rmtree(log_path)

q_func = DeepQFunc(state_space.shape[0], action_nums, HIDDEN_DIM)
env = Env(_gym_env)
q_trainer = DQFuncTrainer(
    q_func=q_func,
    env=env,
    replay_buffer=ReplayBuffer(10000),
    optimizer=optim.Adam(q_func.parameters(), lr=LEARNING_RATE),
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    logger_folder=log_path
)

action num: 2, space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [76]:
q_trainer.train(
    epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_train_size=500
)

 24%|██▍       | 1216/5000 [07:30<23:23,  2.70it/s]


KeyboardInterrupt: 

In [9]:
t = DQFuncTester(q_func, Env(gym.make(GYM_ENV_NAME, render_mode='human')))

In [10]:
t.test(10000)

Test reward: 500.0
Step Rewards: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1

In [74]:
torch.set_default_tensor_type('torch.FloatTensor')

In [68]:
torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [16]:
GYM_ENV_NAME = 'CartPole-v1'
_gym_env = gym.make(GYM_ENV_NAME)

In [18]:
_gym_env.observation_space.shape

(4,)

#### Double-Deep-Q

In [7]:
GYM_ENV_NAME = 'CartPole-v1'
_gym_env = gym.make(GYM_ENV_NAME)

action_nums, state_space = _gym_env.action_space.n, _gym_env.observation_space
print(f'action num: {action_nums}, space: {state_space}')

TRAIN_EPOCH = 1000
HIDDEN_DIM = 128
LEARNING_RATE = 1e-3
GAMMA = 0.98

START_EPSILON = 0.2
END_EPSILON = 0.05
DECAY_RATE = 0.999
EPSILON_LIST = [max(START_EPSILON * (DECAY_RATE ** i), END_EPSILON) for i in range(TRAIN_EPOCH)]


log_path = Path('./logs/run4_doubleDeepQ_withdones')
import shutil
if log_path.exists():
    shutil.rmtree(log_path)

q_func = DeepQFunc(state_space.shape[0], action_nums, HIDDEN_DIM)
target_q_func = DeepQFunc(state_space.shape[0], action_nums, HIDDEN_DIM)
env = Env(_gym_env)
q_trainer = DoubleQ_DQFuncTrainer(
    q_func=q_func,
    target_q_func=target_q_func,
    env=env,
    replay_buffer=ReplayBuffer(10000),
    optimizer=optim.Adam(q_func.parameters(), lr=LEARNING_RATE),
    batch_size=64,
    gamma=GAMMA,
    epsilon_list=EPSILON_LIST,
    target_update_freq=10,
    logger_folder=log_path
)

action num: 2, space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [8]:
q_trainer.train(
    epoch=TRAIN_EPOCH,
    max_steps=1000,
    minimal_train_size=500
)

  out = self.forward(torch.tensor([state]))
100%|██████████| 1000/1000 [05:46<00:00,  2.88it/s]


#### 问题的关键在于每个epoch结束的时刻，那个state-action-reward 特别重要，如果丢失，则Q函数无法工作！！！