## Clone code bài tập lớn

In [1]:
%%capture
!git clone https://github.com/cuongtv312/marl-delivery.git
%cd marl-delivery
!pip install -r requirements.txt

## QMix
Khai báo thư viện

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import random
from os import path

Chọn device

In [3]:
GPU = True
device_idx = 0
if GPU:
    device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu")
else:
    device = torch.device("cpu")
print(device)

cpu


Tạo ReplayBufferGRU để lưu lại những action trước của agent

In [4]:
class ReplayBufferGRU:
    """ 
    Replay buffer for agent with GRU network additionally storing previous action, 
    initial input hidden state and output hidden state of GRU.
    And each sample contains the whole episode instead of a single step.
    'hidden_in' and 'hidden_out' are only the initial hidden state for each episode, for GRU initialization.

    """
    
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, hidden_in, hidden_out, state, action, last_action, reward, next_state):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (hidden_in, hidden_out, state, action, last_action, reward, next_state)
        self.position = int((self.position + 1) % self.capacity) # ring buffer

    def sample(self, batch_size):
        s_lst, a_lst, la_lst, r_lst, ns_lst, hi_lst, ho_lst = [], [], [], [], [], [], []
        batch = random.sample(self.buffer, batch_size)
        min_seq_len = float('inf')
        for sample in batch:
            h_in, h_out, state, action, last_action, reward, next_state = sample
            min_seq_len = min(len(state), min_seq_len)
            hi_lst.append(h_in)  # h_in: (1, batch_size=1, n_agents, hidden_size)
            ho_lst.append(h_out)
        hi_lst = torch.cat(hi_lst, dim = -3).detach()  # cat along the batch dim
        ho_lst = torch.cat(ho_lst, dim = -3).detach()

        # strip sequence length
        for sample in batch:
            h_in, h_out, state, action, last_action, reward, next_state = sample
            sample_len = len(state)
            start_idx = int((sample_len - min_seq_len)/2)
            end_idx = start_idx + min_seq_len
            s_lst.append(state[start_idx:end_idx])
            a_lst.append(action[start_idx:end_idx])
            la_lst.append(last_action[start_idx:end_idx])
            r_lst.append(reward[start_idx:end_idx])
            ns_lst.append(next_state[start_idx:end_idx])

        return hi_lst, ho_lst, s_lst, a_lst, la_lst, r_lst, ns_lst

    def get_length(self):
        return len(self.buffer)
        

Xây class RNNAgent cho mỗi agent

In [5]:
class RNNAgent(nn.Module):
    '''
    @brief:
        evaluate Q value given a state and the action
    '''
    
    def __init__(self, num_inputs, action_shape, num_actions, hidden_size):
        super(RNNAgent, self).__init__()

        self.num_inputs = num_inputs
        self.action_shape = action_shape
        self.num_actions = num_actions

        self.linear1 = nn.Linear(num_inputs + action_shape*num_actions, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.rnn = nn.GRU(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.linear4 = nn.Linear(hidden_size, action_shape*num_actions)

    def forward(self, state, action, hidden_in):
        '''
        @params:
            state: [#batch, #sequence, #agent, #n_feature]
            action: [#batch, #sequence, #agent, action_shape]
        @return:
            qs: [#batch, #sequence, #agent, action_shape, num_actions]
        '''
        #  to [#sequence, #batch, #agent, #n_feature]
        bs, seq_len, n_agents, _= state.shape
        state = state.permute(1, 0, 2, 3)
        action = action.permute(1, 0, 2, 3)
        action = F.one_hot(action, num_classes=self.num_actions)
        action = action.view(seq_len, bs, n_agents, -1) # [#sequence, #batch, #agent, action_shape*num_actions]

        x = torch.cat([state, action], -1)  # the dim 0 is number of samples
        x = x.view(seq_len, bs*n_agents, -1) # change x to [#sequence, #batch*#agent, -1] to meet rnn's input requirement
        hidden_in = hidden_in.view(1, bs*n_agents, -1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x, hidden = self.rnn(x, hidden_in)
        x = F.relu(self.linear3(x))
        x = self.linear4(x) # [#sequence, #batch, #agents, #action_shape*#num_actions]
        # [#sequence, #batch, #agent, #head * #action]
        x = x.view(seq_len, bs, n_agents, self.action_shape, self.num_actions)
        hidden = hidden.view(1, bs, n_agents, -1)
        # categorical over the discretized actions
        qs = F.softmax(x, dim=-1)
        qs = qs.permute(1, 0, 2, 3, 4)  # permute back [#batch, #sequence, #agents, #action_shape, #actions]

        return qs, hidden
    def get_action(self, state, last_action, hidden_in, deterministic=False):
        '''
        @brief:
            for each distributed agent, generate action for one step given input data
        @params:
            state: [n_agents, n_feature]
            last_action: [n_agents, action_shape]
        '''
        state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).to(device) # add #sequence and #batch: [[#batch, #sequence, n_agents, n_feature]] 
        last_action = torch.LongTensor(
            last_action).unsqueeze(0).unsqueeze(0).to(device)  # add #sequence and #batch: [#batch, #sequence, n_agents, action_shape]
        hidden_in = hidden_in.unsqueeze(1) # add #batch: [#batch, n_agents, hidden_dim]
        agent_outs, hidden_out = self.forward(state, last_action, hidden_in)  # agents_out: [#batch, #sequence, n_agents, action_shape, action_dim]; hidden_out same as hidden_in
        dist = Categorical(agent_outs)

        if deterministic:
            action = np.argmax(agent_outs.detach().cpu().numpy(), axis=-1)
        else:
            action = dist.sample().squeeze(0).squeeze(0).detach().cpu().numpy()  # squeeze the added #batch and #sequence dimension
        return action, hidden_out  # [n_agents, action_shape]

Xây class QMix để tính Q_tot

In [6]:
class QMix(nn.Module):
    def __init__(self, state_dim, n_agents, action_shape, embed_dim=64, hypernet_embed=128, abs=True):
        """
        Critic network class for Qmix. Outputs centralized value function predictions given independent q value.
        :param args: (argparse) arguments containing relevant model information.
        """
        super(QMix, self).__init__()

        self.n_agents = n_agents
        self.state_dim = state_dim * n_agents * action_shape # #features*n_agents
        self.action_shape = action_shape

        self.embed_dim = embed_dim
        self.hypernet_embed = hypernet_embed
        self.abs = abs

        self.hyper_w_1 = nn.Sequential(nn.Linear(self.state_dim, self.hypernet_embed),
                                        nn.ReLU(inplace=True),
                                        nn.Linear(self.hypernet_embed, self.action_shape * self.embed_dim * self.n_agents))
        self.hyper_w_final = nn.Sequential(nn.Linear(self.state_dim, self.hypernet_embed),
                                            nn.ReLU(inplace=True),
                                           nn.Linear(self.hypernet_embed, self.embed_dim))

        # State dependent bias for hidden layer
        self.hyper_b_1 = nn.Linear(self.state_dim, self.embed_dim)

        # V(s) instead of a bias for the last layers
        self.V = nn.Sequential(nn.Linear(self.state_dim, self.embed_dim),
                               nn.ReLU(inplace=True),
                               nn.Linear(self.embed_dim, 1))

    def forward(self, agent_qs, states):
        """
        Compute actions from the given inputs.
        @params:
            agent_qs: [#batch, #sequence, #agent, #action_shape]
            states: [#batch, #sequence, #agent, #features*action_shape]
        :param agent_qs: q value inputs into network [batch_size, #agent, action_shape]
        :param states: state observation.
        :return q_tot: (torch.Tensor) return q-total .
        """
        bs = agent_qs.size(0)
        states = states.reshape(-1, self.state_dim)  # [#batch*#sequence, action_shape*#features*#agent]
        agent_qs = agent_qs.reshape(-1, 1, self.n_agents*self.action_shape)  # [#batch*#sequence, 1, #agent*#action_shape]
        # First layer
        w1 = self.hyper_w_1(states).abs() if self.abs else self.hyper_w_1(states)  # [#batch*#sequence, action_shape*self.embed_dim*#agent]
        b1 = self.hyper_b_1(states)  # [#batch*#sequence, self.embed_dim]
        w1 = w1.view(-1, self.n_agents*self.action_shape, self.embed_dim)  # [#batch*#sequence, #agent*action_shape, self.embed_dim]
        b1 = b1.view(-1, 1, self.embed_dim)   # [#batch*#sequence, 1, self.embed_dim]
        hidden = F.elu(torch.bmm(agent_qs, w1) + b1)  # [#batch*#sequence, 1, self.embed_dim]

        # Second layer
        w_final = self.hyper_w_final(states).abs() if self.abs else self.hyper_w_final(states)  # [#batch*#sequence, self.embed_dim]
        w_final = w_final.view(-1, self.embed_dim, 1)  # [#batch*#sequence, self.embed_dim, 1]
        # State-dependent bias
        v = self.V(states).view(-1, 1, 1)  # [#batch*#sequence, 1, 1]
        # Compute final output
        y = torch.bmm(hidden, w_final) + v  
        # Reshape and return
        q_tot = y.view(bs, -1, 1) # [#batch, #sequence, 1]
        return q_tot


Tạo class QMix_Trainer để train QMix

In [7]:
class QMix_Trainer():
    def __init__(self, replay_buffer, n_agents, state_dim, action_shape, action_dim, hidden_dim, hypernet_dim, target_update_interval, lr=0.001, logger=None):
        self.replay_buffer = replay_buffer

        self.action_dim = action_dim
        self.action_shape = action_shape
        self.n_agents = n_agents
        self.target_update_interval = target_update_interval
        
        self.agent = RNNAgent(state_dim, action_shape, action_dim, hidden_dim).to(device)
        self.target_agent = RNNAgent(state_dim, action_shape, action_dim, hidden_dim).to(device)
        
        self.mixer = QMix(state_dim, n_agents, action_shape,
                          hidden_dim, hypernet_dim).to(device)
        self.target_mixer = QMix(state_dim, n_agents, action_shape,
                          hidden_dim, hypernet_dim).to(device)
        
        self._update_targets()
        self.update_cnt = 0
        
        self.criterion = nn.MSELoss()

        self.optimizer = optim.Adam(
            list(self.agent.parameters())+list(self.mixer.parameters()), lr=lr)

    def sample_action(self):
        probs = torch.FloatTensor(
            np.ones(self.action_dim)/self.action_dim).to(device)
        dist = Categorical(probs)
        action = dist.sample((self.n_agents, self.action_shape))

        return action.type(torch.FloatTensor).numpy()

    def get_action(self, state, last_action, hidden_in, deterministic=False):
        '''
        @return:
            action: w/ shape [#active_as]
        '''

        action, hidden_out = self.agent.get_action(state, last_action, hidden_in, deterministic=deterministic)

        return action, hidden_out

    def push_replay_buffer(self, ini_hidden_in, ini_hidden_out, episode_state, episode_action, episode_last_action,
                           episode_reward, episode_next_state):
        '''
        @brief: push arguments into replay buffer
        '''
        self.replay_buffer.push(ini_hidden_in, ini_hidden_out, episode_state, episode_action, episode_last_action,
                                episode_reward, episode_next_state)

    def update(self, batch_size):
        hidden_in, hidden_out, state, action, last_action, reward, next_state = self.replay_buffer.sample(batch_size)

        state = torch.FloatTensor(state).to(device) # [#batch, sequence, #agents, #features*action_shape]
        next_state = torch.FloatTensor(next_state).to(device)
        action = torch.LongTensor(action).to(device) # [#batch, sequence, #agents, #action_shape]
        last_action = torch.LongTensor(last_action).to(device)
        reward = torch.FloatTensor(reward).unsqueeze(-1).to(device) # reward is scalar, add 1 dim to be [reward] at the same dim

        agent_outs, _ = self.agent(state, last_action, hidden_in) # [#batch, #sequence, #agent, action_shape, num_actions]
        
        chosen_action_qvals = torch.gather(  # [#batch, #sequence, #agent, action_shape]
            agent_outs, dim=-1, index=action.unsqueeze(-1)).squeeze(-1)

        qtot = self.mixer(chosen_action_qvals, state) # [#batch, #sequence, 1]

        # target q
        target_agent_outs, _ = self.target_agent(next_state, action, hidden_out)
        target_max_qvals = target_agent_outs.max(dim=-1, keepdim=True)[0] # [#batch, #sequence, #agents, action_shape]
        target_qtot = self.target_mixer(target_max_qvals, next_state)
        
        reward = reward[:, :, 0]  # reward is the same for agents, so take one
        targets = self._build_td_lambda_targets(reward, target_qtot)

        loss = self.criterion(qtot, targets.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_cnt += 1
        if self.update_cnt % self.target_update_interval == 0:
            self._update_targets()

        return loss.item()

    def _build_td_lambda_targets(self, rewards, target_qs, gamma=0.99, td_lambda=0.6):
        '''
        @params:
            rewards: [#batch, #sequence, 1]
            target_qs: [#batch, #sequence, 1]
        '''
        ret = target_qs.new_zeros(*target_qs.shape)
        ret[:, -1] = target_qs[:, -1]
        # backwards recursive update of the "forward view"
        for t in range(ret.shape[1] - 2, -1, -1):
            ret[:, t] = td_lambda * gamma * ret[:, t+1] + (rewards[:, t] + (1 - td_lambda) * gamma * target_qs[:, t+1])
        return ret

    def _update_targets(self):
        for target_param, param in zip(self.target_mixer.parameters(), self.mixer.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_agent.parameters(), self.agent.parameters()):
            target_param.data.copy_(param.data)

    def save_model(self, path):
        torch.save(self.agent.state_dict(), path+'_agent')
        torch.save(self.mixer.state_dict(), path+'_mixer')

    def load_model(self, path):
        self.agent.load_state_dict(torch.load(path+'_agent'))
        self.mixer.load_state_dict(torch.load(path+'_mixer'))

        self.agent.eval()
        self.mixer.eval()

## Train agent

In [8]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces

  File "/usr/local/lib/python3.11/dist-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 304, in register_gymnasium_envs
    _register_atari_envs()
  File "/usr/local/lib/python3.11/dist-packages/shimmy/registration.py", line 205, in _register_atari_envs
    import ale_py
  File "/usr/local/lib/python3.11/dist-packages/ale_py/__init__.py", line 68, in <module>
    register_v0_v4_envs()
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 179, in register_v0_v4_envs
    _register_rom_configs(legacy_games, obs_types, versions)
  File "/usr/local/lib/python3.11/dist-packages/ale_py/registration.py", line 64, in _register_rom_configs
    gymnasium.register(
    ^^^^^^^^^^^^^^^^^^
AttributeError: partially initialized module 'gymnasium' has no attribute 'register' (most likely due to a circular import)
[0m
  logger.warn(f"plugin: {plugin.value} raised {trace

In [9]:
def convert_state(state):
    ret_state = {}
    # state["time_step"] = np.array([state["time_step"]]).astype(np.float32).flatten(0)
    # state["map"] = np.array(state["map"]).astype(np.float32)
    ret_state["robots"] = np.array(state["robots"]).astype(np.float32).flatten()
    ret_state["packages"] = np.array(state["packages"]).astype(np.float32).flatten()[:100]
    if len(ret_state["packages"]) < 1000:
        ret_state["packages"] = np.concatenate((ret_state["packages"], np.zeros(100-len(ret_state["packages"]))))
    return np.concatenate(list(ret_state.values()))

In [10]:
def reward_shaping(r, env, state, action):
    return r

In [11]:
# Avoid to modify the Env class,
# If it is neccessary, you should describe those changes clearly in report and code
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)


        self.prev_state = self.env.reset()
        first_state=convert_state(self.prev_state)
        # Define observation space as a dictionary

        self.observation_space = spaces.Box(low=0, high=100, shape=first_state.shape, dtype=np.float32)


        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

In [12]:
replay_buffer_size = 1e4
hidden_dim = 64
hypernet_dim = 128
max_steps = 100
max_episodes = 1000
update_iter  = 1
batch_size = 2
save_interval = 10
target_update_interval = 10
model_path = 'model/qmix'

env = Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10)
state_dim = env.observation_space.shape[0]
n_agents = 5
action_dim = 5*3*n_agents
action_shape = 1

replay_buffer = ReplayBufferGRU(replay_buffer_size)
learner = QMix_Trainer(replay_buffer, n_agents, state_dim, action_shape, action_dim, hidden_dim, hypernet_dim, target_update_interval)

loss = None

for epi in range(max_episodes):
    hidden_out = torch.zeros([1, n_agents, hidden_dim], dtype=torch.float).to(device)
    last_action = learner.sample_action()
    episode_state = []
    episode_action = []
    episode_last_action = []
    episode_reward = []
    episode_next_state = []

    state = env.reset()

    for step in range(max_steps):
        hidden_in = hidden_out
        action, hidden_out = learner.get_action(state, last_action, hidden_in)

        action = action.reshape(-1)
        ret = []
        ret.append(le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        next_state, reward, done, info = env.step(action)

        if step == 0:
            ini_hidden_in = hidden_in
            ini_hidden_out = hidden_out

        episode_state.append(state)
        episode_action.append(action)
        episode_last_action.append(last_action)
        episode_reward.append(reward)
        episode_next_state.append(next_state)

        state = next_state
        last_action = action

        if np.any(done):
            break

    if args.train:
        learner.push_replay_buffer(ini_hidden_in, ini_hidden_out, episode_state, episode_action, episode_last_action,
                            episode_reward, episode_next_state)
        if epi > batch_size:
            for _ in range(update_iter):
                loss = learner.update(batch_size)

        if epi % save_interval == 0:
            learner.save_model(model_path)

    print(f"Episode: {epi}, Episode Reward: {np.sum(episode_reward)}, Loss: {loss}")

  state = torch.FloatTensor(state).unsqueeze(0).unsqueeze(0).to(device) # add #sequence and #batch: [[#batch, #sequence, n_agents, n_feature]]


ValueError: expected sequence of length 115 at dim 1 (got 0)